Use TCP_CORK/TCP_NOPUSH socket options where available to coalesce writes for multi...
[m6w6/libmemcached] / libmemcached / memcached_io.c
1 /*
2 Basic socket buffered IO
3 */
4
5 #include "common.h"
6 #include "memcached_io.h"
7 #include <sys/select.h>
8 #include <poll.h>
9
/* Direction of the socket operation that io_wait() should wait for. */
typedef enum {
  MEM_READ= 0,  /* wait until the socket becomes readable */
  MEM_WRITE= 1  /* wait until the socket becomes writable */
} memc_read_or_write;
14
15 static ssize_t io_flush(memcached_server_st *ptr, memcached_return *error);
16 static void increment_udp_message_id(memcached_server_st *ptr);
17
18 static memcached_return io_wait(memcached_server_st *ptr,
19 memc_read_or_write read_or_write)
20 {
21 struct pollfd fds= {
22 .fd= ptr->fd,
23 .events = POLLIN
24 };
25 int error;
26
27 unlikely (read_or_write == MEM_WRITE) /* write */
28 fds.events= POLLOUT;
29
30 /*
31 ** We are going to block on write, but at least on Solaris we might block
32 ** on write if we haven't read anything from our input buffer..
33 ** Try to purge the input buffer if we don't do any flow control in the
34 ** application layer (just sending a lot of data etc)
35 ** The test is moved down in the purge function to avoid duplication of
36 ** the test.
37 */
38 if (read_or_write == MEM_WRITE)
39 {
40 memcached_return rc= memcached_purge(ptr);
41 if (rc != MEMCACHED_SUCCESS && rc != MEMCACHED_STORED)
42 return MEMCACHED_FAILURE;
43 }
44
45 int timeout= ptr->root->poll_timeout;
46 if ((ptr->root->flags & MEM_NO_BLOCK) == 0)
47 timeout= -1;
48
49 error= poll(&fds, 1, timeout);
50
51 if (error == 1)
52 return MEMCACHED_SUCCESS;
53 else if (error == 0)
54 return MEMCACHED_TIMEOUT;
55
56 /* Imposssible for anything other then -1 */
57 WATCHPOINT_ASSERT(error == -1);
58 memcached_quit_server(ptr, 1);
59
60 return MEMCACHED_FAILURE;
61 }
62
63 /**
64 * Try to fill the input buffer for a server with as much
65 * data as possible.
66 *
67 * @param ptr the server to pack
68 */
69 static bool repack_input_buffer(memcached_server_st *ptr)
70 {
71 if (ptr->read_ptr != ptr->read_buffer)
72 {
73 /* Move all of the data to the beginning of the buffer so
74 ** that we can fit more data into the buffer...
75 */
76 memmove(ptr->read_buffer, ptr->read_ptr, ptr->read_buffer_length);
77 ptr->read_ptr= ptr->read_buffer;
78 ptr->read_data_length= ptr->read_buffer_length;
79 }
80
81 /* There is room in the buffer, try to fill it! */
82 if (ptr->read_buffer_length != MEMCACHED_MAX_BUFFER)
83 {
84 /* Just try a single read to grab what's available */
85 ssize_t nr= read(ptr->fd,
86 ptr->read_ptr + ptr->read_data_length,
87 MEMCACHED_MAX_BUFFER - ptr->read_data_length);
88
89 if (nr > 0)
90 {
91 ptr->read_data_length+= (size_t)nr;
92 ptr->read_buffer_length+= (size_t)nr;
93 return true;
94 }
95 }
96 return false;
97 }
98
99 /**
100 * If the we have callbacks connected to this server structure
101 * we may start process the input queue and fire the callbacks
102 * for the incomming messages. This function is _only_ called
103 * when the input buffer is full, so that we _know_ that we have
104 * at least _one_ message to process.
105 *
106 * @param ptr the server to star processing iput messages for
107 * @return true if we processed anything, false otherwise
108 */
109 static bool process_input_buffer(memcached_server_st *ptr)
110 {
111 /*
112 ** We might be able to process some of the response messages if we
113 ** have a callback set up
114 */
115 if (ptr->root->callbacks != NULL && (ptr->root->flags & MEM_USE_UDP) == 0)
116 {
117 /*
118 * We might have responses... try to read them out and fire
119 * callbacks
120 */
121 memcached_callback_st cb= *ptr->root->callbacks;
122
123 char buffer[MEMCACHED_DEFAULT_COMMAND_SIZE];
124 memcached_return error;
125 error= memcached_response(ptr, buffer, sizeof(buffer),
126 &ptr->root->result);
127 if (error == MEMCACHED_SUCCESS)
128 {
129 for (unsigned int x= 0; x < cb.number_of_callback; x++)
130 {
131 error= (*cb.callback[x])(ptr->root, &ptr->root->result, cb.context);
132 if (error != MEMCACHED_SUCCESS)
133 break;
134 }
135
136 /* @todo what should I do with the error message??? */
137 }
138 /* @todo what should I do with other error messages?? */
139 return true;
140 }
141
142 return false;
143 }
144
145 #ifdef TCP_CORK
146 #define CORK TCP_CORK
147 #elif defined TCP_NOPUSH
148 #define CORK TCP_NOPUSH
149 #endif
150
151 static void memcached_io_cork(memcached_server_st *ptr, int enable)
152 {
153 #ifdef CORK
154 if (ptr->type != MEMCACHED_CONNECTION_TCP)
155 return;
156
157 if ((enable && ptr->is_corked) || (!enable && !ptr->is_corked))
158 return;
159
160 int err= setsockopt(ptr->fd, IPPROTO_TCP, CORK,
161 &enable, (socklen_t)sizeof(int));
162 if (!err)
163 ptr->is_corked= enable;
164 #endif
165 }
166
#ifdef UNUSED
/**
 * Try to pull pending response data for every host into its read buffer.
 *
 * The function is compiled only when UNUSED is defined and, even then,
 * returns immediately; the loop below is kept for reference.
 *
 * @param ptr the memcached instance whose hosts would be pre-read
 */
void memcached_io_preread(memcached_st *ptr)
{
  /* Pre-reading is switched off */
  return;

  for (unsigned int x= 0; x < ptr->number_of_hosts; x++)
  {
    memcached_server_st *host= &ptr->hosts[x];

    if (memcached_server_response_count(host) &&
        host->read_data_length < MEMCACHED_MAX_BUFFER)
    {
      /* read() returns ssize_t; keep the sign so failure is detectable */
      ssize_t data_read= read(host->fd,
                              host->read_ptr + host->read_data_length,
                              MEMCACHED_MAX_BUFFER - host->read_data_length);
      if (data_read <= 0)
        continue;

      host->read_buffer_length+= (size_t)data_read;
      host->read_data_length+= (size_t)data_read;
    }
  }
}
#endif
193
194 memcached_return memcached_io_read(memcached_server_st *ptr,
195 void *buffer, size_t length, ssize_t *nread)
196 {
197 char *buffer_ptr;
198
199 buffer_ptr= buffer;
200
201 while (length)
202 {
203 if (!ptr->read_buffer_length)
204 {
205 ssize_t data_read;
206
207 while (1)
208 {
209 data_read= read(ptr->fd, ptr->read_buffer, MEMCACHED_MAX_BUFFER);
210 if (data_read > 0)
211 break;
212 else if (data_read == -1)
213 {
214 ptr->cached_errno= errno;
215 memcached_return rc= MEMCACHED_UNKNOWN_READ_FAILURE;
216 switch (errno)
217 {
218 case EAGAIN:
219 case EINTR:
220 if ((rc= io_wait(ptr, MEM_READ)) == MEMCACHED_SUCCESS)
221 continue;
222 /* fall through */
223
224 default:
225 {
226 memcached_quit_server(ptr, 1);
227 *nread= -1;
228 return rc;
229 }
230 }
231 }
232 else
233 {
234 /*
235 EOF. Any data received so far is incomplete
236 so discard it. This always reads by byte in case of TCP
237 and protocol enforcement happens at memcached_response()
238 looking for '\n'. We do not care for UDB which requests 8 bytes
239 at once. Generally, this means that connection went away. Since
240 for blocking I/O we do not return 0 and for non-blocking case
241 it will return EGAIN if data is not immediatly available.
242 */
243 memcached_quit_server(ptr, 1);
244 *nread= -1;
245 return MEMCACHED_UNKNOWN_READ_FAILURE;
246 }
247 }
248
249 ptr->io_bytes_sent = 0;
250 ptr->read_data_length= (size_t) data_read;
251 ptr->read_buffer_length= (size_t) data_read;
252 ptr->read_ptr= ptr->read_buffer;
253 }
254
255 if (length > 1)
256 {
257 size_t difference;
258
259 difference= (length > ptr->read_buffer_length) ? ptr->read_buffer_length : length;
260
261 memcpy(buffer_ptr, ptr->read_ptr, difference);
262 length -= difference;
263 ptr->read_ptr+= difference;
264 ptr->read_buffer_length-= difference;
265 buffer_ptr+= difference;
266 }
267 else
268 {
269 *buffer_ptr= *ptr->read_ptr;
270 ptr->read_ptr++;
271 ptr->read_buffer_length--;
272 buffer_ptr++;
273 break;
274 }
275 }
276
277 ptr->server_failure_counter= 0;
278 *nread = (ssize_t)(buffer_ptr - (char*)buffer);
279 return MEMCACHED_SUCCESS;
280 }
281
282 ssize_t memcached_io_write(memcached_server_st *ptr,
283 const void *buffer, size_t length, char with_flush)
284 {
285 size_t original_length;
286 const char* buffer_ptr;
287
288 WATCHPOINT_ASSERT(ptr->fd != -1);
289
290 original_length= length;
291 buffer_ptr= buffer;
292
293 /* more writable data is coming if a flush isn't required, so delay send */
294 if (!with_flush)
295 memcached_io_cork(ptr, 1);
296
297 while (length)
298 {
299 char *write_ptr;
300 size_t should_write;
301 size_t buffer_end;
302
303 if (ptr->type == MEMCACHED_CONNECTION_UDP)
304 {
305 //UDP does not support partial writes
306 buffer_end= MAX_UDP_DATAGRAM_LENGTH;
307 should_write= length;
308 if (ptr->write_buffer_offset + should_write > buffer_end)
309 return -1;
310 }
311 else
312 {
313 buffer_end= MEMCACHED_MAX_BUFFER;
314 should_write= buffer_end - ptr->write_buffer_offset;
315 should_write= (should_write < length) ? should_write : length;
316 }
317
318 write_ptr= ptr->write_buffer + ptr->write_buffer_offset;
319 memcpy(write_ptr, buffer_ptr, should_write);
320 ptr->write_buffer_offset+= should_write;
321 buffer_ptr+= should_write;
322 length-= should_write;
323
324 if (ptr->write_buffer_offset == buffer_end && ptr->type != MEMCACHED_CONNECTION_UDP)
325 {
326 memcached_return rc;
327 ssize_t sent_length;
328
329 WATCHPOINT_ASSERT(ptr->fd != -1);
330 sent_length= io_flush(ptr, &rc);
331 if (sent_length == -1)
332 return -1;
333
334 /* If io_flush calls memcached_purge, sent_length may be 0 */
335 unlikely (sent_length != 0)
336 {
337 WATCHPOINT_ASSERT(sent_length == (ssize_t)buffer_end);
338 }
339 }
340 }
341
342 if (with_flush)
343 {
344 memcached_return rc;
345 WATCHPOINT_ASSERT(ptr->fd != -1);
346 if (io_flush(ptr, &rc) == -1)
347 return -1;
348 memcached_io_cork(ptr, 0);
349 }
350
351 return (ssize_t) original_length;
352 }
353
354 memcached_return memcached_io_close(memcached_server_st *ptr)
355 {
356 int r;
357
358 if (ptr->fd == -1)
359 return MEMCACHED_SUCCESS;
360
361 /* in case of death shutdown to avoid blocking at close() */
362 if (1)
363 {
364 r= shutdown(ptr->fd, SHUT_RDWR);
365
366 #ifdef DEBUG
367 if (r && errno != ENOTCONN)
368 {
369 WATCHPOINT_NUMBER(ptr->fd);
370 WATCHPOINT_ERRNO(errno);
371 WATCHPOINT_ASSERT(errno);
372 }
373 #endif
374 }
375
376 r= close(ptr->fd);
377 #ifdef DEBUG
378 if (r != 0)
379 WATCHPOINT_ERRNO(errno);
380 #endif
381
382 return MEMCACHED_SUCCESS;
383 }
384
385 memcached_server_st *memcached_io_get_readable_server(memcached_st *memc)
386 {
387 #define MAX_SERVERS_TO_POLL 100
388 struct pollfd fds[MAX_SERVERS_TO_POLL];
389 unsigned int host_index= 0;
390
391 for (unsigned int x= 0;
392 x< memc->number_of_hosts && host_index < MAX_SERVERS_TO_POLL;
393 ++x)
394 {
395 if (memc->hosts[x].read_buffer_length > 0) /* I have data in the buffer */
396 return &memc->hosts[x];
397
398 if (memcached_server_response_count(&memc->hosts[x]) > 0)
399 {
400 fds[host_index].events = POLLIN;
401 fds[host_index].revents = 0;
402 fds[host_index].fd = memc->hosts[x].fd;
403 ++host_index;
404 }
405 }
406
407 if (host_index < 2)
408 {
409 /* We have 0 or 1 server with pending events.. */
410 for (unsigned int x= 0; x< memc->number_of_hosts; ++x)
411 if (memcached_server_response_count(&memc->hosts[x]) > 0)
412 return &memc->hosts[x];
413
414 return NULL;
415 }
416
417 int err= poll(fds, host_index, memc->poll_timeout);
418 switch (err) {
419 case -1:
420 memc->cached_errno = errno;
421 /* FALLTHROUGH */
422 case 0:
423 break;
424 default:
425 for (unsigned int x= 0; x < host_index; ++x)
426 if (fds[x].revents & POLLIN)
427 for (unsigned int y= 0; y < memc->number_of_hosts; ++y)
428 if (memc->hosts[y].fd == fds[x].fd)
429 return &memc->hosts[y];
430 }
431
432 return NULL;
433 }
434
435 static ssize_t io_flush(memcached_server_st *ptr,
436 memcached_return *error)
437 {
438 /*
439 ** We might want to purge the input buffer if we haven't consumed
440 ** any output yet... The test for the limits is the purge is inline
441 ** in the purge function to avoid duplicating the logic..
442 */
443 {
444 memcached_return rc;
445 WATCHPOINT_ASSERT(ptr->fd != -1);
446 rc= memcached_purge(ptr);
447
448 if (rc != MEMCACHED_SUCCESS && rc != MEMCACHED_STORED)
449 return -1;
450 }
451 ssize_t sent_length;
452 size_t return_length;
453 char *local_write_ptr= ptr->write_buffer;
454 size_t write_length= ptr->write_buffer_offset;
455
456 *error= MEMCACHED_SUCCESS;
457
458 WATCHPOINT_ASSERT(ptr->fd != -1);
459
460 // UDP Sanity check, make sure that we are not sending somthing too big
461 if (ptr->type == MEMCACHED_CONNECTION_UDP && write_length > MAX_UDP_DATAGRAM_LENGTH)
462 return -1;
463
464 if (ptr->write_buffer_offset == 0 || (ptr->type == MEMCACHED_CONNECTION_UDP
465 && ptr->write_buffer_offset == UDP_DATAGRAM_HEADER_LENGTH))
466 return 0;
467
468 /* Looking for memory overflows */
469 #if defined(DEBUG)
470 if (write_length == MEMCACHED_MAX_BUFFER)
471 WATCHPOINT_ASSERT(ptr->write_buffer == local_write_ptr);
472 WATCHPOINT_ASSERT((ptr->write_buffer + MEMCACHED_MAX_BUFFER) >= (local_write_ptr + write_length));
473 #endif
474
475 return_length= 0;
476 while (write_length)
477 {
478 WATCHPOINT_ASSERT(ptr->fd != -1);
479 WATCHPOINT_ASSERT(write_length > 0);
480 sent_length= 0;
481 if (ptr->type == MEMCACHED_CONNECTION_UDP)
482 increment_udp_message_id(ptr);
483 sent_length= write(ptr->fd, local_write_ptr, write_length);
484
485 if (sent_length == -1)
486 {
487 ptr->cached_errno= errno;
488 switch (errno)
489 {
490 case ENOBUFS:
491 continue;
492 case EAGAIN:
493 {
494 /*
495 * We may be blocked on write because the input buffer
496 * is full. Let's check if we have room in our input
497 * buffer for more data and retry the write before
498 * waiting..
499 */
500 if (repack_input_buffer(ptr) ||
501 process_input_buffer(ptr))
502 continue;
503
504 memcached_return rc;
505 rc= io_wait(ptr, MEM_WRITE);
506
507 if (rc == MEMCACHED_SUCCESS || rc == MEMCACHED_TIMEOUT)
508 continue;
509
510 memcached_quit_server(ptr, 1);
511 return -1;
512 }
513 default:
514 memcached_quit_server(ptr, 1);
515 *error= MEMCACHED_ERRNO;
516 return -1;
517 }
518 }
519
520 if (ptr->type == MEMCACHED_CONNECTION_UDP &&
521 (size_t)sent_length != write_length)
522 {
523 memcached_quit_server(ptr, 1);
524 return -1;
525 }
526
527 ptr->io_bytes_sent += (uint32_t) sent_length;
528
529 local_write_ptr+= sent_length;
530 write_length-= (uint32_t) sent_length;
531 return_length+= (uint32_t) sent_length;
532 }
533
534 WATCHPOINT_ASSERT(write_length == 0);
535 // Need to study this assert() WATCHPOINT_ASSERT(return_length ==
536 // ptr->write_buffer_offset);
537
538 // if we are a udp server, the begining of the buffer is reserverd for
539 // the upd frame header
540 if (ptr->type == MEMCACHED_CONNECTION_UDP)
541 ptr->write_buffer_offset= UDP_DATAGRAM_HEADER_LENGTH;
542 else
543 ptr->write_buffer_offset= 0;
544
545 return (ssize_t) return_length;
546 }
547
548 /*
549 Eventually we will just kill off the server with the problem.
550 */
551 void memcached_io_reset(memcached_server_st *ptr)
552 {
553 memcached_quit_server(ptr, 1);
554 }
555
556 /**
557 * Read a given number of bytes from the server and place it into a specific
558 * buffer. Reset the IO channel on this server if an error occurs.
559 */
560 memcached_return memcached_safe_read(memcached_server_st *ptr,
561 void *dta,
562 size_t size)
563 {
564 size_t offset= 0;
565 char *data= dta;
566
567 while (offset < size)
568 {
569 ssize_t nread;
570 memcached_return rc= memcached_io_read(ptr, data + offset, size - offset,
571 &nread);
572 if (rc != MEMCACHED_SUCCESS)
573 return rc;
574
575 offset+= (size_t) nread;
576 }
577
578 return MEMCACHED_SUCCESS;
579 }
580
581 memcached_return memcached_io_readline(memcached_server_st *ptr,
582 char *buffer_ptr,
583 size_t size)
584 {
585 bool line_complete= false;
586 size_t total_nr= 0;
587
588 while (!line_complete)
589 {
590 if (ptr->read_buffer_length == 0)
591 {
592 /*
593 * We don't have any data in the buffer, so let's fill the read
594 * buffer. Call the standard read function to avoid duplicating
595 * the logic.
596 */
597 ssize_t nread;
598 memcached_return rc= memcached_io_read(ptr, buffer_ptr, 1, &nread);
599 if (rc != MEMCACHED_SUCCESS)
600 return rc;
601
602 if (*buffer_ptr == '\n')
603 line_complete= true;
604
605 ++buffer_ptr;
606 ++total_nr;
607 }
608
609 /* Now let's look in the buffer and copy as we go! */
610 while (ptr->read_buffer_length && total_nr < size && !line_complete)
611 {
612 *buffer_ptr = *ptr->read_ptr;
613 if (*buffer_ptr == '\n')
614 line_complete = true;
615 --ptr->read_buffer_length;
616 ++ptr->read_ptr;
617 ++total_nr;
618 ++buffer_ptr;
619 }
620
621 if (total_nr == size)
622 return MEMCACHED_PROTOCOL_ERROR;
623 }
624
625 return MEMCACHED_SUCCESS;
626 }
627
628 /*
629 * The udp request id consists of two seperate sections
630 * 1) The thread id
631 * 2) The message number
632 * The thread id should only be set when the memcached_st struct is created
633 * and should not be changed.
634 *
635 * The message num is incremented for each new message we send, this function
636 * extracts the message number from message_id, increments it and then
637 * writes the new value back into the header
638 */
639 static void increment_udp_message_id(memcached_server_st *ptr)
640 {
641 struct udp_datagram_header_st *header= (struct udp_datagram_header_st *)ptr->write_buffer;
642 uint16_t cur_req= get_udp_datagram_request_id(header);
643 int msg_num= get_msg_num_from_request_id(cur_req);
644 int thread_id= get_thread_id_from_request_id(cur_req);
645
646 if (((++msg_num) & UDP_REQUEST_ID_THREAD_MASK) != 0)
647 msg_num= 0;
648
649 header->request_id= htons((uint16_t) (thread_id | msg_num));
650 }
651
652 memcached_return memcached_io_init_udp_header(memcached_server_st *ptr, uint16_t thread_id)
653 {
654 if (thread_id > UDP_REQUEST_ID_MAX_THREAD_ID)
655 return MEMCACHED_FAILURE;
656
657 struct udp_datagram_header_st *header= (struct udp_datagram_header_st *)ptr->write_buffer;
658 header->request_id= htons((uint16_t) (generate_udp_request_thread_id(thread_id)));
659 header->num_datagrams= htons(1);
660 header->sequence_number= htons(0);
661
662 return MEMCACHED_SUCCESS;
663 }