f4bbe1e78d4e5ed8ff90ca80933239adc7e9fbc6
[awesomized/libmemcached] / memcached / thread.c
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3 * Thread management for memcached.
4 */
5 #include "memcached.h"
6 #include <assert.h>
7 #include <stdio.h>
8 #include <errno.h>
9 #include <stdlib.h>
10 #include <errno.h>
11 #include <string.h>
12 #include <pthread.h>
13
14 #ifdef __sun
15 #include <atomic.h>
16 #endif
17
/* Number of CQ_ITEM structs to allocate with a single malloc, to cut
 * allocation overhead and heap fragmentation (see cqi_new). */
#define ITEMS_PER_ALLOC 64

/* An item in the connection queue. Carries everything a worker thread
 * needs to construct a conn object for a newly accepted socket. */
typedef struct conn_queue_item CQ_ITEM;
struct conn_queue_item {
    int               sfd;              /* accepted socket descriptor */
    enum conn_states  init_state;       /* initial state for the new conn */
    int               event_flags;      /* libevent flags to monitor */
    int               read_buffer_size; /* initial read buffer size */
    enum network_transport     transport; /* tcp vs udp (checked via IS_UDP) */
    CQ_ITEM          *next;             /* link for queue / freelist chaining */
};

/* A connection queue: singly-linked FIFO with head/tail pointers,
 * protected by its own mutex. The condvar is signalled by cq_push but
 * never waited on in this file — workers poll via cq_pop after a pipe
 * notification. */
typedef struct conn_queue CQ;
struct conn_queue {
    CQ_ITEM *head;
    CQ_ITEM *tail;
    pthread_mutex_t lock;
    pthread_cond_t  cond;
};
39
/* Lock for cache operations (item_*, assoc_*) */
pthread_mutex_t cache_lock;

/* Connection lock around accepting new connections */
pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fallback lock for refcount_incr/decr when neither GCC builtins nor
 * Solaris atomics are available. */
#if !defined(HAVE_GCC_ATOMICS) && !defined(__sun)
pthread_mutex_t atomics_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif

/* Lock for global stats, taken via STATS_LOCK/STATS_UNLOCK */
static pthread_mutex_t stats_lock;

/* Free list of CQ_ITEM structs, protected by cqi_freelist_lock */
static CQ_ITEM *cqi_freelist;
static pthread_mutex_t cqi_freelist_lock;

/* Striped item locks: item_lock(hv) selects a mutex by hash value. */
static pthread_mutex_t *item_locks;
/* size of the item lock hash table */
static uint32_t item_lock_count;
/* size - 1 for lookup masking (the table size is a power of two) */
static uint32_t item_lock_mask;

/* The main (dispatcher) thread; owns the listening sockets. */
static LIBEVENT_DISPATCHER_THREAD dispatcher_thread;

/*
 * Each libevent instance has a wakeup pipe, which other threads
 * can use to signal that they've put a new connection on its queue.
 */
static LIBEVENT_THREAD *threads;

/*
 * Number of worker threads that have finished setting themselves up.
 * Guarded by init_lock/init_cond; thread_init() blocks until every
 * worker has checked in.
 */
static int init_count = 0;
static pthread_mutex_t init_lock;
static pthread_cond_t init_cond;


/* Handler for the per-thread libevent notify pipe (defined below). */
static void thread_libevent_process(int fd, short which, void *arg);
80
/*
 * Atomically increments *refcount and returns the new value.
 * Uses GCC __sync builtins or Solaris atomics when available; otherwise
 * serializes through the process-wide atomics_mutex.
 */
unsigned short refcount_incr(unsigned short *refcount) {
#ifdef HAVE_GCC_ATOMICS
    return __sync_add_and_fetch(refcount, 1);
#elif defined(__sun)
    return atomic_inc_ushort_nv(refcount);
#else
    unsigned short res;
    mutex_lock(&atomics_mutex);
    (*refcount)++;
    res = *refcount;
    mutex_unlock(&atomics_mutex);
    return res;
#endif
}
95
/*
 * Atomically decrements *refcount and returns the new value.
 * Mirror of refcount_incr; same three platform-dependent strategies.
 */
unsigned short refcount_decr(unsigned short *refcount) {
#ifdef HAVE_GCC_ATOMICS
    return __sync_sub_and_fetch(refcount, 1);
#elif defined(__sun)
    return atomic_dec_ushort_nv(refcount);
#else
    unsigned short res;
    mutex_lock(&atomics_mutex);
    (*refcount)--;
    res = *refcount;
    mutex_unlock(&atomics_mutex);
    return res;
#endif
}
110
111 void item_lock(uint32_t hv) {
112 mutex_lock(&item_locks[hv & item_lock_mask]);
113 }
114
115 void item_unlock(uint32_t hv) {
116 mutex_unlock(&item_locks[hv & item_lock_mask]);
117 }
118
119 /*
120 * Initializes a connection queue.
121 */
122 static void cq_init(CQ *cq) {
123 pthread_mutex_init(&cq->lock, NULL);
124 pthread_cond_init(&cq->cond, NULL);
125 cq->head = NULL;
126 cq->tail = NULL;
127 }
128
129 /*
130 * Looks for an item on a connection queue, but doesn't block if there isn't
131 * one.
132 * Returns the item, or NULL if no item is available
133 */
134 static CQ_ITEM *cq_pop(CQ *cq) {
135 CQ_ITEM *item;
136
137 pthread_mutex_lock(&cq->lock);
138 item = cq->head;
139 if (NULL != item) {
140 cq->head = item->next;
141 if (NULL == cq->head)
142 cq->tail = NULL;
143 }
144 pthread_mutex_unlock(&cq->lock);
145
146 return item;
147 }
148
149 /*
150 * Adds an item to a connection queue.
151 */
152 static void cq_push(CQ *cq, CQ_ITEM *item) {
153 item->next = NULL;
154
155 pthread_mutex_lock(&cq->lock);
156 if (NULL == cq->tail)
157 cq->head = item;
158 else
159 cq->tail->next = item;
160 cq->tail = item;
161 pthread_cond_signal(&cq->cond);
162 pthread_mutex_unlock(&cq->lock);
163 }
164
165 /*
166 * Returns a fresh connection queue item.
167 */
168 static CQ_ITEM *cqi_new(void) {
169 CQ_ITEM *item = NULL;
170 pthread_mutex_lock(&cqi_freelist_lock);
171 if (cqi_freelist) {
172 item = cqi_freelist;
173 cqi_freelist = item->next;
174 }
175 pthread_mutex_unlock(&cqi_freelist_lock);
176
177 if (NULL == item) {
178 int i;
179
180 /* Allocate a bunch of items at once to reduce fragmentation */
181 item = malloc(sizeof(CQ_ITEM) * ITEMS_PER_ALLOC);
182 if (NULL == item)
183 return NULL;
184
185 /*
186 * Link together all the new items except the first one
187 * (which we'll return to the caller) for placement on
188 * the freelist.
189 */
190 for (i = 2; i < ITEMS_PER_ALLOC; i++)
191 item[i - 1].next = &item[i];
192
193 pthread_mutex_lock(&cqi_freelist_lock);
194 item[ITEMS_PER_ALLOC - 1].next = cqi_freelist;
195 cqi_freelist = &item[1];
196 pthread_mutex_unlock(&cqi_freelist_lock);
197 }
198
199 return item;
200 }
201
202
/*
 * Frees a connection queue item by pushing it onto the freelist head.
 * Note: memory is never returned to the allocator; batches allocated in
 * cqi_new live for the life of the process.
 */
static void cqi_free(CQ_ITEM *item) {
    pthread_mutex_lock(&cqi_freelist_lock);
    item->next = cqi_freelist;
    cqi_freelist = item;
    pthread_mutex_unlock(&cqi_freelist_lock);
}
212
213
/*
 * Creates a worker thread running func(arg).
 * The thread is never joined, so its pthread_t is not retained.
 * Exits the process if thread creation fails (startup-time only).
 */
static void create_worker(void *(*func)(void *), void *arg) {
    pthread_t       thread;
    pthread_attr_t  attr;
    int             ret;

    pthread_attr_init(&attr);

    if ((ret = pthread_create(&thread, &attr, func, arg)) != 0) {
        fprintf(stderr, "Can't create thread: %s\n",
                strerror(ret));
        exit(1);
    }
    /* Fix: the attr object was initialized but never destroyed, which
     * leaks any resources it holds (POSIX requires a matching
     * pthread_attr_destroy for every pthread_attr_init). */
    pthread_attr_destroy(&attr);
}
230
/*
 * Sets whether or not we accept new connections.
 * Serializes on conn_lock and delegates to do_accept_new_conns(),
 * which performs the actual listen-socket event changes.
 */
void accept_new_conns(const bool do_accept) {
    pthread_mutex_lock(&conn_lock);
    do_accept_new_conns(do_accept);
    pthread_mutex_unlock(&conn_lock);
}
239 /****************************** LIBEVENT THREADS *****************************/
240
/*
 * Set up a worker thread's information: its own libevent base, the
 * notify-pipe read event, a connection queue, the per-thread stats
 * mutex, and a suffix cache. Any failure exits the process — this is
 * only called during startup, before the thread itself is spawned.
 */
static void setup_thread(LIBEVENT_THREAD *me) {
    me->base = event_init();
    if (! me->base) {
        fprintf(stderr, "Can't allocate event base\n");
        exit(1);
    }

    /* Listen for notifications from other threads: one byte is written
     * to notify_send_fd per dispatched connection (dispatch_conn_new),
     * waking thread_libevent_process on this end. */
    event_set(&me->notify_event, me->notify_receive_fd,
              EV_READ | EV_PERSIST, thread_libevent_process, me);
    event_base_set(me->base, &me->notify_event);

    if (event_add(&me->notify_event, 0) == -1) {
        fprintf(stderr, "Can't monitor libevent notify pipe\n");
        exit(1);
    }

    me->new_conn_queue = malloc(sizeof(struct conn_queue));
    if (me->new_conn_queue == NULL) {
        perror("Failed to allocate memory for connection queue");
        exit(EXIT_FAILURE);
    }
    cq_init(me->new_conn_queue);

    if (pthread_mutex_init(&me->stats.mutex, NULL) != 0) {
        perror("Failed to initialize mutex");
        exit(EXIT_FAILURE);
    }

    /* Per-thread object cache for value suffixes; NOTE(review): the
     * constructor/destructor callbacks are intentionally NULL here. */
    me->suffix_cache = cache_create("suffix", SUFFIX_SIZE, sizeof(char*),
                                    NULL, NULL);
    if (me->suffix_cache == NULL) {
        fprintf(stderr, "Failed to create suffix cache\n");
        exit(EXIT_FAILURE);
    }
}
280
281
/*
 * Worker thread entry point: registers itself as initialized (so
 * thread_init() can stop waiting), then runs its libevent loop forever.
 * Returns NULL only if the event loop exits.
 */
static void *worker_libevent(void *arg) {
    LIBEVENT_THREAD *me = arg;

    /* Any per-thread setup can happen here; thread_init() will block until
     * all threads have finished initializing.
     */

    pthread_mutex_lock(&init_lock);
    init_count++;
    pthread_cond_signal(&init_cond);
    pthread_mutex_unlock(&init_lock);

    /* Blocks for the lifetime of the worker. */
    event_base_loop(me->base, 0);
    return NULL;
}
300
301
302 /*
303 * Processes an incoming "handle a new connection" item. This is called when
304 * input arrives on the libevent wakeup pipe.
305 */
306 static void thread_libevent_process(int fd, short which, void *arg) {
307 LIBEVENT_THREAD *me = arg;
308 CQ_ITEM *item;
309 char buf[1];
310
311 if (read(fd, buf, 1) != 1)
312 if (settings.verbose > 0)
313 fprintf(stderr, "Can't read from libevent pipe\n");
314
315 item = cq_pop(me->new_conn_queue);
316
317 if (NULL != item) {
318 conn *c = conn_new(item->sfd, item->init_state, item->event_flags,
319 item->read_buffer_size, item->transport, me->base);
320 if (c == NULL) {
321 if (IS_UDP(item->transport)) {
322 fprintf(stderr, "Can't listen for events on UDP socket\n");
323 exit(1);
324 } else {
325 if (settings.verbose > 0) {
326 fprintf(stderr, "Can't listen for events on fd %d\n",
327 item->sfd);
328 }
329 close(item->sfd);
330 }
331 } else {
332 c->thread = me;
333 }
334 cqi_free(item);
335 }
336 }
337
/* Index of the worker thread most recently assigned a connection; used
 * by dispatch_conn_new() for round-robin selection. Unsynchronized —
 * per the comment on dispatch_conn_new, it is only touched from the
 * main thread. */
static int last_thread = -1;
340
341 /*
342 * Dispatches a new connection to another thread. This is only ever called
343 * from the main thread, either during initialization (for UDP) or because
344 * of an incoming connection.
345 */
346 void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags,
347 int read_buffer_size, enum network_transport transport) {
348 CQ_ITEM *item = cqi_new();
349 int tid = (last_thread + 1) % settings.num_threads;
350
351 LIBEVENT_THREAD *thread = threads + tid;
352
353 last_thread = tid;
354
355 item->sfd = sfd;
356 item->init_state = init_state;
357 item->event_flags = event_flags;
358 item->read_buffer_size = read_buffer_size;
359 item->transport = transport;
360
361 cq_push(thread->new_conn_queue, item);
362
363 MEMCACHED_CONN_DISPATCH(sfd, thread->thread_id);
364 if (write(thread->notify_send_fd, "", 1) != 1) {
365 perror("Writing to thread notify pipe");
366 }
367 }
368
369 /*
370 * Returns true if this is the thread that listens for new TCP connections.
371 */
372 int is_listen_thread() {
373 return pthread_self() == dispatcher_thread.thread_id;
374 }
375
376 /********************************* ITEM ACCESS *******************************/
377
378 /*
379 * Allocates a new item.
380 */
381 item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
382 item *it;
383 /* do_item_alloc handles its own locks */
384 it = do_item_alloc(key, nkey, flags, exptime, nbytes);
385 return it;
386 }
387
388 /*
389 * Returns an item if it hasn't been marked as expired,
390 * lazy-expiring as needed.
391 */
392 item *item_get(const char *key, const size_t nkey) {
393 item *it;
394 uint32_t hv;
395 hv = hash(key, nkey, 0);
396 item_lock(hv);
397 it = do_item_get(key, nkey, hv);
398 item_unlock(hv);
399 return it;
400 }
401
402 item *item_touch(const char *key, size_t nkey, uint32_t exptime) {
403 item *it;
404 uint32_t hv;
405 hv = hash(key, nkey, 0);
406 item_lock(hv);
407 it = do_item_touch(key, nkey, exptime, hv);
408 item_unlock(hv);
409 return it;
410 }
411
412 /*
413 * Links an item into the LRU and hashtable.
414 */
415 int item_link(item *item) {
416 int ret;
417 uint32_t hv;
418
419 hv = hash(ITEM_key(item), item->nkey, 0);
420 item_lock(hv);
421 ret = do_item_link(item, hv);
422 item_unlock(hv);
423 return ret;
424 }
425
426 /*
427 * Decrements the reference count on an item and adds it to the freelist if
428 * needed.
429 */
430 void item_remove(item *item) {
431 uint32_t hv;
432 hv = hash(ITEM_key(item), item->nkey, 0);
433
434 item_lock(hv);
435 do_item_remove(item);
436 item_unlock(hv);
437 }
438
/*
 * Replaces one item with another in the hashtable.
 * Unprotected by a mutex lock since the core server does not require
 * it to be thread-safe — the caller is expected to hold the relevant
 * item lock for hv already.
 */
int item_replace(item *old_it, item *new_it, const uint32_t hv) {
    return do_item_replace(old_it, new_it, hv);
}
447
448 /*
449 * Unlinks an item from the LRU and hashtable.
450 */
451 void item_unlink(item *item) {
452 uint32_t hv;
453 hv = hash(ITEM_key(item), item->nkey, 0);
454 item_lock(hv);
455 do_item_unlink(item, hv);
456 item_unlock(hv);
457 }
458
459 /*
460 * Moves an item to the back of the LRU queue.
461 */
462 void item_update(item *item) {
463 uint32_t hv;
464 hv = hash(ITEM_key(item), item->nkey, 0);
465
466 item_lock(hv);
467 do_item_update(item);
468 item_unlock(hv);
469 }
470
471 /*
472 * Does arithmetic on a numeric item value.
473 */
474 enum delta_result_type add_delta(conn *c, const char *key,
475 const size_t nkey, int incr,
476 const int64_t delta, char *buf,
477 uint64_t *cas) {
478 enum delta_result_type ret;
479 uint32_t hv;
480
481 hv = hash(key, nkey, 0);
482 item_lock(hv);
483 ret = do_add_delta(c, key, nkey, incr, delta, buf, cas, hv);
484 item_unlock(hv);
485 return ret;
486 }
487
488 /*
489 * Stores an item in the cache (high level, obeys set/add/replace semantics)
490 */
491 enum store_item_type store_item(item *item, int comm, conn* c) {
492 enum store_item_type ret;
493 uint32_t hv;
494
495 hv = hash(ITEM_key(item), item->nkey, 0);
496 item_lock(hv);
497 ret = do_store_item(item, comm, c, hv);
498 item_unlock(hv);
499 return ret;
500 }
501
/*
 * Flushes expired items after a flush_all call.
 * Takes the global cache_lock (not the striped item locks) because the
 * flush walks structures spanning all hash buckets.
 */
void item_flush_expired() {
    mutex_lock(&cache_lock);
    do_item_flush_expired();
    mutex_unlock(&cache_lock);
}
510
511 /*
512 * Dumps part of the cache
513 */
514 char *item_cachedump(unsigned int slabs_clsid, unsigned int limit, unsigned int *bytes) {
515 char *ret;
516
517 mutex_lock(&cache_lock);
518 ret = do_item_cachedump(slabs_clsid, limit, bytes);
519 mutex_unlock(&cache_lock);
520 return ret;
521 }
522
/*
 * Dumps statistics about slab classes via the add_stats callback,
 * under the global cache_lock.
 */
void item_stats(ADD_STAT add_stats, void *c) {
    mutex_lock(&cache_lock);
    do_item_stats(add_stats, c);
    mutex_unlock(&cache_lock);
}
531
/*
 * Dumps a list of objects of each size in 32-byte increments via the
 * add_stats callback, under the global cache_lock.
 */
void item_stats_sizes(ADD_STAT add_stats, void *c) {
    mutex_lock(&cache_lock);
    do_item_stats_sizes(add_stats, c);
    mutex_unlock(&cache_lock);
}
540
/******************************* GLOBAL STATS ******************************/

/* Acquire the global stats lock (stats_lock, initialized in thread_init). */
void STATS_LOCK() {
    pthread_mutex_lock(&stats_lock);
}

/* Release the global stats lock. */
void STATS_UNLOCK() {
    pthread_mutex_unlock(&stats_lock);
}
550
551 void threadlocal_stats_reset(void) {
552 int ii, sid;
553 for (ii = 0; ii < settings.num_threads; ++ii) {
554 pthread_mutex_lock(&threads[ii].stats.mutex);
555
556 threads[ii].stats.get_cmds = 0;
557 threads[ii].stats.get_misses = 0;
558 threads[ii].stats.touch_cmds = 0;
559 threads[ii].stats.touch_misses = 0;
560 threads[ii].stats.delete_misses = 0;
561 threads[ii].stats.incr_misses = 0;
562 threads[ii].stats.decr_misses = 0;
563 threads[ii].stats.cas_misses = 0;
564 threads[ii].stats.bytes_read = 0;
565 threads[ii].stats.bytes_written = 0;
566 threads[ii].stats.flush_cmds = 0;
567 threads[ii].stats.conn_yields = 0;
568 threads[ii].stats.auth_cmds = 0;
569 threads[ii].stats.auth_errors = 0;
570
571 for(sid = 0; sid < MAX_NUMBER_OF_SLAB_CLASSES; sid++) {
572 threads[ii].stats.slab_stats[sid].set_cmds = 0;
573 threads[ii].stats.slab_stats[sid].get_hits = 0;
574 threads[ii].stats.slab_stats[sid].touch_hits = 0;
575 threads[ii].stats.slab_stats[sid].delete_hits = 0;
576 threads[ii].stats.slab_stats[sid].incr_hits = 0;
577 threads[ii].stats.slab_stats[sid].decr_hits = 0;
578 threads[ii].stats.slab_stats[sid].cas_hits = 0;
579 threads[ii].stats.slab_stats[sid].cas_badval = 0;
580 }
581
582 pthread_mutex_unlock(&threads[ii].stats.mutex);
583 }
584 }
585
586 void threadlocal_stats_aggregate(struct thread_stats *stats) {
587 int ii, sid;
588
589 /* The struct has a mutex, but we can safely set the whole thing
590 * to zero since it is unused when aggregating. */
591 memset(stats, 0, sizeof(*stats));
592
593 for (ii = 0; ii < settings.num_threads; ++ii) {
594 pthread_mutex_lock(&threads[ii].stats.mutex);
595
596 stats->get_cmds += threads[ii].stats.get_cmds;
597 stats->get_misses += threads[ii].stats.get_misses;
598 stats->touch_cmds += threads[ii].stats.touch_cmds;
599 stats->touch_misses += threads[ii].stats.touch_misses;
600 stats->delete_misses += threads[ii].stats.delete_misses;
601 stats->decr_misses += threads[ii].stats.decr_misses;
602 stats->incr_misses += threads[ii].stats.incr_misses;
603 stats->cas_misses += threads[ii].stats.cas_misses;
604 stats->bytes_read += threads[ii].stats.bytes_read;
605 stats->bytes_written += threads[ii].stats.bytes_written;
606 stats->flush_cmds += threads[ii].stats.flush_cmds;
607 stats->conn_yields += threads[ii].stats.conn_yields;
608 stats->auth_cmds += threads[ii].stats.auth_cmds;
609 stats->auth_errors += threads[ii].stats.auth_errors;
610
611 for (sid = 0; sid < MAX_NUMBER_OF_SLAB_CLASSES; sid++) {
612 stats->slab_stats[sid].set_cmds +=
613 threads[ii].stats.slab_stats[sid].set_cmds;
614 stats->slab_stats[sid].get_hits +=
615 threads[ii].stats.slab_stats[sid].get_hits;
616 stats->slab_stats[sid].touch_hits +=
617 threads[ii].stats.slab_stats[sid].touch_hits;
618 stats->slab_stats[sid].delete_hits +=
619 threads[ii].stats.slab_stats[sid].delete_hits;
620 stats->slab_stats[sid].decr_hits +=
621 threads[ii].stats.slab_stats[sid].decr_hits;
622 stats->slab_stats[sid].incr_hits +=
623 threads[ii].stats.slab_stats[sid].incr_hits;
624 stats->slab_stats[sid].cas_hits +=
625 threads[ii].stats.slab_stats[sid].cas_hits;
626 stats->slab_stats[sid].cas_badval +=
627 threads[ii].stats.slab_stats[sid].cas_badval;
628 }
629
630 pthread_mutex_unlock(&threads[ii].stats.mutex);
631 }
632 }
633
634 void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out) {
635 int sid;
636
637 out->set_cmds = 0;
638 out->get_hits = 0;
639 out->touch_hits = 0;
640 out->delete_hits = 0;
641 out->incr_hits = 0;
642 out->decr_hits = 0;
643 out->cas_hits = 0;
644 out->cas_badval = 0;
645
646 for (sid = 0; sid < MAX_NUMBER_OF_SLAB_CLASSES; sid++) {
647 out->set_cmds += stats->slab_stats[sid].set_cmds;
648 out->get_hits += stats->slab_stats[sid].get_hits;
649 out->touch_hits += stats->slab_stats[sid].touch_hits;
650 out->delete_hits += stats->slab_stats[sid].delete_hits;
651 out->decr_hits += stats->slab_stats[sid].decr_hits;
652 out->incr_hits += stats->slab_stats[sid].incr_hits;
653 out->cas_hits += stats->slab_stats[sid].cas_hits;
654 out->cas_badval += stats->slab_stats[sid].cas_badval;
655 }
656 }
657
/*
 * Initializes the thread subsystem, creating various worker threads.
 *
 * nthreads  Number of worker event handler threads to spawn
 * main_base Event base for main thread
 *
 * Order matters: all mutexes, the item-lock table, and every worker's
 * libevent state (setup_thread) are initialized before any worker
 * thread is actually created. Blocks until all workers have signalled
 * init_cond. Exits the process on any allocation or pipe failure.
 */
void thread_init(int nthreads, struct event_base *main_base) {
    int i;
    int power;

    pthread_mutex_init(&cache_lock, NULL);
    pthread_mutex_init(&stats_lock, NULL);

    pthread_mutex_init(&init_lock, NULL);
    pthread_cond_init(&init_cond, NULL);

    pthread_mutex_init(&cqi_freelist_lock, NULL);
    cqi_freelist = NULL;

    /* Want a wide lock table, but don't waste memory */
    if (nthreads < 3) {
        power = 10;
    } else if (nthreads < 4) {
        power = 11;
    } else if (nthreads < 5) {
        power = 12;
    } else {
        /* 8192 buckets, and central locks don't scale much past 5 threads */
        power = 13;
    }

    /* Power of two so item_lock() can mask instead of mod. */
    item_lock_count = ((unsigned long int)1 << (power));
    item_lock_mask  = item_lock_count - 1;

    item_locks = calloc(item_lock_count, sizeof(pthread_mutex_t));
    if (! item_locks) {
        perror("Can't allocate item locks");
        exit(1);
    }
    /* NOTE(review): i is int vs uint32_t item_lock_count — harmless
     * here since power <= 13, but a signed/unsigned comparison. */
    for (i = 0; i < item_lock_count; i++) {
        pthread_mutex_init(&item_locks[i], NULL);
    }

    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD));
    if (! threads) {
        perror("Can't allocate thread descriptors");
        exit(1);
    }

    /* The calling thread becomes the dispatcher. */
    dispatcher_thread.base = main_base;
    dispatcher_thread.thread_id = pthread_self();

    for (i = 0; i < nthreads; i++) {
        int fds[2];
        if (pipe(fds)) {
            perror("Can't create notify pipe");
            exit(1);
        }

        threads[i].notify_receive_fd = fds[0];
        threads[i].notify_send_fd = fds[1];

        setup_thread(&threads[i]);
        /* Reserve three fds for the libevent base, and two for the pipe */
        stats.reserved_fds += 5;
    }

    /* Create threads after we've done all the libevent setup. */
    for (i = 0; i < nthreads; i++) {
        create_worker(worker_libevent, &threads[i]);
    }

    /* Wait for all the threads to set themselves up before returning. */
    pthread_mutex_lock(&init_lock);
    while (init_count < nthreads) {
        pthread_cond_wait(&init_cond, &init_lock);
    }
    pthread_mutex_unlock(&init_lock);
}
737