Update memcached, fix style in test cases.
m6w6/libmemcached: memcached/slabs.c
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
 * and are divided into chunks. The chunk sizes start off at the size of the
 * "item" structure plus space for a small key and value. They increase by
 * a multiplier factor from there, up to half the maximum slab size. The last
 * slab size is always 1MB, since that's the maximum item size allowed by the
 * memcached protocol.
 */
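/*
 * Illustrative sizing walk-through (assuming CHUNK_ALIGN_BYTES == 8 and a
 * growth factor of 1.25): if sizeof(item) + settings.chunk_size comes to
 * 96 bytes, the classes run 96, 120, 152, 192, ... with each size rounded
 * up to the alignment boundary, until the largest class reaches
 * settings.item_size_max (1MB by default).
 */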
#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;          /* sizes of items */
    unsigned int perslab;       /* how many items per slab */

    void *slots;                /* list of item ptrs */
    unsigned int sl_curr;       /* total free items in list */

    void *end_page_ptr;         /* pointer to next free item at end of page, or 0 */
    unsigned int end_page_free; /* number of items remaining at end of last alloced page */

    unsigned int slabs;         /* how many slabs were allocated for this class */

    void **slab_list;           /* array of slab pointers */
    unsigned int list_size;     /* size of prev array */

    unsigned int killing;       /* index+1 of dying slab, or zero if none */
    size_t requested;           /* The number of requested bytes */
} slabclass_t;

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;
static size_t mem_malloced = 0;
static int power_largest;

static void *mem_base = NULL;
static void *mem_current = NULL;
static size_t mem_avail = 0;

/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Forward Declarations
 */
static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);

#ifndef DONT_PREALLOC_SLABS
/* Preallocate as many slab pages as possible (called from slabs_init)
   on start-up, so users don't get confusing out-of-memory errors when
   they do have free (in-slab) space, but no space to make new slabs.
   If maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
   slab types can be made. If max memory is less than 18 MB, only the
   smaller ones will be made. */
static void slabs_preallocate (const unsigned int maxslabs);
#endif

/*
 * Figures out which slab class (chunk size) is required to store an item of
 * a given size.
 *
 * Given an object size, returns the id to use when allocating/freeing
 * memory for that object. 0 means error: can't store such a large object.
 */

unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}
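
/* Example (illustrative, using the sizing sketched above): a 100-byte item
 * lands in the first class whose chunk size is >= 100 bytes (the 120-byte
 * class there); anything larger than the biggest class yields clsid 0. */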

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite: faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }
    }

#ifndef DONT_PREALLOC_SLABS
    {
        char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC");

        if (pre_alloc == NULL || atoi(pre_alloc) != 0) {
            slabs_preallocate(power_largest);
        }
    }
#endif
}
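
/* Called once at start-up; in memcached's main() the call is roughly
 * slabs_init(settings.maxbytes, settings.factor, preallocate)
 * (illustrative; the exact call site lives in memcached.c). */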

#ifndef DONT_PREALLOC_SLABS
static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    /* pre-allocate a 1MB slab in every size class so people don't get
       confused by non-intuitive "SERVER_ERROR out of memory"
       messages. this is the most common question on the mailing
       list. if you really don't want this, you can rebuild without
       these three lines. */

    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
        if (++prealloc > maxslabs)
            return;
        do_slabs_newslab(i);
    }
}
#endif

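/* Grow a class's page-pointer array by doubling (starting at 16 entries),
 * so adding pages stays amortized O(1). Returns 1 on success, 0 on OOM. */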
static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    if (p->slabs == p->list_size) {
        size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wsign-compare"
#endif

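/* Allocate a fresh page for class id. When slab_reassign is enabled, every
 * page is allocated at the full item_size_max so pages remain
 * interchangeable between classes for the rebalancer below. */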
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    p->end_page_ptr = ptr;
    p->end_page_free = p->perslab;

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

/*@null@*/
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

#ifdef USE_SYSTEM_MALLOC
    if (mem_limit && mem_malloced + size > mem_limit) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
        return 0;
    }
    mem_malloced += size;
    ret = malloc(size);
    MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret);
    return ret;
#endif

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->end_page_ptr != 0 || p->sl_curr != 0 ||
           do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    } else {
        /* if we recently allocated a whole page, return from that */
        assert(p->end_page_ptr != NULL);
        ret = p->end_page_ptr;
        if (--p->end_page_free != 0) {
            p->end_page_ptr = ((caddr_t)p->end_page_ptr) + p->size;
        } else {
            p->end_page_ptr = 0;
        }
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

#ifdef USE_SYSTEM_MALLOC
    mem_malloced -= size;
    free(ptr);
    return;
#endif

    it = (item *)ptr;
    it->it_flags |= ITEM_SLABBED;
    it->prev = 0;
    it->next = p->slots;
    if (it->next) it->next->prev = it;
    p->slots = it;

    p->sl_curr++;
    p->requested -= size;
    return;
}

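/* Compare a length-delimited (not null-terminated) token against a C
 * string; returns 0 on an exact match, -1 otherwise (strcmp-style). */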
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength = strlen(z);
    return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
}

bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
            APPEND_STAT("curr_items", "%u", stats.curr_items);
            APPEND_STAT("total_items", "%u", stats.total_items);
            APPEND_STAT("evictions", "%llu",
                        (unsigned long long)stats.evictions);
            APPEND_STAT("reclaimed", "%llu",
                        (unsigned long long)stats.reclaimed);
            STATS_UNLOCK();
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}

/*@null@*/
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for (i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs * perslab - p->sl_curr - p->end_page_free);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", p->end_page_free);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

static void *memory_allocate(size_t size) {
    void *ret;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        /* The mem_current pointer _must_ stay aligned, so round the
           request up to CHUNK_ALIGN_BYTES *before* checking it against
           the remaining space; checking first could hand out a block
           that overruns mem_avail once aligned. */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        if (size > mem_avail) {
            return NULL;
        }

        ret = mem_current;
        mem_current = ((char*)mem_current) + size;
        mem_avail -= size;
    }

    return ret;
}

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
{
    slabclass_t *p;

    pthread_mutex_lock(&slabs_lock);
    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    p = &slabclass[id];
    p->requested = p->requested - old + ntotal;
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;

#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

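/* slab_rebalance_signal protocol, as used below: 1 = a reassign was
 * requested, 2 = a page move is in flight (do_item_get must also check
 * the victim page), 0 = idle. */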
static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest ||
        slab_rebal.s_clsid == slab_rebal.d_clsid) {
        /* Bail out before slabclass[] is indexed with an invalid id */
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return -2;
    }

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    if (d_cls->end_page_ptr || s_cls->end_page_ptr ||
        !grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos = slab_rebal.slab_start;
    slab_rebal.done = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}

enum move_status {
    MOVE_PASS=0, MOVE_DONE, MOVE_BUSY
};

/* refcount == 0 is safe since nobody can incr while cache_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 */
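/* Note: slabs_clsid == 255 marks chunks already rescued by an earlier
 * pass over this page (set in the MOVE_DONE case below). */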
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            refcount = refcount_incr(&it->refcount);
            if (refcount == 1) { /* item is unlinked, unused */
                if (it->it_flags & ITEM_SLABBED) {
                    /* remove from slab freelist */
                    if (s_cls->slots == it) {
                        s_cls->slots = it->next;
                    }
                    if (it->next) it->next->prev = it->prev;
                    if (it->prev) it->prev->next = it->next;
                    s_cls->sl_curr--;
                    status = MOVE_DONE;
                } else {
                    status = MOVE_BUSY;
                }
            } else if (refcount == 2) { /* item is linked but not busy */
                if ((it->it_flags & ITEM_LINKED) != 0) {
                    do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
                    status = MOVE_DONE;
                } else {
                    /* refcount == 1 + !ITEM_LINKED means the item is being
                     * uploaded to, or was just unlinked but hasn't been freed
                     * yet. Let it bleed off on its own and try again later */
                    status = MOVE_BUSY;
                }
            } else {
                if (settings.verbose > 2) {
                    fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                            it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                }
                status = MOVE_BUSY;
            }
        }

        switch (status) {
        case MOVE_DONE:
            it->refcount = 0;
            it->it_flags = 0;
            it->slabs_clsid = 255;
            break;
        case MOVE_BUSY:
            slab_rebal.busy_items++;
            was_busy++;
            refcount_decr(&it->refcount);
            break;
        case MOVE_PASS:
            break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}

static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    s_cls->slab_list[s_cls->killing - 1] =
        s_cls->slab_list[s_cls->slabs - 1];
    s_cls->slabs--;
    s_cls->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    d_cls->end_page_ptr = slab_rebal.slab_start;
    d_cls->end_page_free = d_cls->perslab;

    slab_rebal.done = 0;
    slab_rebal.s_clsid = 0;
    slab_rebal.d_clsid = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end = NULL;
    slab_rebal.slab_pos = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/* Returns 1 if a decision was reached.
 * Move to its own thread (created/destroyed as needed) once automover is more
 * complex.
 */
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins = 0;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_diff = 0;
    uint64_t evicted_max = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int i;
    int source = 0;
    int dest = 0;
    static rel_time_t next_run;

    /* Run less frequently than the slabmove tester. */
    if (current_time >= next_run) {
        next_run = current_time + 10;
    } else {
        return 0;
    }

    item_stats_evictions(evicted_new);
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        evicted_diff = evicted_new[i] - evicted_old[i];
        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (source && dest) {
        *src = source;
        *dst = dest;
        return 1;
    }
    return 0;
}
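
/* In practice (given the 10-second cadence above): a class holding 3+ pages
 * that records zero evictions for three consecutive windows becomes the
 * source, and a class that has topped the eviction count for three
 * consecutive windows becomes the destination. */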

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
/* Slab rebalancer thread.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended
 */
static void *slab_maintenance_thread(void *arg) {
    int was_busy = 0;
    int src, dest;

    while (do_run_slab_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            /* If we have a decision to continue, continue it */
            was_busy = slab_rebalance_move();
        } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) {
            /* Blind to the return codes. It will retry on its own */
            slabs_reassign(src, dest);
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        }

        /* Sleep a bit if no work to do, or waiting on busy objects */
        if (was_busy || !slab_rebalance_signal)
            sleep(1);
    }
    return NULL;
}

static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    if (src < POWER_SMALLEST || src > power_largest ||
        dst < POWER_SMALLEST || dst > power_largest)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    if (slabclass[dst].end_page_ptr)
        return REASSIGN_DEST_NOT_FULL;

    if (slabclass[src].end_page_ptr)
        return REASSIGN_SRC_NOT_SAFE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;

    return REASSIGN_OK;
}

enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    mutex_lock(&slabs_lock);
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}
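
/* slabs_reassign is driven both by the "slabs reassign <src> <dst>"
 * protocol command and by the automover decision above; either path just
 * sets slab_rebal and raises slab_rebalance_signal for the thread below. */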

static pthread_t maintenance_tid;

int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }
    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}

void stop_slab_maintenance_thread(void) {
    mutex_lock(&cache_lock);
    do_run_slab_thread = 0;
    pthread_cond_signal(&maintenance_cond);
    pthread_mutex_unlock(&cache_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(maintenance_tid, NULL);
}