memcached/slabs.c
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
 * and are divided into chunks. The chunk sizes start off at the size of the
 * "item" structure plus space for a small key and value. They increase by
 * a multiplier factor from there, up to half the maximum slab size. The last
 * slab size is always 1MB, since that's the maximum item size allowed by the
 * memcached protocol.
 */
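/*
 * Illustrative sizing rule (a sketch of what slabs_init() below computes):
 * the first class holds chunks of sizeof(item) + settings.chunk_size bytes;
 * each subsequent class multiplies the previous size by the growth factor
 * and rounds it up to CHUNK_ALIGN_BYTES, stopping once a chunk would exceed
 * item_size_max / factor; the final class is a single chunk of item_size_max.
 */
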
#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;          /* sizes of items */
    unsigned int perslab;       /* how many items per slab */

    void *slots;                /* list of item ptrs */
    unsigned int sl_curr;       /* total free items in list */

    void *end_page_ptr;         /* pointer to next free item at end of page, or 0 */
    unsigned int end_page_free; /* number of items remaining at end of last alloced page */

    unsigned int slabs;         /* how many slabs were allocated for this class */

    void **slab_list;           /* array of slab pointers */
    unsigned int list_size;     /* size of prev array */

    unsigned int killing;       /* index+1 of dying slab, or zero if none */
    size_t requested;           /* The number of requested bytes */
} slabclass_t;
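
/*
 * Bookkeeping sketch: each class owns `slabs` pages, tracked in `slab_list`.
 * A page is carved into `perslab` chunks of `size` bytes apiece. Free chunks
 * come from two places: the `slots` freelist of previously freed items, and
 * the untouched tail of the most recently allocated page (`end_page_ptr` /
 * `end_page_free`).
 */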

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;
static size_t mem_malloced = 0;
static int power_largest;

static void *mem_base = NULL;
static void *mem_current = NULL;
static size_t mem_avail = 0;

/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Forward Declarations
 */
static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);

#ifndef DONT_PREALLOC_SLABS
/* Preallocate as many slab pages as possible (called from slabs_init)
   on start-up, so users don't get confusing out-of-memory errors when
   they do have free (in-slab) space, but no space to make new slabs.
   if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
   slab types can be made. if max memory is less than 18 MB, only the
   smaller ones will be made. */
static void slabs_preallocate (const unsigned int maxslabs);
#endif

/*
 * Figures out which slab class (chunk size) is required to store an item of
 * a given size.
 *
 * Given object size, return id to use when allocating/freeing memory for object
 * 0 means error: can't store such a large object
 */

unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}
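
/*
 * Illustrative caller sketch (not part of this file): the item allocation
 * path is expected to pair slabs_clsid() with slabs_alloc() roughly like
 * this, assuming `ntotal` is the full item size (header + key + value):
 *
 *     unsigned int id = slabs_clsid(ntotal);
 *     if (id == 0)
 *         return NULL;                  /- too large for any slab class -/
 *     item *it = slabs_alloc(ntotal, id);
 *
 * slabs_alloc() charges `ntotal` against the class's `requested` counter,
 * so the same size must later be passed to slabs_free().
 */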

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite: faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

#ifndef DONT_PREALLOC_SLABS
    {
        char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC");

        if (pre_alloc == NULL || atoi(pre_alloc) != 0) {
            slabs_preallocate(power_largest);
        }
    }
#endif
}

#ifndef DONT_PREALLOC_SLABS
static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    /* pre-allocate a 1MB slab in every size class so people don't get
       confused by non-intuitive "SERVER_ERROR out of memory"
       messages. this is the most common question on the mailing
       list. if you really don't want this, you can rebuild without
       these three lines. */

    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
        if (++prealloc > maxslabs)
            return;
        do_slabs_newslab(i);
    }

}
#endif

static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    if (p->slabs == p->list_size) {
        size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wsign-compare"
#endif

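/*
 * Allocate one more page for class `id`. Note that when slab_reassign is
 * enabled every page is allocated at the full item_size_max rather than
 * size * perslab, so pages stay interchangeable between classes when the
 * rebalancer later moves them.
 */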
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    p->end_page_ptr = ptr;
    p->end_page_free = p->perslab;

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

/*@null@*/
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

#ifdef USE_SYSTEM_MALLOC
    if (mem_limit && mem_malloced + size > mem_limit) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
        return 0;
    }
    mem_malloced += size;
    ret = malloc(size);
    MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret);
    return ret;
#endif

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->end_page_ptr != 0 || p->sl_curr != 0 ||
           do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    } else {
        /* if we recently allocated a whole page, return from that */
        assert(p->end_page_ptr != NULL);
        ret = p->end_page_ptr;
        if (--p->end_page_free != 0) {
            p->end_page_ptr = ((caddr_t)p->end_page_ptr) + p->size;
        } else {
            p->end_page_ptr = 0;
        }
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

#ifdef USE_SYSTEM_MALLOC
    mem_malloced -= size;
    free(ptr);
    return;
#endif

    it = (item *)ptr;
    it->it_flags |= ITEM_SLABBED;
    it->prev = 0;
    it->next = p->slots;
    if (it->next) it->next->prev = it;
    p->slots = it;

    p->sl_curr++;
    p->requested -= size;
    return;
}

static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength=strlen(z);
    return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
}

bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
            APPEND_STAT("curr_items", "%u", stats.curr_items);
            APPEND_STAT("total_items", "%u", stats.total_items);
            APPEND_STAT("evictions", "%llu",
                        (unsigned long long)stats.evictions);
            APPEND_STAT("reclaimed", "%llu",
                        (unsigned long long)stats.reclaimed);
            STATS_UNLOCK();
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}

/*@null@*/
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr - p->end_page_free);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", p->end_page_free);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

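/*
 * Hand out `size` bytes either from the system allocator or, when the memory
 * was preallocated up front in slabs_init(), by carving it off the big chunk.
 * In the preallocated case the request is rounded up to CHUNK_ALIGN_BYTES so
 * mem_current stays aligned, and the memory is never handed back.
 */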
static void *memory_allocate(size_t size) {
    void *ret;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }

    return ret;
}

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
{
    pthread_mutex_lock(&slabs_lock);
    slabclass_t *p;
    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    p = &slabclass[id];
    p->requested = p->requested - old + ntotal;
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;

#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    if (d_cls->end_page_ptr || s_cls->end_page_ptr ||
        !grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos = slab_rebal.slab_start;
    slab_rebal.done = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}

enum move_status {
    MOVE_PASS=0, MOVE_DONE, MOVE_BUSY
};

/* refcount == 0 is safe since nobody can incr while cache_lock is held.
 * refcount != 0 is impossible since flags/etc can be modified in other
 * threads. instead, note we found a busy one and bail. logic in do_item_get
 * will prevent busy items from continuing to be busy
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            refcount = refcount_incr(&it->refcount);
            if (refcount == 1) { /* item is unlinked, unused */
                if (it->it_flags & ITEM_SLABBED) {
                    /* remove from slab freelist */
                    if (s_cls->slots == it) {
                        s_cls->slots = it->next;
                    }
                    if (it->next) it->next->prev = it->prev;
                    if (it->prev) it->prev->next = it->next;
                    s_cls->sl_curr--;
                    status = MOVE_DONE;
                } else {
                    status = MOVE_BUSY;
                }
            } else if (refcount == 2) { /* item is linked but not busy */
                if ((it->it_flags & ITEM_LINKED) != 0) {
                    do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
                    status = MOVE_DONE;
                } else {
                    /* refcount == 1 + !ITEM_LINKED means the item is being
                     * uploaded to, or was just unlinked but hasn't been freed
                     * yet. Let it bleed off on its own and try again later */
                    status = MOVE_BUSY;
                }
            } else {
                if (settings.verbose > 2) {
                    fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                        it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                }
                status = MOVE_BUSY;
            }
        }

        switch (status) {
            case MOVE_DONE:
                it->refcount = 0;
                it->it_flags = 0;
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
                slab_rebal.busy_items++;
                was_busy++;
                refcount_decr(&it->refcount);
                break;
            case MOVE_PASS:
                break;
            default:
                assert(false);
                abort();
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}

static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    s_cls->slab_list[s_cls->killing - 1] =
        s_cls->slab_list[s_cls->slabs - 1];
    s_cls->slabs--;
    s_cls->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    d_cls->end_page_ptr = slab_rebal.slab_start;
    d_cls->end_page_free = d_cls->perslab;

    slab_rebal.done = 0;
    slab_rebal.s_clsid = 0;
    slab_rebal.d_clsid = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end = NULL;
    slab_rebal.slab_pos = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/* Return 1 means a decision was reached.
 * Move to its own thread (created/destroyed as needed) once automover is more
 * complex.
 */
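/* A concrete (hypothetical) reading of the heuristic below: the check runs at
 * most once every 10 seconds; a class that has had zero evictions for three
 * consecutive checks while holding more than two pages becomes the source,
 * and the class with the most evictions in three consecutive checks becomes
 * the destination. Only when both exist is a move suggested.
 */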
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins = 0;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_diff = 0;
    uint64_t evicted_max = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int i;
    int source = 0;
    int dest = 0;
    static rel_time_t next_run;

    /* Run less frequently than the slabmove tester. */
    if (current_time >= next_run) {
        next_run = current_time + 10;
    } else {
        return 0;
    }

    item_stats_evictions(evicted_new);
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        evicted_diff = evicted_new[i] - evicted_old[i];
        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (source && dest) {
        *src = source;
        *dst = dest;
        return 1;
    }
    return 0;
}

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
/* Slab rebalancer thread.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended
 */
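/* slab_rebalance_signal acts as a small state machine for this thread:
 * 0 = idle, 1 = a reassign was requested (set in do_slabs_reassign),
 * 2 = a move is in progress (set by slab_rebalance_start, which also makes
 * do_item_get check items in the victim slab). slab_rebal.done flags that
 * the victim page is empty and can be handed to the destination class.
 */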
static void *slab_maintenance_thread(void *arg) {
    int was_busy = 0;
    int src, dest;

    while (do_run_slab_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            /* If we have a decision to continue, continue it */
            was_busy = slab_rebalance_move();
        } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) {
            /* Blind to the return codes. It will retry on its own */
            slabs_reassign(src, dest);
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        }

        /* Sleep a bit if no work to do, or waiting on busy objects */
        if (was_busy || !slab_rebalance_signal)
            sleep(1);
    }
    return NULL;
}

static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    if (src < POWER_SMALLEST || src > power_largest ||
        dst < POWER_SMALLEST || dst > power_largest)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    if (slabclass[dst].end_page_ptr)
        return REASSIGN_DEST_NOT_FULL;

    if (slabclass[src].end_page_ptr)
        return REASSIGN_SRC_NOT_SAFE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;

    return REASSIGN_OK;
}

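/* Public entry point for moving a page between classes. The automover above
 * calls this, and (presumably) the "slabs reassign" admin command is wired to
 * it from memcached.c. The enum return explains why a move could not be
 * started.
 */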
enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    mutex_lock(&slabs_lock);
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

static pthread_t maintenance_tid;

int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }
    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}

void stop_slab_maintenance_thread(void) {
    mutex_lock(&cache_lock);
    do_run_slab_thread = 0;
    pthread_cond_signal(&maintenance_cond);
    pthread_mutex_unlock(&cache_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(maintenance_tid, NULL);
}