Update hardening rules.
[awesomized/libmemcached] memcached/slabs.c
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
 * and are divided into chunks. The chunk sizes start off at the size of the
 * "item" structure plus space for a small key and value. They increase by
 * a multiplier factor from there, up to half the maximum slab size. The last
 * slab size is always 1MB, since that's the maximum item size allowed by the
 * memcached protocol.
 */
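
/*
 * As a rough example: with typical default settings (a 48 byte minimum
 * chunk payload, a 1.25 growth factor and a 1MB maximum item size) the
 * loop in slabs_init() below produces a ladder of chunk sizes along the
 * lines of 96, 120, 152, 192, ... bytes, with the final class pinned at
 * settings.item_size_max so it holds exactly one chunk per page. The
 * exact numbers depend on sizeof(item), settings.chunk_size, the factor
 * passed in and CHUNK_ALIGN_BYTES for the build in question.
 */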
#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

/* powers-of-N allocation structures */

typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;            /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;   /* index+1 of dying slab, or zero if none */
    size_t requested;       /* The number of requested bytes */
} slabclass_t;
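
/*
 * While a chunk sits unused it doubles as its own freelist node: "slots"
 * points at the first free chunk, and do_slabs_free()/do_slabs_alloc()
 * below thread the list through the item's next/prev pointers. The
 * asserts in those functions document that a chunk on the freelist has
 * slabs_clsid reset to 0 (and ITEM_SLABBED set once it has been freed).
 */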

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;
static size_t mem_malloced = 0;
static int power_largest;

static void *mem_base = NULL;
static void *mem_current = NULL;
static size_t mem_avail = 0;

/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Forward Declarations
 */
static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);

/* Preallocate as many slab pages as possible (called from slabs_init)
   on start-up, so users don't get confusing out-of-memory errors when
   they do have free (in-slab) space, but no space to make new slabs.
   If maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
   slab types can be made. If max memory is less than 18 MB, only the
   smaller ones will be made. */
static void slabs_preallocate (const unsigned int maxslabs);

/*
 * Figures out which slab class (chunk size) is required to store an item of
 * a given size.
 *
 * Given object size, return id to use when allocating/freeing memory for object.
 * 0 means error: can't store such a large object.
 */

unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}
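
/*
 * For example, with a 96/120/152/... ladder, slabs_clsid(100) skips the
 * 96 byte class and returns the id of the 120 byte class, the smallest
 * chunk that fits; anything larger than the biggest class yields 0.
 * A caller sizing a new item typically does something like (sketch only,
 * names other than slabs_clsid()/slabs_alloc() are illustrative):
 *
 *     unsigned int id = slabs_clsid(ntotal);
 *     if (id == 0)
 *         return NULL;            // too large for any slab class
 *     it = slabs_alloc(ntotal, id);
 */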

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite: faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}

static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    /* pre-allocate a 1MB slab in every size class so people don't get
       confused by non-intuitive "SERVER_ERROR out of memory"
       messages. this is the most common question on the mailing
       list. if you really don't want this, you can rebuild without
       these three lines. */

    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
        if (++prealloc > maxslabs)
            return;
        if (do_slabs_newslab(i) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                "If using -L or other prealloc options, max memory must be "
                "at least %d megabytes.\n", power_largest);
            exit(1);
        }
    }

}

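/*
 * grow_slab_list() keeps slab_list large enough to record every page the
 * class owns: the array starts at 16 entries and is doubled with realloc()
 * whenever it fills up. It returns 0 only if realloc() fails, which the
 * callers treat as an allocation failure for the whole operation.
 */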
static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    if (p->slabs == p->list_size) {
        size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}

static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id);
        ptr += p->size;
    }
}

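/*
 * do_slabs_newslab() adds one more page to a class. With slab_reassign
 * enabled every page is allocated at settings.item_size_max so pages stay
 * interchangeable between classes; otherwise the page is sized to exactly
 * size * perslab. The call fails (and fires the ALLOCATE_FAILED probe) if
 * it would push mem_malloced past mem_limit while the class already owns a
 * page, if the slab list cannot grow, or if the underlying allocation fails.
 */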
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

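/*
 * do_slabs_alloc() services a single chunk request: if the freelist is
 * empty it first tries to pull in a fresh page via do_slabs_newslab(), then
 * pops the head of the freelist and charges the caller's requested size to
 * p->requested (reported later as "mem_requested"). NULL means either an
 * invalid class id or no memory. Callers outside this file are expected to
 * use the locking wrapper slabs_alloc() further down instead.
 */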
/*@null@*/
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

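/*
 * do_slabs_free() is the inverse: it marks the chunk ITEM_SLABBED, pushes
 * it onto the head of the class freelist through the item's next/prev
 * pointers, and subtracts the originally requested size from p->requested.
 * The asserts capture the contract that callers hand back chunks with
 * slabs_clsid already cleared and a valid class id.
 */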
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

    it = (item *)ptr;
    it->it_flags |= ITEM_SLABBED;
    it->prev = 0;
    it->next = p->slots;
    if (it->next) it->next->prev = it;
    p->slots = it;

    p->sl_curr++;
    p->requested -= size;
    return;
}

static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength = strlen(z);
    return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
}

bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
            APPEND_STAT("curr_items", "%u", stats.curr_items);
            APPEND_STAT("total_items", "%u", stats.total_items);
            APPEND_STAT("evictions", "%llu",
                        (unsigned long long)stats.evictions);
            APPEND_STAT("reclaimed", "%llu",
                        (unsigned long long)stats.reclaimed);
            STATS_UNLOCK();
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}

/*@null@*/
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

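/*
 * memory_allocate() is the lowest layer: when slabs_init() managed to grab
 * the whole cache as one big block, pieces are carved out of that arena and
 * rounded up to CHUNK_ALIGN_BYTES, with mem_avail tracking what is left; a
 * request larger than the remainder returns NULL rather than spilling over
 * into malloc(). Without the preallocated arena it simply forwards to
 * malloc().
 */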
static void *memory_allocate(size_t size) {
    void *ret;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }

    return ret;
}

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
{
    pthread_mutex_lock(&slabs_lock);
    slabclass_t *p;
    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    p = &slabclass[id];
    p->requested = p->requested - old + ntotal;
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;
static volatile int do_run_slab_rebalance_thread = 1;

#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

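/*
 * slab_rebalance_start() begins moving one page from slab_rebal.s_clsid to
 * slab_rebal.d_clsid. The ids were already range-checked in
 * do_slabs_reassign() before the signal was raised; the checks here are a
 * defensive repeat. The victim page is slab_list[0] of the source class
 * (killing stores index+1), and slab_rebalance_signal is bumped to 2 so
 * do_item_get knows items inside that page are being evacuated.
 */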
static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    s_cls = &slabclass[slab_rebal.s_clsid];

    if (!grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos = slab_rebal.slab_start;
    slab_rebal.done = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}

enum move_status {
    MOVE_PASS=0, MOVE_DONE, MOVE_BUSY
};
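
/*
 * Per-chunk outcome for one step of slab_rebalance_move(): MOVE_PASS means
 * the chunk needed no work, MOVE_DONE means it was reclaimed for the new
 * owner, MOVE_BUSY means another thread still holds a reference and the
 * sweep will revisit it on a later pass.
 */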

/* refcount == 0 is safe since nobody can incr while cache_lock is held.
 * refcount != 0 items can't be handled here, since their flags/etc can be
 * modified by other threads; instead, note we found a busy one and bail.
 * Logic in do_item_get will prevent busy items from continuing to be busy.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            refcount = refcount_incr(&it->refcount);
            if (refcount == 1) { /* item is unlinked, unused */
                if (it->it_flags & ITEM_SLABBED) {
                    /* remove from slab freelist */
                    if (s_cls->slots == it) {
                        s_cls->slots = it->next;
                    }
                    if (it->next) it->next->prev = it->prev;
                    if (it->prev) it->prev->next = it->next;
                    s_cls->sl_curr--;
                    status = MOVE_DONE;
                } else {
                    status = MOVE_BUSY;
                }
            } else if (refcount == 2) { /* item is linked but not busy */
                if ((it->it_flags & ITEM_LINKED) != 0) {
                    do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
                    status = MOVE_DONE;
                } else {
                    /* refcount == 1 + !ITEM_LINKED means the item is being
                     * uploaded to, or was just unlinked but hasn't been freed
                     * yet. Let it bleed off on its own and try again later */
                    status = MOVE_BUSY;
                }
            } else {
                if (settings.verbose > 2) {
                    fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                        it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                }
                status = MOVE_BUSY;
            }
        }

        switch (status) {
            case MOVE_DONE:
                it->refcount = 0;
                it->it_flags = 0;
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
                slab_rebal.busy_items++;
                was_busy++;
                refcount_decr(&it->refcount);
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}

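/*
 * Once every chunk in the victim page has been reclaimed,
 * slab_rebalance_finish() detaches the page from the source class (the last
 * slab_list entry is moved into the vacated slot), wipes the page, appends
 * it to the destination class and re-splits it into that class's freelist,
 * then clears the rebalance state and drops the signal.
 */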
static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    s_cls->slab_list[s_cls->killing - 1] =
        s_cls->slab_list[s_cls->slabs - 1];
    s_cls->slabs--;
    s_cls->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    split_slab_page_into_freelist(slab_rebal.slab_start,
        slab_rebal.d_clsid);

    slab_rebal.done = 0;
    slab_rebal.s_clsid = 0;
    slab_rebal.d_clsid = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end = NULL;
    slab_rebal.slab_pos = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/* Return 1 means a decision was reached.
 * Move to its own thread (created/destroyed as needed) once automover is more
 * complex.
 */
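/*
 * The heuristic, evaluated at most once every 10 seconds: a class whose
 * eviction count has not moved for three consecutive checks while it holds
 * more than two pages becomes the source, and the class with the largest
 * eviction delta becomes the destination once it has "won" three checks in
 * a row. A decision is only reported when both exist.
 */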
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins = 0;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_diff = 0;
    uint64_t evicted_max = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int i;
    int source = 0;
    int dest = 0;
    static rel_time_t next_run;

    /* Run less frequently than the slabmove tester. */
    if (current_time >= next_run) {
        next_run = current_time + 10;
    } else {
        return 0;
    }

    item_stats_evictions(evicted_new);
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        evicted_diff = evicted_new[i] - evicted_old[i];
        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (source && dest) {
        *src = source;
        *dst = dest;
        return 1;
    }
    return 0;
}

/* Slab rebalancer thread.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended
 */
static void *slab_maintenance_thread(void *arg) {
    int src, dest;

    while (do_run_slab_thread) {
        if (settings.slab_automove == 1) {
            if (slab_automove_decision(&src, &dest) == 1) {
                /* Blind to the return codes. It will retry on its own */
                slabs_reassign(src, dest);
            }
            sleep(1);
        } else {
            /* Don't wake as often if we're not enabled.
             * This is lazier than setting up a condition right now. */
            sleep(5);
        }
    }
    return NULL;
}

/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }
    return NULL;
}

/* Iterate at most once through the slab classes and pick a "random" source.
 * I like this better than calling rand() since rand() is slow enough that we
 * can just check all of the classes once instead.
 */
static int slabs_reassign_pick_any(int dst) {
    static int cur = POWER_SMALLEST - 1;
    int tries = power_largest - POWER_SMALLEST + 1;
    for (; tries > 0; tries--) {
        cur++;
        if (cur > power_largest)
            cur = POWER_SMALLEST;
        if (cur == dst)
            continue;
        if (slabclass[cur].slabs > 1) {
            return cur;
        }
    }
    return -1;
}

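/*
 * do_slabs_reassign() validates a reassignment request and hands it to the
 * rebalance thread by setting slab_rebalance_signal and signalling
 * slab_rebalance_cond; src == -1 asks slabs_reassign_pick_any() to choose a
 * donor. Requests are refused while a move is already running, when src and
 * dst are equal or out of range, or when the source class has no spare page.
 * The public entry point is slabs_reassign() below, used by the automover
 * and, in builds with slab reassignment enabled, typically also reachable
 * from the "slabs reassign <src> <dst>" text protocol command, whose
 * dispatch lives outside this file.
 */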
static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    /* Special indicator to choose ourselves. */
    if (src == -1) {
        src = slabs_reassign_pick_any(dst);
        /* TODO: If we end up back at -1, return a new error type */
    }

    if (src < POWER_SMALLEST || src > power_largest ||
        dst < POWER_SMALLEST || dst > power_largest)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;
    pthread_cond_signal(&slab_rebalance_cond);

    return REASSIGN_OK;
}

enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
        return REASSIGN_RUNNING;
    }
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_rebalance_lock);
    return ret;
}

static pthread_t maintenance_tid;
static pthread_t rebalance_tid;

int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }

    if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
        fprintf(stderr, "Can't initialize rebalance condition\n");
        return -1;
    }
    pthread_mutex_init(&slabs_rebalance_lock, NULL);

    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
        return -1;
    }
    if ((ret = pthread_create(&rebalance_tid, NULL,
                              slab_rebalance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}

void stop_slab_maintenance_thread(void) {
    mutex_lock(&cache_lock);
    do_run_slab_thread = 0;
    do_run_slab_rebalance_thread = 0;
    pthread_cond_signal(&maintenance_cond);
    pthread_mutex_unlock(&cache_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(maintenance_tid, NULL);
    pthread_join(rebalance_tid, NULL);
}