58fc871787aae955ca21243b50c5ab26a3db5310
[awesomized/libmemcached] / memcached / items.c
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 #include "memcached.h"
3 #include <sys/stat.h>
4 #include <sys/socket.h>
5 #include <sys/signal.h>
6 #include <sys/resource.h>
7 #include <fcntl.h>
8 #include <netinet/in.h>
9 #include <errno.h>
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <string.h>
13 #include <time.h>
14 #include <assert.h>
15
16 /* Forward Declarations */
17 static void item_link_q(item *it);
18 static void item_unlink_q(item *it);
19
20 /*
21 * We only reposition items in the LRU queue if they haven't been repositioned
22 * in this many seconds. That saves us from churning on frequently-accessed
23 * items.
24 */
25 #define ITEM_UPDATE_INTERVAL 60
26
27 #define LARGEST_ID POWER_LARGEST
28 typedef struct {
29 uint64_t evicted;
30 uint64_t evicted_nonzero;
31 rel_time_t evicted_time;
32 uint64_t reclaimed;
33 uint64_t outofmemory;
34 uint64_t tailrepairs;
35 uint64_t expired_unfetched;
36 uint64_t evicted_unfetched;
37 } itemstats_t;
38
39 static item *heads[LARGEST_ID];
40 static item *tails[LARGEST_ID];
41 static itemstats_t itemstats[LARGEST_ID];
42 static unsigned int sizes[LARGEST_ID];
43
/* Zeroes every per-slab-class itemstats counter, under cache_lock.
 * (mutex_lock is the project's lock wrapper; unlock is the raw pthread call,
 * matching the convention used throughout this file.) */
void item_stats_reset(void) {
    mutex_lock(&cache_lock);
    memset(itemstats, 0, sizeof(itemstats));
    pthread_mutex_unlock(&cache_lock);
}
49
50
/* Get the next CAS id for a new item.
 * NOTE(review): cas_id is a plain (non-atomic, unlocked) static counter;
 * this is only safe if all callers serialize on cache_lock — the one caller
 * visible here (do_item_link) does hold it. Confirm for any other callers. */
uint64_t get_cas_id(void) {
    static uint64_t cas_id = 0;
    return ++cas_id;
}
56
57 /* Enable this for reference-count debugging. */
58 #if 0
59 # define DEBUG_REFCNT(it,op) \
60 fprintf(stderr, "item %x refcnt(%c) %d %c%c%c\n", \
61 it, op, it->refcount, \
62 (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
63 (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
64 #else
65 # define DEBUG_REFCNT(it,op) while(0)
66 #endif
67
/**
 * Generates the variable-sized part of the header for an object.
 *
 * key    - The key
 * nkey   - The length of the key
 * flags  - key flags
 * nbytes - Number of bytes to hold value and addition CRLF terminator
 * suffix - Buffer for the "VALUE" line suffix (flags, size).
 * nsuffix- The length of the suffix is stored here.
 *
 * Returns the total size of the header: item struct + key + suffix + data.
 *
 * NOTE(review): the 40-byte bound is hard-coded to match the suffix buffers
 * declared by callers (see do_item_alloc/item_size_ok). If snprintf ever
 * truncated, *nsuffix would be the untruncated (would-be) length — presumed
 * unreachable with 32-bit flags/nbytes, but worth confirming.
 */
static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
                               char *suffix, uint8_t *nsuffix) {
    /* suffix is defined at 40 chars elsewhere.. */
    *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
    return sizeof(item) + nkey + *nsuffix + nbytes;
}
86
/*@null@*/
/**
 * Allocates (or reclaims/evicts to obtain) an item big enough for the given
 * key/suffix/value, and initializes its header fields. Returns NULL on
 * out-of-memory (or if the total size maps to no slab class).
 *
 * The returned item is NOT linked; the caller holds the single reference
 * (refcount == 1). Takes and releases cache_lock internally.
 */
item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    /* nkey + 1 reserves room for a NUL terminator after the key. */
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    item *search;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* refcount_incr == 2 means only the LRU held a reference, i.e. no other
     * thread is using the tail item, so it is safe to reclaim or evict. */
    if (search != NULL && (refcount_incr(&search->refcount) == 2)) {
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) { // dead by flush
            /* Tail item is expired (or flushed): reclaim its memory in place
             * instead of allocating a fresh chunk. */
            STATS_LOCK();
            stats.reclaimed++;
            STATS_UNLOCK();
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.expired_unfetched++;
                STATS_UNLOCK();
                itemstats[id].expired_unfetched++;
            }
            it = search;
            /* Same chunk, new requested size: fix slab accounting. */
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            /* Allocation failed: evict the (live) tail item, unless eviction
             * is disabled, in which case report out-of-memory. */
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
                pthread_mutex_unlock(&cache_lock);
                return NULL;
            }
            itemstats[id].evicted++;
            itemstats[id].evicted_time = current_time - search->time;
            if (search->exptime != 0)
                itemstats[id].evicted_nonzero++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.evicted_unfetched++;
                STATS_UNLOCK();
                itemstats[id].evicted_unfetched++;
            }
            STATS_LOCK();
            stats.evictions++;
            STATS_UNLOCK();
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else {
            /* slabs_alloc succeeded; drop the temporary ref on the tail. */
            refcount_decr(&search->refcount);
        }
    } else {
        /* If the LRU is empty or locked, attempt to allocate memory */
        it = slabs_alloc(ntotal, id);
        if (search != NULL)
            refcount_decr(&search->refcount);
    }

    if (it == NULL) {
        itemstats[id].outofmemory++;
        /* Last ditch effort. There was a very rare bug which caused
         * refcount leaks. We leave this just in case they ever happen again.
         * We can reasonably assume no item can stay locked for more than
         * three hours, so if we find one in the tail which is that old,
         * free it anyway.
         */
        if (search != NULL &&
            search->refcount != 2 &&
            search->time + TAIL_REPAIR_TIME < current_time) {
            itemstats[id].tailrepairs++;
            search->refcount = 1;
            do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0));
        }
        pthread_mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU.
     */
    it->refcount = 1;     /* the caller will have a reference */
    pthread_mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
199
200 void item_free(item *it) {
201 size_t ntotal = ITEM_ntotal(it);
202 unsigned int clsid;
203 assert((it->it_flags & ITEM_LINKED) == 0);
204 assert(it != heads[it->slabs_clsid]);
205 assert(it != tails[it->slabs_clsid]);
206 assert(it->refcount == 0);
207
208 /* so slab size changer can tell later if item is already free or not */
209 clsid = it->slabs_clsid;
210 it->slabs_clsid = 0;
211 DEBUG_REFCNT(it, 'F');
212 slabs_free(it, ntotal, clsid);
213 }
214
215 /**
216 * Returns true if an item will fit in the cache (its size does not exceed
217 * the maximum for a cache entry.)
218 */
219 bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
220 char prefix[40];
221 uint8_t nsuffix;
222
223 size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
224 prefix, &nsuffix);
225 if (settings.use_cas) {
226 ntotal += sizeof(uint64_t);
227 }
228
229 return slabs_clsid(ntotal) != 0;
230 }
231
232 static void item_link_q(item *it) { /* item is the new head */
233 item **head, **tail;
234 assert(it->slabs_clsid < LARGEST_ID);
235 assert((it->it_flags & ITEM_SLABBED) == 0);
236
237 head = &heads[it->slabs_clsid];
238 tail = &tails[it->slabs_clsid];
239 assert(it != *head);
240 assert((*head && *tail) || (*head == 0 && *tail == 0));
241 it->prev = 0;
242 it->next = *head;
243 if (it->next) it->next->prev = it;
244 *head = it;
245 if (*tail == 0) *tail = it;
246 sizes[it->slabs_clsid]++;
247 return;
248 }
249
250 static void item_unlink_q(item *it) {
251 item **head, **tail;
252 assert(it->slabs_clsid < LARGEST_ID);
253 head = &heads[it->slabs_clsid];
254 tail = &tails[it->slabs_clsid];
255
256 if (*head == it) {
257 assert(it->prev == 0);
258 *head = it->next;
259 }
260 if (*tail == it) {
261 assert(it->next == 0);
262 *tail = it->prev;
263 }
264 assert(it->next != it);
265 assert(it->prev != it);
266
267 if (it->next) it->next->prev = it->prev;
268 if (it->prev) it->prev->next = it->next;
269 sizes[it->slabs_clsid]--;
270 return;
271 }
272
/**
 * Links an item into the hash table and its slab class's LRU, making it
 * visible to lookups. Assigns a fresh CAS id when CAS is enabled.
 *
 * hv - precomputed hash of the item's key.
 *
 * Takes cache_lock; STATS_LOCK nests inside it. Always returns 1.
 */
int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    mutex_lock(&cache_lock);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats.curr_bytes += ITEM_ntotal(it);
    stats.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
    assoc_insert(it, hv);
    item_link_q(it);
    /* Extra reference held on behalf of the hash table / LRU. */
    refcount_incr(&it->refcount);
    pthread_mutex_unlock(&cache_lock);

    return 1;
}
295
/**
 * Unlinks an item from the hash table and LRU and drops the reference the
 * cache held on it (the item is freed if that was the last reference).
 * Takes cache_lock; a no-op if the item is not currently linked.
 */
void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    mutex_lock(&cache_lock);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);  /* drop the cache's reference */
    }
    pthread_mutex_unlock(&cache_lock);
}
311
312 /* FIXME: Is it necessary to keep this copy/pasted code? */
/* FIXME: Is it necessary to keep this copy/pasted code? */
/**
 * Identical to do_item_unlink but assumes the caller ALREADY holds
 * cache_lock (used by do_item_alloc and do_item_get, which take the lock
 * themselves). Keep its body in sync with do_item_unlink.
 */
void do_item_unlink_nolock(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);  /* drop the cache's reference */
    }
}
326
327 void do_item_remove(item *it) {
328 MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
329 assert((it->it_flags & ITEM_SLABBED) == 0);
330
331 if (refcount_decr(&it->refcount) == 0) {
332 item_free(it);
333 }
334 }
335
/**
 * Bumps a (linked) item to the head of its LRU and refreshes its access
 * time — but only if it was last repositioned more than
 * ITEM_UPDATE_INTERVAL seconds ago, to avoid churning on hot items.
 * Takes cache_lock for the unlink/relink.
 */
void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
    if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        mutex_lock(&cache_lock);
        if ((it->it_flags & ITEM_LINKED) != 0) {
            item_unlink_q(it);
            it->time = current_time;
            item_link_q(it);
        }
        pthread_mutex_unlock(&cache_lock);
    }
}
350
/**
 * Atomically (with respect to this thread's sequence) replaces `it` with
 * `new_it` in the cache: unlinks the old item, then links the new one.
 * Both items must hash to the same hv. Returns do_item_link's result (1).
 */
int do_item_replace(item *it, item *new_it, const uint32_t hv) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it, hv);
    return do_item_link(new_it, hv);
}
359
360 #ifndef __INTEL_COMPILER
361 #pragma GCC diagnostic ignored "-Wshadow"
362 #endif
363
364 /*@null@*/
365 char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
366 unsigned int memlimit = 2 * 1024 * 1024; /* 2MB max response size */
367 char *buffer;
368 unsigned int bufcurr;
369 item *it;
370 unsigned int len;
371 unsigned int shown = 0;
372 char key_temp[KEY_MAX_LENGTH + 1];
373 char temp[512];
374
375 it = heads[slabs_clsid];
376
377 buffer = malloc((size_t)memlimit);
378 if (buffer == 0) return NULL;
379 bufcurr = 0;
380
381 while (it != NULL && (limit == 0 || shown < limit)) {
382 assert(it->nkey <= KEY_MAX_LENGTH);
383 /* Copy the key since it may not be null-terminated in the struct */
384 strncpy(key_temp, ITEM_key(it), it->nkey);
385 key_temp[it->nkey] = 0x00; /* terminate */
386 len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %lu s]\r\n",
387 key_temp, it->nbytes - 2,
388 (unsigned long)it->exptime + process_started);
389 if (bufcurr + len + 6 > memlimit) /* 6 is END\r\n\0 */
390 break;
391 memcpy(buffer + bufcurr, temp, len);
392 bufcurr += len;
393 shown++;
394 it = it->next;
395 }
396
397 memcpy(buffer + bufcurr, "END\r\n", 6);
398 bufcurr += 5;
399
400 *bytes = bufcurr;
401 return buffer;
402 }
403
404 void item_stats_evictions(uint64_t *evicted) {
405 int i;
406 mutex_lock(&cache_lock);
407 for (i = 0; i < LARGEST_ID; i++) {
408 evicted[i] = itemstats[i].evicted;
409 }
410 pthread_mutex_unlock(&cache_lock);
411 }
412
413 void do_item_stats(ADD_STAT add_stats, void *c) {
414 int i;
415 for (i = 0; i < LARGEST_ID; i++) {
416 if (tails[i] != NULL) {
417 const char *fmt = "items:%d:%s";
418 char key_str[STAT_KEY_LEN];
419 char val_str[STAT_VAL_LEN];
420 int klen = 0, vlen = 0;
421 if (tails[i] == NULL) {
422 /* We removed all of the items in this slab class */
423 continue;
424 }
425 APPEND_NUM_FMT_STAT(fmt, i, "number", "%u", sizes[i]);
426 APPEND_NUM_FMT_STAT(fmt, i, "age", "%u", current_time - tails[i]->time);
427 APPEND_NUM_FMT_STAT(fmt, i, "evicted",
428 "%llu", (unsigned long long)itemstats[i].evicted);
429 APPEND_NUM_FMT_STAT(fmt, i, "evicted_nonzero",
430 "%llu", (unsigned long long)itemstats[i].evicted_nonzero);
431 APPEND_NUM_FMT_STAT(fmt, i, "evicted_time",
432 "%u", itemstats[i].evicted_time);
433 APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
434 "%llu", (unsigned long long)itemstats[i].outofmemory);
435 APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
436 "%llu", (unsigned long long)itemstats[i].tailrepairs);
437 APPEND_NUM_FMT_STAT(fmt, i, "reclaimed",
438 "%llu", (unsigned long long)itemstats[i].reclaimed);
439 APPEND_NUM_FMT_STAT(fmt, i, "expired_unfetched",
440 "%llu", (unsigned long long)itemstats[i].expired_unfetched);
441 APPEND_NUM_FMT_STAT(fmt, i, "evicted_unfetched",
442 "%llu", (unsigned long long)itemstats[i].evicted_unfetched);
443 }
444 }
445
446 /* getting here means both ascii and binary terminators fit */
447 add_stats(NULL, 0, NULL, 0, c);
448 }
449
450 /** dumps out a list of objects of each size, with granularity of 32 bytes */
451 /*@null@*/
452 void do_item_stats_sizes(ADD_STAT add_stats, void *c) {
453
454 /* max 1MB object, divided into 32 bytes size buckets */
455 const int num_buckets = 32768;
456 unsigned int *histogram = calloc(num_buckets, sizeof(int));
457
458 if (histogram != NULL) {
459 int i;
460
461 /* build the histogram */
462 for (i = 0; i < LARGEST_ID; i++) {
463 item *iter = heads[i];
464 while (iter) {
465 int ntotal = ITEM_ntotal(iter);
466 int bucket = ntotal / 32;
467 if ((ntotal % 32) != 0) bucket++;
468 if (bucket < num_buckets) histogram[bucket]++;
469 iter = iter->next;
470 }
471 }
472
473 /* write the buffer */
474 for (i = 0; i < num_buckets; i++) {
475 if (histogram[i] != 0) {
476 char key[8];
477 snprintf(key, sizeof(key), "%d", i * 32);
478 APPEND_STAT(key, "%u", histogram[i]);
479 }
480 }
481 free(histogram);
482 }
483 add_stats(NULL, 0, NULL, 0, c);
484 }
485
486 /** wrapper around assoc_find which does the lazy expiration logic */
/** wrapper around assoc_find which does the lazy expiration logic */
/**
 * Looks up an item and returns it with an extra reference (caller must
 * do_item_remove when done), or NULL. Lazily expires items that are past
 * their exptime or were invalidated by a flush (oldest_live), and bails on
 * items sitting in a slab page being rebalanced. Takes cache_lock.
 */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) {
    mutex_lock(&cache_lock);
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(&it->refcount);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, cache_lock, slabs_lock. */
        if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink_nolock(it, hv);
            do_item_remove(it);
            it = NULL;
        }
    }
    pthread_mutex_unlock(&cache_lock);
    /* was_found is only ever set when verbose > 2, so the "-nuked by ..."
     * messages below are likewise verbose-gated. */
    int was_found = 0;

    if (settings.verbose > 2) {
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND %s", key);
        } else {
            fprintf(stderr, "> FOUND KEY %s", ITEM_key(it));
            was_found++;
        }
    }

    if (it != NULL) {
        if (settings.oldest_live != 0 && settings.oldest_live <= current_time &&
            it->time <= settings.oldest_live) {
            /* Item predates the last flush_all: expire it now. */
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            if (was_found) {
                fprintf(stderr, " -nuked by flush");
            }
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            /* Item's own TTL has elapsed: expire it now. */
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            if (was_found) {
                fprintf(stderr, " -nuked by expire");
            }
        } else {
            it->it_flags |= ITEM_FETCHED;
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");

    return it;
}
541
542 item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
543 const uint32_t hv) {
544 item *it = do_item_get(key, nkey, hv);
545 if (it != NULL) {
546 it->exptime = exptime;
547 }
548 return it;
549 }
550
551 /* expires items that are more recent than the oldest_live setting. */
/* expires items that are more recent than the oldest_live setting. */
/**
 * Walks each LRU from the head (newest) and unlinks every item whose
 * timestamp is >= oldest_live; older items are left for lazy expiry in
 * do_item_get. No-op when oldest_live is unset.
 *
 * NOTE(review): uses do_item_unlink_nolock but takes no lock here —
 * presumably the caller holds cache_lock; confirm against the wrapper.
 */
void do_item_flush_expired(void) {
    int i;
    item *iter, *next;
    if (settings.oldest_live == 0)
        return;
    for (i = 0; i < LARGEST_ID; i++) {
        /* The LRU is sorted in decreasing time order, and an item's timestamp
         * is never newer than its last access time, so we only need to walk
         * back until we hit an item older than the oldest_live time.
         * The oldest_live checking will auto-expire the remaining items.
         */
        for (iter = heads[i]; iter != NULL; iter = next) {
            if (iter->time >= settings.oldest_live) {
                next = iter->next;  /* save before unlinking */
                if ((iter->it_flags & ITEM_SLABBED) == 0) {
                    do_item_unlink_nolock(iter, hash(ITEM_key(iter), iter->nkey, 0));
                }
            } else {
                /* We've hit the first old item. Continue to the next queue. */
                break;
            }
        }
    }
}