/* memcached/items.c */
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <assert.h>

/* Forward Declarations */
static void item_link_q(item *it);
static void item_unlink_q(item *it);

/*
 * We only reposition items in the LRU queue if they haven't been repositioned
 * in this many seconds. That saves us from churning on frequently-accessed
 * items.
 */
#define ITEM_UPDATE_INTERVAL 60

#define LARGEST_ID POWER_LARGEST
typedef struct {
    uint64_t evicted;
    uint64_t evicted_nonzero;
    rel_time_t evicted_time;
    uint64_t reclaimed;
    uint64_t outofmemory;
    uint64_t tailrepairs;
    uint64_t expired_unfetched;
    uint64_t evicted_unfetched;
} itemstats_t;

static item *heads[LARGEST_ID];
static item *tails[LARGEST_ID];
static itemstats_t itemstats[LARGEST_ID];
static unsigned int sizes[LARGEST_ID];

void item_stats_reset(void) {
    mutex_lock(&cache_lock);
    memset(itemstats, 0, sizeof(itemstats));
    mutex_unlock(&cache_lock);
}


/* Get the next CAS id for a new item. */
uint64_t get_cas_id(void) {
    static uint64_t cas_id = 0;
    return ++cas_id;
}
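
/*
 * Editorial note: the increment above is not atomic on its own. Within this
 * file, get_cas_id() is only called from do_item_link(), which holds
 * cache_lock, so the counter is serialized by that lock rather than by this
 * function.
 */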

/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
                        (void *)it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif

/**
 * Generates the variable-sized part of the header for an object.
 *
 * nkey    - The length of the key
 * flags   - Client-supplied flags stored with the key
 * nbytes  - Number of bytes to hold the value, including the trailing
 *           CRLF terminator
 * suffix  - Buffer for the "VALUE" line suffix (flags, size).
 * nsuffix - The length of the suffix is stored here.
 *
 * Returns the total size of the header.
 */
static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
                     char *suffix, uint8_t *nsuffix) {
    /* suffix is defined at 40 chars elsewhere.. */
    *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
    return sizeof(item) + nkey + *nsuffix + nbytes;
}
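
/*
 * Worked example (editorial): for a 3-byte key (callers pass nkey + 1 to
 * cover a terminating NUL, so nkey here is 4), flags 0, and the value
 * "hello" plus CRLF (nbytes = 7), the suffix becomes " 0 5\r\n" (6 bytes;
 * nbytes - 2 strips the CRLF from the advertised size), and the function
 * returns sizeof(item) + 4 + 6 + 7.
 */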

/*@null@*/
item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return NULL;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    item *search;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    if (search != NULL && (refcount_incr(&search->refcount) == 2)) {
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) { /* dead by flush */
            STATS_LOCK();
            stats.reclaimed++;
            STATS_UNLOCK();
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.expired_unfetched++;
                STATS_UNLOCK();
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
                mutex_unlock(&cache_lock);
                return NULL;
            }
            itemstats[id].evicted++;
            itemstats[id].evicted_time = current_time - search->time;
            if (search->exptime != 0)
                itemstats[id].evicted_nonzero++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.evicted_unfetched++;
                STATS_UNLOCK();
                itemstats[id].evicted_unfetched++;
            }
            STATS_LOCK();
            stats.evictions++;
            STATS_UNLOCK();
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;

            /* If we've just evicted an item, and the automover is set to
             * angry bird mode, attempt to rip memory into this slab class.
             * TODO: Move valid object detection into a function, and on a
             * "successful" memory pull, look behind and see if the next alloc
             * would be an eviction. Then kick off the slab mover before the
             * eviction happens.
             */
            if (settings.slab_automove == 2)
                slabs_reassign(-1, id);
        } else {
            refcount_decr(&search->refcount);
        }
    } else {
        /* If the LRU is empty or locked, attempt to allocate memory */
        it = slabs_alloc(ntotal, id);
        if (search != NULL)
            refcount_decr(&search->refcount);
    }

    if (it == NULL) {
        itemstats[id].outofmemory++;
        /* Last ditch effort. There was a very rare bug which caused
         * refcount leaks. We leave this just in case they ever happen again.
         * We can reasonably assume no item can stay locked for more than
         * three hours, so if we find one in the tail which is that old,
         * free it anyway.
         */
        if (search != NULL &&
            search->refcount != 2 &&
            search->time + TAIL_REPAIR_TIME < current_time) {
            itemstats[id].tailrepairs++;
            search->refcount = 1;
            do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0));
        }
        mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU.
     */
    it->refcount = 1;     /* the caller will have a reference */
    mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}
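
/*
 * Editorial sketch (hypothetical caller, not part of this file): the usual
 * pattern in memcached.c is roughly the following -- allocate, fill in the
 * data (with the trailing CRLF accounted for in nbytes), then link, then
 * drop the caller's reference. The names mirror the item_* locking wrappers
 * in thread.c; treat this as an illustration, not the exact upstream code.
 */
#if 0
    item *it = item_alloc(key, nkey, flags, realtime(exptime), vlen + 2);
    if (it != NULL) {
        memcpy(ITEM_data(it), value, vlen);
        memcpy(ITEM_data(it) + vlen, "\r\n", 2);
        item_link(it);     /* hashes the key and calls do_item_link() */
        item_remove(it);   /* release the reference taken by do_item_alloc() */
    }
#endif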

void item_free(item *it) {
    size_t ntotal = ITEM_ntotal(it);
    unsigned int clsid;
    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* so slab size changer can tell later if item is already free or not */
    clsid = it->slabs_clsid;
    it->slabs_clsid = 0;
    DEBUG_REFCNT(it, 'F');
    slabs_free(it, ntotal, clsid);
}

/**
 * Returns true if an item will fit in the cache (its size does not exceed
 * the maximum for a cache entry.)
 */
bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
    char prefix[40];
    uint8_t nsuffix;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
                                     prefix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    return slabs_clsid(ntotal) != 0;
}
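
/*
 * Editorial note: slabs_clsid() returns 0 when no slab class is large enough,
 * so with the default 1 MB maximum item size an over-sized item simply fails
 * this check rather than being truncated.
 */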

static void item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
    return;
}
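
/*
 * Editorial sketch of the per-class LRU shape these queue helpers maintain:
 *
 *   heads[id] -> most recently used <-> ... <-> least recently used <- tails[id]
 *
 * New and updated items are pushed onto the head; do_item_alloc() scans from
 * the tail when looking for expired items or eviction victims.
 */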

static void item_unlink_q(item *it) {
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID);
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;
    return;
}

int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    mutex_lock(&cache_lock);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats.curr_bytes += ITEM_ntotal(it);
    stats.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
    assoc_insert(it, hv);
    item_link_q(it);
    refcount_incr(&it->refcount);
    mutex_unlock(&cache_lock);

    return 1;
}

void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    mutex_lock(&cache_lock);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);
    }
    mutex_unlock(&cache_lock);
}

/* FIXME: Is it necessary to keep this copy/pasted code? */
void do_item_unlink_nolock(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);
    }
}

void do_item_remove(item *it) {
    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    if (refcount_decr(&it->refcount) == 0) {
        item_free(it);
    }
}

void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
    if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        mutex_lock(&cache_lock);
        if ((it->it_flags & ITEM_LINKED) != 0) {
            item_unlink_q(it);
            it->time = current_time;
            item_link_q(it);
        }
        mutex_unlock(&cache_lock);
    }
}

int do_item_replace(item *it, item *new_it, const uint32_t hv) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it, hv);
    return do_item_link(new_it, hv);
}

/*@null@*/
char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
    unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
    char *buffer;
    unsigned int bufcurr;
    item *it;
    unsigned int len;
    unsigned int shown = 0;
    char key_temp[KEY_MAX_LENGTH + 1];
    char temp[512];

    it = heads[slabs_clsid];

    buffer = malloc((size_t)memlimit);
    if (buffer == 0) return NULL;
    bufcurr = 0;

    while (it != NULL && (limit == 0 || shown < limit)) {
        assert(it->nkey <= KEY_MAX_LENGTH);
        /* Copy the key since it may not be null-terminated in the struct */
        strncpy(key_temp, ITEM_key(it), it->nkey);
        key_temp[it->nkey] = 0x00; /* terminate */
        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %lu s]\r\n",
                       key_temp, it->nbytes - 2,
                       (unsigned long)it->exptime + process_started);
        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
            break;
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
        shown++;
        it = it->next;
    }

    /* Write "END\r\n" plus a NUL; the NUL is not counted in *bytes. */
    memcpy(buffer + bufcurr, "END\r\n", 6);
    bufcurr += 5;

    *bytes = bufcurr;
    return buffer;
}
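
/*
 * Editorial note: this backs the debug-oriented "stats cachedump
 * <class> <limit>" protocol command. The "%lu s" field above is the absolute
 * expiration time: the relative exptime plus the server's process_started
 * epoch.
 */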

void item_stats_evictions(uint64_t *evicted) {
    int i;
    mutex_lock(&cache_lock);
    for (i = 0; i < LARGEST_ID; i++) {
        evicted[i] = itemstats[i].evicted;
    }
    mutex_unlock(&cache_lock);
}

void do_item_stats(ADD_STAT add_stats, void *c) {
    int i;
    for (i = 0; i < LARGEST_ID; i++) {
        const char *fmt = "items:%d:%s";
        char key_str[STAT_KEY_LEN];
        char val_str[STAT_VAL_LEN];
        int klen = 0, vlen = 0;
        if (tails[i] == NULL) {
            /* No items in this slab class. */
            continue;
        }
        APPEND_NUM_FMT_STAT(fmt, i, "number", "%u", sizes[i]);
        APPEND_NUM_FMT_STAT(fmt, i, "age", "%u", current_time - tails[i]->time);
        APPEND_NUM_FMT_STAT(fmt, i, "evicted",
                            "%llu", (unsigned long long)itemstats[i].evicted);
        APPEND_NUM_FMT_STAT(fmt, i, "evicted_nonzero",
                            "%llu", (unsigned long long)itemstats[i].evicted_nonzero);
        APPEND_NUM_FMT_STAT(fmt, i, "evicted_time",
                            "%u", itemstats[i].evicted_time);
        APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
                            "%llu", (unsigned long long)itemstats[i].outofmemory);
        APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
                            "%llu", (unsigned long long)itemstats[i].tailrepairs);
        APPEND_NUM_FMT_STAT(fmt, i, "reclaimed",
                            "%llu", (unsigned long long)itemstats[i].reclaimed);
        APPEND_NUM_FMT_STAT(fmt, i, "expired_unfetched",
                            "%llu", (unsigned long long)itemstats[i].expired_unfetched);
        APPEND_NUM_FMT_STAT(fmt, i, "evicted_unfetched",
                            "%llu", (unsigned long long)itemstats[i].evicted_unfetched);
    }

    /* getting here means both ascii and binary terminators fit */
    add_stats(NULL, 0, NULL, 0, c);
}

/** dumps out a list of objects of each size, with granularity of 32 bytes */
/*@null@*/
void do_item_stats_sizes(ADD_STAT add_stats, void *c) {

    /* max 1MB object, divided into 32 byte size buckets */
    const int num_buckets = 32768;
    unsigned int *histogram = calloc(num_buckets, sizeof(unsigned int));

    if (histogram != NULL) {
        int i;

        /* build the histogram */
        for (i = 0; i < LARGEST_ID; i++) {
            item *iter = heads[i];
            while (iter) {
                int ntotal = ITEM_ntotal(iter);
                int bucket = ntotal / 32;
                if ((ntotal % 32) != 0) bucket++;
                if (bucket < num_buckets) histogram[bucket]++;
                iter = iter->next;
            }
        }

        /* write the buffer */
        for (i = 0; i < num_buckets; i++) {
            if (histogram[i] != 0) {
                char key[8];
                snprintf(key, sizeof(key), "%d", i * 32);
                APPEND_STAT(key, "%u", histogram[i]);
            }
        }
        free(histogram);
    }
    add_stats(NULL, 0, NULL, 0, c);
}
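
/*
 * Worked example (editorial): an item whose ITEM_ntotal() is 100 bytes maps
 * to bucket 100 / 32 = 3, bumped to 4 for the 4-byte remainder, and is
 * therefore reported under the key "128" (4 * 32).
 */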

/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) {
    mutex_lock(&cache_lock);
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(&it->refcount);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, cache_lock, slabs_lock. */
        if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink_nolock(it, hv);
            do_item_remove(it);
            it = NULL;
        }
    }
    mutex_unlock(&cache_lock);
    int was_found = 0;

    if (settings.verbose > 2) {
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND %s", key);
        } else {
            fprintf(stderr, "> FOUND KEY %s", ITEM_key(it));
            was_found++;
        }
    }

    if (it != NULL) {
        if (settings.oldest_live != 0 && settings.oldest_live <= current_time &&
            it->time <= settings.oldest_live) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            if (was_found) {
                fprintf(stderr, " -nuked by flush");
            }
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
            if (was_found) {
                fprintf(stderr, " -nuked by expire");
            }
        } else {
            it->it_flags |= ITEM_FETCHED;
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");

    return it;
}

item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
                    const uint32_t hv) {
    item *it = do_item_get(key, nkey, hv);
    if (it != NULL) {
        it->exptime = exptime;
    }
    return it;
}
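
/*
 * Editorial note: because do_item_touch() goes through do_item_get(), a touch
 * also applies the lazy expiration logic above -- an already-expired or
 * flushed item is unlinked and NULL is returned instead of having its
 * expiration time extended.
 */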

/* expires items that are more recent than the oldest_live setting. */
void do_item_flush_expired(void) {
    int i;
    item *iter, *next;
    if (settings.oldest_live == 0)
        return;
    for (i = 0; i < LARGEST_ID; i++) {
        /* The LRU is sorted in decreasing time order, and an item's timestamp
         * is never newer than its last access time, so we only need to walk
         * back until we hit an item older than the oldest_live time.
         * The oldest_live checking will auto-expire the remaining items.
         */
        for (iter = heads[i]; iter != NULL; iter = next) {
            if (iter->time >= settings.oldest_live) {
                next = iter->next;
                if ((iter->it_flags & ITEM_SLABBED) == 0) {
                    do_item_unlink_nolock(iter, hash(ITEM_key(iter), iter->nkey, 0));
                }
            } else {
                /* We've hit the first old item. Continue to the next queue. */
                break;
            }
        }
    }
}