1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
4 #include <sys/socket.h>
5 #include <sys/signal.h>
6 #include <sys/resource.h>
8 #include <netinet/in.h>
16 /* Forward Declarations */
17 static void item_link_q(item
*it
);
18 static void item_unlink_q(item
*it
);
21 * We only reposition items in the LRU queue if they haven't been repositioned
22 * in this many seconds. That saves us from churning on frequently-accessed
25 #define ITEM_UPDATE_INTERVAL 60
27 #define LARGEST_ID POWER_LARGEST
30 uint64_t evicted_nonzero
;
31 rel_time_t evicted_time
;
35 uint64_t expired_unfetched
;
36 uint64_t evicted_unfetched
;
39 static item
*heads
[LARGEST_ID
];
40 static item
*tails
[LARGEST_ID
];
41 static itemstats_t itemstats
[LARGEST_ID
];
42 static unsigned int sizes
[LARGEST_ID
];
44 void item_stats_reset(void) {
45 mutex_lock(&cache_lock
);
46 memset(itemstats
, 0, sizeof(itemstats
));
47 mutex_unlock(&cache_lock
);
/* Get the next CAS id for a new item.
 * Monotonically increasing; starts at 1 on first call. */
uint64_t get_cas_id(void) {
    static uint64_t cas_id = 0;
    return ++cas_id;
}
57 /* Enable this for reference-count debugging. */
59 # define DEBUG_REFCNT(it,op) \
60 fprintf(stderr, "item %x refcnt(%c) %d %c%c%c\n", \
61 it, op, it->refcount, \
62 (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
63 (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
65 # define DEBUG_REFCNT(it,op) while(0)
69 * Generates the variable-sized part of the header for an object.
72 * nkey - The length of the key
74 * nbytes - Number of bytes to hold value and addition CRLF terminator
75 * suffix - Buffer for the "VALUE" line suffix (flags, size).
76 * nsuffix - The length of the suffix is stored here.
78 * Returns the total size of the header.
80 static size_t item_make_header(const uint8_t nkey
, const int flags
, const int nbytes
,
81 char *suffix
, uint8_t *nsuffix
) {
82 /* suffix is defined at 40 chars elsewhere.. */
83 *nsuffix
= (uint8_t) snprintf(suffix
, 40, " %d %d\r\n", flags
, nbytes
- 2);
84 return sizeof(item
) + nkey
+ *nsuffix
+ nbytes
;
88 item
*do_item_alloc(char *key
, const size_t nkey
, const int flags
, const rel_time_t exptime
, const int nbytes
) {
92 size_t ntotal
= item_make_header(nkey
+ 1, flags
, nbytes
, suffix
, &nsuffix
);
93 if (settings
.use_cas
) {
94 ntotal
+= sizeof(uint64_t);
97 unsigned int id
= slabs_clsid(ntotal
);
101 mutex_lock(&cache_lock
);
102 /* do a quick check if we have any expired items in the tail.. */
104 rel_time_t oldest_live
= settings
.oldest_live
;
107 if (search
!= NULL
&& (refcount_incr(&search
->refcount
) == 2)) {
108 if ((search
->exptime
!= 0 && search
->exptime
< current_time
)
109 || (search
->time
<= oldest_live
&& oldest_live
<= current_time
)) { // dead by flush
113 itemstats
[id
].reclaimed
++;
114 if ((search
->it_flags
& ITEM_FETCHED
) == 0) {
116 stats
.expired_unfetched
++;
118 itemstats
[id
].expired_unfetched
++;
121 slabs_adjust_mem_requested(it
->slabs_clsid
, ITEM_ntotal(it
), ntotal
);
122 do_item_unlink_nolock(it
, hash(ITEM_key(it
), it
->nkey
, 0));
123 /* Initialize the item block: */
125 } else if ((it
= slabs_alloc(ntotal
, id
)) == NULL
) {
126 if (settings
.evict_to_free
== 0) {
127 itemstats
[id
].outofmemory
++;
128 mutex_unlock(&cache_lock
);
131 itemstats
[id
].evicted
++;
132 itemstats
[id
].evicted_time
= current_time
- search
->time
;
133 if (search
->exptime
!= 0)
134 itemstats
[id
].evicted_nonzero
++;
135 if ((search
->it_flags
& ITEM_FETCHED
) == 0) {
137 stats
.evicted_unfetched
++;
139 itemstats
[id
].evicted_unfetched
++;
145 slabs_adjust_mem_requested(it
->slabs_clsid
, ITEM_ntotal(it
), ntotal
);
146 do_item_unlink_nolock(it
, hash(ITEM_key(it
), it
->nkey
, 0));
147 /* Initialize the item block: */
150 /* If we've just evicted an item, and the automover is set to
151 * angry bird mode, attempt to rip memory into this slab class.
152 * TODO: Move valid object detection into a function, and on a
153 * "successful" memory pull, look behind and see if the next alloc
154 * would be an eviction. Then kick off the slab mover before the
157 if (settings
.slab_automove
== 2)
158 slabs_reassign(-1, id
);
160 refcount_decr(&search
->refcount
);
163 /* If the LRU is empty or locked, attempt to allocate memory */
164 it
= slabs_alloc(ntotal
, id
);
166 refcount_decr(&search
->refcount
);
170 itemstats
[id
].outofmemory
++;
171 /* Last ditch effort. There was a very rare bug which caused
172 * refcount leaks. We leave this just in case they ever happen again.
173 * We can reasonably assume no item can stay locked for more than
174 * three hours, so if we find one in the tail which is that old,
177 if (search
!= NULL
&&
178 search
->refcount
!= 2 &&
179 search
->time
+ TAIL_REPAIR_TIME
< current_time
) {
180 itemstats
[id
].tailrepairs
++;
181 search
->refcount
= 1;
182 do_item_unlink_nolock(search
, hash(ITEM_key(search
), search
->nkey
, 0));
184 mutex_unlock(&cache_lock
);
188 assert(it
->slabs_clsid
== 0);
189 assert(it
!= heads
[id
]);
191 /* Item initialization can happen outside of the lock; the item's already
192 * been removed from the slab LRU.
194 it
->refcount
= 1; /* the caller will have a reference */
195 mutex_unlock(&cache_lock
);
196 it
->next
= it
->prev
= it
->h_next
= 0;
197 it
->slabs_clsid
= id
;
199 DEBUG_REFCNT(it
, '*');
200 it
->it_flags
= settings
.use_cas
? ITEM_CAS
: 0;
203 memcpy(ITEM_key(it
), key
, nkey
);
204 it
->exptime
= exptime
;
205 memcpy(ITEM_suffix(it
), suffix
, (size_t)nsuffix
);
206 it
->nsuffix
= nsuffix
;
210 void item_free(item
*it
) {
211 size_t ntotal
= ITEM_ntotal(it
);
213 assert((it
->it_flags
& ITEM_LINKED
) == 0);
214 assert(it
!= heads
[it
->slabs_clsid
]);
215 assert(it
!= tails
[it
->slabs_clsid
]);
216 assert(it
->refcount
== 0);
218 /* so slab size changer can tell later if item is already free or not */
219 clsid
= it
->slabs_clsid
;
221 DEBUG_REFCNT(it
, 'F');
222 slabs_free(it
, ntotal
, clsid
);
226 * Returns true if an item will fit in the cache (its size does not exceed
227 * the maximum for a cache entry.)
229 bool item_size_ok(const size_t nkey
, const int flags
, const int nbytes
) {
233 size_t ntotal
= item_make_header(nkey
+ 1, flags
, nbytes
,
235 if (settings
.use_cas
) {
236 ntotal
+= sizeof(uint64_t);
239 return slabs_clsid(ntotal
) != 0;
242 static void item_link_q(item
*it
) { /* item is the new head */
244 assert(it
->slabs_clsid
< LARGEST_ID
);
245 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
247 head
= &heads
[it
->slabs_clsid
];
248 tail
= &tails
[it
->slabs_clsid
];
250 assert((*head
&& *tail
) || (*head
== 0 && *tail
== 0));
253 if (it
->next
) it
->next
->prev
= it
;
255 if (*tail
== 0) *tail
= it
;
256 sizes
[it
->slabs_clsid
]++;
260 static void item_unlink_q(item
*it
) {
262 assert(it
->slabs_clsid
< LARGEST_ID
);
263 head
= &heads
[it
->slabs_clsid
];
264 tail
= &tails
[it
->slabs_clsid
];
267 assert(it
->prev
== 0);
271 assert(it
->next
== 0);
274 assert(it
->next
!= it
);
275 assert(it
->prev
!= it
);
277 if (it
->next
) it
->next
->prev
= it
->prev
;
278 if (it
->prev
) it
->prev
->next
= it
->next
;
279 sizes
[it
->slabs_clsid
]--;
283 int do_item_link(item
*it
, const uint32_t hv
) {
284 MEMCACHED_ITEM_LINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
285 assert((it
->it_flags
& (ITEM_LINKED
|ITEM_SLABBED
)) == 0);
286 mutex_lock(&cache_lock
);
287 it
->it_flags
|= ITEM_LINKED
;
288 it
->time
= current_time
;
291 stats
.curr_bytes
+= ITEM_ntotal(it
);
292 stats
.curr_items
+= 1;
293 stats
.total_items
+= 1;
296 /* Allocate a new CAS ID on link. */
297 ITEM_set_cas(it
, (settings
.use_cas
) ? get_cas_id() : 0);
298 assoc_insert(it
, hv
);
300 refcount_incr(&it
->refcount
);
301 mutex_unlock(&cache_lock
);
306 void do_item_unlink(item
*it
, const uint32_t hv
) {
307 MEMCACHED_ITEM_UNLINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
308 mutex_lock(&cache_lock
);
309 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
310 it
->it_flags
&= ~ITEM_LINKED
;
312 stats
.curr_bytes
-= ITEM_ntotal(it
);
313 stats
.curr_items
-= 1;
315 assoc_delete(ITEM_key(it
), it
->nkey
, hv
);
319 mutex_unlock(&cache_lock
);
322 /* FIXME: Is it necessary to keep this copy/pasted code? */
323 void do_item_unlink_nolock(item
*it
, const uint32_t hv
) {
324 MEMCACHED_ITEM_UNLINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
325 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
326 it
->it_flags
&= ~ITEM_LINKED
;
328 stats
.curr_bytes
-= ITEM_ntotal(it
);
329 stats
.curr_items
-= 1;
331 assoc_delete(ITEM_key(it
), it
->nkey
, hv
);
337 void do_item_remove(item
*it
) {
338 MEMCACHED_ITEM_REMOVE(ITEM_key(it
), it
->nkey
, it
->nbytes
);
339 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
341 if (refcount_decr(&it
->refcount
) == 0) {
346 void do_item_update(item
*it
) {
347 MEMCACHED_ITEM_UPDATE(ITEM_key(it
), it
->nkey
, it
->nbytes
);
348 if (it
->time
< current_time
- ITEM_UPDATE_INTERVAL
) {
349 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
351 mutex_lock(&cache_lock
);
352 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
354 it
->time
= current_time
;
357 mutex_unlock(&cache_lock
);
361 int do_item_replace(item
*it
, item
*new_it
, const uint32_t hv
) {
362 MEMCACHED_ITEM_REPLACE(ITEM_key(it
), it
->nkey
, it
->nbytes
,
363 ITEM_key(new_it
), new_it
->nkey
, new_it
->nbytes
);
364 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
366 do_item_unlink(it
, hv
);
367 return do_item_link(new_it
, hv
);
371 char *do_item_cachedump(const unsigned int slabs_clsid
, const unsigned int limit
, unsigned int *bytes
) {
372 unsigned int memlimit
= 2 * 1024 * 1024; /* 2MB max response size */
374 unsigned int bufcurr
;
377 unsigned int shown
= 0;
378 char key_temp
[KEY_MAX_LENGTH
+ 1];
381 it
= heads
[slabs_clsid
];
383 buffer
= malloc((size_t)memlimit
);
384 if (buffer
== 0) return NULL
;
387 while (it
!= NULL
&& (limit
== 0 || shown
< limit
)) {
388 assert(it
->nkey
<= KEY_MAX_LENGTH
);
389 /* Copy the key since it may not be null-terminated in the struct */
390 strncpy(key_temp
, ITEM_key(it
), it
->nkey
);
391 key_temp
[it
->nkey
] = 0x00; /* terminate */
392 len
= snprintf(temp
, sizeof(temp
), "ITEM %s [%d b; %lu s]\r\n",
393 key_temp
, it
->nbytes
- 2,
394 (unsigned long)it
->exptime
+ process_started
);
395 if (bufcurr
+ len
+ 6 > memlimit
) /* 6 is END\r\n\0 */
397 memcpy(buffer
+ bufcurr
, temp
, len
);
403 memcpy(buffer
+ bufcurr
, "END\r\n", 6);
410 void item_stats_evictions(uint64_t *evicted
) {
412 mutex_lock(&cache_lock
);
413 for (i
= 0; i
< LARGEST_ID
; i
++) {
414 evicted
[i
] = itemstats
[i
].evicted
;
416 mutex_unlock(&cache_lock
);
419 void do_item_stats(ADD_STAT add_stats
, void *c
) {
421 for (i
= 0; i
< LARGEST_ID
; i
++) {
422 if (tails
[i
] != NULL
) {
423 const char *fmt
= "items:%d:%s";
424 char key_str
[STAT_KEY_LEN
];
425 char val_str
[STAT_VAL_LEN
];
426 int klen
= 0, vlen
= 0;
427 if (tails
[i
] == NULL
) {
428 /* We removed all of the items in this slab class */
431 APPEND_NUM_FMT_STAT(fmt
, i
, "number", "%u", sizes
[i
]);
432 APPEND_NUM_FMT_STAT(fmt
, i
, "age", "%u", current_time
- tails
[i
]->time
);
433 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted",
434 "%llu", (unsigned long long)itemstats
[i
].evicted
);
435 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_nonzero",
436 "%llu", (unsigned long long)itemstats
[i
].evicted_nonzero
);
437 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_time",
438 "%u", itemstats
[i
].evicted_time
);
439 APPEND_NUM_FMT_STAT(fmt
, i
, "outofmemory",
440 "%llu", (unsigned long long)itemstats
[i
].outofmemory
);
441 APPEND_NUM_FMT_STAT(fmt
, i
, "tailrepairs",
442 "%llu", (unsigned long long)itemstats
[i
].tailrepairs
);
443 APPEND_NUM_FMT_STAT(fmt
, i
, "reclaimed",
444 "%llu", (unsigned long long)itemstats
[i
].reclaimed
);
445 APPEND_NUM_FMT_STAT(fmt
, i
, "expired_unfetched",
446 "%llu", (unsigned long long)itemstats
[i
].expired_unfetched
);
447 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_unfetched",
448 "%llu", (unsigned long long)itemstats
[i
].evicted_unfetched
);
452 /* getting here means both ascii and binary terminators fit */
453 add_stats(NULL
, 0, NULL
, 0, c
);
456 /** dumps out a list of objects of each size, with granularity of 32 bytes */
458 void do_item_stats_sizes(ADD_STAT add_stats
, void *c
) {
460 /* max 1MB object, divided into 32 bytes size buckets */
461 const int num_buckets
= 32768;
462 unsigned int *histogram
= calloc(num_buckets
, sizeof(int));
464 if (histogram
!= NULL
) {
467 /* build the histogram */
468 for (i
= 0; i
< LARGEST_ID
; i
++) {
469 item
*iter
= heads
[i
];
471 int ntotal
= ITEM_ntotal(iter
);
472 int bucket
= ntotal
/ 32;
473 if ((ntotal
% 32) != 0) bucket
++;
474 if (bucket
< num_buckets
) histogram
[bucket
]++;
479 /* write the buffer */
480 for (i
= 0; i
< num_buckets
; i
++) {
481 if (histogram
[i
] != 0) {
483 snprintf(key
, sizeof(key
), "%d", i
* 32);
484 APPEND_STAT(key
, "%u", histogram
[i
]);
489 add_stats(NULL
, 0, NULL
, 0, c
);
492 /** wrapper around assoc_find which does the lazy expiration logic */
493 item
*do_item_get(const char *key
, const size_t nkey
, const uint32_t hv
) {
494 mutex_lock(&cache_lock
);
495 item
*it
= assoc_find(key
, nkey
, hv
);
497 refcount_incr(&it
->refcount
);
498 /* Optimization for slab reassignment. prevents popular items from
499 * jamming in busy wait. Can only do this here to satisfy lock order
500 * of item_lock, cache_lock, slabs_lock. */
501 if (slab_rebalance_signal
&&
502 ((void *)it
>= slab_rebal
.slab_start
&& (void *)it
< slab_rebal
.slab_end
)) {
503 do_item_unlink_nolock(it
, hv
);
508 mutex_unlock(&cache_lock
);
511 if (settings
.verbose
> 2) {
513 fprintf(stderr
, "> NOT FOUND %s", key
);
515 fprintf(stderr
, "> FOUND KEY %s", ITEM_key(it
));
521 if (settings
.oldest_live
!= 0 && settings
.oldest_live
<= current_time
&&
522 it
->time
<= settings
.oldest_live
) {
523 do_item_unlink(it
, hv
);
527 fprintf(stderr
, " -nuked by flush");
529 } else if (it
->exptime
!= 0 && it
->exptime
<= current_time
) {
530 do_item_unlink(it
, hv
);
534 fprintf(stderr
, " -nuked by expire");
537 it
->it_flags
|= ITEM_FETCHED
;
538 DEBUG_REFCNT(it
, '+');
542 if (settings
.verbose
> 2)
543 fprintf(stderr
, "\n");
548 item
*do_item_touch(const char *key
, size_t nkey
, uint32_t exptime
,
550 item
*it
= do_item_get(key
, nkey
, hv
);
552 it
->exptime
= exptime
;
557 /* expires items that are more recent than the oldest_live setting. */
558 void do_item_flush_expired(void) {
561 if (settings
.oldest_live
== 0)
563 for (i
= 0; i
< LARGEST_ID
; i
++) {
564 /* The LRU is sorted in decreasing time order, and an item's timestamp
565 * is never newer than its last access time, so we only need to walk
566 * back until we hit an item older than the oldest_live time.
567 * The oldest_live checking will auto-expire the remaining items.
569 for (iter
= heads
[i
]; iter
!= NULL
; iter
= next
) {
570 if (iter
->time
>= settings
.oldest_live
) {
572 if ((iter
->it_flags
& ITEM_SLABBED
) == 0) {
573 do_item_unlink_nolock(iter
, hash(ITEM_key(iter
), iter
->nkey
, 0));
576 /* We've hit the first old item. Continue to the next queue. */