1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
4 #include <sys/socket.h>
5 #include <sys/signal.h>
6 #include <sys/resource.h>
8 #include <netinet/in.h>
16 /* Forward Declarations */
17 static void item_link_q(item
*it
);
18 static void item_unlink_q(item
*it
);
/*
 * We only reposition items in the LRU queue if they haven't been repositioned
 * in this many seconds. That saves us from churning on frequently-accessed
 * items.
 */
#define ITEM_UPDATE_INTERVAL 60

#define LARGEST_ID POWER_LARGEST
30 uint64_t evicted_nonzero
;
31 rel_time_t evicted_time
;
35 uint64_t expired_unfetched
;
36 uint64_t evicted_unfetched
;
39 static item
*heads
[LARGEST_ID
];
40 static item
*tails
[LARGEST_ID
];
41 static itemstats_t itemstats
[LARGEST_ID
];
42 static unsigned int sizes
[LARGEST_ID
];
44 void item_stats_reset(void) {
45 mutex_lock(&cache_lock
);
46 memset(itemstats
, 0, sizeof(itemstats
));
47 pthread_mutex_unlock(&cache_lock
);
/* Get the next CAS id for a new item.
 * Monotonic per-process counter starting at 1.  Not internally synchronized;
 * NOTE(review): callers appear to serialize via cache_lock (do_item_link
 * invokes this inside its locked region) — confirm before adding new callers. */
uint64_t get_cas_id(void) {
    static uint64_t cas_id = 0;
    return ++cas_id;
}
/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %x refcnt(%c) %d %c%c%c\n", \
                        it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif
69 * Generates the variable-sized part of the header for an object.
72 * nkey - The length of the key
74 * nbytes - Number of bytes to hold value and addition CRLF terminator
75 * suffix - Buffer for the "VALUE" line suffix (flags, size).
76 * nsuffix - The length of the suffix is stored here.
78 * Returns the total size of the header.
80 static size_t item_make_header(const uint8_t nkey
, const int flags
, const int nbytes
,
81 char *suffix
, uint8_t *nsuffix
) {
82 /* suffix is defined at 40 chars elsewhere.. */
83 *nsuffix
= (uint8_t) snprintf(suffix
, 40, " %d %d\r\n", flags
, nbytes
- 2);
84 return sizeof(item
) + nkey
+ *nsuffix
+ nbytes
;
88 item
*do_item_alloc(char *key
, const size_t nkey
, const int flags
, const rel_time_t exptime
, const int nbytes
) {
92 size_t ntotal
= item_make_header(nkey
+ 1, flags
, nbytes
, suffix
, &nsuffix
);
93 if (settings
.use_cas
) {
94 ntotal
+= sizeof(uint64_t);
97 unsigned int id
= slabs_clsid(ntotal
);
101 mutex_lock(&cache_lock
);
102 /* do a quick check if we have any expired items in the tail.. */
104 rel_time_t oldest_live
= settings
.oldest_live
;
107 if (search
!= NULL
&& (refcount_incr(&search
->refcount
) == 2)) {
108 if ((search
->exptime
!= 0 && search
->exptime
< current_time
)
109 || (search
->time
<= oldest_live
&& oldest_live
<= current_time
)) { // dead by flush
113 itemstats
[id
].reclaimed
++;
114 if ((search
->it_flags
& ITEM_FETCHED
) == 0) {
116 stats
.expired_unfetched
++;
118 itemstats
[id
].expired_unfetched
++;
121 slabs_adjust_mem_requested(it
->slabs_clsid
, ITEM_ntotal(it
), ntotal
);
122 do_item_unlink_nolock(it
, hash(ITEM_key(it
), it
->nkey
, 0));
123 /* Initialize the item block: */
125 } else if ((it
= slabs_alloc(ntotal
, id
)) == NULL
) {
126 if (settings
.evict_to_free
== 0) {
127 itemstats
[id
].outofmemory
++;
128 pthread_mutex_unlock(&cache_lock
);
131 itemstats
[id
].evicted
++;
132 itemstats
[id
].evicted_time
= current_time
- search
->time
;
133 if (search
->exptime
!= 0)
134 itemstats
[id
].evicted_nonzero
++;
135 if ((search
->it_flags
& ITEM_FETCHED
) == 0) {
137 stats
.evicted_unfetched
++;
139 itemstats
[id
].evicted_unfetched
++;
145 slabs_adjust_mem_requested(it
->slabs_clsid
, ITEM_ntotal(it
), ntotal
);
146 do_item_unlink_nolock(it
, hash(ITEM_key(it
), it
->nkey
, 0));
147 /* Initialize the item block: */
150 refcount_decr(&search
->refcount
);
153 /* If the LRU is empty or locked, attempt to allocate memory */
154 it
= slabs_alloc(ntotal
, id
);
156 refcount_decr(&search
->refcount
);
160 itemstats
[id
].outofmemory
++;
161 /* Last ditch effort. There was a very rare bug which caused
162 * refcount leaks. We leave this just in case they ever happen again.
163 * We can reasonably assume no item can stay locked for more than
164 * three hours, so if we find one in the tail which is that old,
167 if (search
!= NULL
&&
168 search
->refcount
!= 2 &&
169 search
->time
+ TAIL_REPAIR_TIME
< current_time
) {
170 itemstats
[id
].tailrepairs
++;
171 search
->refcount
= 1;
172 do_item_unlink_nolock(search
, hash(ITEM_key(search
), search
->nkey
, 0));
174 pthread_mutex_unlock(&cache_lock
);
178 assert(it
->slabs_clsid
== 0);
179 assert(it
!= heads
[id
]);
181 /* Item initialization can happen outside of the lock; the item's already
182 * been removed from the slab LRU.
184 it
->refcount
= 1; /* the caller will have a reference */
185 pthread_mutex_unlock(&cache_lock
);
186 it
->next
= it
->prev
= it
->h_next
= 0;
187 it
->slabs_clsid
= id
;
189 DEBUG_REFCNT(it
, '*');
190 it
->it_flags
= settings
.use_cas
? ITEM_CAS
: 0;
193 memcpy(ITEM_key(it
), key
, nkey
);
194 it
->exptime
= exptime
;
195 memcpy(ITEM_suffix(it
), suffix
, (size_t)nsuffix
);
196 it
->nsuffix
= nsuffix
;
200 void item_free(item
*it
) {
201 size_t ntotal
= ITEM_ntotal(it
);
203 assert((it
->it_flags
& ITEM_LINKED
) == 0);
204 assert(it
!= heads
[it
->slabs_clsid
]);
205 assert(it
!= tails
[it
->slabs_clsid
]);
206 assert(it
->refcount
== 0);
208 /* so slab size changer can tell later if item is already free or not */
209 clsid
= it
->slabs_clsid
;
211 DEBUG_REFCNT(it
, 'F');
212 slabs_free(it
, ntotal
, clsid
);
216 * Returns true if an item will fit in the cache (its size does not exceed
217 * the maximum for a cache entry.)
219 bool item_size_ok(const size_t nkey
, const int flags
, const int nbytes
) {
223 size_t ntotal
= item_make_header(nkey
+ 1, flags
, nbytes
,
225 if (settings
.use_cas
) {
226 ntotal
+= sizeof(uint64_t);
229 return slabs_clsid(ntotal
) != 0;
232 static void item_link_q(item
*it
) { /* item is the new head */
234 assert(it
->slabs_clsid
< LARGEST_ID
);
235 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
237 head
= &heads
[it
->slabs_clsid
];
238 tail
= &tails
[it
->slabs_clsid
];
240 assert((*head
&& *tail
) || (*head
== 0 && *tail
== 0));
243 if (it
->next
) it
->next
->prev
= it
;
245 if (*tail
== 0) *tail
= it
;
246 sizes
[it
->slabs_clsid
]++;
250 static void item_unlink_q(item
*it
) {
252 assert(it
->slabs_clsid
< LARGEST_ID
);
253 head
= &heads
[it
->slabs_clsid
];
254 tail
= &tails
[it
->slabs_clsid
];
257 assert(it
->prev
== 0);
261 assert(it
->next
== 0);
264 assert(it
->next
!= it
);
265 assert(it
->prev
!= it
);
267 if (it
->next
) it
->next
->prev
= it
->prev
;
268 if (it
->prev
) it
->prev
->next
= it
->next
;
269 sizes
[it
->slabs_clsid
]--;
273 int do_item_link(item
*it
, const uint32_t hv
) {
274 MEMCACHED_ITEM_LINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
275 assert((it
->it_flags
& (ITEM_LINKED
|ITEM_SLABBED
)) == 0);
276 mutex_lock(&cache_lock
);
277 it
->it_flags
|= ITEM_LINKED
;
278 it
->time
= current_time
;
281 stats
.curr_bytes
+= ITEM_ntotal(it
);
282 stats
.curr_items
+= 1;
283 stats
.total_items
+= 1;
286 /* Allocate a new CAS ID on link. */
287 ITEM_set_cas(it
, (settings
.use_cas
) ? get_cas_id() : 0);
288 assoc_insert(it
, hv
);
290 refcount_incr(&it
->refcount
);
291 pthread_mutex_unlock(&cache_lock
);
296 void do_item_unlink(item
*it
, const uint32_t hv
) {
297 MEMCACHED_ITEM_UNLINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
298 mutex_lock(&cache_lock
);
299 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
300 it
->it_flags
&= ~ITEM_LINKED
;
302 stats
.curr_bytes
-= ITEM_ntotal(it
);
303 stats
.curr_items
-= 1;
305 assoc_delete(ITEM_key(it
), it
->nkey
, hv
);
309 pthread_mutex_unlock(&cache_lock
);
312 /* FIXME: Is it necessary to keep this copy/pasted code? */
313 void do_item_unlink_nolock(item
*it
, const uint32_t hv
) {
314 MEMCACHED_ITEM_UNLINK(ITEM_key(it
), it
->nkey
, it
->nbytes
);
315 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
316 it
->it_flags
&= ~ITEM_LINKED
;
318 stats
.curr_bytes
-= ITEM_ntotal(it
);
319 stats
.curr_items
-= 1;
321 assoc_delete(ITEM_key(it
), it
->nkey
, hv
);
327 void do_item_remove(item
*it
) {
328 MEMCACHED_ITEM_REMOVE(ITEM_key(it
), it
->nkey
, it
->nbytes
);
329 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
331 if (refcount_decr(&it
->refcount
) == 0) {
336 void do_item_update(item
*it
) {
337 MEMCACHED_ITEM_UPDATE(ITEM_key(it
), it
->nkey
, it
->nbytes
);
338 if (it
->time
< current_time
- ITEM_UPDATE_INTERVAL
) {
339 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
341 mutex_lock(&cache_lock
);
342 if ((it
->it_flags
& ITEM_LINKED
) != 0) {
344 it
->time
= current_time
;
347 pthread_mutex_unlock(&cache_lock
);
351 int do_item_replace(item
*it
, item
*new_it
, const uint32_t hv
) {
352 MEMCACHED_ITEM_REPLACE(ITEM_key(it
), it
->nkey
, it
->nbytes
,
353 ITEM_key(new_it
), new_it
->nkey
, new_it
->nbytes
);
354 assert((it
->it_flags
& ITEM_SLABBED
) == 0);
356 do_item_unlink(it
, hv
);
357 return do_item_link(new_it
, hv
);
360 #ifndef __INTEL_COMPILER
361 #pragma GCC diagnostic ignored "-Wshadow"
365 char *do_item_cachedump(const unsigned int slabs_clsid
, const unsigned int limit
, unsigned int *bytes
) {
366 unsigned int memlimit
= 2 * 1024 * 1024; /* 2MB max response size */
368 unsigned int bufcurr
;
371 unsigned int shown
= 0;
372 char key_temp
[KEY_MAX_LENGTH
+ 1];
375 it
= heads
[slabs_clsid
];
377 buffer
= malloc((size_t)memlimit
);
378 if (buffer
== 0) return NULL
;
381 while (it
!= NULL
&& (limit
== 0 || shown
< limit
)) {
382 assert(it
->nkey
<= KEY_MAX_LENGTH
);
383 /* Copy the key since it may not be null-terminated in the struct */
384 strncpy(key_temp
, ITEM_key(it
), it
->nkey
);
385 key_temp
[it
->nkey
] = 0x00; /* terminate */
386 len
= snprintf(temp
, sizeof(temp
), "ITEM %s [%d b; %lu s]\r\n",
387 key_temp
, it
->nbytes
- 2,
388 (unsigned long)it
->exptime
+ process_started
);
389 if (bufcurr
+ len
+ 6 > memlimit
) /* 6 is END\r\n\0 */
391 memcpy(buffer
+ bufcurr
, temp
, len
);
397 memcpy(buffer
+ bufcurr
, "END\r\n", 6);
404 void item_stats_evictions(uint64_t *evicted
) {
406 mutex_lock(&cache_lock
);
407 for (i
= 0; i
< LARGEST_ID
; i
++) {
408 evicted
[i
] = itemstats
[i
].evicted
;
410 pthread_mutex_unlock(&cache_lock
);
413 void do_item_stats(ADD_STAT add_stats
, void *c
) {
415 for (i
= 0; i
< LARGEST_ID
; i
++) {
416 if (tails
[i
] != NULL
) {
417 const char *fmt
= "items:%d:%s";
418 char key_str
[STAT_KEY_LEN
];
419 char val_str
[STAT_VAL_LEN
];
420 int klen
= 0, vlen
= 0;
421 if (tails
[i
] == NULL
) {
422 /* We removed all of the items in this slab class */
425 APPEND_NUM_FMT_STAT(fmt
, i
, "number", "%u", sizes
[i
]);
426 APPEND_NUM_FMT_STAT(fmt
, i
, "age", "%u", current_time
- tails
[i
]->time
);
427 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted",
428 "%llu", (unsigned long long)itemstats
[i
].evicted
);
429 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_nonzero",
430 "%llu", (unsigned long long)itemstats
[i
].evicted_nonzero
);
431 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_time",
432 "%u", itemstats
[i
].evicted_time
);
433 APPEND_NUM_FMT_STAT(fmt
, i
, "outofmemory",
434 "%llu", (unsigned long long)itemstats
[i
].outofmemory
);
435 APPEND_NUM_FMT_STAT(fmt
, i
, "tailrepairs",
436 "%llu", (unsigned long long)itemstats
[i
].tailrepairs
);
437 APPEND_NUM_FMT_STAT(fmt
, i
, "reclaimed",
438 "%llu", (unsigned long long)itemstats
[i
].reclaimed
);
439 APPEND_NUM_FMT_STAT(fmt
, i
, "expired_unfetched",
440 "%llu", (unsigned long long)itemstats
[i
].expired_unfetched
);
441 APPEND_NUM_FMT_STAT(fmt
, i
, "evicted_unfetched",
442 "%llu", (unsigned long long)itemstats
[i
].evicted_unfetched
);
446 /* getting here means both ascii and binary terminators fit */
447 add_stats(NULL
, 0, NULL
, 0, c
);
450 /** dumps out a list of objects of each size, with granularity of 32 bytes */
452 void do_item_stats_sizes(ADD_STAT add_stats
, void *c
) {
454 /* max 1MB object, divided into 32 bytes size buckets */
455 const int num_buckets
= 32768;
456 unsigned int *histogram
= calloc(num_buckets
, sizeof(int));
458 if (histogram
!= NULL
) {
461 /* build the histogram */
462 for (i
= 0; i
< LARGEST_ID
; i
++) {
463 item
*iter
= heads
[i
];
465 int ntotal
= ITEM_ntotal(iter
);
466 int bucket
= ntotal
/ 32;
467 if ((ntotal
% 32) != 0) bucket
++;
468 if (bucket
< num_buckets
) histogram
[bucket
]++;
473 /* write the buffer */
474 for (i
= 0; i
< num_buckets
; i
++) {
475 if (histogram
[i
] != 0) {
477 snprintf(key
, sizeof(key
), "%d", i
* 32);
478 APPEND_STAT(key
, "%u", histogram
[i
]);
483 add_stats(NULL
, 0, NULL
, 0, c
);
486 /** wrapper around assoc_find which does the lazy expiration logic */
487 item
*do_item_get(const char *key
, const size_t nkey
, const uint32_t hv
) {
488 mutex_lock(&cache_lock
);
489 item
*it
= assoc_find(key
, nkey
, hv
);
491 refcount_incr(&it
->refcount
);
492 /* Optimization for slab reassignment. prevents popular items from
493 * jamming in busy wait. Can only do this here to satisfy lock order
494 * of item_lock, cache_lock, slabs_lock. */
495 if (slab_rebalance_signal
&&
496 ((void *)it
>= slab_rebal
.slab_start
&& (void *)it
< slab_rebal
.slab_end
)) {
497 do_item_unlink_nolock(it
, hv
);
502 pthread_mutex_unlock(&cache_lock
);
505 if (settings
.verbose
> 2) {
507 fprintf(stderr
, "> NOT FOUND %s", key
);
509 fprintf(stderr
, "> FOUND KEY %s", ITEM_key(it
));
515 if (settings
.oldest_live
!= 0 && settings
.oldest_live
<= current_time
&&
516 it
->time
<= settings
.oldest_live
) {
517 do_item_unlink(it
, hv
);
521 fprintf(stderr
, " -nuked by flush");
523 } else if (it
->exptime
!= 0 && it
->exptime
<= current_time
) {
524 do_item_unlink(it
, hv
);
528 fprintf(stderr
, " -nuked by expire");
531 it
->it_flags
|= ITEM_FETCHED
;
532 DEBUG_REFCNT(it
, '+');
536 if (settings
.verbose
> 2)
537 fprintf(stderr
, "\n");
542 item
*do_item_touch(const char *key
, size_t nkey
, uint32_t exptime
,
544 item
*it
= do_item_get(key
, nkey
, hv
);
546 it
->exptime
= exptime
;
551 /* expires items that are more recent than the oldest_live setting. */
552 void do_item_flush_expired(void) {
555 if (settings
.oldest_live
== 0)
557 for (i
= 0; i
< LARGEST_ID
; i
++) {
558 /* The LRU is sorted in decreasing time order, and an item's timestamp
559 * is never newer than its last access time, so we only need to walk
560 * back until we hit an item older than the oldest_live time.
561 * The oldest_live checking will auto-expire the remaining items.
563 for (iter
= heads
[i
]; iter
!= NULL
; iter
= next
) {
564 if (iter
->time
>= settings
.oldest_live
) {
566 if ((iter
->it_flags
& ITEM_SLABBED
) == 0) {
567 do_item_unlink_nolock(iter
, hash(ITEM_key(iter
), iter
->nkey
, 0));
570 /* We've hit the first old item. Continue to the next queue. */