d070502c023c70c8d99e8b44032afe715c84659f
[awesomized/libmemcached] / memcached / memcached.h
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /** \file
4 * The main memcached header holding commonly used data
5 * structures and function prototypes.
6 */
7
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #include <stdbool.h>
13 #include <stdlib.h>
14 #include <sys/types.h>
15 #include <sys/socket.h>
16 #include <sys/time.h>
17 #include <netinet/in.h>
18 #include <event.h>
19 #include <netdb.h>
20 #include <pthread.h>
21 #include <unistd.h>
22
23 #include "protocol_binary.h"
24 #include "cache.h"
25
26 #include "sasl_defs.h"
27
28 /** Maximum length of a key. */
29 #define KEY_MAX_LENGTH 250
30
31 /** Size of an incr buf. */
32 #define INCR_MAX_STORAGE_LEN 24
33
34 #define DATA_BUFFER_SIZE 2048
35 #define UDP_READ_BUFFER_SIZE 65536
36 #define UDP_MAX_PAYLOAD_SIZE 1400
37 #define UDP_HEADER_SIZE 8
38 #define MAX_SENDBUF_SIZE (256 * 1024 * 1024)
39 /* I'm told the max length of a 64-bit num converted to string is 20 bytes.
40 * Plus a few for spaces, \r\n, \0 */
41 #define SUFFIX_SIZE 24
42
43 /** Initial size of list of items being returned by "get". */
44 #define ITEM_LIST_INITIAL 200
45
46 /** Initial size of list of CAS suffixes appended to "gets" lines. */
47 #define SUFFIX_LIST_INITIAL 20
48
49 /** Initial size of the sendmsg() scatter/gather array. */
50 #define IOV_LIST_INITIAL 400
51
52 /** Initial number of sendmsg() argument structures to allocate. */
53 #define MSG_LIST_INITIAL 10
54
55 /** High water marks for buffer shrinking */
56 #define READ_BUFFER_HIGHWAT 8192
57 #define ITEM_LIST_HIGHWAT 400
58 #define IOV_LIST_HIGHWAT 600
59 #define MSG_LIST_HIGHWAT 100
60
61 /* Binary protocol stuff */
62 #define MIN_BIN_PKT_LENGTH 16
63 #define BIN_PKT_HDR_WORDS (MIN_BIN_PKT_LENGTH/sizeof(uint32_t))
64
65 /* Initial power multiplier for the hash table */
66 #define HASHPOWER_DEFAULT 16
67
68 /* unistd.h is here */
69 #if HAVE_UNISTD_H
70 # include <unistd.h>
71 #endif
72
73 /* Slab sizing definitions. */
74 #define POWER_SMALLEST 1
75 #define POWER_LARGEST 200
76 #define CHUNK_ALIGN_BYTES 8
77 #define DONT_PREALLOC_SLABS
78 #define MAX_NUMBER_OF_SLAB_CLASSES (POWER_LARGEST + 1)
79
80 /** How long an object can reasonably be assumed to be locked before
81 harvesting it on a low memory condition. */
82 #define TAIL_REPAIR_TIME (3 * 3600)
83
84 /* Warning: don't pass an expression with side effects (e.g. a function call) to these macros; they evaluate their argument more than once. */
85 #define ITEM_get_cas(i) (((i)->it_flags & ITEM_CAS) ? \
86 (i)->data->cas : (uint64_t)0)
87
88 #define ITEM_set_cas(i,v) { \
89 if ((i)->it_flags & ITEM_CAS) { \
90 (i)->data->cas = v; \
91 } \
92 }
93
94 #define ITEM_key(item) (((char*)&((item)->data)) \
95 + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
96
97 #define ITEM_suffix(item) ((char*) &((item)->data) + (item)->nkey + 1 \
98 + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
99
100 #define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
101 + (item)->nsuffix \
102 + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
103
104 #define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 \
105 + (item)->nsuffix + (item)->nbytes \
106 + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
107
108 #define STAT_KEY_LEN 128
109 #define STAT_VAL_LEN 128
110
111 /** Append a simple stat with a stat name, value format and value.
 *
 * Expands to a call to append_stat() that passes the identifiers
 * `add_stats` and `c` taken from the scope where the macro is used.
 * The expansion already ends in a semicolon. */
112 #define APPEND_STAT(name, fmt, val) \
113 append_stat(name, add_stats, c, fmt, val);
114
115 /** Append an indexed stat with a stat name (with format), value format
116 and value.
 *
 * Expands to three separate statements that use the identifiers
 * `klen`, `vlen`, `key_str`, `val_str`, `add_stats` and `c` from the
 * scope where the macro is used. Because the expansion is not wrapped
 * in a single statement, do not use it as the body of an unbraced
 * if/else or loop.
 *
 * NOTE(review): key_str and val_str are presumably buffers of at least
 * STAT_KEY_LEN and STAT_VAL_LEN bytes -- confirm at the call sites. */
117 #define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val) \
118 klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name); \
119 vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val); \
120 add_stats(key_str, klen, val_str, vlen, c);
121
122 /** Common APPEND_NUM_FMT_STAT format: the key is built from num and name
 * using the format "%d:%s". Inherits the multi-statement expansion of
 * APPEND_NUM_FMT_STAT. */
123 #define APPEND_NUM_STAT(num, name, fmt, val) \
124 APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)
125
126 /**
127 * Callback for any function producing stats.
128 *
129 * @param key the stat's key
130 * @param klen length of the key
131 * @param val the stat's value in an ascii form (e.g. text form of a number)
132 * @param vlen length of the value
133 * @param cookie magic callback cookie
134 */
135 typedef void (*ADD_STAT)(const char *key, const uint16_t klen,
136 const char *val, const uint32_t vlen,
137 const void *cookie);
138
139 /*
140 * NOTE: If you modify this table you _MUST_ update the function state_text
141 */
142 /**
143 * Possible states of a connection.
144 */
145 enum conn_states {
146 conn_listening, /**< the socket which listens for connections */
147 conn_new_cmd, /**< Prepare connection for next command */
148 conn_waiting, /**< waiting for a readable socket */
149 conn_read, /**< reading in a command line */
150 conn_parse_cmd, /**< try to parse a command from the input buffer */
151 conn_write, /**< writing out a simple response */
152 conn_nread, /**< reading in a fixed number of bytes */
153 conn_swallow, /**< swallowing unnecessary bytes w/o storing */
154 conn_closing, /**< closing this connection */
155 conn_mwrite, /**< writing out many items sequentially */
156 conn_max_state /**< Max state value (used for assertion) */
157 };
158
/**
 * Sub-state values held in conn::substate.
 * NOTE(review): presumably these track progress through a binary-protocol
 * request; confirm against the code that assigns conn::substate.
 */
159 enum bin_substates {
160 bin_no_state,
161 bin_reading_set_header,
162 bin_reading_cas_header,
163 bin_read_set_value,
164 bin_reading_get_key,
165 bin_reading_stat,
166 bin_reading_del_header,
167 bin_reading_incr_header,
168 bin_read_flush_exptime,
169 bin_reading_sasl_auth,
170 bin_reading_sasl_auth_data,
171 bin_reading_touch_key,
172 };
173
174 enum protocol {
175 ascii_prot = 3, /* arbitrary value. */
176 binary_prot,
177 negotiating_prot /* Discovering the protocol */
178 };
179
/**
 * Transport a connection is carried over; this is the type of the
 * `transport` member of struct conn.
 */
enum network_transport {
    local_transport, /* Unix sockets */
    tcp_transport,
    udp_transport
};

/**
 * Evaluates to non-zero when the transport value x equals udp_transport.
 * The argument is parenthesized so that compound expressions (bitwise
 * operators, ?:, ...) are compared as a whole instead of being split by
 * the higher precedence of ==.
 */
#define IS_UDP(x) ((x) == udp_transport)
187
188 #define NREAD_ADD 1
189 #define NREAD_SET 2
190 #define NREAD_REPLACE 3
191 #define NREAD_APPEND 4
192 #define NREAD_PREPEND 5
193 #define NREAD_CAS 6
194
/** Result codes returned by do_store_item() and store_item(). */
195 enum store_item_type {
196 NOT_STORED=0, STORED, EXISTS, NOT_FOUND
197 };
198
/** Result codes returned by do_add_delta() and add_delta().
 * NOTE(review): EOM presumably stands for "out of memory" -- confirm. */
199 enum delta_result_type {
200 OK, NON_NUMERIC, EOM, DELTA_ITEM_NOT_FOUND, DELTA_ITEM_CAS_MISMATCH
201 };
202
203 /** Time relative to server start. Smaller than time_t on 64-bit systems. */
204 typedef unsigned int rel_time_t;
205
206 /** Stats stored per slab (and per thread). */
207 struct slab_stats {
208 uint64_t set_cmds;
209 uint64_t get_hits;
210 uint64_t touch_hits;
211 uint64_t delete_hits;
212 uint64_t cas_hits;
213 uint64_t cas_badval;
214 uint64_t incr_hits;
215 uint64_t decr_hits;
216 };
217
218 /**
219 * Stats stored per-thread.
220 */
221 struct thread_stats {
222 pthread_mutex_t mutex;
223 uint64_t get_cmds;
224 uint64_t get_misses;
225 uint64_t touch_cmds;
226 uint64_t touch_misses;
227 uint64_t delete_misses;
228 uint64_t incr_misses;
229 uint64_t decr_misses;
230 uint64_t cas_misses;
231 uint64_t bytes_read;
232 uint64_t bytes_written;
233 uint64_t flush_cmds;
234 uint64_t conn_yields; /* # of yields for connections (-R option)*/
235 uint64_t auth_cmds;
236 uint64_t auth_errors;
237 struct slab_stats slab_stats[MAX_NUMBER_OF_SLAB_CLASSES];
238 };
239
240 /**
241 * Global stats.
242 */
243 struct stats {
244 pthread_mutex_t mutex;
245 unsigned int curr_items;
246 unsigned int total_items;
247 uint64_t curr_bytes;
248 unsigned int curr_conns;
249 unsigned int total_conns;
250 uint64_t rejected_conns;
251 unsigned int reserved_fds;
252 unsigned int conn_structs;
253 uint64_t get_cmds;
254 uint64_t set_cmds;
255 uint64_t touch_cmds;
256 uint64_t get_hits;
257 uint64_t get_misses;
258 uint64_t touch_hits;
259 uint64_t touch_misses;
260 uint64_t evictions;
261 uint64_t reclaimed;
262 time_t started; /* when the process was started */
263 bool accepting_conns; /* whether we are currently accepting */
264 uint64_t listen_disabled_num;
265 unsigned int hash_power_level; /* Better hope it's not over 9000 */
266 uint64_t hash_bytes; /* size used for hash tables */
267 bool hash_is_expanding; /* If the hash table is being expanded */
268 uint64_t expired_unfetched; /* items reclaimed but never touched */
269 uint64_t evicted_unfetched; /* items evicted but never touched */
270 bool slab_reassign_running; /* slab reassign in progress */
271 uint64_t slabs_moved; /* times slabs were moved around */
272 };
273
274 #define MAX_VERBOSITY_LEVEL 2
275
276 /* When adding a setting, be sure to update process_stat_settings */
277 /**
278 * Globally accessible settings as derived from the commandline.
279 */
280 struct settings {
281 size_t maxbytes;
282 int maxconns;
283 int port;
284 int udpport;
285 char *inter;
286 int verbose;
287 rel_time_t oldest_live; /* ignore existing items older than this */
288 int evict_to_free;
289 char *socketpath; /* path to unix socket if using local socket */
290 int access; /* access mask (a la chmod) for unix domain socket */
291 double factor; /* chunk size growth factor */
292 int chunk_size;
293 int num_threads; /* number of worker (without dispatcher) libevent threads to run */
294 int num_threads_per_udp; /* number of worker threads serving each udp socket */
295 char prefix_delimiter; /* character that marks a key prefix (for stats) */
296 int detail_enabled; /* nonzero if we're collecting detailed stats */
297 int reqs_per_event; /* Maximum number of io to process on each
298 io-event. */
299 bool use_cas;
300 enum protocol binding_protocol;
301 int backlog;
302 int item_size_max; /* Maximum item size, and upper end for slabs */
303 bool sasl; /* SASL on/off */
304 bool maxconns_fast; /* Whether or not to early close connections */
305 bool slab_reassign; /* Whether or not slab reassignment is allowed */
306 bool slab_automove; /* Whether or not to automatically move slabs */
307 int hashpower_init; /* Starting hash power level */
308 };
309
310 #ifndef __INTEL_COMPILER
311 #pragma GCC diagnostic ignored "-Wshadow"
312 #endif
313 extern struct stats stats;
314 extern time_t process_started;
315 extern struct settings settings;
316
317 #define ITEM_LINKED 1
318 #define ITEM_CAS 2
319
320 /* temp */
321 #define ITEM_SLABBED 4
322
323 #define ITEM_FETCHED 8
324
325 #ifndef __INTEL_COMPILER
326 #pragma GCC diagnostic ignored "-Wshadow"
327 #endif
328 /**
329 * Structure for storing items within memcached.
 *
 * The fixed-size header below is followed by variable-length data that
 * starts at the flexible array member `data`. The ITEM_key, ITEM_suffix,
 * ITEM_data and ITEM_ntotal macros compute offsets and the total size
 * from nkey, nsuffix, nbytes and the ITEM_CAS bit of it_flags.
330 */
331 typedef struct _stritem {
332 struct _stritem *next;
333 struct _stritem *prev;
334 struct _stritem *h_next; /* hash chain next */
335 rel_time_t time; /* least recent access */
336 rel_time_t exptime; /* expire time */
337 int nbytes; /* size of data */
338 unsigned short refcount;
339 uint8_t nsuffix; /* length of flags-and-length string */
340 uint8_t it_flags; /* ITEM_* above */
341 uint8_t slabs_clsid;/* which slab class we're in */
342 uint8_t nkey; /* key length, w/terminating null and padding */
/* NOTE(review): ITEM_suffix, ITEM_data and ITEM_ntotal all add 1 to nkey,
 * which suggests nkey does NOT count the terminating null -- confirm and
 * correct the comment on nkey if so. */
343 /* this odd type prevents type-punning issues when we do
344 * the little shuffle to save space when not using CAS. */
345 union {
346 uint64_t cas;
347 char end;
348 } data[];
349 /* if it_flags & ITEM_CAS we have 8 bytes CAS */
350 /* then null-terminated key */
351 /* then " flags length\r\n" (no terminating null) */
352 /* then data with terminating \r\n (no terminating null; it's binary!) */
353 } item;
354
355 typedef struct {
356 pthread_t thread_id; /* unique ID of this thread */
357 struct event_base *base; /* libevent handle this thread uses */
358 struct event notify_event; /* listen event for notify pipe */
359 int notify_receive_fd; /* receiving end of notify pipe */
360 int notify_send_fd; /* sending end of notify pipe */
361 struct thread_stats stats; /* Stats generated by this thread */
362 struct conn_queue *new_conn_queue; /* queue of new connections to handle */
363 cache_t *suffix_cache; /* suffix cache */
364 } LIBEVENT_THREAD;
365
366 typedef struct {
367 pthread_t thread_id; /* unique ID of this thread */
368 struct event_base *base; /* libevent handle this thread uses */
369 } LIBEVENT_DISPATCHER_THREAD;
370
371 /**
372 * The structure representing a connection into memcached.
373 */
374 typedef struct conn conn;
375 struct conn {
376 int sfd;
377 sasl_conn_t *sasl_conn;
378 enum conn_states state;
379 enum bin_substates substate;
380 struct event event;
381 short ev_flags;
382 short which; /** which events were just triggered */
383
384 char *rbuf; /** buffer to read commands into */
385 char *rcurr; /** but if we parsed some already, this is where we stopped */
386 int rsize; /** total allocated size of rbuf */
387 int rbytes; /** how much data, starting from rcur, do we have unparsed */
388
389 char *wbuf;
390 char *wcurr;
391 int wsize;
392 int wbytes;
393 /** which state to go into after finishing current write */
394 enum conn_states write_and_go;
395 void *write_and_free; /** free this memory after finishing writing */
396
397 char *ritem; /** when we read in an item's value, it goes here */
398 int rlbytes;
399
400 /* data for the nread state */
401
402 /**
403 * item is used to hold an item structure created after reading the command
404 * line of set/add/replace commands, but before we finished reading the actual
405 * data. The data is read into ITEM_data(item) to avoid extra copying.
406 */
407
408 void *item; /* for commands set/add/replace */
409
410 /* data for the swallow state */
411 int sbytes; /* how many bytes to swallow */
412
413 /* data for the mwrite state */
414 struct iovec *iov;
415 int iovsize; /* number of elements allocated in iov[] */
416 int iovused; /* number of elements used in iov[] */
417
418 struct msghdr *msglist;
419 int msgsize; /* number of elements allocated in msglist[] */
420 int msgused; /* number of elements used in msglist[] */
421 int msgcurr; /* element in msglist[] being transmitted now */
422 int msgbytes; /* number of bytes in current msg */
423
424 item **ilist; /* list of items to write out */
425 int isize;
426 item **icurr;
427 int ileft;
428
429 char **suffixlist;
430 int suffixsize;
431 char **suffixcurr;
432 int suffixleft;
433
434 enum protocol protocol; /* which protocol this connection speaks */
435 enum network_transport transport; /* what transport is used by this connection */
436
437 /* data for UDP clients */
438 int request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
439 struct sockaddr request_addr; /* Who sent the most recent request */
440 socklen_t request_addr_size;
441 unsigned char *hdrbuf; /* udp packet headers */
442 int hdrsize; /* number of headers' worth of space is allocated */
443
444 bool noreply; /* True if the reply should not be sent. */
445 /* current stats command */
446 struct {
447 char *buffer;
448 size_t size;
449 size_t offset;
450 } stats;
451
452 /* Binary protocol stuff */
453 /* This is where the binary header goes */
454 protocol_binary_request_header binary_header;
455 uint64_t cas; /* the cas to return */
456 short cmd; /* current command being processed */
457 int opaque;
458 int keylen;
459 conn *next; /* Used for generating a list of conn structures */
460 LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
461 };
462
463
464 /* current time of day (updated periodically) */
465 extern volatile rel_time_t current_time;
466
467 /* TODO: Move to slabs.h? */
468 extern volatile int slab_rebalance_signal;
469
470 struct slab_rebalance {
471 void *slab_start;
472 void *slab_end;
473 void *slab_pos;
474 int s_clsid;
475 int d_clsid;
476 int busy_items;
477 uint8_t done;
478 };
479
480 extern struct slab_rebalance slab_rebal;
481
482 /*
483 * Functions
484 */
485 void do_accept_new_conns(const bool do_accept);
486 enum delta_result_type do_add_delta(conn *c, const char *key,
487 const size_t nkey, const bool incr,
488 const int64_t delta, char *buf,
489 uint64_t *cas, const uint32_t hv);
490 enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv);
491 conn *conn_new(const int sfd, const enum conn_states init_state, const int event_flags, const int read_buffer_size, enum network_transport transport, struct event_base *base);
492 extern int daemonize(int nochdir, int noclose);
493
/**
 * Acquire a mutex, blocking until it becomes available.
 *
 * Delegates to pthread_mutex_lock() instead of spinning on
 * pthread_mutex_trylock(): a spin loop burns CPU for as long as the lock
 * is contended and never terminates when trylock keeps failing for a
 * reason other than contention (for example an invalid mutex).
 *
 * @param mutex the mutex to lock
 * @return 0 on success, otherwise the error number reported by
 *         pthread_mutex_lock()
 */
static inline int mutex_lock(pthread_mutex_t *mutex)
{
    return pthread_mutex_lock(mutex);
}
499
500 #define mutex_unlock(x) pthread_mutex_unlock(x)
501
502 #include "stats.h"
503 #include "slabs.h"
504 #include "assoc.h"
505 #include "items.h"
506 #include "trace.h"
507 #include "hash.h"
508 #include "util.h"
509
510 /*
511 * Functions such as the libevent-related calls that need to do cross-thread
512 * communication in multithreaded mode (rather than actually doing the work
513 * in the current thread) are called via "dispatch_" frontends, which are
514 * also #define-d to directly call the underlying code in singlethreaded mode.
515 */
516
517 void thread_init(int nthreads, struct event_base *main_base);
518 int dispatch_event_add(int thread, conn *c);
519 void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, int read_buffer_size, enum network_transport transport);
520
521 /* Lock wrappers for cache functions that are called from main loop. */
522 enum delta_result_type add_delta(conn *c, const char *key,
523 const size_t nkey, const int incr,
524 const int64_t delta, char *buf,
525 uint64_t *cas);
526 void accept_new_conns(const bool do_accept);
527 conn *conn_from_freelist(void);
528 bool conn_add_to_freelist(conn *c);
529 int is_listen_thread(void);
530 item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes);
531 char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes);
532 void item_flush_expired(void);
533 item *item_get(const char *key, const size_t nkey);
534 item *item_touch(const char *key, const size_t nkey, uint32_t exptime);
535 int item_link(item *it);
536 void item_remove(item *it);
537 int item_replace(item *it, item *new_it, const uint32_t hv);
538 void item_stats(ADD_STAT add_stats, void *c);
539 void item_stats_sizes(ADD_STAT add_stats, void *c);
540 void item_unlink(item *it);
541 void item_update(item *it);
542
543 void item_lock(uint32_t hv);
544 void item_unlock(uint32_t hv);
545 unsigned short refcount_incr(unsigned short *refcount);
546 unsigned short refcount_decr(unsigned short *refcount);
547 void STATS_LOCK(void);
548 void STATS_UNLOCK(void);
549 void threadlocal_stats_reset(void);
550 void threadlocal_stats_aggregate(struct thread_stats *stats);
551 void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out);
552
553 /* Stat processing functions */
554 void append_stat(const char *name, ADD_STAT add_stats, conn *c,
555 const char *fmt, ...);
556
557 enum store_item_type store_item(item *item, int comm, conn *c);
558
559 #if defined(HAVE_DROP_PRIVILEGES) && HAVE_DROP_PRIVILEGES
560 extern void drop_privileges(void);
561 #else
562 #define drop_privileges()
563 #endif
564
/* If supported, give compiler hints for branch prediction. */
#if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
#define __builtin_expect(x, expected_value) (x)
#endif

/*
 * Branch-prediction hints for use in conditions. The argument is
 * normalized to 0 or 1 with !!(x) before being handed to
 * __builtin_expect(), whose first parameter is a long: without the
 * normalization pointers, floating-point values and integers wider than
 * long would be converted (and possibly truncated) before being tested.
 * Both macros therefore evaluate to 1 when x is true and 0 otherwise.
 */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)