edd7fcf8cfefbbc42dbcb732785a0778524f18f6
[awesomized/libmemcached] / memcached / memcached.h
1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /** \file
4 * The main memcached header holding commonly used data
5 * structures and function prototypes.
6 */
7
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11
12 #include <stdbool.h>
13 #include <sys/types.h>
14 #include <sys/socket.h>
15 #include <sys/time.h>
16 #include <netinet/in.h>
17 #include <event.h>
18 #include <netdb.h>
19 #include <pthread.h>
20 #include <unistd.h>
21
22 #include "protocol_binary.h"
23 #include "cache.h"
24
25 #include "sasl_defs.h"
26
/** Longest key accepted, in bytes. */
#define KEY_MAX_LENGTH 250

/** Size of the buffer used for an incr value. */
#define INCR_MAX_STORAGE_LEN 24

/* General-purpose and UDP buffer sizing. */
#define DATA_BUFFER_SIZE 2048
#define UDP_READ_BUFFER_SIZE 65536
#define UDP_MAX_PAYLOAD_SIZE 1400
#define UDP_HEADER_SIZE 8
#define MAX_SENDBUF_SIZE (256 * 1024 * 1024)

/* A 64-bit number converted to a string is reportedly at most 20 bytes;
 * the remainder leaves room for spaces, \r\n and \0. */
#define SUFFIX_SIZE 24

/** Initial size of the list of items being returned by "get". */
#define ITEM_LIST_INITIAL 200

/** Initial size of the list of CAS suffixes appended to "gets" lines. */
#define SUFFIX_LIST_INITIAL 20

/** Initial size of the sendmsg() scatter/gather array. */
#define IOV_LIST_INITIAL 400

/** Initial number of sendmsg() argument structures to allocate. */
#define MSG_LIST_INITIAL 10

/** High water marks for buffer shrinking. */
#define READ_BUFFER_HIGHWAT 8192
#define ITEM_LIST_HIGHWAT 400
#define IOV_LIST_HIGHWAT 600
#define MSG_LIST_HIGHWAT 100

/* Binary protocol: minimum packet length in bytes, and the same length
 * expressed as a count of 32-bit words. */
#define MIN_BIN_PKT_LENGTH 16
#define BIN_PKT_HDR_WORDS (MIN_BIN_PKT_LENGTH/sizeof(uint32_t))

/* Initial power multiplier for the hash table. */
#define HASHPOWER_DEFAULT 16
66
67 /* unistd.h is here */
68 #if HAVE_UNISTD_H
69 # include <unistd.h>
70 #endif
71
/* Slab sizing: smallest and largest slab class index, chunk alignment,
 * and the resulting number of slab-class slots. */
#define POWER_SMALLEST 1
#define POWER_LARGEST 200
#define CHUNK_ALIGN_BYTES 8
#define MAX_NUMBER_OF_SLAB_CLASSES (POWER_LARGEST + 1)

/** How long (in seconds: three hours) an object can reasonably be assumed
 *  to be locked before harvesting it on a low memory condition. */
#define TAIL_REPAIR_TIME (3 * 3600)
81
/*
 * Accessors for an item's header fields and variable-length tail.
 *
 * Warning: every macro below evaluates its item argument more than once,
 * so never pass an expression with side effects (e.g. a function call).
 *
 * Tail layout, as encoded by the offsets used here: an optional 8-byte CAS
 * (present only when it_flags has ITEM_CAS set), then nkey + 1 bytes of
 * key, then nsuffix bytes of suffix, then nbytes of data.
 */

/** CAS value of item i, or 0 when the item has no CAS field. */
#define ITEM_get_cas(i) (((i)->it_flags & ITEM_CAS) ? \
        (i)->data->cas : (uint64_t)0)

/**
 * Store v as the CAS value of item i; does nothing when ITEM_CAS is unset.
 * Wrapped in do/while(0) so the macro acts as one statement and can be the
 * body of an unbraced if/else. The value argument is parenthesized so that
 * compound expressions are assigned as a whole.
 */
#define ITEM_set_cas(i,v) do { \
    if ((i)->it_flags & ITEM_CAS) { \
        (i)->data->cas = (v); \
    } \
} while (0)

/** Pointer to the start of the key (skips the CAS field if present). */
#define ITEM_key(item) (((char*)&((item)->data)) \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

/** Pointer to the suffix, which follows the key and one extra byte. */
#define ITEM_suffix(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

/** Pointer to the data, which follows the suffix. */
#define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (item)->nsuffix \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))

/** Total size in bytes of the item: header plus every tail section. */
#define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 \
         + (item)->nsuffix + (item)->nbytes \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
105
/* Sizes of the scratch buffers used when formatting a stat key / value. */
#define STAT_KEY_LEN 128
#define STAT_VAL_LEN 128

/**
 * Append a simple stat with a stat name, value format and value.
 *
 * Expects `add_stats` and `c` to be in scope at the call site. The macro
 * expands to a single expression without a trailing semicolon, so the
 * caller supplies the `;` and the macro is safe in an unbraced if/else.
 */
#define APPEND_STAT(name, fmt, val) \
    append_stat(name, add_stats, c, fmt, val)

/**
 * Append an indexed stat with a stat name (with format), value format
 * and value.
 *
 * Expects `key_str`, `val_str`, `klen`, `vlen`, `add_stats` and `c` to be
 * in scope at the call site. Wrapped in do/while(0) so that all three steps
 * stay together when the macro is the body of an unbraced if/else.
 *
 * NOTE(review): snprintf returns the length the text would have had, so
 * klen/vlen can exceed STAT_KEY_LEN/STAT_VAL_LEN if the formatted text does
 * not fit -- confirm callers keep their inputs bounded.
 */
#define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val) \
    do { \
        klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name); \
        vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val); \
        add_stats(key_str, klen, val_str, vlen, c); \
    } while (0)

/** Common APPEND_NUM_FMT_STAT format: key is rendered as "<num>:<name>". */
#define APPEND_NUM_STAT(num, name, fmt, val) \
    APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)
123
/**
 * Callback type for any function producing stats.
 *
 * @param key    the stat's key
 * @param klen   length of the key
 * @param val    the stat's value in an ascii form (e.g. text form of a number)
 * @param vlen   length of the value
 * @param cookie magic callback cookie
 */
typedef void (*ADD_STAT)(const char *key,
                         const uint16_t klen,
                         const char *val,
                         const uint32_t vlen,
                         const void *cookie);
136
/*
 * NOTE: If you modify this table you _MUST_ update the function state_text
 */
/**
 * Possible states of a connection.
 */
enum conn_states {
    /** the socket which listens for connections */
    conn_listening,
    /** prepare connection for next command */
    conn_new_cmd,
    /** waiting for a readable socket */
    conn_waiting,
    /** reading in a command line */
    conn_read,
    /** try to parse a command from the input buffer */
    conn_parse_cmd,
    /** writing out a simple response */
    conn_write,
    /** reading in a fixed number of bytes */
    conn_nread,
    /** swallowing unnecessary bytes w/o storing */
    conn_swallow,
    /** closing this connection */
    conn_closing,
    /** writing out many items sequentially */
    conn_mwrite,
    /** max state value (used for assertion) */
    conn_max_state
};
156
/** Sub-states stored in a connection's `substate` field; the names refer
 *  to binary protocol processing steps. */
enum bin_substates {
    bin_no_state,
    bin_reading_set_header,
    bin_reading_cas_header,
    bin_read_set_value,
    bin_reading_get_key,
    bin_reading_stat,
    bin_reading_del_header,
    bin_reading_incr_header,
    bin_read_flush_exptime,
    bin_reading_sasl_auth,
    bin_reading_sasl_auth_data,
    bin_reading_touch_key
};
171
/** Wire protocol spoken on a connection. */
enum protocol {
    ascii_prot = 3,     /* arbitrary starting value */
    binary_prot,
    negotiating_prot    /* still discovering the protocol */
};
177
/** Transport used by a connection. */
enum network_transport {
    local_transport, /* Unix sockets */
    tcp_transport,
    udp_transport
};

/**
 * Non-zero when transport x is UDP.
 * The argument is parenthesized so that a compound expression (such as a
 * conditional) is compared as a whole rather than being split by `==`.
 */
#define IS_UDP(x) ((x) == udp_transport)
185
/* NREAD_* command codes, one per storage command variant. */
#define NREAD_ADD     1
#define NREAD_SET     2
#define NREAD_REPLACE 3
#define NREAD_APPEND  4
#define NREAD_PREPEND 5
#define NREAD_CAS     6
192
/** Result codes returned by the store_item / do_store_item functions. */
enum store_item_type {
    NOT_STORED = 0,
    STORED,
    EXISTS,
    NOT_FOUND
};

/** Result codes returned by the add_delta / do_add_delta functions. */
enum delta_result_type {
    OK,
    NON_NUMERIC,
    EOM,
    DELTA_ITEM_NOT_FOUND,
    DELTA_ITEM_CAS_MISMATCH
};
200
/**
 * Time relative to server start.
 * Smaller than time_t on 64-bit systems.
 */
typedef unsigned int rel_time_t;
203
/**
 * Counters kept per slab class (and per thread).
 */
struct slab_stats {
    uint64_t set_cmds;
    uint64_t get_hits;
    uint64_t touch_hits;
    uint64_t delete_hits;
    uint64_t cas_hits, cas_badval;
    uint64_t incr_hits, decr_hits;
};
215
216 /**
217 * Stats stored per-thread.
218 */
219 struct thread_stats {
220 pthread_mutex_t mutex;
221 uint64_t get_cmds;
222 uint64_t get_misses;
223 uint64_t touch_cmds;
224 uint64_t touch_misses;
225 uint64_t delete_misses;
226 uint64_t incr_misses;
227 uint64_t decr_misses;
228 uint64_t cas_misses;
229 uint64_t bytes_read;
230 uint64_t bytes_written;
231 uint64_t flush_cmds;
232 uint64_t conn_yields; /* # of yields for connections (-R option)*/
233 uint64_t auth_cmds;
234 uint64_t auth_errors;
235 struct slab_stats slab_stats[MAX_NUMBER_OF_SLAB_CLASSES];
236 };
237
/**
 * Process-wide (global) statistics.
 */
struct stats {
    pthread_mutex_t mutex;

    /* item and connection gauges / totals */
    unsigned int  curr_items;
    unsigned int  total_items;
    uint64_t      curr_bytes;
    unsigned int  curr_conns;
    unsigned int  total_conns;
    uint64_t      rejected_conns;
    unsigned int  reserved_fds;
    unsigned int  conn_structs;

    /* command counters */
    uint64_t      get_cmds;
    uint64_t      set_cmds;
    uint64_t      touch_cmds;
    uint64_t      get_hits;
    uint64_t      get_misses;
    uint64_t      touch_hits;
    uint64_t      touch_misses;
    uint64_t      evictions;
    uint64_t      reclaimed;

    time_t        started;               /* when the process was started */
    bool          accepting_conns;       /* whether we are currently accepting */
    uint64_t      listen_disabled_num;
    unsigned int  hash_power_level;      /* Better hope it's not over 9000 */
    uint64_t      hash_bytes;            /* size used for hash tables */
    bool          hash_is_expanding;     /* If the hash table is being expanded */
    uint64_t      expired_unfetched;     /* items reclaimed but never touched */
    uint64_t      evicted_unfetched;     /* items evicted but never touched */
    bool          slab_reassign_running; /* slab reassign in progress */
    uint64_t      slabs_moved;           /* times slabs were moved around */
};
271
272 #define MAX_VERBOSITY_LEVEL 2
273
274 /* When adding a setting, be sure to update process_stat_settings */
275 /**
276 * Globally accessible settings as derived from the commandline.
277 */
278 struct settings {
279 size_t maxbytes;
280 int maxconns;
281 int port;
282 int udpport;
283 char *inter;
284 int verbose;
285 rel_time_t oldest_live; /* ignore existing items older than this */
286 int evict_to_free;
287 char *socketpath; /* path to unix socket if using local socket */
288 int access; /* access mask (a la chmod) for unix domain socket */
289 double factor; /* chunk size growth factor */
290 int chunk_size;
291 int num_threads; /* number of worker (without dispatcher) libevent threads to run */
292 int num_threads_per_udp; /* number of worker threads serving each udp socket */
293 char prefix_delimiter; /* character that marks a key prefix (for stats) */
294 int detail_enabled; /* nonzero if we're collecting detailed stats */
295 int reqs_per_event; /* Maximum number of io to process on each
296 io-event. */
297 bool use_cas;
298 enum protocol binding_protocol;
299 int backlog;
300 int item_size_max; /* Maximum item size, and upper end for slabs */
301 bool sasl; /* SASL on/off */
302 bool maxconns_fast; /* Whether or not to early close connections */
303 bool slab_reassign; /* Whether or not slab reassignment is allowed */
304 int slab_automove; /* Whether or not to automatically move slabs */
305 int hashpower_init; /* Starting hash power level */
306 };
307
/* Process-wide state. These are declarations only; the objects themselves
 * are defined elsewhere. */
extern struct stats    stats;
extern time_t          process_started;
extern struct settings settings;
311
/* Bit flags stored in an item's it_flags field. */
#define ITEM_LINKED  1
#define ITEM_CAS     2

/* temp */
#define ITEM_SLABBED 4

#define ITEM_FETCHED 8
319
320 /**
321 * Structure for storing items within memcached.
322 */
323 typedef struct _stritem {
324 struct _stritem *next;
325 struct _stritem *prev;
326 struct _stritem *h_next; /* hash chain next */
327 rel_time_t time; /* least recent access */
328 rel_time_t exptime; /* expire time */
329 int nbytes; /* size of data */
330 unsigned short refcount;
331 uint8_t nsuffix; /* length of flags-and-length string */
332 uint8_t it_flags; /* ITEM_* above */
333 uint8_t slabs_clsid;/* which slab class we're in */
334 uint8_t nkey; /* key length, w/terminating null and padding */
335 /* this odd type prevents type-punning issues when we do
336 * the little shuffle to save space when not using CAS. */
337 union {
338 uint64_t cas;
339 char end;
340 } data[];
341 /* if it_flags & ITEM_CAS we have 8 bytes CAS */
342 /* then null-terminated key */
343 /* then " flags length\r\n" (no terminating null) */
344 /* then data with terminating \r\n (no terminating null; it's binary!) */
345 } item;
346
347 typedef struct {
348 pthread_t thread_id; /* unique ID of this thread */
349 struct event_base *base; /* libevent handle this thread uses */
350 struct event notify_event; /* listen event for notify pipe */
351 int notify_receive_fd; /* receiving end of notify pipe */
352 int notify_send_fd; /* sending end of notify pipe */
353 struct thread_stats stats; /* Stats generated by this thread */
354 struct conn_queue *new_conn_queue; /* queue of new connections to handle */
355 cache_t *suffix_cache; /* suffix cache */
356 } LIBEVENT_THREAD;
357
/** State kept for the dispatcher thread: its ID and its libevent base. */
typedef struct {
    pthread_t          thread_id; /* unique ID of this thread */
    struct event_base *base;      /* libevent handle this thread uses */
} LIBEVENT_DISPATCHER_THREAD;
362
363 /**
364 * The structure representing a connection into memcached.
365 */
366 typedef struct conn conn;
367 struct conn {
368 int sfd;
369 sasl_conn_t *sasl_conn;
370 enum conn_states state;
371 enum bin_substates substate;
372 struct event event;
373 short ev_flags;
374 short which; /** which events were just triggered */
375
376 char *rbuf; /** buffer to read commands into */
377 char *rcurr; /** but if we parsed some already, this is where we stopped */
378 int rsize; /** total allocated size of rbuf */
379 int rbytes; /** how much data, starting from rcur, do we have unparsed */
380
381 char *wbuf;
382 char *wcurr;
383 int wsize;
384 int wbytes;
385 /** which state to go into after finishing current write */
386 enum conn_states write_and_go;
387 void *write_and_free; /** free this memory after finishing writing */
388
389 char *ritem; /** when we read in an item's value, it goes here */
390 int rlbytes;
391
392 /* data for the nread state */
393
394 /**
395 * item is used to hold an item structure created after reading the command
396 * line of set/add/replace commands, but before we finished reading the actual
397 * data. The data is read into ITEM_data(item) to avoid extra copying.
398 */
399
400 void *item; /* for commands set/add/replace */
401
402 /* data for the swallow state */
403 int sbytes; /* how many bytes to swallow */
404
405 /* data for the mwrite state */
406 struct iovec *iov;
407 int iovsize; /* number of elements allocated in iov[] */
408 int iovused; /* number of elements used in iov[] */
409
410 struct msghdr *msglist;
411 int msgsize; /* number of elements allocated in msglist[] */
412 int msgused; /* number of elements used in msglist[] */
413 int msgcurr; /* element in msglist[] being transmitted now */
414 int msgbytes; /* number of bytes in current msg */
415
416 item **ilist; /* list of items to write out */
417 int isize;
418 item **icurr;
419 int ileft;
420
421 char **suffixlist;
422 int suffixsize;
423 char **suffixcurr;
424 int suffixleft;
425
426 enum protocol protocol; /* which protocol this connection speaks */
427 enum network_transport transport; /* what transport is used by this connection */
428
429 /* data for UDP clients */
430 int request_id; /* Incoming UDP request ID, if this is a UDP "connection" */
431 struct sockaddr request_addr; /* Who sent the most recent request */
432 socklen_t request_addr_size;
433 unsigned char *hdrbuf; /* udp packet headers */
434 int hdrsize; /* number of headers' worth of space is allocated */
435
436 bool noreply; /* True if the reply should not be sent. */
437 /* current stats command */
438 struct {
439 char *buffer;
440 size_t size;
441 size_t offset;
442 } stats;
443
444 /* Binary protocol stuff */
445 /* This is where the binary header goes */
446 protocol_binary_request_header binary_header;
447 uint64_t cas; /* the cas to return */
448 short cmd; /* current command being processed */
449 int opaque;
450 int keylen;
451 conn *next; /* Used for generating a list of conn structures */
452 LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */
453 };
454
455
456 /* current time of day (updated periodically) */
457 extern volatile rel_time_t current_time;
458
459 /* TODO: Move to slabs.h? */
460 extern volatile int slab_rebalance_signal;
461
/** Bookkeeping record for a slab rebalance; the single instance is
 *  declared below as slab_rebal. */
struct slab_rebalance {
    void   *slab_start;
    void   *slab_end;
    void   *slab_pos;
    int     s_clsid;
    int     d_clsid;
    int     busy_items;
    uint8_t done;
};

extern struct slab_rebalance slab_rebal;
473
474 /*
475 * Functions
476 */
477 void do_accept_new_conns(const bool do_accept);
478 enum delta_result_type do_add_delta(conn *c, const char *key,
479 const size_t nkey, const bool incr,
480 const int64_t delta, char *buf,
481 uint64_t *cas, const uint32_t hv);
482 enum store_item_type do_store_item(item *item, int comm, conn* c, const uint32_t hv);
483 conn *conn_new(const int sfd, const enum conn_states init_state, const int event_flags, const int read_buffer_size, enum network_transport transport, struct event_base *base);
484 extern int daemonize(int nochdir, int noclose);
485
/**
 * Acquire `mutex`, blocking until it becomes available.
 *
 * Delegates to pthread_mutex_lock() instead of spinning on
 * pthread_mutex_trylock(): a waiter no longer burns CPU, and an error
 * (e.g. an invalid mutex) is reported to the caller rather than being
 * retried forever.
 *
 * @param mutex the mutex to lock
 * @return 0 once the lock is held, otherwise the error number reported by
 *         pthread_mutex_lock()
 */
static inline int mutex_lock(pthread_mutex_t *mutex)
{
    return pthread_mutex_lock(mutex);
}

/** Release a mutex previously acquired with mutex_lock(). */
#define mutex_unlock(x) pthread_mutex_unlock(x)
493
494 #include "stats.h"
495 #include "slabs.h"
496 #include "assoc.h"
497 #include "items.h"
498 #include "trace.h"
499 #include "hash.h"
500 #include "util.h"
501
502 /*
503 * Functions such as the libevent-related calls that need to do cross-thread
504 * communication in multithreaded mode (rather than actually doing the work
505 * in the current thread) are called via "dispatch_" frontends, which are
506 * also #define-d to directly call the underlying code in singlethreaded mode.
507 */
508
509 void thread_init(int nthreads, struct event_base *main_base);
510 int dispatch_event_add(int thread, conn *c);
511 void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, int read_buffer_size, enum network_transport transport);
512
513 /* Lock wrappers for cache functions that are called from main loop. */
514 enum delta_result_type add_delta(conn *c, const char *key,
515 const size_t nkey, const int incr,
516 const int64_t delta, char *buf,
517 uint64_t *cas);
518 void accept_new_conns(const bool do_accept);
519 conn *conn_from_freelist(void);
520 bool conn_add_to_freelist(conn *c);
521 int is_listen_thread(void);
522 item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes);
523 char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes);
524 void item_flush_expired(void);
525 item *item_get(const char *key, const size_t nkey);
526 item *item_touch(const char *key, const size_t nkey, uint32_t exptime);
527 int item_link(item *it);
528 void item_remove(item *it);
529 int item_replace(item *it, item *new_it, const uint32_t hv);
530 void item_stats(ADD_STAT add_stats, void *c);
531 void item_stats_sizes(ADD_STAT add_stats, void *c);
532 void item_unlink(item *it);
533 void item_update(item *it);
534
535 void item_lock(uint32_t hv);
536 void item_unlock(uint32_t hv);
537 unsigned short refcount_incr(unsigned short *refcount);
538 unsigned short refcount_decr(unsigned short *refcount);
539 void STATS_LOCK(void);
540 void STATS_UNLOCK(void);
541 void threadlocal_stats_reset(void);
542 void threadlocal_stats_aggregate(struct thread_stats *stats);
543 void slab_stats_aggregate(struct thread_stats *stats, struct slab_stats *out);
544
545 /* Stat processing functions */
546 void append_stat(const char *name, ADD_STAT add_stats, conn *c,
547 const char *fmt, ...);
548
549 enum store_item_type store_item(item *item, int comm, conn *c);
550
/* drop_privileges() is a real function when HAVE_DROP_PRIVILEGES is set;
 * otherwise the call expands to nothing. */
#if HAVE_DROP_PRIVILEGES
extern void drop_privileges(void);
#else
#define drop_privileges()
#endif
556
/* If supported, give compiler hints for branch prediction. Without GCC
 * (or with GCC 2 older than 2.96) __builtin_expect is replaced by a macro
 * that simply yields its first argument. */
#if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
#define __builtin_expect(x, expected_value) (x)
#endif

/* Hint that x is expected to be 1 / expected to be 0. */
#define likely(x)   __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)