1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
4 * The main memcached header holding commonly used data
5 * structures and function prototypes.
13 #include <sys/types.h>
14 #include <sys/socket.h>
16 #include <netinet/in.h>
22 #include "protocol_binary.h"
25 #include "sasl_defs.h"
/** Longest key accepted. */
#define KEY_MAX_LENGTH          250

/** Size of the buffer used for incr values. */
#define INCR_MAX_STORAGE_LEN    24

#define DATA_BUFFER_SIZE        2048
#define UDP_READ_BUFFER_SIZE    65536
#define UDP_MAX_PAYLOAD_SIZE    1400
#define UDP_HEADER_SIZE         8
#define MAX_SENDBUF_SIZE        (256 * 1024 * 1024)

/* A 64-bit number rendered as a decimal string needs at most 20 bytes;
 * the remainder leaves room for spaces, \r\n and \0. */
#define SUFFIX_SIZE             24
/** Starting capacity of the list of items returned by "get". */
#define ITEM_LIST_INITIAL       200

/** Starting capacity of the list of CAS suffixes appended to "gets" lines. */
#define SUFFIX_LIST_INITIAL     20

/** Starting capacity of the sendmsg() scatter/gather array. */
#define IOV_LIST_INITIAL        400

/** Number of sendmsg() argument structures allocated up front. */
#define MSG_LIST_INITIAL        10

/** High water marks for buffer shrinking. */
#define READ_BUFFER_HIGHWAT     8192
#define ITEM_LIST_HIGHWAT       400
#define IOV_LIST_HIGHWAT        600
#define MSG_LIST_HIGHWAT        100
/* Binary protocol: minimum packet length, and the same length expressed as
 * a count of uint32_t words. */
#define MIN_BIN_PKT_LENGTH      16
#define BIN_PKT_HDR_WORDS       (MIN_BIN_PKT_LENGTH / sizeof(uint32_t))

/* Initial power multiplier for the hash table. */
#define HASHPOWER_DEFAULT       16
67 /* unistd.h is here */
/* Slab sizing definitions. */
#define POWER_SMALLEST              1
#define POWER_LARGEST               200
#define CHUNK_ALIGN_BYTES           8
/* One more than POWER_LARGEST. */
#define MAX_NUMBER_OF_SLAB_CLASSES  (POWER_LARGEST + 1)

/** How long an object can reasonably be assumed to be locked before
    harvesting it on a low memory condition.
    NOTE(review): the 3 * 3600 form suggests three hours in seconds -- confirm. */
#define TAIL_REPAIR_TIME            (3 * 3600)
/* warning: don't use these macros with a function, as it evals its arg twice */

/* Yields the CAS value stored in the item's data area when ITEM_CAS is set
 * in it_flags, and (uint64_t)0 otherwise. */
#define ITEM_get_cas(i) \
    (!((i)->it_flags & ITEM_CAS) ? (uint64_t)0 : (i)->data->cas)
86 #define ITEM_set_cas(i,v) { \
87 if ((i)->it_flags & ITEM_CAS) { \
/* Address of the item's key: the start of the data area, advanced past the
 * 8-byte CAS slot when ITEM_CAS is set in it_flags.  Evaluates its argument
 * more than once. */
#define ITEM_key(item) \
    ((char *)&((item)->data) \
     + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
/* Address of the item's suffix: the data area advanced past the key
 * (nkey bytes plus one) and, when ITEM_CAS is set in it_flags, past the
 * 8-byte CAS slot.  Evaluates its argument more than once. */
#define ITEM_suffix(item) \
    ((char *)&((item)->data) + (item)->nkey + 1 \
     + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
/**
 * Address of an item's value bytes.
 *
 * The bytes after the item header are laid out as: an optional 8-byte CAS
 * (only when ITEM_CAS is set in it_flags), the key plus one terminator byte
 * (nkey + 1), the nsuffix-byte suffix, then the data.  Every part in front
 * of the data has to be skipped; leaving out nsuffix would make this macro
 * yield the same address as ITEM_suffix().
 *
 * Evaluates its argument more than once.
 */
#define ITEM_data(item) ((char*) &((item)->data) + (item)->nkey + 1 \
         + (item)->nsuffix \
         + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
/* Total byte count for an item: the fixed struct _stritem header, the key
 * (nkey + 1), the suffix (nsuffix), the data (nbytes), plus 8 bytes of CAS
 * when ITEM_CAS is set in it_flags.  Evaluates its argument more than once. */
#define ITEM_ntotal(item) \
    (sizeof(struct _stritem) \
     + (item)->nkey + 1 \
     + (item)->nsuffix \
     + (item)->nbytes \
     + (((item)->it_flags & ITEM_CAS) ? sizeof(uint64_t) : 0))
#define STAT_KEY_LEN 128
#define STAT_VAL_LEN 128

/**
 * Append a simple stat with a stat name, value format and value.
 *
 * Expands to a single append_stat() call and relies on `add_stats` and `c`
 * being in scope at the call site.  The macro supplies no trailing
 * semicolon; the caller writes it.  That keeps the macro usable as the body
 * of an unbraced if/else, where a built-in semicolon would orphan the else.
 */
#define APPEND_STAT(name, fmt, val) \
    append_stat(name, add_stats, c, fmt, val)
/** Append an indexed stat with a stat name (with format), value format and value. */
/**
 * Format an indexed stat key and its value, then pass both to add_stats().
 *
 * Relies on key_str, klen, val_str, vlen, add_stats and c being in scope at
 * the call site.  The three statements are wrapped in do { } while (0) so
 * they behave as one statement (for example under an unbraced if); the
 * caller supplies the terminating semicolon.
 *
 * NOTE(review): snprintf() returns the length the complete output would
 * have had, so klen/vlen can exceed STAT_KEY_LEN/STAT_VAL_LEN when the text
 * is truncated.
 */
#define APPEND_NUM_FMT_STAT(name_fmt, num, name, fmt, val)              \
    do {                                                                \
        klen = snprintf(key_str, STAT_KEY_LEN, name_fmt, num, name);    \
        vlen = snprintf(val_str, STAT_VAL_LEN, fmt, val);               \
        add_stats(key_str, klen, val_str, vlen, c);                     \
    } while (0)
/** Shorthand for APPEND_NUM_FMT_STAT using the common "%d:%s" key format. */
#define APPEND_NUM_STAT(num, name, fmt, val)                \
    APPEND_NUM_FMT_STAT("%d:%s", num, name, fmt, val)
125 * Callback for any function producing stats.
127 * @param key the stat's key
128 * @param klen length of the key
129 * @param val the stat's value in an ascii form (e.g. text form of a number)
130 * @param vlen length of the value
 * @param cookie magic callback cookie
133 typedef void (*ADD_STAT
)(const char *key
, const uint16_t klen
,
134 const char *val
, const uint32_t vlen
,
138 * NOTE: If you modify this table you _MUST_ update the function state_text
141 * Possible states of a connection.
144 conn_listening
, /**< the socket which listens for connections */
145 conn_new_cmd
, /**< Prepare connection for next command */
146 conn_waiting
, /**< waiting for a readable socket */
147 conn_read
, /**< reading in a command line */
148 conn_parse_cmd
, /**< try to parse a command from the input buffer */
149 conn_write
, /**< writing out a simple response */
150 conn_nread
, /**< reading in a fixed number of bytes */
151 conn_swallow
, /**< swallowing unnecessary bytes w/o storing */
152 conn_closing
, /**< closing this connection */
153 conn_mwrite
, /**< writing out many items sequentially */
154 conn_max_state
/**< Max state value (used for assertion) */
159 bin_reading_set_header
,
160 bin_reading_cas_header
,
164 bin_reading_del_header
,
165 bin_reading_incr_header
,
166 bin_read_flush_exptime
,
167 bin_reading_sasl_auth
,
168 bin_reading_sasl_auth_data
,
169 bin_reading_touch_key
,
173 ascii_prot
= 3, /* arbitrary value. */
175 negotiating_prot
/* Discovering the protocol */
178 enum network_transport
{
179 local_transport
, /* Unix sockets*/
/* Non-zero when transport x equals udp_transport.  The argument is
 * parenthesized so that a compound argument (for example a ?: expression)
 * is compared as a whole instead of being split by operator precedence. */
#define IS_UDP(x) ((x) == udp_transport)
188 #define NREAD_REPLACE 3
189 #define NREAD_APPEND 4
190 #define NREAD_PREPEND 5
193 enum store_item_type
{
194 NOT_STORED
=0, STORED
, EXISTS
, NOT_FOUND
197 enum delta_result_type
{
198 OK
, NON_NUMERIC
, EOM
, DELTA_ITEM_NOT_FOUND
, DELTA_ITEM_CAS_MISMATCH
/** Time measured relative to server start.  Being an unsigned int, it is
 *  smaller than time_t on 64-bit systems. */
typedef unsigned int rel_time_t;
204 /** Stats stored per slab (and per thread). */
209 uint64_t delete_hits
;
217 * Stats stored per-thread.
219 struct thread_stats
{
220 pthread_mutex_t mutex
;
224 uint64_t touch_misses
;
225 uint64_t delete_misses
;
226 uint64_t incr_misses
;
227 uint64_t decr_misses
;
230 uint64_t bytes_written
;
232 uint64_t conn_yields
; /* # of yields for connections (-R option)*/
234 uint64_t auth_errors
;
235 struct slab_stats slab_stats
[MAX_NUMBER_OF_SLAB_CLASSES
];
242 pthread_mutex_t mutex
;
243 unsigned int curr_items
;
244 unsigned int total_items
;
246 unsigned int curr_conns
;
247 unsigned int total_conns
;
248 uint64_t rejected_conns
;
249 unsigned int reserved_fds
;
250 unsigned int conn_structs
;
257 uint64_t touch_misses
;
260 time_t started
; /* when the process was started */
261 bool accepting_conns
; /* whether we are currently accepting */
262 uint64_t listen_disabled_num
;
263 unsigned int hash_power_level
; /* Better hope it's not over 9000 */
264 uint64_t hash_bytes
; /* size used for hash tables */
265 bool hash_is_expanding
; /* If the hash table is being expanded */
266 uint64_t expired_unfetched
; /* items reclaimed but never touched */
267 uint64_t evicted_unfetched
; /* items evicted but never touched */
268 bool slab_reassign_running
; /* slab reassign in progress */
269 uint64_t slabs_moved
; /* times slabs were moved around */
272 #define MAX_VERBOSITY_LEVEL 2
274 /* When adding a setting, be sure to update process_stat_settings */
276 * Globally accessible settings as derived from the commandline.
285 rel_time_t oldest_live
; /* ignore existing items older than this */
287 char *socketpath
; /* path to unix socket if using local socket */
288 int access
; /* access mask (a la chmod) for unix domain socket */
289 double factor
; /* chunk size growth factor */
291 int num_threads
; /* number of worker (without dispatcher) libevent threads to run */
292 int num_threads_per_udp
; /* number of worker threads serving each udp socket */
293 char prefix_delimiter
; /* character that marks a key prefix (for stats) */
294 int detail_enabled
; /* nonzero if we're collecting detailed stats */
295 int reqs_per_event
; /* Maximum number of io to process on each
298 enum protocol binding_protocol
;
300 int item_size_max
; /* Maximum item size, and upper end for slabs */
301 bool sasl
; /* SASL on/off */
302 bool maxconns_fast
; /* Whether or not to early close connections */
303 bool slab_reassign
; /* Whether or not slab reassignment is allowed */
304 int slab_automove
; /* Whether or not to automatically move slabs */
305 int hashpower_init
; /* Starting hash power level */
/* Globals declared here and defined in another translation unit. */
extern struct stats stats;
extern time_t process_started;
extern struct settings settings;
/* Bit values for an item's it_flags field. */
#define ITEM_LINKED     1
#define ITEM_SLABBED    4
#define ITEM_FETCHED    8
321 * Structure for storing items within memcached.
323 typedef struct _stritem
{
324 struct _stritem
*next
;
325 struct _stritem
*prev
;
326 struct _stritem
*h_next
; /* hash chain next */
327 rel_time_t time
; /* least recent access */
328 rel_time_t exptime
; /* expire time */
329 int nbytes
; /* size of data */
330 unsigned short refcount
;
331 uint8_t nsuffix
; /* length of flags-and-length string */
332 uint8_t it_flags
; /* ITEM_* above */
333 uint8_t slabs_clsid
;/* which slab class we're in */
334 uint8_t nkey
; /* key length, w/terminating null and padding */
335 /* this odd type prevents type-punning issues when we do
336 * the little shuffle to save space when not using CAS. */
341 /* if it_flags & ITEM_CAS we have 8 bytes CAS */
342 /* then null-terminated key */
343 /* then " flags length\r\n" (no terminating null) */
344 /* then data with terminating \r\n (no terminating null; it's binary!) */
348 pthread_t thread_id
; /* unique ID of this thread */
349 struct event_base
*base
; /* libevent handle this thread uses */
350 struct event notify_event
; /* listen event for notify pipe */
351 int notify_receive_fd
; /* receiving end of notify pipe */
352 int notify_send_fd
; /* sending end of notify pipe */
353 struct thread_stats stats
; /* Stats generated by this thread */
354 struct conn_queue
*new_conn_queue
; /* queue of new connections to handle */
355 cache_t
*suffix_cache
; /* suffix cache */
359 pthread_t thread_id
; /* unique ID of this thread */
360 struct event_base
*base
; /* libevent handle this thread uses */
361 } LIBEVENT_DISPATCHER_THREAD
;
364 * The structure representing a connection into memcached.
/* Forward typedef so that `conn` can be used before struct conn is defined. */
typedef struct conn conn;
369 sasl_conn_t
*sasl_conn
;
370 enum conn_states state
;
371 enum bin_substates substate
;
374 short which
; /** which events were just triggered */
376 char *rbuf
; /** buffer to read commands into */
377 char *rcurr
; /** but if we parsed some already, this is where we stopped */
378 int rsize
; /** total allocated size of rbuf */
379 int rbytes
; /** how much data, starting from rcur, do we have unparsed */
385 /** which state to go into after finishing current write */
386 enum conn_states write_and_go
;
387 void *write_and_free
; /** free this memory after finishing writing */
389 char *ritem
; /** when we read in an item's value, it goes here */
392 /* data for the nread state */
395 * item is used to hold an item structure created after reading the command
396 * line of set/add/replace commands, but before we finished reading the actual
397 * data. The data is read into ITEM_data(item) to avoid extra copying.
400 void *item
; /* for commands set/add/replace */
402 /* data for the swallow state */
403 int sbytes
; /* how many bytes to swallow */
405 /* data for the mwrite state */
407 int iovsize
; /* number of elements allocated in iov[] */
408 int iovused
; /* number of elements used in iov[] */
410 struct msghdr
*msglist
;
411 int msgsize
; /* number of elements allocated in msglist[] */
412 int msgused
; /* number of elements used in msglist[] */
413 int msgcurr
; /* element in msglist[] being transmitted now */
414 int msgbytes
; /* number of bytes in current msg */
416 item
**ilist
; /* list of items to write out */
426 enum protocol protocol
; /* which protocol this connection speaks */
427 enum network_transport transport
; /* what transport is used by this connection */
429 /* data for UDP clients */
430 int request_id
; /* Incoming UDP request ID, if this is a UDP "connection" */
431 struct sockaddr request_addr
; /* Who sent the most recent request */
432 socklen_t request_addr_size
;
433 unsigned char *hdrbuf
; /* udp packet headers */
434 int hdrsize
; /* number of headers' worth of space is allocated */
436 bool noreply
; /* True if the reply should not be sent. */
437 /* current stats command */
444 /* Binary protocol stuff */
445 /* This is where the binary header goes */
446 protocol_binary_request_header binary_header
;
447 uint64_t cas
; /* the cas to return */
448 short cmd
; /* current command being processed */
451 conn
*next
; /* Used for generating a list of conn structures */
452 LIBEVENT_THREAD
*thread
; /* Pointer to the thread object serving this connection */
456 /* current time of day (updated periodically) */
457 extern volatile rel_time_t current_time
;
459 /* TODO: Move to slabs.h? */
460 extern volatile int slab_rebalance_signal
;
462 struct slab_rebalance
{
472 extern struct slab_rebalance slab_rebal
;
477 void do_accept_new_conns(const bool do_accept
);
478 enum delta_result_type
do_add_delta(conn
*c
, const char *key
,
479 const size_t nkey
, const bool incr
,
480 const int64_t delta
, char *buf
,
481 uint64_t *cas
, const uint32_t hv
);
482 enum store_item_type
do_store_item(item
*item
, int comm
, conn
* c
, const uint32_t hv
);
483 conn
*conn_new(const int sfd
, const enum conn_states init_state
, const int event_flags
, const int read_buffer_size
, enum network_transport transport
, struct event_base
*base
);
484 extern int daemonize(int nochdir
, int noclose
);
486 static inline int mutex_lock(pthread_mutex_t
*mutex
)
488 while (pthread_mutex_trylock(mutex
));
/* Thin alias: mutex_unlock(x) expands directly to pthread_mutex_unlock(x). */
#define mutex_unlock(x)     pthread_mutex_unlock(x)
503 * Functions such as the libevent-related calls that need to do cross-thread
504 * communication in multithreaded mode (rather than actually doing the work
505 * in the current thread) are called via "dispatch_" frontends, which are
506 * also #define-d to directly call the underlying code in singlethreaded mode.
509 void thread_init(int nthreads
, struct event_base
*main_base
);
510 int dispatch_event_add(int thread
, conn
*c
);
511 void dispatch_conn_new(int sfd
, enum conn_states init_state
, int event_flags
, int read_buffer_size
, enum network_transport transport
);
513 /* Lock wrappers for cache functions that are called from main loop. */
514 enum delta_result_type
add_delta(conn
*c
, const char *key
,
515 const size_t nkey
, const int incr
,
516 const int64_t delta
, char *buf
,
518 void accept_new_conns(const bool do_accept
);
519 conn
*conn_from_freelist(void);
520 bool conn_add_to_freelist(conn
*c
);
521 int is_listen_thread(void);
522 item
*item_alloc(char *key
, size_t nkey
, int flags
, rel_time_t exptime
, int nbytes
);
523 char *item_cachedump(const unsigned int slabs_clsid
, const unsigned int limit
, unsigned int *bytes
);
524 void item_flush_expired(void);
525 item
*item_get(const char *key
, const size_t nkey
);
526 item
*item_touch(const char *key
, const size_t nkey
, uint32_t exptime
);
527 int item_link(item
*it
);
528 void item_remove(item
*it
);
529 int item_replace(item
*it
, item
*new_it
, const uint32_t hv
);
530 void item_stats(ADD_STAT add_stats
, void *c
);
531 void item_stats_sizes(ADD_STAT add_stats
, void *c
);
532 void item_unlink(item
*it
);
533 void item_update(item
*it
);
535 void item_lock(uint32_t hv
);
536 void item_unlock(uint32_t hv
);
537 unsigned short refcount_incr(unsigned short *refcount
);
538 unsigned short refcount_decr(unsigned short *refcount
);
539 void STATS_LOCK(void);
540 void STATS_UNLOCK(void);
541 void threadlocal_stats_reset(void);
542 void threadlocal_stats_aggregate(struct thread_stats
*stats
);
543 void slab_stats_aggregate(struct thread_stats
*stats
, struct slab_stats
*out
);
545 /* Stat processing functions */
546 void append_stat(const char *name
, ADD_STAT add_stats
, conn
*c
,
547 const char *fmt
, ...);
549 enum store_item_type
store_item(item
*item
, int comm
, conn
*c
);
551 #if HAVE_DROP_PRIVILEGES
552 extern void drop_privileges(void);
554 #define drop_privileges()
557 /* If supported, give compiler hints for branch prediction. */
558 #if !defined(__GNUC__) || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
559 #define __builtin_expect(x, expected_value) (x)
/* Branch-prediction hints: both forward the value of x to __builtin_expect,
 * with an expected value of 1 (likely) or 0 (unlikely). */
#define likely(x)       __builtin_expect((x), 1)
#define unlikely(x)     __builtin_expect((x), 0)