Redis源码研究--字典

guzan · 发表于 2015-7-20 10:20:18

　　计划每天花1小时学习Redis 源码。在博客上做个记录。
　　--------6月18日-----------
　　redis的字典dict主要涉及几个数据结构，
　　dictEntry：具体的k-v链表结点
　　dictht：哈希表
　　dict：字典
　　具体关系为

1 typedef struct dict {
2    dictType *type;
3    void *privdata;
4    dictht ht[2];
5    int rehashidx; /* rehashing not in progress if rehashidx == -1 */
6    int iterators; /* number of iterators currently running */
7 } dict;

1 typedef struct dictht {
2    dictEntry **table;
3    unsigned long size;
4    unsigned long sizemask;
5    unsigned long used;
6 } dictht;

1 typedef struct dictEntry {
2    void *key;
3    union {
4       void *val;
5       uint64_t u64;
6       int64_t s64;
7    } v;
8    struct dictEntry *next;
9 } dictEntry;
　　一个字典有两个哈希表，冲突后采用了链地址法，很好理解。
　　一些简单操作采用了宏

#define dictGetKey(he) ((he)->key)
#define dictGetVal(he) ((he)->v.val)
#define dictGetSignedIntegerVal(he) ((he)->v.s64)
#define dictGetUnsignedIntegerVal(he) ((he)->v.u64)
　　------------6月19日----------------------
　　字典具体用到了两种哈希算法，我只看了简单的那一种，没想到代码竟然可以那么少，算法名字为djb2，

1 /* And a case insensitive hash function (based on djb hash) */
2 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len) {
3    unsigned int hash = (unsigned int)dict_hash_function_seed;
4
5    while (len--)
6       hash = ((hash ht[0]);
9    _dictReset(&d->ht[1]);
10
11    d->type = type;
12    d->privdata = privDataPtr;
13    d->rehashidx = -1;
14    d->iterators = 0;
15
16    return DICT_OK;
17 }
18
19 static void _dictReset(dictht *ht){
20    ht->table = NULL;
21    ht->size = 0;
22    ht->sizemask = 0;
23    ht->used = 0;
24 }
学了这么多年c语言了，malloc(sizeof(*d))我还是第一次看到。
说到sizeof，我还要提一句，c99之后，sizeof是运行时确定的，c99还加入了动态数组这一概念。csdn上的回答是错的。
对字典进行紧缩处理，让哈希表中的数/哈希表长度接近1：

1 int dictResize(dict *d){
2    int minimal;
3
4    if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;
5
6    minimal = d->ht[0].used;
7
8    if (minimal < DICT_HT_INITIAL_SIZE)
9       minimal = DICT_HT_INITIAL_SIZE;
10
11    return dictExpand(d, minimal);
12 }
13
14 #define dictIsRehashing(ht) ((ht)->rehashidx != -1)
15 #define DICT_HT_INITIAL_SIZE    4
当字典正在Rehash的时候不能进行Resize操作，初始时哈希表大小为4，哈希表大小一般都是2的幂次方。
如果minimal是5，经过dictExpand后，哈希表大小变为8.

1 static unsigned long _dictNextPower(unsigned long size){
2    unsigned long i = DICT_HT_INITIAL_SIZE;
3
4    if (size >= LONG_MAX) return LONG_MAX;
5    while(1) {
6       if (i >= size)
7          return i;
8       i *= 2;
9    }
10 }
11
12 int dictExpand(dict *d, unsigned long size){
13    dictht n; /* the new hash table */
14
15    unsigned long realsize = _dictNextPower(size);
16
17    /* the size is invalid if it is smaller than the number of
18    * elements already inside the hash table */
19    if (dictIsRehashing(d) || d->ht[0].used > size)
20       return DICT_ERR;
21
22    /* Allocate the new hash table and initialize all pointers to NULL */
23    n.size = realsize;
24    n.sizemask = realsize-1;
25    n.table = zcalloc(realsize*sizeof(dictEntry*));
26    n.used = 0;
27
28    /* Is this the first initialization? If so it's not really a rehashing
29    * we just set the first hash table so that it can accept keys. */
30    if (d->ht[0].table == NULL) {
31       d->ht[0] = n;
32       return DICT_OK;
33    }
34
35    /* Prepare a second hash table for incremental rehashing */
36    d->ht[1] = n;
37    d->rehashidx = 0;
38
39    return DICT_OK;
40 }
新建了一个哈希表n，size是扩展后的size，ht[0].table 为空说明这是第一次初始化，不是扩展，直接赋值。
ht[0].table 不为空，说明这是一次扩展，把n赋给ht[1]，ReHash标志rehashix也被设为0.
上边这段不大好理解，先看后面的，一会返过来再研究dictExpand函数。
--------------------6月20日--------------------------
向字典中添加元素需要调用dictAdd函数：

1 /* Add an element to the target hash table */
2 int dictAdd(dict *d, void *key, void *val){
3    dictEntry *entry = dictAddRaw(d,key);
4
5    if (!entry) return DICT_ERR;
6    dictSetVal(d, entry, val);
7    return DICT_OK;
8 }
具体实现需要看dictAddRaw函数：

1 dictEntry *dictAddRaw(dict *d, void *key){
2    int index;
3    dictEntry *entry;
4    dictht *ht;
5
6    if (dictIsRehashing(d)) _dictRehashStep(d);
7
8    /* Get the index of the new element, or -1 if
9    * the element already exists. */
10    if ((index = _dictKeyIndex(d, key)) == -1)
11       return NULL;
12
13    /* Allocate the memory and store the new entry */
14    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
15    entry = zmalloc(sizeof(*entry));
16    entry->next = ht->table[index];
17    ht->table[index] = entry;
18    ht->used++;
19
20    /* Set the hash entry fields. */
21    dictSetKey(d, entry, key);
22    return entry;
23 }
先判断是不是在进行Rehash，如果在Rehash，执行渐进式Rehash。
找到要插入的key的位置，如果相同的key已经存在了，返回NULL
如果在进行Rehash，ht指向ht[1]表，然后利用链表头插法（这个我熟）将entry插入，更新used。
添加key前需要查找key的位置：

1 /* Returns the index of a free slot that can be populated with
2  * an hash entry for the given 'key'.
3  * If the key already exists, -1 is returned.
4  *
5  * Note that if we are in the process of rehashing the hash table, the
6  * index is always returned in the context of the second (new) hash table. */
7 static int _dictKeyIndex(dict *d, const void *key){
8    unsigned int h, idx, table;
9    dictEntry *he;
10
11    /* Expand the hash table if needed */
12    if (_dictExpandIfNeeded(d) == DICT_ERR)
13       return -1;
14    /* Compute the key hash value */
15    h = dictHashKey(d, key);
16    for (table = 0; table ht[table].sizemask;
18       /* Search if this slot does not already contain the given key */
19       he = d->ht[table].table[idx];
20       while(he) {
21          if (dictCompareKeys(d, key, he->key))
22                return -1;
23          he = he->next;
24       }
25       if (!dictIsRehashing(d)) break;
26    }
27    return idx;
28 }
　　
　　插入之前，程序会检查一下哈希表空间是否够，需不需要expand。通过某种哈希算法计算key对应的哈希值h，sizemask二进制格式大体是这样的011111111，哈希值跟它一与，相当于只保留了后面几位。算出来的idx就是要插入的索引号。然后需要比较在这个索引上的链表中有没有跟要插入的key一样的，如果重复了，返回-1.
　　最后判断下当前如果没有在进行Rehash，ht[2]表就不用管了。
　　-----------------------6月21日---------------------

1 /* Expand the hash table if needed */
2 static int _dictExpandIfNeeded(dict *d){
3    /* Incremental rehashing already in progress. Return. */
4    if (dictIsRehashing(d)) return DICT_OK;
5
6    /* If the hash table is empty expand it to the initial size. */
7    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);
8
9    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
10    * table (global setting) or we should avoid it but the ratio between
11    * elements/buckets is over the "safe" threshold, we resize doubling
12    * the number of buckets. */
13    if (d->ht[0].used >= d->ht[0].size &&
14       (dict_can_resize ||
15       d->ht[0].used/d->ht[0].size > dict_force_resize_ratio))
16    {
17       return dictExpand(d, d->ht[0].used*2);
18    }
19    return DICT_OK;
20 }
　　
　　函数名前面带下划线的都表示这是private的。程序第4行又是先判断是否正在进行Rehash，

　　为什么要说又呢

　　如果哈希表是空的，那么我们扩展到DICT_HT_INITIAL_SIZE（4）个。
　　第13行有点不理解，used什么时候会大于size啊？？？？标记一下，以后再看。
　　dict_can_resize是个全局变量。dict_force_resize_ratio = 5.

/* Using dictEnableResize() / dictDisableResize() we make possible to
* enable/disable resizing of the hash table as needed. This is very important
* for Redis, as we use copy-on-write and don't want to move too much memory
* around when there is a child performing saving operations.
*
* Note that even when dict_can_resize is set to 0, not all resizes are
* prevented: an hash table is still allowed to grow if the ratio between
* the number of elements and the buckets > dict_force_resize_ratio. */
　　

1 void dictEnableResize(void) {
2 dict_can_resize = 1;
3 }
4
5 void dictDisableResize(void) {
6 dict_can_resize = 0;
7 }

　　字典的 rehash 操作实际上就是执行以下任务：

创建一个比 ht[0]->table 更大的 ht[1]->table ；

将 ht[0]->table 中的所有键值对迁移到 ht[1]->table ；

将原有 ht[0] 的数据清空，并将 ht[1] 替换为新的 ht[0] ；

　　经过以上步骤之后，程序就在不改变原有键值对数据的基础上，增大了哈希表的大小。

　　--------------6月22日---------------------------
　　先上Rehash的代码

1 int dictRehash(dict *d, int n) {
2    if (!dictIsRehashing(d)) return 0;
3
4    while(n--) {
5       dictEntry *de, *nextde;
6
7       /* Check if we already rehashed the whole table... */
8       if (d->ht[0].used == 0) {
9          zfree(d->ht[0].table);
10          d->ht[0] = d->ht[1];
11          _dictReset(&d->ht[1]);
12          d->rehashidx = -1;
13          return 0;
14       }
15
16       /* Note that rehashidx can't overflow as we are sure there are more
17       * elements because ht[0].used != 0 */
18       assert(d->ht[0].size > (unsigned)d->rehashidx);
19       while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
20       de = d->ht[0].table[d->rehashidx];
21       /* Move all the keys in this bucket from the old to the new hash HT */
22       while(de) {
23          unsigned int h;
24
25          nextde = de->next;
26          /* Get the index in the new hash table */
27          h = dictHashKey(d, de->key) & d->ht[1].sizemask;
28          de->next = d->ht[1].table[h];
29          d->ht[1].table[h] = de;
30          d->ht[0].used--;
31          d->ht[1].used++;
32          de = nextde;
33       }
34       d->ht[0].table[d->rehashidx] = NULL;
35       d->rehashidx++;
36    }
37    return 1;
38 }
　　
　　n步Rehash，在ht[0]中找到第一个不为空的table[rehashidx]，将这个位置的链表（可能只有一个元素）全部移到ht[1]中，并更新ht[0].used、ht[1].used。
　　执行过程中，ht[0]中的元素如果都已经转到了ht[1]中，即ht[0].used == 0，停止执行，释放ht[0].table指向的空间，ht[1]变为ht[0]，将rehashidx置为-1。
　　字典还剩一小部分，大体意思我弄懂了，加上之前看的动态字符串sds、双向链表adlist，加上空格注释统计了下共2248行。

1  341 adlist.c
2 93 adlist.h
3  810 dict.c
4  173 dict.h
5  732 sds.c
6 99 sds.h
7 2248 total
　　主要参考了《Redis 设计与实现》。谢谢90后作者了。

账号		自动登录	找回密码
密码			立即注册

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

Red Hat RHCE 8 (EX294) Cert Guide

亿图图示专家(EDraw Max) V7.9 中文破解版

zabbix3.4.1安装部署+微信推送信息+大屏显

[经验分享] Redis源码研究--字典

浏览过的版块

扫码加入运维网微信交流群