|
intset 实现了一个数字元素的集合。
使用数组和元素的有序存放实现存取,查找过程使用二分查找法,所有插入删除的的效率为O(log2N)。 与其他数据结构类似,作者使用变编码方式实现对内存的高效利用。 初始化的intset中的数字定义为int16_t,即每个元素占用2个字节,而随着数据的插入,逐渐调整编码方式到int32_t或int64_t
上代码
intset.h
1 #ifndef __INTSET_H
2 #define __INTSET_H
3 #include
4
5 typedef struct intset {
6 uint32_t encoding;
7 uint32_t length;
8 int8_t contents[];
9 } intset;
10
11 intset *intsetNew(void);
12 intset *intsetAdd(intset *is, int64_t value, uint8_t *success);
13 intset *intsetRemove(intset *is, int64_t value, int *success);
14 uint8_t intsetFind(intset *is, int64_t value);
15 int64_t intsetRandom(intset *is);
16 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value);
17 uint32_t intsetLen(intset *is);
18
19 #endif // __INTSET_H
intset.c
1 #include
2 #include
3 #include
4 #include "intset.h"
5 #include "zmalloc.h"
6
7 /* Note that these encodings are ordered, so:
8 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
9 #define INTSET_ENC_INT16 (sizeof(int16_t))
10 #define INTSET_ENC_INT32 (sizeof(int32_t))
11 #define INTSET_ENC_INT64 (sizeof(int64_t))
12
13 /* Return the required encoding for the provided value. */
14 //根据v的值判断使用哪儿种int以节省内存
15 static uint8_t _intsetValueEncoding(int64_t v) {
16 if (v < INT32_MIN || v > INT32_MAX)
17 return INTSET_ENC_INT64;
18 else if (v < INT16_MIN || v > INT16_MAX)
19 return INTSET_ENC_INT32;
20 return INTSET_ENC_INT16;
21 }
22
23 //得到enc编码方式下的第pos个位置的值
24 /* Return the value at pos, given an encoding. */
25 static int64_t _intsetGetEncoded(intset *is, int pos, uint8_t enc) {
26 if (enc == INTSET_ENC_INT64)
27 return ((int64_t*)is->contents)[pos];
28 else if (enc == INTSET_ENC_INT32)
29 return ((int32_t*)is->contents)[pos];
30 return ((int16_t*)is->contents)[pos];
31 }
32
33 //得到is中第pos个位置的value
34 /* Return the value at pos, using the configured encoding. */
35 static int64_t _intsetGet(intset *is, int pos) {
36 return _intsetGetEncoded(is,pos,is->encoding);
37 }
38
39 //在第n个pos个位置存放value
40 /* Set the value at pos, using the configured encoding. */
41 static void _intsetSet(intset *is, int pos, int64_t value) {
42 if (is->encoding == INTSET_ENC_INT64)
43 ((int64_t*)is->contents)[pos] = value;
44 else if (is->encoding == INTSET_ENC_INT32)
45 ((int32_t*)is->contents)[pos] = value;
46 else
47 ((int16_t*)is->contents)[pos] = value;
48 }
49
50 //得到一个新的intset,编码采用INTSET_ENC_INT16
51 /* Create an empty intset. */
52 intset *intsetNew(void) {
53 intset *is = zmalloc(sizeof(intset));
54 is->encoding = INTSET_ENC_INT16;
55 is->length = 0;
56 return is;
57 }
58
59 /* Resize the intset */
60 //重新为intset分配内存
61 static intset *intsetResize(intset *is, uint32_t len) {
62 uint32_t size = len*is->encoding;
63 is = zrealloc(is,sizeof(intset)+size);
64 return is;
65 }
66
67
68 //在排好序的iniset中查找value
69 //如果找到,返回1,*pos为其所在位置
70 //如果找不到,返回0,*pos为其能插入的位置
71 //二分查找法
72 /* Search for the position of "value". Return 1 when the value was found and
73 * sets "pos" to the position of the value within the intset. Return 0 when
74 * the value is not present in the intset and sets "pos" to the position
75 * where "value" can be inserted. */
76 static uint8_t intsetSearch(intset *is, int64_t value, uint32_t *pos) {
77 int min = 0, max = is->length-1, mid = -1;
78 int64_t cur = -1;
79
80 /* The value can never be found when the set is empty */
81 if (is->length == 0) {
82 if (pos) *pos = 0;
83 return 0;
84 } else {
85 /* Check for the case where we know we cannot find the value,
86 * but do know the insert position. */
87 //如果插入值大于最后一个元素,或者小于第一个元素,则可以认定无法找到该元素
88 if (value > _intsetGet(is,is->length-1)) {
89 if (pos) *pos = is->length;
90 return 0;
91 } else if (value < _intsetGet(is,0)) {
92 if (pos) *pos = 0;
93 return 0;
94 }
95 }
96
97 while(max >= min) {
98 mid = (min+max)/2;
99 cur = _intsetGet(is,mid);
100 if (value > cur) {
101 min = mid+1;
102 } else if (value < cur) {
103 max = mid-1;
104 } else {
105 break;
106 }
107 }
108
109 if (value == cur) {
110 if (pos) *pos = mid;
111 return 1;
112 } else {
113 if (pos) *pos = min;
114 return 0;
115 }
116 }
117
118 //升级intset的编码,并且出入一个新数字,因为新数字的绝对值一定大于
119 //当前intset中所有的数字的绝对值,如果是负数,则最小,放在最前边,否则放在最后边
120 /* Upgrades the intset to a larger encoding and inserts the given integer. */
121 static intset *intsetUpgradeAndAdd(intset *is, int64_t value) {
122 uint8_t curenc = is->encoding;
123 uint8_t newenc = _intsetValueEncoding(value);
124 int length = is->length;
125 int prepend = value < 0 ? 1 : 0;
126
127 /* First set new encoding and resize */
128 is->encoding = newenc;
129 is = intsetResize(is,is->length+1);
130
131 /* Upgrade back-to-front so we don't overwrite values.
132 * Note that the "prepend" variable is used to make sure we have an empty
133 * space at either the beginning or the end of the intset. */
134 while(length--)
135 _intsetSet(is,length+prepend,_intsetGetEncoded(is,length,curenc));
136
137 /* Set the value at the beginning or the end. */
138 if (prepend)
139 _intsetSet(is,0,value);
140 else
141 _intsetSet(is,is->length,value);
142 is->length++;
143 return is;
144 }
145
146 //把from开始直到最后intset最后的内容move到to开始的地方,在这之前,应该要resize一下
147 static void intsetMoveTail(intset *is, uint32_t from, uint32_t to) {
148 void *src, *dst;
149 uint32_t bytes = is->length-from;
150 if (is->encoding == INTSET_ENC_INT64) {
151 src = (int64_t*)is->contents+from;
152 dst = (int64_t*)is->contents+to;
153 bytes *= sizeof(int64_t);
154 } else if (is->encoding == INTSET_ENC_INT32) {
155 src = (int32_t*)is->contents+from;
156 dst = (int32_t*)is->contents+to;
157 bytes *= sizeof(int32_t);
158 } else {
159 src = (int16_t*)is->contents+from;
160 dst = (int16_t*)is->contents+to;
161 bytes *= sizeof(int16_t);
162 }
163 memmove(dst,src,bytes);
164 }
165
166 //在intset中插入一个元素,如果返回0,表示已经存在,否则返回1
167 /* Insert an integer in the intset */
168 intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
169 uint8_t valenc = _intsetValueEncoding(value);
170 uint32_t pos;
171 if (success) *success = 1;
172
173 /* Upgrade encoding if necessary. If we need to upgrade, we know that
174 * this value should be either appended (if > 0) or prepended (if < 0),
175 * because it lies outside the range of existing values. */
176 if (valenc > is->encoding) {
177 /* This always succeeds, so we don't need to curry *success. */
178 return intsetUpgradeAndAdd(is,value);
179 } else {
180 /* Abort if the value is already present in the set.
181 * This call will populate "pos" with the right position to insert
182 * the value when it cannot be found. */
183 if (intsetSearch(is,value,&pos)) {
184 if (success) *success = 0;
185 return is;
186 }
187
188 is = intsetResize(is,is->length+1);
189 if (pos < is->length) intsetMoveTail(is,pos,pos+1);
190 }
191
192 _intsetSet(is,pos,value);
193 is->length++;
194 return is;
195 }
196
197 //在intset中删除一个元素value
198 /* Delete integer from intset */
199 intset *intsetRemove(intset *is, int64_t value, int *success) {
200 uint8_t valenc = _intsetValueEncoding(value);
201 uint32_t pos;
202 if (success) *success = 0;
203
204 if (valenc encoding && intsetSearch(is,value,&pos)) {
205 /* We know we can delete */
206 if (success) *success = 1;
207
208 /* Overwrite value with tail and update length */
209 if (pos < (is->length-1)) intsetMoveTail(is,pos+1,pos);
210 is = intsetResize(is,is->length-1);
211 is->length--;
212 }
213 return is;
214 }
215
216 //判断value是否在is中
217 /* Determine whether a value belongs to this set */
218 uint8_t intsetFind(intset *is, int64_t value) {
219 uint8_t valenc = _intsetValueEncoding(value);
220 return valenc encoding && intsetSearch(is,value,NULL);
221 }
222
223 //随机取一个intset中的元素
224 /* Return random member */
225 int64_t intsetRandom(intset *is) {
226 return _intsetGet(is,rand()%is->length);
227 }
228
229 //把pos位置上的元素取出,如果超出位置,返回0,找到了则返回1,*value为其值
230 /* Sets the value to the value at the given position. When this position is
231 * out of range the function returns 0, when in range it returns 1. */
232 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) {
233 if (pos < is->length) {
234 *value = _intsetGet(is,pos);
235 return 1;
236 }
237 return 0;
238 }
239
240 //得到intset的元素数量
241 /* Return intset length */
242 uint32_t intsetLen(intset *is) {
243 return is->length;
244 }
245
246 #ifdef INTSET_TEST_MAIN
247 #include
248
249 void intsetRepr(intset *is) {
250 int i;
251 for (i = 0; i < is->length; i++) {
252 printf("%lld\n", (uint64_t)_intsetGet(is,i));
253 }
254 printf("\n");
255 }
256
257 void error(char *err) {
258 printf("%s\n", err);
259 exit(1);
260 }
261
262 void ok(void) {
263 printf("OK\n");
264 }
265
266 long long usec(void) {
267 struct timeval tv;
268 gettimeofday(&tv,NULL);
269 return (((long long)tv.tv_sec)*1000000)+tv.tv_usec;
270 }
271
272 #define assert(_e) ((_e)?(void)0:(_assert(#_e,__FILE__,__LINE__),exit(1)))
273 void _assert(char *estr, char *file, int line) {
274 printf("\n\n=== ASSERTION FAILED ===\n");
275 printf("==> %s:%d '%s' is not true\n",file,line,estr);
276 }
277
278 intset *createSet(int bits, int size) {
279 uint64_t mask = (1length-1); i++) {
298 if (is->encoding == INTSET_ENC_INT16) {
299 int16_t *i16 = (int16_t*)is->contents;
300 assert(i16 < i16[i+1]);
301 } else if (is->encoding == INTSET_ENC_INT32) {
302 int32_t *i32 = (int32_t*)is->contents;
303 assert(i32 < i32[i+1]);
304 } else {
305 int64_t *i64 = (int64_t*)is->contents;
306 assert(i64 < i64[i+1]);
307 }
308 }
309 }
310
311 int main(int argc, char **argv) {
312 uint8_t success;
313 int i;
314 intset *is;
315 sranddev();
316
317 printf("Value encodings: "); {
318 assert(_intsetValueEncoding(-32768) == INTSET_ENC_INT16);
319 assert(_intsetValueEncoding(+32767) == INTSET_ENC_INT16);
320 assert(_intsetValueEncoding(-32769) == INTSET_ENC_INT32);
321 assert(_intsetValueEncoding(+32768) == INTSET_ENC_INT32);
322 assert(_intsetValueEncoding(-2147483648) == INTSET_ENC_INT32);
323 assert(_intsetValueEncoding(+2147483647) == INTSET_ENC_INT32);
324 assert(_intsetValueEncoding(-2147483649) == INTSET_ENC_INT64);
325 assert(_intsetValueEncoding(+2147483648) == INTSET_ENC_INT64);
326 assert(_intsetValueEncoding(-9223372036854775808ull) == INTSET_ENC_INT64);
327 assert(_intsetValueEncoding(+9223372036854775807ull) == INTSET_ENC_INT64);
328 ok();
329 }
330
331 printf("Basic adding: "); {
332 is = intsetNew();
333 is = intsetAdd(is,5,&success); assert(success);
334 is = intsetAdd(is,6,&success); assert(success);
335 is = intsetAdd(is,4,&success); assert(success);
336 is = intsetAdd(is,4,&success); assert(!success);
337 ok();
338 }
339
340 printf("Large number of random adds: "); {
341 int inserts = 0;
342 is = intsetNew();
343 for (i = 0; i < 1024; i++) {
344 is = intsetAdd(is,rand()%0x800,&success);
345 if (success) inserts++;
346 }
347 assert(is->length == inserts);
348 checkConsistency(is);
349 ok();
350 }
351
352 printf("Upgrade from int16 to int32: "); {
353 is = intsetNew();
354 is = intsetAdd(is,32,NULL);
355 assert(is->encoding == INTSET_ENC_INT16);
356 is = intsetAdd(is,65535,NULL);
357 assert(is->encoding == INTSET_ENC_INT32);
358 assert(intsetFind(is,32));
359 assert(intsetFind(is,65535));
360 checkConsistency(is);
361
362 is = intsetNew();
363 is = intsetAdd(is,32,NULL);
364 assert(is->encoding == INTSET_ENC_INT16);
365 is = intsetAdd(is,-65535,NULL);
366 assert(is->encoding == INTSET_ENC_INT32);
367 assert(intsetFind(is,32));
368 assert(intsetFind(is,-65535));
369 checkConsistency(is);
370 ok();
371 }
372
373 printf("Upgrade from int16 to int64: "); {
374 is = intsetNew();
375 is = intsetAdd(is,32,NULL);
376 assert(is->encoding == INTSET_ENC_INT16);
377 is = intsetAdd(is,4294967295,NULL);
378 assert(is->encoding == INTSET_ENC_INT64);
379 assert(intsetFind(is,32));
380 assert(intsetFind(is,4294967295));
381 checkConsistency(is);
382
383 is = intsetNew();
384 is = intsetAdd(is,32,NULL);
385 assert(is->encoding == INTSET_ENC_INT16);
386 is = intsetAdd(is,-4294967295,NULL);
387 assert(is->encoding == INTSET_ENC_INT64);
388 assert(intsetFind(is,32));
389 assert(intsetFind(is,-4294967295));
390 checkConsistency(is);
391 ok();
392 }
393
394 printf("Upgrade from int32 to int64: "); {
395 is = intsetNew();
396 is = intsetAdd(is,65535,NULL);
397 assert(is->encoding == INTSET_ENC_INT32);
398 is = intsetAdd(is,4294967295,NULL);
399 assert(is->encoding == INTSET_ENC_INT64);
400 assert(intsetFind(is,65535));
401 assert(intsetFind(is,4294967295));
402 checkConsistency(is);
403
404 is = intsetNew();
405 is = intsetAdd(is,65535,NULL);
406 assert(is->encoding == INTSET_ENC_INT32);
407 is = intsetAdd(is,-4294967295,NULL);
408 assert(is->encoding == INTSET_ENC_INT64);
409 assert(intsetFind(is,65535));
410 assert(intsetFind(is,-4294967295));
411 checkConsistency(is);
412 ok();
413 }
414
415 printf("Stress lookups: "); {
416 long num = 100000, size = 10000;
417 int i, bits = 20;
418 long long start;
419 is = createSet(bits,size);
420 checkConsistency(is);
421
422 start = usec();
423 for (i = 0; i < num; i++) intsetSearch(is,rand() % ((1 |
|