Aerospike Learning (2) - Memory Management

Memory Management

Set Preallocation

Set metadata is preallocated: 4096 set metadata entries are allocated up front, anchored at:

ns->p_sets_vmap
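
Conceptually, this is a vector map sized for all 4096 set entries when the namespace is created, so defining a new set never needs an allocation on the hot path. Below is a minimal sketch of that idea; the struct layout and helper names are assumptions for illustration, not the actual vmap implementation behind ns->p_sets_vmap.

#include <stdint.h>
#include <stdlib.h>

#define MAX_SETS 4096

// Hypothetical fixed-size per-set metadata entry.
typedef struct set_meta_sketch_s {
    char     name[64];
    uint64_t n_objects;
} set_meta_sketch;

typedef struct sets_vmap_sketch_s {
    uint32_t        count;          // number of sets defined so far
    set_meta_sketch sets[MAX_SETS]; // all 4096 entries preallocated up front
} sets_vmap_sketch;

static sets_vmap_sketch*
sets_vmap_create(void)
{
    // One allocation when the namespace is set up; adding a set later only
    // bumps `count` and fills in a preallocated slot.
    return calloc(1, sizeof(sets_vmap_sketch));
}

// Look up a set's metadata by id. The 12-bit set_id_bits field in the index
// structure shown below is exactly wide enough to address these 4096 slots.
static set_meta_sketch*
sets_vmap_get(sets_vmap_sketch* vmap, uint32_t set_id)
{
    return set_id < vmap->count ? &vmap->sets[set_id] : NULL;
}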

Index Operations

The 8 bits of byte 0 of the key's digest encode the partition-id, and 28 bits taken from bytes 1-4 select which subtree (sprig) of that partition the record belongs to. By default a partition has 256 sprigs, and each sprig is protected by its own mutex.
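
As an illustration of that mapping, here is a minimal sketch; the helper names and the exact bit manipulation follow the description above and are assumptions, not the actual Aerospike routines.

#include <stdint.h>

typedef struct cf_digest_sketch_s {
    uint8_t digest[20]; // the 20-byte key digest mentioned above
} cf_digest_sketch;

// Byte 0 of the digest selects the partition (per the description above).
static uint32_t
sketch_partition_id(const cf_digest_sketch* keyd)
{
    return keyd->digest[0];
}

// 28 bits taken from bytes 1-4 select one of the partition's sprigs
// (256 sprigs per partition by default).
static uint32_t
sketch_sprig_id(const cf_digest_sketch* keyd, uint32_t n_sprigs)
{
    uint32_t bits = ((uint32_t)keyd->digest[1] << 20) |
                    ((uint32_t)keyd->digest[2] << 12) |
                    ((uint32_t)keyd->digest[3] << 4)  |
                    ((uint32_t)keyd->digest[4] >> 4);

    return bits % n_sprigs; // reduce the 28 bits to one of the n_sprigs subtrees
}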

The in-memory index structure is shown below. Excluding the reserved, auxiliary, and cluster-related fields, the core fields amount to roughly 42 effective bytes.

typedef struct as_index_s {

    // offset: 0
    // This rc keeps the index from being freed while a sprig is being reduced
    // (e.g. when added nodes cause some indexes to migrate and be re-inserted
    // into another tree). To reduce lock contention each sprig has two locks;
    // the reduce lock exists specifically for this scenario: while it is held,
    // only node pointers are collected and no heavy work is done.
    uint16_t rc; // for now, incremented & decremented only when reducing sprig

    // offset: 2
    uint8_t : 8; // reserved for bigger rc, if needed

    // offset: 3
    uint8_t tree_id: 6; // tree id of the partition, recorded in the SSD device header (a namespace may have several trees, e.g. secondary indexes have their own)
    uint8_t color: 1;   // red or black
    uint8_t : 1;

    // offset: 4
    cf_digest keyd; // 20-byte digest (hash) of the key

    // offset: 24
    uint64_t left_h: 40;  // left child in the red-black tree (actually an arena handle)
    uint64_t right_h: 40; // right child in the red-black tree (actually an arena handle)

    // offset: 34
    uint16_t set_id_bits: 12; // id of the record's set, used to look up the set's in-memory structure directly
    // The next four fields are not part of the core path and can be ignored here.
    uint16_t in_sindex: 1;
    uint16_t xdr_bin_cemetery: 1;
    uint16_t has_bin_meta: 1; // named for warm restart erase (was for old DIM)
    uint16_t xdr_write: 1;

    // offset: 36
    uint32_t xdr_tombstone: 1;
    uint32_t xdr_nsup_tombstone: 1;
    uint32_t void_time: 30; // expiration time; if no expiration is set, this holds its maximum value

    // offset: 40
    uint64_t last_update_time: 40; // time of the most recent update
    uint64_t generation: 16;       // record generation (version number)

    // offset: 47
    // Used by the storage engines.
    uint64_t rblock_id: 37; // can address 2^37 * 16b = 2Tb drive; the record's rblock number
    uint64_t n_rblocks: 19; // is enough for 8Mb/16b = 512K rblocks; how many rblocks the record uses
    uint64_t file_id: 7;    // which SSD device the record lives on
    uint64_t key_stored: 1; // whether the original key is stored in the SSD block (the key itself may not be kept, only its hash)

    // offset: 55
    // Used for cluster communication.
    uint8_t repl_state: 2;
    uint8_t tombstone: 1;
    uint8_t cenotaph: 1;
    uint8_t : 4;

    // offset: 56
    // These 8 bytes are currently unused.
    void* dim;

    // final size: 64

} __attribute__ ((__packed__)) as_index;
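
If the real header is in scope, the layout commented above can be pinned down at compile time with static asserts, e.g. (a sketch, not from the Aerospike source):

#include <stddef.h>

_Static_assert(sizeof(as_index) == 64, "as_index must stay exactly 64 bytes");
_Static_assert(offsetof(as_index, keyd) == 4, "digest expected at offset 4");
_Static_assert(offsetof(as_index, dim) == 56, "trailing pointer expected at offset 56");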

Getting a record's data, inserting it if it does not exist:

// If there's an element with specified digest in the tree, return a locked
// reference to it in index_ref. If not, create an element with this digest,
// insert it into the tree, and return a locked reference to it in index_ref.
//
// Returns:
//  1 - created and inserted (reference returned in index_ref)
//  0 - found already existing (reference returned in index_ref)
// -1 - error - could not allocate arena stage
int
as_index_get_insert_vlock(as_index_tree* tree, const cf_digest* keyd,
        as_index_ref* index_ref)
{
    as_index_sprig isprig;
    as_index_sprig_from_keyd(tree, &isprig, keyd);

    int result = as_index_sprig_get_insert_vlock(&isprig, tree->id, keyd,
            index_ref);

    // Increment the tree's total element count.
    if (result == 1) {
        as_incr_uint64(&tree->n_elements);
    }

    return result;
}
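
For orientation, here is a hedged sketch of how a caller might consume the locked reference. Only as_index_get_insert_vlock() and the as_index_ref fields shown in these snippets come from the source; the wrapper function and the bare unlock at the end are illustrative (the real write path releases the reference through as_record_done(), shown later in the delete section).

// Illustrative caller sketch, not Aerospike source.
static void
write_record_sketch(as_index_tree* tree, const cf_digest* keyd)
{
    as_index_ref r_ref;

    int rv = as_index_get_insert_vlock(tree, keyd, &r_ref);

    if (rv == -1) {
        return; // could not allocate an arena stage - fail the write
    }

    as_index* r = r_ref.r;

    if (rv == 1) {
        // Newly created index entry - initialize record metadata here.
    }
    else {
        // rv == 0: the record already exists - this is an update/overwrite.
    }

    (void)r;

    // ... read or modify the record while the sprig lock is still held ...

    cf_mutex_unlock(r_ref.olock); // illustrative; real code uses as_record_done()
}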

The function below is the red-black tree lookup. If the element is not found, a tree node is created and the corresponding index_ref is returned; note that the lock is still held when the function returns.

static int
as_index_sprig_get_insert_vlock(as_index_sprig* isprig, uint8_t tree_id,
        const cf_digest* keyd, as_index_ref* index_ref)
{
    int cmp = 0;

    // Use a stack as_index object for the root's parent, for convenience.
    as_index root_parent;

    // Save parents as we search for the specified element's insertion point.
    as_index_ele eles[64]; // must be >= (24 * 2)
    as_index_ele* ele;

    while (true) {
        ele = eles;

        cf_mutex_lock(&isprig->pair->lock);

        // Search for the specified element, or a parent to insert it under.

        root_parent.left_h = isprig->sprig->root_h;
        root_parent.color = BLACK;

        ele->parent = NULL; // we'll never look this far up
        ele->me_h = 0; // root parent has no handle, never used
        ele->me = &root_parent;

        cf_arenax_handle t_h = isprig->sprig->root_h;

        while (t_h != SENTINEL_H) {
            as_index* t = RESOLVE(t_h);

            ele++;
            ele->parent = ele - 1;
            ele->me_h = t_h;
            ele->me = t;

            as_arch_prefetch_nt(t);
            // If the element is found, just fill in index_ref and return;
            // note the comparison is done on the digest (hash) values.
            if ((cmp = cf_digest_compare(keyd, &t->keyd)) == 0) {
                // The element already exists, simply return it.

                index_ref->r = t;
                index_ref->r_h = t_h;

                index_ref->puddle = isprig->puddle;
                index_ref->olock = &isprig->pair->lock;

                return 0;
            }

            t_h = cmp > 0 ? t->left_h : t->right_h;
        }

        // We didn't find the tree element, so we'll be inserting it.

        if (cf_mutex_trylock(&isprig->pair->reduce_lock)) {
            break; // no reduce in progress - go ahead and insert new element
        }

        // The tree is being reduced - could take long, unlock so reads and
        // overwrites aren't blocked.
        cf_mutex_unlock(&isprig->pair->lock);

        // Wait until the tree reduce is done...
        cf_mutex_lock(&isprig->pair->reduce_lock);
        cf_mutex_unlock(&isprig->pair->reduce_lock);

        // ... and start over - we unlocked, so the tree may have changed.
    }

    // Create a new element and insert it.

    // Save the root so we can detect whether it changes.
    cf_arenax_handle old_root = isprig->sprig->root_h;

    // Get an in-memory index slot; the free list is tried first, and a new
    // slot is allocated only if the free list is empty.
    cf_arenax_handle n_h = cf_arenax_alloc(isprig->arena, isprig->puddle);
    ...

    // Everything from here on inserts this index node into the red-black tree.
    as_index* n = RESOLVE(n_h);

    *n = (as_index){
        .tree_id = tree_id,
        .keyd = *keyd,
        .left_h = SENTINEL_H,
        .right_h = SENTINEL_H,
        .color = RED
    };

    // Insert the new element n under parent ele.
    if (ele->me == &root_parent || 0 < cmp) {
        ele->me->left_h = n_h;
    }
    else {
        ele->me->right_h = n_h;
    }

    ele++;
    ele->parent = ele - 1;
    ele->me_h = n_h;
    ele->me = n;

    // Rebalance the sprig as needed.
    as_index_sprig_insert_rebalance(isprig, &root_parent, ele);

    // If insertion caused the root to change, save the new root.
    if (root_parent.left_h != old_root) {
        isprig->sprig->root_h = root_parent.left_h;
    }

    cf_mutex_unlock(&isprig->pair->reduce_lock);

    index_ref->r = n;
    index_ref->r_h = n_h;

    index_ref->puddle = isprig->puddle;
    index_ref->olock = &isprig->pair->lock;

    return 1;
}
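
To make the insert/reduce interplay concrete, here is a hedged sketch of what a sprig reduce might look like, based only on the rc and reduce-lock behavior described in the index struct comments and in the insert path above; the function body is an illustrative reconstruction, not Aerospike source.

// Sketch: while holding both locks, only collect handles and bump rc (cheap),
// then drop the locks so reads and writes can proceed, and do the heavy
// per-record callback work afterwards.
static void
as_index_sprig_reduce_sketch(as_index_sprig* isprig,
        void (*cb)(as_index* r, void* udata), void* udata)
{
    cf_mutex_lock(&isprig->pair->reduce_lock); // makes inserters wait (see above)
    cf_mutex_lock(&isprig->pair->lock);

    // Only collect handles and increment rc while the locks are held - rc keeps
    // the collected indexes from being freed once the locks are dropped.
    cf_arenax_handle handles[64]; // placeholder; real code sizes this properly
    uint32_t n = 0;
    // (in-order traversal of the sprig collecting into handles[], rc++ each)

    cf_mutex_unlock(&isprig->pair->lock);
    cf_mutex_unlock(&isprig->pair->reduce_lock);

    // Heavy per-record work happens with no sprig locks held.
    for (uint32_t i = 0; i < n; i++) {
        as_index* r = RESOLVE(handles[i]);
        cb(r, udata);
        r->rc--; // illustrative - the real code would decrement atomically
    }
}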

Deleting a key

// Simply remove the node from the index tree.
as_index_delete(p_partition->tree, &flat->keyd);
// Actually release the index structure back to the free list.
as_record_done(&r_ref, ns);
void
as_record_destroy(as_record* r, as_namespace* ns)
{
    // Statistics only (decrement the record counts of the record's set and of
    // the namespace).
    as_record_drop_stats(r, ns);

    // SSD operation - actually remove the record from the storage engine.
    as_storage_destroy_record(ns, r);
}

The core function is ssd_block_free; the related logic is presented in the SSD management chapter.

void
as_storage_destroy_record_ssd(as_namespace *ns, as_record *r)
{
    if (STORAGE_RBLOCK_IS_VALID(r->rblock_id) && r->n_rblocks != 0) {
        drv_ssds *ssds = (drv_ssds*)ns->storage_private;
        drv_ssd *ssd = &ssds->ssds[r->file_id];

        ssd_block_free(ssd, r->rblock_id, r->n_rblocks, "destroy");

        as_namespace_adjust_set_data_used_bytes(ns, as_index_get_set_id(r),
                -(int64_t)N_RBLOCKS_TO_SIZE(r->n_rblocks));

        r->rblock_id = 0;
        r->n_rblocks = 0;
    }
}
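
The size adjustment above relies on the 16-byte rblock unit already noted in the index struct comments (2^37 * 16b, 8Mb/16b). A sketch of the implied arithmetic follows; the macro name mirrors the call above, but the definition here is an assumption, not copied from the storage headers.

#include <stdint.h>

#define RBLOCK_SIZE 16 // bytes per rblock, per the index comments above

// Convert an rblock count to bytes - e.g. a record occupying 96 rblocks
// accounts for 96 * 16 = 1536 bytes of the set's data-used-bytes, which is
// what the negative adjustment above subtracts.
#define N_RBLOCKS_TO_SIZE(n) ((uint64_t)(n) * RBLOCK_SIZE)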

Primary Index Memory Management

Aerospike creates a dedicated arena for each namespace. Since a sprig tree node (as_index) is a fixed-length 64-byte structure, Aerospike uses its own arenax design to manage this fixed-size data.

The diagram above shows how arenax is organized. arenax is a two-level memory allocation structure:

  • The first level consists of stages: there are 256 stages in total, each pointing to a fixed-size memory region (1 GB). The second level is that memory region itself, which is split into 16777216 slots (1 GB / 64 B) according to the size of as_index.
  • arenax maintains a free list. When an as_index structure is freed, it is pushed onto the head of that free list (arenax really only keeps the head of the list). On an allocation request, arenax first checks whether the free list has a spare slot; if it does, the slot at the head of the free list is handed back to the caller.
  • If the free list has no spare slot, a fresh slot is taken based on the current stage id (at_stage_id) and the corresponding element id (at_element_id), that slot is returned to the caller, and at_stage_id / at_element_id are advanced accordingly.
  • What the caller gets back is a cf_arenax_handle (uint64_t). The handle is 40 bits wide and is not a memory address but a two-level index: the first 12 bits are the level-1 stage index and the remaining 28 bits are the level-2 element index. To use it, the handle has to be converted to a pointer through: void* cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h) (see the sketch after this list).
  • Judging from the arenax layout alone, the upper bound on index nodes (as_index) for a single namespace on one server is stage_count * slot_count_per_stage = 256 * 16777216 = 4294967296, which is effectively unlimited.
  • To distinguish invalid handles, slot[0] of stage[0] is never handed out; the initial state is at_stage_id = 0, at_element_id = 1.
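
As a concrete illustration of the handle layout just described (a 12-bit stage index plus a 28-bit element index, with 64-byte elements), here is a minimal sketch; the names and the stage array are assumptions for illustration and do not come from the real cf_arenax code.

#include <stddef.h>
#include <stdint.h>

#define STAGE_BITS   12
#define ELEMENT_BITS 28
#define ELEMENT_SIZE 64 // sizeof(as_index)

// Only the low 40 bits (STAGE_BITS + ELEMENT_BITS) of the handle are used.
typedef uint64_t arenax_handle_sketch;

static arenax_handle_sketch
handle_make(uint32_t stage_id, uint32_t element_id)
{
    return ((uint64_t)stage_id << ELEMENT_BITS) | element_id;
}

// Conceptually what cf_arenax_resolve() does: pick the 1 GB stage, then
// offset into it by element_id * element size.
static void*
handle_resolve(uint8_t* const stages[], arenax_handle_sketch h)
{
    uint32_t stage_id   = (uint32_t)(h >> ELEMENT_BITS);
    uint32_t element_id = (uint32_t)(h & ((1u << ELEMENT_BITS) - 1));

    return stages[stage_id] + (size_t)element_id * ELEMENT_SIZE;
}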

wblock Cache

This is the data block cache. The underlying mechanism is not detailed here; it will be covered as a whole in the SSD engine chapter.

The conclusion: with the default wblock size of 1 MB, the in-memory data block cache defaults to 256 MB, i.e. 256 cached wblocks of 1 MB each.