Aerospike Learning (2) - Memory Management

Memory Management

Set Preallocation

Set metadata is preallocated: 4096 set metadata entries are allocated up front, anchored at:

ns->p_sets_vmap
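
Conceptually, this is a vector map sized for all 4096 set entries when the namespace is created, so defining a new set never needs an allocation on the hot path. Below is a minimal sketch of that idea; the struct layout and helper names are assumptions for illustration, not the actual vmap implementation behind ns->p_sets_vmap.

#include <stdint.h>
#include <stdlib.h>

#define MAX_SETS 4096

// Hypothetical fixed-size per-set metadata entry.
typedef struct set_meta_sketch_s {
    char     name[64];
    uint64_t n_objects;
} set_meta_sketch;

typedef struct sets_vmap_sketch_s {
    uint32_t        count;          // number of sets defined so far
    set_meta_sketch sets[MAX_SETS]; // all 4096 entries preallocated up front
} sets_vmap_sketch;

static sets_vmap_sketch*
sets_vmap_create(void)
{
    // One allocation when the namespace is set up; adding a set later only
    // bumps `count` and fills in a preallocated slot.
    return calloc(1, sizeof(sets_vmap_sketch));
}

// Look up a set's metadata by id. The 12-bit set_id_bits field in the index
// structure shown below is exactly wide enough to address these 4096 slots.
static set_meta_sketch*
sets_vmap_get(sets_vmap_sketch* vmap, uint32_t set_id)
{
    return set_id < vmap->count ? &vmap->sets[set_id] : NULL;
}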

Index Operations

The 8 bits of byte 0 of the key's digest encode the partition-id, and 28 bits taken from bytes 1-4 select which subtree (sprig) of that partition the record belongs to. By default a partition has 256 sprigs, and each sprig is protected by its own mutex.
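
As an illustration of that mapping, here is a minimal sketch; the helper names and the exact bit manipulation follow the description above and are assumptions, not the actual Aerospike routines.

#include <stdint.h>

typedef struct cf_digest_sketch_s {
    uint8_t digest[20]; // the 20-byte key digest mentioned above
} cf_digest_sketch;

// Byte 0 of the digest selects the partition (per the description above).
static uint32_t
sketch_partition_id(const cf_digest_sketch* keyd)
{
    return keyd->digest[0];
}

// 28 bits taken from bytes 1-4 select one of the partition's sprigs
// (256 sprigs per partition by default).
static uint32_t
sketch_sprig_id(const cf_digest_sketch* keyd, uint32_t n_sprigs)
{
    uint32_t bits = ((uint32_t)keyd->digest[1] << 20) |
                    ((uint32_t)keyd->digest[2] << 12) |
                    ((uint32_t)keyd->digest[3] << 4)  |
                    ((uint32_t)keyd->digest[4] >> 4);

    return bits % n_sprigs; // reduce the 28 bits to one of the n_sprigs subtrees
}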

The in-memory index structure is shown below. Excluding the reserved, auxiliary, and cluster-related fields, the core fields amount to roughly 42 effective bytes.

typedef struct as_index_s {

    // offset: 0
    // This rc keeps the index from being freed while a sprig is being reduced
    // (e.g. when added nodes cause some indexes to migrate and be re-inserted
    // into another tree). To reduce lock contention each sprig has two locks;
    // the reduce lock exists specifically for this scenario: while it is held,
    // only node pointers are collected and no heavy work is done.
    uint16_t rc; // for now, incremented & decremented only when reducing sprig

    // offset: 2
    uint8_t : 8; // reserved for bigger rc, if needed

    // offset: 3
    uint8_t tree_id: 6; // tree id of the partition, recorded in the SSD device header (a namespace may have several trees, e.g. secondary indexes have their own)
    uint8_t color: 1;   // red or black
    uint8_t : 1;

    // offset: 4
    cf_digest keyd; // 20-byte digest (hash) of the key

    // offset: 24
    uint64_t left_h: 40;  // left child in the red-black tree (actually an arena handle)
    uint64_t right_h: 40; // right child in the red-black tree (actually an arena handle)

    // offset: 34
    uint16_t set_id_bits: 12; // id of the record's set, used to look up the set's in-memory structure directly
    // The next four fields are not part of the core path and can be ignored here.
    uint16_t in_sindex: 1;
    uint16_t xdr_bin_cemetery: 1;
    uint16_t has_bin_meta: 1; // named for warm restart erase (was for old DIM)
    uint16_t xdr_write: 1;

    // offset: 36
    uint32_t xdr_tombstone: 1;
    uint32_t xdr_nsup_tombstone: 1;
    uint32_t void_time: 30; // expiration time; if no expiration is set, this holds its maximum value

    // offset: 40
    uint64_t last_update_time: 40; // time of the most recent update
    uint64_t generation: 16;       // record generation (version number)

    // offset: 47
    // Used by the storage engines.
    uint64_t rblock_id: 37; // can address 2^37 * 16b = 2Tb drive; the record's rblock number
    uint64_t n_rblocks: 19; // is enough for 8Mb/16b = 512K rblocks; how many rblocks the record uses
    uint64_t file_id: 7;    // which SSD device the record lives on
    uint64_t key_stored: 1; // whether the original key is stored in the SSD block (the key itself may not be kept, only its hash)

    // offset: 55
    // Used for cluster communication.
    uint8_t repl_state: 2;
    uint8_t tombstone: 1;
    uint8_t cenotaph: 1;
    uint8_t : 4;

    // offset: 56
    // These 8 bytes are currently unused.
    void* dim;

    // final size: 64

} __attribute__ ((__packed__)) as_index;
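
If the real header is in scope, the layout commented above can be pinned down at compile time with static asserts, e.g. (a sketch, not from the Aerospike source):

#include <stddef.h>

_Static_assert(sizeof(as_index) == 64, "as_index must stay exactly 64 bytes");
_Static_assert(offsetof(as_index, keyd) == 4, "digest expected at offset 4");
_Static_assert(offsetof(as_index, dim) == 56, "trailing pointer expected at offset 56");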

Getting a record's data, inserting it if it does not exist:

// If there's an element with specified digest in the tree, return a locked
// reference to it in index_ref. If not, create an element with this digest,
// insert it into the tree, and return a locked reference to it in index_ref.
//
// Returns:
//  1 - created and inserted (reference returned in index_ref)
//  0 - found already existing (reference returned in index_ref)
// -1 - error - could not allocate arena stage
int
as_index_get_insert_vlock(as_index_tree* tree, const cf_digest* keyd,
        as_index_ref* index_ref)
{
    as_index_sprig isprig;
    as_index_sprig_from_keyd(tree, &isprig, keyd);

    int result = as_index_sprig_get_insert_vlock(&isprig, tree->id, keyd,
            index_ref);

    // Increment the tree's total element count.
    if (result == 1) {
        as_incr_uint64(&tree->n_elements);
    }

    return result;
}
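
For orientation, here is a hedged sketch of how a caller might consume the locked reference. Only as_index_get_insert_vlock() and the as_index_ref fields shown in these snippets come from the source; the wrapper function and the bare unlock at the end are illustrative (the real write path releases the reference through as_record_done(), shown later in the delete section).

// Illustrative caller sketch, not Aerospike source.
static void
write_record_sketch(as_index_tree* tree, const cf_digest* keyd)
{
    as_index_ref r_ref;

    int rv = as_index_get_insert_vlock(tree, keyd, &r_ref);

    if (rv == -1) {
        return; // could not allocate an arena stage - fail the write
    }

    as_index* r = r_ref.r;

    if (rv == 1) {
        // Newly created index entry - initialize record metadata here.
    }
    else {
        // rv == 0: the record already exists - this is an update/overwrite.
    }

    (void)r;

    // ... read or modify the record while the sprig lock is still held ...

    cf_mutex_unlock(r_ref.olock); // illustrative; real code uses as_record_done()
}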

The function below is the red-black tree lookup. If the element is not found, a tree node is created and the corresponding index_ref is returned; note that the lock is still held when the function returns.

static int
as_index_sprig_get_insert_vlock(as_index_sprig* isprig, uint8_t tree_id,
        const cf_digest* keyd, as_index_ref* index_ref)
{
    int cmp = 0;

    // Use a stack as_index object for the root's parent, for convenience.
    as_index root_parent;

    // Save parents as we search for the specified element's insertion point.
    as_index_ele eles[64]; // must be >= (24 * 2)
    as_index_ele* ele;

    while (true) {
        ele = eles;

        cf_mutex_lock(&isprig->pair->lock);

        // Search for the specified element, or a parent to insert it under.

        root_parent.left_h = isprig->sprig->root_h;
        root_parent.color = BLACK;

        ele->parent = NULL; // we'll never look this far up
        ele->me_h = 0; // root parent has no handle, never used
        ele->me = &root_parent;

        cf_arenax_handle t_h = isprig->sprig->root_h;

        while (t_h != SENTINEL_H) {
            as_index* t = RESOLVE(t_h);

            ele++;
            ele->parent = ele - 1;
            ele->me_h = t_h;
            ele->me = t;

            as_arch_prefetch_nt(t);
            // If the element is found, just fill in index_ref and return;
            // note the comparison is done on the digest (hash) values.
            if ((cmp = cf_digest_compare(keyd, &t->keyd)) == 0) {
                // The element already exists, simply return it.

                index_ref->r = t;
                index_ref->r_h = t_h;

                index_ref->puddle = isprig->puddle;
                index_ref->olock = &isprig->pair->lock;

                return 0;
            }

            t_h = cmp > 0 ? t->left_h : t->right_h;
        }

        // We didn't find the tree element, so we'll be inserting it.

        if (cf_mutex_trylock(&isprig->pair->reduce_lock)) {
            break; // no reduce in progress - go ahead and insert new element
        }

        // The tree is being reduced - could take long, unlock so reads and
        // overwrites aren't blocked.
        cf_mutex_unlock(&isprig->pair->lock);

        // Wait until the tree reduce is done...
        cf_mutex_lock(&isprig->pair->reduce_lock);
        cf_mutex_unlock(&isprig->pair->reduce_lock);

        // ... and start over - we unlocked, so the tree may have changed.
    }

    // Create a new element and insert it.

    // Save the root so we can detect whether it changes.
    cf_arenax_handle old_root = isprig->sprig->root_h;

    // Get an in-memory index slot; the free list is tried first, and a new
    // slot is allocated only if the free list is empty.
    cf_arenax_handle n_h = cf_arenax_alloc(isprig->arena, isprig->puddle);
    ...

    // Everything from here on inserts this index node into the red-black tree.
    as_index* n = RESOLVE(n_h);

    *n = (as_index){
        .tree_id = tree_id,
        .keyd = *keyd,
        .left_h = SENTINEL_H,
        .right_h = SENTINEL_H,
        .color = RED
    };

    // Insert the new element n under parent ele.
    if (ele->me == &root_parent || 0 < cmp) {
        ele->me->left_h = n_h;
    }
    else {
        ele->me->right_h = n_h;
    }

    ele++;
    ele->parent = ele - 1;
    ele->me_h = n_h;
    ele->me = n;

    // Rebalance the sprig as needed.
    as_index_sprig_insert_rebalance(isprig, &root_parent, ele);

    // If insertion caused the root to change, save the new root.
    if (root_parent.left_h != old_root) {
        isprig->sprig->root_h = root_parent.left_h;
    }

    cf_mutex_unlock(&isprig->pair->reduce_lock);

    index_ref->r = n;
    index_ref->r_h = n_h;

    index_ref->puddle = isprig->puddle;
    index_ref->olock = &isprig->pair->lock;

    return 1;
}
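
To make the insert/reduce interplay concrete, here is a hedged sketch of what a sprig reduce might look like, based only on the rc and reduce-lock behavior described in the index struct comments and in the insert path above; the function body is an illustrative reconstruction, not Aerospike source.

// Sketch: while holding both locks, only collect handles and bump rc (cheap),
// then drop the locks so reads and writes can proceed, and do the heavy
// per-record callback work afterwards.
static void
as_index_sprig_reduce_sketch(as_index_sprig* isprig,
        void (*cb)(as_index* r, void* udata), void* udata)
{
    cf_mutex_lock(&isprig->pair->reduce_lock); // makes inserters wait (see above)
    cf_mutex_lock(&isprig->pair->lock);

    // Only collect handles and increment rc while the locks are held - rc keeps
    // the collected indexes from being freed once the locks are dropped.
    cf_arenax_handle handles[64]; // placeholder; real code sizes this properly
    uint32_t n = 0;
    // (in-order traversal of the sprig collecting into handles[], rc++ each)

    cf_mutex_unlock(&isprig->pair->lock);
    cf_mutex_unlock(&isprig->pair->reduce_lock);

    // Heavy per-record work happens with no sprig locks held.
    for (uint32_t i = 0; i < n; i++) {
        as_index* r = RESOLVE(handles[i]);
        cb(r, udata);
        r->rc--; // illustrative - the real code would decrement atomically
    }
}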

Deleting a key

// Simply remove the node from the index tree.
as_index_delete(p_partition->tree, &flat->keyd);
// Actually release the index structure back to the free list.
as_record_done(&r_ref, ns);
void
as_record_destroy(as_record* r, as_namespace* ns)
{
    // Statistics only (decrement the record counts of the record's set and of
    // the namespace).
    as_record_drop_stats(r, ns);

    // SSD operation - actually remove the record from the storage engine.
    as_storage_destroy_record(ns, r);
}

The core function is ssd_block_free; the related logic is presented in the SSD management chapter.

void
as_storage_destroy_record_ssd(as_namespace *ns, as_record *r)
{
    if (STORAGE_RBLOCK_IS_VALID(r->rblock_id) && r->n_rblocks != 0) {
        drv_ssds *ssds = (drv_ssds*)ns->storage_private;
        drv_ssd *ssd = &ssds->ssds[r->file_id];

        ssd_block_free(ssd, r->rblock_id, r->n_rblocks, "destroy");

        as_namespace_adjust_set_data_used_bytes(ns, as_index_get_set_id(r),
                -(int64_t)N_RBLOCKS_TO_SIZE(r->n_rblocks));

        r->rblock_id = 0;
        r->n_rblocks = 0;
    }
}
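
The size adjustment above relies on the 16-byte rblock unit already noted in the index struct comments (2^37 * 16b, 8Mb/16b). A sketch of the implied arithmetic follows; the macro name mirrors the call above, but the definition here is an assumption, not copied from the storage headers.

#include <stdint.h>

#define RBLOCK_SIZE 16 // bytes per rblock, per the index comments above

// Convert an rblock count to bytes - e.g. a record occupying 96 rblocks
// accounts for 96 * 16 = 1536 bytes of the set's data-used-bytes, which is
// what the negative adjustment above subtracts.
#define N_RBLOCKS_TO_SIZE(n) ((uint64_t)(n) * RBLOCK_SIZE)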

Primary Index Memory Management

Aerospike creates a dedicated arena for each namespace. Since a sprig tree node (as_index) is a fixed-length 64-byte structure, Aerospike uses its own arenax design to manage this fixed-size data.

The diagram above shows how arenax is organized. arenax is a two-level memory allocation structure:

  • The first level consists of stages: there are 256 stages in total, each pointing to a fixed-size memory region (1 GB). The second level is that memory region itself, which is split into 16777216 slots (1 GB / 64 B) according to the size of as_index.
  • arenax maintains a free list. When an as_index structure is freed, it is pushed onto the head of that free list (arenax really only keeps the head of the list). On an allocation request, arenax first checks whether the free list has a spare slot; if it does, the slot at the head of the free list is handed back to the caller.
  • If the free list has no spare slot, a fresh slot is taken based on the current stage id (at_stage_id) and the corresponding element id (at_element_id), that slot is returned to the caller, and at_stage_id / at_element_id are advanced accordingly.
  • What the caller gets back is a cf_arenax_handle (uint64_t). The handle is 40 bits wide and is not a memory address but a two-level index: the first 12 bits are the level-1 stage index and the remaining 28 bits are the level-2 element index. To use it, the handle has to be converted to a pointer through: void* cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h) (see the sketch after this list).
  • Judging from the arenax layout alone, the upper bound on index nodes (as_index) for a single namespace on one server is stage_count * slot_count_per_stage = 256 * 16777216 = 4294967296, which is effectively unlimited.
  • To distinguish invalid handles, slot[0] of stage[0] is never handed out; the initial state is at_stage_id = 0, at_element_id = 1.
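
As a concrete illustration of the handle layout just described (a 12-bit stage index plus a 28-bit element index, with 64-byte elements), here is a minimal sketch; the names and the stage array are assumptions for illustration and do not come from the real cf_arenax code.

#include <stddef.h>
#include <stdint.h>

#define STAGE_BITS   12
#define ELEMENT_BITS 28
#define ELEMENT_SIZE 64 // sizeof(as_index)

// Only the low 40 bits (STAGE_BITS + ELEMENT_BITS) of the handle are used.
typedef uint64_t arenax_handle_sketch;

static arenax_handle_sketch
handle_make(uint32_t stage_id, uint32_t element_id)
{
    return ((uint64_t)stage_id << ELEMENT_BITS) | element_id;
}

// Conceptually what cf_arenax_resolve() does: pick the 1 GB stage, then
// offset into it by element_id * element size.
static void*
handle_resolve(uint8_t* const stages[], arenax_handle_sketch h)
{
    uint32_t stage_id   = (uint32_t)(h >> ELEMENT_BITS);
    uint32_t element_id = (uint32_t)(h & ((1u << ELEMENT_BITS) - 1));

    return stages[stage_id] + (size_t)element_id * ELEMENT_SIZE;
}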

wblock Cache

This is the data block cache. The underlying mechanism is not detailed here; it will be covered as a whole in the SSD engine chapter.

The conclusion: with the default wblock size of 1 MB, the in-memory data block cache defaults to 256 MB, i.e. 256 cached wblocks of 1 MB each.