memcached源碼分析-slab存儲機制

導(dǎo)航

memcached源碼分析
 memcached源碼分析-網(wǎng)絡(luò)模塊
 memcached源碼分析-指令解析模塊
 memcached源碼分析-哈希表(hashtable)模塊
 memcached源碼分析-slab存儲機制

1.前言

這一章節(jié)主要介紹了memcached的內(nèi)存管理機制股缸，我們在指令解析模塊中說到了memcached是如何解析處理客戶端請求指令的手销。
例如：執(zhí)行set key flags exptime bytes \n value
memcached接收來此客戶端的存儲請求职员，那么服務(wù)端是如何存儲來自客戶端的存儲內(nèi)容的呢，這里就涉及到了slabs存儲機制赊琳，在此之前首先介紹memcached中關(guān)于內(nèi)存管理的幾個重要的概念

item 數(shù)據(jù)存儲節(jié)點

typedef struct _stritem {
    /* Protected by LRU locks */
    //一個item的地址, 主要用于LRU鏈和freelist鏈
    struct _stritem *next;
    //下一個item的地址,主要用于LRU鏈和freelist鏈
    struct _stritem *prev;

    /* Rest are protected by an item lock */
    //用于記錄哈希表槽中下一個item節(jié)點的地址
    struct _stritem *h_next;    /* hash chain next */
    //最近訪問時間
    rel_time_t      time;       /* least recent access */
    //緩存過期時間
    rel_time_t      exptime;    /* expire time */
    int             nbytes;     /* size of data */
    //當(dāng)前item被引用的次數(shù)街夭，用于判斷item是否被其它的線程在操作中
    //refcount == 1的情況下該節(jié)點才可以被刪除
    unsigned short  refcount;
    uint8_t         nsuffix;    /* length of flags-and-length string */
    uint8_t         it_flags;   /* ITEM_* above */
    //記錄該item節(jié)點位于哪個slabclass_t中
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;       /* key length, w/terminating null and padding */
    /* this odd type prevents type-punning issues when we do
     * the little shuffle to save space when not using CAS. */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;

slab與chunk

slab是一塊內(nèi)存空間，默認(rèn)大小為1M躏筏，memcached會把一個slab分割成一個個chunk, 這些被切割的小的內(nèi)存塊板丽，主要用來存儲item

slabclass

每個item的大小都可能不一樣，item存儲于chunk,如果chunk大小不夠趁尼，則不足以分配給item使用埃碱，如果chunk過大，則太過于浪費內(nèi)存空間酥泞。因此memcached采取的做法是砚殿，將slab切割成不同大小的chunk，這樣就滿足了不同大小item的存儲芝囤。被劃分不同大小chunk的slab的內(nèi)存在memcached就是用slabclass這個結(jié)構(gòu)體來表現(xiàn)的

typedef struct {
    //chunk大小
    unsigned int size;      /* sizes of items */
    //1M內(nèi)存大小被分割為多少個chunck
    unsigned int perslab;   /* how many items per slab */

    //空閑chunk鏈表
    void *slots;           /* list of item ptrs */
    //空閑chunk的個數(shù)
    unsigned int sl_curr;   /* total free items in list */

    //當(dāng)前slabclass已經(jīng)分配了所少個1M空間的slab
    unsigned int slabs;     /* how many slabs were allocated for this class */

    //slab指針數(shù)組
    void **slab_list;       /* array of slab pointers */
    //slab指針數(shù)組的大小
    unsigned int list_size; /* size of prev array */

    size_t requested; /* The number of requested bytes */
} slabclass_t;

slab_class結(jié)構(gòu)示意圖

1.slabclass數(shù)組初始化的時候似炎，每個slabclass_t都會分配一個1M大小的slab辛萍，slab會被切分為N個小的內(nèi)存塊，這個小的內(nèi)存塊的大小取決于slabclass_t結(jié)構(gòu)上的size的大小
2.每個slabclass_t都只存儲一定大小范圍的數(shù)據(jù)羡藐，并且下一個slabclass切割的chunk塊大于前一個slabclass切割的chunk塊大小
3.memcached中slabclass數(shù)組默認(rèn)大小為64贩毕，slabclass切割塊大小的增長因子默認(rèn)是1.25
例如：slabclass[1]切割的chunk塊大小為100字節(jié)，slabclass[2]為125仆嗦，如果需要存儲一個110字節(jié)的緩存辉阶，那么就需要到slabclass[2] 的空閑鏈表中獲取一個空閑節(jié)點進(jìn)行存儲

2.slabclass的初始化

/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    /* Some platforms use runtime transparent hugepages. If for any reason
     * the initial allocation fails, the required settings do not persist
     * for remaining allocations. As such it makes little sense to do slab
     * preallocation. */
    bool __attribute__ ((unused)) do_slab_prealloc = false;

    mem_limit = limit;

    if (prealloc) {
        //...
        /* Allocate everything in a big chunk with malloc */
        //預(yù)分配一個大空間
        mem_base = malloc(mem_limit);
        do_slab_prealloc = true;
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        //當(dāng)前slabclass切割的chunk塊大小
        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            //下一個slabclass切割的chunk塊大小
            size *= factor;
        //...
    }

    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    //...

    if (prealloc && do_slab_prealloc) {
        //為slabclass結(jié)構(gòu)分配slab空間，默認(rèn)大小為1M
        //并且將1M的slab空間切割成N個slabclass->size大小塊欧啤，并每個塊加入到slot空閑鏈表中睛藻，塊和塊直接用雙向鏈表串聯(lián)
        slabs_preallocate(power_largest);
    }
}


static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;
    //...

    for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
        if (++prealloc > maxslabs)
            return;
        //分配slab空間启上，并切割slab塊
        if (do_slabs_newslab(i) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                "If using -L or other prealloc options, max memory must be "
                "at least %d megabytes.\n", power_largest);
            exit(1);
        }
    }
}

//為slabclass分配一個新的slab
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    int len = (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size)
        ? settings.slab_page_size
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
         && g->slabs == 0)) {
        mem_limit_reached = true;
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }
    //空閑鏈表沒有空間可用邢隧，需要重新分配開辟一個slab空間
    //這個slab空間地址使用slabclass->slab_list保存的，grow_slab_list就是擴大slab_list
    if ((grow_slab_list(id) == 0) ||
        //分配具體的slab空間
        (((ptr = get_page_from_global_pool()) == NULL) &&
        ((ptr = memory_allocate((size_t)len)) == 0))) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    //將新分配的slab空間進(jìn)行切割chunk塊冈在，并加入到slot空閑列表中
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        //這里的釋放并不是一個真實意義上的釋放倒慧，而是將內(nèi)存塊歸還到slabclass的空閑列表中
        //我們分配一個新的slab空間，切割后加入到空閑列表中和memcached中釋放一個item節(jié)點操作是一致的包券，顧調(diào)用了相同的接口
        //將內(nèi)存chunk塊強轉(zhuǎn)為item類型纫谅，然后通過item中的雙向節(jié)點串接到slabclass的空閑節(jié)點列表中
        do_slabs_free(ptr, 0, id);
        //切割塊偏移
        ptr += p->size;
    }
}

3.item節(jié)點的分配流程

1、根據(jù)大小溅固，找到合適的slabclass
2付秕、 slabclass空閑列表中是否有空閑的item節(jié)點，如果有直接分配item侍郭，用于存儲內(nèi)容
3询吴、空閑列表沒有空閑的item可以分配，會重新開辟一個slab(默認(rèn)大小為1M)的內(nèi)存塊亮元，然后切割slab并放入到空閑列表中猛计，然后從空閑列表中獲取節(jié)點

item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
    item *it;
    /* do_item_alloc handles its own locks */
    it = do_item_alloc(key, nkey, flags, exptime, nbytes);
    return it;
}

item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
                    const rel_time_t exptime, const int nbytes) {
    //...
    //計算需要的空間大小
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }
    //根據(jù)大小，找到對應(yīng)的slabclass下標(biāo)
    unsigned int id = slabs_clsid(ntotal);
    unsigned int hdr_id = 0;
    if (id == 0)
        return 0;

    /* This is a large item. Allocate a header object now, lazily allocate
     *  chunks while reading the upload.
     */
    if (ntotal > settings.slab_chunk_size_max) {
        //...
    } else {
        //從根據(jù)id去對應(yīng)的slabcalss中獲取一個item節(jié)點
        it = do_item_alloc_pull(ntotal, id);
    }

    //...

    /* Refcount is seeded to 1 by slabs_alloc() */
    it->next = it->prev = 0;

     //LRU分層機制爆捞，后面說到LRU我們再做進(jìn)一步分析
    if (settings.temp_lru &&
            exptime - current_time <= settings.temporary_ttl) {
        id |= TEMP_LRU;
    } else if (settings.lru_segmented) {
        id |= HOT_LRU;
    } else {
        /* There is only COLD in compat-mode */
        id |= COLD_LRU;
    }

    it->slabs_clsid = id;
    //...

    return it;
}

item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
    item *it = NULL;
    int i;

    //這里涉及LRU淘汰機制奉瘤，后面會做介紹
    //...
    it = slabs_alloc(ntotal, id, &total_bytes, 0);
    //...

    return it;
}

void *slabs_alloc(size_t size, unsigned int id, uint64_t *total_bytes,
        unsigned int flags) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id, total_bytes, flags);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
        unsigned int flags) {
    slabclass_t *p;
    item *it = NULL;
    //...

    p = &slabclass[id];
    //...
    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    //空閑列表中已經(jīng)沒有空閑節(jié)點了，重新分配slab并切割加入到空閑列表中
    if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
        //為slabclass重新開辟一個slab
        do_slabs_newslab(id);
    }
    //空閑列表中含有空余的節(jié)點
    if (p->sl_curr != 0) {
        /* return off our freelist */
        //分配出去煮甥，并將該節(jié)點從空閑列表中移除
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        /* Kill flag and initialize refcount here for lock safety in slab
         * mover's freeness detection. */
        it->it_flags &= ~ITEM_SLABBED;
        it->refcount = 1;
        p->sl_curr--;
        ret = (void *)it;
    } else {
        ret = NULL;
    }

    //...
    return ret;
}

4.item節(jié)點的釋放

釋放一個item節(jié)點盗温，并不會free內(nèi)存空間，而是將item節(jié)點歸還到slabclass的空閑列表中

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    //將item節(jié)點歸還到slabclass的空閑列表中
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}

static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    //...
    p = &slabclass[id];

    it = (item *)ptr;
    if ((it->it_flags & ITEM_CHUNKED) == 0) {
        //...
        it->it_flags = ITEM_SLABBED;
        it->slabs_clsid = 0;
        it->prev = 0;
        it->next = p->slots;
        if (it->next) it->next->prev = it;
        p->slots = it;

        p->sl_curr++;
        //...
    } else {
        do_slabs_free_chunked(it, size);
    }
    return;
}