Overview
When accessing a disk, an I/O may need to be split into several smaller I/Os if it involves too much data, crosses a page boundary, and so on. This article studies the splitting logic based on the examples/nvme/hello_world example in the SPDK open-source code.
Request structure
In SPDK, a disk I/O is first wrapped into a request. If the I/O needs to be split, the resulting child I/Os are recorded in the children field (a queue) of the request created for the original, unsplit I/O. Below is the request structure, showing only a few important fields and the ones related to I/O splitting:
struct nvme_request {
struct spdk_nvme_cmd cmd; // SQE
......
/**
* Number of children requests still outstanding for this
* request which was split into multiple child requests.
*/
uint16_t num_children;
......
struct spdk_nvme_qpair *qpair; // IO Qpair
......
struct spdk_nvme_cpl cpl; // CQE
/**
* The following members should not be reordered with members
* above. These members are only needed when splitting
* requests which is done rarely, and the driver is careful
* to not touch the following fields until a split operation is
* needed, to avoid touching an extra cacheline.
*/
/**
* Points to the outstanding child requests for a parent request.
* Only valid if a request was split into multiple children
* requests, and is not initialized for non-split requests.
*/
TAILQ_HEAD(, nvme_request) children;
/**
* Linked-list pointers for a child request in its parent's list.
*/
TAILQ_ENTRY(nvme_request) child_tailq;
/**
* Points to a parent request if part of a split request,
* NULL otherwise.
*/
struct nvme_request *parent;
/**
* Completion status for a parent request. Initialized to all 0's
* (SUCCESS) before child requests are submitted. If a child
* request completes with error, the error status is copied here,
* to ensure that the parent request is also completed with error
* status once all child requests are completed.
*/
struct spdk_nvme_cpl parent_status;
/**
* The user_cb_fn and user_cb_arg fields are used for holding the original
* callback data when using nvme_allocate_request_user_copy.
*/
spdk_nvme_cmd_cb user_cb_fn;
void *user_cb_arg;
void *user_buffer;
};
I/O splitting
Call relationships
The figure below illustrates the call flow for creating a request for an I/O and splitting it.
[Figure: call flow of request creation and splitting (image not available)]
Implementation
The core of the split is a while loop that cuts one large I/O into multiple smaller I/Os, each small enough to be handled in a single command.
(Side note: this function takes an unusually large number of parameters...)
static struct nvme_request *
_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns,
struct spdk_nvme_qpair *qpair,
const struct nvme_payload *payload,
uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count,
spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, struct nvme_request *req,
uint32_t sectors_per_max_io, uint32_t sector_mask,
uint16_t apptag_mask, uint16_t apptag, int *rc)
{
uint32_t sector_size = _nvme_get_host_buffer_sector_size(ns, io_flags);
uint32_t remaining_lba_count = lba_count;
struct nvme_request *child;
while (remaining_lba_count > 0) {
lba_count = sectors_per_max_io - (lba & sector_mask);
lba_count = spdk_min(remaining_lba_count, lba_count);
child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc,
io_flags, apptag_mask, apptag, req, true, rc);
if (child == NULL) {
return NULL;
}
remaining_lba_count -= lba_count;
lba += lba_count;
payload_offset += lba_count * sector_size;
md_offset += lba_count * ns->md_size;
}
return req;
}
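To make the per-child arithmetic concrete, the following is a minimal standalone sketch (my own illustration, not SPDK code) that reproduces the loop above. The values used (a 4096-block I/O starting at LBA 0, sectors_per_max_io = 2048, sector_mask = 0, i.e. the "max I/O size" split case) match the hello_world experiment at the end of this article:
#include <stdint.h>
#include <stdio.h>

static void
print_children(uint64_t lba, uint32_t lba_count,
               uint32_t sectors_per_max_io, uint32_t sector_mask)
{
        uint32_t remaining = lba_count;

        while (remaining > 0) {
                /* Same formula as _nvme_ns_cmd_split_request(): trim the child
                 * at the next max-I/O/stripe boundary, then clamp it to what
                 * is left. */
                uint32_t child_count = sectors_per_max_io - (uint32_t)(lba & sector_mask);

                if (child_count > remaining) {
                        child_count = remaining;
                }
                printf("child: lba=%llu count=%u\n",
                       (unsigned long long)lba, child_count);
                remaining -= child_count;
                lba += child_count;
        }
}

int
main(void)
{
        /* Prints two children: (lba=0, count=2048) and (lba=2048, count=2048). */
        print_children(0, 4096, 2048, 0);
        return 0;
}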
For each small I/O, _nvme_ns_cmd_rw() is called to create a request, and nvme_request_add_child() is then called to put the new request onto the child-request queue of the request that corresponds to the original large I/O, as shown below:
static inline void
nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child)
{
......
parent->num_children++;
TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); // link onto the parent's children queue
child->parent = parent;
child->cb_fn = nvme_cb_complete_child;
child->cb_arg = child;
}
As the code above shows, the completion callback of every child request points to nvme_cb_complete_child, which does the bookkeeping and cleanup work shown below (nvme_request_remove_child() removes the child from the parent's queue and decrements num_children, so the parent is completed and freed only after the last child's callback has run):
static inline void
nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl)
{
struct nvme_request *child = child_arg;
struct nvme_request *parent = child->parent;
nvme_request_remove_child(parent, child);
if (spdk_nvme_cpl_is_error(cpl)) {
memcpy(&parent->parent_status, cpl, sizeof(*cpl));
}
if (parent->num_children == 0) {
nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair,
parent, &parent->parent_status);
nvme_free_request(parent);
}
}
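The accounting above can be summarized with a simplified model (my own sketch, not SPDK code): the parent waits for its last child, and any child error is latched into parent_status so that the parent finally completes with that error:
#include <stdbool.h>
#include <stdio.h>

struct parent_req {
        int num_children;
        bool status_is_error;   /* stands in for the spdk_nvme_cpl parent_status */
};

static void
child_complete(struct parent_req *p, bool child_failed)
{
        p->num_children--;                      /* nvme_request_remove_child() */
        if (child_failed) {
                p->status_is_error = true;      /* memcpy(&parent->parent_status, cpl, ...) */
        }
        if (p->num_children == 0) {
                /* nvme_complete_request(parent, ..., &parent->parent_status) */
                printf("parent completed, status = %s\n",
                       p->status_is_error ? "error" : "success");
        }
}

int
main(void)
{
        struct parent_req p = { .num_children = 2, .status_is_error = false };

        child_complete(&p, false);      /* first child succeeds, parent keeps waiting */
        child_complete(&p, true);       /* last child fails -> parent completes with error */
        return 0;
}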
I/O execution
The previous section showed how an I/O is split into multiple small requests queued on the parent request. When the parent request is submitted, the code checks whether it has child requests; if so, it loops over the children and submits each one, and once all children have been submitted it returns without processing the parent request itself (the parent is effectively just a container). The code is as follows:
static inline int
_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
{
......
if (req->num_children) {
/*
* This is a split (parent) request. Submit all of the children but not the parent
* request itself, since the parent is the original unsplit request.
*/
TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
if (spdk_likely(!child_req_failed)) {
rc = nvme_qpair_submit_request(qpair, child_req);
if (spdk_unlikely(rc != 0)) {
child_req_failed = true;
}
} else { /* free remaining child_reqs since one child_req fails */
nvme_request_remove_child(req, child_req);
nvme_request_free_children(child_req);
nvme_free_request(child_req);
}
}
if (spdk_unlikely(child_req_failed)) {
/* part of children requests have been submitted,
* return success since we must wait for those children to complete,
* but set the parent request to failure.
*/
if (req->num_children) {
req->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
return 0;
}
goto error;
}
return rc;
}
......
}
I/O splitting analysis
I/O split conditions in SPDK
The points where SPDK decides to split an I/O are in the code below:
static inline struct nvme_request *
_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl, int *rc)
{
......
/*
* Intel DC P3*00 NVMe controllers benefit from driver-assisted striping.
* If this controller defines a stripe boundary and this I/O spans a stripe
* boundary, split the request into multiple requests and submit each
* separately to hardware.
*/
if (sectors_per_stripe > 0 &&
(((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) {
return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
cb_fn,
cb_arg, opc,
io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag, rc);
} else if (lba_count > sectors_per_max_io) {
return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
cb_fn,
cb_arg, opc,
io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag, rc);
} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) {
if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc, io_flags,
req, apptag_mask, apptag, rc);
} else {
return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset,
lba, lba_count, cb_fn, cb_arg, opc, io_flags,
req, apptag_mask, apptag, rc);
}
}
_nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
return req;
}
There are only three branches in the code that split an I/O:
- The disk defines a stripe size and this I/O crosses a stripe boundary (a worked example follows this list).
- The number of LBAs in the I/O exceeds the maximum number of sectors the disk supports in a single I/O.
- This one looks odd at first: the outermost condition checks that the payload uses the SGL type, yet the inner condition then branches into SGL and PRP paths. (The outer check is about how the host buffer is described to SPDK, while the inner check is whether the controller supports NVMe SGL descriptors; if it does not, the transfer has to be described with PRPs, hence the two different split helpers.)
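As a worked example of the first (stripe) condition, consider hypothetical values that are not taken from this article: sectors_per_stripe = 256 and an 8-block I/O starting at LBA 252. The sketch below simply evaluates the boundary check:
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t lba = 252;                     /* hypothetical start LBA */
        uint32_t lba_count = 8;                 /* hypothetical I/O length */
        uint32_t sectors_per_stripe = 256;      /* hypothetical stripe size */
        uint32_t sector_mask = sectors_per_stripe - 1;

        /* (252 & 255) + 8 = 260 > 256: the I/O crosses the stripe boundary
         * at LBA 256, so it would be split. */
        assert(((lba & sector_mask) + lba_count) > sectors_per_stripe);

        /* The first child is trimmed at the boundary, exactly as in the split
         * loop: 256 - (252 & 255) = 4 blocks; the remaining 4 blocks become a
         * second child starting at LBA 256. */
        printf("first child blocks = %u\n",
               sectors_per_stripe - (uint32_t)(lba & sector_mask));
        return 0;
}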
How the disk's maximum I/O size is computed
In SPDK, several fields in two structures record the I/O size limits supported by the disk (they are filled in during controller initialization), as follows:
struct spdk_nvme_ctrlr {
......
/** maximum i/o size in bytes */
uint32_t max_xfer_size;
/** minimum page size supported by this controller in bytes */
uint32_t min_page_size;
/** selected memory page size for this controller in bytes */
uint32_t page_size;
......
};
struct spdk_nvme_ns {
struct spdk_nvme_ctrlr *ctrlr;
uint32_t sector_size;
/*
* Size of data transferred as part of each block,
* including metadata if FLBAS indicates the metadata is transferred
* as part of the data buffer at the end of each LBA.
*/
uint32_t extended_lba_size;
......
uint32_t sectors_per_max_io;
uint32_t sectors_per_max_io_no_md;
uint32_t sectors_per_stripe;
......
};
- The page_size field is obtained from the controller's CAP register, as follows:
static void
nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr)
{
......
ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin);
/* For now, always select page_size == min_page_size. */
ctrlr->page_size = ctrlr->min_page_size;
......
}
- The upper bound of max_xfer_size depends on the maximum number of PRP list entries supported and on page_size; it is computed as follows:
#define NVME_MAX_PRP_LIST_ENTRIES (503)
static uint32_t
nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
/*
* For commands requiring more than 2 PRP entries, one PRP will be
* embedded in the command (prp1), and the rest of the PRP entries
* will be in a list pointed to by the command (prp2). The number
* of PRP entries in the list is defined by
* NVME_MAX_PRP_LIST_ENTRIES.
*
* Note that the max xfer size is not (MAX_ENTRIES + 1) * page_size
* because the first PRP entry may not be aligned on a 4KiB
* boundary.
*/
return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
}
The actual max_xfer_size used at runtime is further limited by the controller's MDTS (Maximum Data Transfer Size), which the NVMe specification expresses as a power of two in units of the minimum memory page size (CAP.MPSMIN). The final value is computed as follows:
static void
nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl)
{
......
/*
* Use MDTS to ensure our default max_xfer_size doesn't exceed what the
* controller supports.
*/
ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr);
NVME_CTRLR_DEBUGLOG(ctrlr, "transport max_xfer_size %u\n", ctrlr->max_xfer_size);
if (ctrlr->cdata.mdts > 0) {
ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size,
ctrlr->min_page_size * (1 << ctrlr->cdata.mdts));
NVME_CTRLR_DEBUGLOG(ctrlr, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size);
}
......
}
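As a quick sanity check, using the values that appear in the gdb dump at the end of this article (min_page_size = page_size = 4096, which corresponds to CAP.MPSMIN = 0, and cdata.mdts = 8), the two bounds work out as follows:
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t page_size = 4096;                      /* == min_page_size here */
        uint32_t mdts = 8;                              /* from the gdb dump below */
        uint32_t prp_bound = 503 * page_size;           /* 2,060,288 bytes */
        uint32_t mdts_bound = page_size * (1u << mdts); /* 1,048,576 bytes (1 MiB) */
        uint32_t max_xfer_size = prp_bound < mdts_bound ? prp_bound : mdts_bound;

        printf("max_xfer_size = %u\n", max_xfer_size);  /* prints 1048576 */
        return 0;
}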
- The spdk_nvme_ns->sector_size field is derived from the namespace's LBAF->LBADS information, as in the code below:
void
nvme_ns_set_identify_data(struct spdk_nvme_ns *ns)
{
struct spdk_nvme_ns_data *nsdata;
nsdata = _nvme_ns_get_data(ns);
ns->flags = 0x0000;
ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads;
ns->extended_lba_size = ns->sector_size;
......
}
In the NVMe specification, LBADS in an LBA Format (LBAF) entry gives the LBA data size as a power of two (2^LBADS bytes, with 512 bytes, i.e. LBADS = 9, as the minimum). So sector_size is the size of one block in bytes (at least 512), and max_xfer_size is the maximum number of bytes in one I/O; from these two values the maximum number of blocks per I/O is computed as follows:
void
nvme_ns_set_identify_data(struct spdk_nvme_ns *ns)
{
......
ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size;
ns->sectors_per_max_io_no_md = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->sector_size;
if (ns->ctrlr->quirks & NVME_QUIRK_MDTS_EXCLUDE_MD) {
ns->sectors_per_max_io = ns->sectors_per_max_io_no_md;
}
......
}
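Continuing the same worked example (values again taken from the gdb dump below: 1 MiB max transfer, 512-byte blocks, md_size = 0 so extended_lba_size equals sector_size), the block count per I/O follows directly:
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t max_xfer_size = 1048576;       /* 1 MiB, from the gdb dump below */
        uint32_t extended_lba_size = 512;       /* 512-byte blocks, no metadata */

        printf("sectors_per_max_io = %u\n",
               max_xfer_size / extended_lba_size);      /* prints 2048 */
        return 0;
}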
- The field ns->sectors_per_stripe describes a stripe boundary defined by the controller/namespace; an I/O that crosses this boundary must also be split (this is the first split condition discussed above; so far the Intel DC P3*00 NVMe controllers are known to have this behavior). The field is set as follows:
void
nvme_ns_set_identify_data(struct spdk_nvme_ns *ns)
{
......
if (nsdata->noiob) {
ns->sectors_per_stripe = nsdata->noiob;
SPDK_DEBUGLOG(nvme, "ns %u optimal IO boundary %" PRIu32 " blocks\n",
ns->id, ns->sectors_per_stripe);
} else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING &&
ns->ctrlr->cdata.vs[3] != 0) {
ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size /
ns->sector_size;
SPDK_DEBUGLOG(nvme, "ns %u stripe size quirk %" PRIu32 " blocks\n",
ns->id, ns->sectors_per_stripe);
} else {
ns->sectors_per_stripe = 0;
}
......
}
- As the code shows, the first source is the namespace field noiob (NOIOB, Namespace Optimal I/O Boundary, from the NVMe Identify Namespace data).
- The second source is the controller's vendor-specific field vs; the use of vs[3] here is presumably the layout defined by the Intel DC P3*00 NVMe controllers.
Constructing an I/O to verify splitting
In SPDK's hello_world example, an I/O that hits the second split condition can be constructed by modifying hello_world.c as follows:
- When allocating the buffer, request 2 MiB:
sequence.buf = spdk_zmalloc(0x200000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
- When issuing the write, change the lba_count argument to 4096:
rc = spdk_nvme_ns_cmd_write(ns_entry->ns, ns_entry->qpair, sequence.buf,
                            0,    /* LBA start */
                            4096, /* number of LBAs */
                            write_complete, &sequence, 0);
Debugging confirms that this construction satisfies the second SPDK split condition, lba_count > sectors_per_max_io.
Add a print of the number of children in the _nvme_qpair_submit_request function from the I/O execution section above:
SPDK_ERRLOG("----------num of children = %d------------\n\n", req->num_children);
if (req->num_children) {
......
}
The number of child requests printed is:
[2023-03-23 09:02:41.777381] nvme_qpair.c: 946:_nvme_qpair_submit_request: *ERROR*:----------num of children = 2------------
The reason that changing lba_count to 4096 (in fact anything greater than 2048 would do) triggers a split is as follows. Debugging shows the following I/O-size-related values after controller initialization (4096 blocks at 2048 blocks per I/O yields exactly the 2 children printed above):
- Namespace-related:
(gdb) p *ns
$2 = {
ctrlr = 0x2000003d60c0,
sector_size = 512, // bytes per block: 1 << nsdata->lbaf[0].lbads
extended_lba_size = 512,
md_size = 0,
pi_type = 0,
sectors_per_max_io = 2048, // i.e. one I/O can hold at most 2048 blocks, so any lba_count above this is split
sectors_per_max_io_no_md = 2048,
......
nsdata = {
......
lbaf = {{ms = 0, lbads = 9, rp = 0, reserved6 = 0}, {ms = 0, lbads = 0, rp = 0, reserved6 = 0} <repeats 15 times>},
reserved6 = '\000' <repeats 191 times>,
vendor_specific = '\000' <repeats 3711 times>
}
......
node = {rbe_left = 0x0, rbe_right = 0x2000002f2e00, rbe_parent = 0x1}
}
- Controller-related:
ctrlr (after identify cmd)
{
......
max_xfer_size = 1048576, // MIN(page_size * 503, page_size * (1<<cdata->mdts))
min_page_size = 4096,
page_size = 4096,
......
cdata = {
vid = 5549,
ssvid = 5549,
sn = "VMware NVME_0000\000\000\000",
mn = "VMware Virtual NVMe Disk", '\000' <repeats 15 times>,
fr = "1.3\000\000\000\000",
rab = 0 '\000',
ieee = "\000PV",
cmic = {multi_port = 0 '\000', multi_ctrlr = 0 '\000', sr_iov = 0 '\000', ana_reporting = 0 '\000', reserved = 0 '\000'},
mdts = 8 '\b',
......
vs = '\000' <repeats 1023 times>
},
......
}