本文整理下之前的学习笔记,基于DPDK 17.11版本源码分析。
主要看一下DPDK探测网卡设备,并进行初始化的流程,用到了类似kernel中的总线-设备-驱动模型。
本文的重点之一是DPDK如何在用户态操作网卡寄存器,这里先给个答案: 想要操作网卡寄存器,需要用到网卡的基地址BAR,intel网卡一般使用BAR0就行,通过mmap此文件/sys/bus/pci/devices/'pci address'/resource'map_idx'就可以在用户态得到BAR0对应的虚拟地址,此虚拟地址加上寄存器的偏移即可读取/设置网卡寄存器。此文件是kernel为pci设备创建的,提供了底层的mmap实现。
首先看两个宏定义,利用gcc的属性 __attribute__((constructor)) 定义的函数,可以在main函数执行前运行,还可以指定优先级 __attribute__((constructor(prio))),prio从101开始,值越小越早执行。不指定优先级的版本优先级最低,会在所有指定了优先级的构造函数之后执行。
/**
* Run function before main() with low priority.
*
* The constructor will be run after prioritized constructors.
*
* Expands to the declaration of a static void function taking no
* arguments and carrying the GCC `constructor` and `used` attributes.
* Follow the invocation with `;` for a forward declaration, or with a
* brace-enclosed body to define the function.
*
* @param func
* Constructor function.
*/
#define RTE_INIT(func) \
static void __attribute__((constructor, used)) func(void)
/**
* Run function before main() with high priority.
*
* Same shape as RTE_INIT, but the `constructor` attribute is given an
* explicit priority so the function is ordered relative to other
* prioritized constructors.
*
* @param func
* Constructor function.
* @param prio
* Priority number must be above 100.
* Lowest number is the first to run.
*/
#define RTE_INIT_PRIO(func, prio) \
static void __attribute__((constructor(prio), used)) func(void)
RTE_PMD_REGISTER_PCI
宏RTE_PMD_REGISTER_PCI使用上面的宏RTE_INIT注册驱动。
/**
 * Helper for PCI device registration from driver (eth, crypto) instance.
 *
 * Declares and defines a constructor named pciinitfn_<nm> through RTE_INIT.
 * The constructor stores the stringified name in the driver and hands the
 * driver to rte_pci_register().
 *
 * @param nm
 *   Driver name token; stringified with RTE_STR and pasted into the
 *   constructor's identifier.
 * @param pci_drv
 *   The PCI driver object itself (an lvalue, not a pointer); its address
 *   is taken inside the macro.
 *
 * NOTE(review): this excerpt used to end with a line-continuation character
 * after the closing brace, which spliced the following source line into the
 * macro body. The continuation is removed here. Upstream may carry one more
 * macro line at that position - confirm against rte_bus_pci.h.
 */
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm); \
static void pciinitfn_ ##nm(void) \
{ \
	(pci_drv).driver.name = RTE_STR(nm); \
	rte_pci_register(&(pci_drv)); \
}
/*
 * PCI driver descriptor for the ixgbe PMD, followed by its registration.
 * The registration must name the same object that is defined here.
 */
static struct rte_pci_driver rte_ixgbe_pmd = {
	/* table of the NIC IDs this driver supports */
	.id_table = pci_id_ixgbe_map,
	.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC |
		RTE_PCI_DRV_IOVA_AS_VA,
	/* called to initialize a device once it has matched this driver */
	.probe = eth_ixgbe_pci_probe,
	.remove = eth_ixgbe_pci_remove,
};
/* register the ixgbe NIC driver */
RTE_PMD_REGISTER_PCI(net_ixgbe, rte_ixgbe_pmd);
RTE_REGISTER_BUS
RTE_REGISTER_BUS 在main函数执行前注册bus,插入全局链表 rte_bus_list。
// Priority 101 is the lowest number used here, so rte_log_init runs first, ahead of main().
/* Logging should be first initializer (before drivers and bus) */
RTE_INIT_PRIO(rte_log_init, 101);
/* Bus constructors use priority 110. */
/**
 * Helper for Bus registration.
 * The constructor has higher priority than PMD constructors.
 *
 * Declares and defines a constructor named businitfn_<nm> through
 * RTE_INIT_PRIO. The constructor stores the stringified name in the bus
 * and passes the bus to rte_bus_register().
 *
 * @param nm
 *   Bus name token; stringified with RTE_STR and pasted into the
 *   constructor's identifier.
 * @param bus
 *   The bus object itself (an lvalue, not a pointer). It is parenthesized
 *   at every use so that any lvalue expression can be passed safely.
 */
#define RTE_REGISTER_BUS(nm, bus) \
RTE_INIT_PRIO(businitfn_ ##nm, 110); \
static void businitfn_ ##nm(void) \
{ \
	(bus).name = RTE_STR(nm); \
	rte_bus_register(&(bus)); \
}
// Register the vdev bus.
RTE_REGISTER_BUS(vdev, rte_vdev_bus);
/*
 * The PCI bus object: the generic bus callbacks plus the two lists that
 * hold the PCI devices and the PCI drivers.
 */
struct rte_pci_bus rte_pci_bus = {
.bus = {
/* callbacks stored in the embedded generic bus */
.scan = rte_pci_scan,
.probe = rte_pci_probe,
.find_device = pci_find_device,
.plug = pci_plug,
.unplug = pci_unplug,
.parse = pci_parse,
.get_iommu_class = rte_pci_get_iommu_class,
},
/* both lists start out empty */
.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};
// Register the pci bus (the embedded generic bus member is what gets registered).
RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
rte_eal_init
rte_eal_init为DPDK程序的环境抽象层(EAL)初始化函数,主要进行CPU、内存和网卡等的初始化。本文重点介绍网卡相关的部分。
/*
 * Abridged call outline of rte_eal_init (not compilable code): only the
 * two bus-related calls are shown.
 */
int
rte_eal_init(int argc, char **argv)
// Walk the global list rte_bus_list and scan every bus; for the pci bus this calls rte_pci_scan
rte_bus_scan()
// Walk the global list rte_bus_list and probe every bus; for the pci bus this calls rte_pci_probe
rte_bus_probe()
rte_pci_scan
/*
* Scan the content of the PCI bus, and the devices in the devices
* list
*/
/*
* NOTE: this is an annotated call trace, not compilable code. The
* statements that follow a call appear to be that callee's body pasted
* inline (e.g. after pci_scan_one and pci_parse_sysfs_resource), and no
* error handling is shown.
*/
int
rte_pci_scan(void)
struct dirent *e;
DIR *dir;
char dirname[PATH_MAX];
struct rte_pci_addr addr;
// Open the directory /sys/bus/pci/devices and scan the devices in it; each device is described by a struct rte_pci_device
dir = opendir(rte_pci_get_sysfs_path());
while ((e = readdir(dir)) != NULL) {
// Each subdirectory is named after a PCI address, so the address is obtained by parsing the subdirectory name
parse_pci_addr_format(e->d_name, sizeof(e->d_name), &addr)
// Build the device directory path, e.g. /sys/bus/pci/devices/0000:81:00.0
snprintf(dirname, sizeof(dirname), "%s/%s", rte_pci_get_sysfs_path(), e->d_name);
pci_scan_one(dirname, &addr)
struct rte_pci_device *dev;
dev = malloc(sizeof(*dev));
dev->addr = *addr;
// Read the file under the NIC's device directory to get the vendor id
/* get vendor id */
snprintf(filename, sizeof(filename), "%s/vendor", dirname);
eal_parse_sysfs_value(filename, &tmp);
dev->id.vendor_id = (uint16_t)tmp;
// Likewise read the device id, subsystem_vendor id, etc. from the directory and store them in dev
/* get device id */
/* get subsystem_vendor id */
/* get subsystem_device id */
/* get class_id */
/* get max_vfs */
/* get numa node, default to 0 if not present */
// Parse the file /sys/bus/pci/devices/'pci address'/resource in the device directory
// to get each resource's physical address and length. The virtual address is mapped later, during rte_pci_probe; once mapped, the device registers can be accessed from user space
/* parse resources */
snprintf(filename, sizeof(filename), "%s/resource", dirname);
pci_parse_sysfs_resource(filename, dev) < 0)
for (i = 0; i<PCI_MAX_RESOURCE; i++) {
if (flags & IORESOURCE_MEM) { // only memory-type resources are recorded
dev->mem_resource[i].phys_addr = phys_addr;
dev->mem_resource[i].len = end_addr - phys_addr + 1;
/* not mapped for now */
dev->mem_resource[i].addr = NULL;
}
}
// Work out the name of the kernel driver the NIC is currently bound to
/* parse driver */
snprintf(filename, sizeof(filename), "%s/driver", dirname);
pci_get_kernel_driver_by_path(filename, driver);
if (!strcmp(driver, "vfio-pci"))
dev->kdrv = RTE_KDRV_VFIO;
else if (!strcmp(driver, "igb_uio"))
dev->kdrv = RTE_KDRV_IGB_UIO;
else if (!strcmp(driver, "uio_pci_generic"))
dev->kdrv = RTE_KDRV_UIO_GENERIC;
else
dev->kdrv = RTE_KDRV_UNKNOWN;
// Insert the scanned device into the global list rte_pci_bus.device_list in PCI address order,
// with smaller addresses nearer the head of the list
/* device is valid, add in list (sorted) */
if (TAILQ_EMPTY(&rte_pci_bus.device_list)) {
rte_pci_add_device(dev);
// Append the scanned device to the list rte_pci_bus.device_list
TAILQ_INSERT_TAIL(&rte_pci_bus.device_list, pci_dev, next);
} else {
struct rte_pci_device *dev2;
int ret;
TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
// ret > 0: the PCI address of the device being inserted is greater than dev2's, keep walking
ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
if (ret > 0)
continue;
// ret < 0: the PCI address of the device being inserted is smaller than dev2's, so insert dev before dev2
if (ret < 0) {
rte_pci_insert_device(dev2, dev);
} else { /* already registered */
// ret == 0: same device; just refresh the stored parameters and drop the new copy
dev2->kdrv = dev->kdrv;
dev2->max_vfs = dev->max_vfs;
pci_name_set(dev2);
memmove(dev2->mem_resource, dev->mem_resource, sizeof(dev->mem_resource));
free(dev);
}
return 0;
}
// Reached the end of the list without finding an entry whose address is greater than or equal to dev's, so append dev at the tail
rte_pci_add_device(dev);
}
}
rte_pci_probe
/*
* Scan the content of the PCI bus, and call the probe() function for
* all registered drivers that have a matching entry in its id_table
* for discovered devices.
*/
/*
* NOTE: this is an annotated call trace, not compilable code. The
* statements that follow a call appear to be that callee's body pasted
* inline, so variable names change between levels (dr/pci_drv,
* dev/pci_dev) and no error handling is shown.
*/
int
rte_pci_probe(void)
struct rte_pci_device *dev = NULL;
// Walk the device list rte_pci_bus.device_list, which was filled in during rte_pci_scan
// i.e. TAILQ_FOREACH(p, &(rte_pci_bus.device_list), next)
FOREACH_DEVICE_ON_PCIBUS(dev)
struct rte_pci_driver *dr = NULL;
// Walk the driver list rte_pci_bus.driver_list; drivers are put on this list by RTE_PMD_REGISTER_PCI
// i.e. TAILQ_FOREACH(p, &(rte_pci_bus.driver_list), next)
FOREACH_DRIVER_ON_PCIBUS(dr)
rte_pci_probe_one_driver(dr, dev);
// vendor_id, device_id, subsystem_vendor_id, subsystem_device_id, class_id:
// each of these must match between the device and an id_table entry (or the entry holds the wildcard value)
rte_pci_match(dr, dev);
const struct rte_pci_id *id_table;
// Walk the driver's id_table, which lists the NIC types the driver supports; an entry with vendor_id 0 ends the table
for (id_table = pci_drv->id_table; id_table->vendor_id != 0; id_table++) {
/* check if device's identifiers match the driver's ones */
if (id_table->vendor_id != pci_dev->id.vendor_id &&
id_table->vendor_id != PCI_ANY_ID)
continue;
if (id_table->device_id != pci_dev->id.device_id &&
id_table->device_id != PCI_ANY_ID)
continue;
if (id_table->subsystem_vendor_id !=
pci_dev->id.subsystem_vendor_id &&
id_table->subsystem_vendor_id != PCI_ANY_ID)
continue;
if (id_table->subsystem_device_id !=
pci_dev->id.subsystem_device_id &&
id_table->subsystem_device_id != PCI_ANY_ID)
continue;
if (id_table->class_id != pci_dev->id.class_id &&
id_table->class_id != RTE_CLASS_ANY_ID)
continue;
return 1;
}
return 0;
// The device and the driver matched.
// If the driver sets RTE_PCI_DRV_NEED_MAPPING, the device must be mapped,
// i.e. its physical addresses are mapped to user-space virtual addresses
if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING)
rte_pci_map_device(dev);
// If the NIC is bound to the vfio driver, pci_vfio_map_resource is called
pci_vfio_map_resource(dev);
// If the NIC is bound to igb_uio or uio_pci_generic, pci_uio_map_resource is called
/* map resources for devices that use igb_uio */
pci_uio_map_resource
struct mapped_pci_resource *uio_res = NULL;
struct mapped_pci_res_list *uio_res_list =
RTE_TAILQ_CAST(rte_uio_tailq.head, mapped_pci_res_list);
pci_uio_alloc_resource(dev, &uio_res);
uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
/* depending on kernel version, uio can be located in uio/uioX or uio:uioX */
// Look in /sys/bus/pci/devices/'pci address'/ for the uio directory
// and read the uio number from it (the directory is created once the NIC is bound to the igb_uio driver)
snprintf(dirname, sizeof(dirname),
"%s/" PCI_PRI_FMT "/uio", rte_pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid, loc->function);
// Open the /dev/uioX device and keep the fd in dev->intr_handle.fd
snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
/* save fd if in primary process */
dev->intr_handle.fd = open(devname, O_RDWR);
// Open /sys/class/uio/uio%u/device/config and keep the fd in dev->intr_handle.uio_cfg_fd
snprintf(cfgname, sizeof(cfgname), "/sys/class/uio/uio%u/device/config", uio_num);
dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
/* allocate the mapping details for secondary processes*/
*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));
/* Map all BARs */
for (i = 0; i != PCI_MAX_RESOURCE; i++) {
/* skip empty BAR */
phaddr = dev->mem_resource[i].phys_addr;
if (phaddr == 0)
continue;
pci_uio_map_resource_by_index(dev, i, uio_res, map_idx);
int fd;
char devname[PATH_MAX];
void *mapaddr;
struct rte_pci_addr *loc;
struct pci_map *maps;
loc = &dev->addr;
maps = uio_res->maps;
// devname becomes /sys/bus/pci/devices/'pci address'/resource<res_idx>
/* update devname for mmap */
snprintf(devname, sizeof(devname),
"%s/" PCI_PRI_FMT "/resource%d",
rte_pci_get_sysfs_path(),
loc->domain, loc->bus, loc->devid,
loc->function, res_idx);
// Allocate memory to hold the path
maps[map_idx].path = rte_malloc(NULL, strlen(devname) + 1, 0);
// Open the file /sys/bus/pci/devices/'pci address'/resource<res_idx> so that it can be mmap'ed
fd = open(devname, O_RDWR);
/* try mapping somewhere close to the end of hugepages */
// pci_map_addr is a global variable; the mapping is attempted close to the hugepage area
if (pci_map_addr == NULL)
pci_map_addr = pci_find_max_end_va();
// Do the mmap; the result is the virtual address of the mapping
mapaddr = pci_map_resource(pci_map_addr, fd, 0, (size_t)dev->mem_resource[res_idx].len, 0);
// After a successful mapping, advance pci_map_addr to mapaddr + len for the next iteration's mapping
pci_map_addr = RTE_PTR_ADD(mapaddr, (size_t)dev->mem_resource[res_idx].len);
// Record the mapped virtual address; from now on the device can be accessed through it
maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
maps[map_idx].size = dev->mem_resource[res_idx].len;
maps[map_idx].addr = mapaddr;
maps[map_idx].offset = 0;
strcpy(maps[map_idx].path, devname);
// Finally store the virtual address in dev
dev->mem_resource[res_idx].addr = mapaddr;
// Put uio_res on the global list uio_res_list,
// mainly so that secondary processes can walk the list and map the same virtual addresses
TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);
/* reference driver structure */
dev->driver = dr;
dev->device.driver = &dr->driver;
// Call the driver's probe function, e.g. eth_ixgbe_pci_probe
dr->probe(dr, dev);
// rte_eth_dev_pci_generic_probe is a generic wrapper that allocates the rte_eth_dev
// and, on success, calls the driver-specific dev_init callback, e.g. eth_ixgbe_dev_init
// sizeof(struct ixgbe_adapter) is the size of the driver's private data
rte_eth_dev_pci_generic_probe(pci_dev, sizeof(struct ixgbe_adapter), eth_ixgbe_dev_init);
struct rte_eth_dev *eth_dev;
eth_dev = rte_eth_dev_pci_allocate(pci_dev, private_data_size);
name = dev->device.name;
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
eth_dev = rte_eth_dev_allocate(name);
// Find the first free port id
port_id = rte_eth_dev_find_free_port();
// rte_eth_dev_data is shared between processes
if (rte_eth_dev_data == NULL)
rte_eth_dev_data_alloc();
memset(&rte_eth_dev_data[port_id], 0, sizeof(struct rte_eth_dev_data));
eth_dev = eth_dev_get(port_id);
snprintf(eth_dev->data->name, sizeof(eth_dev->data->name), "%s", name);
eth_dev->data->port_id = port_id;
eth_dev->data->mtu = ETHER_MTU;
if (private_data_size) {
// Allocate the memory for the driver's private data
eth_dev->data->dev_private = rte_zmalloc_socket(name,
private_data_size, RTE_CACHE_LINE_SIZE,
dev->device.numa_node);
}
} else {
eth_dev = rte_eth_dev_attach_secondary(name);
}
// Calls eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev)
dev_init(eth_dev);
struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
struct rte_intr_handle *intr_handle = &pci_dev->intr_handle;
struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
eth_dev->dev_ops = &ixgbe_eth_dev_ops;
eth_dev->rx_pkt_burst = &ixgbe_recv_pkts;
eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
eth_dev->tx_pkt_prepare = &ixgbe_prep_pkts;
// Store BAR0's virtual address in hw->hw_addr; registers are read and written through hw->hw_addr from here on
hw->hw_addr = (void *)pci_dev->mem_resource[0].addr;
// Register the interrupt handler
rte_intr_callback_register(intr_handle, ixgbe_dev_interrupt_handler, eth_dev);
// Enable interrupts at the NIC's PCI level
/* enable uio/vfio intr/eventfd mapping */
rte_intr_enable(intr_handle);
// Program the NIC registers to enable the NIC's interrupts
/* enable support intr */
ixgbe_enable_intr(eth_dev);