Back when I worked as a DBA, DR mode was the one we used most often. The LB only handles incoming requests and distributes them to the backends; response traffic goes from the Real Server directly back to the Client, which is why the mode is called Direct Routing. The principle is well known: the LB rewrites the MAC addresses in the L2 header, so its limitation is equally obvious: everything must sit in the same L2 domain, and it cannot cross subnets. Since the Real Server replies to the Client directly, this mode usually does not face public-internet users. So how is it actually implemented?
Forwarding entry point
As mentioned in the previous post, every dpvs slave core spins in a polling loop executing three LOOP JOBs. The most important one is lcore_job_recv_fwd:
static void lcore_job_recv_fwd(void *arg)
{
int i, j;
portid_t pid;
lcoreid_t cid;
struct netif_queue_conf *qconf;
cid = rte_lcore_id();
assert(LCORE_ID_ANY != cid);
// one core may serve multiple queues on multiple NICs, hence the two for loops
// i is the NIC (port) index, j is the j-th queue of the i-th NIC
for (i = 0; i < lcore_conf[lcore2index[cid]].nports; i++) {
pid = lcore_conf[lcore2index[cid]].pqs[i].id;
assert(pid <= bond_pid_end);
for (j = 0; j < lcore_conf[lcore2index[cid]].pqs[i].nrxq; j++) {
qconf = &lcore_conf[lcore2index[cid]].pqs[i].rxqs[j];
// first drain the arp ring and process anything found there
lcore_process_arp_ring(qconf, cid);
// then pull packets from the NIC
qconf->len = netif_rx_burst(pid, qconf);
lcore_stats_burst(&lcore_stats[cid], qconf->len);
lcore_process_packets(qconf, qconf->mbufs, cid, qconf->len, 0);
kni_send2kern_loop(pid, qconf);
}
}
}
Why two for loops? Because one lcore may be responsible for several NICs, and each NIC in turn has several queues. Ideally, of course, one core handles exactly one queue of one NIC.
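To make that indexing concrete, below is a much-simplified sketch of how the per-lcore polling configuration could be laid out. The queue struct name comes from the code above; the port- and lcore-level names are inferred from the loop and trimmed down, so treat them as assumptions rather than dpvs's exact definitions (the real structs also carry tx queues, KNI queues, isolation settings and more).
/* Hedged sketch; names marked (assumed) are not taken from the code above. */
struct netif_queue_conf {                     /* one rx queue polled by a worker lcore */
    queueid_t        id;                      /* queue index on the port               */
    uint16_t         len;                     /* packets returned by the last burst    */
    struct rte_mbuf *mbufs[NETIF_MAX_PKT_BURST];
    /* ... isol_rxq, kni tx queue, etc. omitted ... */
};
struct netif_port_conf {                      /* (assumed name) one port on this lcore */
    portid_t                id;
    int                     nrxq;             /* rx queues of this port on this lcore  */
    struct netif_queue_conf rxqs[NETIF_MAX_QUEUES];   /* (assumed bound)               */
};
struct netif_lcore_conf {                     /* (assumed name) element of lcore_conf[] */
    lcoreid_t              id;
    int                    nports;            /* ports handled by this lcore           */
    struct netif_port_conf pqs[NETIF_MAX_PORTS];
};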
- lcore_process_arp_ring
First checks whether the global arp_ring ring buffer has any packets queued and processes them if so.
- netif_rx_burst
The core function for pulling packets off the NIC:
static inline uint16_t netif_rx_burst(portid_t pid, struct netif_queue_conf *qconf)
{
struct rte_mbuf *mbuf;
int nrx = 0;
if (qconf->isol_rxq) {
/* note API rte_ring_dequeue_bulk of dpdk-16.07 is not suitable, replace with
* its bulk version after upgrading to new dpdk version */
while (0 == rte_ring_dequeue(qconf->isol_rxq->rb, (void**)&mbuf)) {
qconf->mbufs[nrx++] = mbuf;
if (unlikely(nrx >= NETIF_MAX_PKT_BURST))
break;
}
/* Shoul we integrate statistics of isolated recieve lcore into packet
* processing lcore ? No! we just leave the work to tools */
} else {
nrx = rte_eth_rx_burst(pid, qconf->id, qconf->mbufs, NETIF_MAX_PKT_BURST);
}
qconf->len = nrx;
return nrx;
}
It first checks whether this queue has a dedicated (isolated) receive lcore. If so, packets are dequeued from the ring buffer into qconf->mbufs for the processing below; otherwise the DPDK library function rte_eth_rx_burst pulls packets from the NIC directly into qconf->mbufs.
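For context, the producer side of that isolated-rx design is roughly the mirror image: a dedicated rx lcore polls the hardware queue and pushes mbufs into qconf->isol_rxq->rb, which netif_rx_burst drains above. The sketch below is an illustration only; the function name and the drop handling are my assumptions, not dpvs's actual isolated-rx job, and the ring API variant depends on the DPDK version in use.
/* Hedged sketch of an isolated-rx producer job (name and details assumed). */
static void isol_rx_job_sketch(struct netif_queue_conf *qconf, portid_t pid)
{
    struct rte_mbuf *pkts[NETIF_MAX_PKT_BURST];
    uint16_t nrx, i;
    nrx = rte_eth_rx_burst(pid, qconf->id, pkts, NETIF_MAX_PKT_BURST);
    for (i = 0; i < nrx; i++) {
        /* hand each mbuf to the worker lcore through the ring */
        if (unlikely(rte_ring_enqueue(qconf->isol_rxq->rb, pkts[i]) == -ENOBUFS))
            rte_pktmbuf_free(pkts[i]);        /* ring full: drop the packet */
    }
}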
- lcore_stats_burst
Statistics bookkeeping; ignored for now.
- lcore_process_packets
The core packet-processing entry point, analyzed below.
- kni_send2kern_loop
Traffic that dpvs does not care about, such as ssh management traffic, is passed through to the kernel via the KNI interface.
lcore_process_packets: the L2 packet-processing entry point
This is where the core DPDK data structure mbuf comes in. It is the analogue of the kernel's skb, and most of the L2/L3 header manipulation below operates on it.
static void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
lcoreid_t cid, uint16_t count, bool pkts_from_ring)
{
int i, t;
struct ether_hdr *eth_hdr;
struct rte_mbuf *mbuf_copied = NULL;
/* prefetch packets: prefetch the first batch of mbufs */
for (t = 0; t < count && t < NETIF_PKT_PREFETCH_OFFSET; t++)
rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
rte_pktmbuf_mtod returns a pointer to the packet data; rte_prefetch0 pulls the first NETIF_PKT_PREFETCH_OFFSET packets into the CPU cache ahead of time.
/* L2 filter */
for (i = 0; i < count; i++) {
struct rte_mbuf *mbuf = mbufs[i];
struct netif_port *dev = netif_port_get(mbuf->port);
Look up the NIC each packet arrived on; in DPDK a NIC is called a port.
if (unlikely(!dev)) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
if (dev->type == PORT_TYPE_BOND_SLAVE) {
dev = dev->bond->slave.master;
mbuf->port = dev->id;
}
Handle the case where the NIC is a bond slave: switch to the bond master.
if (t < count) {
rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
t++;
}
Keep prefetching ahead: as each packet is processed, prefetch the next one that has not been prefetched yet.
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
/* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
mbuf->packet_type = eth_type_parse(eth_hdr, dev);
Get the ethernet header and classify the L2 packet type: unicast to this host (ETH_PKT_HOST), broadcast or multicast. Normally it is ETH_PKT_HOST.
/*
* In NETIF_PORT_FLAG_FORWARD2KNI mode.
* All packets received are deep copied and sent to KNI
* for the purpose of capturing forwarding packets.Since the
* rte_mbuf will be modified in the following procedure,
* we should use mbuf_copy instead of rte_pktmbuf_clone.
*/
if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) {
if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
pktmbuf_pool[dev->socket]))))
kni_ingress(mbuf_copied, dev, qconf);
else
RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
__func__);
}
In FORWARD2KNI mode every packet is also passed through to the kernel, so a deep copy is made first. kni_ingress deserves its own post; skip it here.
/*
* handle VLAN
* if HW offload vlan strip, it's still need vlan module
* to act as VLAN filter.
*/
if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
mbuf->ol_flags & PKT_RX_VLAN_STRIPPED) {
if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
Compatibility handling for VLAN; skipped for now.
dev = netif_port_get(mbuf->port);
if (unlikely(!dev)) {
rte_pktmbuf_free(mbuf);
lcore_stats[cid].dropped++;
continue;
}
eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
}
/* handler should free mbuf */
netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
(dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true:false,
cid, pkts_from_ring);
lcore_stats[cid].ibytes += mbuf->pkt_len;
lcore_stats[cid].ipackets++;
}
}
Last comes the delivery function netif_deliver_mbuf.
L2 handling of the mbuf
Let's walk through how netif_deliver_mbuf is implemented, piece by piece.
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
uint16_t eth_type,
struct netif_port *dev,
struct netif_queue_conf *qconf,
bool forward2kni,
lcoreid_t cid,
bool pkts_from_ring)
{
struct pkt_type *pt;
int err;
uint16_t data_off;
assert(mbuf->port <= NETIF_MAX_PORTS);
assert(dev != NULL);
pt = pkt_type_get(eth_type, dev);
if (NULL == pt) { // a NULL pt means no handler is registered for this protocol
// if it has not already been handed to kni, pass it through to kni
if (!forward2kni)
kni_ingress(mbuf, dev, qconf);
else
rte_pktmbuf_free(mbuf);
return EDPVS_OK;
}
An L2 frame here is either ARP or IP; pkt_type_get looks up the handler struct for it, a factory method of sorts. These pt entries are registered via netif_register_pkt, and reading the source shows only two are currently registered, ip4_pkt_type and arp_pkt_type; IPv6 is not supported yet. If no pt matches, the traffic is either passed through to the kernel via KNI or dropped.
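As a rough illustration of that registration, the two factories presumably boil down to something like the sketch below. Only .type and .func matter for the pt->func() dispatch we will see shortly; the other fields and the ARP handler name (neigh_resolve_input) are assumptions, so check netif.h and neigh.c for the real definitions.
/* Hedged sketch of the two registered pkt_type factories. */
static struct pkt_type ip4_pkt_type_sketch = {
    .type = rte_cpu_to_be_16(ETHER_TYPE_IPv4),
    .func = ipv4_rcv,                 /* the IPv4 entry analyzed below */
    .port = NULL,                     /* match any port */
};
static struct pkt_type arp_pkt_type_sketch = {
    .type = rte_cpu_to_be_16(ETHER_TYPE_ARP),
    .func = neigh_resolve_input,      /* assumption: ARP goes to the neighbour module */
    .port = NULL,
};
/* during initialization: */
netif_register_pkt(&ip4_pkt_type_sketch);
netif_register_pkt(&arp_pkt_type_sketch);
Back to netif_deliver_mbuf: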
/*clone arp pkt to every queue*/ // why? must every ARP really be broadcast to every queue?
if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring) {
struct rte_mempool *mbuf_pool;
struct rte_mbuf *mbuf_clone;
uint8_t i;
struct arp_hdr *arp;
unsigned socket_id;
socket_id = rte_socket_id();
mbuf_pool = pktmbuf_pool[socket_id];
rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
rte_pktmbuf_prepend(mbuf,(uint16_t)sizeof(struct ether_hdr));
if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY) {
for (i = 0; i < DPVS_MAX_LCORE; i++) {
if ((i == cid) || (!is_lcore_id_fwd(i))
|| (i == rte_get_master_lcore()))
continue;
/*rte_pktmbuf_clone will not clone pkt.data, just copy pointer!*/
mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
if (mbuf_clone) {
int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
if (unlikely(-EDQUOT == ret)) {
RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\n",
__func__, i);
}
else if (ret < 0) {
RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\n",
__func__, i);
rte_pktmbuf_free(mbuf_clone);
}
}
}
}
}
ARP reply packets are cloned and pushed to every other forwarding lcore. Why handle it this way? My guess: a DPDK program keeps per-core local state and avoids locks, so the neighbour subsystem must hold a full copy on every core. To be verified later.
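The consumer of those rings is lcore_process_arp_ring, called at the top of the polling loop we saw earlier. A hedged sketch of what it must do (the dequeue call and burst size are assumptions and depend on the DPDK version; the real function lives in netif.c):
/* Hedged sketch: drain this lcore's arp_ring and feed the clones into the
 * normal L2 path, flagged as pkts_from_ring so they are not cloned again. */
static void lcore_process_arp_ring_sketch(struct netif_queue_conf *qconf, lcoreid_t cid)
{
    struct rte_mbuf *mbufs[NETIF_MAX_PKT_BURST];
    uint16_t nb;
    nb = rte_ring_dequeue_burst(arp_ring[cid], (void **)mbufs,
                                NETIF_MAX_PKT_BURST, NULL);
    if (nb > 0)
        lcore_process_packets(qconf, mbufs, cid, nb, 1 /* pkts_from_ring */);
}
Back to netif_deliver_mbuf: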
mbuf->l2_len = sizeof(struct ether_hdr);
/* Remove ether_hdr at the beginning of an mbuf */
data_off = mbuf->data_off;
if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr))))
return EDPVS_INVPKT;
Advance the mbuf data pointer past the ethernet header so it points at the L3 (IP) header.
err = pt->func(mbuf, dev);
This is the core L3 dispatch: reading pkt_type_get shows that for IP traffic this callback is ipv4_rcv.
if (err == EDPVS_KNICONTINUE) {
if (pkts_from_ring || forward2kni) {
rte_pktmbuf_free(mbuf);
return EDPVS_OK;
}
if (likely(NULL != rte_pktmbuf_prepend(mbuf,
(mbuf->data_off - data_off)))) {
kni_ingress(mbuf, dev, qconf);
} else {
rte_pktmbuf_free(mbuf);
}
}
return EDPVS_OK;
}
Sometimes the packet is simply not dpvs's business; in that case it is passed through to the kernel via KNI.
L3 handling: ipv4_rcv
static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port)
{
struct ipv4_hdr *iph;
uint16_t hlen, len;
eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
assert(mbuf);
if (unlikely(etype == ETH_PKT_OTHERHOST || !port)) {
rte_pktmbuf_free(mbuf);
return EDPVS_DROP;
}
Drop the packet if it is not addressed to this host.
IP4_UPD_PO_STATS(in, mbuf->pkt_len);
if (mbuf_may_pull(mbuf, sizeof(struct ipv4_hdr)) != 0)
goto inhdr_error;
Make sure the packet is valid: it must at least contain a full IP header.
iph = ip4_hdr(mbuf); // l3 header
hlen = ip4_hdrlen(mbuf);
if (((iph->version_ihl) >> 4) != 4 || hlen < sizeof(struct ipv4_hdr))
goto inhdr_error;
if (mbuf_may_pull(mbuf, hlen) != 0)
goto inhdr_error;
With the header in hand, the IP packet itself must also be valid (version 4, sane header length).
if (unlikely(!(port->flag & NETIF_PORT_FLAG_RX_IP_CSUM_OFFLOAD))) {
if (unlikely(rte_raw_cksum(iph, hlen) != 0xFFFF))
goto csum_error;
}
Checksum verification: if the NIC does not offload RX IP checksumming, the software computes it with rte_raw_cksum.
len = ntohs(iph->total_length);
if (mbuf->pkt_len < len) {
IP4_INC_STATS(intruncatedpkts);
goto drop;
} else if (len < hlen)
goto inhdr_error;
/* trim padding if needed */
if (mbuf->pkt_len > len) {
if (rte_pktmbuf_trim(mbuf, mbuf->pkt_len - len) != 0) {
IP4_INC_STATS(indiscards);
goto drop;
}
}
mbuf->userdata = NULL;
mbuf->l3_len = hlen;
#ifdef CONFIG_DPVS_IPV4_DEBUG
ip4_dump_hdr(iph, mbuf->port);
#endif
return INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);
csum_error:
IP4_INC_STATS(csumerrors);
inhdr_error:
IP4_INC_STATS(inhdrerrors);
drop:
rte_pktmbuf_free(mbuf);
return EDPVS_INVPKT;
}
Finally it calls INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin): the callbacks registered on the INET_HOOK_PRE_ROUTING hook run first, then the return value decides whether ipv4_rcv_fin is invoked. The behaviour differs between forwarding modes.
The L3 hook entry point: INET_HOOK
int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf,
struct netif_port *in, struct netif_port *out,
int (*okfn)(struct rte_mbuf *mbuf))
{
struct list_head *hook_list;
struct inet_hook_ops *ops;
struct inet_hook_state state;
int verdict = INET_ACCEPT;
state.hook = hook;
hook_list = &inet_hooks[hook];
Fetch the list of callbacks registered for this hook point (here INET_HOOK_PRE_ROUTING).
#ifdef CONFIG_DPVS_IPV4_INET_HOOK
rte_rwlock_read_lock(&inet_hook_lock);
#endif
ops = list_entry(hook_list, struct inet_hook_ops, list);
if (!list_empty(hook_list)) {
verdict = INET_ACCEPT;
list_for_each_entry_continue(ops, hook_list, list) {
repeat:
verdict = ops->hook(ops->priv, mbuf, &state);
if (verdict != INET_ACCEPT) {
if (verdict == INET_REPEAT)
goto repeat;
break;
}
}
}
Walk the callback list and run each hook; the return value decides whether the remaining hooks still run. There are quite a few details here, to be covered later.
#ifdef CONFIG_DPVS_IPV4_INET_HOOK
rte_rwlock_read_unlock(&inet_hook_lock);
#endif
if (verdict == INET_ACCEPT || verdict == INET_STOP) {
return okfn(mbuf);
} else if (verdict == INET_DROP) {
rte_pktmbuf_free(mbuf);
return EDPVS_DROP;
} else { /* INET_STOLEN */
return EDPVS_OK;
}
}
The verdict decides what happens next. For example, the first step of synproxy returns INET_STOLEN; on INET_ACCEPT (or INET_STOP) the okfn callback, here ipv4_rcv_fin, is executed. Different modes and stages produce different verdicts.
The L3 hook: INET_HOOK_PRE_ROUTING
Anyone familiar with Linux iptables knows the protocol stack hangs callbacks off various hooks. dpvs implements its own, much leaner version. Let's see which callbacks are registered on INET_HOOK_PRE_ROUTING.
static struct inet_hook_ops dp_vs_ops[] = {
{
.hook = dp_vs_in,
.hooknum = INET_HOOK_PRE_ROUTING,
.priority = 100,
},
{
.hook = dp_vs_pre_routing,
.hooknum = INET_HOOK_PRE_ROUTING,
.priority = 99,
},
};
dp_vs_init in ip_vs_core.c registers these hooks. Note the priority values: the registration function ipv4_register_hooks keeps hooks ordered so that a smaller priority runs earlier. So for this hook point dp_vs_pre_routing (99) runs before dp_vs_in (100).
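A minimal sketch of that priority rule, i.e. what the registration path has to do when linking a new inet_hook_ops into the per-hooknum list. This illustrates the rule only; it is not the actual body of ipv4_register_hooks.
/* Hedged sketch: keep the hook list sorted by ascending priority, so that
 * priority 99 (dp_vs_pre_routing) runs before priority 100 (dp_vs_in). */
static void inet_hook_insert_sorted(struct list_head *hook_list,
                                    struct inet_hook_ops *reg)
{
    struct inet_hook_ops *elem;
    list_for_each_entry(elem, hook_list, list) {
        if (reg->priority < elem->priority)
            break;
    }
    /* insert before `elem`, or at the tail if no lower-priority hook was found */
    list_add_tail(&reg->list, &elem->list);
}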
L3 hook callback: dp_vs_pre_routing
static int dp_vs_pre_routing(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state)
{
struct dp_vs_iphdr iph;
int af;
struct dp_vs_service *svc;
af = AF_INET;
// header fill failed, yet we still return ACCEPT?
if (EDPVS_OK != dp_vs_fill_iphdr(af, mbuf, &iph))
return INET_ACCEPT;
/* Drop all ip fragment except ospf */
if ((af == AF_INET) && ip4_is_frag(ip4_hdr(mbuf))
&& (iph.proto != IPPROTO_OSPF)) {
dp_vs_estats_inc(DEFENCE_IP_FRAG_DROP);
return INET_DROP;
}
/* Drop udp packet which send to tcp-vip */
if (g_defence_udp_drop && IPPROTO_UDP == iph.proto) {
if ((svc = dp_vs_lookup_vip(af, IPPROTO_UDP, &iph.daddr)) == NULL) {
if ((svc = dp_vs_lookup_vip(af, IPPROTO_TCP, &iph.daddr)) != NULL) {
dp_vs_estats_inc(DEFENCE_UDP_DROP);
return INET_DROP;
}
}
}
/* Synproxy: defence synflood */
if (IPPROTO_TCP == iph.proto) {
int v = INET_ACCEPT;
if (0 == dp_vs_synproxy_syn_rcv(af, mbuf, &iph, &v))
return v;
}
return INET_ACCEPT;
}
The most important feature here is syn proxy, to be covered in detail later; it is currently only used in the nat and full-nat modes.
L3 hook callback: dp_vs_in
dp_vs_in is where we finally enter the LVS module proper; everything before was prelude. The code is complex. In short: for an existing proxied connection, determine the direction (client -> LB or rs -> LB) and forward the traffic right away; for a new request, look up the virtual server, pick a backend real server with the configured LB algorithm, establish the connection and save the session. Under heavy concurrency this session table becomes very large. There are plenty of details here as well.
static int dp_vs_in(void *priv, struct rte_mbuf *mbuf,
const struct inet_hook_state *state)
{
struct dp_vs_iphdr iph;
struct dp_vs_proto *prot;
struct dp_vs_conn *conn;
int dir, af, verdict, err, related;
bool drop = false;
eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
assert(mbuf && state);
/* cannot use mbuf->l3_type which is conflict with m.packet_type
* or using wrapper to avoid af check here */
/* af = mbuf->l3_type == htons(ETHER_TYPE_IPv4) ? AF_INET : AF_INET6; */
af = AF_INET;
if (unlikely(etype != ETH_PKT_HOST))
return INET_ACCEPT;
If the packet is not addressed to this host, just return. Why isn't this a drop?
if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK)
return INET_ACCEPT;
if (unlikely(iph.proto == IPPROTO_ICMP)) {
/* handle related ICMP error to existing conn */
verdict = dp_vs_in_icmp(mbuf, &related);
if (related || verdict != INET_ACCEPT)
return verdict;
/* let unrelated and valid ICMP goes down,
* may implement ICMP fwd in the futher. */
}
ICMP handling; skipped for now, we only follow the TCP/IPv4 main path.
prot = dp_vs_proto_lookup(iph.proto);
if (unlikely(!prot))
return INET_ACCEPT;
Look up the L4 protocol handler; tcp, udp and icmp are currently implemented.
/*
* Defrag ipvs-forwarding TCP/UDP is not supported for some reasons,
*
* - RSS/flow-director do not support TCP/UDP fragments, means it's
* not able to direct frags to same lcore as original TCP/UDP packets.
* - per-lcore conn table will miss if frags reachs wrong lcore.
*
* If we redirect frags to "correct" lcore, it may cause performance
* issue. Also it need to understand RSS algorithm. Moreover, for the
* case frags in same flow are not occur in same lcore, a global lock is
* needed, which is not a good idea.
*/
if (ip4_is_frag(ip4_hdr(mbuf))) {
RTE_LOG(DEBUG, IPVS, "%s: frag not support.\n", __func__);
return INET_DROP;
}
Key point: this ties into fdir, one of dpvs's core optimizations, to be covered separately.
/* packet belongs to existing connection ? */
conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop);
Depending on the L4 protocol, the conn_lookup callback searches for an existing session; the packet may get dropped here. dir records the traffic direction: from client to LB, or from real server to LB.
if (unlikely(drop)) {
RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__);
return INET_DROP;
}
// if no session was found, conn_sched schedules a real server and creates one
if (unlikely(!conn)) {
/* try schedule RS and create new connection */
if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
/* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
return verdict;
}
/* only SNAT triggers connection by inside-outside traffic. */
if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
dir = DPVS_CONN_DIR_OUTBOUND;
else
dir = DPVS_CONN_DIR_INBOUND;
}
A brand-new connection naturally has no session yet; conn_sched picks a backend real server for the request and creates the connection.
if (conn->flags & DPVS_CONN_F_SYNPROXY) {
if (dir == DPVS_CONN_DIR_INBOUND) {
/* Filter out-in ack packet when cp is at SYN_SENT state.
* Drop it if not a valid packet, store it otherwise */
if (0 == dp_vs_synproxy_filter_ack(mbuf, conn, prot,
&iph, &verdict)) {
dp_vs_stats_in(conn, mbuf);
dp_vs_conn_put(conn);
return verdict;
}
/* "Reuse" synproxy sessions.
* "Reuse" means update syn_proxy_seq struct
* and clean ack_mbuf etc. */
if (0 != dp_vs_synproxy_ctrl_conn_reuse) {
if (0 == dp_vs_synproxy_reuse_conn(af, mbuf, conn, prot,
&iph, &verdict)) {
dp_vs_stats_in(conn, mbuf);
dp_vs_conn_put(conn);
return verdict;
}
}
} else {
/* Syn-proxy 3 logic: receive syn-ack from rs */
if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot,
ip4_hdrlen(mbuf), &verdict) == 0) {
dp_vs_stats_out(conn, mbuf);
dp_vs_conn_put(conn);
return verdict;
}
}
}
Special handling for syn proxy.
if (prot->state_trans) {
err = prot->state_trans(prot, conn, mbuf, dir);
if (err != EDPVS_OK)
RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
}
conn->old_state = conn->state;
TCP state transition; self-explanatory.
/* holding the conn, need a "put" later. */
if (dir == DPVS_CONN_DIR_INBOUND)
return xmit_inbound(mbuf, prot, conn);
else
return xmit_outbound(mbuf, prot, conn);
}
Depending on the direction dir, choose how to transmit the packet.
dp_vs_in: looking up an existing connection
static struct dp_vs_conn *
tcp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph,
struct rte_mbuf *mbuf, int *direct, bool reverse, bool *drop)
{
struct tcphdr *th, _tcph;
struct dp_vs_conn *conn;
assert(proto && iph && mbuf);
th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
if (unlikely(!th))
return NULL;
if (dp_vs_blklst_lookup(iph->proto, &iph->daddr, th->dest, &iph->saddr)) {
*drop = true;
return NULL;
}
conn = dp_vs_conn_get(iph->af, iph->proto,
&iph->saddr, &iph->daddr, th->source, th->dest, direct, reverse);
/*
* L2 confirm neighbour
* pkt in from client confirm neighbour to client
* pkt out from rs confirm neighbour to rs
*/
if (conn != NULL) {
if (th->ack) {
if ((*direct == DPVS_CONN_DIR_INBOUND) && conn->out_dev
&& (conn->out_nexthop.in.s_addr != htonl(INADDR_ANY))) {
neigh_confirm(conn->out_nexthop.in, conn->out_dev);
} else if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev
&& (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))) {
neigh_confirm(conn->in_nexthop.in, conn->in_dev);
}
}
}
return conn;
}
First dp_vs_blklst_lookup checks the IP blacklist, then dp_vs_conn_get performs the actual lookup, and finally the neighbour entry is confirmed.
struct dp_vs_conn *dp_vs_conn_get(int af, uint16_t proto,
const union inet_addr *saddr, const union inet_addr *daddr,
uint16_t sport, uint16_t dport, int *dir, bool reverse)
{
uint32_t hash;
struct conn_tuple_hash *tuphash;
struct dp_vs_conn *conn = NULL;
#ifdef CONFIG_DPVS_IPVS_DEBUG
char sbuf[64], dbuf[64];
#endif
if (unlikely(reverse))
hash = conn_hashkey(af, daddr, dport, saddr, sport);
else
hash = conn_hashkey(af, saddr, sport, daddr, dport);
#ifdef CONFIG_DPVS_IPVS_CONN_LOCK
rte_spinlock_lock(&this_conn_lock);
#endif
if (unlikely(reverse)) { /* swap source/dest for lookup */
list_for_each_entry(tuphash, &this_conn_tab[hash], list) {
if (tuphash->sport == dport
&& tuphash->dport == sport
&& inet_addr_equal(af, &tuphash->saddr, daddr)
&& inet_addr_equal(af, &tuphash->daddr, saddr)
&& tuphash->proto == proto
&& tuphash->af == af) {
/* hit */
conn = tuplehash_to_conn(tuphash);
rte_atomic32_inc(&conn->refcnt);
if (dir)
*dir = tuphash->direct;
break;
}
}
} else {
list_for_each_entry(tuphash, &this_conn_tab[hash], list) {
if (tuphash->sport == sport
&& tuphash->dport == dport
&& inet_addr_equal(af, &tuphash->saddr, saddr)
&& inet_addr_equal(af, &tuphash->daddr, daddr)
&& tuphash->proto == proto
&& tuphash->af == af) {
/* hit */
conn = tuplehash_to_conn(tuphash);
rte_atomic32_inc(&conn->refcnt);
if (dir)
*dir = tuphash->direct;
break;
}
}
}
return conn;
}
dp_vs_conn_get indexes connections by the tuple <af, daddr, dport, saddr, sport>. The lookup table this_conn_tab is per lcore, with a fixed number of buckets: essentially a fixed-size array whose elements are linked lists. Managing this table is a serious problem in its own right; under attack it can explode, and entries must be added, updated and removed as the TCP state machine progresses.
Note that dir is assigned here from tuphash->direct, which was initialized when the connection was created.
list_for_each_entry is a list-iteration macro; dpvs uses the same linked-list implementation as the kernel. The data structures deserve their own post.
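For a feel of the indexing, here is a plausible sketch of the bucket computation for this_conn_tab. The jhash-with-seed approach is a guess at the spirit of conn_hashkey, and dp_vs_conn_rnd / DPVS_CONN_TAB_SIZE are assumed names for the random seed and table size.
#include <rte_jhash.h>
/* Hedged sketch of the bucket computation used for this_conn_tab lookups. */
static inline uint32_t conn_hashkey_sketch(const union inet_addr *saddr, uint16_t sport,
                                           const union inet_addr *daddr, uint16_t dport)
{
    return rte_jhash_3words(saddr->in.s_addr, daddr->in.s_addr,
                            ((uint32_t)sport << 16) | dport,
                            dp_vs_conn_rnd /* assumed seed name */)
           % DPVS_CONN_TAB_SIZE;           /* assumed table-size macro */
}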
dp_vs_in: creating a new connection
A new connection is scheduled by conn_sched; for TCP services that means tcp_conn_sched.
static int tcp_conn_sched(struct dp_vs_proto *proto,
const struct dp_vs_iphdr *iph,
struct rte_mbuf *mbuf,
struct dp_vs_conn **conn,
int *verdict)
{
struct tcphdr *th, _tcph;
struct dp_vs_service *svc;
assert(proto && iph && mbuf && conn && verdict);
th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph);
if (unlikely(!th)) {
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
Get the TCP header; this is pure pointer arithmetic, no data is copied.
/* only TCP-SYN without other flag can be scheduled */
if (!th->syn || th->ack || th->fin || th->rst) {
/* Drop tcp packet which is send to vip and !vport */
if (g_defence_tcp_drop &&
(svc = dp_vs_lookup_vip(iph->af, iph->proto, &iph->daddr))) {
dp_vs_estats_inc(DEFENCE_TCP_DROP);
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
*verdict = INET_ACCEPT;
return EDPVS_INVAL;
}
For a new connection only a pure SYN may be scheduled; anything else is rejected.
svc = dp_vs_service_lookup(iph->af, iph->proto,
&iph->daddr, th->dest, 0, mbuf, NULL);
if (!svc) {
/* Drop tcp packet which is send to vip and !vport */
if (g_defence_tcp_drop &&
(svc = dp_vs_lookup_vip(iph->af, iph->proto, &iph->daddr))) {
dp_vs_estats_inc(DEFENCE_TCP_DROP);
*verdict = INET_DROP;
return EDPVS_INVPKT;
}
*verdict = INET_ACCEPT;
return EDPVS_NOSERV;
}
dp_vs_service_lookup finds the service by destination address and port; if no service matches, the packet is dropped.
*conn = dp_vs_schedule(svc, iph, mbuf, false);
if (!*conn) {
dp_vs_service_put(svc);
*verdict = INET_DROP;
return EDPVS_RESOURCE;
}
dp_vs_service_put(svc);
return EDPVS_OK;
}
dp_vs_schedule then picks a backend real server for the service and builds the connection.
dp_vs_in: scheduling a backend for the new connection, dp_vs_schedule
struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc,
const struct dp_vs_iphdr *iph,
struct rte_mbuf *mbuf,
bool is_synproxy_on)
{
uint16_t _ports[2], *ports; /* sport, dport */
struct dp_vs_dest *dest;
struct dp_vs_conn *conn;
struct dp_vs_conn_param param;
struct sockaddr_in daddr, saddr;
int err;
assert(svc && iph && mbuf);
ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports);
if (!ports)
return NULL;
/* persistent service */
if (svc->flags & DP_VS_SVC_F_PERSISTENT)
return dp_vs_sched_persist(svc, iph, mbuf, is_synproxy_on);
Persistent services behave somewhat differently; ignored for now, to be analyzed later.
dest = svc->scheduler->schedule(svc, mbuf); // the scheduling algorithm configured for this service
if (!dest) {
RTE_LOG(WARNING, IPVS, "%s: no dest found.\n", __func__);
#ifdef CONFIG_DPVS_MBUF_DEBUG
dp_vs_mbuf_dump("found dest failed.", iph->af, mbuf);
#endif
return NULL;
}
A real server is chosen by the configured algorithm; rr, wrr and wlc are the common ones, to be analyzed later. The returned dest struct is the backend rs.
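To give an idea of what such a schedule callback looks like, here is a hedged round-robin sketch. The field names (svc->sched_data used as a cursor, svc->dests, dest->n_list, dest->weight) are simplified assumptions, and a real rr scheduler also has to worry about locking and dest availability.
/* Hedged sketch of a round-robin schedule callback. */
static struct dp_vs_dest *rr_schedule_sketch(struct dp_vs_service *svc,
                                             const struct rte_mbuf *mbuf)
{
    struct list_head *start = svc->sched_data ? svc->sched_data : &svc->dests;
    struct list_head *p = start;
    struct dp_vs_dest *dest;
    (void)mbuf;                          /* rr does not inspect the packet */
    do {
        p = p->next;
        if (p == &svc->dests)            /* skip the list-head sentinel */
            continue;
        dest = list_entry(p, struct dp_vs_dest, n_list);
        if (dest->weight > 0) {          /* usable RS found */
            svc->sched_data = p;         /* remember the cursor for the next call */
            return dest;
        }
    } while (p != start);
    return NULL;                         /* no usable RS */
}
Back to dp_vs_schedule: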
if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
if (unlikely(iph->proto == IPPROTO_ICMP)) {
struct icmphdr *ich, _icmph;
ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
if (!ich)
return NULL;
ports = _ports;
_ports[0] = icmp4_id(ich);
_ports[1] = ich->type << 8 | ich->code;
/* ID may confict for diff host,
* need we use ID pool ? */
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->daddr, &dest->addr,
ports[1], ports[0],
0, ¶m);
} else {
/* we cannot inherit dest (host's src port),
* that may confict for diff hosts,
* and using dest->port is worse choice. */
memset(&daddr, 0, sizeof(daddr));
daddr.sin_family = AF_INET;
daddr.sin_addr = iph->daddr.in;
daddr.sin_port = ports[1];
memset(&saddr, 0, sizeof(saddr));
saddr.sin_family = AF_INET;
saddr.sin_addr = dest->addr.in;
saddr.sin_port = 0;
err = sa_fetch(NULL, &daddr, &saddr);
if (err != 0) {
#ifdef CONFIG_DPVS_MBUF_DEBUG
dp_vs_mbuf_dump("sa_fetch failed.", iph->af, mbuf);
#endif
return NULL;
}
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->daddr, &dest->addr,
ports[1], saddr.sin_port,
0, ¶m);
}
} else {
The SNAT branch above is special-cased; skipped for now, to be analyzed later.
if (unlikely(iph->proto == IPPROTO_ICMP)) {
struct icmphdr *ich, _icmph;
ich = mbuf_header_pointer(mbuf, iph->len, sizeof(_icmph), &_icmph);
if (!ich)
return NULL;
ports = _ports;
_ports[0] = icmp4_id(ich);
_ports[1] = ich->type << 8 | ich->code;
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->saddr, &iph->daddr,
ports[0], ports[1], 0, ¶m);
} else {
The ICMP branch above is skipped for now; to be analyzed later.
dp_vs_conn_fill_param(iph->af, iph->proto,
&iph->saddr, &iph->daddr,
ports[0], ports[1], 0, ¶m);
}
}
Fill in the parameters (proto, caddr, vaddr, cport, vport) for creating the new connection.
conn = dp_vs_conn_new(mbuf, ¶m, dest,
is_synproxy_on ? DPVS_CONN_F_SYNPROXY : 0);
if (!conn) {
if (dest->fwdmode == DPVS_FWD_MODE_SNAT && iph->proto != IPPROTO_ICMP)
sa_release(NULL, &daddr, &saddr);
#ifdef CONFIG_DPVS_MBUF_DEBUG
dp_vs_mbuf_dump("create conn failed.", iph->af, mbuf);
#endif
return NULL;
}
dp_vs_stats_conn(conn);
return conn;
}
dp_vs_conn_new builds the proxy connection from these parameters and the chosen destination.
dp_vs_in: creating the connection, dp_vs_conn_new
struct dp_vs_conn * dp_vs_conn_new(struct rte_mbuf *mbuf,
struct dp_vs_conn_param *param,
struct dp_vs_dest *dest, uint32_t flags)
{
struct dp_vs_conn *new;
struct conn_tuple_hash *t;
uint16_t rport;
__be16 _ports[2], *ports;
int err;
assert(mbuf && param && dest);
if (unlikely(rte_mempool_get(this_conn_cache, (void **)&new) != 0)) {
RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__);
return NULL;
}
memset(new, 0, sizeof(struct dp_vs_conn));
new->connpool = this_conn_cache;
A memory pool, which matters: malloc would be far too slow here.
/* set proper RS port */
if ((flags & DPVS_CONN_F_TEMPLATE) || param->ct_dport != 0)
rport = param->ct_dport;
else if (dest->fwdmode == DPVS_FWD_MODE_SNAT) {
if (unlikely(param->proto == IPPROTO_ICMP)) {
rport = param->vport;
} else {
ports = mbuf_header_pointer(mbuf, ip4_hdrlen(mbuf),
sizeof(_ports), _ports);
if (unlikely(!ports)) {
RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__);
goto errout;
}
rport = ports[0];
}
} else
rport = dest->port;
/* init inbound conn tuple hash */
t = &tuplehash_in(new);
t->direct = DPVS_CONN_DIR_INBOUND; // inbound: traffic arriving from the outside (client side)
t->af = param->af;
t->proto = param->proto;
t->saddr = *param->caddr; // source is the external client address
t->sport = param->cport;
t->daddr = *param->vaddr; // destination is the service VIP
t->dport = param->vport;
INIT_LIST_HEAD(&t->list);
/* init outbound conn tuple hash */
t = &tuplehash_out(new);
t->direct = DPVS_CONN_DIR_OUTBOUND; // outbound direction
t->af = param->af;
t->proto = param->proto;
if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
t->saddr.in.s_addr = ip4_hdr(mbuf)->src_addr;
else
t->saddr = dest->addr;
t->sport = rport;
t->daddr = *param->caddr; /* non-FNAT */
t->dport = param->cport; /* non-FNAT */
INIT_LIST_HEAD(&t->list);
Each conn holds a tuplehash array of length 2, one entry per direction; source and destination addresses mean different things in each direction.
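A simplified sketch of that layout, trimmed down from what conn.h must contain (the real structs carry more fields, and the accessor macros below only mimic the tuplehash_in/tuplehash_out used above):
/* Hedged sketch of the dual-direction tuple hash embedded in a connection. */
struct conn_tuple_hash_sketch {
    struct list_head list;        /* linked into this_conn_tab[hash]       */
    int              af;
    uint8_t          proto;
    int              direct;      /* DPVS_CONN_DIR_INBOUND / _OUTBOUND     */
    union inet_addr  saddr, daddr;
    uint16_t         sport, dport;
};
struct dp_vs_conn_sketch {
    struct conn_tuple_hash_sketch tuplehash[2];  /* [0] inbound, [1] outbound */
    /* ... caddr/vaddr/laddr/daddr, state, timer, packet_xmit, ... */
};
#define tuplehash_in_sketch(c)  ((c)->tuplehash[DPVS_CONN_DIR_INBOUND])
#define tuplehash_out_sketch(c) ((c)->tuplehash[DPVS_CONN_DIR_OUTBOUND])
Back to dp_vs_conn_new: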
/* init connection */
new->af = param->af;
new->proto = param->proto;
new->caddr = *param->caddr;
new->cport = param->cport;
new->vaddr = *param->vaddr;
new->vport = param->vport;
new->laddr = *param->caddr; /* non-FNAT */
new->lport = param->cport; /* non-FNAT */
if (dest->fwdmode == DPVS_FWD_MODE_SNAT)
new->daddr.in.s_addr = ip4_hdr(mbuf)->src_addr;
else
new->daddr = dest->addr;
new->dport = rport;
The connection stores the caddr, vaddr, cport and vport information.
/* neighbour confirm cache (neighbour subsystem) */
new->in_nexthop.in.s_addr = htonl(INADDR_ANY);
new->out_nexthop.in.s_addr = htonl(INADDR_ANY);
new->in_dev = NULL;
new->out_dev = NULL;
/* Controll member */
new->control = NULL;
rte_atomic32_clear(&new->n_control);
/* caller will use it right after created,
* just like dp_vs_conn_get(). */
rte_atomic32_set(&new->refcnt, 1);
new->flags = flags;
new->state = 0;
#ifdef CONFIG_DPVS_IPVS_STATS_DEBUG
new->ctime = rte_rdtsc();
#endif
/* bind destination and corresponding trasmitter */
err = conn_bind_dest(new, dest);
if (err != EDPVS_OK) {
RTE_LOG(WARNING, IPVS, "%s: fail to bind dest: %s\n",
__func__, dpvs_strerror(err));
goto errout;
}
conn_bind_dest sets up the per-forwarding-mode transmit callbacks here; this is very important.
/* FNAT only: select and bind local address/port */
if (dest->fwdmode == DPVS_FWD_MODE_FNAT) {
if ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK)
goto unbind_dest;
}
full-nat specific handling; to be analyzed later.
/* add to hash table (dual dir for each bucket) */
if ((err = conn_hash(new)) != EDPVS_OK)
goto unbind_laddr;
conn_hash inserts the connection into the this_conn_tab flow table. Looking closely at the implementation, both direction tuplehash entries are inserted, so the connection can be found from either direction.
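In spirit, conn_hash does something like the following hedged sketch; the real function also deals with locking variants and related bookkeeping.
/* Hedged sketch: hash both direction tuples into the same per-lcore table. */
static inline int conn_hash_sketch(struct dp_vs_conn *conn)
{
    struct conn_tuple_hash *in  = &tuplehash_in(conn);
    struct conn_tuple_hash *out = &tuplehash_out(conn);
    uint32_t ihash = conn_hashkey(in->af,  &in->saddr,  in->sport,
                                  &in->daddr,  in->dport);
    uint32_t ohash = conn_hashkey(out->af, &out->saddr, out->sport,
                                  &out->daddr, out->dport);
    /* a packet from either side now finds the same dp_vs_conn */
    list_add(&in->list,  &this_conn_tab[ihash]);
    list_add(&out->list, &this_conn_tab[ohash]);
    return EDPVS_OK;
}
Back to dp_vs_conn_new: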
/* timer */
new->timeout.tv_sec = conn_init_timeout;
new->timeout.tv_usec = 0;
Default timeout.
/* synproxy: fields used by syn proxy */
INIT_LIST_HEAD(&new->ack_mbuf);
rte_atomic32_set(&new->syn_retry_max, 0);
rte_atomic32_set(&new->dup_ack_cnt, 0);
if ((flags & DPVS_CONN_F_SYNPROXY) && !(flags & DPVS_CONN_F_TEMPLATE)) {
struct tcphdr _tcph, *th;
struct dp_vs_synproxy_ack_pakcet *ack_mbuf;
struct dp_vs_proto *pp;
th = mbuf_header_pointer(mbuf, ip4_hdrlen(mbuf), sizeof(_tcph), &_tcph);
if (!th) {
RTE_LOG(ERR, IPVS, "%s: get tcphdr failed\n", __func__);
goto unbind_laddr;
}
/* save ack packet */
if (unlikely(rte_mempool_get(this_ack_mbufpool, (void **)&ack_mbuf) != 0)) {
RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__);
goto unbind_laddr;
}
ack_mbuf->mbuf = mbuf;
list_add_tail(&ack_mbuf->list, &new->ack_mbuf);
new->ack_num++;
sp_dbg_stats32_inc(sp_ack_saved);
/* save ack_seq - 1 */
new->syn_proxy_seq.isn =
htonl((uint32_t) ((ntohl(th->ack_seq) - 1)));
/* save ack_seq */
new->fnat_seq.fdata_seq = htonl(th->ack_seq);
/* FIXME: use DP_VS_TCP_S_SYN_SENT for syn */
pp = dp_vs_proto_lookup(param->proto);
new->timeout.tv_sec = pp->timeout_table[new->state = DPVS_TCP_S_SYN_SENT];
}
Syn proxy is very important; to be analyzed later.
this_conn_count++;
/* schedule conn timer */
dpvs_time_rand_delay(&new->timeout, 1000000);
if (new->flags & DPVS_CONN_F_TEMPLATE)
dpvs_timer_sched(&new->timer, &new->timeout, conn_expire, new, true);
else
dpvs_timer_sched(&new->timer, &new->timeout, conn_expire, new, false);
#ifdef CONFIG_DPVS_IPVS_DEBUG
conn_dump("new conn: ", new);
#endif
return new;
unbind_laddr:
dp_vs_laddr_unbind(new);
unbind_dest:
conn_unbind_dest(new);
errout:
rte_mempool_put(this_conn_cache, new);
return NULL;
}
Finally the connection is added to a timer to manage its expiration; the timeout differs per TCP state. The timer subsystem will get its own post.
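When that timer fires, the expire callback essentially has to undo dp_vs_conn_new. Below is a heavily hedged sketch of the happy path only: the real conn_expire also handles references still held by in-flight packets, synproxy ack_mbuf cleanup, template connections and re-arming the timer, and conn_unhash plus the return convention are assumptions here.
/* Hedged sketch of connection expiry; refcount and error handling omitted. */
static int conn_expire_sketch(void *priv)
{
    struct dp_vs_conn *conn = priv;
    conn_unhash(conn);                     /* assumed helper: remove both tuple hashes */
    dp_vs_laddr_unbind(conn);              /* FNAT only; assumed no-op otherwise       */
    conn_unbind_dest(conn);                /* drop the reference on the RS             */
    rte_mempool_put(conn->connpool, conn); /* return the conn to the per-lcore pool    */
    return EDPVS_OK;
}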
dp_vs_in: creating a new connection, conn_bind_dest
switch (dest->fwdmode) {
case DPVS_FWD_MODE_NAT:
conn->packet_xmit = dp_vs_xmit_nat;
conn->packet_out_xmit = dp_vs_out_xmit_nat;
break;
case DPVS_FWD_MODE_TUNNEL:
conn->packet_xmit = dp_vs_xmit_tunnel;
break;
case DPVS_FWD_MODE_DR:
conn->packet_xmit = dp_vs_xmit_dr;
break;
case DPVS_FWD_MODE_FNAT:
conn->packet_xmit = dp_vs_xmit_fnat;
conn->packet_out_xmit = dp_vs_out_xmit_fnat;
break;
case DPVS_FWD_MODE_SNAT:
conn->packet_xmit = dp_vs_xmit_snat;
conn->packet_out_xmit = dp_vs_out_xmit_snat;
break;
default:
return EDPVS_NOTSUPP;
}
Only the core of conn_bind_dest is shown. Note that the NAT family needs callbacks for both directions because all traffic passes through the LB, while DR and TUNNEL install only the inbound one: responses never come back through the LB.
xmit_inbound and xmit_outbound: writing the packet out
Back in dp_vs_in: since DR mode only ever has inbound traffic, only xmit_inbound gets called.
/* forward to RS */
err = conn->packet_xmit(prot, conn, mbuf);
if (err != EDPVS_OK)
RTE_LOG(DEBUG, IPVS, "%s: fail to transmit: %d\n", __func__, err);
dp_vs_conn_put(conn);
/* always stolen the packet */
return INET_STOLEN;
Ultimately xmit_inbound invokes the dp_vs_xmit_dr callback installed by conn_bind_dest and always returns INET_STOLEN. Back in INET_HOOK, a verdict of INET_STOLEN means okfn is never called.
dp_vs_xmit_dr: sending the packet to the rs
Since DR operates within a single L2 physical network, the packet is ultimately handed to the neighbour subsystem via neigh_resolve_output and sent out.
neigh_fill_mac(neighbour, m);
netif_xmit(m, neighbour->port);
In the end two functions do the work: neigh_fill_mac fills in the MAC addresses and netif_xmit sends the frame.
static void neigh_fill_mac(struct neighbour_entry *neighbour, struct rte_mbuf *m)
{
struct ether_hdr *eth;
uint16_t pkt_type;
m->l2_len = sizeof(struct ether_hdr);
eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct ether_hdr));
ether_addr_copy(&neighbour->eth_addr,ð->d_addr);
ether_addr_copy(&neighbour->port->addr,ð->s_addr);
pkt_type = (uint16_t)m->packet_type;
eth->ether_type = rte_cpu_to_be_16(pkt_type);
}
This makes it obvious: ether_addr_copy(&neighbour->eth_addr, &eth->d_addr) rewrites the destination MAC to the real server's address found in the neighbour table, and the source MAC becomes the LB NIC's own address.
int netif_xmit(struct rte_mbuf *mbuf, struct netif_port *dev)
{
int ret = EDPVS_OK;
uint16_t mbuf_refcnt;
if (unlikely(NULL == mbuf || NULL == dev)) {
if (mbuf)
rte_pktmbuf_free(mbuf);
return EDPVS_INVAL;
}
if (mbuf->port != dev->id)
mbuf->port = dev->id;
/* assert for possible double free */
mbuf_refcnt = rte_mbuf_refcnt_read(mbuf);
assert((mbuf_refcnt >= 1) && (mbuf_refcnt <= 64));
if (dev->flag & NETIF_PORT_FLAG_TC_EGRESS) {
mbuf = tc_handle_egress(netif_tc(dev), mbuf, &ret);
if (likely(!mbuf))
return ret;
}
return netif_hard_xmit(mbuf, dev);
}
Before transmitting, netif_xmit first runs the packet through tc_handle_egress for traffic control (a topic for a separate post), then netif_hard_xmit writes it out to the NIC.
Summary
This post has been wall-to-wall code and is admittedly hard reading. The other forwarding modes will be analyzed in later posts, and each of dpvs's optimizations will get its own detailed write-up.