后端 rs 一般有多個,通過一定算法做負(fù)載均衡徒扶。程序初始化時 dp_vs_init
調(diào)用 dp_vs_sched_init
進(jìn)行全局注冊。
int dp_vs_sched_init(void)
{
INIT_LIST_HEAD(&dp_vs_schedulers);
rte_rwlock_init(&__dp_vs_sched_lock);
dp_vs_rr_init();
dp_vs_wrr_init();
dp_vs_wlc_init();
dp_vs_conhash_init();
return EDPVS_OK;
}
可以看到 dpvs 當(dāng)前支持 rr, wrr, wlc, conhash 四種調(diào)度算法八回。
調(diào)度入口
當(dāng) client 首次請求時酷愧,dp_vs_schedule
選擇 rs驾诈,然后建立連接
dest = svc->scheduler->schedule(svc, mbuf);
這就是調(diào)度入口缠诅,具體使用哪種算法,由程序初始化時乍迄,配置 service 服務(wù)時決定管引。dp_vs_bind_scheduler
調(diào)用 scheduler->init_service
函數(shù)指針初始化。
輪循算法 rr
首先這是最簡單的算法闯两,看下如何實(shí)現(xiàn)的褥伴。首先看 init_service
初始化函數(shù)
static int dp_vs_rr_init_svc(struct dp_vs_service *svc)
{
svc->sched_data = &svc->dests;
return EDPVS_OK;
}
很簡單谅将,直接將后端 rs 鏈表 dests 賦給 sched_data. 再看一下 schedule
入口
static struct dp_vs_dest *dp_vs_rr_schedule(struct dp_vs_service *svc,
const struct rte_mbuf *mbuf)
{
struct list_head *p, *q;
struct dp_vs_dest *dest;
rte_rwlock_write_lock(&svc->sched_lock);
p = (struct list_head *)svc->sched_data;
p = p->next;
q = p;
do {
/* skip list head */
if (q == &svc->dests) {
q = q->next;
continue;
}
dest = list_entry(q, struct dp_vs_dest, n_list);
if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
(dest->flags & DPVS_DEST_F_AVAILABLE) &&
rte_atomic16_read(&dest->weight) > 0)
/* HIT */
goto out;
q = q->next;
} while (q != p);
rte_rwlock_write_unlock(&svc->sched_lock);
return NULL;
out:
svc->sched_data = q;
rte_rwlock_write_unlock(&svc->sched_lock);
return dest;
}
原理也很明了,遍歷 sched_data 鏈表重慢,找到第一個可用的 dest 后返回饥臂,并保存找到的位置,下次選擇時從這個位置的下一個開始查找似踱。
加權(quán)輪循 wrr
帶權(quán)重的輪循隅熙,首先看 init_service
初始化函數(shù)
static int dp_vs_wrr_init_svc(struct dp_vs_service *svc)
{
struct dp_vs_wrr_mark *mark;
/*
* Allocate the mark variable for WRR scheduling
*/
mark = rte_zmalloc("wrr_mark", sizeof(struct dp_vs_wrr_mark), RTE_CACHE_LINE_SIZE);
if (mark == NULL) {
return EDPVS_NOMEM;
}
mark->cl = &svc->dests;
mark->cw = 0;
mark->mw = dp_vs_wrr_max_weight(svc);
mark->di = dp_vs_wrr_gcd_weight(svc);
svc->sched_data = mark;
return EDPVS_OK;
}
struct dp_vs_wrr_mark {
struct list_head *cl; /* current list head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
};
分配 dp_vs_wrr_mark 結(jié)構(gòu)體,初始化賦值給 svc->sched_data. dp_vs_wrr_max_weight
求出后端 rs 權(quán)重最大值核芽,dp_vs_wrr_gcd_weight
求出這些權(quán)重值的最大公約數(shù)囚戚,這個 gcd 用于權(quán)重值增減的步長。比如權(quán)重分別是 100轧简,20驰坊,50,那么 gcd 就是 10哮独,所謂的 decreasing interval
就是 10.
static struct dp_vs_dest *dp_vs_wrr_schedule(struct dp_vs_service *svc,
const struct rte_mbuf *mbuf)
{
struct dp_vs_dest *dest;
struct dp_vs_wrr_mark *mark = svc->sched_data;
struct list_head *p;
/*
* This loop will always terminate, because mark->cw in (0, max_weight]
* and at least one server has its weight equal to max_weight.
*/
rte_rwlock_write_lock(&svc->sched_lock);
p = mark->cl;
while (1) {
if (mark->cl == &svc->dests) {
/* it is at the head of the destination list */
if (mark->cl == mark->cl->next) {
/* no dest entry */
dest = NULL;
goto out;
}
mark->cl = svc->dests.next;
mark->cw -= mark->di;
if (mark->cw <= 0) {
mark->cw = mark->mw;
/*
* Still zero, which means no available servers.
*/
if (mark->cw == 0) {
mark->cl = &svc->dests;
dest = NULL;
goto out;
}
}
} else
mark->cl = mark->cl->next;
if (mark->cl != &svc->dests) {
/* not at the head of the list */
dest = list_entry(mark->cl, struct dp_vs_dest, n_list);
if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
(dest->flags & DPVS_DEST_F_AVAILABLE) &&
rte_atomic16_read(&dest->weight) >= mark->cw) {
/* got it */
break;
}
}
if (mark->cl == p && mark->cw == mark->di) {
/* back to the start, and no dest is found.
It is only possible when all dests are OVERLOADED */
dest = NULL;
goto out;
}
}
out:
rte_rwlock_write_unlock(&svc->sched_lock);
return dest;
}
- 首次循環(huán) mark->cl = &svc->dests, 也就是隊(duì)首拳芙,cw 減去一個步長 di. 并且將 mark->cl 置為隊(duì)首后的第一個元素 svc->dests.next 然后查看 mark->cl 的權(quán)重如果大于 cw, 那么返回當(dāng)前 mark->cl 做為 rs.
- 如果此時 mark->cl 不是隊(duì)首,那么直接
mark->cl = mark->cl->next
查看下一個元素借嗽,是否滿足可用态鳖,并且權(quán)重大于 cw, 符合要求返回 mark->cl 做為 rs - 如果mark->cl 權(quán)重小于 cw, 那么 while 循環(huán),繼續(xù)遍歷下一個恶导。
這個算法的原理清楚了浆竭,但是 while 循環(huán)的復(fù)雜度暫時不知道,水平有限...
加權(quán)最小連接 wlc
static struct dp_vs_scheduler dp_vs_wlc_scheduler = {
.name = "wlc",
.n_list = LIST_HEAD_INIT(dp_vs_wlc_scheduler.n_list),
.schedule = dp_vs_wlc_schedule,
};
這個調(diào)度算法是沒有 init_service
函數(shù)的惨寿,那么直接看 dp_vs_wlc_schedule
static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc,
const struct rte_mbuf *mbuf)
{
struct dp_vs_dest *dest, *least;
unsigned int loh, doh;
/*
* We calculate the load of each dest server as follows:
* (dest overhead) / dest->weight
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
list_for_each_entry(dest, &svc->dests, n_list) {
if (!(dest->flags & DPVS_DEST_F_OVERLOAD) &&
(dest->flags & DPVS_DEST_F_AVAILABLE) &&
rte_atomic16_read(&dest->weight) > 0) {
least = dest;
loh = dp_vs_wlc_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
list_for_each_entry_continue(dest, &svc->dests, n_list) {
if (dest->flags & DPVS_DEST_F_OVERLOAD)
continue;
doh = dp_vs_wlc_dest_overhead(dest);
if (loh * rte_atomic16_read(&dest->weight) >
doh * rte_atomic16_read(&least->weight)) {
least = dest;
loh = doh;
}
}
return least;
}
static inline unsigned int dp_vs_wlc_dest_overhead(struct dp_vs_dest *dest)
{
return (rte_atomic32_read(&dest->actconns) << 8) +
rte_atomic32_read(&dest->inactconns);
}
- 首先是給后端 rs 打分的算法, (dest overhead) / dest->weight邦泄,負(fù)載除以權(quán)重。負(fù)載是由
dp_vs_wlc_dest_overhead
完成計(jì)算裂垦,活躍連接和非活躍連接的加權(quán)統(tǒng)計(jì)顺囊,活躍占比最大。那么這個打分蕉拢,肯定是越小特碳,被選擇的可能性越大 - 第一個 list_for_each_entry,找出第一個 rs, 算出打分晕换。第二個遍歷所有 rs, 將打分進(jìn)行對比午乓,最后分最少的被賦值 least, 返回給上游使用。
連接哈希 conhash
這個算法用的好像不多闸准,會將相同 hash 算法的固定分配置某個后端 rs. 先看 init_service
static int dp_vs_conhash_init_svc(struct dp_vs_service *svc)
{
svc->sched_data = conhash_init(NULL);
if (!svc->sched_data) {
RTE_LOG(ERR, SERVICE, "%s: conhash init faild!\n", __func__);
return EDPVS_NOMEM;
}
dp_vs_conhash_assign(svc);
return EDPVS_OK;
}
這塊 conhash_init
居然用了紅黑樹的實(shí)現(xiàn)益愈,然后調(diào)用 dp_vs_conhash_assign
將后端 rs hash 到這個 rbtree 里.
static int
dp_vs_conhash_assign(struct dp_vs_service *svc)
{
struct dp_vs_dest *dest;
struct node_s *p_node;
int weight = 0;
char str[40];
list_for_each_entry(dest, &svc->dests, n_list) {
weight = rte_atomic16_read(&dest->weight);
if (weight > 0) {
p_node = rte_zmalloc("p_node", sizeof(struct node_s), RTE_CACHE_LINE_SIZE);
if (p_node == NULL) {
return EDPVS_NOMEM;
}
rte_atomic32_inc(&dest->refcnt);
p_node->data = dest;
snprintf(str, sizeof(str), "%u%d", dest->addr.in.s_addr, dest->port);
conhash_set_node(p_node, str, weight*REPLICA);
conhash_add_node(svc->sched_data, p_node);
}
}
return EDPVS_OK;
}
- 只有 weight 大于 0 才是可用的 rs
- 生成 node 節(jié)點(diǎn)加入到樹里,這里看節(jié)點(diǎn)分復(fù)制成 weightREPLICA 份,做一致性 hash 用蒸其,建了 權(quán)重160個虛擬節(jié)點(diǎn)
static struct dp_vs_dest *
dp_vs_conhash_schedule(struct dp_vs_service *svc, const struct rte_mbuf *mbuf)
{
struct dp_vs_dest *dest;
dest = dp_vs_conhash_get(svc, (struct conhash_s *)svc->sched_data, mbuf);
if (!dest
|| !(dest->flags & DPVS_DEST_F_AVAILABLE)
|| rte_atomic16_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
return NULL;
}
else
return dest;
}
schedule
函數(shù)也不難敏释,直接根據(jù) mbuf 調(diào)用 dp_vs_conhash_get
, 獲取最近的虛擬節(jié)點(diǎn)即可。
static inline struct dp_vs_dest *
dp_vs_conhash_get(struct dp_vs_service *svc, struct conhash_s *conhash,
const struct rte_mbuf *mbuf)
{
char str[40] = {0};
uint64_t quic_cid;
uint32_t sip;
const struct node_s *node;
if (svc->flags & DP_VS_SVC_F_QID_HASH) {
if (svc->proto != IPPROTO_UDP) {
RTE_LOG(ERR, IPVS, "QUIC cid hash scheduler should only be set in UDP service.\n");
return NULL;
}
/* try to get CID for hash target first, then source IP. */
if (EDPVS_OK == get_quic_hash_target(mbuf, &quic_cid)) {
snprintf(str, sizeof(str), "%lu", quic_cid);
} else if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) {
snprintf(str, sizeof(str), "%u", sip);
} else {
return NULL;
}
} else if (svc->flags & DP_VS_SVC_F_SIP_HASH) {
if (EDPVS_OK == get_sip_hash_target(mbuf, &sip)) {
snprintf(str, sizeof(str), "%u", sip);
} else {
return NULL;
}
} else {
RTE_LOG(ERR, IPVS, "%s: invalid hash target.\n", __func__);
return NULL;
}
node = conhash_lookup(conhash, str);
return node == NULL? NULL: node->data;
}
可以看到摸袁,對于 udp 如果如果開啟了 CID钥顽,那么調(diào)用 get_quic_hash_target
生成 hash 值。其它情況一律使用 get_sip_hash_target
即源地址 ip 生成 hash
總結(jié)
這塊比較常見靠汁,也沒什么難度耳鸯,大家看看就好