1. ONC RPC - Open Network Computing RPC
An RPC protocol originally developed by Sun, hence also called SUN RPC. The Linux kernel client has its own implementation; nfs-ganesha uses ntirpc (Transport-Independent RPC).
An ONC RPC interface definition such as msg.x can be compiled with the rpcgen tool to produce a client skeleton and a server skeleton. Below is an excerpt from nfsv41.x in nfs-ganesha:
program NFS4_PROGRAM {
    version NFS_V4 {
        void
            NFSPROC4_NULL(void) = 0;
        COMPOUND4res
            NFSPROC4_COMPOUND(COMPOUND4args) = 1;
    } = 4;
} = 100003;
Parameters carried by an RPC are encoded and decoded with XDR (eXternal Data Representation) for transmission over the network.
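A minimal sketch of an XDR round trip, using the classic xdrmem interface (standard Sun RPC/libtirpc API, not ntirpc-specific):
#include <rpc/rpc.h>    /* xdrmem_create, xdr_u_int; provided by libtirpc on modern Linux */
#include <assert.h>

int main(void)
{
    char buf[64];
    XDR enc, dec;
    u_int value = 100003, decoded = 0;

    /* encode a 32-bit unsigned integer into a big-endian wire buffer */
    xdrmem_create(&enc, buf, sizeof(buf), XDR_ENCODE);
    assert(xdr_u_int(&enc, &value));
    xdr_destroy(&enc);

    /* decode it back from the same buffer */
    xdrmem_create(&dec, buf, sizeof(buf), XDR_DECODE);
    assert(xdr_u_int(&dec, &decoded));
    xdr_destroy(&dec);

    return decoded == value ? 0 : 1;
}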
2. rpcbind service
An RPC service is identified by the triple <program number, protocol (tcp/udp), version>. Given this triple, the corresponding port can be obtained from rpcbind:
rpcinfo -p
program vers proto port service
100000 4 tcp 111 portmapper
100000 3 tcp 111 portmapper
100000 2 tcp 111 portmapper
100000 4 udp 111 portmapper
100000 3 udp 111 portmapper
100000 2 udp 111 portmapper
100003 4 udp 2049 nfs
100003 4 tcp 2049 nfs
100011 1 udp 875 rquotad
100011 1 tcp 875 rquotad
100011 2 udp 875 rquotad
100011 2 tcp 875 rquotad
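A sketch of how a client resolves the triple to a connection, using the standard TI-RPC clnt_create() call, which consults rpcbind on the remote host; the constants match the rpcinfo output above, and "localhost" is illustrative:
#include <rpc/rpc.h>
#include <stdio.h>

int main(void)
{
    /* clnt_create asks rpcbind (port 111) on the host which port is
     * registered for <100003, 4, tcp>, then connects to it */
    CLIENT *clnt = clnt_create("localhost", 100003 /* nfs */, 4, "tcp");

    if (clnt == NULL) {
        clnt_pcreateerror("clnt_create");
        return 1;
    }
    clnt_destroy(clnt);
    return 0;
}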
NFSv4 uses the fixed port 2049 and does not need the rpcbind service; this lets NFS clients pass through firewalls.
3. Abbreviations in the code
- ioq - io queue
- svc - service
- rqst - request
- xprt - transport
- clnt - client
4. Network connection code
4.1 Channel
Each kind of connection (TCP, UDP, RDMA) gets its own channel. A channel is represented by struct svc_rqst_rec, and each channel creates its own epoll_fd. The main entry points are listed below (see the usage sketch after this list):
- svc_rqst_new_evchan //create a channel
- svc_rqst_lookup_chan //look up the struct svc_rqst_rec for a channel id
- svc_rqst_hook_events //register epoll events on this channel; when epoll fires, the struct rpc_dplx_rec can be recovered
- svc_rqst_epoll_events //epoll has fired; process all pending epoll events
- svc_rqst_epoll_event //epoll has fired; this function returns the struct rpc_dplx_rec
- svc_rqst_evchan_reg //tie a channel to an xprt: rec->ev_p = sr_rec
- svc_rqst_xprt_register //called from svc_vc_rendezvous; associates newxprt with channel 1 and establishes the parent/child relationship between xprt and newxprt
- svc_rqst_xprt_task //process a request received on the channel
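A sketch of how nfs-ganesha drives this API, condensed from MainNFSD/nfs_rpc_dispatcher_thread.c; the flag names and signatures are as used by the ganesha 2.7 era of ntirpc and should be treated as illustrative:
uint32_t chan_id;
int code;

/* create an event channel; ntirpc allocates its epoll_fd internally */
code = svc_rqst_new_evchan(&chan_id, NULL /* u_data */, SVC_RQST_FLAG_NONE);
if (code)
    abort();    /* ganesha treats this as a fatal startup error */

/* hook a listening xprt into the channel's epoll set */
code = svc_rqst_evchan_reg(chan_id, xprt, SVC_RQST_FLAG_XPRT_UREG);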
4.2 Number of channels
There are five channels:
- id 4: UDP listen
- id 3: TCP listen
- id 2: rpcbind connection
- id 1: TCP accept
- id 0: unused
4.3 xprt
An xprt represents one connection (or instance) on a channel and is represented by struct svc_xprt. The channel-to-xprt relationship is one-to-many; for any single client it is one-to-one.
- svc_xprt_lookup //look up an xprt by fd
- makefd_xprt //create an xprt for an fd
4.4 The svc_rqst_rec structure
Variables of this type are usually named sr_rec; it describes one channel.
struct svc_rqst_rec {
    struct work_pool_entry ev_wpe;
    struct opr_rbtree call_expires;
    mutex_t ev_lock;
    int sv[2];
    uint32_t id_k;              /* chan id */
    /*
     * union of event processor types
     */
    enum svc_event_type ev_type;
    union {
#if defined(TIRPC_EPOLL)
        struct {
            int epoll_fd;
            struct epoll_event ctrl_ev;
            struct epoll_event *events;
            u_int max_events;   /* max epoll events */
        } epoll;
#endif
        struct {
            fd_set set;         /* select/fd_set (currently unhooked) */
        } fd;
    } ev_u;
    int32_t ev_refcnt;
    uint16_t ev_flags;
};
4.5 The svc_xprt structure
Variables of this type are usually named xprt; it represents one server-side connection. makefd_xprt creates a connection; svc_xprt_lookup looks one up.
- Listening on a socket fd is one kind of connection; its xp_recv is svc_vc_rendezvous
- Listening on an accepted fd is another kind; its xp_recv is svc_vc_recv
typedef struct svc_xprt SVCXPRT;
/*
 * Server side transport handle
 */
struct svc_xprt {
    struct xp_ops {
        /* receive incoming requests */
        svc_xprt_fun_t xp_recv;
        /* get transport status */
        svc_xprt_fun_t xp_stat;
        /* decode incoming message header (called by request_cb) */
        svc_req_fun_t xp_decode;
        /* send reply */
        svc_req_fun_t xp_reply;
        /* optional checksum (after authentication/decryption) */
        void (*xp_checksum) (struct svc_req *, void *, size_t);
        /* actually destroy after xp_destroy_it and xp_release_it */
        void (*xp_destroy) (SVCXPRT *, u_int, const char *, const int);
        /* catch-all function */
        bool (*xp_control) (SVCXPRT *, const u_int, void *);
        /* free client user data */
        svc_xprt_fun_t xp_free_user_data;
    } *xp_ops;
    /* handle incoming connections (per xp_fd) */
    union {
        svc_req_fun_t process_cb;
        svc_xprt_fun_t rendezvous_cb;
    } xp_dispatch;
    SVCXPRT *xp_parent;
    char *xp_tp;                /* transport provider device name */
    char *xp_netid;             /* network token */
    void *xp_p1;                /* private: for use by svc ops */
    void *xp_p2;                /* private: for use by svc ops */
    void *xp_p3;                /* private: for use by svc lib */
    void *xp_u1;                /* client user data */
    void *xp_u2;                /* client user data */
    struct rpc_address xp_local;    /* local address, length, port */
    struct rpc_address xp_remote;   /* remote address, length, port */
#if defined(HAVE_BLKIN)
    /* blkin tracing */
    struct {
        char *svc_name;
        struct blkin_endpoint endp;
    } blkin;
#endif
    /* serialize private data */
    mutex_t xp_lock;
    int xp_fd;
    int xp_ifindex;             /* interface index */
    int xp_si_type;             /* si type */
    int xp_type;                /* xprt type */
    int32_t xp_refcnt;          /* handle reference count */
    uint16_t xp_flags;          /* flags */
};
4.6 The rpc_dplx_rec structure
The rec is the parameter carried by an epoll event; it can be recovered with REC_XPRT:
struct rpc_dplx_rec *rec = REC_XPRT(xprt);
struct rpc_dplx_rec {
    struct svc_xprt xprt;
    struct xdr_ioq ioq;         /* every xioq on this rec can be found through it */
    struct opr_rbtree call_replies;
    struct opr_rbtree_node fd_node;
    struct {
        rpc_dplx_lock_t lock;
        struct timespec ts;
    } recv;
    union {
        struct {
            struct epoll_event event;
        } epoll;
    } ev_u;
    void *ev_p;                 /* struct svc_rqst_rec (internal) */
    size_t maxrec;
    long pagesz;
    u_int recvsz;
    u_int sendsz;
    uint32_t call_xid;          /**< current call xid */
    uint32_t ev_count;          /**< atomic count of waiting events */
};
#define REC_XPRT(p) (opr_containerof((p), struct rpc_dplx_rec, xprt))
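REC_XPRT works because struct svc_xprt is embedded as the first member of struct rpc_dplx_rec; opr_containerof is the familiar container_of idiom. A self-contained sketch of the same pattern, with stand-in types:
#include <stddef.h>

/* generic container_of, equivalent to ntirpc's opr_containerof */
#define containerof(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct inner { int fd; };
struct outer {
    struct inner xprt;  /* embedded member, like rpc_dplx_rec's svc_xprt */
    int extra_state;
};

/* given only a pointer to the embedded member, recover the enclosing record */
static struct outer *rec_of(struct inner *p)
{
    return containerof(p, struct outer, xprt);
}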
4.7 Network connection operations
- Allocate_sockets //create the TCP and UDP sockets
- Bind_sockets //perform bind
- Create_tcp //perform listen; EPOLL_CTL_ADD is called for the TCP socket
- svc_vc_rendezvous //perform accept to get a connection fd, then call EPOLL_CTL_ADD to register its events
- svc_vc_recv //receive on a specific connection fd
4.8 Associating epoll events
svc_rqst_hook_events() sets ev->data.ptr = rec; svc_rqst_epoll_event later recovers this rec from the event, as the sketch below shows.
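A self-contained sketch of this round trip with plain epoll (generic code, not ntirpc itself):
#include <sys/epoll.h>
#include <stdio.h>
#include <stdlib.h>

struct my_rec { int fd; /* per-connection state */ };

static void hook_events(int epoll_fd, struct my_rec *rec)
{
    struct epoll_event ev = {0};

    ev.events = EPOLLIN;
    ev.data.ptr = rec;  /* what svc_rqst_hook_events() does */
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, rec->fd, &ev) < 0) {
        perror("epoll_ctl");
        exit(1);
    }
}

static void event_loop(int epoll_fd)
{
    struct epoll_event events[16];
    int i, n = epoll_wait(epoll_fd, events, 16, -1);

    for (i = 0; i < n; i++) {
        /* what svc_rqst_epoll_event() does: recover the record */
        struct my_rec *rec = events[i].data.ptr;
        printf("event on fd %d\n", rec->fd);
    }
}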
4.9 svc_rqst_epoll_events: one event received
- call svc_rqst_epoll_event to get the rec that svc_rqst_hook_events() stored
- in the common case there is exactly one event, and svc_rqst_xprt_task is called directly
- svc_rqst_xprt_task
4.10 svc_rqst_epoll_events: multiple events received
- call svc_rqst_epoll_event to get the rec that svc_rqst_hook_events() stored
- in the less common case there are several events; the first is handled by calling svc_rqst_xprt_task directly, and the remainder are submitted to the work pool (see the schematic after this list)
- svc_rqst_xprt_task
- work_pool_submit => svc_rqst_xprt_task
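A schematic of that dispatch strategy; submit_to_pool and handle_one are hypothetical stand-ins for work_pool_submit and the inline svc_rqst_xprt_task call:
#include <sys/epoll.h>

extern void submit_to_pool(void *rec); /* hypothetical: queues svc_rqst_xprt_task */
extern void handle_one(void *rec);     /* hypothetical: runs svc_rqst_xprt_task inline */

static void dispatch_events(struct epoll_event *events, int n_events)
{
    int i;

    /* every event after the first is handed off to a worker thread */
    for (i = 1; i < n_events; i++)
        submit_to_pool(events[i].data.ptr);

    /* the first event is processed on the calling thread, which saves
     * a thread wakeup in the common single-event case */
    if (n_events > 0)
        handle_one(events[0].data.ptr);
}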
4.11 Typical call stacks
The initial TCP connection is handled by svc_vc_rendezvous, which performs the accept. Traffic on the accepted TCP connection is then handled by svc_vc_recv: svc_vc_rendezvous creates a new xprt, installs its handler functions (see the svc_vc_override_ops function), and that xprt is used for all subsequent traffic on the connection.
The accept path:
#0 nfs_rpc_dispatch_tcp_NFS (xprt=0x7fffe0000ca0) at /root/code/nfs-ganesha-2.7/src/MainNFSD/nfs_rpc_dispatcher_thread.c:308
#1 0x00007ffff7bb489a in svc_vc_rendezvous (xprt=0x7f58e0) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_vc.c:507
#2 0x00007ffff7bb1c6b in svc_rqst_xprt_task (wpe=0x7f5af8) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:769
#3 0x00007ffff7bb20ee in svc_rqst_epoll_events (sr_rec=0x7e0680, n_events=1) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:941
#4 0x00007ffff7bb2396 in svc_rqst_epoll_loop (sr_rec=0x7e0680) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:1014
#5 0x00007ffff7bb2460 in svc_rqst_run_task (wpe=0x7e0680) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:1050
#6 0x00007ffff7bbb313 in work_pool_thread (arg=0x7fffd4000b40) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/work_pool.c:181
#7 0x00007ffff6e036ba in start_thread (arg=0x7fffd3dfd700) at pthread_create.c:333
#8 0x00007ffff693141d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
The request-processing path:
#0 nfs4_Compound (arg=0x7f7314001268, req=0x7f7314000b60, res=0x7f7314001bc0) at /root/code/nfs-ganesha-2.7/src/Protocols/NFS/nfs4_Compound.c:618
#1 0x000000000045731a in nfs_rpc_process_request (reqdata=0x7f7314000b60) at /root/code/nfs-ganesha-2.7/src/MainNFSD/nfs_worker_thread.c:1329
#2 0x0000000000457ac1 in nfs_rpc_valid_NFS (req=0x7f7314000b60) at /root/code/nfs-ganesha-2.7/src/MainNFSD/nfs_worker_thread.c:1539
#3 0x00007f73bc7c36f5 in svc_vc_decode (req=0x7f7314000b60) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_vc.c:825
#4 0x000000000044a9b4 in nfs_rpc_decode_request (xprt=0x7f7328000b20, xdrs=0x7f73140008c0) at /root/code/nfs-ganesha-2.7/src/MainNFSD/nfs_rpc_dispatcher_thread.c:1341
#5 0x00007f73bc7c3606 in svc_vc_recv (xprt=0x7f7328000b20) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_vc.c:798
#6 0x00007f73bc7bfb40 in svc_rqst_xprt_task (wpe=0x7f7328000d38) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:767
#7 0x00007f73bc7bffe0 in svc_rqst_epoll_events (sr_rec=0x3e45de0, n_events=1) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:939
#8 0x00007f73bc7c0288 in svc_rqst_epoll_loop (sr_rec=0x3e45de0) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:1012
#9 0x00007f73bc7c0352 in svc_rqst_run_task (wpe=0x3e45de0) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/svc_rqst.c:1048
#10 0x00007f73bc7c91a1 in work_pool_thread (arg=0x7f732c000b20) at /root/code/nfs-ganesha-2.7/src/libntirpc/src/work_pool.c:181
5. Receiving and sending data
- svc_vc_recv: receive data and wrap what was read in the XDR structure
- svc_vc_decode: decode the XDR data into a struct rpc_msg, as defined in RFC 5531
- nfs_rpc_valid_NFS: route to the appropriate handler based on cb_prog in the rpc_msg (see the sketch after the structure below)
struct rpc_msg {
    u_int32_t rm_xid;
    enum msg_type rm_direction;
    struct {
        struct call_body RM_cmb;
        struct reply_body RM_rmb;
    } ru;
#define rm_call ru.RM_cmb
#define rm_reply ru.RM_rmb
    /* New with TI-RPC */
    struct xdrpair rm_xdr;
    uint32_t rm_flags;
    /* Moved in N TI-RPC; used by auth, logging, replies */
    rpcprog_t cb_prog;
    rpcvers_t cb_vers;
    rpcproc_t cb_proc;
    struct opaque_auth cb_cred;
    struct opaque_auth cb_verf; /* protocol specific - provided by client */
    /* avoid separate alloc/free */
    char rq_cred_body[MAX_AUTH_BYTES];  /* size is excessive */
};
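A hedged sketch of that cb_prog routing; the real dispatch lives in nfs_rpc_valid_NFS and nfs_rpc_process_request, and the handler names here are hypothetical (program numbers match the rpcinfo output in section 2):
#define NFS_PROGRAM    100003
#define RQUOTA_PROGRAM 100011

extern void handle_nfs(struct svc_req *req);    /* hypothetical */
extern void handle_rquota(struct svc_req *req); /* hypothetical */

static void route_request(struct svc_req *req)
{
    switch (req->rq_msg.cb_prog) {
    case NFS_PROGRAM:
        handle_nfs(req);
        break;
    case RQUOTA_PROGRAM:
        handle_rquota(req);
        break;
    default:
        /* unknown program: an RPC error reply would be sent here */
        break;
    }
}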
6. ioq (unfinished)
- svc_ioq_flushv
- svc_ioq_write
- svc_ioq_write_submit
The XDR structure
typedef struct rpc_xdr {
    const struct xdr_ops {
        /* get 4 unsigned bytes from underlying stream */
        bool (*x_getunit)(struct rpc_xdr *, uint32_t *);
        /* put 4 unsigned bytes to underlying stream */
        bool (*x_putunit)(struct rpc_xdr *, const uint32_t);
        /* get some bytes from " */
        bool (*x_getbytes)(struct rpc_xdr *, char *, u_int);
        /* put some bytes to " */
        bool (*x_putbytes)(struct rpc_xdr *, const char *, u_int);
        /* returns bytes off from beginning */
        u_int (*x_getpostn)(struct rpc_xdr *);
        /* lets you reposition the stream */
        bool (*x_setpostn)(struct rpc_xdr *, u_int);
        /* free private resources of this xdr_stream */
        void (*x_destroy)(struct rpc_xdr *);
        bool (*x_control)(struct rpc_xdr *, int, void *);
        /* new vector and refcounted interfaces */
        bool (*x_getbufs)(struct rpc_xdr *, xdr_uio *, u_int);
        bool (*x_putbufs)(struct rpc_xdr *, xdr_uio *, u_int);
    } *x_ops;
    void *x_public;     /* users' data */
    void *x_private;    /* pointer to private data */
    void *x_lib[2];     /* RPC library private */
    uint8_t *x_data;    /* advances one position for each element encoded */
    void *x_base;       /* base address */
    struct xdr_vio x_v; /* internal buffer supplied by a uv's v; its vio_wrap
                         * records the upper bound, beyond which an
                         * xdr_ioq_putunit operation is needed */
    u_int x_handy;      /* extra private word */
    u_int x_flags;      /* shared flags */
    enum xdr_op x_op;   /* operation; fast additional param */
} XDR;
xdrs->x_v.vio_base is the start of the buffer and xdrs->x_data is the current position, so the difference between the two gives the number of bytes already placed in the buffer.
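In code, following the fields shown above (a small sketch):
/* bytes already encoded into the current buffer */
static size_t xdr_bytes_used(const XDR *xdrs)
{
    return (size_t)(xdrs->x_data - xdrs->x_v.vio_base);
}

/* space remaining before the vio_wrap boundary forces a new buffer */
static size_t xdr_bytes_left(const XDR *xdrs)
{
    return (size_t)(xdrs->x_v.vio_wrap - xdrs->x_data);
}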
The xdr_ioq structure
Variables of this type are usually named xioq.
struct xdr_ioq {
    XDR xdrs[1];
    struct work_pool_entry ioq_wpe; /* used when recycling */
    struct poolq_entry ioq_s;       /* for insertion into rec->ioq.ioq_uv.uvqh.qh */
    pthread_cond_t ioq_cond;
    struct poolq_head *ioq_pool;    /* RDMA only; not considered here */
    struct xdr_ioq_uv_head ioq_uv;  /* only rec->ioq.ioq_uv is the real head;
                                     * ioq_uv in any other ioq is meaningless
                                     * (a rather stupid design) */
    uint64_t id;
};
struct xdr_ioq_uv_head {
    struct poolq_head uvqh;     /* the xioq queue */
    struct poolq_entry *(*uvq_fetch)(struct xdr_ioq *xioq,
                                     struct poolq_head *ioqh,
                                     char *comment, u_int count,
                                     u_int ioq_flags);
    size_t min_bsize;   /* multiple of pagesize */
    size_t max_bsize;   /* multiple of min_bsize */
    size_t plength;     /* sub-total of previous lengths, not including
                         * any length in this xdr_ioq_uv */
    u_int pcount;       /* fill index (0..m) in the current stream */
};
The xdr_ioq_uv structure
Variables of this type are usually named uv or data. u and v are two representations of the buffer; currently v (the iovec representation) is used, while u appears to be for RDMA and is not considered here.
struct xdr_ioq_uv
{
    struct poolq_entry uvq;
    /* spliced buffers, if any */
    struct xdr_uio u;
    /* Each xdr_ioq_uv can have a different kind of buffer or data source,
     * as indicated by the uio_flags, needing different release techniques.
     * Note: overloads uio_release with uio_p1 for pool.
     */
    struct xdr_vio v;   /* immediately follows u (uio_vio[0]) */
};
The xdr_vio structure
See the function xdr_ioq_uv_create.
/* XDR buffer vector descriptors */
typedef struct xdr_vio {
    uint8_t *vio_base;
    uint8_t *vio_head;  /* minimum vio_tail (header offset) */
    uint8_t *vio_tail;
    uint8_t *vio_wrap;  /* maximum vio_tail */
} xdr_vio;
/* vio_wrap >= vio_tail >= vio_head >= vio_base */
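The invariant partitions the buffer into three regions; a small sketch of the arithmetic:
/* reserved header space in front of the data */
static size_t vio_headroom(const xdr_vio *v)
{
    return (size_t)(v->vio_head - v->vio_base);
}

/* valid data currently in the buffer */
static size_t vio_length(const xdr_vio *v)
{
    return (size_t)(v->vio_tail - v->vio_head);
}

/* space still available before the wrap boundary */
static size_t vio_tailroom(const xdr_vio *v)
{
    return (size_t)(v->vio_wrap - v->vio_tail);
}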
Analysis of svc_vc_recv
- create an xioq and insert it into the queue on the rec
- create a uv and insert it into the xioq's queue
- xdr_ioq_reset fills in xioq->xdrs
xdr_ioq_uv //the buffer
xdr_ioq //the ioq
- xdr_ioq_uv_create() allocates the buffer and returns a struct xdr_ioq_uv
- xdr_ioq_reset //fill the ioq
- xdr_ioq_uv_reset //fill the XDR
- by the time svc_vc_recv returns, the data has been read into the uv
- svc_vc_decode decodes the XDR data into a struct rpc_msg
- XDR_FLAG_VIO
- xdr_ioq_uv_advance *
- xdr_ioq_uv_append
- svc_ioq_flushv
The svc_req structure
struct svc_req {
    SVCXPRT *rq_xprt;       /* associated transport */
    /* New with TI-RPC */
    char *rq_clntname;      /* read only client name */
    char *rq_svcname;       /* read only cooked service cred */
    XDR *rq_xdrs;
    void *rq_u1;            /* user data */
    void *rq_u2;            /* user data */
    uint64_t rq_cksum;
    /* Moved in N TI-RPC */
    struct SVCAUTH *rq_auth;    /* auth handle */
    void *rq_ap1;           /* auth private */
    void *rq_ap2;           /* auth private */
    /* avoid separate alloc/free */
    struct rpc_msg rq_msg;
    uint32_t rq_refcnt;
};
7. Sending replies
svc_vc_reply:
- xdr_ioq_create creates the IOQ; this IOQ's buffer comes from a newly created uv
- xdr_reply_encode
- SVCAUTH_WRAP, which internally calls ganesha's xdr_COMPOUND4res
- svc_ioq_write_now
8. Summary
- ntirpc creates channels for TCP listen and TCP accept
- xprts are created under a channel
- epoll events are registered for the xprts under a channel; each event is associated with a specific xprt
- when epoll fires, data is read from the xprt into XDR
- svc_vc_decode decodes the XDR data into an rpc_msg
- based on cb_prog in the rpc_msg, the request is routed to its handler