Socket套接字
套接字是支持TCP/IP網絡通信的基本操作單元，是進行TCP/IP通信的接口。套接字Socket可以看做是不同主機之間的進程進行雙向通信的端點，簡單地說就是通信雙方的一種約定，用套接字中的相關函數來完成通信過程。套接字Socket是連接應用程序和網絡驅動程序的橋梁：套接字Socket在應用程序中創建，通過綁定與網絡驅動建立關係。此後，應用程序送給套接字Socket的數據，由套接字Socket交給網絡驅動程序向網絡上發送出去。計算機從網絡上收到與該套接字Socket綁定的IP地址和端口號相關的數據後，由網絡驅動程序交給Socket，應用程序便可從該Socket中提取接收到的數據。網絡應用程序就是這樣通過Socket進行數據的發送與接收的。
操作系統區分不同應用程序進程間的網絡通信和連接，主要依靠3個參數：通信的目的IP地址、使用的傳輸層協議（TCP或UDP）和使用的端口號。
Socket = IP address + TCP/UDP + port
系統調用
對於Linux系統來說，它分為用戶態和內核態，用戶態只能訪問屬於用戶態的內存空間，內核態的內存空間對於用戶態來說是不可見的。系統調用則是指通過中斷來向內核發出請求，使用內核提供的某些服務。系統調用機制的核心，就是利用操作系統給用戶留下的一個特別開放的中斷來實現，在（32位x86的）Linux中為int 80h中斷。
socket創建
在用戶進程中使用如下socket函數進行socket系統調用，創建TCP類型的socket：
socket(AF_INET, SOCK_STREAM, 0)
AF_INET是指這是IPv4的協議族；SOCK_STREAM（流式socket，面向連接）是指這是TCP類型的socket，對應的UDP的socket類型標識為SOCK_DGRAM（數據報式socket，無連接）。該函數通過系統調用後，如果成功則會返回一個與socket相關聯的fd（int類型）。對應在系統調用裡面調用的是sys_socketcall，sys_socketcall幾乎是用戶進程所有socket操作函數的入口：
/** sys_socketcall (linux/syscalls.h)*/
/*
 * Prototype of the socketcall entry point: @call selects which socket
 * operation to run and @args points to a user-space array holding the
 * arguments for that operation.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args);
sys_socketcall的實現實際上是由SYSCALL_DEFINE2宏定義的：
/** SYSCALL_DEFINE2 (net/socket.c)*/
/*
 * socketcall multiplexer.
 *
 * @call: operation selector; must lie in 1..SYS_SENDMMSG.
 * @args: user-space pointer to the argument array for that operation.
 *
 * nargs[call] gives the number of argument bytes to copy from @args into
 * the local array a[]. After the audit hook has seen the arguments, the
 * call is dispatched to the matching sys_* handler.
 *
 * Returns -EINVAL when @call is out of range or the argument length does
 * not fit in a[], -EFAULT when the user copy fails, the audit hook's error
 * if it reports one, and otherwise the handler's result (held in an int).
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
unsigned long a[AUDITSC_ARGS];
unsigned long a0, a1;
int err;
unsigned int len;
if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
/* Re-bound call through array_index_nospec() before it indexes nargs[]. */
call = array_index_nospec(call, SYS_SENDMMSG + 1);
len = nargs[call];
if (len > sizeof(a))
return -EINVAL;
/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, len))
return -EFAULT;
/* The audit hook receives the argument count, not the byte length. */
err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
if (err)
return err;
a0 = a[0];
a1 = a[1];
switch (call) { // dispatch to a different handler depending on the value of call
case SYS_SOCKET: // counterpart of user-space socket(int domain, int type, int protocol): create a socket
err = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND: // bind the socket
err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT: // establish a connection on the socket
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_LISTEN:
err = sys_listen(a0, a1); // listen on the socket
break;
case SYS_ACCEPT: // accept on the socket; routed to sys_accept4 with flags 0
err = sys_accept4(a0, (struct sockaddr __user *)a1,
(int __user *)a[2], 0);
break;
case SYS_GETSOCKNAME:
err =
sys_getsockname(a0, (struct sockaddr __user *)a1,
(int __user *)a[2]);
break;
case SYS_GETPEERNAME:
err =
sys_getpeername(a0, (struct sockaddr __user *)a1,
(int __user *)a[2]);
break;
case SYS_SOCKETPAIR:
err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
break;
case SYS_SEND:
err = sys_send(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_SENDTO:
err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], a[5]);
break;
case SYS_RECV:
err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_RECVFROM:
err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4],
(int __user *)a[5]);
break;
case SYS_SHUTDOWN:
err = sys_shutdown(a0, a1);
break;
case SYS_SETSOCKOPT:
err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
break;
case SYS_GETSOCKOPT:
err =
sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
(int __user *)a[4]);
break;
case SYS_SENDMSG:
err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]);
break;
case SYS_SENDMMSG:
err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
break;
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]);
break;
case SYS_RECVMMSG:
err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
(struct timespec __user *)a[4]);
break;
case SYS_ACCEPT4:
err = sys_accept4(a0, (struct sockaddr __user *)a1,
(int __user *)a[2], a[3]);
break;
default:
err = -EINVAL;
break;
}
return err;
}
在SYSCALL_DEFINE2定義的函數中，通過call參數判斷並進入到不同的函數中處理。以創建socket為例，其實際處理是在sys_socket中，它也是一個系統調用，對應的是SYSCALL_DEFINE3：
/** SYSCALL_DEFINE3 (net/socket.c)*/
/*
 * socket() system call.
 *
 * Splits @type into the socket type proper (the SOCK_TYPE_MASK bits) and
 * the creation flags, rejects any flag other than SOCK_CLOEXEC and
 * SOCK_NONBLOCK, creates the socket with sock_create() and maps it to a
 * file descriptor with sock_map_fd().
 *
 * Returns the value from sock_map_fd() on success, -EINVAL for unknown
 * flags, or the negative error from sock_create()/sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int ret;

	TRACE_OUT(tr_sock, ("family=%d type=%d protocol=%d\n", family, type, protocol));

	/* Compile-time consistency checks on the SOCK_* constants. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside the type mask is a creation flag. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Key step: build the struct socket itself. */
	ret = sock_create(family, type, protocol, &sock);
	if (ret < 0)
		return ret;

	/* Map the socket to a descriptor; release the socket if that fails. */
	ret = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (ret < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return ret;
}
SYSCALL_DEFINE3中主要判斷了設置的socket類型type，通過調用sock_create創建socket結構，再使用sock_map_fd將socket結構映射為文件描述符並返回。socket結構體如下所示：
/* General socket object as seen by the socket layer. */
struct socket {
socket_state state; // connection state: SS_CONNECTING, SS_CONNECTED, etc.
kmemcheck_bitfield_begin(type);
short type; // socket type: SOCK_STREAM, SOCK_DGRAM, etc.
kmemcheck_bitfield_end(type);
unsigned long flags; // flag bits: SOCK_ASYNC_NOSPACE (whether the send queue is full), etc.
struct socket_wq __rcu *wq; // wait queue
struct file *file; // the VFS file pointer that corresponds to this socket
struct sock *sk; // network-layer representation of the socket; where the protocol work is really done
const struct proto_ops *ops; // socket operation table: bind, connect, accept, etc.
};
socket結構體中定義了socket的基本狀態、類型、標誌、等待隊列、文件指針、操作函數集等；利用sock結構，將socket操作與真正處理網絡協議相關的事務分離。
sock_create函數調用__sock_create：
/** sock_create (net/socket.c)*/
/*
 * Create a socket in the calling task's network namespace.
 *
 * @family, @type, @protocol: forwarded unchanged to __sock_create().
 * @res: where __sock_create() stores the new socket.
 *
 * Thin wrapper: passes current->nsproxy->net_ns as the namespace and 0 as
 * the kern argument, and returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
__sock_create函數如下：
/** __sock_create (net/socket.c)*/
/*
 * Allocate a socket and let the protocol family initialise it.
 *
 * @net: network namespace handed to the family's ->create.
 * @family: protocol family; must be in [0, NPROTO).
 * @type: socket type; must be in [0, SOCK_MAX).
 * @protocol: forwarded to the security hooks and to ->create.
 * @res: receives the new socket on success only.
 * @kern: forwarded to the security hooks and to ->create
 *        (sock_create() passes 0).
 *
 * Returns 0 on success. Errors visible here: -EAFNOSUPPORT (family out of
 * range, not registered, or a module reference could not be taken),
 * -EINVAL (type out of range), -ENFILE (sock_alloc() failed), or whatever
 * the security hooks or ->create return.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
/* Default error for both "goto out_release" paths below. */
err = -EAFNOSUPPORT;
if (!pf) {
TRACE_OUT(tr_sock, ("net_proto_family is NULL"));
goto out_release;
}
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
/* Core step: call the protocol family's create function to initialise the socket */
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
/* Error labels fall through: each one undoes one more step than the next. */
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
/* NOTE(review): ops is cleared before sock_release(), presumably so the release path does not call into the family's ops - confirm. */
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
/* Reached with the RCU read lock still held, so drop it before releasing. */
out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
其中核心的處理為pf->create(net, sock, protocol, kern)，即調用協議簇中的create函數。IPv4的定義在af_inet.c文件中：
/** (net/ipv4/af_inet.c)*/
/*
 * Protocol-family descriptor for PF_INET. Its ->create hook, the one
 * invoked as pf->create() in __sock_create(), is inet_create.
 */
static const struct net_proto_family inet_family_ops = {
	.owner	= THIS_MODULE,
	.family	= PF_INET,
	.create	= inet_create,
};
可以看到.create實際上指向的是inet_create函數，socket初始化的具體實現就在inet_create中進行。