Linux 協議棧分析 socket

Linux.協議棧分析.socket

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

{

int retval;

struct socket *sock;

int flags;

/* Check the SOCK_* constants for consistency. */

BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

flags = type & ~SOCK_TYPE_MASK;

if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

1266

1267

1268

1269

1270

1271

1272

1273

1274

1275

1276

1277

1278

1279

1280

1281

1282

1283

1284

1285

1286

1287

1288

1289

1290

1291

1292

1293

1294

1295

1296

1297

1298

1299

1300

1301

/*
 * socket(2) system call entry.
 *
 * @type carries the socket type in its SOCK_TYPE_MASK bits; the remaining
 * bits may only be SOCK_CLOEXEC and/or SOCK_NONBLOCK.  Returns -EINVAL for
 * any other flag bit, a negative error from sock_create()/sock_map_fd() on
 * failure, and otherwise whatever sock_map_fd() returned.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;

  /* Check the SOCK_* constants for consistency. */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

  /* Separate the flag bits from the bare socket type. */
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    return -EINVAL;
  type &= SOCK_TYPE_MASK;

  /* Where the two constants differ, map SOCK_NONBLOCK onto O_NONBLOCK. */
  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

  retval = sock_create(family, type, protocol, &sock);
  if (retval < 0)
    goto out;

  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
  if (retval < 0)
    goto out_release;

out:
  /* It may be already another descriptor 8) Not kernel problem. */
  return retval;

out_release:
  /* No descriptor could be attached: release the socket created above. */
  sock_release(sock);
  return retval;
}

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

{

int retval;

struct socket *sock;

int flags;

/* Check the SOCK_* constants for consistency. */

BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

flags = type & ~SOCK_TYPE_MASK;

if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

return -EINVAL;

type &= SOCK_TYPE_MASK;

if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))

flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

retval = sock_create(family, type, protocol, &sock);

if (retval < 0)

goto out;

2011 年 02 月 28 日 / puppy 發表於 Uncategorized / 尚無評論

通過查看socket的幫助手冊可以得到socket的定義形式爲：

int socket(int domain, int type, int protocol);

int socket(int domain, int type, int protocol);

domain的有效值如下:

       AF_UNIX, AF_LOCAL   Local communication              unix(7)
       AF_INET             IPv4 Internet protocols          ip(7)
       AF_INET6            IPv6 Internet protocols          ipv6(7)
       AF_IPX              IPX - Novell protocols
       AF_NETLINK          Kernel user interface device     netlink(7)
       AF_X25              ITU-T X.25 / ISO-8208 protocol   x25(7)
       AF_AX25             Amateur radio AX.25 protocol
       AF_ATMPVC           Access to raw ATM PVCs
       AF_APPLETALK        Appletalk                        ddp(7)
       AF_PACKET           Low level packet interface       packet(7)

而type的取值範圍爲:

       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based
                       byte  streams.  An out-of-band data transmission mecha‐
                       nism may be supported.
       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages
                       of a fixed maximum length).
       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-
                       based data transmission path  for  datagrams  of  fixed
                       maximum  length;  a  consumer  is  required  to read an
                       entire packet with each input system call.
       SOCK_RAW        Provides raw network protocol access.
       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐
                       antee ordering.
       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see
                       packet(7).

而type的取值範圍爲:

       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based
                       byte  streams.  An out-of-band data transmission mecha‐
                       nism may be supported.
       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages
                       of a fixed maximum length).
       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-
                       based data transmission path  for  datagrams  of  fixed
                       maximum  length;  a  consumer  is  required  to read an
                       entire packet with each input system call.
       SOCK_RAW        Provides raw network protocol access.
       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐
                       antee ordering.
       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see
                       packet(7).

而在內核版本2.6.27之後，type除了取上述值之一以外，還可以再按位OR以下標誌（即把type中相應的二進制位置爲1）來設定socket的附加屬性。這一點可以在socket進入內核的源代碼中得到證實。

       SOCK_NONBLOCK   Set  the  O_NONBLOCK  file  status flag on the new open
                       file description.  Using this flag saves extra calls to
                       fcntl(2) to achieve the same result.
       SOCK_CLOEXEC    Set the close-on-exec (FD_CLOEXEC) flag on the new file
                       descriptor.  See the description of the O_CLOEXEC  flag
                       in open(2) for reasons why this may be useful.

protocol一般爲0。
socket函數經過前述的方式進入內核後會最終由sys_socket(net/socket.c)來完成。

/*
 * socket(2) system call.
 *
 * The SOCK_TYPE_MASK bits of @type select the socket type; the remaining
 * bits may only be SOCK_CLOEXEC and/or SOCK_NONBLOCK, anything else yields
 * -EINVAL.  On success the value returned by sock_map_fd() is returned;
 * otherwise a negative error code from sock_create() or sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  /* Split @type into its flag bits and the bare socket type. */
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    return -EINVAL;
  type &= SOCK_TYPE_MASK;
 
  /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
  retval = sock_create(family, type, protocol, &sock);
  if (retval < 0)
    goto out;
 
  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
  if (retval < 0)
    goto out_release;
 
out:
  /* It may be already another descriptor 8) Not kernel problem. */
  return retval;
 
out_release:
  /* Mapping to a descriptor failed: drop the socket created above. */
  sock_release(sock);
  return retval;
}

1278~1281行就是取得type的值並檢查是否合法。

1278~1281行就是取得type的值並檢查是否合法。
我們知道socket對於用戶而言就是一個已經打開的特殊文件，而內核則爲插口(socket)定義了一種特殊的文件類型，形成特殊的文件系統sockfs(net/socket.c)。sys_socket中調用了兩個函數sock_create和sock_map_fd，可以看到這兩個函數共用一個sock參數，這便是內核用來管理socket的數據結構，而sock_map_fd顯然是爲用戶提供已經打開的文件號。
sockfs的建立過程省略，sockfs的定義如下：

301

302

303

304

305

306

307

/* vfsmount pointer kept for sockfs; tagged __read_mostly. */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor whose name is "sockfs". */
static struct file_system_type sock_fs_type = {
  .name    = "sockfs",
  .get_sb  = sockfs_get_sb,
  .kill_sb = kill_anon_super,
};

而所謂的通過socket函數創建一個插口，就是在sockfs中創建一個特殊文件，或者說是一個結點，並爲實現相應插口功能建立起一整套數據結構。所以首先通過sock_create創建一個struct socket數據結構，然後通過sock_map_fd把它映射到一個已經打開的文件上。在分析sock_create和sock_map_fd之前先看看struct socket的定義。

我們知道socket對於用戶而言就是一個已經打開的特殊文件，而內核則爲插口(socket)定義了一種特殊的文件類型，形成特殊的文件系統sockfs(net/socket.c)。sys_socket中調用了兩個函數sock_create和sock_map_fd，可以看到這兩個函數共用一個sock參數，這便是內核用來管理socket的數據結構，而sock_map_fd顯然是爲用戶提供已經打開的文件號。
sockfs的建立過程省略，sockfs的定義如下：

/* vfsmount pointer kept for sockfs; tagged __read_mostly. */
static struct vfsmount *sock_mnt __read_mostly;
 
/*
 * Filesystem type descriptor for the socket pseudo filesystem: it is named
 * "sockfs", obtains its superblock through sockfs_get_sb and tears it down
 * with kill_anon_super.
 */
static struct file_system_type sock_fs_type = {
  .name =    "sockfs",
  .get_sb =  sockfs_get_sb,
  .kill_sb =  kill_anon_super,
};

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 */
struct socket {
  socket_state    state;
 
  /* @type is bracketed by kmemcheck bitfield annotations. */
  kmemcheck_bitfield_begin(type);
  short      type;
  kmemcheck_bitfield_end(type);
 
  unsigned long    flags;
  /*
   * Please keep fasync_list & wait fields in the same cache line
   */
  struct fasync_struct  *fasync_list;
  wait_queue_head_t  wait;
 
  struct file    *file;
  struct sock    *sk;
  /* Operation table for this socket's protocol. */
  const struct proto_ops  *ops;
};

/*
 * struct proto_ops - table of socket operations
 *
 * A struct socket points at one of these through its ->ops member.  Apart
 * from @family and @owner (the module providing the callbacks), every member
 * is a function pointer taking the socket it operates on.
 */
struct proto_ops {
  int    family;
  struct module  *owner;
  int    (*release)   (struct socket *sock);
  int    (*bind)       (struct socket *sock,
              struct sockaddr *myaddr,
              int sockaddr_len);
  int    (*connect)   (struct socket *sock,
              struct sockaddr *vaddr,
              int sockaddr_len, int flags);
  int    (*socketpair)(struct socket *sock1,
              struct socket *sock2);
  int    (*accept)    (struct socket *sock,
              struct socket *newsock, int flags);
  int    (*getname)   (struct socket *sock,
              struct sockaddr *addr,
              int *sockaddr_len, int peer);
  unsigned int  (*poll)       (struct file *file, struct socket *sock,
              struct poll_table_struct *wait);
  int    (*ioctl)     (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int    (*listen)    (struct socket *sock, int len);
  int    (*shutdown)  (struct socket *sock, int flags);
  int    (*setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*compat_setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*compat_getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len);
  int    (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len,
              int flags);
  int    (*mmap)       (struct file *file, struct socket *sock,
              struct vm_area_struct * vma);
  ssize_t    (*sendpage)  (struct socket *sock, struct page *page,
              int offset, size_t size, int flags);
  ssize_t   (*splice_read)(struct socket *sock,  loff_t *ppos,
               struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};

接下來分析sock_create(net/socket.c)，sock_create會調用__sock_create。

/*
 * __sock_create - allocate a socket and have its address family set it up
 * @net: network namespace handed to the family's ->create()
 * @family: address family; must lie in [0, NPROTO)
 * @type: socket type; must lie in [0, SOCK_MAX)
 * @protocol: protocol number, passed through to ->create()
 * @res: receives the new socket on success
 * @kern: forwarded unchanged to the security hooks
 *
 * Returns 0 on success or a negative error code.  Every failure that occurs
 * after sock_alloc() has succeeded releases the socket before returning.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
       struct socket **res, int kern)
{
  int err;
  struct socket *sock;
  const struct net_proto_family *pf;
 
  /*
   *      Check protocol is in range
   */
  if (family < 0 || family >= NPROTO)
    return -EAFNOSUPPORT;
  if (type < 0 || type >= SOCK_MAX)
    return -EINVAL;
 
  /* Compatibility.
 
     This uglymoron is moved from INET layer to here to avoid
     deadlock in module load.
   */
  if (family == PF_INET && type == SOCK_PACKET) {
    /* Log the notice only once, then redirect the request to PF_PACKET. */
    static int warned;
    if (!warned) {
      warned = 1;
      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
    }
    family = PF_PACKET;
  }
 
  err = security_socket_create(family, type, protocol, kern);
  if (err)
    return err;
 
  /*
   *  Allocate the socket and allow the family to set things up. if
   *  the protocol is 0, the family is instructed to select an appropriate
   *  default.
   */
  sock = sock_alloc();
  if (!sock) {
    if (net_ratelimit())
      printk(KERN_WARNING "socket: no more sockets\n");
    return -ENFILE;  /* Not exactly a match, but its the
           closest posix thing */
  }
 
  sock->type = type;
 
#ifdef CONFIG_MODULES
  /* Attempt to load a protocol module if the find failed.
   *
   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
   * requested real, full-featured networking support upon configuration.
   * Otherwise module support will break!
   */
  if (net_families[family] == NULL)
    request_module("net-pf-%d", family);
#endif
 
  /* Look the family up under RCU; it is pinned below via its module. */
  rcu_read_lock();
  pf = rcu_dereference(net_families[family]);
  err = -EAFNOSUPPORT;
  if (!pf)
    goto out_release;
 
  /*
   * We will call the ->create function, that possibly is in a loadable
   * module, so we have to bump that loadable module refcnt first.
   */
  if (!try_module_get(pf->owner))
    goto out_release;
 
  /* Now protected by module ref count */
  rcu_read_unlock();
 
  err = pf->create(net, sock, protocol);
  if (err < 0)
    goto out_module_put;
 
  /*
   * Now to bump the refcnt of the [loadable] module that owns this
   * socket at sock_release time we decrement its refcnt.
   */
  if (!try_module_get(sock->ops->owner))
    goto out_module_busy;
 
  /*
   * Now that we're done with the ->create function, the [loadable]
   * module can have its refcnt decremented
   */
  module_put(pf->owner);
  err = security_socket_post_create(sock, family, type, protocol, kern);
  if (err)
    goto out_sock_release;
  *res = sock;
 
  return 0;
 
out_module_busy:
  err = -EAFNOSUPPORT;
out_module_put:
  /*
   * NOTE(review): ops is cleared before sock_release(), presumably so the
   * release path does not go through the family's ops - confirm against
   * sock_release().
   */
  sock->ops = NULL;
  module_put(pf->owner);
out_sock_release:
  sock_release(sock);
  return err;
 
out_release:
  /* Reached with the RCU read lock still held. */
  rcu_read_unlock();
  goto out_sock_release;
}
 
/*
 * sock_create - create a socket in the calling task's network namespace
 *
 * Thin wrapper around __sock_create() that supplies
 * current->nsproxy->net_ns as the namespace and 0 for @kern.
 * Returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
  return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

1150~1171行做的很簡單，不過是參數檢查。
接下來的security_socket_create以及後面的security_socket_post_create，在沒有開啓CONFIG_SECURITY_NETWORK時，都是定義在/include/linux/security.h中的空函數：

/* Stub hook: performs no check and always returns 0. */
static inline int security_socket_create(int family, int type,
           int protocol, int kern)
{
  return 0;
}
/* Stub hook: ignores the new socket and always returns 0. */
static inline int security_socket_post_create(struct socket *sock,
                int family,
                int type,
                int protocol, int kern)
{
  return 0;
}

1182行的sock_alloc的代碼如下：

/*
 * sock_alloc - allocate a socket together with its inode
 *
 * Obtains a new inode on the superblock of sock_mnt and returns the
 * struct socket that SOCKET_I() associates with it.  Returns NULL when no
 * inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
  struct inode *inode;
  struct socket *sock;
 
  inode = new_inode(sock_mnt->mnt_sb);
  if (!inode)
    return NULL;
 
  sock = SOCKET_I(inode);
 
  kmemcheck_annotate_bitfield(sock, type);
  /* Mark the inode as a socket, owned by the caller's fsuid/fsgid. */
  inode->i_mode = S_IFSOCK | S_IRWXUGO;
  inode->i_uid = current_fsuid();
  inode->i_gid = current_fsgid();
 
  /* Account for the new socket in the per-cpu sockets_in_use counter. */
  percpu_add(sockets_in_use, 1);
  return sock;
}

其中的new_inode是在/fs/inode.c中定義

/*
 * alloc_inode - allocate and initialise an inode for @sb
 *
 * Uses the superblock's own ->alloc_inode hook when it provides one and the
 * generic inode_cachep slab otherwise.  Returns NULL if the allocation fails
 * or inode_init_always() reports an error; in the latter case the inode is
 * freed again through ->destroy_inode or kmem_cache_free().
 */
static struct inode *alloc_inode(struct super_block *sb)
{
  struct inode *inode;
 
  if (sb->s_op->alloc_inode)
    inode = sb->s_op->alloc_inode(sb);
  else
    inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
  if (!inode)
    return NULL;
 
  if (unlikely(inode_init_always(sb, inode))) {
    /* Undo the allocation with the counterpart of whichever path made it. */
    if (inode->i_sb->s_op->destroy_inode)
      inode->i_sb->s_op->destroy_inode(inode);
    else
      kmem_cache_free(inode_cachep, inode);
    return NULL;
  }
 
  return inode;
}

/*
 * new_inode - obtain a fresh inode for @sb
 *
 * Allocates the inode with alloc_inode() and, under inode_lock, adds it to
 * the inode lists, assigns the next value of a function-local counter as
 * i_ino and clears i_state.  Returns NULL if the allocation failed.
 */
struct inode *new_inode(struct super_block *sb)
{
  /*
   * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
   * error if st_ino won't fit in target struct field. Use 32bit counter
   * here to attempt to avoid that.
   */
  static unsigned int last_ino;
  struct inode *inode;
 
  spin_lock_prefetch(&inode_lock);
 
  inode = alloc_inode(sb);
  if (inode) {
    spin_lock(&inode_lock);
    __inode_add_to_lists(sb, NULL, inode);
    inode->i_ino = ++last_ino;
    inode->i_state = 0;
    spin_unlock(&inode_lock);
  }
  return inode;
}
EXPORT_SYMBOL(new_inode);

可以看出new_inode會調用alloc_inode分配inode，而alloc_inode會調用sockfs在VFS中註冊的相應的函數來處理，那這個函數是什麼呢？先來看一看/net/socket.c

/*
 * sock_alloc_inode - ->alloc_inode hook listed in sockfs_ops
 *
 * Allocates a struct socket_alloc from sock_inode_cachep, resets the
 * embedded socket to an unconnected, empty state and returns the address of
 * the embedded inode.  Returns NULL if the allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
  struct socket_alloc *ei;
 
  ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
  if (!ei)
    return NULL;
  init_waitqueue_head(&ei->socket.wait);
 
  ei->socket.fasync_list = NULL;
  ei->socket.state = SS_UNCONNECTED;
  ei->socket.flags = 0;
  ei->socket.ops = NULL;
  ei->socket.sk = NULL;
  ei->socket.file = NULL;
 
  /* Callers receive the inode half; SOCKET_I() maps it back to the socket. */
  return &ei->vfs_inode;
}

/*
 * Superblock operations for sockfs: inodes are allocated and destroyed by
 * the socket-specific helpers, statfs uses simple_statfs.
 */
static const struct super_operations sockfs_ops = {
  .alloc_inode =  sock_alloc_inode,
  .destroy_inode =sock_destroy_inode,
  .statfs =  simple_statfs,
};

爲幫助理解列出struct socket_alloc 結構體的定義。

/*
 * A socket and its VFS inode, allocated together as one object so that
 * either member can be reached from the other via container_of().
 */
struct socket_alloc {
  struct socket socket;
  struct inode vfs_inode;
};
 
/*
 * SOCKET_I - map an inode to the socket sharing its struct socket_alloc
 * @inode: must be the vfs_inode member of a struct socket_alloc
 */
static inline struct socket *SOCKET_I(struct inode *inode)
{
  return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

可以看到這個函數其實就是sock_alloc_inode,該函數分配了一個struct socket_alloc類型的結構體，然後返回這個結構體中的一個成員變量vfs_inode的地址，可以看出來這就是一個inode結構。然後就回到了sock_alloc函數的第489行，通過SOCKET_I獲得與vfs_inode同在socket_alloc結構體中的成員socket的地址。然後程序返回到__sock_create的1190行。

1192開始的代碼說明，如果編譯內核開啓了CONFIG_MODULES也就是內核模塊的選項就先檢查內核現在是否有支持由family(就是domain)所指定的網域的代碼，如果沒有則通過request_module來安裝。

說到這裏就先看看1204行的net_families這個數組，很明顯它是控制和操作各個網域的一個控制結構體的集合，通過變量pf可以發現它的類型爲struct net_proto_family(/include/linux/net.h)

/*
 * Per-address-family descriptor.
 * @family: address family number
 * @create: called to set up a freshly allocated socket of this family
 * @owner:  module providing @create
 */
struct net_proto_family {
  int    family;
  int    (*create)(struct net *net, struct socket *sock, int protocol);
  struct module  *owner;
};

然後1219行通過pf調用相應網域的create的函數，可以很簡單地得出對於AF_UNIX, AF_INET, AF_INET6, AF_PACKET這些所對應的create函數肯定不一樣。接下來我們以AF_INET爲例說明。在/net/ipv4/af_inet.c中

/* Family descriptor for PF_INET: sockets are set up by inet_create(). */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};

由936行可以得出，對於AF_INET，其create函數爲inet_create，定義於同一文件中。

/*
 * inet_create - set up a freshly allocated socket as a PF_INET socket
 * @net: network namespace the socket is created in
 * @sock: socket to initialise; sock->type selects the inetsw[] list searched
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 *
 * Searches inetsw[sock->type] for an entry matching @protocol, requesting
 * protocol modules (at most twice) when nothing matches.  On a match it
 * installs the entry's ops in @sock, allocates the struct sock, fills in the
 * inet defaults and finally calls the protocol's ->init() if it has one.
 *
 * Returns 0 on success or a negative error code.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
  struct sock *sk;
  struct inet_protosw *answer;
  struct inet_sock *inet;
  struct proto *answer_prot;
  unsigned char answer_flags;
  char answer_no_check;
  int try_loading_module = 0;
  int err;
 
  if (unlikely(!inet_ehash_secret))
    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
      build_ehash_secret();
 
  sock->state = SS_UNCONNECTED;
 
  /* Look for the requested type/protocol pair. */
lookup_protocol:
  err = -ESOCKTNOSUPPORT;
  rcu_read_lock();
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
    /* err is 0 only when the loop is left through one of the breaks. */
    err = 0;
    /* Check the non-wild match. */
    if (protocol == answer->protocol) {
      if (protocol != IPPROTO_IP)
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) {
        protocol = answer->protocol;
        break;
      }
      if (IPPROTO_IP == answer->protocol)
        break;
    }
    err = -EPROTONOSUPPORT;
  }
 
  if (unlikely(err)) {
    if (try_loading_module < 2) {
      rcu_read_unlock();
      /*
       * Be more specific, e.g. net-pf-2-proto-132-type-1
       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
       */
      if (++try_loading_module == 1)
        request_module("net-pf-%d-proto-%d-type-%d",
                 PF_INET, protocol, sock->type);
      /*
       * Fall back to generic, e.g. net-pf-2-proto-132
       * (net-pf-PF_INET-proto-IPPROTO_SCTP)
       */
      else
        request_module("net-pf-%d-proto-%d",
                 PF_INET, protocol);
      goto lookup_protocol;
    } else
      goto out_rcu_unlock;
  }
 
  err = -EPERM;
  if (answer->capability > 0 && !capable(answer->capability))
    goto out_rcu_unlock;
 
  err = -EAFNOSUPPORT;
  if (!inet_netns_ok(net, protocol))
    goto out_rcu_unlock;
 
  /* Copy what is needed from the entry before leaving the RCU section. */
  sock->ops = answer->ops;
  answer_prot = answer->prot;
  answer_no_check = answer->no_check;
  answer_flags = answer->flags;
  rcu_read_unlock();
 
  WARN_ON(answer_prot->slab == NULL);
 
  err = -ENOBUFS;
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;
 
  err = 0;
  sk->sk_no_check = answer_no_check;
  if (INET_PROTOSW_REUSE & answer_flags)
    sk->sk_reuse = 1;
 
  inet = inet_sk(sk);
  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
  /* For raw sockets the protocol number is stored in inet->num. */
  if (SOCK_RAW == sock->type) {
    inet->num = protocol;
    if (IPPROTO_RAW == protocol)
      inet->hdrincl = 1;
  }
 
  if (ipv4_config.no_pmtu_disc)
    inet->pmtudisc = IP_PMTUDISC_DONT;
  else
    inet->pmtudisc = IP_PMTUDISC_WANT;
 
  inet->id = 0;
 
  sock_init_data(sock, sk);
 
  sk->sk_destruct     = inet_sock_destruct;
  sk->sk_protocol     = protocol;
  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
 
  inet->uc_ttl  = -1;
  inet->mc_loop  = 1;
  inet->mc_ttl  = 1;
  inet->mc_all  = 1;
  inet->mc_index  = 0;
  inet->mc_list  = NULL;
 
  sk_refcnt_debug_inc(sk);
 
  if (inet->num) {
    /* It assumes that any protocol which allows
     * the user to assign a number at socket
     * creation time automatically
     * shares.
     */
    inet->sport = htons(inet->num);
    /* Add to protocol hash chains. */
    sk->sk_prot->hash(sk);
  }
 
  /* Protocol-specific initialisation; the sock is released if it fails. */
  if (sk->sk_prot->init) {
    err = sk->sk_prot->init(sk);
    if (err)
      sk_common_release(sk);
  }
out:
  return err;
out_rcu_unlock:
  rcu_read_unlock();
  goto out;
}

第283到325行就是通過type和protocol從inetsw中找出對應的struct inet_protosw結構體。inetsw定義於net/ipv4/af_inet.c中：

/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * It holds one list head per socket type (indexed by type, SOCK_MAX slots).
 * inetsw_lock is the spinlock declared alongside the table.
 */
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);

而對於struct inet_protosw是在/include/net/protocol.h中定義

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
  /* List linkage for this entry. */
  struct list_head list;
 
        /* These two fields form the lookup key.  */
  unsigned short   type;     /* This is the 2nd argument to socket(2). */
  unsigned short   protocol; /* This is the L4 protocol number.  */
 
  /* Protocol descriptor and socket operation table for this entry. */
  struct proto   *prot;
  const struct proto_ops *ops;
 
  int              capability; /* Which (if any) capability do
              * we need to use this socket
              * interface?
                                      */
  char             no_check;   /* checksum on rcv/xmit/none? */
  unsigned char   flags;      /* See INET_PROTOSW_* below.  */
};

inetsw其實就是Linux內核中典型的鏈表數組：數組按type索引，每個元素是一個鏈表頭。inetsw中的鏈表項是通過inet_register_protosw加入的：

/*
 * inet_register_protosw - add a socket interface to inetsw[]
 * @p: entry to insert; p->type selects the list
 *
 * Runs under inetsw_lock.  The entry is refused, with a KERN_ERR message,
 * when p->type is not below SOCK_MAX or when a permanent entry
 * (INET_PROTOSW_PERMANENT) with the same protocol number is already on the
 * list.  Otherwise @p is inserted directly after the last permanent entry,
 * or at the head of the list if there is none.
 */
void inet_register_protosw(struct inet_protosw *p)
{
  struct list_head *lh;
  struct inet_protosw *answer;
  int protocol = p->protocol;
  struct list_head *last_perm;
 
  spin_lock_bh(&inetsw_lock);
 
  if (p->type >= SOCK_MAX)
    goto out_illegal;
 
  /* If we are trying to override a permanent protocol, bail. */
  answer = NULL;
  last_perm = &inetsw[p->type];
  list_for_each(lh, &inetsw[p->type]) {
    answer = list_entry(lh, struct inet_protosw, list);
 
    /* Check only the non-wild match. */
    if (INET_PROTOSW_PERMANENT & answer->flags) {
      if (protocol == answer->protocol)
        break;
      last_perm = lh;
    }
 
    /* Not a conflict: answer stays NULL unless the loop breaks above. */
    answer = NULL;
  }
  if (answer)
    goto out_permanent;
 
  /* Add the new entry after the last permanent entry if any, so that
   * the new entry does not override a permanent entry when matched with
   * a wild-card protocol. But it is allowed to override any existing
   * non-permanent entry.  This means that when we remove this entry, the
   * system automatically returns to the old behavior.
   */
  list_add_rcu(&p->list, last_perm);
out:
  spin_unlock_bh(&inetsw_lock);
 
  return;
 
out_permanent:
  printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
         protocol);
  goto out;
 
out_illegal:
  printk(KERN_ERR
         "Ignoring attempt to register invalid socket type %d.\n",
         p->type);
  goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

對於inet_register_protosw的調用是在inet_init中的第1593行進行的。

/*
 * inet_init - IPv4 initialisation, run as an fs_initcall
 *
 * Registers the TCP, UDP and raw protos, announces the PF_INET family,
 * adds the base protocols, fills inetsw[] from inetsw_array and then
 * initialises the individual IPv4 subsystems in turn.
 *
 * Returns 0 on success, or the proto_register() error after unregistering
 * the protos that had already been registered.  Failures of the later steps
 * are only logged, except icmp_init(), which panics.
 */
static int __init inet_init(void)
{
  /* Used only inside sizeof() in the build-time check below. */
  struct sk_buff *dummy_skb;
  struct inet_protosw *q;
  struct list_head *r;
  int rc = -EINVAL;
 
  BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
  rc = proto_register(&tcp_prot, 1);
  if (rc)
    goto out;
 
  rc = proto_register(&udp_prot, 1);
  if (rc)
    goto out_unregister_tcp_proto;
 
  rc = proto_register(&raw_prot, 1);
  if (rc)
    goto out_unregister_udp_proto;
 
  /*
   *  Tell SOCKET that we are alive...
   */
 
  (void)sock_register(&inet_family_ops);
 
#ifdef CONFIG_SYSCTL
  ip_static_sysctl_init();
#endif
 
  /*
   *  Add all the base protocols.
   */
 
  if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
  if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
  if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
#ifdef CONFIG_IP_MULTICAST
  if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif
 
  /* Register the socket-side information for inet_create. */
  for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
    INIT_LIST_HEAD(r);
 
  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
    inet_register_protosw(q);
 
  /*
   *  Set the ARP module up
   */
 
  arp_init();
 
  /*
   *  Set the IP module up
   */
 
  ip_init();
 
  tcp_v4_init();
 
  /* Setup TCP slab cache for open requests. */
  tcp_init();
 
  /* Setup UDP memory threshold */
  udp_init();
 
  /* Add UDP-Lite (RFC 3828) */
  udplite4_register();
 
  /*
   *  Set the ICMP layer up
   */
 
  if (icmp_init() < 0)
    panic("Failed to create the ICMP control socket.\n");
 
  /*
   *  Initialise the multicast router
   */
#if defined(CONFIG_IP_MROUTE)
  if (ip_mr_init())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
  /*
   *  Initialise per-cpu ipv4 mibs
   */
 
  if (init_ipv4_mibs())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
 
  ipv4_proc_init();
 
  ipfrag_init();
 
  dev_add_pack(&ip_packet_type);
 
  rc = 0;
out:
  return rc;
  /* Unwind in reverse order of registration; the labels fall through. */
out_unregister_udp_proto:
  proto_unregister(&udp_prot);
out_unregister_tcp_proto:
  proto_unregister(&tcp_prot);
  goto out;
}
 
fs_initcall(inet_init);

從1592行可以看出初始化inetsw是用的inetsw_array數組，再看看inetsw_array數組。

/*
 * Socket operations used by the SOCK_STREAM/IPPROTO_TCP entry of
 * inetsw_array.  socketpair and mmap are wired to the sock_no_* helpers.
 */
const struct proto_ops inet_stream_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_stream_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = inet_accept,
  .getname     = inet_getname,
  .poll       = tcp_poll,
  .ioctl       = inet_ioctl,
  .listen       = inet_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = tcp_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = tcp_sendpage,
  .splice_read     = tcp_splice_read,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
 
/*
 * Socket operations used by the SOCK_DGRAM/IPPROTO_UDP entry of
 * inetsw_array.  socketpair, accept, listen and mmap are wired to the
 * sock_no_* helpers.
 */
const struct proto_ops inet_dgram_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = udp_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
 
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll
 *
 * (.poll is datagram_poll here.)  Used by the SOCK_RAW entry of
 * inetsw_array; file-local, not exported.
 */
static const struct proto_ops inet_sockraw_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = datagram_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
 
/*
 * Family descriptor for PF_INET, handed to sock_register() by inet_init();
 * sockets of this family are set up by inet_create().
 */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};
 
/*
 * The built-in IPv4 socket interfaces.  At startup every element of
 * inetsw_array[] is inserted into the linked lists of inetsw.
 */
static struct inet_protosw inetsw_array[] =
{
  {
    .type       = SOCK_STREAM,
    .protocol   = IPPROTO_TCP,
    .prot       = &tcp_prot,
    .ops        = &inet_stream_ops,
    .capability = -1,
    .no_check   = 0,
    .flags      = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
  },

  {
    .type       = SOCK_DGRAM,
    .protocol   = IPPROTO_UDP,
    .prot       = &udp_prot,
    .ops        = &inet_dgram_ops,
    .capability = -1,
    .no_check   = UDP_CSUM_DEFAULT,
    .flags      = INET_PROTOSW_PERMANENT,
  },

  {
    .type       = SOCK_RAW,
    .protocol   = IPPROTO_IP,  /* wild card */
    .prot       = &raw_prot,
    .ops        = &inet_sockraw_ops,
    .capability = CAP_NET_RAW,
    .no_check   = UDP_CSUM_DEFAULT,
    .flags      = INET_PROTOSW_REUSE,
  }
};

/* Number of entries in inetsw_array. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

假設我們分析ipv4中的TCP協議，其它協議也可以參照分析。現在回到inet_create函數，這個函數最重要的一行就是335，這一行的作用就是初始化套接口socket所應該對應的操作函數。例如如果用socket(AF_INET, SOCK_STREAM, 0);創建套接字，則內核就會在這裏爲這個套接字關聯上相應的TCP的操作函數集inet_stream_ops，以後在這個套接字上的數據的各種操作如accept listen bind send recv都會通過這些函數完成。
接下來在inet_create中的344後就是分配一個struct sock結構體，這個sock結構和socket結構是一一對應的，兩個結構各有一個成員指向對方。struct sock是在include/net/sock.h中定義，它有兩個非常重要的成員sk_receive_queue和sk_write_queue。還有兩個成員sk_rcvbuf,sk_sndbuf分別代表接收和發送緩衝區的大小，默認是32767字節，是在sock_init_data(net/core/sock.c)中初始化的。另外對於有連接模式可能要求超時重傳，所以還有一個sk_timer的定時隊列。

/**
  *  struct sock - network layer representation of sockets
  *  @__sk_common: shared layout with inet_timewait_sock
  *  @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *  @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *  @sk_lock:  synchronizer
  *  @sk_rcvbuf: size of receive buffer in bytes
  *  @sk_sleep: sock wait queue
  *  @sk_dst_cache: destination cache
  *  @sk_dst_lock: destination cache lock
  *  @sk_policy: flow policy
  *  @sk_rmem_alloc: receive queue bytes committed
  *  @sk_receive_queue: incoming packets
  *  @sk_wmem_alloc: transmit queue bytes committed
  *  @sk_write_queue: Packet sending queue
  *  @sk_async_wait_queue: DMA copied packets
  *  @sk_omem_alloc: "o" is "option" or "other"
  *  @sk_wmem_queued: persistent queue size
  *  @sk_forward_alloc: space allocated forward
  *  @sk_allocation: allocation mode
  *  @sk_sndbuf: size of send buffer in bytes
  *  @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *       %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *  @sk_no_check: %SO_NO_CHECK setting, whether or not to check packets
  *  @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *  @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *  @sk_gso_max_size: Maximum GSO segment size to build
  *  @sk_lingertime: %SO_LINGER l_linger setting
  *  @sk_backlog: always used with the per-socket spinlock held
  *  @sk_callback_lock: used with the callbacks in the end of this struct
  *  @sk_error_queue: rarely used
  *  @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *        IPV6_ADDRFORM for instance)
  *  @sk_err: last error
  *  @sk_err_soft: errors that don't cause failure but are the cause of a
  *          persistent failure not just 'timed out'
  *  @sk_drops: raw/udp drops counter
  *  @sk_ack_backlog: current listen backlog
  *  @sk_max_ack_backlog: listen backlog set in listen()
  *  @sk_priority: %SO_PRIORITY setting
  *  @sk_type: socket type (%SOCK_STREAM, etc)
  *  @sk_protocol: which protocol this socket belongs in this network family
  *  @sk_peercred: %SO_PEERCRED setting
  *  @sk_rcvlowat: %SO_RCVLOWAT setting
  *  @sk_rcvtimeo: %SO_RCVTIMEO setting
  *  @sk_sndtimeo: %SO_SNDTIMEO setting
  *  @sk_filter: socket filtering instructions
  *  @sk_protinfo: private area, net family specific, when not using slab
  *  @sk_timer: sock cleanup timer
  *  @sk_stamp: time stamp of last packet received
  *  @sk_socket: Identd and reporting IO signals
  *  @sk_user_data: RPC layer private data
  *  @sk_sndmsg_page: cached page for sendmsg
  *  @sk_sndmsg_off: cached offset for sendmsg
  *  @sk_send_head: front of stuff to transmit
  *  @sk_security: used by security modules
  *  @sk_mark: generic packet mark
  *  @sk_write_pending: a write to stream socket waits to start
  *  @sk_state_change: callback to indicate change in the state of the sock
  *  @sk_data_ready: callback to indicate there is data to be processed
  *  @sk_write_space: callback to indicate there is buffer sending space available
  *  @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *  @sk_backlog_rcv: callback to process the backlog
  *  @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 */
struct sock {
  /*
   * struct inet_timewait_sock also uses sock_common, so please do
   * not add anything before this first member (__sk_common) --acme
   */
  struct sock_common  __sk_common;
  /*
   * Shorthand names: each sk_* macro below expands to the matching
   * skc_* field of the embedded __sk_common.
   */
#define sk_node      __sk_common.skc_node
#define sk_nulls_node    __sk_common.skc_nulls_node
#define sk_refcnt    __sk_common.skc_refcnt
 
  /* Note: sk_copy_start and sk_hash both alias skc_hash. */
#define sk_copy_start    __sk_common.skc_hash
#define sk_hash      __sk_common.skc_hash
#define sk_family    __sk_common.skc_family
#define sk_state    __sk_common.skc_state
#define sk_reuse    __sk_common.skc_reuse
#define sk_bound_dev_if    __sk_common.skc_bound_dev_if
#define sk_bind_node    __sk_common.skc_bind_node
#define sk_prot      __sk_common.skc_prot
#define sk_net      __sk_common.skc_net
  /* Five bitfields packed into one unsigned int: 2 + 2 + 4 + 8 + 16 = 32 bits. */
  kmemcheck_bitfield_begin(flags);
  unsigned int    sk_shutdown  : 2,
        sk_no_check  : 2,
        sk_userlocks : 4,
        sk_protocol  : 8,
        sk_type      : 16;
  kmemcheck_bitfield_end(flags);
  int      sk_rcvbuf;
  socket_lock_t    sk_lock;
  /*
   * The backlog queue is special, it is always used with
   * the per-socket spinlock held and requires low latency
   * access. Therefore we special case its implementation.
   */
  struct {
    struct sk_buff *head;
    struct sk_buff *tail;
  } sk_backlog;
  wait_queue_head_t  *sk_sleep;
  struct dst_entry  *sk_dst_cache;
#ifdef CONFIG_XFRM
  struct xfrm_policy  *sk_policy[2];
#endif
  rwlock_t    sk_dst_lock;
  atomic_t    sk_rmem_alloc;
  atomic_t    sk_wmem_alloc;
  atomic_t    sk_omem_alloc;
  int      sk_sndbuf;
  struct sk_buff_head  sk_receive_queue;
  struct sk_buff_head  sk_write_queue;
#ifdef CONFIG_NET_DMA
  struct sk_buff_head  sk_async_wait_queue;
#endif
  int      sk_wmem_queued;
  int      sk_forward_alloc;
  gfp_t      sk_allocation;
  int      sk_route_caps;
  int      sk_gso_type;
  unsigned int    sk_gso_max_size;
  int      sk_rcvlowat;
  unsigned long     sk_flags;
  unsigned long          sk_lingertime;
  struct sk_buff_head  sk_error_queue;
  struct proto    *sk_prot_creator;
  rwlock_t    sk_callback_lock;
  int      sk_err,
        sk_err_soft;
  atomic_t    sk_drops;
  unsigned short    sk_ack_backlog;
  unsigned short    sk_max_ack_backlog;
  __u32      sk_priority;
  struct ucred    sk_peercred;
  long      sk_rcvtimeo;
  long      sk_sndtimeo;
  struct sk_filter        *sk_filter;
  void      *sk_protinfo;
  struct timer_list  sk_timer;
  ktime_t      sk_stamp;
  struct socket    *sk_socket;
  void      *sk_user_data;
  struct page    *sk_sndmsg_page;
  struct sk_buff    *sk_send_head;
  __u32      sk_sndmsg_off;
  int      sk_write_pending;
#ifdef CONFIG_SECURITY
  void      *sk_security;
#endif
  __u32      sk_mark;
  /* XXX 4 bytes hole on 64 bit */
  /* Callbacks; per the header comment, sk_callback_lock is used with these. */
  void      (*sk_state_change)(struct sock *sk);
  void      (*sk_data_ready)(struct sock *sk, int bytes);
  void      (*sk_write_space)(struct sock *sk);
  void      (*sk_error_report)(struct sock *sk);
    int      (*sk_backlog_rcv)(struct sock *sk,
              struct sk_buff *skb);
  void                    (*sk_destruct)(struct sock *sk);
};

在分析sk_alloc之前先分析一下answer_prot. answer_prot是struct proto類型(include/net/sock.h)

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 * transport -> network interface is defined by struct inet_proto
 *
 * Each transport protocol supplies one instance of this structure:
 * a set of per-protocol operations on a struct sock plus the memory
 * accounting and allocation parameters used for its socks.
 */
struct proto {
  void      (*close)(struct sock *sk,
          long timeout);
  int      (*connect)(struct sock *sk,
                struct sockaddr *uaddr,
          int addr_len);
  int      (*disconnect)(struct sock *sk, int flags);
 
  struct sock *    (*accept) (struct sock *sk, int flags, int *err);
 
  int      (*ioctl)(struct sock *sk, int cmd,
           unsigned long arg);
  int      (*init)(struct sock *sk);
  void      (*destroy)(struct sock *sk);
  void      (*shutdown)(struct sock *sk, int how);
  int      (*setsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*getsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          int __user *option);
#ifdef CONFIG_COMPAT
  int      (*compat_setsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*compat_getsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          int __user *option);
#endif
  int      (*sendmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg, size_t len);
  int      (*recvmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg,
          size_t len, int noblock, int flags,
          int *addr_len);
  int      (*sendpage)(struct sock *sk, struct page *page,
          int offset, size_t size, int flags);
  int      (*bind)(struct sock *sk,
          struct sockaddr *uaddr, int addr_len);
 
  int      (*backlog_rcv) (struct sock *sk,
            struct sk_buff *skb);
 
  /* Keeping track of sk's, looking them up, and port selection methods. */
  void      (*hash)(struct sock *sk);
  void      (*unhash)(struct sock *sk);
  int      (*get_port)(struct sock *sk, unsigned short snum);
 
  /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
  unsigned int    inuse_idx;
#endif
 
  /* Memory pressure */
  void      (*enter_memory_pressure)(struct sock *sk);
  atomic_t    *memory_allocated;  /* Current allocated memory. */
  struct percpu_counter  *sockets_allocated;  /* Current number of sockets. */
  /*
   * Pressure flag: try to collapse.
   * Technical note: it is used by multiple contexts non atomically.
   * All the __sk_mem_schedule() is of this nature: accounting
   * is strict, actions are advisory and have some latency.
   */
  int      *memory_pressure;
  int      *sysctl_mem;
  int      *sysctl_wmem;
  int      *sysctl_rmem;
  int      max_header;
 
  /*
   * Allocation parameters: sk_prot_alloc() allocates a sock from
   * slab when it is non-NULL, and otherwise kmalloc()s obj_size bytes.
   */
  struct kmem_cache  *slab;
  unsigned int    obj_size;
  int      slab_flags;
 
  struct percpu_counter  *orphan_count;
 
  struct request_sock_ops  *rsk_prot;
  struct timewait_sock_ops *twsk_prot;
 
  /* Protocol-specific lookup table; exactly one member is meaningful. */
  union {
    struct inet_hashinfo  *hashinfo;
    struct udp_table  *udp_table;
    struct raw_hashinfo  *raw_hash;
  } h;
 
  struct module    *owner;
 
  char      name[32];
 
  struct list_head  node;
#ifdef SOCK_REFCNT_DEBUG
  atomic_t    socks;
#endif
};

假設分析的是TCP協議，則通過336行的賦值從inetsw_array找到其prot成員變量爲tcp_prot(net/ipv4/tcp_ipv4.c)。

struct proto tcp_prot = {
  .name      = "TCP",
  .owner      = THIS_MODULE,
  .close      = tcp_close,
  .connect    = tcp_v4_connect,
  .disconnect    = tcp_disconnect,
  .accept      = inet_csk_accept,
  .ioctl      = tcp_ioctl,
  .init      = tcp_v4_init_sock,
  .destroy    = tcp_v4_destroy_sock,
  .shutdown    = tcp_shutdown,
  .setsockopt    = tcp_setsockopt,
  .getsockopt    = tcp_getsockopt,
  .recvmsg    = tcp_recvmsg,
  .backlog_rcv    = tcp_v4_do_rcv,
  .hash      = inet_hash,
  .unhash      = inet_unhash,
  .get_port    = inet_csk_get_port,
  .enter_memory_pressure  = tcp_enter_memory_pressure,
  .sockets_allocated  = &tcp_sockets_allocated,
  .orphan_count    = &tcp_orphan_count,
  .memory_allocated  = &tcp_memory_allocated,
  .memory_pressure  = &tcp_memory_pressure,
  .sysctl_mem    = sysctl_tcp_mem,
  .sysctl_wmem    = sysctl_tcp_wmem,
  .sysctl_rmem    = sysctl_tcp_rmem,
  .max_header    = MAX_TCP_HEADER,
  .obj_size    = sizeof(struct tcp_sock),
  .slab_flags    = SLAB_DESTROY_BY_RCU,
  .twsk_prot    = &tcp_timewait_sock_ops,
  .rsk_prot    = &tcp_request_sock_ops,
  .h.hashinfo    = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
  .compat_setsockopt  = compat_tcp_setsockopt,
  .compat_getsockopt  = compat_tcp_getsockopt,
#endif
};

通過tcp_prot結構體對各成員的賦值可以發現，slab成員並沒有在這裏初始化，而obj_size被初始化爲sizeof(struct tcp_sock)，這一點的作用可以在後面的分析中看到。接下來看inet_create的344行所調用的sk_alloc(net/core/sock.c)。

/*
 * Allocate the memory backing a new sock for @prot: from prot->slab when
 * the protocol has a cache, otherwise with kmalloc() of prot->obj_size
 * bytes.  Returns the sock once security_sk_alloc() has succeeded and a
 * reference on prot->owner has been taken; returns NULL on any failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
    int family)
{
  struct kmem_cache *cache = prot->slab;
  struct sock *sk;

  if (cache == NULL) {
    sk = kmalloc(prot->obj_size, priority);
  } else {
    /* __GFP_ZERO is masked off here; zeroing is done by hand below. */
    sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
    if (sk != NULL && (priority & __GFP_ZERO)) {
      /*
       * Caches using SLAB_DESTROY_BY_RCU must leave sk_node.next
       * untouched, so clear everything before it (if anything) and
       * everything from sk_node.pprev to the end of the object.
       */
      if (offsetof(struct sock, sk_node.next) != 0)
        memset(sk, 0, offsetof(struct sock, sk_node.next));
      memset(&sk->sk_node.pprev, 0,
             prot->obj_size - offsetof(struct sock,
               sk_node.pprev));
    }
  }

  if (sk == NULL)
    return NULL;

  kmemcheck_annotate_bitfield(sk, flags);

  if (security_sk_alloc(sk, family, priority))
    goto release_mem;

  if (!try_module_get(prot->owner)) {
    security_sk_free(sk);
    goto release_mem;
  }

  return sk;

release_mem:
  /* Give the object back to whichever allocator produced it. */
  if (cache == NULL)
    kfree(sk);
  else
    kmem_cache_free(cache, sk);
  return NULL;
}

/**
 *  sk_alloc - All socket objects are allocated here
 *  @net: the applicable net namespace
 *  @family: protocol family
 *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *  @prot: struct proto associated with this new sock instance
 *
 *  Returns the new, zero-initialised sock, or NULL when
 *  sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
          struct proto *prot)
{
  struct sock *newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

  if (!newsk)
    return NULL;

  newsk->sk_family = family;
  /*
   * The creating proto is recorded separately from the current one;
   * see the sk_prot_creator description in struct sock -acme
   */
  newsk->sk_prot_creator = prot;
  newsk->sk_prot = prot;
  sock_lock_init(newsk);
  sock_net_set(newsk, get_net(net));
  atomic_set(&newsk->sk_wmem_alloc, 1);

  return newsk;
}
EXPORT_SYMBOL(sk_alloc);

很明顯在sk_alloc中直接調用sk_prot_alloc來分配sock結構。在sk_prot_alloc中先判定slab是否爲空(如前提示)：tcp_prot的靜態初始化中並未給slab賦值，此時就直接用kmalloc分配obj_size大小即sizeof(struct tcp_sock)的空間（需要注意的是，實際運行時inet_init會調用proto_register(&tcp_prot, 1)爲tcp_prot創建對象大小爲obj_size的slab緩存，因此通常走的是kmem_cache_alloc分支，但分配到的對象大小同樣是obj_size）。函數返回的地址類型是struct sock *，而該空間的大小卻是sizeof(struct tcp_sock)，那就說明有兩種情況：一、sizeof(struct tcp_sock) == sizeof(struct sock)；二、sizeof(struct tcp_sock) > sizeof(struct sock)。通過分析實際是第二種情況，通過列出一系列數據結構可以很明顯地看出。
先來看struct tcp_sock結構的定義(include/linux/tcp.h)

/*
 * TCP-specific socket state.  The structure begins with an embedded
 * struct inet_connection_sock, so the TCP fields below are laid out
 * after the connection-oriented INET socket state.
 */
struct tcp_sock {
  /* inet_connection_sock has to be the first member of tcp_sock */
  struct inet_connection_sock  inet_conn;
  u16  tcp_header_len;  /* Bytes of tcp header to send    */
  u16  xmit_size_goal_segs; /* Goal for segmenting output packets */
 
/*
 *  Header prediction flags
 *  0x5?10 << 16 + snd_wnd in net byte order
 */
  __be32  pred_flags;
 
/*
 *  RFC793 variables by their proper names. This means you can
 *  read the code and the spec side by side (and laugh ...)
 *  See RFC793 and RFC1122. The RFC writes these in capitals.
 */
   u32  rcv_nxt;  /* What we want to receive next   */
  u32  copied_seq;  /* Head of yet unread data    */
  u32  rcv_wup;  /* rcv_nxt on last window update sent  */
   u32  snd_nxt;  /* Next sequence we send    */
 
   u32  snd_una;  /* First byte we want an ack for  */
   u32  snd_sml;  /* Last byte of the most recently transmitted small packet */
  u32  rcv_tstamp;  /* timestamp of last received ACK (for keepalives) */
  u32  lsndtime;  /* timestamp of last sent data packet (for restart window) */
 
  /* Data for direct copy to user */
  struct {
    struct sk_buff_head  prequeue;
    struct task_struct  *task;
    struct iovec    *iov;
    int      memory;
    int      len;
#ifdef CONFIG_NET_DMA
    /* members for async copy */
    struct dma_chan    *dma_chan;
    int      wakeup;
    struct dma_pinned_list  *pinned_list;
    dma_cookie_t    dma_cookie;
#endif
  } ucopy;
 
  u32  snd_wl1;  /* Sequence for window update    */
  u32  snd_wnd;  /* The window we expect to receive  */
  u32  max_window;  /* Maximal window ever seen from peer  */
  u32  mss_cache;  /* Cached effective mss, not including SACKS */
 
  u32  window_clamp;  /* Maximal window to advertise    */
  u32  rcv_ssthresh;  /* Current window clamp      */
 
  u32  frto_highmark;  /* snd_nxt when RTO occurred */
  u16  advmss;    /* Advertised MSS      */
  u8  frto_counter;  /* Number of new acks after RTO */
  u8  nonagle;  /* Disable Nagle algorithm?             */
 
/* RTT measurement */
  u32  srtt;    /* smoothed round trip time << 3  */
  u32  mdev;    /* medium deviation      */
  u32  mdev_max;  /* maximal mdev for the last rtt period  */
  u32  rttvar;    /* smoothed mdev_max      */
  u32  rtt_seq;  /* sequence number to update rttvar  */
 
  u32  packets_out;  /* Packets which are "in flight"  */
  u32  retrans_out;  /* Retransmitted packets out    */
 
  u16  urg_data;  /* Saved octet of OOB data and control flags */
  u8  ecn_flags;  /* ECN status bits.      */
  u8  reordering;  /* Packet reordering metric.    */
  u32  snd_up;    /* Urgent pointer    */
 
  u8  keepalive_probes; /* num of allowed keep alive probes  */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
  struct tcp_options_received rx_opt;
 
/*
 *  Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
   u32  snd_ssthresh;  /* Slow start size threshold    */
   u32  snd_cwnd;  /* Sending congestion window    */
  u32  snd_cwnd_cnt;  /* Linear increase counter    */
  u32  snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
  u32  snd_cwnd_used;
  u32  snd_cwnd_stamp;
 
   u32  rcv_wnd;  /* Current receiver window    */
  u32  write_seq;  /* Tail(+1) of data held in tcp send buffer */
  u32  pushed_seq;  /* Last pushed seq, required to talk to windows */
  u32  lost_out;  /* Lost packets      */
  u32  sacked_out;  /* SACK'd packets      */
  u32  fackets_out;  /* FACK'd packets      */
  u32  tso_deferred;
  u32  bytes_acked;  /* Appropriate Byte Counting - RFC3465 */
 
  /* from STCP, retrans queue hinting */
  struct sk_buff* lost_skb_hint;
  struct sk_buff *scoreboard_skb_hint;
  struct sk_buff *retransmit_skb_hint;
 
  struct sk_buff_head  out_of_order_queue; /* Out of order segments go here */
 
  /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */
  struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
  struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
  struct tcp_sack_block recv_sack_cache[4];
 
  struct sk_buff *highest_sack;   /* highest skb with SACK received
           * (validity guaranteed only if
           * sacked_out > 0)
           */
 
  int     lost_cnt_hint;
  u32     retransmit_high;  /* L-bits may be on up to this seqno */
 
  u32  lost_retrans_low;  /* Sent seq after any rxmit (lowest) */
 
  u32  prior_ssthresh; /* ssthresh saved at recovery start  */
  u32  high_seq;  /* snd_nxt at onset of congestion  */
 
  u32  retrans_stamp;  /* Timestamp of the last retransmit,
         * also used in SYN-SENT to remember stamp of
         * the first SYN. */
  u32  undo_marker;  /* tracking retrans started here. */
  int  undo_retrans;  /* number of undoable retransmissions. */
  u32  total_retrans;  /* Total retransmits for entire connection */
 
  u32  urg_seq;  /* Seq of received urgent pointer */
  unsigned int    keepalive_time;    /* time before keep alive takes place */
  unsigned int    keepalive_intvl;  /* time interval between keep alive probes */
 
  int      linger2;
 
/* Receiver side RTT estimation */
  struct {
    u32  rtt;
    u32  seq;
    u32  time;
  } rcv_rtt_est;
 
/* Receiver queue space */
  struct {
    int  space;
    u32  seq;
    u32  time;
  } rcvq_space;
 
/* TCP-specific MTU probe information. */
  struct {
    u32      probe_seq_start;
    u32      probe_seq_end;
  } mtu_probe;
 
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
  const struct tcp_sock_af_ops  *af_specific;
 
/* TCP MD5 Signature Option information */
  struct tcp_md5sig_info  *md5sig_info;
#endif
};

在tcp_sock的結構體的第一個成員變量類型爲struct inet_connection_sock(include/net/inet_connection_sock.h)

/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:     FIFO of established children
 * @icsk_bind_hash:     Bind node
 * @icsk_timeout:     Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:       Retransmit timeout
 * @icsk_pmtu_cookie:     Last pmtu seen by socket
 * @icsk_ca_ops:     Pluggable congestion control hook
 * @icsk_af_ops:     Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:     Congestion control state
 * @icsk_retransmits:     Number of unrecovered [RTO] timeouts
 * @icsk_pending:     Scheduled timer event
 * @icsk_backoff:     Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:     unanswered 0 window probes
 * @icsk_ext_hdr_len:     Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:       Delayed ACK control data
 * @icsk_mtup:       MTU probing control data
 */
struct inet_connection_sock {
  /* inet_sock has to be the first member! */
  struct inet_sock    icsk_inet;
  struct request_sock_queue icsk_accept_queue;
  struct inet_bind_bucket    *icsk_bind_hash;
  unsigned long      icsk_timeout;
   struct timer_list    icsk_retransmit_timer;
   struct timer_list    icsk_delack_timer;
  __u32        icsk_rto;
  __u32        icsk_pmtu_cookie;
  const struct tcp_congestion_ops *icsk_ca_ops;
  const struct inet_connection_sock_af_ops *icsk_af_ops;
  unsigned int      (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
  __u8        icsk_ca_state;
  __u8        icsk_retransmits;
  __u8        icsk_pending;
  __u8        icsk_backoff;
  __u8        icsk_syn_retries;
  __u8        icsk_probes_out;
  __u16        icsk_ext_hdr_len;
  struct {
    __u8      pending;   /* ACK is pending         */
    __u8      quick;   /* Scheduled number of quick acks     */
    __u8      pingpong;   /* The session is interactive       */
    __u8      blocked;   /* Delayed ACK was blocked by socket lock */
    __u32      ato;     /* Predicted tick of soft clock     */
    unsigned long    timeout;   /* Currently scheduled timeout       */
    __u32      lrcvtime;   /* timestamp of last received data packet */
    __u16      last_seg_size; /* Size of last incoming segment     */
    __u16      rcv_mss;   /* MSS used for delayed ACK decisions     */
  } icsk_ack;
  struct {
    int      enabled;
 
    /* Range of MTUs to search */
    int      search_high;
    int      search_low;
 
    /* Information on the current probe. */
    int      probe_size;
  } icsk_mtup;
  /* Scratch area of 16 u32 words; ICSK_CA_PRIV_SIZE is its size in bytes. */
  u32        icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE  (16 * sizeof(u32))
};

在 inet_connection_sock結構體中第一個成員變量類型爲struct inet_sock(include/net/inet_sock.h)

/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @daddr - Foreign IPv4 addr
 * @rcv_saddr - Bound local IPv4 addr
 * @dport - Destination port
 * @num - Local port
 * @saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @sport - Source port
 * @id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
  /* sk and pinet6 have to be the first two members of inet_sock */
  struct sock    sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  struct ipv6_pinfo  *pinet6;
#endif
  /* Socket demultiplex comparisons on incoming packets. */
  __be32      daddr;
  __be32      rcv_saddr;
  __be16      dport;
  __u16      num;
  __be32      saddr;
  __s16      uc_ttl;
  __u16      cmsg_flags;
  struct ip_options  *opt;
  __be16      sport;
  __u16      id;
  __u8      tos;
  __u8      mc_ttl;
  __u8      pmtudisc;
  /* Seven one-bit flags sharing a single __u8. */
  __u8      recverr:1,
        is_icsk:1,
        freebind:1,
        hdrincl:1,
        mc_loop:1,
        transparent:1,
        mc_all:1;
  int      mc_index;
  __be32      mc_addr;
  struct ip_mc_socklist  *mc_list;
  struct {
    unsigned int    flags;
    unsigned int    fragsize;
    struct ip_options  *opt;
    struct dst_entry  *dst;
    int      length; /* Total length of all frames */
    __be32      addr;
    struct flowi    fl;
  } cork;
};

而inet_sock的第一個成員正是struct sock類型，所以sk_prot_alloc直接返回struct sock *類型指針是沒有問題的，接下來執行inet_create中的353行用inet_sk通過sk獲得inet指針的值，inet_sk函數其實就相當於強制類型轉換，返回的就是sk的指針。
接下來程序就一路返回到__sock_create，接着再返回到sys_socket中。在sys_socket中調用了最後一個函數sock_map_fd(net/socket.c)，將socket指針sock與一個新分配的文件描述符關聯起來，並把該描述符返回給用戶程序。

/*
 *  Obtains the first available file descriptor and sets it up for use.
 *
 *  These functions create file structures and maps them to fd space
 *  of the current process. On success it returns file descriptor
 *  and file struct implicitly stored in sock->file.
 *  Note that another thread may close file descriptor before we return
 *  from this function. We use the fact that now we do not refer
 *  to socket after mapping. If one day we will need it, this
 *  function will increment ref. count on file by 1.
 *
 *  In any case returned fd MAY BE not valid!
 *  This race condition is unavoidable
 *  with shared fd spaces, we cannot solve it inside kernel,
 *  but we take care of internal coherence yet.
 */
 
/*
 * Reserve an unused descriptor (honouring @flags) together with an empty
 * struct file.  On success the descriptor is returned and *filep points at
 * the file.  If no descriptor is available, *filep is NULL and the negative
 * value from get_unused_fd_flags() is returned; if no file is available the
 * descriptor is released and -ENFILE is returned.
 */
static int sock_alloc_fd(struct file **filep, int flags)
{
  struct file *filp;
  int fd = get_unused_fd_flags(flags);

  if (unlikely(fd < 0)) {
    *filep = NULL;
    return fd;
  }

  filp = get_empty_filp();
  *filep = filp;
  if (unlikely(filp == NULL)) {
    put_unused_fd(fd);
    return -ENFILE;
  }

  return fd;
}
 
static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
{
  struct dentry *dentry;
  struct qstr name = { .name = "" };
 
  dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
  if (unlikely(!dentry))
    return -ENOMEM;
 
  dentry->d_op = &sockfs_dentry_operations;
  /*
   * We dont want to push this dentry into global dentry hash table.
   * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
   * This permits a working /proc/$pid/fd/XXX on sockets
   */
  dentry->d_flags &= ~DCACHE_UNHASHED;
  d_instantiate(dentry, SOCK_INODE(sock));
 
  sock->file = file;
  init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
      &socket_file_ops);
  SOCK_INODE(sock)->i_fop = &socket_file_ops;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);
  file->f_pos = 0;
  file->private_data = sock;
 
  return 0;
}
 
/*
 * Allocate a descriptor and file for @sock, attach them, and install the
 * file in the descriptor table.  Returns the new descriptor, or a negative
 * errno; on attach failure both the file and the descriptor are released.
 */
int sock_map_fd(struct socket *sock, int flags)
{
  struct file *filp;
  int rc;
  int fd = sock_alloc_fd(&filp, flags);

  if (unlikely(fd < 0))
    return fd;

  rc = sock_attach_fd(sock, filp, flags);
  if (unlikely(rc < 0)) {
    put_filp(filp);
    put_unused_fd(fd);
    return rc;
  }

  fd_install(fd, filp);
  return fd;
}

fs/dcache.c

/*
 * Attach @inode to @dentry.  The caller must hold dcache_lock.
 *
 * @inode may be NULL, in which case the dentry is not linked into any
 * alias list and d_inode is simply set to NULL.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
  /* Link the dentry into the inode's list of aliases. */
  if (inode)
    list_add(&dentry->d_alias, &inode->i_dentry);
  dentry->d_inode = inode;
  fsnotify_d_instantiate(dentry, inode);
}
 
/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
  /* The dentry must not already be on an alias list. */
  BUG_ON(!list_empty(&entry->d_alias));
  /* __d_instantiate() requires dcache_lock to be held. */
  spin_lock(&dcache_lock);
  __d_instantiate(entry, inode);
  spin_unlock(&dcache_lock);
  /* The security hook is called after dcache_lock has been dropped. */
  security_d_instantiate(entry, inode);
}

/net/socket.c

/*
 *  File operations installed on every socket file.  Sockets also have a
 *  set of 'special' operations beyond the generic file ones; those do not
 *  appear in this structure and are reached directly through the
 *  socketcall() multiplexor.
 */

static const struct file_operations socket_file_ops = {
  .owner          = THIS_MODULE,

  /* Open and close; open is special-cased to disallow open via /proc. */
  .open           = sock_no_open,
  .release        = sock_close,

  /* Data transfer. */
  .aio_read       = sock_aio_read,
  .aio_write      = sock_aio_write,
  .sendpage       = sock_sendpage,
  .splice_read    = sock_splice_read,
  .splice_write   = generic_splice_sendpage,
  .llseek         = no_llseek,
  .mmap           = sock_mmap,

  /* Readiness, notification and control. */
  .poll           = sock_poll,
  .fasync         = sock_fasync,
  .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
  .compat_ioctl   = compat_sock_ioctl,
#endif
};

在sock_map_fd中先通過402行獲得一個未使用的文件描述符以及一個空的file結構，然後通過405行調用sock_attach_fd將file結構與sock關聯起來。在sock_attach_fd中先通過375行從sockfs中分配一個dentry，其中sock_mnt就是在描述sockfs中提到的，d_instantiate的作用就是將dentry與socket的inode關聯起來，然後388行又將sock->file與file關聯起來。389～390行將socket文件上的操作初始化爲socket_file_ops。這樣，通過send/recv進入內核將調用inet_stream_ops中的函數，而通過read/write調用將調用socket_file_ops中的函數。然後返回至sys_socket函數中，再經過系統調用切換到用戶態，socket函數的整個調用過程完成。

轉自http://acm.hrbeu.edu.cn/~puppy/2011/02/28/linux-%E5%8D%8F%E8%AE%AE%E6%A0%88%E5%88%86%E6%9E%90-socket/

Linux 協議棧分析 socket

Linux.協議棧分析.socket

Linux藍牙系列(1) --- bluetooth基本概念

todo 沒有分類-會後續移到上面

Linux 協議棧分析 socket

Linux 協議棧分析 socket——筆記

Android2.2添加Ethernet 框架支持（一）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結