Linux 協議棧分析 socket

Linux.協議棧分析.socket

 

 

 

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))


 

 

 

 

 

 

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  int retval;
  struct socket *sock;
  int flags;
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  flags = type & ~SOCK_TYPE_MASK;
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))


 

 

 

 

 

 

 

 

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

{

  int retval;

  struct socket *sock;

  int flags;

 

  /* Check the SOCK_* constants for consistency.  */

  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

 

  flags = type & ~SOCK_TYPE_MASK;

  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

 

 

 

 

 

 

 

 

1266

1267

1268

1269

1270

1271

1272

1273

1274

1275

1276

1277

1278

1279

1280

1281

1282

1283

1284

1285

1286

1287

1288

1289

1290

1291

1292

1293

1294

1295

1296

1297

1298

1299

1300

1301

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)  /* socket() entry: build a socket with sock_create(), then return what sock_map_fd() yields for it (or a negative errno) */

{

  int retval;

  struct socket *sock;

  int flags;  /* flag bits split off from type */

 

  /* Check the SOCK_* constants for consistency.  */

  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

 

  flags = type & ~SOCK_TYPE_MASK;  /* everything outside SOCK_TYPE_MASK is treated as a flag */

  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))  /* reject any flag other than these two */

    return -EINVAL;

  type &= SOCK_TYPE_MASK;  /* keep only the socket type itself */

 

  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))  /* constants differ: translate the socket flag */

    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

 

  retval = sock_create(family, type, protocol, &sock);

  if (retval < 0)

    goto out;  /* no socket was created, nothing to release */

 

  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));

  if (retval < 0)

    goto out_release;  /* mapping failed: release the socket created above */

 

out:

  /* It may be already another descriptor 8) Not kernel problem. */

  return retval;

 

out_release:

  sock_release(sock);

  return retval;

}

 

 

 

 

 

 

 

 

 

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

{

  int retval;

  struct socket *sock;

  int flags;

 

  /* Check the SOCK_* constants for consistency.  */

  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

 

  flags = type & ~SOCK_TYPE_MASK;

  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

    return -EINVAL;

  type &= SOCK_TYPE_MASK;

 

  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))

    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

 

  retval = sock_create(family, type, protocol, &sock);

  if (retval < 0)

    goto out;

 

 

 

 

通過查看socket的幫助手冊可以得到socket的定義形式爲:


int socket(int domain, int type, int protocol);


C
int socket(int domain, int type, int protocol);


domain的有效值如下:

       AF_UNIX, AF_LOCAL   Local communication              unix(7)
       AF_INET             IPv4 Internet protocols          ip(7)
       AF_INET6            IPv6 Internet protocols          ipv6(7)
       AF_IPX              IPX - Novell protocols
       AF_NETLINK          Kernel user interface device     netlink(7)
       AF_X25              ITU-T X.25 / ISO-8208 protocol   x25(7)
       AF_AX25             Amateur radio AX.25 protocol
       AF_ATMPVC           Access to raw ATM PVCs
       AF_APPLETALK        Appletalk                        ddp(7)
       AF_PACKET           Low level packet interface       packet(7)

而type的取值範圍爲:

       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based
                       byte  streams.  An out-of-band data transmission mecha‐
                       nism may be supported.
       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages
                       of a fixed maximum length).
       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-
                       based data transmission path  for  datagrams  of  fixed
                       maximum  length;  a  consumer  is  required  to read an
                       entire packet with each input system call.
       SOCK_RAW        Provides raw network protocol access.
       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐
                       antee ordering.
       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see
                       packet(7).

 

而type的取值範圍爲:

       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based
                       byte  streams.  An out-of-band data transmission mecha‐
                       nism may be supported.
       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages
                       of a fixed maximum length).
       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-
                       based data transmission path  for  datagrams  of  fixed
                       maximum  length;  a  consumer  is  required  to read an
                       entire packet with each input system call.
       SOCK_RAW        Provides raw network protocol access.
       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐
                       antee ordering.
       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see
                       packet(7).

 

 

而在內核版本2.6.27之後,還可以通過將type中相應的二進制位置1來設定socket的附加屬性,即type可以在取上述值後再按位OR以下值。這一點可以在socket進入內核後的源代碼中得到證實。

       SOCK_NONBLOCK   Set  the  O_NONBLOCK  file  status flag on the new open
                       file description.  Using this flag saves extra calls to
                       fcntl(2) to achieve the same result.
       SOCK_CLOEXEC    Set the close-on-exec (FD_CLOEXEC) flag on the new file
                       descriptor.  See the description of the O_CLOEXEC  flag
                       in open(2) for reasons why this may be useful.

protocol一般爲0。
socket函數經過前述的方式進入內核後會最終由sys_socket(net/socket.c)來完成。

C
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)  /* socket() entry: build a socket with sock_create(), then return what sock_map_fd() yields for it (or a negative errno) */
{
  int retval;
  struct socket *sock;
  int flags;  /* flag bits split off from type */
 
  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
 
  flags = type & ~SOCK_TYPE_MASK;  /* everything outside SOCK_TYPE_MASK is treated as a flag */
  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))  /* reject any flag other than these two */
    return -EINVAL;
  type &= SOCK_TYPE_MASK;  /* keep only the socket type itself */
 
  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))  /* constants differ: translate the socket flag */
    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
  retval = sock_create(family, type, protocol, &sock);
  if (retval < 0)
    goto out;  /* no socket was created, nothing to release */
 
  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
  if (retval < 0)
    goto out_release;  /* mapping failed: release the socket created above */
 
out:
  /* It may be already another descriptor 8) Not kernel problem. */
  return retval;
 
out_release:
  sock_release(sock);
  return retval;
}

1278~1281行就是取得type的值並檢查是否合法。

 

 

 

 

 

 

 

 

 

 

 

1278~1281行就是取得type的值並檢查是否合法。
我們知道socket對於用戶而言就是一個已經打開的特殊文件,而內核則爲插口(socket)定義了一種特殊的文件類型,形成特殊的文件系統sockfs(net/socket.c)。sys_socket中調用了兩個函數sock_create和sock_map_fd,可以看到這兩個函數共用一個sock參數,這便是內核管理socket用的數據結構,而sock_map_fd明顯是爲用戶提供已經打開的文件描述符。
sockfs的建立過程省略,sockfs的定義如下:

C

301

302

303

304

305

306

307

static struct vfsmount *sock_mnt __read_mostly;  /* sock_alloc() allocates socket inodes on sock_mnt->mnt_sb */

 

static struct file_system_type sock_fs_type = {  /* file system type descriptor for "sockfs" */

  .name =    "sockfs",

  .get_sb =  sockfs_get_sb,

  .kill_sb =  kill_anon_super,

};

而所謂的通過socket函數創建一個插口,就是在sockfs中創建一個特殊文件,或者說是一個結點,並爲實現相應插口功能建立起一整套數據結構。所以首先就通過sock_create創建一個struct socket數據結構,然後通過sock_map_fd映射到一個已經打開的文件上。在分析sock_create和sock_map_fd之前先看看struct socket的定義

 

 

 

 

 

 

 

 

 


我們知道socket對於用戶而言就是一個已經打開的特殊文件,而內核則爲插口(socket)定義了一種特殊的文件類型,形成特殊的文件系統sockfs(net/socket.c)。sys_socket中調用了兩個函數sock_create和sock_map_fd,可以看到這兩個函數共用一個sock參數,這便是內核管理socket用的數據結構,而sock_map_fd明顯是爲用戶提供已經打開的文件描述符。
sockfs的建立過程省略,sockfs的定義如下:

C
301
302
303
304
305
306
307
static struct vfsmount *sock_mnt __read_mostly;  /* sock_alloc() allocates socket inodes on sock_mnt->mnt_sb */
 
static struct file_system_type sock_fs_type = {  /* file system type descriptor for "sockfs" */
  .name =    "sockfs",
  .get_sb =  sockfs_get_sb,
  .kill_sb =  kill_anon_super,
};

而所謂的通過socket函數創建一個插口,就是在sockfs中創建一個特殊文件,或者說是一個結點,並爲實現相應插口功能建立起一整套數據結構。所以首先就通過sock_create創建一個struct socket數據結構,然後通過sock_map_fd映射到一個已經打開的文件上。在分析sock_create和sock_map_fd之前先看看struct socket的定義(include/linux/net.h):

C
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 */
struct socket {
  socket_state    state;  /* sock_alloc_inode() starts this at SS_UNCONNECTED */
 
  kmemcheck_bitfield_begin(type);  /* begin/end markers bracket the type member for kmemcheck */
  short      type;  /* assigned from the type argument in __sock_create() */
  kmemcheck_bitfield_end(type);
 
  unsigned long    flags;
  /*
   * Please keep fasync_list & wait fields in the same cache line
   */
  struct fasync_struct  *fasync_list;
  wait_queue_head_t  wait;
 
  struct file    *file;
  struct sock    *sk;
  const struct proto_ops  *ops;  /* for PF_INET, inet_create() copies this from the matched inet_protosw */
};
C
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
struct proto_ops {  /* table of per-protocol socket operations; struct socket refers to one through ->ops */
  int    family;  /* address family, e.g. PF_INET in the inet_*_ops tables */
  struct module  *owner;  /* module pinned by __sock_create() via try_module_get(sock->ops->owner) */
  int    (*release)   (struct socket *sock);
  int    (*bind)       (struct socket *sock,
              struct sockaddr *myaddr,
              int sockaddr_len);
  int    (*connect)   (struct socket *sock,
              struct sockaddr *vaddr,
              int sockaddr_len, int flags);
  int    (*socketpair)(struct socket *sock1,
              struct socket *sock2);
  int    (*accept)    (struct socket *sock,
              struct socket *newsock, int flags);
  int    (*getname)   (struct socket *sock,
              struct sockaddr *addr,
              int *sockaddr_len, int peer);
  unsigned int  (*poll)       (struct file *file, struct socket *sock,
              struct poll_table_struct *wait);
  int    (*ioctl)     (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int    (*listen)    (struct socket *sock, int len);
  int    (*shutdown)  (struct socket *sock, int flags);
  int    (*setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*compat_setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*compat_getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len);
  int    (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len,
              int flags);
  int    (*mmap)       (struct file *file, struct socket *sock,
              struct vm_area_struct * vma);
  ssize_t    (*sendpage)  (struct socket *sock, struct page *page,
              int offset, size_t size, int flags);
  ssize_t   (*splice_read)(struct socket *sock,  loff_t *ppos,
               struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};

接下來分析sock_create(net/socket.c),sock_create會調用__sock_create。

C
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
static int __sock_create(struct net *net, int family, int type, int protocol,
       struct socket **res, int kern)  /* create a socket for family/type/protocol; on success store it in *res and return 0, otherwise return a negative errno */
{
  int err;
  struct socket *sock;
  const struct net_proto_family *pf;  /* the registered entry for this family, read from net_families[] */
 
  /*
   *      Check protocol is in range
   */
  if (family < 0 || family >= NPROTO)
    return -EAFNOSUPPORT;
  if (type < 0 || type >= SOCK_MAX)
    return -EINVAL;
 
  /* Compatibility.
 
     This uglymoron is moved from INET layer to here to avoid
     deadlock in module load.
   */
  if (family == PF_INET && type == SOCK_PACKET) {
    static int warned;  /* static: the message below is printed only once */
    if (!warned) {
      warned = 1;
      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
    }
    family = PF_PACKET;  /* the request is handled as PF_PACKET from here on */
  }
 
  err = security_socket_create(family, type, protocol, kern);
  if (err)
    return err;
 
  /*
   *  Allocate the socket and allow the family to set things up. if
   *  the protocol is 0, the family is instructed to select an appropriate
   *  default.
   */
  sock = sock_alloc();  /* returns NULL when no inode could be allocated */
  if (!sock) {
    if (net_ratelimit())
      printk(KERN_WARNING "socket: no more sockets\n");
    return -ENFILE;  /* Not exactly a match, but its the
           closest posix thing */
  }
 
  sock->type = type;
 
#ifdef CONFIG_MODULES
  /* Attempt to load a protocol module if the find failed.
   *
   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
   * requested real, full-featured networking support upon configuration.
   * Otherwise module support will break!
   */
  if (net_families[family] == NULL)
    request_module("net-pf-%d", family);
#endif
 
  rcu_read_lock();
  pf = rcu_dereference(net_families[family]);
  err = -EAFNOSUPPORT;  /* returned by both out_release exits below */
  if (!pf)
    goto out_release;
 
  /*
   * We will call the ->create function, that possibly is in a loadable
   * module, so we have to bump that loadable module refcnt first.
   */
  if (!try_module_get(pf->owner))
    goto out_release;
 
  /* Now protected by module ref count */
  rcu_read_unlock();
 
  err = pf->create(net, sock, protocol);  /* family-specific setup, e.g. inet_create() for PF_INET */
  if (err < 0)
    goto out_module_put;
 
  /*
   * Now to bump the refcnt of the [loadable] module that owns this
   * socket at sock_release time we decrement its refcnt.
   */
  if (!try_module_get(sock->ops->owner))
    goto out_module_busy;
 
  /*
   * Now that we're done with the ->create function, the [loadable]
   * module can have its refcnt decremented
   */
  module_put(pf->owner);
  err = security_socket_post_create(sock, family, type, protocol, kern);
  if (err)
    goto out_sock_release;
  *res = sock;  /* hand the finished socket to the caller */
 
  return 0;
 
out_module_busy:  /* could not take a reference on sock->ops->owner */
  err = -EAFNOSUPPORT;
out_module_put:  /* drop the pf->owner reference taken before ->create */
  sock->ops = NULL;
  module_put(pf->owner);
out_sock_release:
  sock_release(sock);
  return err;
 
out_release:  /* failure paths that still hold the RCU read lock */
  rcu_read_unlock();
  goto out_sock_release;
}
 
int sock_create(int family, int type, int protocol, struct socket **res)  /* wrapper: __sock_create() in current's network namespace with kern = 0 */
{
  return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

1150~1171行做的很簡單,不過是參數檢查。
接下來的security_socket_create以及後面的security_socket_post_create都定義在/include/linux/security.h中;在未啓用CONFIG_SECURITY_NETWORK時,它們是直接返回0的空函數:

C
/*
 * Stub form of the socket-creation security hook: every argument is
 * ignored and 0 is returned, so the check never rejects a request.
 */
static inline int security_socket_create(int family, int type, int protocol, int kern)
{
  const int verdict = 0;  /* 0 == allowed */

  return verdict;
}
/*
 * Stub form of the post-creation security hook: the socket and all other
 * arguments are ignored and 0 is returned unconditionally.
 */
static inline int security_socket_post_create(struct socket *sock, int family, int type,
                int protocol, int kern)
{
  const int verdict = 0;  /* 0 == nothing to object to */

  return verdict;
}

1182行的sock_alloc的代碼如下:

C
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
static struct socket *sock_alloc(void)  /* allocate a socket together with its inode; returns NULL if new_inode() fails */
{
  struct inode *inode;
  struct socket *sock;
 
  inode = new_inode(sock_mnt->mnt_sb);  /* inode on the superblock of sock_mnt */
  if (!inode)
    return NULL;
 
  sock = SOCKET_I(inode);  /* the struct socket that shares a struct socket_alloc with this inode */
 
  kmemcheck_annotate_bitfield(sock, type);
  inode->i_mode = S_IFSOCK | S_IRWXUGO;  /* file type: socket, plus the S_IRWXUGO permission bits */
  inode->i_uid = current_fsuid();
  inode->i_gid = current_fsgid();
 
  percpu_add(sockets_in_use, 1);  /* count one more socket in use on this cpu */
  return sock;
}

其中的new_inode是在/fs/inode.c中定義

C
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
static struct inode *alloc_inode(struct super_block *sb)  /* allocate and initialise an inode for sb; returns NULL on failure */
{
  struct inode *inode;
 
  if (sb->s_op->alloc_inode)  /* the filesystem supplies its own allocator (sockfs: sock_alloc_inode) */
    inode = sb->s_op->alloc_inode(sb);
  else
    inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);  /* generic inode cache otherwise */
 
  if (!inode)
    return NULL;
 
  if (unlikely(inode_init_always(sb, inode))) {  /* non-zero means initialisation failed: free the inode again */
    if (inode->i_sb->s_op->destroy_inode)
      inode->i_sb->s_op->destroy_inode(inode);
    else
      kmem_cache_free(inode_cachep, inode);
    return NULL;
  }
 
  return inode;
}
C
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
struct inode *new_inode(struct super_block *sb)  /* allocate an inode for sb, add it to the inode lists and number it; returns NULL if allocation fails */
{
  /*
   * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
   * error if st_ino won't fit in target struct field. Use 32bit counter
   * here to attempt to avoid that.
   */
  static unsigned int last_ino;  /* last inode number handed out; incremented under inode_lock */
  struct inode *inode;
 
  spin_lock_prefetch(&inode_lock);
 
  inode = alloc_inode(sb);
  if (inode) {
    spin_lock(&inode_lock);
    __inode_add_to_lists(sb, NULL, inode);
    inode->i_ino = ++last_ino;
    inode->i_state = 0;
    spin_unlock(&inode_lock);
  }
  return inode;  /* NULL when alloc_inode() failed */
}
EXPORT_SYMBOL(new_inode);

可以看出new_inode會調用alloc_inode分配inode,而alloc_inode會調用sockfs在VFS中註冊的相應的函數來處理,那這個函數是什麼呢?先來看一看/net/socket.c

C
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
static struct inode *sock_alloc_inode(struct super_block *sb)  /* sockfs_ops.alloc_inode: allocate a struct socket_alloc and return its embedded inode; NULL on allocation failure */
{
  struct socket_alloc *ei;  /* container holding both the socket and its inode */
 
  ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
  if (!ei)
    return NULL;
  init_waitqueue_head(&ei->socket.wait);
 
  ei->socket.fasync_list = NULL;  /* the embedded socket starts out unconnected, with no ops, sk or file */
  ei->socket.state = SS_UNCONNECTED;
  ei->socket.flags = 0;
  ei->socket.ops = NULL;
  ei->socket.sk = NULL;
  ei->socket.file = NULL;
 
  return &ei->vfs_inode;  /* callers get back to the socket through SOCKET_I() */
}
C
287
288
289
290
291
static const struct super_operations sockfs_ops = {  /* superblock operations of sockfs */
  .alloc_inode =  sock_alloc_inode,  /* invoked from alloc_inode() through sb->s_op->alloc_inode */
  .destroy_inode =sock_destroy_inode,
  .statfs =  simple_statfs,
};

爲幫助理解列出struct socket_alloc 結構體的定義。

C
794
795
796
797
798
799
800
801
802
struct socket_alloc {  /* a socket and its inode allocated as one object (see sock_alloc_inode) */
  struct socket socket;
  struct inode vfs_inode;  /* the inode handed back to the VFS */
};
 
static inline struct socket *SOCKET_I(struct inode *inode)  /* map an inode embedded in a struct socket_alloc to the socket member of the same object */
{
  return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

可以看到這個函數其實就是sock_alloc_inode,該函數分配了一個struct socket_alloc類型的結構體,然後返回這個結構體中的一個成員變量vfs_inode的地址,可以看出來這就是一個inode結構。然後就回到了sock_alloc函數的第489行,通過SOCKET_I獲得與vfs_inode同在socket_alloc結構體中的成員socket的地址。然後程序返回到__sock_create的1190行。

1192開始的代碼說明,如果編譯內核開啓了CONFIG_MODULES也就是內核模塊的選項就先檢查內核現在是否有支持由family(就是domain)所指定的網域的代碼,如果沒有則通過request_module來安裝。

說到這裏就先看看1204行的net_families這個數組,很明顯它是控制和操作各個網域的一個控制結構體的集合,通過變量pf可以發現它的類型爲struct net_proto_family(/include/linux/net.h)

C
201
202
203
204
205
struct net_proto_family {  /* one registered address family; __sock_create() looks these up in net_families[] */
  int    family;
  int    (*create)(struct net *net, struct socket *sock, int protocol);  /* called by __sock_create() to set up a new socket */
  struct module  *owner;  /* pinned with try_module_get() around the ->create call */
};

然後1219行通過pf調用相應網域的create的函數,可以很簡單地得出對於AF_UNIX, AF_INET, AF_INET6, AF_PACKET這些所對應的create函數肯定不一樣。接下來我們以AF_INET爲例說明。在/net/ipv4/af_inet.c中

C
934
935
936
937
938
static struct net_proto_family inet_family_ops = {  /* PF_INET family entry, registered by inet_init() via sock_register() */
  .family = PF_INET,
  .create = inet_create,  /* reached from __sock_create() through pf->create */
  .owner  = THIS_MODULE,
};

由936行可以得出對於AF_INET其create函數爲inet_create,定義於同一文件中。

C
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
static int inet_create(struct net *net, struct socket *sock, int protocol)  /* ->create of inet_family_ops: set up sock for PF_INET and protocol; returns 0 or a negative errno */
{
  struct sock *sk;
  struct inet_protosw *answer;  /* inetsw entry being examined / finally matched */
  struct inet_sock *inet;
  struct proto *answer_prot;  /* answer_* hold copies taken from the entry before the RCU section ends */
  unsigned char answer_flags;
  char answer_no_check;
  int try_loading_module = 0;  /* number of request_module() attempts so far (at most 2) */
  int err;
 
  if (unlikely(!inet_ehash_secret))
    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
      build_ehash_secret();
 
  sock->state = SS_UNCONNECTED;
 
  /* Look for the requested type/protocol pair. */
lookup_protocol:
  err = -ESOCKTNOSUPPORT;  /* result when the list for this type is empty */
  rcu_read_lock();
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
    err = 0;  /* assume a match; overwritten at the bottom of the loop body otherwise */
    /* Check the non-wild match. */
    if (protocol == answer->protocol) {
      if (protocol != IPPROTO_IP)
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) {
        protocol = answer->protocol;  /* caller passed the wildcard: adopt this entry's protocol */
        break;
      }
      if (IPPROTO_IP == answer->protocol)
        break;
    }
    err = -EPROTONOSUPPORT;  /* this entry did not match */
  }
 
  if (unlikely(err)) {
    if (try_loading_module < 2) {
      rcu_read_unlock();
      /*
       * Be more specific, e.g. net-pf-2-proto-132-type-1
       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
       */
      if (++try_loading_module == 1)
        request_module("net-pf-%d-proto-%d-type-%d",
                 PF_INET, protocol, sock->type);
      /*
       * Fall back to generic, e.g. net-pf-2-proto-132
       * (net-pf-PF_INET-proto-IPPROTO_SCTP)
       */
      else
        request_module("net-pf-%d-proto-%d",
                 PF_INET, protocol);
      goto lookup_protocol;  /* repeat the lookup after the module request */
    } else
      goto out_rcu_unlock;  /* both attempts used up: give up with err */
  }
 
  err = -EPERM;
  if (answer->capability > 0 && !capable(answer->capability))
    goto out_rcu_unlock;
 
  err = -EAFNOSUPPORT;
  if (!inet_netns_ok(net, protocol))
    goto out_rcu_unlock;
 
  sock->ops = answer->ops;  /* take what is needed from the entry before rcu_read_unlock() */
  answer_prot = answer->prot;
  answer_no_check = answer->no_check;
  answer_flags = answer->flags;
  rcu_read_unlock();
 
  WARN_ON(answer_prot->slab == NULL);
 
  err = -ENOBUFS;
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;
 
  err = 0;
  sk->sk_no_check = answer_no_check;
  if (INET_PROTOSW_REUSE & answer_flags)
    sk->sk_reuse = 1;
 
  inet = inet_sk(sk);
  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
  if (SOCK_RAW == sock->type) {  /* raw sockets record the protocol number in inet->num */
    inet->num = protocol;
    if (IPPROTO_RAW == protocol)
      inet->hdrincl = 1;
  }
 
  if (ipv4_config.no_pmtu_disc)
    inet->pmtudisc = IP_PMTUDISC_DONT;
  else
    inet->pmtudisc = IP_PMTUDISC_WANT;
 
  inet->id = 0;
 
  sock_init_data(sock, sk);
 
  sk->sk_destruct     = inet_sock_destruct;
  sk->sk_protocol     = protocol;
  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
 
  inet->uc_ttl  = -1;
  inet->mc_loop  = 1;
  inet->mc_ttl  = 1;
  inet->mc_all  = 1;
  inet->mc_index  = 0;
  inet->mc_list  = NULL;
 
  sk_refcnt_debug_inc(sk);
 
  if (inet->num) {
    /* It assumes that any protocol which allows
     * the user to assign a number at socket
     * creation time automatically
     * shares.
     */
    inet->sport = htons(inet->num);
    /* Add to protocol hash chains. */
    sk->sk_prot->hash(sk);
  }
 
  if (sk->sk_prot->init) {  /* optional protocol-specific initialisation */
    err = sk->sk_prot->init(sk);
    if (err)
      sk_common_release(sk);
  }
out:
  return err;
out_rcu_unlock:  /* error exits taken while the RCU read lock is still held */
  rcu_read_unlock();
  goto out;
}

第283到325行就是通過type和protocol從inetsw中找出對應的struct inet_protosw結構體。inetsw定義於net/ipv4/af_inet.c中:

C
120
121
122
123
124
/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 */
static struct list_head inetsw[SOCK_MAX];  /* one list of struct inet_protosw per socket type, indexed by type */
static DEFINE_SPINLOCK(inetsw_lock);  /* taken by inet_register_protosw() while it modifies inetsw */

而對於struct inet_protosw是在/include/net/protocol.h中定義

C
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
  struct list_head list;  /* links the entry into inetsw[type] */
 
        /* These two fields form the lookup key.  */
  unsigned short   type;     /* This is the 2nd argument to socket(2). */
  unsigned short   protocol; /* This is the L4 protocol number.  */
 
  struct proto   *prot;  /* passed to sk_alloc() by inet_create() */
  const struct proto_ops *ops;  /* becomes sock->ops in inet_create() */
 
  int              capability; /* Which (if any) capability do
              * we need to use this socket
              * interface?
                                      */
  char             no_check;   /* checksum on rcv/xmit/none? */
  unsigned char   flags;      /* See INET_PROTOSW_* below.  */
};

inetsw其實就是Linux內核中典型的以鏈表組織的數組,按type索引。inetsw中的表項是通過inet_register_protosw註冊的:

C
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
void inet_register_protosw(struct inet_protosw *p)  /* insert p into inetsw[p->type]; refuses (with a log message) an invalid type or a clash with a permanent entry */
{
  struct list_head *lh;
  struct inet_protosw *answer;
  int protocol = p->protocol;
  struct list_head *last_perm;  /* list node after which p will be inserted */
 
  spin_lock_bh(&inetsw_lock);
 
  if (p->type >= SOCK_MAX)
    goto out_illegal;
 
  /* If we are trying to override a permanent protocol, bail. */
  answer = NULL;
  last_perm = &inetsw[p->type];  /* defaults to the list head when no permanent entry exists */
  list_for_each(lh, &inetsw[p->type]) {
    answer = list_entry(lh, struct inet_protosw, list);
 
    /* Check only the non-wild match. */
    if (INET_PROTOSW_PERMANENT & answer->flags) {
      if (protocol == answer->protocol)
        break;  /* leaves answer non-NULL: conflict with a permanent entry */
      last_perm = lh;
    }
 
    answer = NULL;  /* this entry is not a conflict */
  }
  if (answer)
    goto out_permanent;
 
  /* Add the new entry after the last permanent entry if any, so that
   * the new entry does not override a permanent entry when matched with
   * a wild-card protocol. But it is allowed to override any existing
   * non-permanent entry.  This means that when we remove this entry, the
   * system automatically returns to the old behavior.
   */
  list_add_rcu(&p->list, last_perm);
out:
  spin_unlock_bh(&inetsw_lock);  /* every path, including the error ones, unlocks here */
 
  return;
 
out_permanent:
  printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
         protocol);
  goto out;
 
out_illegal:
  printk(KERN_ERR
         "Ignoring attempt to register invalid socket type %d.\n",
         p->type);
  goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

對於inet_register_protosw的調用是在inet_init中的第1593行進行的。

C
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
static int __init inet_init(void)  /* PF_INET initialisation: registers protos, the address family and the inetsw entries, then sets up the sub-modules; hooked in via fs_initcall() below */
{
  struct sk_buff *dummy_skb;  /* only used inside sizeof() in the build-time check below */
  struct inet_protosw *q;
  struct list_head *r;
  int rc = -EINVAL;
 
  BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
  rc = proto_register(&tcp_prot, 1);
  if (rc)
    goto out;
 
  rc = proto_register(&udp_prot, 1);
  if (rc)
    goto out_unregister_tcp_proto;
 
  rc = proto_register(&raw_prot, 1);
  if (rc)
    goto out_unregister_udp_proto;
 
  /*
   *  Tell SOCKET that we are alive...
   */
 
  (void)sock_register(&inet_family_ops);  /* return value explicitly discarded */
 
#ifdef CONFIG_SYSCTL
  ip_static_sysctl_init();
#endif
 
  /*
   *  Add all the base protocols.
   */
 
  if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
  if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
  if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
#ifdef CONFIG_IP_MULTICAST
  if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif
 
  /* Register the socket-side information for inet_create. */
  for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)  /* make every per-type list head an empty list */
    INIT_LIST_HEAD(r);
 
  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)  /* then register each static inetsw_array[] entry */
    inet_register_protosw(q);
 
  /*
   *  Set the ARP module up
   */
 
  arp_init();
 
  /*
   *  Set the IP module up
   */
 
  ip_init();
 
  tcp_v4_init();
 
  /* Setup TCP slab cache for open requests. */
  tcp_init();
 
  /* Setup UDP memory threshold */
  udp_init();
 
  /* Add UDP-Lite (RFC 3828) */
  udplite4_register();
 
  /*
   *  Set the ICMP layer up
   */
 
  if (icmp_init() < 0)
    panic("Failed to create the ICMP control socket.\n");
 
  /*
   *  Initialise the multicast router
   */
#if defined(CONFIG_IP_MROUTE)
  if (ip_mr_init())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
  /*
   *  Initialise per-cpu ipv4 mibs
   */
 
  if (init_ipv4_mibs())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
 
  ipv4_proc_init();
 
  ipfrag_init();
 
  dev_add_pack(&ip_packet_type);
 
  rc = 0;
out:
  return rc;
out_unregister_udp_proto:  /* unwind the proto_register() calls in reverse order */
  proto_unregister(&udp_prot);
out_unregister_tcp_proto:
  proto_unregister(&tcp_prot);
  goto out;
}
 
fs_initcall(inet_init);

從1592行可以看出初始化inetsw是用的inetsw_array數組,再看看inetsw_array數組。

C
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
const struct proto_ops inet_stream_ops = {  /* operations used by the SOCK_STREAM / IPPROTO_TCP entry of inetsw_array */
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_stream_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = inet_accept,
  .getname     = inet_getname,
  .poll       = tcp_poll,
  .ioctl       = inet_ioctl,
  .listen       = inet_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = tcp_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = tcp_sendpage,
  .splice_read     = tcp_splice_read,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
 
const struct proto_ops inet_dgram_ops = {  /* operations used by the SOCK_DGRAM / IPPROTO_UDP entry of inetsw_array */
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,  /* unlike inet_stream_ops, accept and listen use the sock_no_* handlers */
  .getname     = inet_getname,
  .poll       = udp_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
 
/*
 * Socket-layer operation table for SOCK_RAW sockets (referenced by the
 * SOCK_RAW entry of inetsw_array). It should be the same as inet_dgram_ops
 * except for .poll, which uses the generic datagram_poll instead of
 * udp_poll. Unlike the other two tables it is file-local (static) and is
 * not exported.
 */
static const struct proto_ops inet_sockraw_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = datagram_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  /* 32-bit compatibility entry points, only built with CONFIG_COMPAT */
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
 
/*
 * Protocol-family descriptor for PF_INET: names inet_create as the
 * family's socket-creation callback.
 */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};
 
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * Each entry ties a (socket type, protocol) pair to the transport-level
 * struct proto (.prot) and the socket-level struct proto_ops (.ops) that
 * sockets of that kind will use.
 */
static struct inet_protosw inetsw_array[] =
{
  /* TCP: stream sockets */
  {
    .type =       SOCK_STREAM,
    .protocol =   IPPROTO_TCP,
    .prot =       &tcp_prot,
    .ops =        &inet_stream_ops,
    .capability = -1,
    .no_check =   0,
    .flags =      INET_PROTOSW_PERMANENT |
            INET_PROTOSW_ICSK,
  },
 
  /* UDP: datagram sockets */
  {
    .type =       SOCK_DGRAM,
    .protocol =   IPPROTO_UDP,
    .prot =       &udp_prot,
    .ops =        &inet_dgram_ops,
    .capability = -1,
    .no_check =   UDP_CSUM_DEFAULT,
    .flags =      INET_PROTOSW_PERMANENT,
       },
 
 
       /*
        * Raw sockets. NOTE(review): .capability presumably names the
        * capability a caller needs to create this socket type (-1 meaning
        * none) -- confirm against inet_create.
        */
       {
         .type =       SOCK_RAW,
         .protocol =   IPPROTO_IP,  /* wild card */
         .prot =       &raw_prot,
         .ops =        &inet_sockraw_ops,
         .capability = CAP_NET_RAW,
         .no_check =   UDP_CSUM_DEFAULT,
         .flags =      INET_PROTOSW_REUSE,
       }
};
 
/* Number of entries in inetsw_array[] */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

假設我們分析ipv4中的TCP協議,其它協議也可以參照分析。現在回到inet_create函數,這個函數最重要的一行就是335,這一行的作用就是初始化套接口socket所應該對應的操作函數。例如如果用socket(AF_INET, SOCK_STREAM, 0);創建套接字,則內核就會在這裏爲這個套接字關聯上相應的TCP的操作函數集inet_stream_ops,以後在這個套接字上的數據的各種操作如accept listen bind send recv都會通過這些函數完成。
接下來在inet_create中的344後就是分配一個struct sock結構體,這個sock結構和socket結構是一一對應的,兩個結構各有一個成員指向對方。struct sock是在include/net/sock.h中定義,它有兩個非常重要的成員sk_receive_queue和sk_write_queue。還有兩個成員sk_rcvbuf,sk_sndbuf分別代表接收和發送緩衝區的大小,默認是32767字節,是在sock_init_data(net/core/sock.c)中初始化的。另外對於有連接模式可能要求超時重傳,所以還有一個sk_timer的定時隊列。

C
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
/**
  *  struct sock - network layer representation of sockets
  *  @__sk_common: shared layout with inet_timewait_sock
  *  @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *  @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *  @sk_lock:  synchronizer
  *  @sk_rcvbuf: size of receive buffer in bytes
  *  @sk_sleep: sock wait queue
  *  @sk_dst_cache: destination cache
  *  @sk_dst_lock: destination cache lock
  *  @sk_policy: flow policy
  *  @sk_rmem_alloc: receive queue bytes committed
  *  @sk_receive_queue: incoming packets
  *  @sk_wmem_alloc: transmit queue bytes committed
  *  @sk_write_queue: Packet sending queue
  *  @sk_async_wait_queue: DMA copied packets
  *  @sk_omem_alloc: "o" is "option" or "other"
  *  @sk_wmem_queued: persistent queue size
  *  @sk_forward_alloc: space allocated forward
  *  @sk_allocation: allocation mode
  *  @sk_sndbuf: size of send buffer in bytes
  *  @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *       %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *  @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets
  *  @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *  @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *  @sk_gso_max_size: Maximum GSO segment size to build
  *  @sk_lingertime: %SO_LINGER l_linger setting
  *  @sk_backlog: always used with the per-socket spinlock held
  *  @sk_callback_lock: used with the callbacks in the end of this struct
  *  @sk_error_queue: rarely used
  *  @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *        IPV6_ADDRFORM for instance)
  *  @sk_err: last error
  *  @sk_err_soft: errors that don't cause failure but are the cause of a
  *          persistent failure not just 'timed out'
  *  @sk_drops: raw/udp drops counter
  *  @sk_ack_backlog: current listen backlog
  *  @sk_max_ack_backlog: listen backlog set in listen()
  *  @sk_priority: %SO_PRIORITY setting
  *  @sk_type: socket type (%SOCK_STREAM, etc)
  *  @sk_protocol: which protocol this socket belongs in this network family
  *  @sk_peercred: %SO_PEERCRED setting
  *  @sk_rcvlowat: %SO_RCVLOWAT setting
  *  @sk_rcvtimeo: %SO_RCVTIMEO setting
  *  @sk_sndtimeo: %SO_SNDTIMEO setting
  *  @sk_filter: socket filtering instructions
  *  @sk_protinfo: private area, net family specific, when not using slab
  *  @sk_timer: sock cleanup timer
  *  @sk_stamp: time stamp of last packet received
  *  @sk_socket: Identd and reporting IO signals
  *  @sk_user_data: RPC layer private data
  *  @sk_sndmsg_page: cached page for sendmsg
  *  @sk_sndmsg_off: cached offset for sendmsg
  *  @sk_send_head: front of stuff to transmit
  *  @sk_security: used by security modules
  *  @sk_mark: generic packet mark
  *  @sk_write_pending: a write to stream socket waits to start
  *  @sk_state_change: callback to indicate change in the state of the sock
  *  @sk_data_ready: callback to indicate there is data to be processed
  *  @sk_write_space: callback to indicate there is bf sending space available
  *  @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *  @sk_backlog_rcv: callback to process the backlog
  *  @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 */
struct sock {
  /*
   * Now struct inet_timewait_sock also uses sock_common, so please just
   * don't add nothing before this first member (__sk_common) --acme
   */
  struct sock_common  __sk_common;
  /*
   * Shorthand names: each sk_* macro below expands to the matching
   * skc_* field inside __sk_common (note sk_copy_start and sk_hash both
   * alias skc_hash).
   */
#define sk_node      __sk_common.skc_node
#define sk_nulls_node    __sk_common.skc_nulls_node
#define sk_refcnt    __sk_common.skc_refcnt
 
#define sk_copy_start    __sk_common.skc_hash
#define sk_hash      __sk_common.skc_hash
#define sk_family    __sk_common.skc_family
#define sk_state    __sk_common.skc_state
#define sk_reuse    __sk_common.skc_reuse
#define sk_bound_dev_if    __sk_common.skc_bound_dev_if
#define sk_bind_node    __sk_common.skc_bind_node
#define sk_prot      __sk_common.skc_prot
#define sk_net      __sk_common.skc_net
  /* bitfield group named "flags"; the widths below add up to 32 bits */
  kmemcheck_bitfield_begin(flags);
  unsigned int    sk_shutdown  : 2,
        sk_no_check  : 2,
        sk_userlocks : 4,
        sk_protocol  : 8,
        sk_type      : 16;
  kmemcheck_bitfield_end(flags);
  int      sk_rcvbuf;
  socket_lock_t    sk_lock;
  /*
   * The backlog queue is special, it is always used with
   * the per-socket spinlock held and requires low latency
   * access. Therefore we special case its implementation.
   */
  struct {
    struct sk_buff *head;
    struct sk_buff *tail;
  } sk_backlog;
  wait_queue_head_t  *sk_sleep;
  struct dst_entry  *sk_dst_cache;
#ifdef CONFIG_XFRM
  struct xfrm_policy  *sk_policy[2];
#endif
  rwlock_t    sk_dst_lock;
  atomic_t    sk_rmem_alloc;
  atomic_t    sk_wmem_alloc;
  atomic_t    sk_omem_alloc;
  int      sk_sndbuf;
  struct sk_buff_head  sk_receive_queue;
  struct sk_buff_head  sk_write_queue;
#ifdef CONFIG_NET_DMA
  struct sk_buff_head  sk_async_wait_queue;
#endif
  int      sk_wmem_queued;
  int      sk_forward_alloc;
  gfp_t      sk_allocation;
  int      sk_route_caps;
  int      sk_gso_type;
  unsigned int    sk_gso_max_size;
  int      sk_rcvlowat;
  unsigned long     sk_flags;
  unsigned long          sk_lingertime;
  struct sk_buff_head  sk_error_queue;
  struct proto    *sk_prot_creator;
  rwlock_t    sk_callback_lock;
  int      sk_err,
        sk_err_soft;
  atomic_t    sk_drops;
  unsigned short    sk_ack_backlog;
  unsigned short    sk_max_ack_backlog;
  __u32      sk_priority;
  struct ucred    sk_peercred;
  long      sk_rcvtimeo;
  long      sk_sndtimeo;
  struct sk_filter        *sk_filter;
  void      *sk_protinfo;
  struct timer_list  sk_timer;
  ktime_t      sk_stamp;
  struct socket    *sk_socket;
  void      *sk_user_data;
  struct page    *sk_sndmsg_page;
  struct sk_buff    *sk_send_head;
  __u32      sk_sndmsg_off;
  int      sk_write_pending;
#ifdef CONFIG_SECURITY
  void      *sk_security;
#endif
  __u32      sk_mark;
  /* XXX 4 bytes hole on 64 bit */
  /* Callbacks; sk_callback_lock above is documented as guarding their use */
  void      (*sk_state_change)(struct sock *sk);
  void      (*sk_data_ready)(struct sock *sk, int bytes);
  void      (*sk_write_space)(struct sock *sk);
  void      (*sk_error_report)(struct sock *sk);
    int      (*sk_backlog_rcv)(struct sock *sk,
              struct sk_buff *skb);
  void                    (*sk_destruct)(struct sock *sk);
};

在分析sk_alloc之前先分析一下answer_prot. answer_prot是struct proto類型(include/net/sock.h)

C
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 * transport -> network interface is defined by struct inet_proto
 *
 * One instance describes one transport protocol (tcp_prot is an example):
 * a set of per-protocol callbacks plus accounting pointers and the
 * allocation parameters (slab, obj_size) that sk_prot_alloc() consumes.
 */
struct proto {
  /* Per-protocol implementations of the socket operations */
  void      (*close)(struct sock *sk,
          long timeout);
  int      (*connect)(struct sock *sk,
                struct sockaddr *uaddr,
          int addr_len);
  int      (*disconnect)(struct sock *sk, int flags);
 
  struct sock *    (*accept) (struct sock *sk, int flags, int *err);
 
  int      (*ioctl)(struct sock *sk, int cmd,
           unsigned long arg);
  int      (*init)(struct sock *sk);
  void      (*destroy)(struct sock *sk);
  void      (*shutdown)(struct sock *sk, int how);
  int      (*setsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*getsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          int __user *option);
#ifdef CONFIG_COMPAT
  /* 32-bit compatibility variants, only present with CONFIG_COMPAT */
  int      (*compat_setsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*compat_getsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          int __user *option);
#endif
  int      (*sendmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg, size_t len);
  int      (*recvmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg,
          size_t len, int noblock, int flags,
          int *addr_len);
  int      (*sendpage)(struct sock *sk, struct page *page,
          int offset, size_t size, int flags);
  int      (*bind)(struct sock *sk,
          struct sockaddr *uaddr, int addr_len);
 
  int      (*backlog_rcv) (struct sock *sk,
            struct sk_buff *skb);
 
  /* Keeping track of sk's, looking them up, and port selection methods. */
  void      (*hash)(struct sock *sk);
  void      (*unhash)(struct sock *sk);
  int      (*get_port)(struct sock *sk, unsigned short snum);
 
  /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
  unsigned int    inuse_idx;
#endif
 
  /* Memory pressure */
  void      (*enter_memory_pressure)(struct sock *sk);
  atomic_t    *memory_allocated;  /* Current allocated memory. */
  struct percpu_counter  *sockets_allocated;  /* Current number of sockets. */
  /*
   * Pressure flag: try to collapse.
   * Technical note: it is used by multiple contexts non atomically.
   * All the __sk_mem_schedule() is of this nature: accounting
   * is strict, actions are advisory and have some latency.
   */
  int      *memory_pressure;
  int      *sysctl_mem;
  int      *sysctl_wmem;
  int      *sysctl_rmem;
  int      max_header;
 
  /*
   * Allocation parameters read by sk_prot_alloc(): when slab is non-NULL
   * the sock comes from that kmem cache, otherwise obj_size bytes are
   * kmalloc'ed. obj_size is the full per-protocol object size (tcp_prot
   * sets it to sizeof(struct tcp_sock)).
   */
  struct kmem_cache  *slab;
  unsigned int    obj_size;
  int      slab_flags;
 
  struct percpu_counter  *orphan_count;
 
  struct request_sock_ops  *rsk_prot;
  struct timewait_sock_ops *twsk_prot;
 
  /* Protocol-specific lookup table; which member is valid depends on the protocol */
  union {
    struct inet_hashinfo  *hashinfo;
    struct udp_table  *udp_table;
    struct raw_hashinfo  *raw_hash;
  } h;
 
  /* Module providing this protocol; sk_prot_alloc() takes a reference on it */
  struct module    *owner;
 
  char      name[32];
 
  struct list_head  node;
#ifdef SOCK_REFCNT_DEBUG
  atomic_t    socks;
#endif
};

假設分析的是TCP協議,則通過336行的賦值從inetsw_array找到其prot成員變量爲tcp_prot(net/ipv4/tcp_ipv4.c)。

C
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
/*
 * Transport-level protocol block for TCP; inetsw_array references it as the
 * .prot of the SOCK_STREAM / IPPROTO_TCP entry.
 *
 * obj_size is sizeof(struct tcp_sock), so every sock allocated for this
 * protocol is large enough to hold the full TCP state, not just a bare
 * struct sock.
 *
 * .slab is not set in this static initializer and therefore starts out NULL.
 * NOTE(review): it is presumably filled in when the protocol is registered
 * (proto_register) -- not visible here, confirm against the registration code.
 */
struct proto tcp_prot = {
  .name      = "TCP",
  .owner      = THIS_MODULE,
  .close      = tcp_close,
  .connect    = tcp_v4_connect,
  .disconnect    = tcp_disconnect,
  .accept      = inet_csk_accept,
  .ioctl      = tcp_ioctl,
  .init      = tcp_v4_init_sock,
  .destroy    = tcp_v4_destroy_sock,
  .shutdown    = tcp_shutdown,
  .setsockopt    = tcp_setsockopt,
  .getsockopt    = tcp_getsockopt,
  .recvmsg    = tcp_recvmsg,
  .backlog_rcv    = tcp_v4_do_rcv,
  .hash      = inet_hash,
  .unhash      = inet_unhash,
  .get_port    = inet_csk_get_port,
  .enter_memory_pressure  = tcp_enter_memory_pressure,
  .sockets_allocated  = &tcp_sockets_allocated,
  .orphan_count    = &tcp_orphan_count,
  .memory_allocated  = &tcp_memory_allocated,
  .memory_pressure  = &tcp_memory_pressure,
  .sysctl_mem    = sysctl_tcp_mem,
  .sysctl_wmem    = sysctl_tcp_wmem,
  .sysctl_rmem    = sysctl_tcp_rmem,
  .max_header    = MAX_TCP_HEADER,
  .obj_size    = sizeof(struct tcp_sock),
  .slab_flags    = SLAB_DESTROY_BY_RCU,
  .twsk_prot    = &tcp_timewait_sock_ops,
  .rsk_prot    = &tcp_request_sock_ops,
  .h.hashinfo    = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
  .compat_setsockopt  = compat_tcp_setsockopt,
  .compat_getsockopt  = compat_tcp_getsockopt,
#endif
};

通過tcp_prot結構體中對各成員的賦值可以發現slab成員並沒有被初始化,而obj_size被初始化爲sizeof(struct tcp_sock),這一點在後面的分析中會用到。接下來看inet_create的344行,即sk_alloc(net/core/sock.c)。

C
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
/*
 * Allocate the memory for one sock of protocol @prot.
 *
 * The object comes from prot->slab when the protocol has a cache, otherwise
 * from kmalloc(prot->obj_size). After a successful allocation the security
 * hook is run and a reference on prot->owner is taken; if either step fails
 * the object is released again.
 *
 * Returns the new sock, or NULL on any failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
    int family)
{
  struct kmem_cache *cache = prot->slab;
  struct sock *sk;

  if (cache == NULL) {
    sk = kmalloc(prot->obj_size, priority);
  } else {
    /* zeroing, if requested, is done by hand below */
    sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
    if (sk != NULL && (priority & __GFP_ZERO)) {
      const size_t before_next = offsetof(struct sock, sk_node.next);
      const size_t from_pprev = offsetof(struct sock, sk_node.pprev);

      /*
       * Caches using SLAB_DESTROY_BY_RCU must leave sk_node.next
       * untouched, so clear everything in front of it and
       * everything from sk_node.pprev onwards, skipping the
       * pointer itself.
       */
      if (before_next != 0)
        memset(sk, 0, before_next);
      memset(&sk->sk_node.pprev, 0, prot->obj_size - from_pprev);
    }
  }

  if (sk == NULL)
    return NULL;

  kmemcheck_annotate_bitfield(sk, flags);

  if (security_sk_alloc(sk, family, priority))
    goto release;

  if (!try_module_get(prot->owner)) {
    /* undo the security allocation before freeing the object */
    security_sk_free(sk);
    goto release;
  }

  return sk;

release:
  /* give the object back to whichever allocator produced it */
  if (cache == NULL)
    kfree(sk);
  else
    kmem_cache_free(cache, sk);
  return NULL;
}
C
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
/**
 *  sk_alloc - allocate and minimally initialise a socket object
 *  @net: the net namespace the sock will belong to
 *  @family: protocol family, stored in sk_family
 *  @priority: allocation flags (%GFP_KERNEL, %GFP_ATOMIC, etc);
 *             %__GFP_ZERO is always added
 *  @prot: struct proto the new sock is bound to
 *
 *  Returns the new sock, or NULL if sk_prot_alloc() failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
          struct proto *prot)
{
  struct sock *newsk;

  newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
  if (!newsk)
    return NULL;

  newsk->sk_family = family;
  /*
   * The creating proto is recorded separately from the current one;
   * see the sk_prot_creator note in the struct sock documentation.
   */
  newsk->sk_prot_creator = prot;
  newsk->sk_prot = prot;
  sock_lock_init(newsk);
  sock_net_set(newsk, get_net(net));
  /* sk_wmem_alloc starts out at 1, not 0 */
  atomic_set(&newsk->sk_wmem_alloc, 1);

  return newsk;
}
EXPORT_SYMBOL(sk_alloc);

很明顯在sk_alloc中直接調用sk_prot_alloc來分配sock結構,在sk_prot_alloc中先判定slab是否爲空(如前提示)。tcp_prot的靜態初始化中並未設置slab(注:實際運行時inet_init會調用proto_register(&tcp_prot, 1)爲其創建以obj_size爲對象大小的slab緩存,因此真正走的是kmem_cache_alloc分支),但無論走哪個分支,分配的都是obj_size大小即sizeof(struct tcp_sock)的空間,並返回類型爲struct sock *的地址。既然返回的是struct sock *,而該空間的大小卻是sizeof(struct tcp_sock),那就說明有兩種可能:一、sizeof(struct tcp_sock) == sizeof(struct sock);二、sizeof(struct tcp_sock) > sizeof(struct sock),且struct sock位於struct tcp_sock的起始位置。通過分析實際是第二種情況,通過列出一系列數據結構可以很明顯地看出。
先來看struct tcp_sock結構的定義(include/linux/tcp.h)

C
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
/*
 * Per-socket TCP state. The first member is a struct inet_connection_sock,
 * so a pointer to a tcp_sock is also a valid pointer to that embedded
 * object. tcp_prot.obj_size is sizeof(struct tcp_sock), i.e. this is the
 * object size allocated for a TCP socket.
 */
struct tcp_sock {
  /* inet_connection_sock has to be the first member of tcp_sock */
  struct inet_connection_sock  inet_conn;
  u16  tcp_header_len;  /* Bytes of tcp header to send    */
  u16  xmit_size_goal_segs; /* Goal for segmenting output packets */
 
/*
 *  Header prediction flags
 *  0x5?10 << 16 + snd_wnd in net byte order
 */
  __be32  pred_flags;
 
/*
 *  RFC793 variables by their proper names. This means you can
 *  read the code and the spec side by side (and laugh ...)
 *  See RFC793 and RFC1122. The RFC writes these in capitals.
 */
   u32  rcv_nxt;  /* What we want to receive next   */
  u32  copied_seq;  /* Head of yet unread data    */
  u32  rcv_wup;  /* rcv_nxt on last window update sent  */
   u32  snd_nxt;  /* Next sequence we send    */
 
   u32  snd_una;  /* First byte we want an ack for  */
   u32  snd_sml;  /* Last byte of the most recently transmitted small packet */
  u32  rcv_tstamp;  /* timestamp of last received ACK (for keepalives) */
  u32  lsndtime;  /* timestamp of last sent data packet (for restart window) */
 
  /* Data for direct copy to user */
  struct {
    struct sk_buff_head  prequeue;
    struct task_struct  *task;
    struct iovec    *iov;
    int      memory;
    int      len;
#ifdef CONFIG_NET_DMA
    /* members for async copy */
    struct dma_chan    *dma_chan;
    int      wakeup;
    struct dma_pinned_list  *pinned_list;
    dma_cookie_t    dma_cookie;
#endif
  } ucopy;
 
  u32  snd_wl1;  /* Sequence for window update    */
  u32  snd_wnd;  /* The window we expect to receive  */
  u32  max_window;  /* Maximal window ever seen from peer  */
  u32  mss_cache;  /* Cached effective mss, not including SACKS */
 
  u32  window_clamp;  /* Maximal window to advertise    */
  u32  rcv_ssthresh;  /* Current window clamp      */
 
  u32  frto_highmark;  /* snd_nxt when RTO occurred */
  u16  advmss;    /* Advertised MSS      */
  u8  frto_counter;  /* Number of new acks after RTO */
  u8  nonagle;  /* Disable Nagle algorithm?             */
 
/* RTT measurement */
  u32  srtt;    /* smoothed round trip time << 3  */
  u32  mdev;    /* medium deviation      */
  u32  mdev_max;  /* maximal mdev for the last rtt period  */
  u32  rttvar;    /* smoothed mdev_max      */
  u32  rtt_seq;  /* sequence number to update rttvar  */
 
  u32  packets_out;  /* Packets which are "in flight"  */
  u32  retrans_out;  /* Retransmitted packets out    */
 
  u16  urg_data;  /* Saved octet of OOB data and control flags */
  u8  ecn_flags;  /* ECN status bits.      */
  u8  reordering;  /* Packet reordering metric.    */
  u32  snd_up;    /* Urgent pointer    */
 
  u8  keepalive_probes; /* num of allowed keep alive probes  */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
  struct tcp_options_received rx_opt;
 
/*
 *  Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
   u32  snd_ssthresh;  /* Slow start size threshold    */
   u32  snd_cwnd;  /* Sending congestion window    */
  u32  snd_cwnd_cnt;  /* Linear increase counter    */
  u32  snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
  u32  snd_cwnd_used;
  u32  snd_cwnd_stamp;
 
   u32  rcv_wnd;  /* Current receiver window    */
  u32  write_seq;  /* Tail(+1) of data held in tcp send buffer */
  u32  pushed_seq;  /* Last pushed seq, required to talk to windows */
  u32  lost_out;  /* Lost packets      */
  u32  sacked_out;  /* SACK'd packets      */
  u32  fackets_out;  /* FACK'd packets      */
  u32  tso_deferred;
  u32  bytes_acked;  /* Appropriate Byte Counting - RFC3465 */
 
  /* from STCP, retrans queue hinting */
  struct sk_buff* lost_skb_hint;
  struct sk_buff *scoreboard_skb_hint;
  struct sk_buff *retransmit_skb_hint;
 
  struct sk_buff_head  out_of_order_queue; /* Out of order segments go here */
 
  /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */
  struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
  struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
  struct tcp_sack_block recv_sack_cache[4];
 
  struct sk_buff *highest_sack;   /* highest skb with SACK received
           * (validity guaranteed only if
           * sacked_out > 0)
           */
 
  int     lost_cnt_hint;
  u32     retransmit_high;  /* L-bits may be on up to this seqno */
 
  u32  lost_retrans_low;  /* Sent seq after any rxmit (lowest) */
 
  u32  prior_ssthresh; /* ssthresh saved at recovery start  */
  u32  high_seq;  /* snd_nxt at onset of congestion  */
 
  u32  retrans_stamp;  /* Timestamp of the last retransmit,
         * also used in SYN-SENT to remember stamp of
         * the first SYN. */
  u32  undo_marker;  /* tracking retrans started here. */
  int  undo_retrans;  /* number of undoable retransmissions. */
  u32  total_retrans;  /* Total retransmits for entire connection */
 
  u32  urg_seq;  /* Seq of received urgent pointer */
  unsigned int    keepalive_time;    /* time before keep alive takes place */
  unsigned int    keepalive_intvl;  /* time interval between keep alive probes */
 
  int      linger2;
 
/* Receiver side RTT estimation */
  struct {
    u32  rtt;
    u32  seq;
    u32  time;
  } rcv_rtt_est;
 
/* Receiver queue space */
  struct {
    int  space;
    u32  seq;
    u32  time;
  } rcvq_space;
 
/* TCP-specific MTU probe information. */
  struct {
    u32      probe_seq_start;
    u32      probe_seq_end;
  } mtu_probe;
 
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
  const struct tcp_sock_af_ops  *af_specific;
 
/* TCP MD5 Signature Option information */
  struct tcp_md5sig_info  *md5sig_info;
#endif
};

在tcp_sock的結構體的第一個成員變量類型爲struct inet_connection_sock(include/net/inet_connection_sock.h)

C
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/** inet_connection_sock - INET connection oriented sock
 *
 * The embedded struct inet_sock must stay the first member so that a
 * pointer to this structure is also a valid pointer to the inet_sock.
 *
 * @icsk_accept_queue:     FIFO of established children
 * @icsk_bind_hash:        Bind node
 * @icsk_timeout:          Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:              Retransmit timeout
 * @icsk_pmtu_cookie:      Last pmtu seen by socket
 * @icsk_ca_ops:           Pluggable congestion control hook
 * @icsk_af_ops:           Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:         Congestion control state
 * @icsk_retransmits:      Number of unrecovered [RTO] timeouts
 * @icsk_pending:          Scheduled timer event
 * @icsk_backoff:          Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:       unanswered 0 window probes
 * @icsk_ext_hdr_len:      Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:              Delayed ACK control data
 * @icsk_mtup:             MTU probing control data
 */
struct inet_connection_sock {
  /* inet_sock has to be the first member! */
  struct inet_sock    icsk_inet;
  struct request_sock_queue icsk_accept_queue;
  struct inet_bind_bucket    *icsk_bind_hash;
  unsigned long      icsk_timeout;
   struct timer_list    icsk_retransmit_timer;
   struct timer_list    icsk_delack_timer;
  __u32        icsk_rto;
  __u32        icsk_pmtu_cookie;
  const struct tcp_congestion_ops *icsk_ca_ops;
  const struct inet_connection_sock_af_ops *icsk_af_ops;
  unsigned int      (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
  __u8        icsk_ca_state;
  __u8        icsk_retransmits;
  __u8        icsk_pending;
  __u8        icsk_backoff;
  __u8        icsk_syn_retries;
  __u8        icsk_probes_out;
  __u16        icsk_ext_hdr_len;
  struct {
    __u8      pending;   /* ACK is pending         */
    __u8      quick;   /* Scheduled number of quick acks     */
    __u8      pingpong;   /* The session is interactive       */
    __u8      blocked;   /* Delayed ACK was blocked by socket lock */
    __u32      ato;     /* Predicted tick of soft clock     */
    unsigned long    timeout;   /* Currently scheduled timeout       */
    __u32      lrcvtime;   /* timestamp of last received data packet */
    __u16      last_seg_size; /* Size of last incoming segment     */
    __u16      rcv_mss;   /* MSS used for delayed ACK decisions     */
  } icsk_ack;
  struct {
    int      enabled;
 
    /* Range of MTUs to search */
    int      search_high;
    int      search_low;
 
    /* Information on the current probe. */
    int      probe_size;
  } icsk_mtup;
  /* Scratch area of ICSK_CA_PRIV_SIZE bytes; presumably private to the congestion control module -- TODO confirm */
  u32        icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE  (16 * sizeof(u32))
};

在 inet_connection_sock結構體中第一個成員變量類型爲struct inet_sock(include/net/inet_sock.h)

C
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/** struct inet_sock - representation of INET sockets
 *
 * The embedded struct sock is the first member, so a struct sock pointer
 * obtained from the allocator can be treated as a pointer to the enclosing
 * inet_sock.
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @daddr - Foreign IPv4 addr
 * @rcv_saddr - Bound local IPv4 addr
 * @dport - Destination port
 * @num - Local port
 * @saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @sport - Source port
 * @id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 */
struct inet_sock {
  /* sk and pinet6 have to be the first two members of inet_sock */
  struct sock    sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  struct ipv6_pinfo  *pinet6;
#endif
  /* Socket demultiplex comparisons on incoming packets. */
  __be32      daddr;
  __be32      rcv_saddr;
  __be16      dport;
  __u16      num;
  __be32      saddr;
  __s16      uc_ttl;
  __u16      cmsg_flags;
  struct ip_options  *opt;
  __be16      sport;
  __u16      id;
  __u8      tos;
  __u8      mc_ttl;
  __u8      pmtudisc;
  /* one-bit flags packed into a single byte (7 of 8 bits used) */
  __u8      recverr:1,
        is_icsk:1,
        freebind:1,
        hdrincl:1,
        mc_loop:1,
        transparent:1,
        mc_all:1;
  int      mc_index;
  __be32      mc_addr;
  struct ip_mc_socklist  *mc_list;
  struct {
    unsigned int    flags;
    unsigned int    fragsize;
    struct ip_options  *opt;
    struct dst_entry  *dst;
    int      length; /* Total length of all frames */
    __be32      addr;
    struct flowi    fl;
  } cork;
};

而inet_sock的第一個成員正是struct sock類型,所以sk_prot_alloc直接返回struct sock *類型指針是沒有問題的,接下來執行inet_create中的353行用inet_sk通過sk獲得inet指針的值,inet_sk函數其實就相當於強制類型轉換,返回的就是sk的指針。
接下來程序就一路返回到__sock_create,接着再返回到sys_socket中。在sys_socket中調用了最後一個函數sock_map_fd(net/socket.c),將socket指針sock與一個新分配的文件描述符關聯起來並返回給用戶程序。

C
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/*
 *  Reserve an unused file descriptor and an empty struct file for a socket.
 *
 *  On success the descriptor (>= 0) is returned and *filep points at the
 *  new file. If no descriptor is available the negative value from
 *  get_unused_fd_flags() is returned and *filep is NULL. If the file
 *  cannot be obtained the descriptor is released again, *filep is NULL and
 *  -ENFILE is returned.
 *
 *  This is the first step of mapping a socket into the fd space of the
 *  current process. Note that once the mapping has been completed another
 *  thread may close the descriptor before the caller returns, so the
 *  returned fd MAY already be invalid. With shared fd spaces this race
 *  cannot be solved inside the kernel; the code only guarantees internal
 *  coherence and relies on the socket not being referenced after mapping.
 */

static int sock_alloc_fd(struct file **filep, int flags)
{
  struct file *filp;
  int fd = get_unused_fd_flags(flags);

  if (unlikely(fd < 0)) {
    *filep = NULL;
    return fd;
  }

  filp = get_empty_filp();
  *filep = filp;
  if (likely(filp))
    return fd;

  /* no file available: hand the reserved descriptor back */
  put_unused_fd(fd);
  return -ENFILE;
}
 
static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
{
  struct dentry *dentry;
  struct qstr name = { .name = "" };
 
  dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
  if (unlikely(!dentry))
    return -ENOMEM;
 
  dentry->d_op = &sockfs_dentry_operations;
  /*
   * We dont want to push this dentry into global dentry hash table.
   * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
   * This permits a working /proc/$pid/fd/XXX on sockets
   */
  dentry->d_flags &= ~DCACHE_UNHASHED;
  d_instantiate(dentry, SOCK_INODE(sock));
 
  sock->file = file;
  init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
      &socket_file_ops);
  SOCK_INODE(sock)->i_fop = &socket_file_ops;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);
  file->f_pos = 0;
  file->private_data = sock;
 
  return 0;
}
 
/*
 * Map @sock to a new file descriptor of the current process.
 *
 * Reserves a descriptor and a file (sock_alloc_fd), binds the file to the
 * socket (sock_attach_fd) and finally installs the file in the fd table.
 * If binding fails, both the file and the descriptor are released.
 *
 * Returns the descriptor on success or a negative error code.
 */
int sock_map_fd(struct socket *sock, int flags)
{
  struct file *filp;
  int err;
  int fd;

  fd = sock_alloc_fd(&filp, flags);
  if (unlikely(fd < 0))
    return fd;

  err = sock_attach_fd(sock, filp, flags);
  if (likely(err >= 0)) {
    fd_install(fd, filp);
    return fd;
  }

  put_filp(filp);
  put_unused_fd(fd);
  return err;
}

fs/dcache.c

C
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
/* the caller must hold dcache_lock */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
  if (inode)
    list_add(&dentry->d_alias, &inode->i_dentry);
  dentry->d_inode = inode;
  fsnotify_d_instantiate(dentry, inode);
}
 
/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * The dentry must not already be on an alias list: a non-empty
 * @entry->d_alias triggers BUG_ON().  The attachment itself is done by
 * __d_instantiate() under dcache_lock; security_d_instantiate() is
 * called afterwards, once the lock has been dropped.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
  /* The dentry must not be linked to any inode's alias list yet. */
  BUG_ON(!list_empty(&entry->d_alias));
  /* __d_instantiate() requires dcache_lock to be held by its caller. */
  spin_lock(&dcache_lock);
  __d_instantiate(entry, inode);
  spin_unlock(&dcache_lock);
  /* The security hook runs outside of dcache_lock. */
  security_d_instantiate(entry, inode);
}

/net/socket.c

C
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
 *  File operation table used for socket files.
 *
 *  Besides the generic file operations listed here, sockets have a set of
 *  'special' operations.  Those do not appear in any operation structure;
 *  they are dispatched directly via the socketcall() multiplexor.
 */

static const struct file_operations socket_file_ops = {
  .owner          = THIS_MODULE,

  /* open / release / seek */
  .open           = sock_no_open,  /* special open code to disallow open via /proc */
  .release        = sock_close,
  .llseek         = no_llseek,

  /* read / write style data transfer */
  .aio_read       = sock_aio_read,
  .aio_write      = sock_aio_write,
  .sendpage       = sock_sendpage,
  .splice_read    = sock_splice_read,
  .splice_write   = generic_splice_sendpage,

  /* readiness and asynchronous notification */
  .poll           = sock_poll,
  .fasync         = sock_fasync,

  /* control and mapping */
  .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
  .compat_ioctl   = compat_sock_ioctl,
#endif
  .mmap           = sock_mmap,
};

在sock_map_fd中,先通過402行獲得一個未使用的文件描述符以及一個空的file結構,然後通過405行調用sock_attach_fd將該file結構與sock關聯起來。在sock_attach_fd中,先通過375行從sockfs中分配一個dentry,其中sock_mnt就是在描述sockfs時提到的;d_instantiate的作用就是將dentry與socket的inode關聯起來,然後388行又將sock->file與file關聯起來。389~390行將socket文件上的操作初始化爲socket_file_ops。這樣,通過send/recv進入內核將調用inet_stream_ops中的函數,而通過read/write調用將調用socket_file_ops中的函數。然後返回至sys_socket函數中,再經過系統調用切換回用戶態,socket函數的整個調用過程完成。




轉自http://acm.hrbeu.edu.cn/~puppy/2011/02/28/linux-%E5%8D%8F%E8%AE%AE%E6%A0%88%E5%88%86%E6%9E%90-socket/


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章