在網上看到高手總結出來的,省的自己花時間再研究,放到此處以便學習。
成都的天氣好像越來越好了,前幾天還穿着外套直打哆嗦,到今天已經“撥開陰雲見太陽”,暖洋洋的,心情也暖洋洋的。暖和得正好想睡覺。打個呵欠,把網絡設備管理這部份總結一下吧。
Linux素以優秀的網絡管理能力而著稱,linux爲何具有這麼高的效率?我們從網絡設備的管理說起。
Linux爲何要對網絡設備單獨管理呢?這是因爲協議棧很多地方都會涉及到網絡設備:小至IP地址的設置,大至IP路由的更新,都離不開高效的網絡設備管理。將網絡設備單獨管理可以提高效率!
每個網絡設備,在linux中都會對應一個數據結構,net_device。 就從這個結構說起
Linux 2.6.21中,對net_device定義如下:
/*
 * struct net_device - per-interface descriptor for a network device.
 *
 * Every network device is represented by one of these.  The structure
 * carries the device identity (name, ifindex), hardware resources
 * (I/O base, IRQ, DMA channel), link-level addressing, queueing
 * discipline state, registration state, and the table of driver
 * callbacks (open/stop/hard_start_xmit/poll/...).
 *
 * The definition is terminated with ';' as C requires for a struct
 * definition; the excerpt originally omitted it.
 */
struct net_device
{
	/* Device name, e.g. the familiar "eth0". */
	char name[IFNAMSIZ];

	/* End and start addresses of the device's shared memory. */
	unsigned long mem_end; /* shared mem end */
	unsigned long mem_start; /* shared mem start */

	/* I/O base address of the device. */
	unsigned long base_addr; /* device I/O address */

	/* Interrupt number assigned to the device. */
	unsigned int irq; /* device IRQ number */

	/* Which port to use on a multi-port device. */
	unsigned char if_port; /* Selectable AUI, TP,..*/

	/* DMA channel allocated to the device. */
	unsigned char dma; /* DMA channel */

	/* Device state bits. */
	unsigned long state;

	/* Next net_device in the global device list. */
	struct net_device *next;

	/* Initialisation hook; register_netdevice() calls it when set. */
	int (*init)(struct net_device *dev);

	struct net_device *next_sched;

	/* Interface index. Unique device identifier */
	/* Sequence number of the device inside the kernel. */
	int ifindex;
	int iflink;

	/* Callback returning the interface statistics. */
	struct net_device_stats* (*get_stats)(struct net_device *dev);
	struct iw_statistics* (*get_wireless_stats)(struct net_device *dev);
	struct iw_handler_def * wireless_handlers;
	struct ethtool_ops *ethtool_ops;

	/*
	 * Transmit timestamp.  The original note says it is used to check
	 * whether transmission has stalled.
	 */
	unsigned long trans_start; /* Time (in jiffies) of last Tx */

	/* Time the device was last used for receive. */
	unsigned long last_rx; /* Time of last Rx */

	/* Interface flags. */
	unsigned short flags; /* interface flags (a la BSD) */
	unsigned short gflags;
	unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
	unsigned short unused_alignment_fixer; /* Because we need priv_flags,
						* and we want to be 32-bit aligned.
						*/
	unsigned mtu; /* interface MTU value */
	unsigned short type; /* interface hardware type */
	unsigned short hard_header_len; /* hardware hdr length */
	void *priv; /* pointer to private data */
	struct net_device *master; /* Pointer to master device of a group,
				    * which this device is member of.
				    */

	/* Interface address info. */
	unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
	unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
	unsigned char addr_len; /* hardware address length */
	struct dev_mc_list *mc_list; /* Multicast mac addresses */
	int mc_count; /* Number of installed mcasts */
	int promiscuity;
	int allmulti;
	int watchdog_timeo;
	struct timer_list watchdog_timer;

	/* Protocol specific pointers */
	void *atalk_ptr; /* AppleTalk link */
	void *ip_ptr; /* IPv4 specific data */
	void *dn_ptr; /* DECnet specific data */
	void *ip6_ptr; /* IPv6 specific data */
	void *ec_ptr; /* Econet specific data */
	void *ax25_ptr; /* AX.25 specific data */

	struct list_head poll_list; /* Link to poll list */
	int quota;
	int weight;
	struct Qdisc *qdisc;
	struct Qdisc *qdisc_sleeping;
	struct Qdisc *qdisc_ingress;
	struct list_head qdisc_list;
	unsigned long tx_queue_len; /* Max frames per queue allowed */

	/* ingress path synchronizer */
	spinlock_t ingress_lock;
	/* hard_start_xmit synchronizer */
	spinlock_t xmit_lock;
	/* cpu id of processor entered to hard_start_xmit or -1,
	   if nobody entered there.
	 */
	int xmit_lock_owner;
	/* device queue lock */
	spinlock_t queue_lock;
	/* Number of references to this device */
	atomic_t refcnt;
	/* delayed register/unregister */
	struct list_head todo_list;
	/* device name hash chain */
	struct hlist_node name_hlist;
	/* device index hash chain */
	struct hlist_node index_hlist;

	/* register/unregister state machine */
	enum { NETREG_UNINITIALIZED=0,
	       NETREG_REGISTERING, /* called register_netdevice */
	       NETREG_REGISTERED, /* completed register todo */
	       NETREG_UNREGISTERING, /* called unregister_netdevice */
	       NETREG_UNREGISTERED, /* completed unregister todo */
	       NETREG_RELEASED, /* called free_netdev */
	} reg_state;

	/* Net device features */
	int features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum only TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
#define NETIF_F_LLTX 4096 /* LockLess TX */

	/* Called after device is detached from network. */
	void (*uninit)(struct net_device *dev);
	/* Called after last user reference disappears. */
	void (*destructor)(struct net_device *dev);

	/* Pointers to interface service routines. */
	/* Open callback. */
	int (*open)(struct net_device *dev);
	/* Called when the device is being stopped. */
	int (*stop)(struct net_device *dev);
	/* Starts transmission of a packet. */
	int (*hard_start_xmit) (struct sk_buff *skb,
				struct net_device *dev);
#define HAVE_NETDEV_POLL
	/* Polling callback. */
	int (*poll) (struct net_device *dev, int *quota);
	/* Builds the hardware (link-layer) header. */
	int (*hard_header) (struct sk_buff *skb,
			    struct net_device *dev,
			    unsigned short type,
			    void *daddr,
			    void *saddr,
			    unsigned len);
	/* Rebuilds the header after ARP resolution. */
	int (*rebuild_header)(struct sk_buff *skb);
#define HAVE_MULTICAST
	/* Multicast support callback. */
	void (*set_multicast_list)(struct net_device *dev);
#define HAVE_SET_MAC_ADDR
	int (*set_mac_address)(struct net_device *dev,
			       void *addr);
#define HAVE_PRIVATE_IOCTL
	int (*do_ioctl)(struct net_device *dev,
			struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
	int (*set_config)(struct net_device *dev,
			  struct ifmap *map);
#define HAVE_HEADER_CACHE
	int (*hard_header_cache)(struct neighbour *neigh,
				 struct hh_cache *hh);
	void (*header_cache_update)(struct hh_cache *hh,
				    struct net_device *dev,
				    unsigned char * haddr);
#define HAVE_CHANGE_MTU
	int (*change_mtu)(struct net_device *dev, int new_mtu);
#define HAVE_TX_TIMEOUT
	void (*tx_timeout) (struct net_device *dev);
	void (*vlan_rx_register)(struct net_device *dev,
				 struct vlan_group *grp);
	void (*vlan_rx_add_vid)(struct net_device *dev,
				unsigned short vid);
	void (*vlan_rx_kill_vid)(struct net_device *dev,
				 unsigned short vid);
	int (*hard_header_parse)(struct sk_buff *skb,
				 unsigned char *haddr);
	int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
	int (*accept_fastpath)(struct net_device *, struct dst_entry*);
#ifdef CONFIG_NETPOLL
	int netpoll_rx;
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
	void (*poll_controller)(struct net_device *dev);
#endif

	/* bridge stuff */
	/* Bridge port this device is attached to (analysed later in the article). */
	struct net_bridge_port *br_port;
#ifdef CONFIG_NET_DIVERT
	/* this will get initialized at each interface type init routine */
	struct divert_blk *divert;
#endif /* CONFIG_NET_DIVERT */

	/* class/net/name entry */
	struct class_device class_dev;
	/* how much padding had been added by alloc_netdev() */
	int padded;
};
暈,太多的成員。太龐大了。不要緊,等到要使用到相應成員的時候再來解釋好了。
注意到這麼龐大的結構中,有個成員叫: struct net_device *next,呵呵,很熟悉吧,就是用它來建立網絡設備的鏈表。
每一個網絡設備啓動的時候,都會調用register_netdev() (drivers/net/net_init.c)
跟蹤這個函數:
/*
 * register_netdev - register a network device while holding the RTNL lock.
 * @dev: device to register
 *
 * If the device name contains a '%' it is treated as a format string and
 * a concrete name is allocated from it.  If the name is empty or starts
 * with a space, a name is allocated from "eth%d".  The device is then
 * handed to register_netdevice().
 *
 * Returns the negative value from dev_alloc_name() if name allocation
 * fails, otherwise the result of register_netdevice().
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();

	/* A '%' in the name means the caller wants a name allocated. */
	if (strchr(dev->name, '%')) {
		rc = dev_alloc_name(dev, dev->name);
		if (rc < 0)
			goto unlock;
	}

	/* Back compatibility hook: unnamed devices default to ethN. */
	if (dev->name[0] == 0 || dev->name[0] == ' ') {
		rc = dev_alloc_name(dev, "eth%d");
		if (rc < 0)
			goto unlock;
	}

	rc = register_netdevice(dev);

unlock:
	rtnl_unlock();
	return rc;
}
跟蹤至: register_netdevice(struct net_device *dev) (net/core/dev.c)
/*
 * register_netdevice - register a network device (caller holds RTNL).
 * @dev: device to register
 *
 * Initialises the device locks, runs the optional dev->init hook,
 * validates the name, assigns an interface index, rejects duplicate
 * names, and then links the device into three places: the dev_base
 * list, the name hash and the index hash.  Finally it raises
 * NETDEV_REGISTER on netdev_chain.
 *
 * Returns 0 on success or a negative errno (-EIO, -EINVAL, -EEXIST, or
 * whatever alloc_divert_blk()/dev->init returned).
 *
 * Fix: the warning message is now terminated with a real newline escape
 * ("\n"); the excerpt had the two literal characters "/n".
 */
int register_netdevice(struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *p;
	int ret;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);

	spin_lock_init(&dev->queue_lock);
	spin_lock_init(&dev->xmit_lock);
	dev->xmit_lock_owner = -1;
#ifdef CONFIG_NET_CLS_ACT
	spin_lock_init(&dev->ingress_lock);
#endif

	ret = alloc_divert_blk(dev);
	if (ret)
		goto out;

	dev->iflink = -1;

	/* Init, if this function is available */
	if (dev->init) {
		ret = dev->init(dev);
		if (ret) {
			/* Normalise positive failure codes to -EIO. */
			if (ret > 0)
				ret = -EIO;
			goto out_err;
		}
	}

	/* Reject names that are not valid. */
	if (!dev_valid_name(dev->name)) {
		ret = -EINVAL;
		goto out_err;
	}

	/* Assign an interface index to this device. */
	dev->ifindex = dev_new_index();
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/*
	 * Check for existence of name: look up the hash chain for this
	 * name and fail if a device with the same name is already there.
	 */
	head = dev_name_hash(dev->name);
	hlist_for_each(p, head) {
		struct net_device *d
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
			ret = -EEXIST;
			goto out_err;
		}
	}

	/* Fix illegal SG+CSUM combinations. */
	if ((dev->features & NETIF_F_SG) &&
	    !(dev->features & (NETIF_F_IP_CSUM |
			       NETIF_F_NO_CSUM |
			       NETIF_F_HW_CSUM))) {
		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
		       dev->name);
		dev->features &= ~NETIF_F_SG;
	}

	/*
	 * nil rebuild_header routine,
	 * that should be never called and used as just bug trap.
	 */
	if (!dev->rebuild_header)
		dev->rebuild_header = default_rebuild_header;

	/*
	 * Default initial state at registry is that the
	 * device is present.
	 */
	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev->next = NULL;
	dev_init_scheduler(dev);

	write_lock_bh(&dev_base_lock);
	/*
	 * Append dev to the device list.  Per the original note, dev_tail
	 * is initialised elsewhere as &dev_base, so these two statements
	 * add dev at the tail of the list headed by dev_base.
	 */
	*dev_tail = dev;
	dev_tail = &dev->next;
	/* Insert into the hash keyed by device name. */
	hlist_add_head(&dev->name_hlist, head);
	/* Insert into the hash keyed by interface index. */
	hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
	dev_hold(dev);
	dev->reg_state = NETREG_REGISTERING;
	write_unlock_bh(&dev_base_lock);

	/* Notify protocols, that a new device appeared. */
	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

	/* Finish registration after unlock */
	net_set_todo(dev);
	ret = 0;

out:
	return ret;
out_err:
	free_divert_blk(dev);
	goto out;
}
從此可以看出,新加入一個設備時,會插入三個位置:以名字爲哈希主鍵組織的dev_name_head,以序號爲哈希主鍵的哈希數組dev_index_head,還有dev_base鏈表。它們爲快速查找網絡設備提供了基礎。事實上,在內核中經常要根據index找到dev,或者根據name找到dev,我們遇到的時候再分析。
到現在,我們可以在內核中順藤摸瓜的找到每一個網絡設備了。
還有很重要的。設備更改了配置,要怎麼通知跟他相關的子系統呢?例如,網卡更新了IP,如何使路由得到更新?
接着往下看:
注意到上面註冊代碼中所調用的一個函數notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev).
該函數的作用是,在通知鏈表上netdev_chain上發送NETDEV_REGISTER消息,所有在與該通知鏈表關聯的子系統都可以收到此消息。以此,可以快速的更新整個系統的配置消息。
以路由子系統爲例,來講述該過程:
在IPV4子系統加載的時候,會調用ip_init(),接着調用fib_init(),然後再調用ip_fib_init()
跟蹤一下此函數:
/*
 * ip_fib_init - boot-time setup of the IPv4 FIB.
 *
 * With CONFIG_IP_MULTIPLE_TABLES the rule engine is initialised;
 * otherwise the local and main tables are created with fib_hash_init().
 * In both configurations the FIB then subscribes to network-device and
 * IPv4-address notifications.
 */
void __init ip_fib_init(void)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
	fib_rules_init();
#else
	ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
	ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
#endif

	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);
}
register_netdevice_notifier是做什麼的呢?往下跟蹤:
/*
 * register_netdevice_notifier - subscribe a notifier to netdev_chain.
 * @nb: notifier block to add
 *
 * After a successful registration the callback is invoked once per
 * device already on the dev_base list: always with NETDEV_REGISTER, and
 * additionally with NETDEV_UP when the device has IFF_UP set.  The whole
 * operation runs under the RTNL lock.
 *
 * Returns the result of notifier_chain_register().
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *cur;
	int rc;

	rtnl_lock();

	/* Hook the block onto the notification chain. */
	rc = notifier_chain_register(&netdev_chain, nb);
	if (rc)
		goto unlock;

	/* Replay registration/up events for devices that already exist. */
	cur = dev_base;
	while (cur) {
		nb->notifier_call(nb, NETDEV_REGISTER, cur);
		if (cur->flags & IFF_UP)
			nb->notifier_call(nb, NETDEV_UP, cur);
		cur = cur->next;
	}

unlock:
	rtnl_unlock();
	return rc;
}
呵呵,它在netdev_chain上註冊了通知鏈,當此鏈上有事件發生時,會調用fib_netdev_notifier中註冊的回調函數進行處理,看一下fib_netdev_notifier的信息:
/*
 * Notifier block for the FIB: its callback is fib_netdev_event().
 * ip_fib_init() passes it to register_netdevice_notifier().
 */
struct notifier_block fib_netdev_notifier = {
.notifier_call =fib_netdev_event,
};
OK,現在越來越具體了,如果netdev_chain有事件,會調用fib_netdev_event處理。繼續跟蹤:
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
struct in_device *in_dev = __in_dev_get(dev);
//設備註銷
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
return NOTIFY_DONE;
}
if (!in_dev)
return NOTIFY_DONE;
switch (event) {
//設備UP
case NETDEV_UP:
for_ifa(in_dev) {
fib_add_ifaddr(ifa);
} endfor_ifa(in_dev);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
rt_cache_flush(-1);
break;
//設備DOWN
case NETDEV_DOWN:
fib_disable_ip(dev, 0);
break;
//設備參數改變
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
rt_cache_flush(0);
break;
}
return NOTIFY_DONE;
}
路由部份的代碼將在後續的筆記中給出。至此,整個網絡設備的架構非常的清晰了!
它主要完成:對網卡對應的net_device賦初值,並調用register_netdev向內核完成網絡設備的註冊。網絡設備註冊我們在上一節中已經說過,這裏不再贅述。
看一下net_device中幾個關鍵的函數:
// Called when the device is about to be opened.
netdev->open = e100_open;
// Called when the device is being stopped.
netdev->stop = e100_close;
// Called when the device has data to transmit.
netdev->hard_start_xmit = e100_xmit_frame;
到此時,網卡的初始化工作已經完成了。之後就可以操作網卡了。
那網卡應該怎麼使用呢?必須首先喚起網卡,即使之UP,例如 ifconfig eth0 up
此時,內核會根據接口名字“eth0”找到對應的net_device.然後調用 net_device-> open.即:e100_open。
分析如下:
/*
 * e100_open - net_device open callback for the e100 driver.
 * @netdev: device being brought up
 *
 * Marks the carrier as off while the interface is coming up, then calls
 * e100_up().  Returns 0 on success or the error from e100_up().
 *
 * Fix: the log message now ends with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static int e100_open(struct net_device *netdev)
{
	struct nic *nic = netdev_priv(netdev);
	int err = 0;

	/* The interface is still coming up: report carrier off for now. */
	netif_carrier_off(netdev);
	if((err = e100_up(nic)))
		DPRINTK(IFUP, ERR, "Cannot open interface, aborting.\n");
	return err;
}
我們關心的是e100_up。跟蹤如下:
/*
 * e100_up - bring the NIC into operation.
 * @nic: driver private state
 *
 * Allocates the receive list and the command-block ring, initialises the
 * hardware, programs multicast, starts the receiver and the watchdog
 * timer, installs the interrupt handler, then enables interrupts and
 * wakes the transmit queue.
 *
 * On failure everything acquired so far is released in reverse order and
 * the error code of the failing step is returned; 0 on success.
 */
static int e100_up(struct nic *nic)
{
	int rc;

	/* Receive buffer list. */
	rc = e100_rx_alloc_list(nic);
	if (rc)
		return rc;

	/* Command (control) block ring. */
	rc = e100_alloc_cbs(nic);
	if (rc)
		goto undo_rx_list;

	/* Hardware initialisation. */
	rc = e100_hw_init(nic);
	if (rc)
		goto undo_cbs;

	/* Multicast setup, then start receiving. */
	e100_set_multicast_list(nic->netdev);
	e100_start_receiver(nic);
	mod_timer(&nic->watchdog, jiffies);

	/* Install the interrupt handler. */
	rc = request_irq(nic->pdev->irq, e100_intr, SA_SHIRQ,
			 nic->netdev->name, nic->netdev);
	if (rc)
		goto undo_timer;

	/* Enable interrupts and let the stack start transmitting. */
	e100_enable_irq(nic);
	netif_wake_queue(nic->netdev);
	return 0;

undo_timer:
	del_timer_sync(&nic->watchdog);
undo_cbs:
	e100_clean_cbs(nic);
undo_rx_list:
	e100_rx_clean_list(nic);
	return rc;
}
在此函數中,我們可以看到,它主要完成了:建立接收環形DMA緩衝區,並註冊了中斷處理函數。
關於環形DMA緩衝區的建立,是由e100_rx_alloc_list(nic)完成的
/*
 * e100_rx_alloc_list - allocate and link the circular receive list.
 * @nic: driver private state
 *
 * Allocates nic->params.rfds.count zeroed struct rx entries, links them
 * into a circular doubly linked list, and attaches a receive skb to each
 * via e100_rx_alloc_skb().  On success both rx_to_use and rx_to_clean
 * point at the first entry.
 *
 * Returns 0 on success, -ENOMEM if the array or any skb cannot be
 * allocated (the partially built list is released first).
 *
 * Change: the array is obtained with kcalloc() instead of an open-coded
 * kmalloc(sizeof * count) followed by memset(); kcalloc() zeroes the
 * memory and rejects a count whose byte size would overflow.
 */
static int e100_rx_alloc_list(struct nic *nic)
{
	struct rx *rx;
	/* Total number of receive buffers. */
	unsigned int i, count = nic->params.rfds.count;

	/*
	 * rx_to_clean is where e100_rx_clean() starts processing received
	 * buffers; rx_to_use is where it starts refilling empty slots.
	 * Both stay NULL until the list is fully built.
	 */
	nic->rx_to_use = nic->rx_to_clean = NULL;

	if(!(nic->rxs = kcalloc(count, sizeof(struct rx), GFP_ATOMIC)))
		return -ENOMEM;

	/* Walk the array and link it into a ring. */
	for(rx = nic->rxs, i = 0; i < count; rx++, i++) {
		rx->next = (i + 1 < count) ? rx + 1 : nic->rxs;
		rx->prev = (i == 0) ? nic->rxs + count - 1 : rx - 1;
		if(e100_rx_alloc_skb(nic, rx)) {
			e100_rx_clean_list(nic);
			return -ENOMEM;
		}
	}

	/* Start position is the first entry. */
	nic->rx_to_use = nic->rx_to_clean = nic->rxs;
	return 0;
}
爲設備建立DMA映射的主函數爲e100_rx_alloc_skb().分析如下:
/*
 * e100_rx_alloc_skb - attach a freshly mapped receive skb to one rx slot.
 * @nic: driver private state
 * @rx:  ring entry to fill
 *
 * Allocates an skb of RFD_BUF_LEN plus a 2-byte alignment offset, copies
 * the blank RFD template to the start of its data area, maps the buffer
 * for DMA, and, if the previous ring entry already has an skb, links that
 * entry's RFD to this one and clears its EL bit.
 *
 * Returns 0 on success or -ENOMEM if the skb cannot be allocated.
 */
static inline int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)
{
unsigned int rx_offset = 2; /* u32 align protocol headers */
if(!(rx->skb = dev_alloc_skb(RFD_BUF_LEN + rx_offset)))
return -ENOMEM;
/* Align, init, and map the RFD. */
rx->skb->dev = nic->netdev;
// Leave rx_offset bytes of headroom in front of the data area.
skb_reserve(rx->skb, rx_offset);
// Place the blank RFD at the front of skb->data.
memcpy(rx->skb->data, &nic->blank_rfd, sizeof(struct rfd));
// Map the buffer starting at skb->data for DMA.
rx->dma_addr = pci_map_single(nic->pdev, rx->skb->data,
RFD_BUF_LEN, PCI_DMA_BIDIRECTIONAL);
/* Link the RFD to end of RFA by linking previous RFD to
 * this one, and clearing EL bit of previous. */
// Update the control information held in the previous entry's RFD.
if(rx->prev->skb) {
struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;
put_unaligned(cpu_to_le32(rx->dma_addr),
(u32 *)&prev_rfd->link);
// The link must be written before the EL bit is cleared below.
wmb();
prev_rfd->command &= ~cpu_to_le16(cb_el);
pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,
sizeof(struct rfd), PCI_DMA_TODEVICE);
}
return 0;
}
在這個函數裏,主要完成了:DMA環形鏈表的建立。在這裏涉及到了一個重要的數據結構sk_buff.稍後再給出它的結構分析。在這裏我們只要知道在skb->data裏儲存的是接收數據就OK了。值得一提的是,Intel 100M 網卡對接收數據的處理,跟平時遇到的網卡不一樣,接收數據時會由接收控制RU寫入接收信息,由此判斷接收是否完全等信息。也就是我們在代碼裏面看到的rfd.所以,在skb->data對應的就是rfd+網絡傳過來的數據.
到這裏,接收準備工作已經完成了。
四:數據接收
爲了瞭解網卡數據接收的過程。有必要先討論DMA的具體過程。
DMA傳輸數據可以分爲以下幾個步驟:
首先:CPU向DMA送命令,如DMA方式,主存地址,傳送的字數等,之後CPU執行原來的程序.
然後DMA控制器在I/O設備與主存間交換數據。接收數據完後,向CPU發DMA請求,取得總線控制權,進行數據傳送,修改卡上主存地址,修改字數計數器並檢查其值是否爲零,不爲零則繼續傳送,若已爲零,則向CPU發中斷請求。
也就是說,網卡收到包時,將它放入當前skb->data中。再來一個包時。DMA會修改卡上主存地址,轉到skb->next,將數據放入其中。這也就是,一個skb->data存儲一個數據包的原因。
好了,現在就可以來看具體的代碼實現了。
當網絡數據到達時,網卡將其放到DMA內存,然後DMA向CPU報告中斷,CPU根據中斷向量,找到中斷處理例程,也就是我們前面註冊的e100_intr()進行處理。
/*
 * e100_intr - interrupt handler for the e100 NIC.
 * @irq:    interrupt number (unused)
 * @dev_id: the struct net_device passed to request_irq()
 * @regs:   saved registers (unused)
 *
 * Reads the status/ack byte; if it shows the interrupt is not ours or
 * the hardware is gone, returns IRQ_NONE.  Otherwise it acknowledges the
 * interrupt, notes a Receive-No-Resource condition, disables further
 * device interrupts and schedules the device for polling.
 *
 * Fix: the debug message now ends with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static irqreturn_t e100_intr(int irq, void *dev_id, struct pt_regs *regs)
{
	struct net_device *netdev = dev_id;
	struct nic *nic = netdev_priv(netdev);
	u8 stat_ack = readb(&nic->csr->scb.stat_ack);

	DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X\n", stat_ack);

	if(stat_ack == stat_ack_not_ours || /* Not our interrupt */
	   stat_ack == stat_ack_not_present) /* Hardware is ejected */
		return IRQ_NONE;

	/* Ack interrupt(s) */
	/* Writing the bits back tells the device they have been seen. */
	writeb(stat_ack, &nic->csr->scb.stat_ack);

	/* We hit Receive No Resource (RNR); restart RU after cleaning */
	if(stat_ack & stat_ack_rnr)
		nic->ru_running = 0;

	/* Mask device interrupts while the poll routine does the work. */
	e100_disable_irq(nic);
	/* Schedule the device; netdev->poll will be run for it. */
	netif_rx_schedule(netdev);

	return IRQ_HANDLED;
}
netif_rx_schedule(netdev)後,cpu開始調度此設備,輪詢設備是否有數據要處理。隨後調用netdev->poll函數,即:e100_poll()
/*
 * e100_poll - polling callback for the e100 NIC.
 * @netdev: device being polled
 * @budget: remaining global budget, decremented by the work done
 *
 * Processes up to min(netdev->quota, *budget) received packets and
 * cleans completed transmits.  If neither direction did any work, or
 * the interface is no longer running, polling is completed, device
 * interrupts are re-enabled and 0 is returned.  Otherwise the budget and
 * quota are charged and 1 is returned.
 */
static int e100_poll(struct net_device *netdev, int *budget)
{
	struct nic *nic = netdev_priv(netdev);
	unsigned int limit = min(netdev->quota, *budget);
	unsigned int rx_done = 0;
	int tx_done;

	/* Process the received data sitting in the DMA ring. */
	e100_rx_clean(nic, &rx_done, limit);
	tx_done = e100_tx_clean(nic);

	/* Work was done and the interface is still up: stay in poll mode. */
	if ((tx_done || rx_done != 0) && netif_running(netdev)) {
		*budget -= rx_done;
		netdev->quota -= rx_done;
		return 1;
	}

	/* If no Rx and Tx cleanup work was done, exit polling mode. */
	netif_rx_complete(netdev);
	e100_enable_irq(nic);
	return 0;
}
跟蹤進e100_rx_clean():
/*
 * e100_rx_clean - hand received packets up and refill the receive ring.
 * @nic:        driver private state
 * @work_done:  running count of packets delivered (passed through)
 * @work_to_do: limit on packets to deliver (passed through)
 *
 * Starting at nic->rx_to_clean, passes each entry that has an skb to
 * e100_rx_indicate() until it reports there is nothing more to do.  Then,
 * starting at nic->rx_to_use, re-attaches skbs to empty entries until an
 * allocation fails or a filled entry is reached.  Finally restarts the
 * receiver.
 */
static inline void e100_rx_clean(struct nic *nic, unsigned int *work_done,
	unsigned int work_to_do)
{
	struct rx *cur;

	/* Indicate newly arrived packets */
	cur = nic->rx_to_clean;
	while (cur->skb) {
		if (e100_rx_indicate(nic, cur, work_done, work_to_do))
			break; /* No more to clean */
		nic->rx_to_clean = cur->next;
		cur = nic->rx_to_clean;
	}

	/* Alloc new skbs to refill list */
	cur = nic->rx_to_use;
	while (!cur->skb) {
		if (unlikely(e100_rx_alloc_skb(nic, cur)))
			break; /* Better luck next time (see watchdog) */
		nic->rx_to_use = cur->next;
		cur = nic->rx_to_use;
	}

	e100_start_receiver(nic);
}
在這裏,它會遍歷環形DMA中的數據,即從nic->rx_to_clean開始的數據,直至數據全部處理完
進入處理函數:e100_rx_indicate()
/*
 * e100_rx_indicate - process one receive ring entry.
 * @nic:        driver private state
 * @rx:         ring entry to examine
 * @work_done:  optional counter of delivered packets; incremented on
 *              successful delivery
 * @work_to_do: limit checked against *work_done
 *
 * Returns -EAGAIN when the work limit has been reached or the RFD is not
 * yet marked complete.  Otherwise the buffer is unmapped, the RFD header
 * is skipped, and the frame is either dropped (hardware error or
 * oversized) or passed to netif_receive_skb(); rx->skb is cleared and 0
 * is returned.
 *
 * Fix: the debug message now ends with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static inline int e100_rx_indicate(struct nic *nic, struct rx *rx,
	unsigned int *work_done, unsigned int work_to_do)
{
	struct sk_buff *skb = rx->skb;
	/*
	 * The RFD sits at the start of the buffer; it holds receive status,
	 * not payload from the wire.
	 */
	struct rfd *rfd = (struct rfd *)skb->data;
	u16 rfd_status, actual_size;

	if(unlikely(work_done && *work_done >= work_to_do))
		return -EAGAIN;

	/* Sync the RFD so the CPU sees what the device wrote. */
	pci_dma_sync_single_for_cpu(nic->pdev, rx->dma_addr,
		sizeof(struct rfd), PCI_DMA_FROMDEVICE);

	/* Fetch the receive status. */
	rfd_status = le16_to_cpu(rfd->status);

	DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);

	/* If data isn't ready, nothing to indicate */
	if(unlikely(!(rfd_status & cb_complete)))
		return -EAGAIN;

	/* Get actual data size, clamped to what the buffer can hold. */
	actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;
	if(unlikely(actual_size > RFD_BUF_LEN - sizeof(struct rfd)))
		actual_size = RFD_BUF_LEN - sizeof(struct rfd);

	/* Tear down the DMA mapping for this buffer. */
	pci_unmap_single(nic->pdev, rx->dma_addr,
		RFD_BUF_LEN, PCI_DMA_FROMDEVICE);

	/* Skip over the RFD: it is not part of the received frame. */
	skb_reserve(skb, sizeof(struct rfd));
	/* Extend tail/len to cover the received bytes. */
	skb_put(skb, actual_size);
	/* Determine the link-layer protocol. */
	skb->protocol = eth_type_trans(skb, nic->netdev);

	if(unlikely(!(rfd_status & cb_ok))) {
		/* Don't indicate if hardware indicates errors */
		nic->net_stats.rx_dropped++;
		dev_kfree_skb_any(skb);
	} else if(actual_size > nic->netdev->mtu + VLAN_ETH_HLEN) {
		/* Don't indicate oversized frames */
		nic->rx_over_length_errors++;
		nic->net_stats.rx_dropped++;
		dev_kfree_skb_any(skb);
	} else {
		/* Successful receive: update statistics. */
		nic->net_stats.rx_packets++;
		nic->net_stats.rx_bytes += actual_size;
		nic->netdev->last_rx = jiffies;
		/* Hand the frame to the upper protocol layers. */
		netif_receive_skb(skb);
		if(work_done)
			(*work_done)++;
	}

	rx->skb = NULL;

	return 0;
}
上面代碼中要去判斷接收是否完全,爲什麼要去判斷呢?根據DMA機制,是網卡把數據放入DMA之後。DMA再向CPU發中斷的嘛?呵呵。在這裏進行接收完全判斷是因爲:
1:由其它原因造成的中斷
2:在處理中斷時候。數據又到達了。網卡依然會把它放至下一個skb。而在代碼處理中是遍歷處理的,也就是說處理下一個skb的時候,可能網卡正在傳數據。
好了,運行到netif_receive_skb()之後,數據包被送到上層。關於後續的處理流程,以後會有專題討論
五:數據的發送
在進入到發送函數之前,我們先來看e100_up()->e100_alloc_cbs函數:
/*
 * e100_alloc_cbs - allocate and link the circular command-block ring.
 * @nic: driver private state
 *
 * Allocates nic->params.cbs.count command blocks in one DMA-consistent
 * region, links them into a ring both through CPU pointers (next/prev)
 * and through the little-endian bus address stored in each block's link
 * field, and resets the use/send/clean cursors to the first block.
 *
 * Returns 0 on success or -ENOMEM if the region cannot be allocated.
 */
static int e100_alloc_cbs(struct nic *nic)
{
	unsigned int total = nic->params.cbs.count;
	unsigned int idx;
	struct cb *cur;

	nic->cuc_cmd = cuc_start;
	nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = NULL;
	nic->cbs_avail = 0;

	/*
	 * Consistent DMA allocation: the return value is the CPU-side
	 * address, the bus address is stored in nic->cbs_dma_addr.
	 */
	nic->cbs = pci_alloc_consistent(nic->pdev,
					sizeof(struct cb) * total,
					&nic->cbs_dma_addr);
	if (!nic->cbs)
		return -ENOMEM;

	/* Build the circular transmit/command ring. */
	for (idx = 0, cur = nic->cbs; idx < total; idx++, cur++) {
		if (idx + 1 < total)
			cur->next = cur + 1;
		else
			cur->next = nic->cbs;

		if (idx == 0)
			cur->prev = nic->cbs + total - 1;
		else
			cur->prev = cur - 1;

		cur->dma_addr = nic->cbs_dma_addr + idx * sizeof(struct cb);
		cur->link = cpu_to_le32(nic->cbs_dma_addr +
			((idx+1) % total) * sizeof(struct cb));
		cur->skb = NULL;
	}

	/* All cursors start at the beginning of the ring. */
	nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = nic->cbs;
	nic->cbs_avail = total;

	return 0;
}
在這一段代碼裏,完成了發送的準備工作,建立了發送環形緩存。在發送數據時,只要把數據送入緩存即可
數據最終會調用dev-> hard_start_xmit函數。在e100代碼裏,也就是e100_xmit_frame(). 進入裏面看下:
/*
 * e100_xmit_frame - hard_start_xmit callback for the e100 driver.
 * @skb:    packet to transmit
 * @netdev: device to transmit on
 *
 * Queues the skb on the command-block ring via e100_exec_cb().  When the
 * ring becomes full (-ENOSPC) the packet has still been queued and the
 * transmit queue is stopped.  When no block was available (-ENOMEM) the
 * queue is stopped and 1 is returned so the caller keeps the skb.
 * Otherwise trans_start is updated and 0 is returned.
 *
 * Fix: the log message now ends with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static int e100_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
{
	struct nic *nic = netdev_priv(netdev);
	int err;

	if(nic->flags & ich_10h_workaround) {
		e100_exec_cmd(nic, cuc_nop, 0);
		udelay(1);
	}

	err = e100_exec_cb(nic, skb, e100_xmit_prepare);
	switch(err) {
	case -ENOSPC:
		/* We queued the skb, but now we're out of space. */
		netif_stop_queue(netdev);
		break;
	case -ENOMEM:
		/* This is a hard error - log it. */
		DPRINTK(TX_ERR, DEBUG, "Out of Tx resources, returning skb\n");
		netif_stop_queue(netdev);
		return 1;
	}

	netdev->trans_start = jiffies;
	return 0;
}
繼續跟蹤進 e100_exec_cb(nic, skb, e100_xmit_prepare);
/*
 * e100_exec_cb - queue one command block and kick the hardware.
 * @nic:        driver private state
 * @skb:        packet (or NULL-able payload) associated with the command
 * @cb_prepare: callback that fills in the command block
 *
 * Runs under nic->cb_lock with interrupts saved.  Takes the block at
 * cb_to_use, lets cb_prepare() fill it in, sets its S-bit and clears the
 * S-bit of the previous block, then issues every block between
 * cb_to_send and cb_to_use to the device via e100_exec_cmd().
 *
 * Returns 0 on success, -ENOMEM if no block was available (nothing is
 * queued), or -ENOSPC if this command took the last free block (the
 * command IS queued).
 */
static inline int e100_exec_cb(struct nic *nic, struct sk_buff *skb,
void (*cb_prepare)(struct nic *, struct cb *, struct sk_buff *))
{
struct cb *cb;
unsigned long flags;
int err = 0;
spin_lock_irqsave(&nic->cb_lock, flags);
if(unlikely(!nic->cbs_avail)) {
err = -ENOMEM;
goto err_unlock;
}
// Push the skb onto the circular transmit ring.
// cb_to_use: the ring position that will be used next.
cb = nic->cb_to_use;
nic->cb_to_use = cb->next;
nic->cbs_avail--;
cb->skb = skb;
// That was the last free block: report it, but still queue the command.
if(unlikely(!nic->cbs_avail))
err = -ENOSPC;
cb_prepare(nic, cb, skb);
/* Order is important otherwise we'll be in a race with h/w:
* set S-bit in current first, then clear S-bit in previous. */
cb->command |= cpu_to_le16(cb_s);
wmb();
cb->prev->command &= cpu_to_le16(~cb_s);
// While queued blocks remain that have not been issued, issue them.
while(nic->cb_to_send != nic->cb_to_use) {
if(unlikely(e100_exec_cmd(nic, nic->cuc_cmd,
nic->cb_to_send->dma_addr))) {
/* Ok, here's where things get sticky. It's
* possible that we can't schedule the command
* because the controller is too busy, so
* let's just queue the command and try again
* when another command is scheduled. */
break;
} else {
nic->cuc_cmd = cuc_resume;
nic->cb_to_send = nic->cb_to_send->next;
}
}
err_unlock:
spin_unlock_irqrestore(&nic->cb_lock, flags);
return err;
}
在這裏我們看到,發送數據過程主要由e100_exec_cmd完成。跟蹤進去
/*
 * e100_exec_cmd - write one command to the device's SCB.
 * @nic:      driver private state
 * @cmd:      command byte to issue
 * @dma_addr: bus address written to the general pointer register for
 *            every command except cuc_resume
 *
 * Under nic->cmd_lock, waits (up to E100_WAIT_SCB_TIMEOUT iterations,
 * adding a 5us delay per iteration in the second half) for the command
 * byte to read back as zero, then writes the pointer and the command.
 *
 * Returns 0 on success or -EAGAIN if the previous command was never
 * accepted within the timeout.
 */
static inline int e100_exec_cmd(struct nic *nic, u8 cmd, dma_addr_t dma_addr)
{
	unsigned long irq_flags;
	unsigned int tries;
	int rc = 0;

	spin_lock_irqsave(&nic->cmd_lock, irq_flags);

	/* Previous command is accepted when SCB clears */
	for (tries = 0; tries < E100_WAIT_SCB_TIMEOUT; tries++) {
		if (likely(!readb(&nic->csr->scb.cmd_lo)))
			break;
		cpu_relax();
		if (unlikely(tries > (E100_WAIT_SCB_TIMEOUT >> 1)))
			udelay(5);
	}

	if (unlikely(tries == E100_WAIT_SCB_TIMEOUT)) {
		rc = -EAGAIN;
	} else {
		/* Store the data address in its register when required. */
		if (unlikely(cmd != cuc_resume))
			writel(dma_addr, &nic->csr->scb.gen_ptr);
		/* Write the operation into the command register. */
		writeb(cmd, &nic->csr->scb.cmd_lo);
	}

	spin_unlock_irqrestore(&nic->cmd_lock, irq_flags);
	return rc;
}
從此可以看到。Intel 100M網卡對發送數據的處理,只需將地址,命令寫入相應的寄存器即可。詳細資料可以查看intel 100M網卡的說明。
令人不解的是,在發送數據時,不要將發送長度寫入相關寄存器嗎?那他又是如何截取的呢?
sk_buff結構分析
sk_buff是我們遇到的第二個重要的結構,在內核中經常被縮寫成skb.在linux 2.6.21它被定義成:
/*
 * struct sk_buff - socket buffer, the kernel's packet descriptor.
 *
 * Holds list linkage, the owning socket and device, per-layer header
 * pointers, bookkeeping fields, and the four pointers (head, data, tail,
 * end) that delimit the storage area and the packet data inside it.
 *
 * Fixes relative to the excerpt: the list member's type was garbled as
 * "struct sk_buf0f_head" and is restored to "struct sk_buff_head"; the
 * definition is now terminated with the ';' that C requires.
 */
struct sk_buff {
	/* Next skb in the list. */
	struct sk_buff *next;
	/* Previous skb in the list. */
	struct sk_buff *prev;
	struct sk_buff_head *list;

	/* Owning sock (discussed in the transport-layer part of the article). */
	struct sock *sk;
	/* Receive or transmit timestamp. */
	struct timeval stamp;
	/* net_device the packet is received on or sent through. */
	struct net_device *dev;
	/* net_device the packet was received on. */
	struct net_device *input_dev;
	/*
	 * Real net_device for the packet; virtual devices are covered in
	 * the article's later bridge analysis.
	 */
	struct net_device *real_dev;

	/* Transport-layer header pointers (TCP/UDP/ICMP/IGMP/...). */
	union {
		struct tcphdr *th;
		struct udphdr *uh;
		struct icmphdr *icmph;
		struct igmphdr *igmph;
		struct iphdr *ipiph;
		struct ipv6hdr *ipv6h;
		unsigned char *raw;
	} h;

	/* Network-layer header pointers (IPv4/IPv6/ARP). */
	union {
		struct iphdr *iph;
		struct ipv6hdr *ipv6h;
		struct arphdr *arph;
		unsigned char *raw;
	} nh;

	/* Link-layer header pointer. */
	union {
		unsigned char *raw;
	} mac;

	/* Routing destination entry (analysed with the routing subsystem). */
	struct dst_entry *dst;
	struct sec_path *sp;

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char cb[40];

	/* Length fields. */
	unsigned int len,
		     data_len,
		     mac_len,
		     csum;
	unsigned char local_df,
		      cloned,
		      pkt_type,
		      ip_summed;
	__u32 priority;
	unsigned short protocol,
		       security;

	void (*destructor)(struct sk_buff *skb);
#ifdef CONFIG_NETFILTER
	unsigned long nfmark;
	__u32 nfcache;
	__u32 nfctinfo;
	struct nf_conntrack *nfct;
#ifdef CONFIG_NETFILTER_DEBUG
	unsigned int nf_debug;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info *nf_bridge;
#endif
#endif /* CONFIG_NETFILTER */
#if defined(CONFIG_HIPPI)
	union {
		__u32 ifield;
	} private;
#endif
#ifdef CONFIG_NET_SCHED
	__u32 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u32 tc_verd; /* traffic control verdict */
	__u32 tc_classid; /* traffic control classid */
#endif
#endif

	/* These elements must be at the end, see alloc_skb() for details. */
	unsigned int truesize;
	/* Reference count. */
	atomic_t users;
	unsigned char *head,	/* start of the storage area */
		      *data,	/* start of the packet data */
		      *tail,	/* end of the packet data */
		      *end;	/* end of the storage area */
};
對應我們上面的網卡驅動分析。接收到的數據是存放在data至tail之間的區域。
Skb通常還有常用的幾個函數,一一列舉分析如下:
struct sk_buff *alloc_skb(unsigned int size,int gfp_mask)
分配存儲空間爲size的skb,內存分配級別爲gfp_mask.注意這裏的存儲空間的含義,即爲skb->head至skb->end的區域;剛分配完成時data與tail都指向head,數據區(data至tail)的長度爲0
struct sk_buff *skb_clone(struct sk_buff *skb, int priority)
克隆出的skb指向同一個結構,同時會增加skb的引用計數
struct sk_buff *skb_copy(const struct sk_buff *skb, int priority)
複製一個全新的skb
void kfree_skb(struct sk_buff *skb)
當skb的引用計數爲1的時候,釋放此skb
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
使skb的存儲空間擴大len.即使tail指針下移
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
push,即在數據區前部推入一段空間:使data指針向head方向移動len字節,數據長度len相應增加。
void skb_reserve(struct sk_buff *skb, unsigned int len)
該操作使data指針跟tail指針同時下移,即擴大存儲區域之前的空間
int skb_headroom(const struct sk_buff *skb)
返回data之前可用的空間數量
int skb_tailroom(const struct sk_buff *skb)
返回緩存區中可用的空間大小
二:從網卡驅動說起。
以intel 100M 網卡驅動爲例簡要概述數據包的接收與發送流程。代碼見(drivers/net/e100.c)
網卡是屬於PCI設備,它的註冊跟一般的PCI設備註冊沒什麼兩樣。
/*
 * e100_init_module - module entry point for the e100 driver.
 *
 * Prints the driver banner when driver-level messages are enabled by the
 * debug setting, then registers the PCI driver.  Returns the result of
 * pci_module_init().
 *
 * Fix: both banner strings now end with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static int __init e100_init_module(void)
{
	if(((1 << debug) - 1) & NETIF_MSG_DRV) {
		printk(KERN_INFO PFX "%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
		printk(KERN_INFO PFX "%s\n", DRV_COPYRIGHT);
	}
	/* Register with the PCI core. */
	return pci_module_init(&e100_driver);
}
其中e100_driver對應爲網卡的pci_driver.
/*
 * PCI driver descriptor for the e100 NIC.
 *
 * Fix: the initializer is now terminated with the ';' that C requires;
 * the excerpt omitted it.
 */
static struct pci_driver e100_driver = {
	/* Name of the driver. */
	.name = DRV_NAME,
	/* Table of device IDs this driver matches. */
	.id_table = e100_id_table,
	/* Probe routine. */
	.probe = e100_probe,
	/* Remove routine, called when the device is removed. */
	.remove = __devexit_p(e100_remove),
#ifdef CONFIG_PM
	.suspend = e100_suspend,
	.resume = e100_resume,
#endif
};
當總線探測到PCI設備符合e100_id_table中的參數時,將會調用e100_probe,開始設備的初始化
在e100_probe中:
/*
 * e100_probe - PCI probe routine for the e100 NIC.
 * @pdev: PCI device that matched e100_id_table
 * @ent:  matching table entry; a non-zero driver_data sets the ich flag
 *
 * Allocates the net_device with a struct nic as private area, installs
 * the driver callbacks, enables the PCI device, claims its regions,
 * sets a 32-bit DMA mask, maps the control/status registers, sets up the
 * watchdog and LED timers, allocates driver memory, resets the hardware,
 * loads the EEPROM, copies the MAC address out of it, and finally
 * registers the network device.
 *
 * Returns 0 on success.  On failure every resource acquired so far is
 * released in reverse order and a negative errno is returned.
 *
 * Fix: every log string now ends with a real newline escape ("\n")
 * instead of the literal characters "/n".
 */
static int __devinit e100_probe(struct pci_dev *pdev,
	const struct pci_device_id *ent)
{
	struct net_device *netdev;
	struct nic *nic;
	int err;

	/*
	 * Allocate the net_device.  alloc_etherdev() is the allocator for
	 * Ethernet interfaces (per the original note, a wrapper around
	 * alloc_netdev()).
	 */
	if(!(netdev = alloc_etherdev(sizeof(struct nic)))) {
		if(((1 << debug) - 1) & NETIF_MSG_PROBE)
			printk(KERN_ERR PFX "Etherdev alloc failed, abort.\n");
		return -ENOMEM;
	}

	/* Install the driver's callbacks. */
	netdev->open = e100_open;
	netdev->stop = e100_close;
	netdev->hard_start_xmit = e100_xmit_frame;
	netdev->get_stats = e100_get_stats;
	netdev->set_multicast_list = e100_set_multicast_list;
	netdev->set_mac_address = e100_set_mac_address;
	netdev->change_mtu = e100_change_mtu;
	netdev->do_ioctl = e100_do_ioctl;
	/* Used when the ethtool utility is supported. */
	SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);
	netdev->tx_timeout = e100_tx_timeout;
	netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
	/* Polling callback. */
	netdev->poll = e100_poll;
	netdev->weight = E100_NAPI_WEIGHT;
#ifdef CONFIG_NET_POLL_CONTROLLER
	netdev->poll_controller = e100_netpoll;
#endif

	/*
	 * Fetch the private area (its size was given to alloc_etherdev()
	 * above) and fill in the back-pointers.
	 */
	nic = netdev_priv(netdev);
	nic->netdev = netdev;
	nic->pdev = pdev;
	nic->msg_enable = (1 << debug) - 1;
	pci_set_drvdata(pdev, netdev);

	/*
	 * Enable the PCI device in preparation for DMA and I/O memory
	 * mapping.  (Per the original note this is done by programming the
	 * device's PCI control registers.)
	 */
	if((err = pci_enable_device(pdev))) {
		DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.\n");
		goto err_out_free_dev;
	}

	/* Resource 0 must be an I/O memory resource (IORESOURCE_MEM). */
	if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
		DPRINTK(PROBE, ERR, "Cannot find proper PCI device "
			"base address, aborting.\n");
		err = -ENODEV;
		goto err_out_disable_pdev;
	}

	/* Claim the device's PCI regions (original note: all six BARs). */
	if((err = pci_request_regions(pdev, DRV_NAME))) {
		DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.\n");
		goto err_out_disable_pdev;
	}

	/*
	 * Check DMA capability: a zero return from pci_set_dma_mask()
	 * means the 32-bit mask is usable.
	 */
	pci_set_master(pdev);
	if((err = pci_set_dma_mask(pdev, 0xFFFFFFFFULL))) {
		DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n");
		goto err_out_free_res;
	}

	SET_MODULE_OWNER(netdev);
	SET_NETDEV_DEV(netdev, &pdev->dev);

	/*
	 * Map the device registers; from here on register accesses are
	 * memory accesses through nic->csr.
	 */
	nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr));
	if(!nic->csr) {
		DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.\n");
		err = -ENOMEM;
		goto err_out_free_res;
	}

	if(ent->driver_data)
		nic->flags |= ich;
	else
		nic->flags &= ~ich;

	spin_lock_init(&nic->cb_lock);
	spin_lock_init(&nic->cmd_lock);

	/* Set up the timers. */
	init_timer(&nic->watchdog);
	nic->watchdog.function = e100_watchdog;
	nic->watchdog.data = (unsigned long)nic;
	init_timer(&nic->blink_timer);
	nic->blink_timer.function = e100_blink_led;
	nic->blink_timer.data = (unsigned long)nic;

	/*
	 * Allocate driver memory.  (Original note: this sets up consistent
	 * DMA memory for nic->mem, only useful with ethtool support.)
	 */
	if((err = e100_alloc(nic))) {
		DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.\n");
		goto err_out_iounmap;
	}

	/* Fill in default values for the nic members. */
	e100_get_defaults(nic);
	e100_hw_reset(nic);
	e100_phy_init(nic);

	/*
	 * Load the NIC's EEPROM, which holds the MAC address.  (Original
	 * note: the EEPROM is accessed through the mapped registers in
	 * nic->csr.)
	 */
	if((err = e100_eeprom_load(nic)))
		goto err_out_free;

	/* Set netdev->dev_addr from the EEPROM contents. */
	memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);
	if(!is_valid_ether_addr(netdev->dev_addr)) {
		DPRINTK(PROBE, ERR, "Invalid MAC address from "
			"EEPROM, aborting.\n");
		err = -EAGAIN;
		goto err_out_free;
	}

	/* Wol magic packet can be enabled from eeprom */
	if((nic->mac >= mac_82558_D101_A4) &&
	   (nic->eeprom[eeprom_id] & eeprom_id_wol))
		nic->flags |= wol_magic;

	pci_enable_wake(pdev, 0, nic->flags & (wol_magic | e100_asf(nic)));

	/* Register the network device. */
	if((err = register_netdev(netdev))) {
		DPRINTK(PROBE, ERR, "Cannot register net device, aborting.\n");
		goto err_out_free;
	}

	DPRINTK(PROBE, INFO, "addr 0x%lx, irq %d, "
		"MAC addr %02X:%02X:%02X:%02X:%02X:%02X\n",
		pci_resource_start(pdev, 0), pdev->irq,
		netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2],
		netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]);

	return 0;

err_out_free:
	e100_free(nic);
err_out_iounmap:
	iounmap(nic->csr);
err_out_free_res:
	pci_release_regions(pdev);
err_out_disable_pdev:
	pci_disable_device(pdev);
err_out_free_dev:
	pci_set_drvdata(pdev, NULL);
	free_netdev(netdev);
	return err;
}
<<prison break>>第三季的第五集,終於在翹首企盼中姍姍來遲了,scofid用它驚人的智慧一次次化險爲夷,但在邪惡的sona監獄他將如何逃脫呢?這我們不得而知,但我們可以分析Linux網絡驅動來得到數據包是怎麼通過物理接口的這一層“prison”束縛來達到通信目的:-)
一:預備知識
關於I/O內存映射。
設備通過控制總線,數據總線,狀態總線與CPU相連。控制總線傳送控制信號,例如,網卡的啓用。數據總線控制數據傳輸,例如,網卡發送數據。狀態總線一般都是讀取設備的當前狀態,例如讀取網卡的MAC地址。
在傳統的操作中,都是通過讀寫設備寄存器的值來實現。但是這樣耗費了CPU時鐘,而且每取一次值都要讀取設備寄存器,造成了效率的低下。在現代操作系統中,引入了I/O內存映射,即把寄存器的值映射到主存。對設備寄存器的操作,轉換爲對主存的操作,這樣極大的提高了效率。
關於DMA
這是關於設備數據處理的一種方式。傳統的處理方法爲:當設備接收到數據,向CPU報告中斷。CPU處理中斷,把數據放到內存。
在現代操作系統中引入的DMA是指,設備接收到數據時,把數據放至DMA內存,再向CPU產生中斷。這樣節省了大量的CPU時間
關於軟中斷與NAPI
在現代操作系統中,對中斷的處理速度要求越來越高。爲了響應中斷,將中斷分爲兩部份,即上半部與下半部。上半部將數據推入處理隊列,響應中斷。然後再由下半部調度完成餘下的任務。
NAPI是2.6新引入的一個概念,它在發生中斷的時候,禁用中斷,然後處理數據。之後,每隔一定的時間,它會主動向設備詢問是否有數據要處理。
I/O,DMA在後續代碼分析中會討論在linux2.6.21中的實現。軟中斷與NAPI的詳細知識將會在分析中斷處理的時候,一一爲你道來