skb_segment是實現封裝報文GSO分段的基礎,下面直接給出代碼。
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*
* Each segment carries at most gso_size (mss) payload bytes, preceded by a
* copy of the doffset header bytes (plus tnl_hlen bytes in front of them)
* taken from head_skb. Segments are chained through ->next, and the first
* segment's ->prev is set to the last segment.
*
* Payload is taken from head_skb's linear area, then its page frags, then
* the skbs on its frag_list. A frag_list skb with linear data is cloned and
* trimmed; otherwise a new skb is allocated and the payload is either copied
* into its linear area or attached as referenced page frags.
*
* Errors: ERR_PTR(-EINVAL) when skb_network_protocol() returns 0;
* ERR_PTR(-ENOMEM) for every failure inside the loop (err is never changed
* from its initial value, so this also covers the "too many frags" and
* skb_orphan_frags() failures). On those loop failures the segments built
* so far are freed with kfree_skb_list().
*
* Side effects on head_skb: its data pointer is pushed back by doffset
* before anything else (also on the -EINVAL path) and is not restored in
* this function. If head_skb->destructor is sock_wfree, truesize, destructor
* and sk are swapped with the last segment.
*/
struct sk_buff *skb_segment(struct sk_buff *head_skb,
netdev_features_t features)
{
struct sk_buff *segs = NULL; // head of the resulting segment list
struct sk_buff *tail = NULL; // last segment appended so far
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; // next frag_list skb to consume
skb_frag_t *frag = skb_shinfo(head_skb)->frags; // next page frag to consume
unsigned int mss = skb_shinfo(head_skb)->gso_size; // maximum payload bytes per segment
// Bytes between the MAC header and the data pointer, i.e. the header length
// in front of the payload (the original annotation calls this the inner
// header length of an encapsulated packet).
unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
struct sk_buff *frag_skb = head_skb; // skb that owns the frags currently being walked
unsigned int offset = doffset; // start of the current segment's payload, measured from the pushed data pointer
// Tunnel (outer) header length as reported by skb_tnl_header_len().
// Per the original annotation this is 0 for non-encapsulated packets and is
// what makes GSO of encapsulated packets possible - NOTE(review): helper not
// visible here, confirm.
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int headroom;
unsigned int len;
__be16 proto;
bool csum;
int sg = !!(features & NETIF_F_SG); // whether the output path supports scatter-gather
int nfrags = skb_shinfo(head_skb)->nr_frags; // number of frags in frag_skb
int err = -ENOMEM; // returned for every failure inside the loop
int i = 0; // index of the current frag within frag_skb
int pos; // end offset of the data covered so far (linear area, then each consumed frag)
int dummy; // receives the second output of skb_network_protocol(); unused
__skb_push(head_skb, doffset); // move the data pointer back to the MAC header
proto = skb_network_protocol(head_skb, &dummy); // protocol type of the packet
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
// csum stays true only when encap_hdr_csum is not set and
// can_checksum_protocol() accepts (features, proto); when it is false the
// checksum is computed in software at perform_csum_check.
csum = !head_skb->encap_hdr_csum &&
!!can_checksum_protocol(features, proto);
headroom = skb_headroom(head_skb); // headroom of head_skb, reproduced in every segment
pos = skb_headlen(head_skb); // length of the linear area
// One iteration builds one segment.
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
int hsize;
int size;
len = head_skb->len - offset; // bytes still to be segmented (headers excluded)
if (len > mss)
len = mss; // a segment never carries more than mss bytes
hsize = skb_headlen(head_skb) - offset; // linear bytes still available for this segment
if (hsize < 0)
hsize = 0;
// Without SG the whole segment is placed in the linear area.
if (hsize > len || !sg)
hsize = len;
// Clone path: nothing left in the linear area, the current frags are used
// up, and the next frag_list skb has linear data (and either that linear
// data is exactly one segment, or SG is available).
if (!hsize && i >= nfrags && skb_headlen(list_skb) && // frag_list still has data
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len); // linear area of the frag_list skb must not exceed len (at most mss)
// Switch the frag cursor over to the frag_list skb.
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
pos += skb_headlen(list_skb); // account for its linear area
// Step over the frags that fit entirely inside this segment.
while (pos < offset + len) { // only len bytes belong to this segment
BUG_ON(i >= nfrags);
size = skb_frag_size(frag);
if (pos + size > offset + len)
break;
i++;
pos += size; // account for this frag
frag++;
}
nskb = skb_clone(list_skb, GFP_ATOMIC); // the clone holds all of list_skb's data and is trimmed below
list_skb = list_skb->next; // advanced before the clone result is checked
if (unlikely(!nskb))
goto err;
if (unlikely(pskb_trim(nskb, len))) { // trim the clone down to len bytes
kfree_skb(nskb);
goto err;
}
hsize = skb_end_offset(nskb); // hsize reused as scratch: end offset before skb_cow_head()
if (skb_cow_head(nskb, doffset + headroom)) { // request doffset + headroom bytes of headroom for the headers
kfree_skb(nskb);
goto err;
}
nskb->truesize += skb_end_offset(nskb) - hsize; // adjust truesize by the change in end offset
skb_release_head_state(nskb); // release the head state inherited by the clone
__skb_push(nskb, doffset); // make room for the headers; data now sits at the MAC header
} else {
// Allocation path: build the segment in a new skb with room for the
// headroom, the headers and hsize bytes of linear payload. (Original
// annotation: the skb's frags are not used up yet, so allocate a new skb.)
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
skb_reserve(nskb, headroom); // reserve the same headroom as head_skb
__skb_put(nskb, doffset); // extend the linear area by the header length
}
// Append the new segment to the result list.
if (segs)
tail->next = nskb;
else
segs = nskb;
tail = nskb;
__copy_skb_header(nskb, head_skb); // copy skb metadata, including the header offsets
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); // shift header offsets by the headroom difference
skb_reset_mac_len(nskb); // recompute mac_len
// Copy the headers: tnl_hlen bytes in front of the data pointer plus the
// doffset bytes after it (both header layers when the packet is
// encapsulated).
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
doffset + tnl_hlen);
// A cloned frag_list skb already holds its payload (len + doffset bytes);
// per the original annotation the allocated-skb path does not satisfy
// this and continues below to copy or attach the payload.
if (nskb->len == len + doffset)
goto perform_csum_check;
// No SG (and no remcsum offload): copy the whole payload into the linear
// area and compute its checksum in the same pass.
if (!sg && !nskb->remcsum_offload) {
nskb->ip_summed = CHECKSUM_NONE;
nskb->csum = skb_copy_and_csum_bits(head_skb, offset, // copy and checksum
skb_put(nskb, len),
len, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
continue; // jumps to the loop condition, which advances offset by len
}
nskb_frag = skb_shinfo(nskb)->frags; // next free frag slot in the new segment
skb_copy_from_linear_data_offset(head_skb, offset, // copy the linear part of the payload
skb_put(nskb, hsize), hsize);
// Of head_skb's tx_flags only SKBTX_SHARED_FRAG is carried over.
skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;
// Attach page frags until the segment holds len bytes.
while (pos < offset + len) {
if (i >= nfrags) {
// Current frags are used up: continue with the next frag_list skb,
// which must have no linear data and at least one frag.
BUG_ON(skb_headlen(list_skb));
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
BUG_ON(!nfrags);
list_skb = list_skb->next; // frag_list case: move on to the next skb
}
if (unlikely(skb_shinfo(nskb)->nr_frags >=
MAX_SKB_FRAGS)) {
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
goto err; // err is still -ENOMEM here
}
if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
goto err;
// Original annotation: the frag_list handling and the frag handling are
// merged into this one loop, which adds to the complexity.
*nskb_frag = *frag; // copy the frag descriptor
__skb_frag_ref(nskb_frag); // take a reference for the new segment
size = skb_frag_size(nskb_frag); // full size of the source frag, before any trimming
if (pos < offset) {
// The frag starts before this segment: skip the leading bytes that lie
// before offset.
nskb_frag->page_offset += offset - pos;
skb_frag_size_sub(nskb_frag, offset - pos); // split the frag (front)
}
skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) {
// The frag ends inside this segment: it is fully consumed.
i++;
frag++;
pos += size;
} else {
// The frag extends past this segment: cut off the excess and stop. The
// cursor (i, frag, pos) is not advanced, so the next segment starts in
// this same frag.
skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); // split the frag (tail)
goto skip_fraglist;
}
nskb_frag++;
}
skip_fraglist:
// Account for the paged part of the payload.
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
perform_csum_check:
if (!csum && !nskb->remcsum_offload) {
// Checksum in software over everything after the doffset header bytes.
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0); // compute the checksum
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len); // advance past this segment; loop while data remains
/* Some callers want to get the end of the list.
* Put it in segs->prev to avoid walking the list.
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to transfer ownership from head_skb to last segment.
*/
if (head_skb->destructor == sock_wfree) {
swap(tail->truesize, head_skb->truesize);
swap(tail->destructor, head_skb->destructor);
swap(tail->sk, head_skb->sk);
}
return segs;
err:
// Free every segment built so far and report the failure.
kfree_skb_list(segs);
return ERR_PTR(err);
}