OVS dp支持的action都在do_execute_actions函數中定義,支持的action包括:
OVS_ACTION_ATTR_OUTPUT
OVS_ACTION_ATTR_USERSPACE
OVS_ACTION_ATTR_HASH
OVS_ACTION_ATTR_PUSH_MPLS
OVS_ACTION_ATTR_POP_MPLS
OVS_ACTION_ATTR_PUSH_VLAN
OVS_ACTION_ATTR_POP_VLAN
OVS_ACTION_ATTR_RECIRC
OVS_ACTION_ATTR_SET
OVS_ACTION_ATTR_SET_MASKED
OVS_ACTION_ATTR_SET_TO_MASKED
OVS_ACTION_ATTR_SAMPLE
OVS_ACTION_ATTR_CT
本系列要完成這些action的分析;其中output已經在之前介紹datapath主流程時分析過,這裏不再重複。
一、OVS_ACTION_ATTR_USERSPACE
OVS_ACTION_ATTR_USERSPACE的處理函數爲output_userspace函數,本節以此函數作爲入口進行分析。
1、output_userspace函數
/*
 * Handle OVS_ACTION_ATTR_USERSPACE: collect the nested attributes of 'attr'
 * into a dp_upcall_info and pass the packet to ovs_dp_upcall().
 *
 * 'actions' / 'actions_len' are attached to the upcall only when the
 * OVS_USERSPACE_ATTR_ACTIONS attribute is present.
 *
 * Returns whatever ovs_dp_upcall() returns.
 */
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
			    struct sw_flow_key *key, const struct nlattr *attr,
			    const struct nlattr *actions, int actions_len)
{
	struct dp_upcall_info upcall;
	struct ip_tunnel_info info;
	const struct nlattr *opt;
	int left;

	memset(&upcall, 0, sizeof(upcall));
	upcall.cmd = OVS_PACKET_CMD_ACTION;
	upcall.mru = OVS_CB(skb)->mru;

	/* Walk the attributes nested inside the userspace action. */
	for (opt = nla_data(attr), left = nla_len(attr); left > 0;
	     opt = nla_next(opt, &left)) {
		switch (nla_type(opt)) {
		case OVS_USERSPACE_ATTR_USERDATA:
			upcall.userdata = opt;
			break;

		case OVS_USERSPACE_ATTR_PID:
			upcall.portid = nla_get_u32(opt);
			break;

		case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
			/* Fetch egress tunnel info for the named port. */
			struct vport *vport = ovs_vport_rcu(dp, nla_get_u32(opt));

			if (!vport)
				break;

			upcall.egress_tun_info = &info;
			if (ovs_vport_get_egress_tun_info(vport, skb, &upcall))
				upcall.egress_tun_info = NULL;
			break;
		}

		case OVS_USERSPACE_ATTR_ACTIONS:
			/* Include the enclosing action list in the upcall. */
			upcall.actions = actions;
			upcall.actions_len = actions_len;
			break;
		}
	}

	return ovs_dp_upcall(dp, skb, key, &upcall);
}
2、ovs_dp_upcall函數
/*
 * Send 'skb' to userspace as described by 'upcall_info'.
 *
 * Fails with -ENOTCONN when no destination port id is set; otherwise the
 * packet is queued through the GSO or the non-GSO path.  On any failure
 * the per-CPU n_lost counter is incremented.
 *
 * Returns 0 on success or a negative errno.
 */
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct sw_flow_key *key,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *stats;
	int ret;

	if (upcall_info->portid == 0)
		ret = -ENOTCONN;
	else if (skb_is_gso(skb))
		ret = queue_gso_packets(dp, skb, key, upcall_info);
	else
		ret = queue_userspace_packet(dp, skb, key, upcall_info);

	if (!ret)
		return 0;

	/* The upcall did not go out: account the packet as lost. */
	stats = this_cpu_ptr(dp->stats_percpu);
	u64_stats_update_begin(&stats->syncp);
	stats->n_lost++;
	u64_stats_update_end(&stats->syncp);

	return ret;
}
3、queue_userspace_packet函數
/*
 * Build a generic-netlink upcall message for 'skb' and unicast it to the
 * socket identified by upcall_info->portid.
 *
 * The message carries the flow key and, when present in 'upcall_info',
 * the userdata, egress tunnel key, action list and MRU, followed by the
 * packet itself as OVS_PACKET_ATTR_PACKET.
 *
 * Returns 0 on success or a negative errno (-ENODEV, -ENOMEM, -EFBIG,
 * -ENOBUFS, or an error from one of the helpers).  The temporary clone
 * (nskb) and any unsent message (user_skb) are freed on exit.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb = NULL; /* to be queued to userspace */
struct nlattr *nla;
struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
.snd_portid = upcall_info->portid,
};
size_t len;
unsigned int hlen;
int err, dp_ifindex;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
return -ENODEV;
/* A tag held outside the packet data is written into a clone of the packet. */
if (skb_vlan_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, skb_vlan_tag_get(nskb));
if (!nskb)
return -ENOMEM;
vlan_set_tci(nskb, 0); // NOTE(review): presumably clears the out-of-band tag because it was just written into the packet data - confirm
skb = nskb;
}
if (nla_attr_size(skb->len) > USHRT_MAX) {
err = -EFBIG;
goto out;
}
/* Complete checksum if needed */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
goto out;
/* Older versions of OVS user space enforce alignment of the last
* Netlink attribute to NLA_ALIGNTO which would require extensive
* padding logic. Only perform zerocopy if padding is not required.
*/
if (dp->user_features & OVS_DP_F_UNALIGNED)
hlen = skb_zerocopy_headlen(skb);
else
hlen = skb->len;
len = upcall_msg_size(upcall_info, hlen);
user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); // allocate the upcall message
if (!user_skb) {
err = -ENOMEM;
goto out;
}
upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, // family + upcall_info->cmd presumably select the handler on the receiving side
0, upcall_info->cmd);
upcall->dp_ifindex = dp_ifindex;
err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); // add the flow key to the upcall message
BUG_ON(err);
if (upcall_info->userdata)
__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, // add the userdata to the upcall message
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); // add the egress tunnel info to the upcall message
err = ovs_nla_put_egress_tunnel_key(user_skb,
upcall_info->egress_tun_info,
upcall_info->egress_tun_opts);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
if (upcall_info->actions_len) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); // add the action list to the upcall message
err = ovs_nla_put_actions(upcall_info->actions,
upcall_info->actions_len,
user_skb);
/* A failure here only drops the nested attribute; the message is still built. */
if (!err)
nla_nest_end(user_skb, nla);
else
nla_nest_cancel(user_skb, nla);
}
/* Add OVS_PACKET_ATTR_MRU */
if (upcall_info->mru) {
if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
upcall_info->mru)) {
err = -ENOBUFS;
goto out;
}
pad_packet(dp, user_skb);
}
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy()
*/
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
err = -ENOBUFS;
goto out;
}
nla->nla_len = nla_attr_size(skb->len);
err = skb_zerocopy(user_skb, skb, skb->len, hlen); // add the packet data to the upcall message
if (err)
goto out;
/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
pad_packet(dp, user_skb);
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); // send the netlink message
/* user_skb is cleared after the send attempt so the cleanup below does not free it. */
user_skb = NULL;
out:
if (err)
skb_tx_error(skb);
kfree_skb(user_skb);
kfree_skb(nskb);
return err;
}
到此可以看到userspace action和精確流表未匹配導致的upcall在處理流程上是比較一致的,兩者都是通過調用ovs_dp_upcall函數實現信息發送到用戶態程序。upcall處理線程是如何處理的不在本篇分析,將在後續給出分析。
通過userspace能夠實現什麼功能呢? 現在還想不出,等分析upcall處理後,再回過頭來回答這個問題。
二、OVS_ACTION_ATTR_HASH
本節分析OVS_ACTION_ATTR_HASH action,該action的處理函數爲execute_hash函數
1、execute_hash函數
/*
 * Handle OVS_ACTION_ATTR_HASH: derive a hash from the packet hash and the
 * action's hash_basis and store it in key->ovs_flow_hash.
 * A result of zero is never stored; it is replaced by 1.
 */
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
			 const struct nlattr *attr)
{
	struct ovs_action_hash *hash_act = nla_data(attr);
	u32 hash;

	/* OVS_HASH_ALG_L4 is the only possible hash algorithm. */
	hash = jhash_1word(skb_get_hash(skb), hash_act->hash_basis);

	key->ovs_flow_hash = hash ? hash : 0x1;
}
該action僅對key的ovs_flow_hash成員變量進行了修改,從該變量的使用地方逆推,最終是queue_userspace_packet會使用,該函數是把報文發送給用戶態進程,本次就看下queue_userspace_packet函數是如何使用到該成員變量的。
2、queue_userspace_packet函數
/*
 * Build a generic-netlink upcall message for 'skb' and unicast it to the
 * socket identified by upcall_info->portid.
 *
 * The flow key is serialized through ovs_nla_put_key(), which is where
 * key->ovs_flow_hash (set by the hash action) ends up being emitted.
 *
 * Returns 0 on success or a negative errno (-ENODEV, -ENOMEM, -EFBIG,
 * -ENOBUFS, or an error from one of the helpers).  The temporary clone
 * (nskb) and any unsent message (user_skb) are freed on exit.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_key *key,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb = NULL; /* to be queued to userspace */
struct nlattr *nla;
struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
.snd_portid = upcall_info->portid,
};
size_t len;
unsigned int hlen;
int err, dp_ifindex;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
return -ENODEV;
/* A tag held outside the packet data is written into a clone of the packet. */
if (skb_vlan_tag_present(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return -ENOMEM;
nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, skb_vlan_tag_get(nskb));
if (!nskb)
return -ENOMEM;
vlan_set_tci(nskb, 0); // NOTE(review): presumably clears the out-of-band tag because it was just written into the packet data - confirm
skb = nskb;
}
if (nla_attr_size(skb->len) > USHRT_MAX) {
err = -EFBIG;
goto out;
}
/* Complete checksum if needed */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
goto out;
/* Older versions of OVS user space enforce alignment of the last
* Netlink attribute to NLA_ALIGNTO which would require extensive
* padding logic. Only perform zerocopy if padding is not required.
*/
if (dp->user_features & OVS_DP_F_UNALIGNED)
hlen = skb_zerocopy_headlen(skb);
else
hlen = skb->len;
len = upcall_msg_size(upcall_info, hlen);
user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); // allocate the upcall message
if (!user_skb) {
err = -ENOMEM;
goto out;
}
upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, // family + upcall_info->cmd presumably select the handler on the receiving side
0, upcall_info->cmd);
upcall->dp_ifindex = dp_ifindex;
err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); // add the flow key to the upcall message; this call chain is what emits ovs_flow_hash
BUG_ON(err);
if (upcall_info->userdata)
__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, // add the userdata to the upcall message
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); // add the egress tunnel info to the upcall message
err = ovs_nla_put_egress_tunnel_key(user_skb,
upcall_info->egress_tun_info,
upcall_info->egress_tun_opts);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
if (upcall_info->actions_len) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); // add the action list to the upcall message
err = ovs_nla_put_actions(upcall_info->actions,
upcall_info->actions_len,
user_skb);
/* A failure here only drops the nested attribute; the message is still built. */
if (!err)
nla_nest_end(user_skb, nla);
else
nla_nest_cancel(user_skb, nla);
}
/* Add OVS_PACKET_ATTR_MRU */
if (upcall_info->mru) {
if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
upcall_info->mru)) {
err = -ENOBUFS;
goto out;
}
pad_packet(dp, user_skb);
}
/* Only reserve room for attribute header, packet data is added
* in skb_zerocopy()
*/
if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
err = -ENOBUFS;
goto out;
}
nla->nla_len = nla_attr_size(skb->len);
err = skb_zerocopy(user_skb, skb, skb->len, hlen); // add the packet data to the upcall message
if (err)
goto out;
/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
pad_packet(dp, user_skb);
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); // send the netlink message
/* user_skb is cleared after the send attempt so the cleanup below does not free it. */
user_skb = NULL;
out:
if (err)
skb_tx_error(skb);
kfree_skb(user_skb);
kfree_skb(nskb);
return err;
}
3、ovs_nla_put_key函數
/*
 * Serialize a flow key into 'skb' as a nested attribute of type 'attr'.
 *
 * Returns 0 on success, -EMSGSIZE when the nest cannot be opened, or the
 * error reported by __ovs_nla_put_key() (the nest is left open in that
 * case, exactly as before).
 */
int ovs_nla_put_key(const struct sw_flow_key *swkey,
		    const struct sw_flow_key *output, int attr, bool is_mask,
		    struct sk_buff *skb)
{
	struct nlattr *nest = nla_nest_start(skb, attr);
	int ret;

	if (!nest)
		return -EMSGSIZE;

	ret = __ovs_nla_put_key(swkey, output, is_mask, skb);
	if (!ret)
		nla_nest_end(skb, nest);

	return ret;
}
4、__ovs_nla_put_key函數
static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, bool is_mask,
struct sk_buff *skb)
{
struct ovs_key_ethernet *eth_key;
struct nlattr *nla, *encap;
if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
goto nla_put_failure;
if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) //使用到該變量
goto nla_put_failure;
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
goto nla_put_failure;
if ((swkey->tun_key.u.ipv4.dst || is_mask)) {
const void *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
swkey->tun_opts_len))
goto nla_put_failure;
}
if (swkey->phy.in_port == DP_MAX_PORTS) {
if (is_mask && (output->phy.in_port == 0xffff))
if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
goto nla_put_failure;
三、OVS_ACTION_ATTR_PUSH_VLAN
本節分析OVS_ACTION_ATTR_PUSH_VLAN action,該action的處理函數爲push_vlan。
1、push_vlan函數
/*
 * Handle OVS_ACTION_ATTR_PUSH_VLAN.
 *
 * If the packet carries no out-of-band VLAN tag yet, the flow key's TCI is
 * updated directly; otherwise the flow key is invalidated.  The tag itself
 * is pushed by skb_vlan_push() with VLAN_TAG_PRESENT masked out of the TCI.
 *
 * Returns the result of skb_vlan_push().
 */
static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
		     const struct ovs_action_push_vlan *vlan)
{
	if (!skb_vlan_tag_present(skb))
		key->eth.tci = vlan->vlan_tci;
	else
		invalidate_flow_key(key);

	return skb_vlan_push(skb, vlan->vlan_tpid,
			     ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}
2、skb_vlan_push函數
#define skb_vlan_push rpl_skb_vlan_push
/*
 * Push a VLAN tag onto 'skb'.
 *
 * If the skb already carries an out-of-band tag, that tag is first written
 * into the packet data (growing the L2 header by VLAN_HLEN); the new tag
 * (vlan_proto / vlan_tci) is then recorded out-of-band without touching
 * the packet data.
 *
 * Returns 0 on success or the error from __vlan_insert_tag().  On failure
 * skb->data is restored to where the caller left it.
 */
int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
	if (skb_vlan_tag_present(skb)) {
		/* Distance between skb->data and the MAC header. */
		unsigned int offset = skb->data - skb_mac_header(skb);
		int err;

		/* __vlan_insert_tag expect skb->data pointing to mac header.
		 * So change skb->data before calling it and change back to
		 * original position later
		 */
		__skb_push(skb, offset);
		err = __vlan_insert_tag(skb, skb->vlan_proto,
					skb_vlan_tag_get(skb));
		if (err) {
			/* Undo the temporary push; otherwise the caller would
			 * get the skb back with skb->data displaced.
			 */
			__skb_pull(skb, offset);
			return err;
		}

		/* The L2 header grew by one VLAN header (4 bytes). */
		skb->mac_len += VLAN_HLEN;

		/* Back to the same offset from the (moved) MAC header. */
		__skb_pull(skb, offset);

		/* Fold the inserted tag bytes into a complete checksum. */
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->csum = csum_add(skb->csum, csum_partial(skb->data
					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
	}

	/* Record the new tag out-of-band; packet data is not modified. */
	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
	return 0;
}
3、__vlan_insert_tag函數
/* NOTE(review): the wrapper macro drops its 'proto' argument; the function below always writes ETH_P_8021Q. */
#define __vlan_insert_tag(skb, proto, tci) rpl_vlan_insert_tag(skb, tci)
/*
 * Insert a VLAN header carrying 'vlan_tci' into the packet data of 'skb',
 * right after the two MAC addresses.  The caller is expected to have
 * skb->data pointing at the MAC header.
 *
 * Returns 0 on success, -ENOMEM if headroom cannot be made available.
 */
static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci)
{
struct vlan_ethhdr *veth;
if (skb_cow_head(skb, VLAN_HLEN) < 0) // make sure VLAN_HLEN bytes of writable headroom exist
return -ENOMEM;
veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); // move data back 4 bytes, opening room in the L2 header for the tag
/* Move the mac addresses to the beginning of the new header. */
memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); // copy destination + source MAC (12 bytes) to the new start
skb->mac_header -= VLAN_HLEN; // the MAC header now starts 4 bytes earlier
/* first, the ethernet type */
veth->h_vlan_proto = htons(ETH_P_8021Q); // veth points at the new frame start; h_vlan_proto is a struct member, presumably located right after the two MAC addresses
/* now, the TCI */
veth->h_vlan_TCI = htons(vlan_tci); // store the TCI of the tag
return 0;
}
push vlan動作的流程:如果報文已經帶有保存在skb->vlan_tci中的vlan標籤,那麼先修改報文數據,把這個已有的標籤作爲vlan頭寫入報文;然後再把新的vlan信息設置到skb->vlan_tci中。新的vlan頭此時不修改報文數據,由硬件在發送時添加到報文中。
四、OVS_ACTION_ATTR_POP_VLAN
本節分析OVS_ACTION_ATTR_POP_VLAN action的處理函數pop_vlan。
1、pop_vlan函數
/*
 * Handle OVS_ACTION_ATTR_POP_VLAN.
 *
 * After skb_vlan_pop(), if the skb still carries an out-of-band tag the
 * flow key is invalidated; otherwise the key's TCI is cleared.
 *
 * Returns the result of skb_vlan_pop().
 */
static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int ret = skb_vlan_pop(skb);

	if (!skb_vlan_tag_present(skb))
		key->eth.tci = 0;
	else
		invalidate_flow_key(key);

	return ret;
}
2、skb_vlan_pop函數
#define skb_vlan_pop rpl_skb_vlan_pop
/*
 * Pop the outermost VLAN tag of 'skb'.
 *
 * An out-of-band tag is dropped by clearing skb->vlan_tci.  Otherwise, if
 * skb->protocol says the packet data starts with a VLAN header, that
 * header is removed from the data.  If another VLAN header follows, it is
 * removed from the data as well and recorded as the out-of-band tag.
 *
 * Returns 0 on success (including "nothing to pop") or the error from
 * __skb_vlan_pop().
 */
int rpl_skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
__be16 vlan_proto;
int err;
if (likely(skb_vlan_tag_present(skb))) { // out-of-band tag present: clearing skb->vlan_tci is enough
skb->vlan_tci = 0;
} else {
/* No VLAN header in the packet data either: nothing to do. */
if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
skb->protocol != htons(ETH_P_8021AD)) ||
skb->len < VLAN_ETH_HLEN))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci); // remove the VLAN header from the packet data
if (err)
return err;
}
/* move next vlan tag to hw accel tag */
if (likely((skb->protocol != htons(ETH_P_8021Q) && // only continues for stacked tags (QinQ)
skb->protocol != htons(ETH_P_8021AD)) ||
skb->len < VLAN_ETH_HLEN))
return 0;
vlan_proto = htons(ETH_P_8021Q);
err = __skb_vlan_pop(skb, &vlan_tci); // strip the next header from the data as well
if (unlikely(err))
return err;
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); // record that inner tag out-of-band
return 0;
}
3、__skb_vlan_pop函數
/*
 * Remove one VLAN header from the packet data of 'skb' and return its TCI
 * (host byte order) through 'vlan_tci'.
 *
 * skb->data is temporarily moved to the MAC header and moved back by the
 * same offset on both the success and the error path.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
unsigned int offset = skb->data - skb_mac_header(skb);
int err;
__skb_push(skb, offset); // point skb->data at the MAC header
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
goto pull;
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
*vlan_tci = ntohs(vhdr->h_vlan_TCI);
memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); // shift both MAC addresses 4 bytes towards the payload
__skb_pull(skb, VLAN_HLEN); // drop the 4 bytes that are now unused at the front
vlan_set_encap_proto(skb, vhdr); // presumably sets skb->protocol from the encapsulated header - confirm
skb->mac_header += VLAN_HLEN; // the MAC header starts 4 bytes later now
if (skb_network_offset(skb) < ETH_HLEN) // network header would overlap the 14-byte Ethernet header
skb_set_network_header(skb, ETH_HLEN); // place it right after the Ethernet header
skb_reset_mac_len(skb); // presumably recomputes mac_len as network_header - mac_header
pull:
__skb_pull(skb, offset); // restore skb->data to its original offset from the MAC header
return err;
}
pop vlan,如果報文vlan已經解析,即放在skb的vlan_tci變量,那麼直接把該變量賦值爲0,key的vlan_tci設置爲0即可;否則的話就需要修改skb的報文數據,軟件最多會pop兩個vlan頭,硬件還可以剝一個頭。
五、OVS_ACTION_ATTR_SET
本節分析OVS_ACTION_ATTR_SET action的處理函數execute_set_action函數。
1、execute_set_action函數
/*
 * Handle OVS_ACTION_ATTR_SET (the unmasked form).
 *
 * Only OVS_KEY_ATTR_TUNNEL_INFO is accepted: the skb's current dst is
 * dropped and replaced by the tunnel dst carried in the attribute.  The
 * original author notes this is what the tunnel transmit path reads later.
 *
 * Returns 0 on success, -EINVAL for any other attribute type.
 */
static int execute_set_action(struct sk_buff *skb,
			      struct sw_flow_key *flow_key,
			      const struct nlattr *a)
{
	struct ovs_tunnel_info *tun;

	/* Only tunnel set execution is supported without a mask. */
	if (nla_type(a) != OVS_KEY_ATTR_TUNNEL_INFO)
		return -EINVAL;

	tun = nla_data(a);

	ovs_skb_dst_drop(skb);
	/* NOTE(review): the original author observed ovs_dst_hold() is an
	 * empty function in their build - confirm for the target kernel.
	 */
	ovs_dst_hold((struct dst_entry *)tun->tun_dst);
	ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);

	return 0;
}
該action的處理函數非常簡單,僅設置了一個參數。 我們來看看是如何被使用到的,以vxlan隧道爲例,我們從vxlan端口的send函數(vxlan_xmit)入手來看。
2、vxlan_xmit函數
#define vxlan_xmit rpl_vxlan_xmit
/*
 * Transmit 'skb' on its VXLAN device.
 *
 * The packet is sent only when the device is not in proxy mode, collects
 * metadata, and the skb carries tunnel info flagged for TX.  In every
 * other case a warning is logged and the skb is freed.
 *
 * Always returns NETDEV_TX_OK.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ip_tunnel_info *info;

	/* Tunnel metadata attached to the skb (e.g. by the set action). */
	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (!(vxlan->flags & VXLAN_F_PROXY) &&
	    (vxlan->flags & VXLAN_F_COLLECT_METADATA) &&
	    info && (info->mode & IP_TUNNEL_INFO_TX)) {
		vxlan_xmit_one(skb, dev, NULL, false);
		return NETDEV_TX_OK;
	}

	pr_warn("vxlan: unsupported flag set %x", vxlan->flags);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
vxlan報文發送流程,不在這裏分析。 通過分析,該action的作用是封裝報文,通過隧道發送報文。 在dp的層面,只有一個tunnel端口(每種tunnel隧道一個),而其他類型的端口可以是多個的,從這裏也可以看到tunnel端口只是配置信息不同而已,所以只需要一個端口,配置信息在action中提供。
六、OVS_ACTION_ATTR_RECIRC
本節分析OVS_ACTION_ATTR_RECIRC action的處理函數execute_recirc。
1、execute_recirc函數
/*
 * Handle OVS_ACTION_ATTR_RECIRC: queue the packet for another pass through
 * the datapath with the recirculation id taken from 'a'.
 *
 * 'rem' is the remaining length of the action list, used to tell whether
 * this is the last action.
 *
 * Returns 0, or the error from ovs_flow_key_update() when an invalidated
 * key cannot be rebuilt.  Out-of-memory and a full deferred-action FIFO
 * are not reported as errors.
 */
static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
			  struct sw_flow_key *key,
			  const struct nlattr *a, int rem)
{
	struct deferred_action *da;

	/* An invalidated key has to be rebuilt before it is queued. */
	if (!is_flow_key_valid(key)) {
		int err = ovs_flow_key_update(skb, key);

		if (err)
			return err;
	}
	BUG_ON(!is_flow_key_valid(key));

	if (!nla_is_last(a, rem)) {
		/* Recirc action is the not the last action
		 * of the action list, need to clone the skb.
		 */
		skb = skb_clone(skb, GFP_ATOMIC);

		/* Skip the recirc action when out of memory, but
		 * continue on with the rest of the action list.
		 */
		if (!skb)
			return 0;
	}

	da = add_deferred_actions(skb, key, NULL);
	if (!da) {
		kfree_skb(skb);
		if (net_ratelimit())
			pr_warn("%s: deferred action limit reached, drop recirc action\n",
				ovs_dp_name(dp));
		return 0;
	}

	da->pkt_key.recirc_id = nla_get_u32(a);
	return 0;
}
2、add_deferred_actions函數
/* Return queue entry if fifo is not full */
static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
const struct sw_flow_key *key,
const struct nlattr *attr)
{
struct action_fifo *fifo;
struct deferred_action *da;
fifo = this_cpu_ptr(action_fifos);
da = action_fifo_put(fifo); //添加一個deferred_action
if (da) {
da->skb = skb;
da->actions = attr; //recirc action,actions爲空
da->pkt_key = *key;
}
return da;
}
3、action_fifo_put函數
/*
 * Claim the next free slot of 'fifo' and advance its head.
 * Returns NULL once head has reached DEFERRED_ACTION_FIFO_SIZE - 1.
 */
static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
{
	struct deferred_action *slot = NULL;

	if (fifo->head < DEFERRED_ACTION_FIFO_SIZE - 1)
		slot = &fifo->fifo[fifo->head++];

	return slot;
}
從上面可知,OVS_ACTION_ATTR_RECIRC action就是在action_fifos全局對象中添加一個deferred_action。 這些actions在什麼時候被使用呢? 答案是ovs_execute_actions函數。
4、ovs_execute_actions函數
/*
 * Execute the action list 'acts' against 'skb'.
 *
 * A per-CPU nesting counter guards against packet loops: at
 * EXEC_ACTIONS_LEVEL_LIMIT the packet is freed and -ELOOP is returned.
 * Deferred actions are drained only by the outermost invocation (the one
 * that observed a nesting level of zero), i.e. after the regular actions.
 *
 * Returns the status of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	const int depth = this_cpu_read(exec_actions_level);
	int ret;

	if (unlikely(depth >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));
		kfree_skb(skb);
		return -ELOOP;
	}

	this_cpu_inc(exec_actions_level);

	ret = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);

	if (depth == 0)
		process_deferred_actions(dp);

	this_cpu_dec(exec_actions_level);

	/* This return status currently does not reflect the errors
	 * encounted during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return ret;
}
5、process_deferred_actions函數
/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries with an action list have that list executed; entries without
 * one (queued by the recirc action) are sent back through
 * ovs_dp_process_packet() with their saved key.  The FIFO is
 * re-initialised once it is empty.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(fifo))
		return;

	/* Finishing executing all deferred actions. */
	do {
		struct deferred_action *entry = action_fifo_get(fifo);
		const struct nlattr *acts = entry->actions;

		if (!acts)
			ovs_dp_process_packet(entry->skb, &entry->pkt_key);
		else
			do_execute_actions(dp, entry->skb, &entry->pkt_key,
					   acts, nla_len(acts));
	} while (!action_fifo_is_empty(fifo));

	/* Reset FIFO for the next packet. */
	action_fifo_init(fifo);
}
OVS_ACTION_ATTR_RECIRC action提供了重複處理的功能,但是這樣的功能價值是什麼? 現在還沒想明白。
七、OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED
本節分析OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED action,處理函數爲execute_masked_set_action函數。
1、execute_masked_set_action函數
/*
 * Handle OVS_ACTION_ATTR_SET_MASKED / OVS_ACTION_ATTR_SET_TO_MASKED:
 * apply a masked rewrite selected by the type of attribute 'a'.
 *
 * The new value comes from the attribute payload and the mask from
 * get_mask(); priority and mark are rewritten inline, the header fields
 * are delegated to the set_*() helpers.  'flow_key' is kept in sync.
 *
 * Returns 0 on success, -EINVAL for tunnel info and conntrack attributes,
 * or the helper's error.  Attribute types not listed here leave err at 0.
 */
static int execute_masked_set_action(struct sk_buff *skb,
struct sw_flow_key *flow_key,
const struct nlattr *a)
{
int err = 0;
switch (nla_type(a)) {
case OVS_KEY_ATTR_PRIORITY:
OVS_SET_MASKED(skb->priority, nla_get_u32(a), // set packet priority (presumably consumed by tc)
*get_mask(a, u32 *));
flow_key->phy.priority = skb->priority;
break;
case OVS_KEY_ATTR_SKB_MARK:
OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); // set packet mark (presumably consumed by iptables)
flow_key->phy.skb_mark = skb->mark;
break;
case OVS_KEY_ATTR_TUNNEL_INFO:
/* Masked data not supported for tunnel. */
err = -EINVAL;
break;
case OVS_KEY_ATTR_ETHERNET:
err = set_eth_addr(skb, flow_key, nla_data(a), // rewrite source / destination MAC
get_mask(a, struct ovs_key_ethernet *));
break;
case OVS_KEY_ATTR_IPV4:
err = set_ipv4(skb, flow_key, nla_data(a), // rewrite IPv4 source, destination, tos, ttl
get_mask(a, struct ovs_key_ipv4 *));
break;
case OVS_KEY_ATTR_IPV6:
err = set_ipv6(skb, flow_key, nla_data(a), // rewrite IPv6 fields
get_mask(a, struct ovs_key_ipv6 *));
break;
case OVS_KEY_ATTR_TCP:
err = set_tcp(skb, flow_key, nla_data(a), // rewrite TCP fields (source / destination port, per the original author)
get_mask(a, struct ovs_key_tcp *));
break;
case OVS_KEY_ATTR_UDP:
err = set_udp(skb, flow_key, nla_data(a), // rewrite UDP source / destination port
get_mask(a, struct ovs_key_udp *));
break;
case OVS_KEY_ATTR_SCTP:
err = set_sctp(skb, flow_key, nla_data(a),
get_mask(a, struct ovs_key_sctp *));
break;
case OVS_KEY_ATTR_MPLS:
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
__be32 *));
break;
/* Conntrack fields cannot be set through this action. */
case OVS_KEY_ATTR_CT_STATE:
case OVS_KEY_ATTR_CT_ZONE:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
err = -EINVAL;
break;
}
return err;
}
2、set_eth_addr函數
static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
const struct ovs_key_ethernet *key,
const struct ovs_key_ethernet *mask)
{
int err;
err = skb_ensure_writable(skb, ETH_HLEN);
if (unlikely(err))
return err;
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
mask->eth_src);
ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
mask->eth_dst);
ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
return 0;
}
3、set_ipv4函數
/*
 * Masked rewrite of IPv4 header fields: source address, destination
 * address, TOS and TTL.  A field is touched only when its mask is
 * non-zero, and 'flow_key' is updated for every field that is rewritten.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
const struct ovs_key_ipv4 *key,
const struct ovs_key_ipv4 *mask)
{
struct iphdr *nh;
__be32 new_addr;
int err;
/* The whole fixed IPv4 header must be writable. */
err = skb_ensure_writable(skb, skb_network_offset(skb) +
sizeof(struct iphdr));
if (unlikely(err))
return err;
nh = ip_hdr(skb);
/* Setting an IP addresses is typically only a side effect of
* matching on them in the current userspace implementation, so it
* makes sense to check if the value actually changed.
*/
if (mask->ipv4_src) {
new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
if (unlikely(new_addr != nh->saddr)) {
set_ip_addr(skb, nh, &nh->saddr, new_addr);
flow_key->ipv4.addr.src = new_addr;
}
}
if (mask->ipv4_dst) {
new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
if (unlikely(new_addr != nh->daddr)) {
set_ip_addr(skb, nh, &nh->daddr, new_addr);
flow_key->ipv4.addr.dst = new_addr;
}
}
if (mask->ipv4_tos) {
/* The helper is given the inverted mask. */
ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos);
flow_key->ip.tos = nh->tos;
}
if (mask->ipv4_ttl) {
set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl);
flow_key->ip.ttl = nh->ttl;
}
return 0;
}
4、set_udp函數
/*
 * Masked rewrite of the UDP source and destination ports.
 *
 * When the datagram carries a checksum and the skb is not
 * CHECKSUM_PARTIAL, changed ports go through set_tp_port() together with
 * the checksum field; otherwise the ports are simply overwritten.  The
 * skb hash is cleared afterwards.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
const struct ovs_key_udp *key,
const struct ovs_key_udp *mask)
{
struct udphdr *uh;
__be16 src, dst;
int err;
err = skb_ensure_writable(skb, skb_transport_offset(skb) +
sizeof(struct udphdr));
if (unlikely(err))
return err;
uh = udp_hdr(skb);
/* Either of the masks is non-zero, so do not bother checking them. */
src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src);
dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst);
if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
if (likely(src != uh->source)) {
set_tp_port(skb, &uh->source, src, &uh->check);
flow_key->tp.src = src;
}
if (likely(dst != uh->dest)) {
set_tp_port(skb, &uh->dest, dst, &uh->check);
flow_key->tp.dst = dst;
}
/* A checksum that became 0 is stored as CSUM_MANGLED_0 instead. */
if (unlikely(!uh->check))
uh->check = CSUM_MANGLED_0;
} else {
uh->source = src;
uh->dest = dst;
flow_key->tp.src = src;
flow_key->tp.dst = dst;
}
skb_clear_hash(skb);
return 0;
}
本節分析的action的作用是修改skb報文,通過key和mask兩個值可以修改任意sw_flow_key結構體定義的字段。基於該框架,可以任意修改報文內容。例如arp代答等等。
八、OVS_ACTION_ATTR_SAMPLE
本節分析OVS_ACTION_ATTR_SAMPLE action的處理函數sample。
1、sample函數
/*
 * Handle OVS_ACTION_ATTR_SAMPLE.
 *
 * The nested attributes of 'attr' supply a sampling probability and an
 * action list.  If the probability check rejects the packet, nothing is
 * done.  Otherwise a lone userspace action is executed immediately via
 * output_userspace(); any other list is queued, together with a clone of
 * the skb, as a deferred action.
 *
 * Returns 0, or the result of output_userspace() in the single
 * userspace-action case.
 */
static int sample(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
const struct nlattr *actions, int actions_len)
{
const struct nlattr *acts_list = NULL;
const struct nlattr *a;
int rem;
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
a = nla_next(a, &rem)) {
u32 probability;
switch (nla_type(a)) {
case OVS_SAMPLE_ATTR_PROBABILITY: // sampling probability
probability = nla_get_u32(a);
if (!probability || prandom_u32() > probability)
return 0;
break;
case OVS_SAMPLE_ATTR_ACTIONS: // actions to run on sampled packets
acts_list = a;
break;
}
}
/* NOTE(review): acts_list is dereferenced even if no OVS_SAMPLE_ATTR_ACTIONS
* attribute was seen; presumably guaranteed by validation elsewhere - confirm.
*/
rem = nla_len(acts_list);
a = nla_data(acts_list);
/* Actions list is empty, do nothing */
if (unlikely(!rem))
return 0;
/* The only known usage of sample action is having a single user-space
* action. Treat this usage as a special case.
* The output_userspace() should clone the skb to be sent to the
* user space. This skb will be consumed by its caller.
*/
if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
nla_is_last(a, rem)))
return output_userspace(dp, skb, key, a, actions, actions_len); // send to userspace; unlike a plain userspace action this is gated by the probability
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
/* Skip the sample action when out of memory. */
return 0;
if (!add_deferred_actions(skb, key, a)) { // queue on the deferred-action FIFO for later execution
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping sample action\n",
ovs_dp_name(dp));
kfree_skb(skb);
}
return 0;
}
output_userspace在前幾篇已經分析過,會把報文上傳到用戶態,用戶態如何處理後續分析。add_deferred_actions會把報文放在fifo數組中,在報文處理的最後時刻處理,看ovs_execute_actions函數。
2、ovs_execute_actions函數
/*
 * Execute a list of actions against 'skb'.
 *
 * A per-CPU nesting counter guards against packet loops: at
 * EXEC_ACTIONS_LEVEL_LIMIT the packet is freed and -ELOOP is returned.
 * When do_execute_actions() re-enters this function the observed level is
 * non-zero, so only the outermost invocation drains the deferred actions.
 *
 * Returns the status of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	const int depth = this_cpu_read(exec_actions_level);
	int ret;

	if (unlikely(depth >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));
		kfree_skb(skb);
		return -ELOOP;
	}

	this_cpu_inc(exec_actions_level);

	ret = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);

	if (depth == 0)
		process_deferred_actions(dp);

	this_cpu_dec(exec_actions_level);

	/* This return status currently does not reflect the errors
	 * encounted during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return ret;
}
我們再看一下process_deferred_actions函數是怎麼處理的。
3、process_deferred_actions函數
/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries queued by the sample action carry an action list, which is
 * executed here; entries queued by the recirc action carry none and are
 * sent back through ovs_dp_process_packet() with their saved key.  The
 * FIFO is re-initialised once it is empty.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(fifo))
		return;

	/* Finishing executing all deferred actions. */
	do {
		struct deferred_action *entry = action_fifo_get(fifo);
		const struct nlattr *acts = entry->actions;

		if (!acts)
			ovs_dp_process_packet(entry->skb, &entry->pkt_key);
		else
			do_execute_actions(dp, entry->skb, &entry->pkt_key,
					   acts, nla_len(acts));
	} while (!action_fifo_is_empty(fifo));

	/* Reset FIFO for the next packet. */
	action_fifo_init(fifo);
}
sample總體提供兩個功能,1)概率性地發送報文到用戶態;2)兩次處理報文的能力(自定義處理動作),爲什麼提供這個能力? 作用是什麼? 希望通過進一步分析,能夠回答這個問題。
原文鏈接:https://blog.csdn.net/one_clouder/article/details/52418570