OVS源碼--datapath之action分析(二)

OVS dp支持的action都在do_execute_actions函數中定義,支持的action包括:

OVS_ACTION_ATTR_OUTPUT
OVS_ACTION_ATTR_USERSPACE
OVS_ACTION_ATTR_HASH
OVS_ACTION_ATTR_PUSH_MPLS
OVS_ACTION_ATTR_POP_MPLS
OVS_ACTION_ATTR_PUSH_VLAN
OVS_ACTION_ATTR_POP_VLAN
OVS_ACTION_ATTR_RECIRC
OVS_ACTION_ATTR_SET
OVS_ACTION_ATTR_SET_MASKED
OVS_ACTION_ATTR_SET_TO_MASKED
OVS_ACTION_ATTR_SAMPLE
OVS_ACTION_ATTR_CT

本系列要完成這些action的分析。output已經在之前介紹datapath主流程時分析過,這裏不再重複。

一、OVS_ACTION_ATTR_USERSPACE

OVS_ACTION_ATTR_USERSPACE的處理函數爲output_userspace函數,本節以此函數作爲入口進行分析。

1、output_userspace函數

static int output_userspace(struct datapath *dp, struct sk_buff *skb,
			    struct sw_flow_key *key, const struct nlattr *attr,
			    const struct nlattr *actions, int actions_len)
{
	struct ip_tunnel_info info;
	struct dp_upcall_info upcall;
	const struct nlattr *a;
	int rem;
 
	memset(&upcall, 0, sizeof(upcall));
	upcall.cmd = OVS_PACKET_CMD_ACTION;         //封裝upcall對象
	upcall.mru = OVS_CB(skb)->mru;
 
	for (a = nla_data(attr), rem = nla_len(attr); rem > 0;	  //獲取userspace action相關的信息
		 a = nla_next(a, &rem)) {
		switch (nla_type(a)) {
		case OVS_USERSPACE_ATTR_USERDATA:
			upcall.userdata = a;
			break;
 
		case OVS_USERSPACE_ATTR_PID:
			upcall.portid = nla_get_u32(a);
			break;
 
		case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
			/* Get out tunnel info. */
			struct vport *vport;
 
			vport = ovs_vport_rcu(dp, nla_get_u32(a));
			if (vport) {
				int err;
 
				upcall.egress_tun_info = &info;
				err = ovs_vport_get_egress_tun_info(vport, skb,
								    &upcall);
				if (err)
					upcall.egress_tun_info = NULL;
			}
 
			break;
		}
 
		case OVS_USERSPACE_ATTR_ACTIONS: {
			/* Include actions. */
			upcall.actions = actions;
			upcall.actions_len = actions_len;
			break;
		}
 
		} /* End of switch. */
	}
 
	return ovs_dp_upcall(dp, skb, key, &upcall);    //upcall
}

2、ovs_dp_upcall函數

/*
 * Queue 'skb' to userspace as described by 'upcall_info'.
 *
 * Fails with -ENOTCONN when no destination Netlink port id is set.  GSO
 * packets go through queue_gso_packets(); everything else goes through
 * queue_userspace_packet().  Any failure increments the per-CPU n_lost
 * counter of the datapath.
 *
 * Returns 0 on success or a negative errno.
 */
int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct sw_flow_key *key,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *cpu_stats;
	int ret;

	if (upcall_info->portid == 0)
		ret = -ENOTCONN;
	else if (skb_is_gso(skb))
		ret = queue_gso_packets(dp, skb, key, upcall_info);
	else
		ret = queue_userspace_packet(dp, skb, key, upcall_info);

	if (!ret)
		return 0;

	/* Account for the lost upcall. */
	cpu_stats = this_cpu_ptr(dp->stats_percpu);

	u64_stats_update_begin(&cpu_stats->syncp);
	cpu_stats->n_lost++;
	u64_stats_update_end(&cpu_stats->syncp);

	return ret;
}

3、queue_userspace_packet函數

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;
 
	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;
 
	if (skb_vlan_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;
 
		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, skb_vlan_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;
 
		vlan_set_tci(nskb, 0);	//爲什麼要把tci置0?
 
		skb = nskb;
	}
 
	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}
 
	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;
 
	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;
 
	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);		//創建upcall消息對象
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}
 
	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,	//dp_packet_genl_family 和 upcall_info->cmd確定處理函數
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;
 
	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);  //upcall信息對象添加key
	BUG_ON(err);
 
	if (upcall_info->userdata)    
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,   //upcall信息對象添加userdata
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));
 
 
	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);  //upcall信息對象添加egress_tun_info
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info,
						    upcall_info->egress_tun_opts);
		BUG_ON(err);
		nla_nest_end(user_skb, nla);
	}
 
	if (upcall_info->actions_len) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);  //upcall信息對象添加actions
		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  user_skb);
		if (!err)
			nla_nest_end(user_skb, nla);
		else
			nla_nest_cancel(user_skb, nla);
	}
 
	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
				upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
		pad_packet(dp, user_skb);
	}
 
	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy()
	 */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);
 
	err = skb_zerocopy(user_skb, skb, skb->len, hlen);    //upcall信息對象添加報文
	if (err)
		goto out;
 
	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, user_skb);
 
	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
 
	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);    //發送netlink報文
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}

到此可以看到userspace action和精確流表未匹配導致的upcall在處理流程上是比較一致的,兩者都是通過調用ovs_dp_upcall函數實現信息發送到用戶態程序。upcall處理線程是如何處理的不在本篇分析,將在後續給出分析。

通過userspace能夠實現什麼功能呢? 現在還想不出,等分析upcall處理後,再回過頭來回答這個問題。

二、OVS_ACTION_ATTR_HASH

本節分析OVS_ACTION_ATTR_HASH action,該action的處理函數爲execute_hash函數

1、execute_hash函數

static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
			 const struct nlattr *attr)
{
	struct ovs_action_hash *hash_act = nla_data(attr);
	u32 hash = 0;
 
	/* OVS_HASH_ALG_L4 is the only possible hash algorithm.  */
	hash = skb_get_hash(skb);
	hash = jhash_1word(hash, hash_act->hash_basis);
	if (!hash)
		hash = 0x1;
 
	key->ovs_flow_hash = hash;	//計算hash值
}

該action僅對key的ovs_flow_hash成員變量進行了修改,從該變量的使用地方逆推,最終是queue_userspace_packet會使用,該函數是把報文發送給用戶態進程,本次就看下queue_userspace_packet函數是如何使用到該成員變量的。

2、queue_userspace_packet函數

static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
	struct nlattr *nla;
	struct genl_info info = {
#ifdef HAVE_GENLMSG_NEW_UNICAST
		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
#endif
		.snd_portid = upcall_info->portid,
	};
	size_t len;
	unsigned int hlen;
	int err, dp_ifindex;
 
	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex)
		return -ENODEV;
 
	if (skb_vlan_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;
 
		nskb = vlan_insert_tag_set_proto(nskb, nskb->vlan_proto, skb_vlan_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;
 
		vlan_set_tci(nskb, 0);	//爲什麼要把tci置0?
 
		skb = nskb;
	}
 
	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}
 
	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto out;
 
	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	if (dp->user_features & OVS_DP_F_UNALIGNED)
		hlen = skb_zerocopy_headlen(skb);
	else
		hlen = skb->len;
 
	len = upcall_msg_size(upcall_info, hlen);
	user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);		//創建upcall消息對象
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}
 
	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,	//dp_packet_genl_family 和 upcall_info->cmd確定處理函數
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;
 
	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);  //upcall信息對象添加key,該函數最終會用到ovs_flow_hash
	BUG_ON(err);
 
	if (upcall_info->userdata)    
		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,   //upcall信息對象添加userdata
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));
 
 
	if (upcall_info->egress_tun_info) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);  //upcall信息對象添加egress_tun_info
		err = ovs_nla_put_egress_tunnel_key(user_skb,
						    upcall_info->egress_tun_info,
						    upcall_info->egress_tun_opts);
		BUG_ON(err);
		nla_nest_end(user_skb, nla);
	}
 
	if (upcall_info->actions_len) {
		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS);  //upcall信息對象添加actions
		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  user_skb);
		if (!err)
			nla_nest_end(user_skb, nla);
		else
			nla_nest_cancel(user_skb, nla);
	}
 
	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
				upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
		pad_packet(dp, user_skb);
	}
 
	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy()
	 */
	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
		err = -ENOBUFS;
		goto out;
	}
	nla->nla_len = nla_attr_size(skb->len);
 
	err = skb_zerocopy(user_skb, skb, skb->len, hlen);    //upcall信息對象添加報文
	if (err)
		goto out;
 
	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, user_skb);
 
	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
 
	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);    //發送netlink報文
	user_skb = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(user_skb);
	kfree_skb(nskb);
	return err;
}

3、ovs_nla_put_key函數

/*
 * Dump a flow key into 'skb' as a nested Netlink attribute of type 'attr'.
 *
 * The nest is opened, __ovs_nla_put_key() fills it, and it is closed only
 * when the fill succeeded.
 *
 * Returns 0 on success, -EMSGSIZE when the nest cannot be started, or the
 * error returned by __ovs_nla_put_key().
 */
int ovs_nla_put_key(const struct sw_flow_key *swkey,
		    const struct sw_flow_key *output, int attr, bool is_mask,
		    struct sk_buff *skb)
{
	struct nlattr *nest = nla_nest_start(skb, attr);
	int ret;

	if (!nest)
		return -EMSGSIZE;

	ret = __ovs_nla_put_key(swkey, output, is_mask, skb);
	if (!ret)
		nla_nest_end(skb, nest);

	return ret;
}

4、__ovs_nla_put_key函數

static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
			     const struct sw_flow_key *output, bool is_mask,
			     struct sk_buff *skb)
{
	struct ovs_key_ethernet *eth_key;
	struct nlattr *nla, *encap;
 
	if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
		goto nla_put_failure;
 
	if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash))     //使用到該變量
		goto nla_put_failure;
 
	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
		goto nla_put_failure;
 
	if ((swkey->tun_key.u.ipv4.dst || is_mask)) {
		const void *opts = NULL;
 
		if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
			opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
 
		if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
				       swkey->tun_opts_len))
			goto nla_put_failure;
	}
 
	if (swkey->phy.in_port == DP_MAX_PORTS) {
		if (is_mask && (output->phy.in_port == 0xffff))
			if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
				goto nla_put_failure;

三、OVS_ACTION_ATTR_PUSH_VLAN

本節分析OVS_ACTION_ATTR_PUSH_VLAN action,該action的處理函數爲push_vlan。

1、push_vlan函數

static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
		     const struct ovs_action_push_vlan *vlan)
{
	if (skb_vlan_tag_present(skb))		//如果報文已經包含vlan
		invalidate_flow_key(key);	//設置key的以太報文類型爲0
	else
		key->eth.tci = vlan->vlan_tci;	//設置key的報文tci值
	return skb_vlan_push(skb, vlan->vlan_tpid,	//添加vlan信息
			     ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}

2、skb_vlan_push函數

#define skb_vlan_push rpl_skb_vlan_push
/*
 * Push a VLAN tag onto 'skb'.
 *
 * If the skb already has a tag stored out of band (skb_vlan_tag_present()),
 * that tag is first written into the packet data.  The new tag
 * ('vlan_proto', 'vlan_tci') is then stored out of band with
 * __vlan_hwaccel_put_tag(), which does not modify packet data.
 *
 * Returns 0 on success or the error from __vlan_insert_tag().  On error
 * skb->data is restored to where it pointed on entry.
 */
int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
	if (skb_vlan_tag_present(skb)) {
		/* Distance between skb->data and the MAC header. */
		unsigned int offset = skb->data - skb_mac_header(skb);
		int err;

		/* __vlan_insert_tag expect skb->data pointing to mac header.
		 * So change skb->data before calling it and change back to
		 * original position later
		 */
		__skb_push(skb, offset);
		err = __vlan_insert_tag(skb, skb->vlan_proto,
					skb_vlan_tag_get(skb));
		if (err) {
			/* Undo the push above so the caller does not get the
			 * skb back with skb->data moved to the MAC header.
			 */
			__skb_pull(skb, offset);
			return err;
		}
		/* The L2 header grew by one VLAN header. */
		skb->mac_len += VLAN_HLEN;
		__skb_pull(skb, offset);

		/* Fold the inserted bytes into a CHECKSUM_COMPLETE sum. */
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->csum = csum_add(skb->csum, csum_partial(skb->data
					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
	}
	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
	return 0;
}

3、__vlan_insert_tag函數

/* Compat wrapper: the 'proto' argument is dropped and the inserted tag always
 * uses ETH_P_8021Q (see rpl_vlan_insert_tag() below).
 */
#define __vlan_insert_tag(skb, proto, tci) rpl_vlan_insert_tag(skb, tci)
/*
 * Insert a VLAN header carrying 'vlan_tci' into the packet data of 'skb'.
 * skb->data must point at the MAC header on entry.
 *
 * Returns 0 on success or -ENOMEM if the headroom cannot be made available.
 * The skb is not freed on failure.
 */
static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci)
{
	struct vlan_ethhdr *veth;
 
	if (skb_cow_head(skb, VLAN_HLEN) < 0)	// make sure there is writable headroom for one VLAN header
		return -ENOMEM;
 
	veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN);      // move data back by VLAN_HLEN to make room for the tag in the L2 header
 
	/* Move the mac addresses to the beginning of the new header. */
	memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);    // copy destination and source MAC (2 * ETH_ALEN bytes) to the new start
	skb->mac_header -= VLAN_HLEN;	// the MAC header now starts VLAN_HLEN bytes earlier
 
	/* first, the ethernet type */
	veth->h_vlan_proto = htons(ETH_P_8021Q);	// veth points at the start of the frame; h_vlan_proto is a member of struct vlan_ethhdr, presumably laid out after the two MAC addresses -- confirm against the struct definition
 
	/* now, the TCI */
	veth->h_vlan_TCI = htons(vlan_tci);		// store the TCI of the new tag
 
	return 0;
}

push vlan動作的處理是:如果報文已經帶有vlan(保存在skb->vlan_tci中),那麼先修改報文數據,把這個已有的vlan頭寫入報文;然後再把新的vlan設置到skb->vlan_tci,該vlan頭由硬件在發送時添加到報文中。

四、OVS_ACTION_ATTR_POP_VLAN

本節分析OVS_ACTION_ATTR_POP_VLAN action的處理函數pop_vlan。

1、pop_vlan函數

/*
 * Handle OVS_ACTION_ATTR_POP_VLAN.
 *
 * Pops one VLAN tag with skb_vlan_pop().  If the skb still carries a tag
 * afterwards, the flow key is invalidated; otherwise the key's TCI is
 * cleared.
 *
 * Returns the result of skb_vlan_pop().
 */
static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int ret = skb_vlan_pop(skb);

	if (!skb_vlan_tag_present(skb))
		key->eth.tci = 0;
	else
		invalidate_flow_key(key);

	return ret;
}

2、skb_vlan_pop函數

#define skb_vlan_pop rpl_skb_vlan_pop
/*
 * Pop the outermost VLAN tag of 'skb'.
 *
 * An out-of-band tag (skb_vlan_tag_present()) is simply cleared; otherwise
 * the tag is removed from the packet data.  If the packet is then still
 * VLAN-tagged (protocol 802.1Q / 802.1AD), that next tag is pulled out of
 * the packet data and stored out of band.
 *
 * Returns 0 on success or the error from __skb_vlan_pop().
 */
int rpl_skb_vlan_pop(struct sk_buff *skb)
{
	u16 vlan_tci;
	__be16 vlan_proto;
	int err;
 
	if (likely(skb_vlan_tag_present(skb))) {	// tag is held out of band: just clear skb->vlan_tci
		skb->vlan_tci = 0;	
	} else {
		if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
			      skb->protocol != htons(ETH_P_8021AD)) ||
			     skb->len < VLAN_ETH_HLEN))
			return 0;
 
		err = __skb_vlan_pop(skb, &vlan_tci);    // pop the tag from the packet data (modifies the packet)
		if (err)
			return err;
	}
	/* move next vlan tag to hw accel tag */
	if (likely((skb->protocol != htons(ETH_P_8021Q) &&             // only continue when another VLAN header follows (QinQ case)
		    skb->protocol != htons(ETH_P_8021AD)) ||
		   skb->len < VLAN_ETH_HLEN))
		return 0;
 
	vlan_proto = htons(ETH_P_8021Q);
	err = __skb_vlan_pop(skb, &vlan_tci);		// pull the next tag out of the packet data
	if (unlikely(err))
		return err;
 
	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);   // keep that tag out of band in the skb
	return 0;
}

3、__skb_vlan_pop函數

/*
 * Remove one VLAN header from the packet data of 'skb' and return its TCI
 * (host byte order) through 'vlan_tci'.
 *
 * skb->data is temporarily moved to the MAC header and moved forward again
 * by the same offset before returning, on both the success and error paths.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
	struct vlan_hdr *vhdr;
	unsigned int offset = skb->data - skb_mac_header(skb);
	int err;
 
	__skb_push(skb, offset);	// move data back to the MAC header
	err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
	if (unlikely(err))
		goto pull;
 
	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
 
	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
 
	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);	// shift both MAC addresses forward by VLAN_HLEN bytes
	__skb_pull(skb, VLAN_HLEN);					// drop the first VLAN_HLEN bytes of the packet data
 
	vlan_set_encap_proto(skb, vhdr);        // NOTE(review): presumably sets skb->protocol from the encapsulated header -- confirm
	skb->mac_header += VLAN_HLEN;		// the MAC header now starts VLAN_HLEN bytes later
 
	if (skb_network_offset(skb) < ETH_HLEN)		// network header offset smaller than a plain Ethernet header
		skb_set_network_header(skb, ETH_HLEN);	// place the network header right after the Ethernet header
 
	skb_reset_mac_len(skb);		// NOTE(review): presumably recomputes mac_len as network_header - mac_header -- confirm
pull:
	__skb_pull(skb, offset);		// move data forward again by the offset it had from the MAC header on entry
 
	return err;
}

pop vlan,如果報文vlan已經解析,即放在skb的vlan_tci變量,那麼直接把該變量賦值爲0,key的vlan_tci設置爲0即可;否則的話就需要修改skb的報文數據,軟件最多會pop兩個vlan頭,硬件還可以剝一個頭。

五、OVS_ACTION_ATTR_SET

本節分析OVS_ACTION_ATTR_SET action的處理函數execute_set_action函數。

1、execute_set_action函數

/*
 * Handle OVS_ACTION_ATTR_SET (unmasked set).
 *
 * Only OVS_KEY_ATTR_TUNNEL_INFO is supported: the skb's current dst is
 * dropped and replaced by the tunnel dst carried in the attribute.
 * NOTE(review): per the original author's notes this dst is consumed later
 * on the tunnel (e.g. vxlan) transmit path, and ovs_dst_hold() may be an
 * empty compat function -- confirm.
 *
 * Returns 0 on success, -EINVAL for any other attribute type.
 */
static int execute_set_action(struct sk_buff *skb,
			      struct sw_flow_key *flow_key,
			      const struct nlattr *a)
{
	struct ovs_tunnel_info *tun;

	/* Only tunnel set execution is supported without a mask. */
	if (nla_type(a) != OVS_KEY_ATTR_TUNNEL_INFO)
		return -EINVAL;

	tun = nla_data(a);

	ovs_skb_dst_drop(skb);
	ovs_dst_hold((struct dst_entry *)tun->tun_dst);
	ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);

	return 0;
}

該action的處理函數非常簡單,僅設置了一個參數。 我們來看看是如何被使用到的,以vxlan隧道爲例,我們從vxlan端口的send函數(vxlan_xmit)入手來看。

2、vxlan_xmit函數

#define vxlan_xmit rpl_vxlan_xmit
/*
 * Transmit 'skb' on its vxlan device (skb->dev).
 *
 * The packet is sent only when the device collects metadata
 * (VXLAN_F_COLLECT_METADATA), is not in proxy mode, and the skb carries
 * tunnel info marked IP_TUNNEL_INFO_TX.  In every other case the packet is
 * dropped with a warning.
 *
 * Always returns NETDEV_TX_OK.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ip_tunnel_info *info;

	/* Tunnel info attached to the skb (set up by the tunnel set action). */
	info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if ((vxlan->flags & VXLAN_F_PROXY))
		goto out;

	if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
	    info && info->mode & IP_TUNNEL_INFO_TX) {
		vxlan_xmit_one(skb, dev, NULL, false);
		return NETDEV_TX_OK;
	}
out:
	/* Terminate the message with a newline so it is logged as one record. */
	pr_warn("vxlan: unsupported flag set %x\n", vxlan->flags);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

vxlan報文發送流程,不在這裏分析。 通過分析,該action的作用是封裝報文,通過隧道發送報文。 在dp的層面,只有一個tunnel端口(每種tunnel隧道一個),而其他類型的端口可以是多個的,從這裏也可以看到tunnel端口只是配置信息不同而已,所以只需要一個端口,配置信息在action中提供。

六、OVS_ACTION_ATTR_RECIRC

本節分析OVS_ACTION_ATTR_RECIRC action的處理函數execute_recirc。

1、execute_recirc函數

/*
 * Handle OVS_ACTION_ATTR_RECIRC: queue the packet as a deferred action with
 * no action list, tagged with the recirculation id carried in 'a'.
 *
 * If the recirc action is not the last one in the list, a clone of the skb
 * is queued so the remaining actions can still use the original.
 *
 * Returns 0, or the error from ovs_flow_key_update().  Running out of memory
 * or of deferred-action slots drops the recirculation but still returns 0.
 */
static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
			  struct sw_flow_key *key,
			  const struct nlattr *a, int rem)
{
	struct deferred_action *entry;

	/* An invalidated key must be rebuilt before it is copied below. */
	if (!is_flow_key_valid(key)) {
		int err = ovs_flow_key_update(skb, key);

		if (err)
			return err;
	}
	BUG_ON(!is_flow_key_valid(key));

	if (!nla_is_last(a, rem)) {
		/* Recirc action is the not the last action
		 * of the action list, need to clone the skb.
		 */
		skb = skb_clone(skb, GFP_ATOMIC);

		/* Skip the recirc action when out of memory, but
		 * continue on with the rest of the action list.
		 */
		if (!skb)
			return 0;
	}

	entry = add_deferred_actions(skb, key, NULL);
	if (!entry) {
		/* Queue is full: drop this packet copy and warn. */
		kfree_skb(skb);

		if (net_ratelimit())
			pr_warn("%s: deferred action limit reached, drop recirc action\n",
				ovs_dp_name(dp));
		return 0;
	}

	entry->pkt_key.recirc_id = nla_get_u32(a);
	return 0;
}

2、add_deferred_actions函數

/* Return queue entry if fifo is not full */
static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
						    const struct sw_flow_key *key,
						    const struct nlattr *attr)
{
	struct action_fifo *fifo;
	struct deferred_action *da;
 
	fifo = this_cpu_ptr(action_fifos);
	da = action_fifo_put(fifo);		//添加一個deferred_action
	if (da) {
		da->skb = skb;
		da->actions = attr;             //recirc action,actions爲空
		da->pkt_key = *key;
	}
 
	return da;
}

3、action_fifo_put函數

/*
 * Reserve the next free slot of 'fifo' and advance its head index.
 *
 * Returns the slot, or NULL once head has reached
 * DEFERRED_ACTION_FIFO_SIZE - 1.
 */
static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
{
	struct deferred_action *slot = NULL;

	if (fifo->head < DEFERRED_ACTION_FIFO_SIZE - 1) {
		slot = &fifo->fifo[fifo->head];
		fifo->head++;
	}

	return slot;
}

從上面可知,OVS_ACTION_ATTR_RECIRC action就是在action_fifos對象(通過this_cpu_ptr取得,每個CPU一份)中添加一個deferred_action。 這些actions在什麼時候被使用呢? 答案是ovs_execute_actions函數。

4、ovs_execute_actions函數

/*
 * Execute the action list 'acts' against 'skb'.
 *
 * A per-CPU nesting counter (exec_actions_level) bounds recursion: at
 * EXEC_ACTIONS_LEVEL_LIMIT the packet is freed and -ELOOP is returned.
 * Deferred actions are processed only by the outermost invocation, after
 * its own action list has run.
 *
 * Returns the result of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	int level = this_cpu_read(exec_actions_level);
	int err;
 
	if (unlikely(level >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));
 
		kfree_skb(skb);
		return -ELOOP;
	}
 
	this_cpu_inc(exec_actions_level);
	err = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);
 
	if (!level)
		process_deferred_actions(dp);    // run deferred actions only when level was 0 on entry (outermost call), so they execute last
 
	this_cpu_dec(exec_actions_level);
 
	/* This return status currently does not reflect the errors
	 * encounted during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return err;
}

5、process_deferred_actions函數

/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries with an action list have that list executed against their skb.
 * Entries without one (recirc) are sent back through
 * ovs_dp_process_packet() with their saved key, which carries the recirc
 * id.  The FIFO is re-initialised once it is empty.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *cpu_fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(cpu_fifo))
		return;

	/* Finishing executing all deferred actions. */
	for (;;) {
		struct deferred_action *entry = action_fifo_get(cpu_fifo);
		struct sk_buff *pkt = entry->skb;
		struct sw_flow_key *pkt_key = &entry->pkt_key;
		const struct nlattr *acts = entry->actions;

		if (!acts)
			ovs_dp_process_packet(pkt, pkt_key);
		else
			do_execute_actions(dp, pkt, pkt_key, acts,
					   nla_len(acts));

		if (action_fifo_is_empty(cpu_fifo))
			break;
	}

	/* Reset FIFO for the next packet.  */
	action_fifo_init(cpu_fifo);
}

OVS_ACTION_ATTR_RECIRC action提供了重複處理的功能,但是這樣的功能價值是什麼? 現在還沒想明白。

七、OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED

本節分析OVS_ACTION_ATTR_SET_MASKED 和 OVS_ACTION_ATTR_SET_TO_MASKED action,處理函數爲execute_masked_set_action函數。

1、execute_masked_set_action函數

/*
 * Handle OVS_ACTION_ATTR_SET_MASKED / OVS_ACTION_ATTR_SET_TO_MASKED.
 *
 * Dispatches on the key attribute type in 'a' and applies the masked value
 * to the packet and/or skb metadata, keeping 'flow_key' in sync.  Tunnel
 * info and conntrack attributes are rejected with -EINVAL.
 *
 * Returns 0 on success or a negative errno.  Attribute types not listed in
 * the switch fall through with err == 0.
 */
static int execute_masked_set_action(struct sk_buff *skb,
				     struct sw_flow_key *flow_key,
				     const struct nlattr *a)
{
	int err = 0;
 
	switch (nla_type(a)) {
	case OVS_KEY_ATTR_PRIORITY:
		OVS_SET_MASKED(skb->priority, nla_get_u32(a),		// set skb priority (original author's note: used for tc)
			       *get_mask(a, u32 *));
		flow_key->phy.priority = skb->priority;
		break;
 
	case OVS_KEY_ATTR_SKB_MARK:
		OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));   // set skb mark (original author's note: used by iptables)
		flow_key->phy.skb_mark = skb->mark;
		break;
 
	case OVS_KEY_ATTR_TUNNEL_INFO:
		/* Masked data not supported for tunnel. */
		err = -EINVAL;
		break;
 
	case OVS_KEY_ATTR_ETHERNET:
		err = set_eth_addr(skb, flow_key, nla_data(a),		// set source and destination MAC
				   get_mask(a, struct ovs_key_ethernet *));
		break;
 
	case OVS_KEY_ATTR_IPV4:
		err = set_ipv4(skb, flow_key, nla_data(a),		// set IPv4 fields: source IP, destination IP, tos, ttl
			       get_mask(a, struct ovs_key_ipv4 *));
		break;
 
	case OVS_KEY_ATTR_IPV6:
		err = set_ipv6(skb, flow_key, nla_data(a),		// set IPv6 fields
			       get_mask(a, struct ovs_key_ipv6 *));
		break;
 
	case OVS_KEY_ATTR_TCP:
		err = set_tcp(skb, flow_key, nla_data(a),		// set TCP fields (presumably source and destination port)
			      get_mask(a, struct ovs_key_tcp *));
		break;
 
	case OVS_KEY_ATTR_UDP:
		err = set_udp(skb, flow_key, nla_data(a),		// set UDP fields: source and destination port
			      get_mask(a, struct ovs_key_udp *));
		break;
 
	case OVS_KEY_ATTR_SCTP:
		err = set_sctp(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_sctp *));
		break;
 
	case OVS_KEY_ATTR_MPLS:
		err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
								    __be32 *));
		break;
 
	case OVS_KEY_ATTR_CT_STATE:
	case OVS_KEY_ATTR_CT_ZONE:
	case OVS_KEY_ATTR_CT_MARK:
	case OVS_KEY_ATTR_CT_LABELS:
		err = -EINVAL;
		break;
	}
 
	return err;
}

2、set_eth_addr函數

/*
 * Apply a masked update to the Ethernet source and destination addresses
 * of 'skb' and mirror the resulting addresses into 'flow_key'.
 *
 * The address bytes are taken out of the receive checksum before the
 * rewrite and added back afterwards.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
			const struct ovs_key_ethernet *key,
			const struct ovs_key_ethernet *mask)
{
	int ret = skb_ensure_writable(skb, ETH_HLEN);

	if (unlikely(ret))
		return ret;

	/* Remove the old address bytes from the checksum. */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);

	ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
			       mask->eth_src);
	ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
			       mask->eth_dst);

	/* Add the new address bytes back into the checksum. */
	ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);

	/* Keep the flow key in sync with what is now in the packet. */
	ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
	ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);

	return 0;
}

3、set_ipv4函數

/*
 * Apply a masked update to the IPv4 header of 'skb' (source address,
 * destination address, TOS, TTL) and mirror each change into 'flow_key'.
 *
 * A field is only considered when its mask is non-zero, and addresses are
 * only rewritten when the masked result differs from the current value.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const struct ovs_key_ipv4 *key,
		    const struct ovs_key_ipv4 *mask)
{
	struct iphdr *iph;
	int ret;

	ret = skb_ensure_writable(skb, skb_network_offset(skb) +
				  sizeof(struct iphdr));
	if (unlikely(ret))
		return ret;

	iph = ip_hdr(skb);

	/* Setting an IP addresses is typically only a side effect of
	 * matching on them in the current userspace implementation, so it
	 * makes sense to check if the value actually changed.
	 */
	if (mask->ipv4_src) {
		__be32 src = OVS_MASKED(iph->saddr, key->ipv4_src,
					mask->ipv4_src);

		if (unlikely(src != iph->saddr)) {
			set_ip_addr(skb, iph, &iph->saddr, src);
			flow_key->ipv4.addr.src = src;
		}
	}

	if (mask->ipv4_dst) {
		__be32 dst = OVS_MASKED(iph->daddr, key->ipv4_dst,
					mask->ipv4_dst);

		if (unlikely(dst != iph->daddr)) {
			set_ip_addr(skb, iph, &iph->daddr, dst);
			flow_key->ipv4.addr.dst = dst;
		}
	}

	if (mask->ipv4_tos) {
		ipv4_change_dsfield(iph, ~mask->ipv4_tos, key->ipv4_tos);
		flow_key->ip.tos = iph->tos;
	}

	if (mask->ipv4_ttl) {
		set_ip_ttl(skb, iph, key->ipv4_ttl, mask->ipv4_ttl);
		flow_key->ip.ttl = iph->ttl;
	}

	return 0;
}

4、set_udp函數

/*
 * Apply a masked update to the UDP source and destination ports of 'skb'
 * and mirror the result into 'flow_key'.
 *
 * When the packet has a UDP checksum that is not CHECKSUM_PARTIAL, ports
 * are changed through set_tp_port() so the checksum is updated, and a
 * resulting checksum of 0 is replaced by CSUM_MANGLED_0.  Otherwise the
 * ports are written directly.  The skb hash is cleared in both cases.
 *
 * Returns 0 on success or the error from skb_ensure_writable().
 */
static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
		   const struct ovs_key_udp *key,
		   const struct ovs_key_udp *mask)
{
	struct udphdr *udph;
	__be16 new_src, new_dst;
	int ret;

	ret = skb_ensure_writable(skb, skb_transport_offset(skb) +
				  sizeof(struct udphdr));
	if (unlikely(ret))
		return ret;

	udph = udp_hdr(skb);
	/* Either of the masks is non-zero, so do not bother checking them. */
	new_src = OVS_MASKED(udph->source, key->udp_src, mask->udp_src);
	new_dst = OVS_MASKED(udph->dest, key->udp_dst, mask->udp_dst);

	if (!udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		/* No checksum to maintain here: write the ports directly. */
		udph->source = new_src;
		udph->dest = new_dst;
		flow_key->tp.src = new_src;
		flow_key->tp.dst = new_dst;
	} else {
		if (likely(new_src != udph->source)) {
			set_tp_port(skb, &udph->source, new_src, &udph->check);
			flow_key->tp.src = new_src;
		}
		if (likely(new_dst != udph->dest)) {
			set_tp_port(skb, &udph->dest, new_dst, &udph->check);
			flow_key->tp.dst = new_dst;
		}

		/* Never leave a checksum of 0 on a checksummed packet. */
		if (unlikely(!udph->check))
			udph->check = CSUM_MANGLED_0;
	}

	skb_clear_hash(skb);

	return 0;
}

本節分析的action的作用是修改skb報文,通過key和mask兩個值可以修改任意sw_flow_key結構體定義的字段。基於該框架,可以任意修改報文內容。例如arp代答等等。

八、OVS_ACTION_ATTR_SAMPLE

本節分析OVS_ACTION_ATTR_SAMPLE action的處理函數sample。

1、sample函數

/*
 * Handle OVS_ACTION_ATTR_SAMPLE.
 *
 * With the probability given by OVS_SAMPLE_ATTR_PROBABILITY, the nested
 * action list OVS_SAMPLE_ATTR_ACTIONS is applied to the packet:
 *   - a single userspace action is executed immediately through
 *     output_userspace();
 *   - any other list is queued on a clone of the skb as a deferred action.
 *
 * Returns 0 when the packet is not sampled, when there is nothing to
 * execute, or when the sample is skipped for lack of memory or queue space;
 * otherwise the result of output_userspace().
 */
static int sample(struct datapath *dp, struct sk_buff *skb,
		  struct sw_flow_key *key, const struct nlattr *attr,
		  const struct nlattr *actions, int actions_len)
{
	const struct nlattr *acts_list = NULL;
	const struct nlattr *a;
	int rem;

	for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
		 a = nla_next(a, &rem)) {
		u32 probability;

		switch (nla_type(a)) {
		case OVS_SAMPLE_ATTR_PROBABILITY:
			/* Sampling probability: skip the packet when the
			 * random draw exceeds it (or when it is zero).
			 */
			probability = nla_get_u32(a);
			if (!probability || prandom_u32() > probability)
				return 0;
			break;

		case OVS_SAMPLE_ATTR_ACTIONS:
			/* Actions to apply to a sampled packet. */
			acts_list = a;
			break;
		}
	}

	/* No action list attribute was supplied: nothing to execute.  This
	 * also avoids dereferencing a NULL acts_list below.
	 */
	if (unlikely(!acts_list))
		return 0;

	rem = nla_len(acts_list);
	a = nla_data(acts_list);

	/* Actions list is empty, do nothing */
	if (unlikely(!rem))
		return 0;

	/* The only known usage of sample action is having a single user-space
	 * action. Treat this usage as a special case.
	 * The output_userspace() should clone the skb to be sent to the
	 * user space. This skb will be consumed by its caller.
	 */
	if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
		   nla_is_last(a, rem)))
		return output_userspace(dp, skb, key, a, actions, actions_len);

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb)
		/* Skip the sample action when out of memory. */
		return 0;

	/* Queue the clone; it is processed after the current action list. */
	if (!add_deferred_actions(skb, key, a)) {
		if (net_ratelimit())
			pr_warn("%s: deferred actions limit reached, dropping sample action\n",
				ovs_dp_name(dp));

		kfree_skb(skb);
	}
	return 0;
}

output_userspace在前幾篇已經分析過,會把報文上傳到用戶態,用戶態如何處理後續分析。add_deferred_actions會把報文放在fifo數組中,在報文處理的最後時刻處理,看ovs_execute_actions函數。

2、ovs_execute_actions函數

/* Execute a list of actions against 'skb'.
 *
 * A per-CPU nesting counter (exec_actions_level) bounds recursion: at
 * EXEC_ACTIONS_LEVEL_LIMIT the packet is freed and -ELOOP is returned.
 * Deferred actions are processed only by the outermost invocation, after
 * its own action list has run.
 *
 * Returns the result of do_execute_actions(), or -ELOOP.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	int level = this_cpu_read(exec_actions_level);
	int err;
 
	if (unlikely(level >= EXEC_ACTIONS_LEVEL_LIMIT)) {
		if (net_ratelimit())
			pr_warn("%s: packet loop detected, dropping.\n",
				ovs_dp_name(dp));
 
		kfree_skb(skb);
		return -ELOOP;
	}
 
	this_cpu_inc(exec_actions_level);
	err = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);
 
	if (!level)    // when this function is re-entered from do_execute_actions, level is non-zero and this branch is skipped
		process_deferred_actions(dp);
 
	this_cpu_dec(exec_actions_level);
 
	/* This return status currently does not reflect the errors
	 * encounted during deferred actions execution. Probably needs to
	 * be fixed in the future.
	 */
	return err;
}

我們再看一下process_deferred_actions函數是怎麼處理的。

3、process_deferred_actions函數

/*
 * Drain this CPU's deferred-action FIFO.
 *
 * Entries with an action list (queued by the sample action) have that list
 * executed against their skb.  Entries without one (queued by recirc) are
 * sent back through ovs_dp_process_packet() with their saved key.  The FIFO
 * is re-initialised once it is empty.
 */
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *cpu_fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(cpu_fifo))
		return;

	/* Finishing executing all deferred actions. */
	for (;;) {
		struct deferred_action *entry = action_fifo_get(cpu_fifo);
		struct sk_buff *pkt = entry->skb;
		struct sw_flow_key *pkt_key = &entry->pkt_key;
		const struct nlattr *acts = entry->actions;

		if (!acts)
			ovs_dp_process_packet(pkt, pkt_key);
		else
			do_execute_actions(dp, pkt, pkt_key, acts,
					   nla_len(acts));

		if (action_fifo_is_empty(cpu_fifo))
			break;
	}

	/* Reset FIFO for the next packet.  */
	action_fifo_init(cpu_fifo);
}

sample總體提供兩個功能,1)概率性地發送報文到用戶態;2)兩次處理報文的能力(自定義處理動作),爲什麼提供這個能力? 作用是什麼? 希望通過進一步分析,能夠回答這個問題。

原文鏈接:https://blog.csdn.net/one_clouder/article/details/52418570

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章