DPDK vhost-user Study (Part 26)

This article focuses on how to use the vhost PMD and the APIs provided by the vhost library.

Previous chapters described the life cycle of a virtio-net device: device creation, configuration, service start, and device destruction.

Let us first recap the whole life cycle:

  • Device creation: vhost-user creates the device over a socket connection. When a virtio-net device is created, we need to (see the callback sketch after this list):
    • allocate a new virtio-net device structure and add it to the device list
    • assign a processing core to the device and add the device to the data-plane list
    • allocate an RX/TX queue pair on the vhost side to serve the virtio-net device
  • Configuration
    The VHOST_SET_VRING_* messages tell vhost the size, base index and location of each virtqueue; vhost then maps the virtqueues into its own virtual address space.
  • Service start
    vhost starts serving the virtqueues on the VHOST_SET_VRING_KICK message. From then on it can poll the RX queue and place incoming data into the virtio-net device's receive queue, and poll the TX virtqueue for pending packets, copying any it finds into the transmit queue.
  • Device destruction
    The VHOST_GET_VRING_BASE message tells vhost to stop serving the receive and transmit virtqueues. The processing core assigned to the virtio-net device and the RX/TX queues on the physical NIC are released as well.
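
In the vhost lib, the creation and destruction stages surface as callbacks that the application registers. Below is a minimal sketch of such a callback structure, assuming the DPDK 17.05-era API (struct vhost_device_ops; older releases call it virtio_net_device_ops), with the bodies reduced to comments:

static int
new_device(int vid)
{
    /* the guest completed the VHOST_SET_VRING_* handshake: allocate a
     * vhost_dev, pick a worker lcore and link the device into that
     * core's polling list */
    return 0;
}

static void
destroy_device(int vid)
{
    /* VHOST_GET_VRING_BASE arrived (or the connection dropped):
     * unlink the device from its worker core and free it */
}

static const struct vhost_device_ops virtio_net_device_ops = {
    .new_device     = new_device,
    .destroy_device = destroy_device,
};

The example application registers exactly such a structure through rte_vhost_driver_callback_register(), as main() below shows.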

The examples/vhost directory contains a sample application that uses virtio-net; let's walk through it:

int main(int argc, char *argv[])
{
    unsigned lcore_id, core_id = 0;
    unsigned nb_ports, valid_num_ports;
    int ret, i;
    uint8_t portid;
    static pthread_t tid;
    char thread_name[RTE_MAX_THREAD_NAME_LEN];
    uint64_t flags = 0;

    signal(SIGINT, sigint_handler);

    /* init EAL */
    ret = rte_eal_init(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    argc -= ret;
    argv += ret;

    /* parse app arguments */
    ret = us_vhost_parse_args(argc, argv);
    if (ret < 0)
        rte_exit(EXIT_FAILURE, "Invalid argument\n");

    for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
        TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

        if (rte_lcore_is_enabled(lcore_id))
            lcore_ids[core_id++] = lcore_id;
    }

    if (rte_lcore_count() > RTE_MAX_LCORE)
        rte_exit(EXIT_FAILURE,"Not enough cores\n");

    /* Get the number of physical ports. */
    nb_ports = rte_eth_dev_count();

    /*
     * Update the global var NUM_PORTS and global array PORTS
     * and get value of var VALID_NUM_PORTS according to system ports number
     */
    valid_num_ports = check_ports_num(nb_ports);

    if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
            "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
        return -1;
    }

    /*
     * FIXME: here we are trying to allocate mbufs big enough for
     * @MAX_QUEUES, but the truth is we're never going to use that
     * many queues here. We probably should only do allocation for
     * those queues we are going to use.
     */
    create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
             MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /* Enable VT loop back to let L2 switch to do it. */
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
        RTE_LOG(DEBUG, VHOST_CONFIG,
            "Enable loop back for L2 switch in vmdq.\n");
    }

    /* initialize all ports */
    for (portid = 0; portid < nb_ports; portid++) {
        /* skip ports that are not enabled */
        if ((enabled_port_mask & (1 << portid)) == 0) {
            RTE_LOG(INFO, VHOST_PORT,
                "Skipping disabled port %d\n", portid);
            continue;
        }
        if (port_init(portid) != 0)
            rte_exit(EXIT_FAILURE,
                "Cannot initialize network ports\n");
    }

    /* Enable stats if the user option is set. */
    if (enable_stats) {
        ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
        if (ret != 0)
            rte_exit(EXIT_FAILURE,
                "Cannot create print-stats thread\n");

        /* Set thread_name for aid in debugging.  */
        snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
        ret = rte_thread_setname(tid, thread_name);
        if (ret != 0)
            RTE_LOG(DEBUG, VHOST_CONFIG,
                "Cannot set print-stats name\n");
    }

    /* Launch all data cores. */
    RTE_LCORE_FOREACH_SLAVE(lcore_id)
        rte_eal_remote_launch(switch_worker, NULL, lcore_id);

    if (client_mode)
        flags |= RTE_VHOST_USER_CLIENT;

    if (dequeue_zero_copy)
        flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

    /* Register vhost user driver to handle vhost messages. */
    for (i = 0; i < nb_sockets; i++) {
        char *file = socket_files + i * PATH_MAX;
        ret = rte_vhost_driver_register(file, flags);
        if (ret != 0) {
            unregister_drivers(i);
            rte_exit(EXIT_FAILURE,
                "vhost driver register failure.\n");
        }

        if (builtin_net_driver)
            rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

        if (mergeable == 0) {
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_MRG_RXBUF);
        }

        if (enable_tx_csum == 0) {
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_CSUM);
        }

        if (enable_tso == 0) {
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_HOST_TSO4);
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_HOST_TSO6);
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_GUEST_TSO4);
            rte_vhost_driver_disable_features(file,
                1ULL << VIRTIO_NET_F_GUEST_TSO6);
        }

        if (promiscuous) {
            rte_vhost_driver_enable_features(file,
                1ULL << VIRTIO_NET_F_CTRL_RX);
        }

        ret = rte_vhost_driver_callback_register(file,
            &virtio_net_device_ops);
        if (ret != 0) {
            rte_exit(EXIT_FAILURE,
                "failed to register vhost driver callbacks.\n");
        }

        if (rte_vhost_driver_start(file) < 0) {
            rte_exit(EXIT_FAILURE,
                "failed to start vhost driver.\n");
        }
    }

    RTE_LCORE_FOREACH_SLAVE(lcore_id)
        rte_eal_wait_lcore(lcore_id);
    return 0;
}

As the flow above shows, device creation is done through rte_vhost_driver_register(). The device's features are then configured via rte_vhost_driver_set_features()/rte_vhost_driver_disable_features() according to the application's options, and rte_vhost_driver_start() starts the virtio-net device. Inside that call, vhost determines whether it runs as a client or a server and, based on that role, completes feature negotiation with the virtio frontend driver and maps the virtqueue and vring address space. Once all of this is done, the service is up and ready to receive and transmit packets.
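
Condensed to its essentials, that call sequence is quite short. A minimal sketch for a single socket path, using the same variables as main() above (error handling trimmed; the mode is a configuration choice):

uint64_t flags = client_mode ? RTE_VHOST_USER_CLIENT : 0;

/* create the device: in server mode DPDK creates and listens on the
 * socket; with RTE_VHOST_USER_CLIENT it connects to one QEMU created */
rte_vhost_driver_register(file, flags);

/* optionally trim the advertised feature set before negotiation */
rte_vhost_driver_disable_features(file, 1ULL << VIRTIO_NET_F_MRG_RXBUF);

/* hook up the new_device()/destroy_device() callbacks, then start the
 * session: negotiation and vring mapping happen from here on */
rte_vhost_driver_callback_register(file, &virtio_net_device_ops);
rte_vhost_driver_start(file);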

All packet processing is done by switch_worker(), which runs on every forwarding core.

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and deliver all of them to guest virito Rx ring associated with
 *      this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and deliver all of them
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
    unsigned i;
    unsigned lcore_id = rte_lcore_id();
    struct vhost_dev *vdev;
    struct mbuf_table *tx_q;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < rte_lcore_count(); i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while(1) {
        drain_mbuf_table(tx_q);

        /*
         * Inform the configuration core that we have exited the
         * linked list and that no devices are in use if requested.
         */
        if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process vhost devices
         */
        TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
                  lcore_vdev_entry) {
            if (unlikely(vdev->remove)) {
                unlink_vmdq(vdev);
                vdev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }

            if (likely(vdev->ready == DEVICE_RX))
                drain_eth_rx(vdev);

            if (likely(!vdev->remove))
                drain_virtio_tx(vdev);
        }
    }
    return 0;
}

What this function does is clear enough: it runs a while() loop on each core. Before serving the devices it first checks whether the physical NIC's transmit queue still holds buffered packets that have not gone out, and flushes them if so. Then, for every device on this core, it calls drain_eth_rx(vdev) on the receive side to pull packets from the host's physical NIC and place them into the guest's RX ring, and drain_virtio_tx(vdev) on the transmit side to take packets out of the guest's TX ring and send them out through the host's physical NIC. Both functions are worth a closer look.

Receive path

The entry point is drain_eth_rx(struct vhost_dev *vdev), whose work breaks down into two steps:

  1. call the physical NIC's PMD to receive packets from the NIC
  2. place the received packets into virtio's avail vring
static inline void __attribute__((always_inline))
drain_eth_rx(struct vhost_dev *vdev)
{
    uint16_t rx_count, enqueue_count;
    struct rte_mbuf *pkts[MAX_PKT_BURST];

    /* step 1: receive a burst of packets from the physical NIC */
    rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
                    pkts, MAX_PKT_BURST);
    if (!rx_count)
        return;

    /* step 2: put the received packets into the virtio RX ring */
    /*
     * When "enable_retry" is set, here we wait and retry when there
     * are not enough free slots in the queue to hold @rx_count packets,
     * to diminish packet loss.
     */
    /* before enqueueing, check that the avail ring has enough free
     * slots; if not, wait and retry as configured */
    if (enable_retry &&
        unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
            VIRTIO_RXQ))) {
        uint32_t retry;

        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            if (rx_count <= rte_vhost_avail_entries(vdev->vid,
                    VIRTIO_RXQ))
                break;
        }
    }

    if (builtin_net_driver) {
        enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
                        pkts, rx_count);
    } else {
        enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
                        pkts, rx_count);
    }
    if (enable_stats) {
        rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
        rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
    }

    free_pkts(pkts, rx_count);
}

uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
    struct rte_mbuf **pkts, uint16_t count)
{
    struct virtio_net *dev = get_device(vid);

    if (!dev)
        return 0;

    if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
        return virtio_dev_merge_rx(dev, queue_id, pkts, count);
    else
        return virtio_dev_rx(dev, queue_id, pkts, count);
}

/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that are successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
          struct rte_mbuf **pkts, uint32_t count)
{
    struct vhost_virtqueue *vq;
    uint16_t avail_idx, free_entries, start_idx;
    uint16_t desc_indexes[MAX_PKT_BURST];
    struct vring_desc *descs;
    uint16_t used_idx;
    uint32_t i, sz;

    LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
    if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
        RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
            dev->vid, __func__, queue_id);
        return 0;
    }

    vq = dev->virtqueue[queue_id];
    if (unlikely(vq->enabled == 0))
        return 0;

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    start_idx = vq->last_used_idx;
    free_entries = avail_idx - start_idx;
    count = RTE_MIN(count, free_entries);
    count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
    if (count == 0)
        return 0;

    LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
        dev->vid, start_idx, start_idx + count);

    /* Retrieve all of the desc indexes first to avoid caching issues. */
    rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
    for (i = 0; i < count; i++) {
        used_idx = (start_idx + i) & (vq->size - 1);
        desc_indexes[i] = vq->avail->ring[used_idx];
        vq->used->ring[used_idx].id = desc_indexes[i];
        vq->used->ring[used_idx].len = pkts[i]->pkt_len +
                           dev->vhost_hlen;
        vhost_log_used_vring(dev, vq,
            offsetof(struct vring_used, ring[used_idx]),
            sizeof(vq->used->ring[used_idx]));
    }

    rte_prefetch0(&vq->desc[desc_indexes[0]]);
    for (i = 0; i < count; i++) {
        uint16_t desc_idx = desc_indexes[i];
        int err;

        if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
            descs = (struct vring_desc *)(uintptr_t)
                rte_vhost_gpa_to_vva(dev->mem,
                    vq->desc[desc_idx].addr);
            if (unlikely(!descs)) {
                count = i;
                break;
            }

            /* size of the indirect table, taken from the indirect
             * descriptor's len before desc_idx is reset */
            sz = vq->desc[desc_idx].len / sizeof(*descs);
            desc_idx = 0;
        } else {
            descs = vq->desc;
            sz = vq->size;
        }

        err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
        if (unlikely(err)) {
            used_idx = (start_idx + i) & (vq->size - 1);
            vq->used->ring[used_idx].len = dev->vhost_hlen;
            vhost_log_used_vring(dev, vq,
                offsetof(struct vring_used, ring[used_idx]),
                sizeof(vq->used->ring[used_idx]));
        }

        if (i + 1 < count)
            rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
    }

    rte_smp_wmb();

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx += count;
    vhost_log_used_vring(dev, vq,
        offsetof(struct vring_used, idx),
        sizeof(vq->used->idx));

    /* flush used->idx update before we read avail->flags. */
    rte_mb();

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
            && (vq->callfd >= 0))
        eventfd_write(vq->callfd, (eventfd_t)1);
    return count;
}

The concrete call path for step 2 is drain_eth_rx -> rte_vhost_enqueue_burst -> virtio_dev_rx. virtio_dev_rx() copies the packets into the descriptors, updates the related vring and virtqueue pointers, and finally writes to the eventfd to notify the guest.
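
One detail worth pausing on: free_entries = avail_idx - start_idx in virtio_dev_rx() relies on unsigned 16-bit wrap-around, so it stays correct even after the ring indices overflow 65535. A small standalone demo of the arithmetic (not DPDK code, just an illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* the indices grow monotonically and wrap at 65536; here the
     * guest has published 6 entries the host has not consumed yet */
    uint16_t avail_idx = 3;          /* wrapped: logically 65539 */
    uint16_t last_used_idx = 65533;

    uint16_t free_entries = avail_idx - last_used_idx;
    printf("free entries: %u\n", free_entries);   /* prints 6 */
    return 0;
}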

Transmit path

The entry point is drain_virtio_tx(struct vhost_dev *vdev), whose work also breaks down into two steps:

  1. take packets out of virtio's TX ring
  2. route each packet along the appropriate path: to another VM, or to the physical NIC
static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
    struct rte_mbuf *pkts[MAX_PKT_BURST];
    uint16_t count;
    uint16_t i;
    /* step 1: dequeue packets from the virtio TX ring into the pkts array */
    if (builtin_net_driver) {
        count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
                    pkts, MAX_PKT_BURST);
    } else {
        count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
                    mbuf_pool, pkts, MAX_PKT_BURST);
    }

    /* step 2: send the packets in pkts out along the appropriate path */
    /* setup VMDq for the first packet: the first packet is a RARP packet,
     * used for MAC learning and to set up the VLAN tag mapping in VMDq */
    if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
        if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
            free_pkts(pkts, count);
    }

    for (i = 0; i < count; ++i)
        virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

rte_vhost_dequeue_burst() in step 1 is simply the reverse of rte_vhost_enqueue_burst(); the code speaks for itself:

uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
    struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
    struct virtio_net *dev;
    struct rte_mbuf *rarp_mbuf = NULL;
    struct vhost_virtqueue *vq;
    uint32_t desc_indexes[MAX_PKT_BURST];
    uint32_t used_idx;
    uint32_t i = 0;
    uint16_t free_entries;
    uint16_t avail_idx;

    dev = get_device(vid);
    if (!dev)
        return 0;

    if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
        RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
            dev->vid, __func__, queue_id);
        return 0;
    }

    vq = dev->virtqueue[queue_id];
    if (unlikely(vq->enabled == 0))
        return 0;

    if (unlikely(dev->dequeue_zero_copy)) {
        struct zcopy_mbuf *zmbuf, *next;
        int nr_updated = 0;

        for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
             zmbuf != NULL; zmbuf = next) {
            next = TAILQ_NEXT(zmbuf, next);

            if (mbuf_is_consumed(zmbuf->mbuf)) {
                used_idx = vq->last_used_idx++ & (vq->size - 1);
                update_used_ring(dev, vq, used_idx,
                         zmbuf->desc_idx);
                nr_updated += 1;

                TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
                rte_pktmbuf_free(zmbuf->mbuf);
                put_zmbuf(zmbuf);
                vq->nr_zmbuf -= 1;
            }
        }

        update_used_idx(dev, vq, nr_updated);
    }

    /*
     * Construct a RARP broadcast packet and inject it into the "pkts"
     * array, so that it looks like the guest actually sent such a packet.
     *
     * Check user_send_rarp() for more information.
     *
     * broadcast_rarp shares a cacheline in the virtio_net structure
     * with some fields that are accessed during enqueue and
     * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
     * result in false sharing between enqueue and dequeue.
     *
     * Prevent unnecessary false sharing by reading broadcast_rarp first
     * and only performing cmpset if the read indicates it is likely to
     * be set.
     */

    if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
            rte_atomic16_cmpset((volatile uint16_t *)
                &dev->broadcast_rarp.cnt, 1, 0))) {

        rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
        if (rarp_mbuf == NULL) {
            RTE_LOG(ERR, VHOST_DATA,
                "Failed to allocate memory for mbuf.\n");
            return 0;
        }

        if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
            rte_pktmbuf_free(rarp_mbuf);
            rarp_mbuf = NULL;
        } else {
            count -= 1;
        }
    }

    free_entries = *((volatile uint16_t *)&vq->avail->idx) -
            vq->last_avail_idx;
    if (free_entries == 0)
        goto out;

    LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);

    /* Prefetch available and used ring */
    avail_idx = vq->last_avail_idx & (vq->size - 1);
    used_idx  = vq->last_used_idx  & (vq->size - 1);
    rte_prefetch0(&vq->avail->ring[avail_idx]);
    rte_prefetch0(&vq->used->ring[used_idx]);

    count = RTE_MIN(count, MAX_PKT_BURST);
    count = RTE_MIN(count, free_entries);
    LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
            dev->vid, count);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (i = 0; i < count; i++) {
        avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
        used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
        desc_indexes[i] = vq->avail->ring[avail_idx];

        if (likely(dev->dequeue_zero_copy == 0))
            update_used_ring(dev, vq, used_idx, desc_indexes[i]);
    }

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[desc_indexes[0]]);
    for (i = 0; i < count; i++) {
        struct vring_desc *desc;
        uint16_t sz, idx;
        int err;

        if (likely(i + 1 < count))
            rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);

        if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
            desc = (struct vring_desc *)(uintptr_t)
                rte_vhost_gpa_to_vva(dev->mem,
                    vq->desc[desc_indexes[i]].addr);
            if (unlikely(!desc))
                break;

            rte_prefetch0(desc);
            sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
            idx = 0;
        } else {
            desc = vq->desc;
            sz = vq->size;
            idx = desc_indexes[i];
        }

        pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
        if (unlikely(pkts[i] == NULL)) {
            RTE_LOG(ERR, VHOST_DATA,
                "Failed to allocate memory for mbuf.\n");
            break;
        }

        err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
        if (unlikely(err)) {
            rte_pktmbuf_free(pkts[i]);
            break;
        }

        if (unlikely(dev->dequeue_zero_copy)) {
            struct zcopy_mbuf *zmbuf;

            zmbuf = get_zmbuf(vq);
            if (!zmbuf) {
                rte_pktmbuf_free(pkts[i]);
                break;
            }
            zmbuf->mbuf = pkts[i];
            zmbuf->desc_idx = desc_indexes[i];

            /*
             * Pin lock the mbuf; we will check later to see
             * whether the mbuf is freed (when we are the last
             * user) or not. If that's the case, we then could
             * update the used ring safely.
             */
            rte_mbuf_refcnt_update(pkts[i], 1);

            vq->nr_zmbuf += 1;
            TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
        }
    }
    vq->last_avail_idx += i;

    if (likely(dev->dequeue_zero_copy == 0)) {
        vq->last_used_idx += i;
        update_used_idx(dev, vq, i);
    }

out:
    if (unlikely(rarp_mbuf != NULL)) {
        /*
         * Inject it to the head of "pkts" array, so that switch's mac
         * learning table will get updated first.
         */
        memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
        pkts[0] = rarp_mbuf;
        i += 1;
    }

    return i;
}
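
The dequeue-zero-copy path above is worth a note: it pins each dequeued mbuf by taking an extra reference, and only returns the guest descriptor to the used ring once every other owner (typically the NIC TX path) has dropped its reference. A minimal sketch of that pinning idiom, with generic names rather than the lib's internal helpers:

/* pin: the extra reference keeps the mbuf alive after
 * rte_eth_tx_burst()'s own free, because the guest descriptor
 * behind it is still in flight */
rte_mbuf_refcnt_update(m, 1);

/* ... later, poll: once we are the last owner, the NIC is done;
 * drop our pin and give the descriptor back to the guest */
if (rte_mbuf_refcnt_read(m) == 1) {
    rte_pktmbuf_free(m);
    /* update_used_ring(dev, vq, used_idx, desc_idx); */
}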

In step 2, virtio_tx_route() has to distinguish the destination: if the target is a local VM and VM2VM_SOFTWARE mode is set, the packet can be forwarded directly through the software switch; if VM2VM_SOFTWARE is not set, or the packet has to leave through the physical NIC, it is eventually sent via do_drain_mbuf_table -> rte_eth_tx_burst.
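
The heart of that decision can be sketched as follows. find_vhost_dev_by_mac() and queue_to_nic() are hypothetical stand-ins for the example's own helpers (the destination-MAC lookup and the per-core mbuf_table bookkeeping):

static void
tx_route_sketch(struct vhost_dev *src, struct mbuf_table *tx_q,
        struct rte_mbuf *m)
{
    struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
    struct vhost_dev *dst = find_vhost_dev_by_mac(&eth->d_addr);

    if (dst && dst != src && vm2vm_mode == VM2VM_SOFTWARE) {
        /* VM-to-VM in software: enqueue straight into the peer's
         * RX virtqueue; no physical NIC involved */
        rte_vhost_enqueue_burst(dst->vid, VIRTIO_RXQ, &m, 1);
        free_pkts(&m, 1);
    } else {
        /* otherwise buffer it; do_drain_mbuf_table() below flushes
         * it out through rte_eth_tx_burst() on the physical port */
        queue_to_nic(tx_q, m);
    }
}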

static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
    uint16_t count;

    count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
                 tx_q->m_table, tx_q->len);
    if (unlikely(count < tx_q->len))
        free_pkts(&tx_q->m_table[count], tx_q->len - count);

    tx_q->len = 0;
}

Summary

To wrap up, here are the concrete vhost transmit and receive interfaces again. Device life-cycle management relies mainly on the key APIs covered in the previous article.

Packet reception and transmission on a vhost device use the following vhost lib interfaces:

uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
    struct rte_mbuf **pkts, uint16_t count);
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
    struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
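
A minimal sketch of a forwarding loop built on just these two calls, assuming vid, mbuf_pool and physical port 0 are already initialized:

struct rte_mbuf *pkts[MAX_PKT_BURST];
uint16_t n, sent;

for (;;) {
    /* host -> guest: push packets from NIC port 0 into the guest RX ring */
    n = rte_eth_rx_burst(0, 0, pkts, MAX_PKT_BURST);
    if (n) {
        rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts, n);
        free_pkts(pkts, n);    /* the enqueue copies, so always free */
    }

    /* guest -> host: drain the guest TX ring and send out of port 0 */
    n = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
                    pkts, MAX_PKT_BURST);
    if (n) {
        sent = rte_eth_tx_burst(0, 0, pkts, n);
        free_pkts(&pkts[sent], n - sent);  /* drop what didn't fit */
    }
}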

Original article: http://www.voidcn.com/article/p-vfzcgoqe-bpq.html
