OVS Source Code: vswitchd Startup (Part 19)

The main() function in ovs-vswitchd.c eventually enters a while loop. The two most important calls inside this infinite loop are bridge_run() and netdev_run().

    bridge_init(remote);
    free(remote);

    exiting = false;
    while (!exiting) {     // main loop: runs until asked to exit
        memory_run();
        if (memory_should_report()) {
            struct simap usage;

            simap_init(&usage);
            bridge_get_memory_usage(&usage);
            memory_report(&usage);
            simap_destroy(&usage);
        }
        bridge_run();                    // handles controller interaction and ovs-ofctl commands
        unixctl_server_run(unixctl);     // handles ovs-appctl (unixctl) commands
        netdev_run();

        memory_wait();
        bridge_wait();
        unixctl_server_wait(unixctl);
        netdev_wait();
        if (exiting) {
            poll_immediate_wake();
        }
        poll_block();                     // block here when there is nothing to process
        if (should_service_stop()) {
            exiting = true;
        }
    }
    bridge_exit();
    unixctl_server_destroy(unixctl);
    service_stop();

Open vSwitch mainly manages two types of devices: the virtual bridges it creates, and the devices attached to those virtual bridges.

bridge_run is what initializes the virtual bridges that have already been created in the database.

Before walking through the specific code paths, it is worth reviewing the OVS architecture:
ovs-vswitchd is the core component of OVS; all of the OpenFlow-related logic is implemented in vswitchd. In general, OVS consists of three parts: the datapath, vswitchd, and ovsdb. The datapath is tied to a particular data-plane platform, such as a white-box switch or the Linux kernel. ovsdb stores the vswitch's own configuration, such as ports, topology, and rules. vswitchd itself has a layered structure: the top-level daemon mainly communicates with ovsdb to push down and update configuration; the middle layer, ofproto, communicates with OpenFlow controllers and exposes the ofproto provider interface through ofproto_class (currently only one ofproto provider is supported, namely ofproto-dpif), so that the concrete OpenFlow implementations on different platforms are unified behind ofproto_class.

The creation flow of an OVS bridge:

1. ovs-vsctl is used to create the bridge; the creation parameters are sent to ovsdb-server, which writes the data into the database.

2. ovs-vswitchd reads the bridge-creation information from ovsdb-server and creates a bridge structure at the ovs-vswitchd layer.

3. The bridge information is then applied to the ofproto layer, where the bridge is created via ofproto_create.
ofproto_create(br->name, br->type, &br->ofproto) creates a switch according to the configuration, where type is the switch type. Externally, this ofproto object talks to the OpenFlow controller; internally, it talks to the lower layers through netdev and ofproto-dpif. The ofproto structure (which also represents a bridge) has its ofproto-provider private state created by the ofproto provider's constructor (ofproto_class is defined in ofproto-provider.h and implemented by ofproto_dpif_class in ofproto-dpif.c).

Role of ofproto_class: different switching forms (software switching, hardware-accelerated switching); currently there is only one, software switching.

4. ofproto-dpif is associated with a dpif through a backer. struct dpif_class is the factory interface for datapath-interface implementations and is used to interact with the actual datapath (for example openvswitch.ko, or the userspace datapath). There are currently two dpif implementations, dpif-netlink and dpif-netdev: the former is the dpif built on the kernel datapath (Linux), the latter on the userspace datapath (DPDK):
1) lib/dpif-netlink.c: the Linux-specific dpif, which communicates with the kernel module shipped with Open vSwitch. The kernel module performs all of the switching work and sends packets that miss in the kernel up to userspace; this dpif wraps the calls into the kernel interface.
2) lib/dpif-netdev.c: a generic dpif implementation, i.e. Open vSwitch's userspace datapath. Packet switching never enters the kernel.

Role of dpif_class: different forwarding planes (userspace or kernel); see the sketch after this list.
Userspace datapath: built on DPDK, poll-mode forwarding, suited to NFV scenarios.
Kernel datapath: flow tables switched in the kernel, suited to the management plane or general IT scenarios.
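
To make the split between the two dpif implementations more concrete, the sketch below (not OVS code; every demo_* name is invented) models dpif_class as a small table of providers keyed by a datapath type string, where "system" selects the kernel path and "netdev" selects the userspace path:

#include <stdio.h>
#include <string.h>

/* Hypothetical, much-reduced stand-in for struct dpif_class. */
struct demo_dpif_class {
    const char *type;                      /* "system" or "netdev" */
    void (*port_add)(const char *name);    /* add a port to this datapath */
};

static void
kernel_port_add(const char *name)
{
    printf("%s: send OVS_VPORT_CMD_NEW to openvswitch.ko\n", name);
}

static void
userspace_port_add(const char *name)
{
    printf("%s: add userspace port and start pmd polling\n", name);
}

static const struct demo_dpif_class demo_classes[] = {
    { "system", kernel_port_add },     /* kernel datapath (dpif-netlink) */
    { "netdev", userspace_port_add },  /* userspace datapath (dpif-netdev) */
};

static const struct demo_dpif_class *
demo_lookup(const char *type)
{
    for (size_t i = 0; i < sizeof demo_classes / sizeof demo_classes[0]; i++) {
        if (!strcmp(demo_classes[i].type, type)) {
            return &demo_classes[i];
        }
    }
    return NULL;
}

int
main(void)
{
    const struct demo_dpif_class *class = demo_lookup("system");

    if (class) {
        class->port_add("vport0");
    }
    return 0;
}

In real OVS, the bridge's datapath_type column plays the role of this type string: "netdev" selects the userspace datapath, while the default ("system") selects the kernel datapath.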

1. Virtual bridge initialization: bridge_run

bridge_run calls bridge_run__; the most important thing bridge_run__ does is call ofproto_run for every bridge:

static void
bridge_run__(void)
{
    struct bridge *br;

    /* Let each bridge do the work that it needs to do. */
    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run(br->ofproto);
    }
}

int ofproto_run(struct ofproto *p) in turn calls error = p->ofproto_class->run(p);

ofproto_class is defined in ofproto-provider.h; its implementation is ofproto_dpif_class in ofproto-dpif.c:

const struct ofproto_class ofproto_dpif_class = {
    init,
    enumerate_types,
    enumerate_names,
    del,
    port_open_type,
    type_run,
    type_wait,
    alloc,
    construct,
    destruct,
    dealloc,
    run,
    wait,
    NULL, /* get_memory_usage. */
    type_get_memory_usage,
    flush,
    query_tables,
    set_tables_version,
    port_alloc,
    port_construct,
    port_destruct,
    port_dealloc,
    port_modified,
    port_reconfigured,
    port_query_by_name,
    port_add,
    port_del,
    port_get_stats,
    port_dump_start,
    port_dump_next,
    port_dump_done,
    port_poll,
    port_poll_wait,
    port_is_lacp_current,
    port_get_lacp_stats,
    NULL, /* rule_choose_table */
    rule_alloc,
    rule_construct,
    rule_insert,
    rule_delete,
    rule_destruct,
    rule_dealloc,
    rule_get_stats,
    rule_execute,
    set_frag_handling,
    packet_out,
    set_netflow,
    get_netflow_ids,
    set_sflow,
    set_ipfix,
    set_cfm,
    cfm_status_changed,
    get_cfm_status,
    set_lldp,
    get_lldp_status,
    set_aa,
    aa_mapping_set,
    aa_mapping_unset,
    aa_vlan_get_queued,
    aa_vlan_get_queue_size,
    set_bfd,
    bfd_status_changed,
    get_bfd_status,
    set_stp,
    get_stp_status,
    set_stp_port,
    get_stp_port_status,
    get_stp_port_stats,
    set_rstp,
    get_rstp_status,
    set_rstp_port,
    get_rstp_port_status,
    set_queues,
    bundle_set,
    bundle_remove,
    mirror_set__,
    mirror_get_stats__,
    set_flood_vlans,
    is_mirror_output_bundle,
    forward_bpdu_changed,
    set_mac_table_config,
    set_mcast_snooping,
    set_mcast_snooping_port,
    set_realdev,
    NULL, /* meter_get_features */
    NULL, /* meter_set */
    NULL, /* meter_get */
    NULL, /* meter_del */
    group_alloc, /* group_alloc */
    group_construct, /* group_construct */
    group_destruct, /* group_destruct */
    group_dealloc, /* group_dealloc */
    group_modify, /* group_modify */
    group_get_stats, /* group_get_stats */
    get_datapath_version, /* get_datapath_version */
};

According to the comments in ofproto-provider.h, four kinds of data structures are defined here (a simplified sketch follows this list):

  • struct ofproto represents a switch
  • struct ofport represents a port on the switch
  • struct rule represents a flow rule on the switch
  • struct ofgroup represents a group of flow rules (an OpenFlow group)
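
The sketch below is a rough, incomplete illustration of these four objects: only a few representative fields are shown, the real definitions in ofproto-provider.h have many more members, and ofp_port_t is simplified here to a plain integer typedef.

#include <stdint.h>

struct ofproto_class;                /* the provider vtable shown above */
struct netdev;                       /* network-device abstraction */
typedef uint32_t ofp_port_t;         /* simplified; OVS uses a strong typedef */

struct ofproto {                     /* an OpenFlow switch (a bridge) */
    const struct ofproto_class *ofproto_class;
    char *name;                      /* datapath/bridge name */
    /* hmap of struct ofport, flow tables, connmgr, ... */
};

struct ofport {                      /* one port on the switch */
    struct ofproto *ofproto;         /* owning switch */
    struct netdev *netdev;           /* backing network device */
    ofp_port_t ofp_port;             /* OpenFlow port number */
};

struct rule {                        /* one flow entry */
    struct ofproto *ofproto;
    uint8_t table_id;                /* flow table it lives in */
    /* match, priority, actions, stats, ... */
};

struct ofgroup {                     /* one group-table entry */
    struct ofproto *ofproto;
    uint32_t group_id;
    /* buckets of actions, ... */
};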

As mentioned above, ofproto_class->run is invoked during startup, which means the static int run(struct ofproto *ofproto_) function in ofproto-dpif.c gets called. In this function, netflow, sflow, ipfix, stp, rstp, MAC address learning, and a series of other subsystems are initialized and driven.

bridge_run also calls static void bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg), where ovs_cfg is the configuration read from ovsdb-server.

This function includes the following calls:

//  ofproto_create creates a switch according to the configuration
ofproto_create(br->name, br->type, &br->ofproto)
↓
ofproto->ofproto_class->construct(ofproto)

//set config
ofproto_bridge_set_config

// For each bridge, add its network interfaces:
HMAP_FOR_EACH (br, node, &all_bridges) {
    bridge_add_ports(br, &br->wanted_ports);
    shash_destroy(&br->wanted_ports);
}
static void
bridge_add_ports(struct bridge *br, const struct shash *wanted_ports)
{
    /* First add interfaces that request a particular port number. */
    bridge_add_ports__(br, wanted_ports, true);

    /* Then add interfaces that want automatic port number assignment.
     * We add these afterward to avoid accidentally taking a specifically
     * requested port number. */
    bridge_add_ports__(br, wanted_ports, false);
}

These then call the following in sequence:

static void bridge_add_ports__(struct bridge *br, const struct shash *wanted_ports, bool with_requested_port)
↓
static bool iface_create(struct bridge *br, const struct ovsrec_interface *iface_cfg, const struct ovsrec_port *port_cfg)
↓
static int iface_do_create(const struct bridge *br, const struct ovsrec_interface *iface_cfg, const struct ovsrec_port *port_cfg, ofp_port_t *ofp_portp, struct netdev **netdevp, char **errp)
↓
int ofproto_port_add(struct ofproto *ofproto, struct netdev *netdev, ofp_port_t *ofp_portp)
↓
error = ofproto->ofproto_class->port_add(ofproto, netdev);

The port_add function of ofproto_dpif_class in ofproto-dpif.c then goes through dpif_port_add, which calls

error = dpif->dpif_class->port_add(dpif, netdev, &port_no);

The userspace datapath invokes dpif_netdev_port_add of dpif_netdev_class, which then starts the pmd forwarding threads; for details see
https://blog.csdn.net/qq_20817327/article/details/106761936

The kernel datapath invokes dpif_netlink_port_add of dpif_netlink_class, which calls dpif_netlink_port_add__. That function uses the Netlink API with the OVS_VPORT_CMD_NEW command:

const char *name = netdev_vport_get_dpif_port(netdev,
                                                  namebuf, sizeof namebuf);
struct dpif_netlink_vport request, reply;
struct nl_sock **socksp = NULL;

if (dpif->handlers) {
    socksp = vport_create_socksp(dpif, &error);
    if (!socksp) {
        return error;
    }
}

/* Build the OVS_VPORT_CMD_NEW request describing the new vport. */
dpif_netlink_vport_init(&request);
request.cmd = OVS_VPORT_CMD_NEW;
request.dp_ifindex = dpif->dp_ifindex;
request.type = netdev_to_ovs_vport_type(netdev);

request.name = name;

/* Tell the kernel which Netlink PIDs should receive upcalls for this port. */
upcall_pids = vport_socksp_to_pids(socksp, dpif->n_handlers);
request.n_upcall_pids = socksp ? dpif->n_handlers : 1;
request.upcall_pids = upcall_pids;

/* Send the request to openvswitch.ko and wait for the reply. */
error = dpif_netlink_vport_transact(&request, &reply, &buf);

This reaches the openvswitch.ko kernel module and adds the virtual port in the kernel.

2. Virtual NIC initialization: netdev_run()

void
netdev_run(void)
    OVS_EXCLUDED(netdev_class_mutex, netdev_mutex)
{
    struct netdev_registered_class *rc;

    netdev_initialize();
    ovs_mutex_lock(&netdev_class_mutex);
    HMAP_FOR_EACH (rc, hmap_node, &netdev_classes) {
        if (rc->class->run) {
            rc->class->run();
        }
    }
    ovs_mutex_unlock(&netdev_class_mutex);
}

This loops over netdev_classes and calls the run callback of each registered netdev class in turn.
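
How do classes get into netdev_classes in the first place? Each netdev implementation registers its netdev_class with the netdev layer. Below is a minimal sketch, assuming the netdev_register_provider() entry point declared in lib/netdev-provider.h; my_netdev_class and my_netdev_register are hypothetical names used only for illustration.

#include "netdev-provider.h"   /* struct netdev_class, netdev_register_provider() */

/* Hypothetical netdev implementation; in real code this would be a fully
 * populated table such as the dpdk_class or netdev_linux_class shown below. */
extern const struct netdev_class my_netdev_class;

void
my_netdev_register(void)
{
    /* After registration, netdev_run() will invoke my_netdev_class.run()
     * (when it is non-NULL) on every iteration of the main loop. */
    netdev_register_provider(&my_netdev_class);
}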

Every type of virtual NIC has a corresponding netdev_class.

For example, for DPDK NICs there is:

static const struct netdev_class dpdk_class =
    NETDEV_DPDK_CLASS(
        "dpdk",
        NULL,
        netdev_dpdk_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_set_multiq,
        netdev_dpdk_eth_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_rxq_recv);

Physical NICs also need a corresponding netdev_class:

const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

For tap NICs attached to KVM guests:

const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

For virtual software NICs, such as a veth pair:

const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL, /* get_features */
        netdev_internal_get_status);

NETDEV_LINUX_CLASS is a macro; not all of its parameters have to be filled in.

#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
                           GET_FEATURES, GET_STATUS) \
{ \
    NAME, \
                                                                \
    NULL, \
    netdev_linux_run, \
    netdev_linux_wait, \
                                                                \
    netdev_linux_alloc, \
    CONSTRUCT, \
    netdev_linux_destruct, \
    netdev_linux_dealloc, \
    NULL, /* get_config */ \
    NULL, /* set_config */ \
    NULL, /* get_tunnel_config */ \
    NULL, /* build header */ \
    NULL, /* push header */ \
    NULL, /* pop header */ \
    NULL, /* get_numa_id */ \
    NULL, /* set_multiq */ \
                                                                \
    netdev_linux_send, \
    netdev_linux_send_wait, \
                                                                \
    netdev_linux_set_etheraddr, \
    netdev_linux_get_etheraddr, \
    netdev_linux_get_mtu, \
    netdev_linux_set_mtu, \
    netdev_linux_get_ifindex, \
    netdev_linux_get_carrier, \
    netdev_linux_get_carrier_resets, \
    netdev_linux_set_miimon_interval, \
    GET_STATS, \
                                                                \
    GET_FEATURES, \
    netdev_linux_set_advertisements, \
                                                                \
    netdev_linux_set_policing, \
    netdev_linux_get_qos_types, \
    netdev_linux_get_qos_capabilities, \
    netdev_linux_get_qos, \
    netdev_linux_set_qos, \
    netdev_linux_get_queue, \
    netdev_linux_set_queue, \
    netdev_linux_delete_queue, \
    netdev_linux_get_queue_stats, \
    netdev_linux_queue_dump_start, \
    netdev_linux_queue_dump_next, \
    netdev_linux_queue_dump_done, \
    netdev_linux_dump_queue_stats, \
                                                                \
    netdev_linux_get_in4, \
    netdev_linux_set_in4, \
    netdev_linux_get_in6, \
    netdev_linux_add_router, \
    netdev_linux_get_next_hop, \
    GET_STATUS, \
    netdev_linux_arp_lookup, \
                                                                \
    netdev_linux_update_flags, \
                                                                \
    netdev_linux_rxq_alloc, \
    netdev_linux_rxq_construct, \
    netdev_linux_rxq_destruct, \
    netdev_linux_rxq_dealloc, \
    netdev_linux_rxq_recv, \
    netdev_linux_rxq_wait, \
    netdev_linux_rxq_drain, \
}

rc->class->run() ends up calling netdev_linux_run in netdev-linux.c.

netdev_linux_run uses a Netlink socket to obtain the state of the virtual NICs and to update their status:

error = nl_sock_recv(sock, &buf, false);
if (!error) {
    struct rtnetlink_change change;

    if (rtnetlink_parse(&buf, &change)) {
        struct netdev *netdev_ = netdev_from_name(change.ifname);

        if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            ovs_mutex_lock(&netdev->mutex);
            netdev_linux_update(netdev, &change);
            ovs_mutex_unlock(&netdev->mutex);
        }
        netdev_close(netdev_);
    }
}

Summary:
bridge_run__ handles the recurring per-bridge work, such as STP, RSTP, and multicast-snooping processing; OpenFlow messages pushed with ovs-ofctl are also handled here. bridge_reconfigure creates, updates, and deletes the necessary bridges, interfaces, ports, and other protocol configuration according to the database and the current process state. These operations are eventually applied to the ofproto layer and the ofproto-dpif layer, while run_stats_update, run_status_update, and run_system_stats update the status information in the Open vSwitch database. netdev_run monitors the NICs' state and updates it.

Addendum:
In the ovs+dpdk scenario, the corresponding netdev_dpdk_run is NULL. As described earlier, the userspace datapath calls dpif_netdev_port_add of dpif_netdev_class, which starts the pmd threads. These threads receive and send packets by synchronous polling rather than interrupts, which improves packet-processing efficiency.
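
A schematic sketch of such a poll-mode loop is shown below. It is not the actual pmd thread code in lib/dpif-netdev.c; the rxq_try_recv and process_batch helpers are hypothetical stand-ins for rx-queue polling and for flow lookup plus action execution.

#include <stdbool.h>
#include <stddef.h>

struct pkt_batch {
    int count;                      /* number of packets received */
    /* packet buffers, metadata, ... */
};

/* Hypothetical helpers: poll one rx queue / process one batch. */
static int
rxq_try_recv(int rxq_id, struct pkt_batch *batch)
{
    (void) rxq_id;
    batch->count = 0;               /* pretend the queue was empty */
    return 0;
}

static void
process_batch(struct pkt_batch *batch)
{
    (void) batch;                   /* flow lookup + actions would go here */
}

static void
pmd_main_loop(const int *rxq_ids, size_t n_rxq, volatile bool *exiting)
{
    while (!*exiting) {
        for (size_t i = 0; i < n_rxq; i++) {
            struct pkt_batch batch;

            /* Busy-poll: always ask the queue for packets; never sleep
             * or wait for an interrupt. */
            if (!rxq_try_recv(rxq_ids[i], &batch) && batch.count > 0) {
                process_batch(&batch);
            }
        }
    }
}

int
main(void)
{
    int rxqs[] = { 0, 1 };
    volatile bool stop = true;      /* exit immediately in this demo */

    pmd_main_loop(rxqs, sizeof rxqs / sizeof rxqs[0], &stop);
    return 0;
}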

Original post (in Chinese): https://blog.csdn.net/qq_15437629/article/details/79598556
