ispsh posted on 2015-10-11 15:14:18

Analysis of the Xen Network Backend Driver (Device Part)

  netback looks much like an ordinary network device: its priv structure is xen_netif (netfront uses netfront_info, igb uses igb_adapter, all the same pattern; note that the priv structure is a linear memory area placed right behind struct net_device, used to hold each driver's private data).
  We start with the netback in the 2.6.31 branch of Jeremy's git tree, and later compare it with the 2.6.32 and upstream netback updates. Like netfront, 2.6.31 also has an accelerator mechanism; it has largely been superseded by SR-IOV, so it is not covered here.
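  A minimal sketch of that priv layout (mirroring the netif_alloc call shown later in this article; the "vif%d" name format here is only for illustration):

    /* alloc_netdev() reserves sizeof(priv) bytes right behind struct
     * net_device; netdev_priv() returns a pointer into that tail area. */
    struct net_device *dev = alloc_netdev(sizeof(struct xen_netif), "vif%d", ether_setup);
    struct xen_netif *netif = netdev_priv(dev);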
  struct xen_netif {
    /* Unique identifier for this interface. */
    domid_t          domid;
    unsigned int   handle;
      u8               fe_dev_addr[6];
  The netfront MAC address, obtained from xenstore (written by the frontend, read by the backend).
  /* Physical parameters of the comms window. */
    grant_handle_t   tx_shmem_handle;
    grant_ref_t      tx_shmem_ref;
    grant_handle_t   rx_shmem_handle;
    grant_ref_t      rx_shmem_ref;
    unsigned int   irq;
  The grant_handle_t / grant_ref_t pairs for the tx and rx I/O rings.

    /* The shared rings and indexes. */
    struct xen_netif_tx_back_ring tx;
    struct xen_netif_rx_back_ring rx;
    struct vm_struct *tx_comms_area;
    struct vm_struct *rx_comms_area;

    /* Set of features that can be turned on in dev->features. */
    int features;

    int smart_poll;

    /* Internal feature information. */
    u8 can_queue:1; /* can queue packets for receiver? */

    /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
    RING_IDX rx_req_cons_peek;

  /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
    unsigned long   credit_bytes;
    unsigned long   credit_usec;
    atomic64_t   remaining_credit;
    struct timer_list credit_timeout;

    /* Enforce draining of the transmit queue. */
    struct timer_list tx_queue_timeout;

    /* Statistics */
    int nr_copied_skbs;

    /* Miscellaneous private stuff. */
    struct list_head list;/* scheduling list */
    atomic_t         refcnt;
    struct net_device *dev;
    struct net_device_stats stats;

    unsigned int carrier;

    wait_queue_head_t waiting_to_free;
};

  

  struct backend_info {
    struct xenbus_device *dev;
    struct xen_netif *netif;
    enum xenbus_state frontend_state;
    struct xenbus_watch hotplug_status_watch;
    int have_hotplug_status_watch:1;

    int have_rate_watch:1;
    struct xenbus_watch rate_watch;
};

  netback watches two xenstore entries: rate_watch, which detects changes to the transmit rate limit, and hotplug_status_watch, which watches the hotplug-status entry written by the hotplug scripts so that netback can switch itself to Connected once the vif script has run (a sketch of that callback follows the state enum below). State changes of netfront itself are delivered through the xenbus otherend_changed callback (frontend_changed, shown later) and recorded in frontend_state. The states a frontend/backend driver can be in are:
  enum xenbus_state
{
    XenbusStateUnknown      = 0,
    XenbusStateInitialising = 1,
    XenbusStateInitWait   = 2,/* Finished early
                     initialisation, but waiting
                     for information from the peer
                     or hotplug scripts. */
    XenbusStateInitialised= 3,/* Initialised and waiting for a
                     connection from the peer. */
    XenbusStateConnected    = 4,
    XenbusStateClosing      = 5,/* The device is being closed
                     due to an error or an unplug
                     event. */
    XenbusStateClosed       = 6,

    /*
    * Reconfiguring: The device is being reconfigured.
    */
    XenbusStateReconfiguring = 7,

    XenbusStateReconfigured= 8
};
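  Back to the two watches above: when the vif hotplug script finishes it writes "connected" into hotplug-status, and the watch callback flips the backend into XenbusStateConnected. A sketch of that callback, following the 2.6.31 netback xenbus.c (details hedged):

static void hotplug_status_changed(struct xenbus_watch *watch,
                   const char **vec, unsigned int vec_size)
{
    struct backend_info *be = container_of(watch, struct backend_info,
                           hotplug_status_watch);
    char *str;
    unsigned int len;

    /* Read the hotplug-status node written by the vif hotplug script. */
    str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
    if (IS_ERR(str))
        return;
    if (len == sizeof("connected") - 1 && !memcmp(str, "connected", len))
        xenbus_switch_state(be->dev, XenbusStateConnected);
    kfree(str);
}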

  

  The netback device registers itself on xenbus by calling xenbus_register_backend. You can think of xenbus as the PCI bus of the frontend/backend drivers, with netback and netfront as the PCI devices sitting on it.
  static struct xenbus_driver netback = {
    .name = "vif",
    .owner = THIS_MODULE,
    .ids = netback_ids,
    .probe = netback_probe,
    .remove = netback_remove,
    .uevent = netback_uevent,
    .otherend_changed = frontend_changed,
};

  int netif_xenbus_init(void)
{
    printk(KERN_CRIT "registering netback\n");
    return xenbus_register_backend(&netback);
}
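  The netback_ids table referenced in the xenbus_driver above is just the device-type match list; in the source it is essentially:

static const struct xenbus_device_id netback_ids[] = {
    { "vif" },
    { "" }
};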

  

  static int netback_probe(struct xenbus_device *dev,
             const struct xenbus_device_id *id)
{
    const char *message;
    struct xenbus_transaction xbt;
    int err;
    int sg;
    struct backend_info *be = kzalloc(sizeof(struct backend_info),
                      GFP_KERNEL);
    if (!be) {
      xenbus_dev_fatal(dev, -ENOMEM,
               "allocating backend structure");
      return -ENOMEM;
    }

    be->dev = dev;
    dev_set_drvdata(&dev->dev, be);

A backend_info structure is allocated and attached to the xenbus_device via dev_set_drvdata.

    sg = 1;
    if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
      sg = 0;

  do {
      err = xenbus_transaction_start(&xbt);
      if (err) {
            xenbus_dev_fatal(dev, err, "starting transaction");
            goto fail;
      }

      err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
      if (err) {
            message = "writing feature-sg";
            goto abort_transaction;
      }

      err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
                  "%d", sg);
      if (err) {
            message = "writing feature-gso-tcpv4";
            goto abort_transaction;
      }

      /* We support rx-copy path. */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-rx-copy", "%d", 1);
      if (err) {
            message = "writing feature-rx-copy";
            goto abort_transaction;
      }

  /*
         * We don't support rx-flip path (except old guests who don't
         * grok this feature flag).
         */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-rx-flip", "%d", 0);
      if (err) {
            message = "writing feature-rx-flip";
            goto abort_transaction;
      }

      /* We support data smart poll mechanism */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-smart-poll", "%d", 1);
      if (err) {
            message = "writing feature-smart-poll";
            goto abort_transaction;
      }

      err = xenbus_transaction_end(xbt, 0);
    } while (err == -EAGAIN);

  if (err) {
      xenbus_dev_fatal(dev, err, "completing transaction");
      goto fail;
    }

    //netback_probe_accelerators(be, dev);

    err = xenbus_switch_state(dev, XenbusStateInitWait);
    if (err)
      goto fail;

    /* This kicks hotplug scripts, so do it immediately. */
    backend_create_netif(be);

    return 0;

abort_transaction:
    xenbus_transaction_end(xbt, 1);
    xenbus_dev_fatal(dev, err, "%s", message);
fail:
    DPRINTK("failed");
    netback_remove(dev);
    return err;
}

  backend_create_netif calls netif_alloc to create a net_device whose private area holds the xen_netif structure; a sketch of backend_create_netif is given below.
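  A sketch of backend_create_netif, following the 2.6.31 code (details hedged): it reads the interface handle from xenstore, allocates the netif for the frontend domain (dev->otherend_id), and fires a KOBJ_ONLINE uevent to kick the hotplug scripts.

static void backend_create_netif(struct backend_info *be)
{
    int err;
    long handle;
    struct xenbus_device *dev = be->dev;

    if (be->netif != NULL)
        return;

    /* The toolstack writes the interface handle under the backend node. */
    err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
    if (err != 1) {
        xenbus_dev_fatal(dev, err, "reading handle");
        return;
    }

    be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
    if (IS_ERR(be->netif)) {
        err = PTR_ERR(be->netif);
        be->netif = NULL;
        xenbus_dev_fatal(dev, err, "creating interface");
        return;
    }

    /* Kick the hotplug scripts. */
    kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
}

  netif_alloc then does the real work of building the net_device: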
  

  struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
{
    int err = 0;
    struct net_device *dev;
    struct xen_netif *netif;
    char name[IFNAMSIZ] = {};

    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
    dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
    if (dev == NULL) {
      DPRINTK("Could not create netif: out of memory\n");
      return ERR_PTR(-ENOMEM);
    }
Allocate the net_device structure.

    SET_NETDEV_DEV(dev, parent);

  netif = netdev_priv(dev);
    memset(netif, 0, sizeof(*netif));
    netif->domid= domid;
    netif->handle = handle;
    netif->features = NETIF_F_SG;
    atomic_set(&netif->refcnt, 1);
    init_waitqueue_head(&netif->waiting_to_free);
    netif->dev = dev;
    INIT_LIST_HEAD(&netif->list);

    netback_carrier_off(netif);
  Initialize the xen_netif structure.

    atomic64_set(&netif->remaining_credit,INT_MAX);
    netif->credit_bytes = INT_MAX;
    netif->credit_usec= 0L;
    init_timer(&netif->credit_timeout);
    /* Initialize 'expires' now: it's used to track the credit window. */
    netif->credit_timeout.expires = jiffies;
  
    init_timer(&netif->tx_queue_timeout);
  tx_queue_timeout is a timer used to drop packets that have waited too long in the tx queue.

    dev->netdev_ops = &netback_ops;
    dev->features   = NETIF_F_IP_CSUM|NETIF_F_SG;

    SET_ETHTOOL_OPS(dev, &network_ethtool_ops);

  dev->tx_queue_len = netbk_queue_length;
  The netback tx queue length; setting it too large increases latency.

    /*
   * Initialise a dummy MAC address. We choose the numerically
   * largest non-broadcast address to prevent the address getting
   * stolen by an Ethernet bridge for STP purposes.
   * (FE:FF:FF:FF:FF:FF)
   */
    memset(dev->dev_addr, 0xFF, ETH_ALEN);
    dev->dev_addr[0] &= ~0x01;

  The netback MAC address is always FE:FF:FF:FF:FF:FF.
  rtnl_lock();
    err = register_netdevice(dev);
    rtnl_unlock();
    if (err) {
      DPRINTK("Could not register new net device %s: err=%d\n",
            dev->name, err);
      free_netdev(dev);
      return ERR_PTR(err);
    }

    DPRINTK("Successfully created netif\n");
    return netif;
}
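  The netback_carrier_off()/netback_carrier_on()/netback_carrier_ok() helpers used in netif_alloc, netif_map and the net_open/net_close callbacks below are trivial; in this tree's common.h they are essentially:

#define netback_carrier_on(netif)  ((netif)->carrier = 1)
#define netback_carrier_off(netif) ((netif)->carrier = 0)
#define netback_carrier_ok(netif)  ((netif)->carrier)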

  The netback_ops structure is as follows:
  static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats= netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};

  

  When the netfront state changes, netback learns about it through the xenbus watch on the other end's state node, and the otherend_changed callback, frontend_changed, is invoked:
  static void frontend_changed(struct xenbus_device *dev,
               enum xenbus_state frontend_state)
{
    struct backend_info *be = dev_get_drvdata(&dev->dev);

    DPRINTK("%s", xenbus_strstate(frontend_state));

    be->frontend_state = frontend_state;

    switch (frontend_state) {
    case XenbusStateInitialising:
      if (dev->state == XenbusStateClosed) {
            printk(KERN_INFO "%s: %s: prepare for reconnect\n",
                   __FUNCTION__, dev->nodename);
            xenbus_switch_state(dev, XenbusStateInitWait);
      }
      break;

    case XenbusStateInitialised:
      break;

    case XenbusStateConnected:
      if (dev->state == XenbusStateConnected)
            break;
      backend_create_netif(be);
      if (be->netif)
            connect(be);
      break;

  case XenbusStateClosing:
      if (be->netif)
            kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
      disconnect_backend(dev);
      xenbus_switch_state(dev, XenbusStateClosing);
      break;

    case XenbusStateClosed:
      xenbus_switch_state(dev, XenbusStateClosed);
      if (xenbus_dev_is_online(dev))
            break;
      /* fall through if not online */
    case XenbusStateUnknown:
      device_unregister(&dev->dev);
      break;

    default:
      xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
               frontend_state);
      break;
    }
}

  E.g., when netfront is seen to enter XenbusStateConnected, connect() is called to establish the connection with netfront:
  static void connect(struct backend_info *be)
{
    int err;
    struct xenbus_device *dev = be->dev;

    err = connect_rings(be);
    if (err)
      return;

    err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
    if (err) {
      xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
      return;
    }
  xen_net_read_rate(dev, &be->netif->credit_bytes,
            &be->netif->credit_usec);
    atomic64_set(&be->netif->remaining_credit,be->netif->credit_bytes);
      unregister_hotplug_status_watch(be);
  err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
                   hotplug_status_changed,
                   "%s/%s", dev->nodename, "hotplug-status");
    if (err) {
      /* Switch now, since we can't do a watch. */
      xenbus_switch_state(dev, XenbusStateConnected);
    } else {
      be->have_hotplug_status_watch = 1;
    }

  Register the xenbus watch on hotplug-status.
  unregister_rate_watch(be);
    err = xenbus_watch_pathfmt(dev, &be->rate_watch,
                   rate_changed, "%s/%s", dev->nodename, "rate");
    if (!err) {
      be->have_rate_watch = 1;
    }
  Register the xenbus watch on rate.
  netif_wake_queue(be->netif->dev);
  Wake up the transmit queue.
}
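  xen_net_read_mac and xen_net_read_rate above simply parse xenstore entries; the rate-limit node holds a string of the form "bytes,usec". A sketch of the parser, close to the 2.6.31 code (minor details hedged):

static void xen_net_read_rate(struct xenbus_device *dev,
                  unsigned long *bytes, unsigned long *usec)
{
    char *s, *e, *ratestr;
    unsigned long b, u;

    /* Default to unlimited bandwidth. */
    *bytes = ~0UL;
    *usec = 0;

    ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
    if (IS_ERR(ratestr))
        return;

    /* The entry has the form "<bytes>,<usec>". */
    s = ratestr;
    b = simple_strtoul(s, &e, 10);
    if ((s == e) || (*e != ','))
        goto fail;

    s = e + 1;
    u = simple_strtoul(s, &e, 10);
    if ((s == e) || (*e != '\0'))
        goto fail;

    *bytes = b;
    *usec = u;
    kfree(ratestr);
    return;

 fail:
    printk(KERN_WARNING "Failed to parse network rate limit; traffic unlimited.\n");
    kfree(ratestr);
}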

  

  static int connect_rings(struct backend_info *be)
{
    struct xenbus_device *dev = be->dev;
    unsigned long tx_ring_ref, rx_ring_ref;
    unsigned int evtchn, rx_copy;
    int err;
    int val;

    DPRINTK("");

    err = xenbus_gather(XBT_NIL, dev->otherend,
                "tx-ring-ref", "%lu", &tx_ring_ref,
                "rx-ring-ref", "%lu", &rx_ring_ref,
                "event-channel", "%u", &evtchn, NULL);
    if (err) {
      xenbus_dev_fatal(dev, err,
               "reading %s/ring-ref and event-channel",
               dev->otherend);
      return err;
    }
      err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
                 &rx_copy);
  if (err == -ENOENT) {
      err = 0;
      rx_copy = 0;
    }
    if (err < 0) {
      xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
               dev->otherend);
      return err;
    }
    if (!rx_copy)
      return -EOPNOTSUPP;
  if (be->netif->dev->tx_queue_len != 0) {
      if (xenbus_scanf(XBT_NIL, dev->otherend,
               "feature-rx-notify", "%d", &val) < 0)
            val = 0;
      if (val)
            be->netif->can_queue = 1;
      else
            /* Must be non-zero for pfifo_fast to work. */
            be->netif->dev->tx_queue_len = 1;
    }

  if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
      val = 0;
    if (!val) {
      be->netif->features &= ~NETIF_F_SG;
      be->netif->dev->features &= ~NETIF_F_SG;
      if (be->netif->dev->mtu > ETH_DATA_LEN)
            be->netif->dev->mtu = ETH_DATA_LEN;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
             &val) < 0)
      val = 0;
    if (val) {
      be->netif->features |= NETIF_F_TSO;
      be->netif->dev->features |= NETIF_F_TSO;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
             "%d", &val) < 0)
      val = 0;
    if (val) {
      be->netif->features &= ~NETIF_F_IP_CSUM;
      be->netif->dev->features &= ~NETIF_F_IP_CSUM;
    }

  if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
             "%d", &val) < 0)
      val = 0;
    if (val)
      be->netif->smart_poll = 1;
    else
      be->netif->smart_poll = 0;
  The parameters advertised by netfront are gathered from xenbus in this way.

    /* Map the shared frame, irq etc. */
    err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
    if (err) {
      xenbus_dev_fatal(dev, err,
               "mapping shared-frames %lu/%lu port %u",
               tx_ring_ref, rx_ring_ref, evtchn);
      return err;
    }
    return 0;
}

  connect_rings in turn calls netif_map, which maps the frontend's I/O ring pages into the backend:
  int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
          unsigned long rx_ring_ref, unsigned int evtchn)
{
    int err = -ENOMEM;
    struct xen_netif_tx_sring *txs;
    struct xen_netif_rx_sring *rxs;

    /* Already connected through? */
    if (netif->irq)
      return 0;

    netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->tx_comms_area == NULL)
      return -ENOMEM;
    netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->rx_comms_area == NULL)
      goto err_rx;

  alloc_vm_area is called to create a vm_struct; tx_comms_area and rx_comms_area are both vm_structs. A vm_struct describes a virtually contiguous range of kernel address space.
  err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
    if (err)
      goto err_map;

    err = bind_interdomain_evtchn_to_irqhandler(
      netif->domid, evtchn, netif_be_int, 0,
      netif->dev->name, netif);
    if (err < 0)
      goto err_hypervisor;
    netif->irq = err;
    disable_irq(netif->irq);
  Bind the inter-domain event channel, with netif_be_int as its irq handler.
      txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
    BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);

    rxs = (struct xen_netif_rx_sring *)
      ((char *)netif->rx_comms_area->addr);
    BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
  Initialize the tx and rx I/O rings; each ring occupies one PAGE_SIZE.
      netif->rx_req_cons_peek = 0;

    netif_get(netif);

  rtnl_lock();
    netback_carrier_on(netif);
    if (netif_running(netif->dev))
      __netif_up(netif);
    rtnl_unlock();

    return 0;
err_hypervisor:
    unmap_frontend_pages(netif);
err_map:
    free_vm_area(netif->rx_comms_area);
err_rx:
    free_vm_area(netif->tx_comms_area);
    return err;
}
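  The BACK_RING_INIT macro used above comes from Xen's shared ring header (include/xen/interface/io/ring.h); it simply attaches the shared page and resets the backend's private indexes:

#define BACK_RING_INIT(_r, _s, __size) do {        \
    (_r)->rsp_prod_pvt = 0;                        \
    (_r)->req_cons = 0;                            \
    (_r)->nr_ents = __RING_SIZE(_s, __size);       \
    (_r)->sring = (_s);                            \
} while (0)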

  map_frontend_pages, called above, maps the grant references (GRs) handed over by netfront and brings the granted pages into netback's own address space. The full page-mapping sequence goes like this:
  1. netfront allocates a GR (by calling gnttab_claim_grant_reference); here these are the two GRs tx_ring_ref and rx_ring_ref.
  2. netfront grants netback access to it (by calling gnttab_grant_foreign_access_ref); the GR now records the page's mfn and the domid being granted access, i.e. netfront_info->xbdev->otherend_id.
  3. netback calls map_frontend_pages to perform the mapping; the function looks like this:
  static int map_frontend_pages(
    struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
{
    struct gnttab_map_grant_ref op;
      gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, GNTMAP_host_map, tx_ring_ref, netif->domid);
  This fills in the gnttab_map_grant_ref structure: the mapping type is GNTMAP_host_map, and the GR identified by tx_ring_ref is to be mapped at the local virtual address tx_comms_area->addr.
      if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
        BUG();
  The grant-table hypercall is issued to perform the mapping.
      if (op.status) {
      DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
      return op.status;
    }
      netif->tx_shmem_ref    = tx_ring_ref;
  netif->tx_shmem_handle = op.handle;
  op.handle is returned by the hypercall; it identifies this mapping and is kept so the mapping can be undone later.
      gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, GNTMAP_host_map, rx_ring_ref, netif->domid);
  if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
      BUG();

  if (op.status) {
      struct gnttab_unmap_grant_ref unop;

      gnttab_set_unmap_op(&unop,
                  (unsigned long)netif->tx_comms_area->addr,
                  GNTMAP_host_map, netif->tx_shmem_handle);
      HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
      DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
      return op.status;
    }

    netif->rx_shmem_ref    = rx_ring_ref;
    netif->rx_shmem_handle = op.handle;

    return 0;
}

  4. Once the mapping is established the pages can be accessed; afterwards the mapping is torn down by unmap_frontend_pages:
  static void unmap_frontend_pages(struct xen_netif *netif)
{
    struct gnttab_unmap_grant_ref op;

    gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
                GNTMAP_host_map, netif->tx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
      BUG();

    gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
                GNTMAP_host_map, netif->rx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
      BUG();
  A gnttab_unmap_grant_ref is set up for tx_comms_area and rx_comms_area, and the hypercall is issued to undo the mappings.
  }
  5. Finally, netfront revokes the foreign-access permission on the grant references, either through xennet_end_access or by calling gnttab_end_foreign_access_ref directly. In the netfront driver, tx_ring and rx_ring are released through xennet_end_access (called from xennet_disconnect_backend); pages used for transmitting data are reclaimed with gnttab_end_foreign_access_ref, while receive pages, which are handed over through grant-table page transfer, are reclaimed with gnttab_end_foreign_transfer_ref (see xennet_uninit in the netfront driver, which calls xennet_release_tx_bufs and xennet_release_rx_bufs).
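  For reference, xennet_end_access on the netfront side is a thin wrapper (the granted page is freed as a side effect of ending the access):

static void xennet_end_access(int ref, void *page)
{
    /* This frees the page as a side-effect. */
    if (ref != GRANT_INVALID_REF)
        gnttab_end_foreign_access(ref, 0, (unsigned long)page);
}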
  

  Finally, let's look at the net_device_ops interface that netback provides:
  static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats= netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};
  static int net_open(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif)) {
      __netif_up(netif);
      netif_start_queue(dev);
    }
    return 0;
}

static int net_close(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif))
      __netif_down(netif);
    netif_stop_queue(dev);
    return 0;
}

  __netif_up and __netif_down enable/disable the interrupt. In __netif_up, if there are packets to send, the xen_netif is added to the global net_schedule_list and maybe_schedule_tx_action is then called to decide whether to fire tasklet_schedule(&net_tx_tasklet) and transmit the packets. A sketch of the two helpers follows.
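  A sketch of the two helpers, following the 2.6.31 netback (netif_schedule_work is the piece that puts the xen_netif onto net_schedule_list and calls maybe_schedule_tx_action; details hedged):

static void __netif_up(struct xen_netif *netif)
{
    enable_irq(netif->irq);
    netif_schedule_work(netif);     /* queue for tx processing if needed */
}

static void __netif_down(struct xen_netif *netif)
{
    disable_irq(netif->irq);
    netif_deschedule_work(netif);   /* drop off net_schedule_list */
}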
  

  netif_be_start_xmit is left for a later part of this analysis.
  

  

  

         Copyright notice: this article is the blogger's original work and may not be reproduced without the blogger's permission.