ispsh posted on 2015-10-11 15:14:18

Analysis of the Xen Network Backend Driver (Device Part)

  netback looks much like an ordinary network device: its priv structure is xen_netif (netfront uses netfront_info, igb uses igb_adapter, all the same pattern; note that the priv structure is a linear memory area placed right behind struct net_device, used to hold each driver's private data).
  We start with the netback in the 2.6.31 branch of Jeremy's git tree, and later compare it with the 2.6.32 and upstream netback updates. Like netfront, 2.6.31 also has an accelerator mechanism; it has largely been superseded by SR-IOV, so it is not covered here.
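  A minimal sketch of that priv layout (mirroring the netif_alloc call shown later in this article; the "vif%d" name format here is only for illustration):

    /* alloc_netdev() reserves sizeof(priv) bytes right behind struct
     * net_device; netdev_priv() returns a pointer into that tail area. */
    struct net_device *dev = alloc_netdev(sizeof(struct xen_netif), "vif%d", ether_setup);
    struct xen_netif *netif = netdev_priv(dev);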
  struct xen_netif {
    /* Unique identifier for this interface. */
    domid_t          domid;
    unsigned int   handle;
      u8               fe_dev_addr[6];
  The netfront MAC address, obtained from xenstore (written by the frontend, read by the backend).
  /* Physical parameters of the comms window. */
    grant_handle_t   tx_shmem_handle;
    grant_ref_t      tx_shmem_ref;
    grant_handle_t   rx_shmem_handle;
    grant_ref_t      rx_shmem_ref;
    unsigned int   irq;
  The grant_handle_t / grant_ref_t pairs for the tx and rx I/O rings.

    /* The shared rings and indexes. */
    struct xen_netif_tx_back_ring tx;
    struct xen_netif_rx_back_ring rx;
    struct vm_struct *tx_comms_area;
    struct vm_struct *rx_comms_area;

    /* Set of features that can be turned on in dev->features. */
    int features;

    int smart_poll;

    /* Internal feature information. */
    u8 can_queue:1; /* can queue packets for receiver? */

    /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
    RING_IDX rx_req_cons_peek;

  /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
    unsigned long   credit_bytes;
    unsigned long   credit_usec;
    atomic64_t   remaining_credit;
    struct timer_list credit_timeout;

    /* Enforce draining of the transmit queue. */
    struct timer_list tx_queue_timeout;

    /* Statistics */
    int nr_copied_skbs;

    /* Miscellaneous private stuff. */
    struct list_head list;/* scheduling list */
    atomic_t         refcnt;
    struct net_device *dev;
    struct net_device_stats stats;

    unsigned int carrier;

    wait_queue_head_t waiting_to_free;
};

  

  struct backend_info {
    struct xenbus_device *dev;
    struct xen_netif *netif;
    enum xenbus_state frontend_state;
    struct xenbus_watch hotplug_status_watch;
    int have_hotplug_status_watch:1;

    int have_rate_watch:1;
    struct xenbus_watch rate_watch;
};

  netback watches two xenstore entries: rate_watch, which detects changes to the transmit rate limit, and hotplug_status_watch, which watches the hotplug-status entry written by the hotplug scripts so that netback can switch itself to Connected once the vif script has run (a sketch of that callback follows the state enum below). State changes of netfront itself are delivered through the xenbus otherend_changed callback (frontend_changed, shown later) and recorded in frontend_state. The states a frontend/backend driver can be in are:
  enum xenbus_state
{
    XenbusStateUnknown      = 0,
    XenbusStateInitialising = 1,
    XenbusStateInitWait   = 2,/* Finished early
                     initialisation, but waiting
                     for information from the peer
                     or hotplug scripts. */
    XenbusStateInitialised= 3,/* Initialised and waiting for a
                     connection from the peer. */
    XenbusStateConnected    = 4,
    XenbusStateClosing      = 5,/* The device is being closed
                     due to an error or an unplug
                     event. */
    XenbusStateClosed       = 6,

    /*
    * Reconfiguring: The device is being reconfigured.
    */
    XenbusStateReconfiguring = 7,

    XenbusStateReconfigured= 8
};
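  Back to the two watches above: when the vif hotplug script finishes it writes "connected" into hotplug-status, and the watch callback flips the backend into XenbusStateConnected. A sketch of that callback, following the 2.6.31 netback xenbus.c (details hedged):

static void hotplug_status_changed(struct xenbus_watch *watch,
                   const char **vec, unsigned int vec_size)
{
    struct backend_info *be = container_of(watch, struct backend_info,
                           hotplug_status_watch);
    char *str;
    unsigned int len;

    /* Read the hotplug-status node written by the vif hotplug script. */
    str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
    if (IS_ERR(str))
        return;
    if (len == sizeof("connected") - 1 && !memcmp(str, "connected", len))
        xenbus_switch_state(be->dev, XenbusStateConnected);
    kfree(str);
}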

  

  The netback device registers itself on xenbus by calling xenbus_register_backend. You can think of xenbus as the PCI bus of the frontend/backend drivers, with netback and netfront as the PCI devices sitting on it.
  static struct xenbus_driver netback = {
    .name = "vif",
    .owner = THIS_MODULE,
    .ids = netback_ids,
    .probe = netback_probe,
    .remove = netback_remove,
    .uevent = netback_uevent,
    .otherend_changed = frontend_changed,
};

  int netif_xenbus_init(void)
{
    printk(KERN_CRIT "registering netback\n");
    return xenbus_register_backend(&netback);
}
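  The netback_ids table referenced in the xenbus_driver above is just the device-type match list; in the source it is essentially:

static const struct xenbus_device_id netback_ids[] = {
    { "vif" },
    { "" }
};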

  

  static int netback_probe(struct xenbus_device *dev,
             const struct xenbus_device_id *id)
{
    const char *message;
    struct xenbus_transaction xbt;
    int err;
    int sg;
    struct backend_info *be = kzalloc(sizeof(struct backend_info),
                      GFP_KERNEL);
    if (!be) {
      xenbus_dev_fatal(dev, -ENOMEM,
               "allocating backend structure");
      return -ENOMEM;
    }

    be->dev = dev;
    dev_set_drvdata(&dev->dev, be);

A backend_info structure is allocated and attached to the xenbus_device via dev_set_drvdata.

    sg = 1;
    if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
      sg = 0;

  do {
      err = xenbus_transaction_start(&xbt);
      if (err) {
            xenbus_dev_fatal(dev, err, "starting transaction");
            goto fail;
      }

      err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
      if (err) {
            message = "writing feature-sg";
            goto abort_transaction;
      }

      err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
                  "%d", sg);
      if (err) {
            message = "writing feature-gso-tcpv4";
            goto abort_transaction;
      }

      /* We support rx-copy path. */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-rx-copy", "%d", 1);
      if (err) {
            message = "writing feature-rx-copy";
            goto abort_transaction;
      }

  /*
         * We don't support rx-flip path (except old guests who don't
         * grok this feature flag).
         */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-rx-flip", "%d", 0);
      if (err) {
            message = "writing feature-rx-flip";
            goto abort_transaction;
      }

      /* We support data smart poll mechanism */
      err = xenbus_printf(xbt, dev->nodename,
                  "feature-smart-poll", "%d", 1);
      if (err) {
            message = "writing feature-smart-poll";
            goto abort_transaction;
      }

      err = xenbus_transaction_end(xbt, 0);
    } while (err == -EAGAIN);

  if (err) {
      xenbus_dev_fatal(dev, err, "completing transaction");
      goto fail;
    }

    //netback_probe_accelerators(be, dev);

    err = xenbus_switch_state(dev, XenbusStateInitWait);
    if (err)
      goto fail;

    /* This kicks hotplug scripts, so do it immediately. */
    backend_create_netif(be);

    return 0;

abort_transaction:
    xenbus_transaction_end(xbt, 1);
    xenbus_dev_fatal(dev, err, "%s", message);
fail:
    DPRINTK("failed");
    netback_remove(dev);
    return err;
}

  backend_create_netif calls netif_alloc to create a net_device whose private area holds the xen_netif structure; a sketch of backend_create_netif is given below.
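  A sketch of backend_create_netif, following the 2.6.31 code (details hedged): it reads the interface handle from xenstore, allocates the netif for the frontend domain (dev->otherend_id), and fires a KOBJ_ONLINE uevent to kick the hotplug scripts.

static void backend_create_netif(struct backend_info *be)
{
    int err;
    long handle;
    struct xenbus_device *dev = be->dev;

    if (be->netif != NULL)
        return;

    /* The toolstack writes the interface handle under the backend node. */
    err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
    if (err != 1) {
        xenbus_dev_fatal(dev, err, "reading handle");
        return;
    }

    be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
    if (IS_ERR(be->netif)) {
        err = PTR_ERR(be->netif);
        be->netif = NULL;
        xenbus_dev_fatal(dev, err, "creating interface");
        return;
    }

    /* Kick the hotplug scripts. */
    kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
}

  netif_alloc then does the real work of building the net_device: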
  

  struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
{
    int err = 0;
    struct net_device *dev;
    struct xen_netif *netif;
    char name[IFNAMSIZ] = {};

    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
    dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
    if (dev == NULL) {
      DPRINTK("Could not create netif: out of memory\n");
      return ERR_PTR(-ENOMEM);
    }
Allocate the net_device structure.

    SET_NETDEV_DEV(dev, parent);

  netif = netdev_priv(dev);
    memset(netif, 0, sizeof(*netif));
    netif->domid= domid;
    netif->handle = handle;
    netif->features = NETIF_F_SG;
    atomic_set(&netif->refcnt, 1);
    init_waitqueue_head(&netif->waiting_to_free);
    netif->dev = dev;
    INIT_LIST_HEAD(&netif->list);

    netback_carrier_off(netif);
  Initialize the xen_netif structure.

    atomic64_set(&netif->remaining_credit,INT_MAX);
    netif->credit_bytes = INT_MAX;
    netif->credit_usec= 0L;
    init_timer(&netif->credit_timeout);
    /* Initialize 'expires' now: it's used to track the credit window. */
    netif->credit_timeout.expires = jiffies;
  
    init_timer(&netif->tx_queue_timeout);
  tx_queue_timeout is a timer used to drop packets that have waited too long in the tx queue.

    dev->netdev_ops = &netback_ops;
    dev->features   = NETIF_F_IP_CSUM|NETIF_F_SG;

    SET_ETHTOOL_OPS(dev, &network_ethtool_ops);

  dev->tx_queue_len = netbk_queue_length;
  The netback tx queue length; setting it too large increases latency.

    /*
   * Initialise a dummy MAC address. We choose the numerically
   * largest non-broadcast address to prevent the address getting
   * stolen by an Ethernet bridge for STP purposes.
   * (FE:FF:FF:FF:FF:FF)
   */
    memset(dev->dev_addr, 0xFF, ETH_ALEN);
    dev->dev_addr[0] &= ~0x01;

  The netback MAC address is always FE:FF:FF:FF:FF:FF.
  rtnl_lock();
    err = register_netdevice(dev);
    rtnl_unlock();
    if (err) {
      DPRINTK("Could not register new net device %s: err=%d\n",
            dev->name, err);
      free_netdev(dev);
      return ERR_PTR(err);
    }

    DPRINTK("Successfully created netif\n");
    return netif;
}
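  The netback_carrier_off()/netback_carrier_on()/netback_carrier_ok() helpers used in netif_alloc, netif_map and the net_open/net_close callbacks below are trivial; in this tree's common.h they are essentially:

#define netback_carrier_on(netif)  ((netif)->carrier = 1)
#define netback_carrier_off(netif) ((netif)->carrier = 0)
#define netback_carrier_ok(netif)  ((netif)->carrier)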

  The netback_ops structure is as follows:
  static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats= netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};

  

  When the netfront state changes, netback learns about it through the xenbus watch on the other end's state node, and the otherend_changed callback, frontend_changed, is invoked:
  static void frontend_changed(struct xenbus_device *dev,
               enum xenbus_state frontend_state)
{
    struct backend_info *be = dev_get_drvdata(&dev->dev);

    DPRINTK("%s", xenbus_strstate(frontend_state));

    be->frontend_state = frontend_state;

    switch (frontend_state) {
    case XenbusStateInitialising:
      if (dev->state == XenbusStateClosed) {
            printk(KERN_INFO "%s: %s: prepare for reconnect\n",
                   __FUNCTION__, dev->nodename);
            xenbus_switch_state(dev, XenbusStateInitWait);
      }
      break;

    case XenbusStateInitialised:
      break;

    case XenbusStateConnected:
      if (dev->state == XenbusStateConnected)
            break;
      backend_create_netif(be);
      if (be->netif)
            connect(be);
      break;

  case XenbusStateClosing:
      if (be->netif)
            kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
      disconnect_backend(dev);
      xenbus_switch_state(dev, XenbusStateClosing);
      break;

    case XenbusStateClosed:
      xenbus_switch_state(dev, XenbusStateClosed);
      if (xenbus_dev_is_online(dev))
            break;
      /* fall through if not online */
    case XenbusStateUnknown:
      device_unregister(&dev->dev);
      break;

    default:
      xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
               frontend_state);
      break;
    }
}

  E.g., when netfront is seen to enter XenbusStateConnected, connect() is called to establish the connection with netfront:
  static void connect(struct backend_info *be)
{
    int err;
    struct xenbus_device *dev = be->dev;

    err = connect_rings(be);
    if (err)
      return;

    err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
    if (err) {
      xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
      return;
    }
  xen_net_read_rate(dev, &be->netif->credit_bytes,
            &be->netif->credit_usec);
    atomic64_set(&be->netif->remaining_credit,be->netif->credit_bytes);
      unregister_hotplug_status_watch(be);
  err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
                   hotplug_status_changed,
                   "%s/%s", dev->nodename, "hotplug-status");
    if (err) {
      /* Switch now, since we can't do a watch. */
      xenbus_switch_state(dev, XenbusStateConnected);
    } else {
      be->have_hotplug_status_watch = 1;
    }

  Register the xenbus watch on hotplug-status.
  unregister_rate_watch(be);
    err = xenbus_watch_pathfmt(dev, &be->rate_watch,
                   rate_changed, "%s/%s", dev->nodename, "rate");
    if (!err) {
      be->have_rate_watch = 1;
    }
  Register the xenbus watch on rate.
  netif_wake_queue(be->netif->dev);
  Wake up the transmit queue.
}
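  xen_net_read_mac and xen_net_read_rate above simply parse xenstore entries; the rate-limit node holds a string of the form "bytes,usec". A sketch of the parser, close to the 2.6.31 code (minor details hedged):

static void xen_net_read_rate(struct xenbus_device *dev,
                  unsigned long *bytes, unsigned long *usec)
{
    char *s, *e, *ratestr;
    unsigned long b, u;

    /* Default to unlimited bandwidth. */
    *bytes = ~0UL;
    *usec = 0;

    ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
    if (IS_ERR(ratestr))
        return;

    /* The entry has the form "<bytes>,<usec>". */
    s = ratestr;
    b = simple_strtoul(s, &e, 10);
    if ((s == e) || (*e != ','))
        goto fail;

    s = e + 1;
    u = simple_strtoul(s, &e, 10);
    if ((s == e) || (*e != '\0'))
        goto fail;

    *bytes = b;
    *usec = u;
    kfree(ratestr);
    return;

 fail:
    printk(KERN_WARNING "Failed to parse network rate limit; traffic unlimited.\n");
    kfree(ratestr);
}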

  

  static int connect_rings(struct backend_info *be)
{
    struct xenbus_device *dev = be->dev;
    unsigned long tx_ring_ref, rx_ring_ref;
    unsigned int evtchn, rx_copy;
    int err;
    int val;

    DPRINTK("");

    err = xenbus_gather(XBT_NIL, dev->otherend,
                "tx-ring-ref", "%lu", &tx_ring_ref,
                "rx-ring-ref", "%lu", &rx_ring_ref,
                "event-channel", "%u", &evtchn, NULL);
    if (err) {
      xenbus_dev_fatal(dev, err,
               "reading %s/ring-ref and event-channel",
               dev->otherend);
      return err;
    }
      err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
                 &rx_copy);
  if (err == -ENOENT) {
      err = 0;
      rx_copy = 0;
    }
    if (err < 0) {
      xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
               dev->otherend);
      return err;
    }
    if (!rx_copy)
      return -EOPNOTSUPP;
  if (be->netif->dev->tx_queue_len != 0) {
      if (xenbus_scanf(XBT_NIL, dev->otherend,
               "feature-rx-notify", "%d", &val) < 0)
            val = 0;
      if (val)
            be->netif->can_queue = 1;
      else
            /* Must be non-zero for pfifo_fast to work. */
            be->netif->dev->tx_queue_len = 1;
    }

  if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
      val = 0;
    if (!val) {
      be->netif->features &= ~NETIF_F_SG;
      be->netif->dev->features &= ~NETIF_F_SG;
      if (be->netif->dev->mtu > ETH_DATA_LEN)
            be->netif->dev->mtu = ETH_DATA_LEN;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
             &val) < 0)
      val = 0;
    if (val) {
      be->netif->features |= NETIF_F_TSO;
      be->netif->dev->features |= NETIF_F_TSO;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
             "%d", &val) < 0)
      val = 0;
    if (val) {
      be->netif->features &= ~NETIF_F_IP_CSUM;
      be->netif->dev->features &= ~NETIF_F_IP_CSUM;
    }

  if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
             "%d", &val) < 0)
      val = 0;
    if (val)
      be->netif->smart_poll = 1;
    else
      be->netif->smart_poll = 0;
  The parameters advertised by netfront are gathered from xenbus in this way.

    /* Map the shared frame, irq etc. */
    err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
    if (err) {
      xenbus_dev_fatal(dev, err,
               "mapping shared-frames %lu/%lu port %u",
               tx_ring_ref, rx_ring_ref, evtchn);
      return err;
    }
    return 0;
}

  connect_rings in turn calls netif_map, which maps the frontend's I/O ring pages into the backend:
  int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
          unsigned long rx_ring_ref, unsigned int evtchn)
{
    int err = -ENOMEM;
    struct xen_netif_tx_sring *txs;
    struct xen_netif_rx_sring *rxs;

    /* Already connected through? */
    if (netif->irq)
      return 0;

    netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->tx_comms_area == NULL)
      return -ENOMEM;
    netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->rx_comms_area == NULL)
      goto err_rx;

  alloc_vm_area is called to create a vm_struct; tx_comms_area and rx_comms_area are both vm_structs. A vm_struct describes a virtually contiguous range of kernel address space.
  err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
    if (err)
      goto err_map;

    err = bind_interdomain_evtchn_to_irqhandler(
      netif->domid, evtchn, netif_be_int, 0,
      netif->dev->name, netif);
    if (err < 0)
      goto err_hypervisor;
    netif->irq = err;
    disable_irq(netif->irq);
  Bind the inter-domain event channel, with netif_be_int as its irq handler.
      txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
    BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);

    rxs = (struct xen_netif_rx_sring *)
      ((char *)netif->rx_comms_area->addr);
    BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
  Initialize the tx and rx I/O rings; each ring occupies one PAGE_SIZE.
      netif->rx_req_cons_peek = 0;

    netif_get(netif);

  rtnl_lock();
    netback_carrier_on(netif);
    if (netif_running(netif->dev))
      __netif_up(netif);
    rtnl_unlock();

    return 0;
err_hypervisor:
    unmap_frontend_pages(netif);
err_map:
    free_vm_area(netif->rx_comms_area);
err_rx:
    free_vm_area(netif->tx_comms_area);
    return err;
}
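  The BACK_RING_INIT macro used above comes from Xen's shared ring header (include/xen/interface/io/ring.h); it simply attaches the shared page and resets the backend's private indexes:

#define BACK_RING_INIT(_r, _s, __size) do {        \
    (_r)->rsp_prod_pvt = 0;                        \
    (_r)->req_cons = 0;                            \
    (_r)->nr_ents = __RING_SIZE(_s, __size);       \
    (_r)->sring = (_s);                            \
} while (0)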

  map_frontend_pages, called above, maps the grant references (GRs) handed over by netfront and brings the granted pages into netback's own address space. The full page-mapping sequence goes like this:
  1. netfront allocates a GR (by calling gnttab_claim_grant_reference); here these are the two GRs tx_ring_ref and rx_ring_ref.
  2. netfront grants netback access to it (by calling gnttab_grant_foreign_access_ref); the GR now records the page's mfn and the domid being granted access, i.e. netfront_info->xbdev->otherend_id.
  3. netback calls map_frontend_pages to perform the mapping; the function looks like this:
  static int map_frontend_pages(
    struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
{
    struct gnttab_map_grant_ref op;
      gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, GNTMAP_host_map, tx_ring_ref, netif->domid);
  This fills in the gnttab_map_grant_ref structure: the mapping type is GNTMAP_host_map, and the GR identified by tx_ring_ref is to be mapped at the local virtual address tx_comms_area->addr.
      if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
        BUG();
  The grant-table hypercall is issued to perform the mapping.
      if (op.status) {
      DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
      return op.status;
    }
      netif->tx_shmem_ref    = tx_ring_ref;
  netif->tx_shmem_handle = op.handle;
  op.handle is returned by the hypercall; it identifies this mapping and is kept so the mapping can be undone later.
      gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, GNTMAP_host_map, rx_ring_ref, netif->domid);
  if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
      BUG();

  if (op.status) {
      struct gnttab_unmap_grant_ref unop;

      gnttab_set_unmap_op(&unop,
                  (unsigned long)netif->tx_comms_area->addr,
                  GNTMAP_host_map, netif->tx_shmem_handle);
      HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
      DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
      return op.status;
    }

    netif->rx_shmem_ref    = rx_ring_ref;
    netif->rx_shmem_handle = op.handle;

    return 0;
}

  4. Once the mapping is established the pages can be accessed; afterwards the mapping is torn down by unmap_frontend_pages:
  static void unmap_frontend_pages(struct xen_netif *netif)
{
    struct gnttab_unmap_grant_ref op;

    gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
                GNTMAP_host_map, netif->tx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
      BUG();

    gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
                GNTMAP_host_map, netif->rx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
      BUG();
  A gnttab_unmap_grant_ref is set up for tx_comms_area and rx_comms_area, and the hypercall is issued to undo the mappings.
  }
  5. Finally, netfront revokes the foreign-access permission on the grant references, either through xennet_end_access or by calling gnttab_end_foreign_access_ref directly. In the netfront driver, tx_ring and rx_ring are released through xennet_end_access (called from xennet_disconnect_backend); pages used for transmitting data are reclaimed with gnttab_end_foreign_access_ref, while receive pages, which are handed over through grant-table page transfer, are reclaimed with gnttab_end_foreign_transfer_ref (see xennet_uninit in the netfront driver, which calls xennet_release_tx_bufs and xennet_release_rx_bufs).
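  For reference, xennet_end_access on the netfront side is a thin wrapper (the granted page is freed as a side effect of ending the access):

static void xennet_end_access(int ref, void *page)
{
    /* This frees the page as a side-effect. */
    if (ref != GRANT_INVALID_REF)
        gnttab_end_foreign_access(ref, 0, (unsigned long)page);
}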
  

  Finally, let's look at the net_device_ops interface that netback provides:
  static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats= netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};
  static int net_open(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif)) {
      __netif_up(netif);
      netif_start_queue(dev);
    }
    return 0;
}

static int net_close(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif))
      __netif_down(netif);
    netif_stop_queue(dev);
    return 0;
}

  __netif_up and __netif_down enable/disable the interrupt. In __netif_up, if there are packets to send, the xen_netif is added to the global net_schedule_list and maybe_schedule_tx_action is then called to decide whether to fire tasklet_schedule(&net_tx_tasklet) and transmit the packets. A sketch of the two helpers follows.
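  A sketch of the two helpers, following the 2.6.31 netback (netif_schedule_work is the piece that puts the xen_netif onto net_schedule_list and calls maybe_schedule_tx_action; details hedged):

static void __netif_up(struct xen_netif *netif)
{
    enable_irq(netif->irq);
    netif_schedule_work(netif);     /* queue for tx processing if needed */
}

static void __netif_down(struct xen_netif *netif)
{
    disable_irq(netif->irq);
    netif_deschedule_work(netif);   /* drop off net_schedule_list */
}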
  

  netif_be_start_xmit is left for a later part of this analysis.
  

  

  

         Copyright notice: this article is the blogger's original work and may not be reproduced without the blogger's permission.