设为首页 收藏本站
查看: 2752|回复: 0

[经验分享] 7.2 Qemu/KVM 直接IO框架

[复制链接]

尚未签到

发表于 2015-10-10 12:28:42 | 显示全部楼层 |阅读模式


  7.2.1 qemu pci-assign模块
  虚拟机上的设备是由qemu创建出来的,对于直接io也是如此。 区别在于直接io时,qemu直接调用vm host上的硬件设备完成相应功能;而不需要更多的软件处理。
  static const TypeInfo assign_info = { (pci-assign.c)
  .name               = "kvm-pci-assign",
  .parent             = TYPE_PCI_DEVICE,
  .instance_size      = sizeof(AssignedDevice),
  .class_init         = assign_class_init,
  };
  (1) 初始化
  static int assigned_initfn(struct PCIDevice *pci_dev)
  {
  AssignedDevice *dev =DO_UPCAST(AssignedDevice, dev, pci_dev);
  //对config空间的虚拟寄存做初始化, 将寄存器的值存在软件变量dev 的emulate_config_read 和emulate_config_write中
  assigned_dev_emulate_config_read(dev, 0,PCI_CONFIG_SPACE_SIZE);
  assigned_dev_direct_config_read(dev, PCI_STATUS,2);
  。。。。。。。。。。
  //和真实的pci设备关联, 由于启动时会输入pci bus,device,func号,所以依据这些信息能得到pci device对应在vm host上的设备文件
  get_real_device(dev, dev->host.domain,dev->host.bus,
  dev->host.slot,dev->host.function)
  
  assigned_device_pci_cap_init(pci_dev)为pci_dev添加capability
  //增加msix的mmio处理回调assigned_dev_msix_mmio_ops
  assigned_dev_register_msix_mmio(dev);
  //为pci device的memory空间建立mmap
  assigned_dev_register_regions(dev->real_device.regions,
  dev->real_device.region_number, dev)};
  r = assign_device(dev); //调用kvm的KVM_ASSIGN_PCI_DEVICE,
  r = assign_intx(dev);//调用kvm 的KVM_ASSIGN_DEV_IRQ,管理中断
  ....
  }
  下面分析其中的关键函数:
  get_real_device ==》
  a. snprintf(dir, sizeof(dir),
  "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",r_seg, r_bus, r_dev, r_func);
  dev->config_fd = open(name, O_RDWR); //打开真实设备的config,并读出内容
  read(dev->config_fd,pci_dev->dev.config, pci_config_size(&pci_dev->dev));
  但对bar地址做特殊处理
  memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
  memset(&pci_dev->dev.config[PCI_ROM_ADDRESS],0, 4);
  b. 记录mmio信息到PCIRegion *rp;结构
  snprintf(name, sizeof(name),"%sresource", dir);
  f = fopen(name, "r");
  对每个bar做:
  fscanf(f, "%" SCNi64 "%" SCNi64 " %" SCNi64 "\n",&start, &end,&flags) ;
  rp = dev->regions + r;  rp->valid = 0;
  rp->resource_fd = -1;
  size = end - start + 1;
  snprintf(name, sizeof(name),"%sresource%d", dir, r);
  fd = open(name, O_RDWR);
  rp->resource_fd = fd;
  rp->type = flags; rp->valid = 1;  rp->base_addr = start; rp->size = size;
  pci_dev->v_addrs[r].region = rp;
  
  assigned_dev_register_regions==》
  a.  pci_dev->v_addrs.u.r_virtbase = mmap(NULL,cur_region->size,
  PROT_WRITE |PROT_READ, MAP_SHARED,
  cur_region->resource_fd, (off_t)0);
  b.分为mmio和pio的case 分开处理(下面仅分析mmio)关联mmio gpa到真实设备的hva:
  若mmio size < 0x1000(没有到一个内存page大小)
  则   memory_region_init_io(&pci_dev->v_addrs.real_iomem,
  &slow_bar_ops,&pci_dev->v_addrs,
  "assigned-dev-slow-bar", cur_region->size);
  否则用:void *virtbase = pci_dev->v_addrs.u.r_virtbase;
  memory_region_init_ram_ptr(&pci_dev->v_addrs.real_iomem,
  name, cur_region->size,virtbase);
  //当EPT建立好后,guest os访问gpa时就直接访问真实设备了,不会有vm-exit发生
  c.           assigned_dev_iomem_setup(&pci_dev->dev, i, cur_region->size);
  pci_register_bar((PCIDevice *)pci_dev, i, t,
  &pci_dev->v_addrs.container);
  
  assign_device ==>  kvm_device_pci_assign ==>
  kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data); (注意assgin时同时设置了host 与guest)
  
  assign_intx ==>
  a.   intx_route = pci_device_route_intx_to_irq(&dev->dev,dev->intpin); ==》
  pci_device_route_intx_to_irq(call piix3_route_intx_pin_to_irq)得到当前dev的irq信息
  b. deassign当前irq  ==》kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ,&assigned_irq);
  c.重新assign当前设置kvm_device_intx_assign(kvm_state,dev->dev_id, intx_host_msi,
  intx_route.irq);==》
  /*
   * Ask KVM to attach a host device interrupt to the guest.
   *
   * @s:         the VM's KVMState handle
   * @dev_id:    assigned-device id previously registered with KVM
   * @irq_type:  KVM_DEV_IRQ_* flags selecting host/guest irq type
   * @guest_irq: interrupt number as seen by the guest OS
   *
   * Returns the ioctl result (0 on success, negative errno on failure).
   */
  static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                     uint32_t irq_type, uint32_t guest_irq)
  {
      struct kvm_assigned_irq assigned_irq = {
          .assigned_dev_id = dev_id,
          .guest_irq       = guest_irq,
          .flags           = irq_type,
      };

      /* Kernels advertising KVM_CAP_ASSIGN_DEV_IRQ take the newer
       * KVM_ASSIGN_DEV_IRQ ioctl; fall back to the legacy
       * KVM_ASSIGN_IRQ ioctl otherwise. */
      if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
          return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
      } else {
          return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
      }
  }
  
  kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
  
  (2) Reset
  reset_assigned_device:
  a. 对于msix设备调用assigned_dev_update_msix(pci_dev);
  b. 真实设备reset
  snprintf(reset_file, sizeof(reset_file), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/reset",
  adev->host.domain,adev->host.bus, adev->host.slot, adev->host.function);
  
  fd = open(reset_file, O_WRONLY);
  ret = write(fd, reset, strlen(reset));
  c. assigned_dev_pci_write_config(pci_dev,PCI_COMMAND, 0, 1);
  
  assigned_dev_update_msi==>分msix和intx的case
  msix case: 1. virq =kvm_irqchip_add_msi_route(kvm_state, msg);
  2. kvm_device_msi_assign(kvm_state,assigned_dev->dev_id, virq);
  最终调用kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ,&assigned_irq);
  intx case:   assign_intx(assigned_dev);
  
  int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
  {
  ......
  virq = kvm_irqchip_get_virq(s); //软件分配一个空闲irq号
  
  kroute.gsi = virq;
  kroute.type = KVM_IRQ_ROUTING_MSI;
  kroute.flags = 0;
  kroute.u.msi.address_lo =(uint32_t)msg.address;
  kroute.u.msi.address_hi = msg.address>> 32;
  kroute.u.msi.data = msg.data;
  //  调用kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING,s->irq_routes);
  kvm_add_routing_entry(s, &kroute);
  return virq;
  
  7.2.2  kvm pci assign
  源码位于virt\assigned-dev.c:
  kvm_vm_ioctl_assigned_device==> case KVM_ASSIGN_PCI_DEVICE ==>
  kvm_vm_ioctl_assign_device==>
  a)  kvm_find_assigned_dev查看dev是否assigned,若已assigned,直接返回
  b)  pci_get_domain_bus_and_slot根据设备地址得到该设备的pci_device
  c)  probe_sysfs_permissions打开设备sysfs的访问权限,这样qemu能访问
  d)  pcidevice的相关初始化
  pci_enable_device(dev);
  pci_request_regions(dev, "kvm_assigned_device");
  pci_reset_function(dev);
  pci_save_state(dev);
  match->pci_saved_state =pci_store_saved_state(dev);
  e)  加入设备到assignedlist list_add(&match->list, &kvm->arch.assigned_dev_head);
  f)  若vm的iommu domain未建立则kvm_iommu_map_guest(kvm);
  g)  r =kvm_assign_device(kvm, match); 将设备关联到iommu
  
  kvm_iommu_map_guest ==》
  kvm->arch.iommu_domain= iommu_domain_alloc(&pci_bus_type);
  kvm_iommu_map_memslots(kvm);
  
  kvm_assign_device(virtio/iommu.c)==>
  a. r = iommu_attach_device(domain,&pdev->dev); //调用iommu关联设备
  
  kvm_iommu_map_memslots ==》
  slots =kvm_memslots(kvm);
  kvm_for_each_memslot(memslot, slots) {
  r = kvm_iommu_map_pages(kvm, memslot);
  if (r)
  break;
  }
  
  kvm_iommu_map_pages ==》
  对slot中的每个gfn
  a. iommu_iova_to_phys(domain,gfn_to_gpa(gfn)) 检查是否已建立iommu映射
  b. iommu_map(domain, gfn_to_gpa(gfn),pfn_to_hpa(pfn), page_size, flags);建立映射
  
  同时在qemu新增加memory映射时该函数也会被调用:
  __kvm_set_memory_region==》
  if((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
  r = kvm_iommu_map_pages(kvm, &new);
  return r;
  }
  
  7.2.3 kvm interrupt assign
  (1) 中断assign
  kvm_vm_ioctl_assigned_device==》 case KVM_ASSIGN_DEV_IRQ==>
  kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); ==>
  
  if (host_irq_type)
  r = assign_host_irq(kvm, match, host_irq_type);
  if (guest_irq_type)
  r = assign_guest_irq(kvm, match, assigned_irq,guest_irq_type);
  
  assign_host_irq ==> 分为了intx, msi,和msix三种case,我们以后仅分析msix的case:
  ==> assigned_device_enable_host_msix==>
  a. pci_enable_msix_exact(dev->dev, dev->host_msix_entries,dev->entries_nr);
  b. request_threaded_irq(dev->host_msix_entries.vector,
  kvm_assigned_dev_msix,
  kvm_assigned_dev_thread_msix,
  0,dev->irq_name, dev); //注册了中断处理函数
  
  assign_guest_irq ==》分为了intx, msi,和msix三种case,我们以后仅分析msix的case:
  a .id =kvm_request_irq_source_id(kvm); ==》
  b. assigned_device_enable_guest_msix
  
  assigned_device_enable_guest_msix(structkvm *kvm,
  struct kvm_assigned_dev_kernel *dev,
  struct kvm_assigned_irq *irq)
  {
  dev->guest_irq = irq->guest_irq;//guest_irq为guest os 的中断号
  dev->ack_notifier.gsi = -1;
  return 0;
  }
  
  
  (2) MSIX中断管理
  对于msix的guest irq号由assigned_dev_update_msi ==》kvm_irqchip_add_msi_route分配
  对应内核态为:
  kvm_vm_ioctl ==》case KVM_SET_GSI_ROUTING  ==> kvm_set_irq_routing (virt\irqchip.c) ==》
  setup_routing_entry ==》 kvm_set_routing_entry (irq_comm.c) ==>
  case KVM_IRQ_ROUTING_MSI
  e->set = kvm_set_msi;//中断注入回调函数
  e->msi.address_lo = ue->u.msi.address_lo;
  e->msi.address_hi = ue->u.msi.address_hi;
  e->msi.data = ue->u.msi.data;
  
  int kvm_set_msi(structkvm_kernel_irq_routing_entry *e,
  struct kvm *kvm, int irq_source_id, int level, boolline_status)
  {
  struct kvm_lapic_irq irq;
  kvm_set_msi_irq(e, &irq);
  return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
  }
  
  下面来看看host端中断处理:
  kvm_assigned_dev_raise_guest_irq(structkvm_assigned_dev_kernel *assigned_dev,
  int vector)
  {
  if (unlikely(assigned_dev->irq_requested_type &
  KVM_DEV_IRQ_GUEST_INTX)) {
  spin_lock(&assigned_dev->intx_mask_lock);
  if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
  kvm_set_irq(assigned_dev->kvm,
  assigned_dev->irq_source_id, vector, 1,
  false);
  spin_unlock(&assigned_dev->intx_mask_lock);
  } else
  kvm_set_irq(assigned_dev->kvm,assigned_dev->irq_source_id,
  vector, 1,false);
  }
  所以kvm_set_irq ==> kvm_set_msi(
  
  static irqreturn_tkvm_assigned_dev_msix(int irq, void *dev_id)
  {
  struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
  int index = find_index_from_host_irq(assigned_dev, irq);
  u32 vector;
  int ret = 0;
  
  if (index >= 0) {
  vector = assigned_dev->guest_msix_entries[index].vector;
  ret = kvm_set_irq_inatomic(assigned_dev->kvm,
  assigned_dev->irq_source_id,
  vector,1);
  }
  
  return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD :IRQ_HANDLED;
  }
  
  kvm_set_irq_inatomic==》kvm_set_msi_inatomic ==》 kvm_irq_delivery_to_apic_fast
  当真实中断发生时,向guest assigneddevice注入中断
  
  (3) intx中断管理
  assign_host_irq ==》 request_threaded_irq(dev->host_irq,irq_handler,
  kvm_assigned_dev_thread_intx, flags,
  dev->irq_name, dev);
  assign_guest_irq ==> caseintx
  /*
   * Record the guest-side INTx interrupt configuration for an
   * assigned device.
   *
   * @kvm: owning VM (unused here)
   * @dev: kernel-side assigned-device state to update
   * @irq: userspace request carrying the guest irq number
   *
   * Returns 0.
   */
  static int assigned_device_enable_guest_intx(struct kvm *kvm,
                                               struct kvm_assigned_dev_kernel *dev,
                                               struct kvm_assigned_irq *irq)
  {
      dev->guest_irq = irq->guest_irq;
      /* Level-triggered INTx needs an ack notifier on the guest gsi
       * (contrast with the MSI-X variant, which sets it to -1). */
      dev->ack_notifier.gsi = irq->guest_irq;
      return 0;
  }
  
  kvm_assigned_dev_thread_intx==》kvm_assigned_dev_raise_guest_irq ==> kvm_set_irq
  
  由此可知,kvm的中断虚拟化流程如下:
  (1) 同时注册真实设备的中断处理函数
  (2) 当中断发生时,根据真实设备中断号对应虚拟设备号,注入中断
  如果系统采用了irq remap机制,则host的中断不会产生,直接在guest os上产生中断。
  下一节将讨论iommu.
  
  除pci-assign外,另一种直接io方法,为vfio. 它与pci-assign的区别在于,vfio更多的虚拟化实现放在了qemu用户空间中实现。 但其底层仍然会使用iommu;本文就不详细分析vfio了。 其源代码位于:
  Qemu:  hw\vfio_pci.c
  Host driver:drivers/pci/vfio/
         版权声明:本文为博主原创文章,未经博主允许不得转载。

运维网声明 1、欢迎大家加入本站运维交流群:群②:261659950 群⑤:202807635 群⑦870801961 群⑧679858003
2、本站所有主题由该帖子作者发表,该帖子作者与运维网享有帖子相关版权
3、所有作品的著作权均归原作者享有,请您和我们一样尊重他人的著作权等合法权益。如果您对作品感到满意,请购买正版
4、禁止制作、复制、发布和传播具有反动、淫秽、色情、暴力、凶杀等内容的信息,一经发现立即删除。若您因此触犯法律,一切后果自负,我们对此不承担任何责任
5、所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其内容的准确性、可靠性、正当性、安全性、合法性等负责,亦不承担任何法律责任
6、所有作品仅供您个人学习、研究或欣赏,不得用于商业或者其他用途,否则,一切后果均由您自己承担,我们对此不承担任何法律责任
7、如涉及侵犯版权等问题,请您及时通知我们,我们将立即采取措施予以解决
8、联系人Email:admin@iyunv.com 网址:www.yunweiku.com

所有资源均系网友上传或者通过网络收集,我们仅提供一个展示、介绍、观摩学习的平台,我们不对其承担任何法律责任,如涉及侵犯版权等问题,请您及时通知我们,我们将立即处理,联系人Email:kefu@iyunv.com,QQ:1061981298 本贴地址:https://www.yunweiku.com/thread-125084-1-1.html 上篇帖子: J2ME对话-采访KVM之父AT 下篇帖子: KVM在嵌入式Linux上的移植
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

扫码加入运维网微信交流群X

扫码加入运维网微信交流群

扫描二维码加入运维网微信交流群,最新一手资源尽在官方微信交流群!快快加入我们吧...

扫描微信二维码查看详情

客服E-mail:kefu@iyunv.com 客服QQ:1061981298


QQ群⑦:运维网交流群⑦ QQ群⑧:运维网交流群⑧ k8s群:运维网kubernetes交流群


提醒:禁止发布任何违反国家法律、法规的言论与图片等内容;本站内容均来自个人观点与网络等信息,非本站认同之观点.


本站大部分资源是网友从网上搜集分享而来,其版权均归原作者及其网站所有,我们尊重他人的合法权益,如有内容侵犯您的合法权益,请及时与我们联系进行核实删除!



合作伙伴: 青云cloud

快速回复 返回顶部 返回列表