Xen Block Device Architecture (4)


blkback/blktap

linux-2.6-kernel/drivers/xen/blkback
linux-2.6-kernel/drivers/xen/blktap
Both directories contain the driver code for the backend block device that runs in dom0.
The backend block device driver's initialization function is shown below; it is the central entry point:
static int __init blkif_init(void)
{
    int i, mmap_pages;
    int rc = 0;

    if (!xen_pv_domain())
        return -ENODEV;

    mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

    pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
                                    blkif_reqs, GFP_KERNEL);
    pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
                                    mmap_pages, GFP_KERNEL);
    pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);

    if (blkback_pagemap_init(mmap_pages))
        goto out_of_memory;

    if (!pending_reqs || !pending_grant_handles || !pending_pages) {
        rc = -ENOMEM;
        goto out_of_memory;
    }

    for (i = 0; i < mmap_pages; i++)
        pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

    rc = blkif_interface_init();
    if (rc)
        goto failed_init;

    memset(pending_reqs, 0, sizeof(pending_reqs));
    INIT_LIST_HEAD(&pending_free);

    for (i = 0; i < blkif_reqs; i++)
        list_add_tail(&pending_reqs[i].free_list, &pending_free);

    rc = blkif_xenbus_init();
    if (rc)
        goto failed_init;

    return 0;

 out_of_memory:
    printk(KERN_ERR "%s: out of memory\n", __func__);
 failed_init:
    kfree(pending_reqs);
    kfree(pending_grant_handles);
    free_empty_pages_and_pagevec(pending_pages, mmap_pages);
    return rc;
}

If this is not a PV domain, the function bails out with -ENODEV right away, because the backend device driver must run in a paravirtualized domain.
static int blkif_reqs = 64;
blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST is the size of the memory pool the backend allocates for in-flight requests; with the defaults that works out to 64 * 11 = 704 pages. blkif_reqs is tunable: raising it costs little memory and can improve I/O performance. Do not confuse this request pool with the I/O ring. The I/O ring is a single shared page used for frontend/backend communication; it carries only the request/response descriptors, not the data itself. With a one-page ring, at most 32 requests can be outstanding at once, while blkif_reqs can be far larger than 32. BLKIF_MAX_SEGMENTS_PER_REQUEST is the maximum number of segments a single request may carry; Xen's default is 11, and changing it is strongly discouraged. Each segment refers to at most one page (4KB) of data, i.e. up to 8 sectors of the usual 512 bytes, so one request covers at most 44KB.
Since the frontend and backend communicate through shared memory plus the grant table, these 704 pages are where the actual request/response data gets mapped: the backend needs one page per segment of every in-flight request, hence blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST pages.
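
To make the "32 outstanding requests" and "704 data pages" figures concrete, here is a minimal, self-contained sketch of the arithmetic. It is not kernel code; the 112-byte request size and the 64-byte shared-ring header are assumptions based on the blkif ring layout, and the rounding mirrors what the __RING_SIZE macro in xen/interface/io/ring.h does.

/* Standalone illustration of the blkif ring-size arithmetic. */
#include <stdio.h>

static unsigned int ring_entries(unsigned int page_size,
                                 unsigned int header_size,
                                 unsigned int entry_size)
{
    unsigned int n = (page_size - header_size) / entry_size;
    unsigned int pow2 = 1;

    /* __RING_SIZE rounds the entry count down to a power of two. */
    while (pow2 * 2 <= n)
        pow2 *= 2;
    return pow2;
}

int main(void)
{
    /* 4096-byte ring page, ~64-byte header, 112-byte request/response union
     * (the last two values are assumptions about the 64-bit blkif ABI). */
    printf("ring entries: %u\n", ring_entries(4096, 64, 112)); /* -> 32  */
    printf("data pool:    %d pages\n", 64 * 11);               /* -> 704 */
    return 0;
}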
pending_reqs is the pool of blkif_reqs request slots; it holds one pending_req_t structure per in-flight request.
pending_grant_handles holds one grant_handle_t per page, i.e. blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST entries; each grant_handle_t tracks the mapping of one granted page.
pending_pages points to the pool of mmap_pages empty pages used to map the granted data pages (see the indexing sketch below).
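
The two flat arrays are indexed by (request, segment). A hedged sketch of the helpers blkback.c uses for this; the names vaddr_pagenr, pending_page and pending_handle match my recollection of the source but should be treated as assumptions (pending_req_t is the per-request structure shown further below):

/* Sketch: the slot for segment `seg` of request `req` is
 * request_index * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg. */
static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
    return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

#define pending_page(req, seg)   pending_pages[vaddr_pagenr((req), (seg))]
#define pending_handle(req, seg) pending_grant_handles[vaddr_pagenr((req), (seg))]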
blkback_pagemap_init uses kzalloc to allocate the blkback_pagemap array.
blkif_interface_init is then called; through the slab allocator it creates a cache for blkif_t structures (a sketch follows the structure definition below). blkif_t looks like this:
typedef struct blkif_st {
    /* Unique identifier for this interface. */
    domid_t           domid;
    unsigned int      handle;
    /* Physical parameters of the comms window. */
    unsigned int      irq;
    /* Comms information. */
    enum blkif_protocol blk_protocol;
    union blkif_back_rings blk_rings;
    struct vm_struct *blk_ring_area;
    /* The VBD attached to this interface. */
    struct vbd        vbd;
    /* Back pointer to the backend_info. */
    struct backend_info *be;
    /* Private fields. */
    spinlock_t        blk_ring_lock;
    atomic_t          refcnt;

    wait_queue_head_t    wq;
    struct task_struct  *xenblkd;
    unsigned int         waiting_reqs;
    struct request_queue *plug;

    /* statistics */
    unsigned long       st_print;
    int                 st_rd_req;
    int                 st_wr_req;
    int                 st_oo_req;
    int                 st_br_req;
    int                 st_rd_sect;
    int                 st_wr_sect;

    wait_queue_head_t waiting_to_free;

    grant_handle_t shmem_handle;
    grant_ref_t    shmem_ref;
} blkif_t;
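
blkif_interface_init itself essentially just sets up that slab cache. A minimal hedged sketch, with the cache name and return convention assumed rather than quoted:

static struct kmem_cache *blkif_cachep;

int __init blkif_interface_init(void)
{
    /* One cache object per frontend/backend connection (blkif_t). */
    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
                                     0, 0, NULL);
    if (!blkif_cachep)
        return -ENOMEM;
    return 0;
}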

Finally, blkif_xenbus_init is called; it registers the blkback driver with xenbus via xenbus_register_backend:
static struct xenbus_driver blkback = {
    .name  = "vbd",
    .owner = THIS_MODULE,
    .ids   = blkback_ids,
    .probe = blkback_probe,
    .remove = blkback_remove,
    .otherend_changed = frontend_changed
};

int blkif_xenbus_init(void)
{
    return xenbus_register_backend(&blkback);
}

blkback_pagemap is an array of struct blkback_pagemap; blkback-pagemap.c is just a set of wrappers around this array, and the code is simple enough that it is not walked through here (a rough sketch of one entry is below).
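
For reference, roughly what one blkback_pagemap entry records; the field names below are from memory and should be treated as assumptions, not quoted source:

/* Sketch of one blkback_pagemap entry: which foreign domain, which backend
 * device and which grant reference a mapped page came from. */
struct blkback_pagemap {
    domid_t        domid;   /* frontend domain that granted the page */
    unsigned short busid;   /* backend device the page belongs to    */
    grant_ref_t    gref;    /* grant reference of the page           */
};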
blkback.c contains the core code of the backend block driver.
Inside blkback, the structure for a request in flight is:
typedef struct {
    blkif_t         *blkif;
    u64              id;
    int              nr_pages;
    atomic_t         pendcnt;
    unsigned short   operation;
    int              status;
    struct list_head free_list;
} pending_req_t;
pending_reqs is the array of pending_req_t entries and holds the requests currently being processed.
pending_free is the list of pre-allocated, currently unused pending_req_t entries, so alloc_req and free_req simply take an entry from, or return it to, pending_free; a hedged sketch of the pair follows.
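
A sketch of what alloc_req/free_req look like, assuming a pending_free_lock spinlock protecting the list and the pending_free_wq wait queue that blkif_schedule (shown later) sleeps on:

static pending_req_t *alloc_req(void)
{
    pending_req_t *req = NULL;
    unsigned long flags;

    spin_lock_irqsave(&pending_free_lock, flags);
    if (!list_empty(&pending_free)) {
        req = list_entry(pending_free.next, pending_req_t, free_list);
        list_del(&req->free_list);
    }
    spin_unlock_irqrestore(&pending_free_lock, flags);
    return req;                     /* NULL means the pool is exhausted */
}

static void free_req(pending_req_t *req)
{
    unsigned long flags;
    int was_empty;

    spin_lock_irqsave(&pending_free_lock, flags);
    was_empty = list_empty(&pending_free);
    list_add(&req->free_list, &pending_free);
    spin_unlock_irqrestore(&pending_free_lock, flags);
    if (was_empty)
        wake_up(&pending_free_wq);  /* blkif_schedule may be waiting for a slot */
}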
The blkback driver above is registered on xenbus by xenbus_register_backend. When a new backend vbd device is created, its probe callback blkback_probe is invoked:
static int blkback_probe(struct xenbus_device *dev,
                         const struct xenbus_device_id *id)
{
    int err;
    struct backend_info *be = kzalloc(sizeof(struct backend_info),
                                      GFP_KERNEL);
    if (!be) {
        xenbus_dev_fatal(dev, -ENOMEM,
                         "allocating backend structure");
        return -ENOMEM;
    }
    be->dev = dev;
    dev_set_drvdata(&dev->dev, be);

    be->blkif = blkif_alloc(dev->otherend_id);
    if (IS_ERR(be->blkif)) {
        err = PTR_ERR(be->blkif);
        be->blkif = NULL;
        xenbus_dev_fatal(dev, err, "creating block interface");
        goto fail;
    }

    /* setup back pointer */
    be->blkif->be = be;

    err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
                               "%s/%s", dev->nodename, "physical-device");
    if (err)
        goto fail;

    err = xenbus_switch_state(dev, XenbusStateInitWait);
    if (err)
        goto fail;

    return 0;

fail:
    DPRINTK("failed");
    blkback_remove(dev);
    return err;
}

blkback_probe first allocates a backend_info structure be, stores the struct xenbus_device *dev in be->dev, then allocates a blkif_t and stores it in be->blkif.
It then calls xenbus_watch_pathfmt to watch the xenstore node %nodename/physical-device; the watch callback is backend_changed.
Finally it calls xenbus_switch_state to move the device to XenbusStateInitWait.
frontend_changed is the callback in the blkback driver that runs whenever the frontend device changes state:
static void frontend_changed(struct xenbus_device *dev,
                             enum xenbus_state frontend_state)
{
    struct backend_info *be = dev_get_drvdata(&dev->dev);
    int err;

    DPRINTK("%s", xenbus_strstate(frontend_state));

    switch (frontend_state) {
    case XenbusStateInitialising:
        if (dev->state == XenbusStateClosed) {
            printk(KERN_INFO "%s: %s: prepare for reconnect\n",
                   __FUNCTION__, dev->nodename);
            xenbus_switch_state(dev, XenbusStateInitWait);
        }
        break;

    case XenbusStateInitialised:
    case XenbusStateConnected:
        /* Ensure we connect even when two watches fire in
           close succession and we miss the intermediate value
           of frontend_state. */
        if (dev->state == XenbusStateConnected)
            break;
        err = connect_ring(be);
        if (err)
            break;
        update_blkif_status(be->blkif);
        break;

    case XenbusStateClosing:
        blkif_disconnect(be->blkif);
        xenbus_switch_state(dev, XenbusStateClosing);
        break;

    case XenbusStateClosed:
        xenbus_switch_state(dev, XenbusStateClosed);
        if (xenbus_dev_is_online(dev))
            break;
        /* fall through if not online */
    case XenbusStateUnknown:
        device_unregister(&dev->dev);
        break;

    default:
        xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
                         frontend_state);
        break;
    }
}

The function mostly just moves the backend's state to track the frontend's. Note the step that sets up the shared ring between the two ends: when the frontend reaches XenbusStateInitialised or XenbusStateConnected, connect_ring(be) is called to connect the ring used to communicate with the frontend.
connect_ring reads "ring-ref", "event-channel" and "protocol" from xenstore and then calls blkif_map(be->blkif, ring_ref, evtchn); a sketch follows this paragraph.
blkif_map first calls alloc_vm_area(PAGE_SIZE) and stores the result in blkif->blk_ring_area; this reserves the 4KB virtual area for the ring shared by frontend and backend. It then calls map_frontend_page, which uses the grant table (with ring_ref) to map the ring page granted by the frontend into that area, so both ends share the same ring page. Finally it calls bind_interdomain_evtchn_to_irqhandler to set up an inter-domain event channel and bind its local port to the backend's irq.
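
A hedged sketch of connect_ring, assuming the xenstore keys named above; error handling is abbreviated and the protocol negotiation is only hinted at:

static int connect_ring(struct backend_info *be)
{
    struct xenbus_device *dev = be->dev;
    unsigned long ring_ref;
    unsigned int evtchn;
    char protocol[64] = "";
    int err;

    /* The frontend publishes these keys under its own xenstore directory. */
    err = xenbus_gather(XBT_NIL, dev->otherend,
                        "ring-ref", "%lu", &ring_ref,
                        "event-channel", "%u", &evtchn, NULL);
    if (err) {
        xenbus_dev_fatal(dev, err, "reading %s/ring-ref and event-channel",
                         dev->otherend);
        return err;
    }

    /* "protocol" is optional; it selects the native/x86_32/x86_64 layout. */
    err = xenbus_gather(XBT_NIL, dev->otherend,
                        "protocol", "%63s", protocol, NULL);
    if (err)
        strcpy(protocol, "unspecified, assuming native");
    /* ... set be->blkif->blk_protocol accordingly ... */

    /* Map the granted ring page and bind the event channel. */
    err = blkif_map(be->blkif, ring_ref, evtchn);
    if (err) {
        xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
                         ring_ref, evtchn);
        return err;
    }
    return 0;
}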
backend_changed is the watch callback that runs once the backend device is ready, i.e. when the hotplug scripts have written the physical-device node (or on hot-plug events):
/**
 * Callback received when the hotplug scripts have placed the physical-device
 * node.  Read it and the mode node, and create a vbd.  If the frontend is
 * ready, connect.
 */
static void backend_changed(struct xenbus_watch *watch,
                            const char **vec, unsigned int len)
{
    int err;
    unsigned major;
    unsigned minor;
    struct backend_info *be
        = container_of(watch, struct backend_info, backend_watch);
    struct xenbus_device *dev = be->dev;
    int cdrom = 0;
    char *device_type;

    DPRINTK("");

    err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
                       &major, &minor);
    if (XENBUS_EXIST_ERR(err)) {
        /* Since this watch will fire once immediately after it is
           registered, we expect this.  Ignore it, and wait for the
           hotplug scripts. */
        return;
    }
    if (err != 2) {
        xenbus_dev_fatal(dev, err, "reading physical-device");
        return;
    }

    if ((be->major || be->minor) &&
        ((be->major != major) || (be->minor != minor))) {
        printk(KERN_WARNING
               "blkback: changing physical device (from %x:%x to "
               "%x:%x) not supported.\n", be->major, be->minor,
               major, minor);
        return;
    }

    be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
    if (IS_ERR(be->mode)) {
        err = PTR_ERR(be->mode);
        be->mode = NULL;
        xenbus_dev_fatal(dev, err, "reading mode");
        return;
    }

    device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
    if (!IS_ERR(device_type)) {
        cdrom = strcmp(device_type, "cdrom") == 0;
        kfree(device_type);
    }

    if (be->major == 0 && be->minor == 0) {
        /* Front end dir is a number, which is used as the handle. */
        char *p = strrchr(dev->otherend, '/') + 1;
        long handle = simple_strtoul(p, NULL, 0);

        be->major = major;
        be->minor = minor;

        err = vbd_create(be->blkif, handle, major, minor,
                         (NULL == strchr(be->mode, 'w')), cdrom);
        if (err) {
            be->major = be->minor = 0;
            xenbus_dev_fatal(dev, err, "creating vbd structure");
            return;
        }

        err = xenvbd_sysfs_addif(dev);
        if (err) {
            vbd_free(&be->blkif->vbd);
            be->major = be->minor = 0;
            xenbus_dev_fatal(dev, err, "creating sysfs entries");
            return;
        }

        /* We're potentially connected now */
        update_blkif_status(be->blkif);
    }
}

From the xenbus_watch argument, container_of recovers the enclosing backend_info, and from that the xenbus_device; the rest is just talking to xenstore.
It reads the physical-device node under xenbus_device->nodename to obtain the backing device's major:minor numbers.
Still under that nodename it reads "mode" into backend_info->mode, and it reads "device-type" from the frontend's directory (dev->otherend) to decide whether the device is a cdrom.
It then calls vbd_create to create the backend vbd, i.e. the virtual block device (a rough sketch of struct vbd follows), and xenvbd_sysfs_addif() to create the sysfs entries.
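
For reference, roughly what the vbd structure filled in by vbd_create looks like; the exact field set is from memory of common.h and should be treated as an assumption:

/* Sketch of struct vbd: the backend's view of one exported virtual disk. */
struct vbd {
    blkif_vdev_t         handle;    /* device number the frontend uses        */
    unsigned char        readonly;  /* non-zero -> read-only export           */
    unsigned char        type;      /* VDISK_* flags (e.g. cdrom)             */
    u32                  pdevice;   /* physical major:minor in dom0           */
    struct block_device *bdev;      /* the real dom0 block device             */
    sector_t             size;      /* cached size, rechecked by vbd_resize() */
};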
Finally it calls update_blkif_status, which uses connect to try to attach to the corresponding frontend device:
static void update_blkif_status(blkif_t *blkif)
{
    int err;
    char name[TASK_COMM_LEN];

    /* Not ready to connect? */
    if (!blkif->irq || !blkif->vbd.bdev)
        return;

    /* Already connected? */
    if (blkif->be->dev->state == XenbusStateConnected)
        return;

    /* Attempt to connect: exit if we fail to. */
    connect(blkif->be);
    if (blkif->be->dev->state != XenbusStateConnected)
        return;

    err = blkback_name(blkif, name);
    if (err) {
        xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
        return;
    }

    err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
    if (err) {
        xenbus_dev_error(blkif->be->dev, err, "block flush");
        return;
    }
    invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);

    blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
    if (IS_ERR(blkif->xenblkd)) {
        err = PTR_ERR(blkif->xenblkd);
        blkif->xenblkd = NULL;
        xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
    }
}

update_blkif_status first calls connect, whose job is to write the disk parameters to xenstore, namely %nodename/sectors, %nodename/info and %nodename/sector-size, and then switch the device to XenbusStateConnected (a sketch of connect follows this paragraph; the state check above relies on that switch).
It then calls filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping) to flush dirty pages back to the block device, and invalidate_inode_pages2 to drop the block device's cached pages so the backend will not serve stale data from the page cache.
Finally it calls kthread_run to start a blkif_schedule kernel thread.
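
A hedged sketch of connect, assuming the xenstore keys listed above, the usual xenbus transaction retry pattern, and vbd_size/vbd_info/vbd_secsize as the helpers used to query the vbd:

static void connect(struct backend_info *be)
{
    struct xenbus_device *dev = be->dev;
    struct xenbus_transaction xbt;
    int err;

again:
    err = xenbus_transaction_start(&xbt);
    if (err) {
        xenbus_dev_fatal(dev, err, "starting transaction");
        return;
    }

    /* Publish the disk geometry for the frontend to read. */
    err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                        (unsigned long long)vbd_size(&be->blkif->vbd));
    if (err)
        goto abort;
    err = xenbus_printf(xbt, dev->nodename, "info", "%u",
                        vbd_info(&be->blkif->vbd));
    if (err)
        goto abort;
    err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
                        (unsigned long)vbd_secsize(&be->blkif->vbd));
    if (err)
        goto abort;

    err = xenbus_transaction_end(xbt, 0);
    if (err == -EAGAIN)
        goto again;
    if (err) {
        xenbus_dev_fatal(dev, err, "ending transaction");
        return;
    }

    /* update_blkif_status checks for this state after calling connect. */
    xenbus_switch_state(dev, XenbusStateConnected);
    return;

abort:
    xenbus_transaction_end(xbt, 1);
    xenbus_dev_fatal(dev, err, "writing %s", dev->nodename);
}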
These kernel threads are visible on every host, e.g.
[blkback.30.xvda]
blkif_schedule is one of the core functions of blkback:
int blkif_schedule(void *arg)
{
    blkif_t *blkif = arg;
    struct vbd *vbd = &blkif->vbd;

    blkif_get(blkif);

    if (debug_lvl)
        printk(KERN_DEBUG "%s: started\n", current->comm);

    while (!kthread_should_stop()) {
        if (try_to_freeze())
            continue;
        if (unlikely(vbd->size != vbd_size(vbd)))
            vbd_resize(blkif);

        wait_event_interruptible(
            blkif->wq,
            blkif->waiting_reqs || kthread_should_stop());
        wait_event_interruptible(
            pending_free_wq,
            !list_empty(&pending_free) || kthread_should_stop());

        blkif->waiting_reqs = 0;
        smp_mb(); /* clear flag *before* checking for work */

        if (do_block_io_op(blkif))
            blkif->waiting_reqs = 1;
        unplug_queue(blkif);

        if (log_stats && time_after(jiffies, blkif->st_print))
            print_stats(blkif);
    }

    if (log_stats)
        print_stats(blkif);
    if (debug_lvl)
        printk(KERN_DEBUG "%s: exiting\n", current->comm);

    blkif->xenblkd = NULL;
    blkif_put(blkif);

    return 0;
}

blkif_get and blkif_put increment and decrement blkif's reference count.
Each loop iteration checks kthread_should_stop to decide whether to exit. Otherwise the thread sleeps in wait_event_interruptible on blkif->wq until waiting_reqs is set (and on pending_free_wq until a free request slot is available). The wakeup comes from the event-channel interrupt: when the frontend notifies the backend, the interrupt handler sets waiting_reqs and wakes the thread, as in the sketch below.
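
A hedged sketch of that wakeup path, assuming the handler names blkif_notify_work and blkif_be_int used in the source (the irq is the one bound in blkif_map):

/* Mark that there is work and wake the per-device blkif_schedule thread. */
static void blkif_notify_work(blkif_t *blkif)
{
    blkif->waiting_reqs = 1;
    wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id)
{
    blkif_notify_work(dev_id);
    return IRQ_HANDLED;
}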
Processing then starts with do_block_io_op(blkif):
It compares req_cons with req_prod; if they differ, requests have been produced that have not yet been consumed, and it pulls them off the backend I/O ring blkif->blk_rings.
For each request, dispatch_rw_block_io is called according to req.operation; a sketch of the consumer loop follows.
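
A hedged sketch of the consumer loop in do_block_io_op; the per-protocol switch and some bookkeeping are omitted, and the macro names follow the standard Xen ring macros rather than being quoted from the file:

static int do_block_io_op(blkif_t *blkif)
{
    union blkif_back_rings *blk_rings = &blkif->blk_rings;
    struct blkif_request req;
    pending_req_t *pending_req;
    RING_IDX rc, rp;
    int more_to_do = 0;

    rc = blk_rings->common.req_cons;
    rp = blk_rings->common.sring->req_prod;
    rmb();                              /* see requests up to rp before reading them */

    while (rc != rp) {
        if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
            break;                      /* don't outrun the responses we still owe */

        pending_req = alloc_req();
        if (pending_req == NULL) {
            blkif->st_oo_req++;         /* pool exhausted: retry later */
            more_to_do = 1;
            break;
        }

        /* Copy the request out of the shared ring (protocol switch omitted). */
        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
        blk_rings->common.req_cons = ++rc;

        switch (req.operation) {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
        case BLKIF_OP_WRITE_BARRIER:
            dispatch_rw_block_io(blkif, &req, pending_req);
            break;
        default:
            /* Unknown op: respond with an error (make_response omitted here). */
            free_req(pending_req);
            break;
        }
    }
    return more_to_do;
}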

In dispatch_rw_block_io, the actual block-device I/O is ultimately issued with submit_bio:
vbd_translate fills in a phys_req preq, which includes the Linux block device structure preq.bdev. plug_queue records the block_device's request queue in the blkif_t (the plug field) so it can be unplugged later. bio structures are then built with bio_alloc/bio_add_page over the mapped data pages, and submit_bio hands each bio to the driver below; a heavily abbreviated sketch follows.
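
A heavily abbreviated, hedged sketch of the tail end of dispatch_rw_block_io. The grant-mapping hypercall, vbd_translate and all error paths are omitted; submit_segments is a hypothetical helper name (in the real driver this logic lives inline), and pending_page, plug_queue and the end_block_io_op completion callback are assumed from the discussion above. The submit_bio(rw, bio) signature is the 2.6-era API.

/* Sketch only: assumes the request's segments have already been grant-mapped
 * into pending_page(pending_req, i) and that preq was filled by vbd_translate. */
static void submit_segments(blkif_t *blkif, struct blkif_request *req,
                            pending_req_t *pending_req, struct phys_req *preq)
{
    struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    struct bio *bio = NULL;
    int i, nbio = 0;
    int operation = (req->operation == BLKIF_OP_READ) ? READ : WRITE;

    for (i = 0; i < req->nr_segments; i++) {
        unsigned int offset = req->seg[i].first_sect << 9;
        unsigned int len    = (req->seg[i].last_sect -
                               req->seg[i].first_sect + 1) << 9;

        /* Append the mapped page to the current bio; start a new bio when
         * the current one is full (or does not exist yet). */
        while (bio == NULL ||
               bio_add_page(bio, pending_page(pending_req, i),
                            len, offset) == 0) {
            bio = bio_alloc(GFP_KERNEL, req->nr_segments - i);
            bio->bi_bdev    = preq->bdev;
            bio->bi_private = pending_req;
            bio->bi_end_io  = end_block_io_op;   /* completion callback */
            bio->bi_sector  = preq->sector_number;
            biolist[nbio++] = bio;
        }
        preq->sector_number += len >> 9;
    }

    /* One completion per bio: end_block_io_op sends the response to the
     * frontend once pendcnt drops to zero. */
    atomic_set(&pending_req->pendcnt, nbio);

    plug_queue(blkif, preq->bdev);   /* remembered for the later unplug_queue() */
    for (i = 0; i < nbio; i++)
        submit_bio(operation, biolist[i]);
}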
