[PATCH] md: improve raid1 "IO Barrier" concept

author NeilBrown <neilb@suse.de>

Fri, 6 Jan 2006 08:20:12 +0000 (00:20 -0800)

committer Linus Torvalds <torvalds@g5.osdl.org>

Fri, 6 Jan 2006 16:34:01 +0000 (08:34 -0800)
author NeilBrown <neilb@suse.de>
Fri, 6 Jan 2006 08:20:12 +0000 (00:20 -0800)
committer Linus Torvalds <torvalds@g5.osdl.org>
Fri, 6 Jan 2006 16:34:01 +0000 (08:34 -0800)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c

index 229d7b20429721f0939fd84d207bff5441127d4b..f5204149ab65a0c2517664c5a3e26a390eeae8cc 100644 (file)
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -51,6 +51,8 @@ static mdk_personality_t raid1_personality;
  
  static void unplug_slaves(mddev_t *mddev);
  
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
  
  static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
  {
@@ -160,20 +162,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
  
  static inline void free_r1bio(r1bio_t *r1_bio)
  {
-       unsigned long flags;
-
         conf_t *conf = mddev_to_conf(r1_bio->mddev);
  
         /*
          * Wake up any possible resync thread that waits for the device
          * to go idle.
          */
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       if (!--conf->nr_pending) {
-               wake_up(&conf->wait_idle);
-               wake_up(&conf->wait_resume);
-       }
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       allow_barrier(conf);
  
         put_all_bios(conf, r1_bio);
         mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +177,10 @@ static inline void free_r1bio(r1bio_t *r1_bio)
  static inline void put_buf(r1bio_t *r1_bio)
  {
         conf_t *conf = mddev_to_conf(r1_bio->mddev);
-       unsigned long flags;
  
         mempool_free(r1_bio, conf->r1buf_pool);
  
-       spin_lock_irqsave(&conf->resync_lock, flags);
-       if (!conf->barrier)
-               BUG();
-       --conf->barrier;
-       wake_up(&conf->wait_resume);
-       wake_up(&conf->wait_idle);
-
-       if (!--conf->nr_pending) {
-               wake_up(&conf->wait_idle);
-               wake_up(&conf->wait_resume);
-       }
-       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       lower_barrier(conf);
  }
  
  static void reschedule_retry(r1bio_t *r1_bio)
@@ -210,6 +193,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
         list_add(&r1_bio->retry_list, &conf->retry_list);
         spin_unlock_irqrestore(&conf->device_lock, flags);
  
+       wake_up(&conf->wait_barrier);
         md_wakeup_thread(mddev->thread);
  }
  
@@ -593,30 +577,83 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
         return ret;
  }
  
-/*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down.  This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'.  When that returns there
+ *    is no background IO happening.  It must arrange to call
+ *    allow_barrier when it has finished its IO.
+ * background IO must call raise_barrier.  Once that returns
+ *    there is no normal IO happening.  It must arrange to call
+ *    lower_barrier when the particular background IO completes.
   */
  #define RESYNC_DEPTH 32
  
-static void device_barrier(conf_t *conf, sector_t sect)
+static void raise_barrier(conf_t *conf)
  {
         spin_lock_irq(&conf->resync_lock);
-       wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-                           conf->resync_lock, raid1_unplug(conf->mddev->queue));
-       
-       if (!conf->barrier++) {
-               wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-                                   conf->resync_lock, raid1_unplug(conf->mddev->queue));
-               if (conf->nr_pending)
-                       BUG();
+
+       /* Wait until no block IO is waiting */
+       wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+                           conf->resync_lock,
+                           raid1_unplug(conf->mddev->queue));
+
+       /* block any new IO from starting */
+       conf->barrier++;
+
+       /* Now wait for all pending IO to complete */
+       wait_event_lock_irq(conf->wait_barrier,
+                           !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+                           conf->resync_lock,
+                           raid1_unplug(conf->mddev->queue));
+
+       spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&conf->resync_lock, flags);
+       conf->barrier--;
+       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+       spin_lock_irq(&conf->resync_lock);
+       if (conf->barrier) {
+               conf->nr_waiting++;
+               wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+                                   conf->resync_lock,
+                                   raid1_unplug(conf->mddev->queue));
+               conf->nr_waiting--;
         }
-       wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-                           conf->resync_lock, raid1_unplug(conf->mddev->queue));
-       conf->next_resync = sect;
+       conf->nr_pending++;
         spin_unlock_irq(&conf->resync_lock);
  }
  
+static void allow_barrier(conf_t *conf)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&conf->resync_lock, flags);
+       conf->nr_pending--;
+       spin_unlock_irqrestore(&conf->resync_lock, flags);
+       wake_up(&conf->wait_barrier);
+}
+
+
  /* duplicate the data pages for behind I/O */
  static struct page **alloc_behind_pages(struct bio *bio)
  {
@@ -678,10 +715,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
          */
         md_write_start(mddev, bio); /* wait on superblock update early */
  
-       spin_lock_irq(&conf->resync_lock);
-       wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-       conf->nr_pending++;
-       spin_unlock_irq(&conf->resync_lock);
+       wait_barrier(conf);
  
         disk_stat_inc(mddev->gendisk, ios[rw]);
         disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -909,13 +943,8 @@ static void print_conf(conf_t *conf)
  
  static void close_sync(conf_t *conf)
  {
-       spin_lock_irq(&conf->resync_lock);
-       wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-                           conf->resync_lock,  raid1_unplug(conf->mddev->queue));
-       spin_unlock_irq(&conf->resync_lock);
-
-       if (conf->barrier) BUG();
-       if (waitqueue_active(&conf->wait_idle)) BUG();
+       wait_barrier(conf);
+       allow_barrier(conf);
  
         mempool_destroy(conf->r1buf_pool);
         conf->r1buf_pool = NULL;
@@ -1317,12 +1346,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 return sync_blocks;
         }
         /*
-        * If there is non-resync activity waiting for us then
-        * put in a delay to throttle resync.
+        * If there is non-resync activity waiting for a turn,
+        * and resync is going fast enough,
+        * then let it through before starting on this new sync request.
          */
-       if (!go_faster && waitqueue_active(&conf->wait_resume))
+       if (!go_faster && conf->nr_waiting)
                 msleep_interruptible(1000);
-       device_barrier(conf, sector_nr + RESYNC_SECTORS);
+
+       raise_barrier(conf);
+
+       conf->next_resync = sector_nr;
  
         /*
          * If reconstructing, and >1 working disc,
@@ -1355,10 +1388,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
  
         r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
  
-       spin_lock_irq(&conf->resync_lock);
-       conf->nr_pending++;
-       spin_unlock_irq(&conf->resync_lock);
-
         r1_bio->mddev = mddev;
         r1_bio->sector = sector_nr;
         r1_bio->state = 0;
@@ -1542,8 +1571,7 @@ static int run(mddev_t *mddev)
                 mddev->recovery_cp = MaxSector;
  
         spin_lock_init(&conf->resync_lock);
-       init_waitqueue_head(&conf->wait_idle);
-       init_waitqueue_head(&conf->wait_resume);
+       init_waitqueue_head(&conf->wait_barrier);
  
         bio_list_init(&conf->pending_bio_list);
         bio_list_init(&conf->flushing_bio_list);
@@ -1714,11 +1742,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
         }
         memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
  
-       spin_lock_irq(&conf->resync_lock);
-       conf->barrier++;
-       wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-                           conf->resync_lock, raid1_unplug(mddev->queue));
-       spin_unlock_irq(&conf->resync_lock);
+       raise_barrier(conf);
  
         /* ok, everything is stopped */
         oldpool = conf->r1bio_pool;
@@ -1738,12 +1762,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
         conf->raid_disks = mddev->raid_disks = raid_disks;
  
         conf->last_used = 0; /* just make sure it is in-range */
-       spin_lock_irq(&conf->resync_lock);
-       conf->barrier--;
-       spin_unlock_irq(&conf->resync_lock);
-       wake_up(&conf->wait_resume);
-       wake_up(&conf->wait_idle);
-
+       lower_barrier(conf);
  
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         md_wakeup_thread(mddev->thread);
@@ -1758,18 +1777,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
  
         switch(state) {
         case 1:
-               spin_lock_irq(&conf->resync_lock);
-               conf->barrier++;
-               wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-                                   conf->resync_lock, raid1_unplug(mddev->queue));
-               spin_unlock_irq(&conf->resync_lock);
+               raise_barrier(conf);
                 break;
         case 0:
-               spin_lock_irq(&conf->resync_lock);
-               conf->barrier--;
-               spin_unlock_irq(&conf->resync_lock);
-               wake_up(&conf->wait_resume);
-               wake_up(&conf->wait_idle);
+               lower_barrier(conf);
                 break;
         }
         if (mddev->thread) {
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h

index 292b98f2b408dd375154c9bf24569c5289c0f6ee..c5567425253333bb5607098167a0c9b5d6a0eba9 100644 (file)
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -45,6 +45,7 @@ struct r1_private_data_s {
  
         spinlock_t              resync_lock;
         int                     nr_pending;
+       int                     nr_waiting;
         int                     barrier;
         sector_t                next_resync;
         int                     fullsync;  /* set to 1 if a full sync is needed,
@@ -52,8 +53,7 @@ struct r1_private_data_s {
                                             * Cleared when a sync completes.
                                             */
  
-       wait_queue_head_t       wait_idle;
-       wait_queue_head_t       wait_resume;
+       wait_queue_head_t       wait_barrier;
  
         struct pool_info        *poolinfo;
author	NeilBrown <neilb@suse.de>
	Fri, 6 Jan 2006 08:20:12 +0000 (00:20 -0800)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Fri, 6 Jan 2006 16:34:01 +0000 (08:34 -0800)
drivers/md/raid1.c		patch \| blob \| history
include/linux/raid/raid1.h		patch \| blob \| history