md: introduce get_priority_stripe() to improve raid456 write performance

author Dan Williams <dan.j.williams@intel.com>

Mon, 28 Apr 2008 09:15:53 +0000 (02:15 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 28 Apr 2008 15:58:42 +0000 (08:58 -0700)
author Dan Williams <dan.j.williams@intel.com>
Mon, 28 Apr 2008 09:15:53 +0000 (02:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 28 Apr 2008 15:58:42 +0000 (08:58 -0700)
diff --git a/Documentation/md.txt b/Documentation/md.txt

index 396cdd982c26505ee39a577a64a7fb2c7472985e..a8b430627473aa243995ab6f6e173b9cf1ff819e 100644 (file)
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -450,3 +450,9 @@ These currently include
        there are upper and lower limits (32768, 16).  Default is 128.
    strip_cache_active (currently raid5 only)
        number of active entries in the stripe cache
+  preread_bypass_threshold (currently raid5 only)
+      number of times a stripe requiring preread will be bypassed by
+      a stripe that does not require preread.  Defaults to 1 for
+      fairness.  Setting this to 0 disables bypass accounting and
+      requires preread stripes to wait until all full-width stripe-
+      writes are complete.  Valid values are 0 to stripe_cache_size.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 4efec467e2f178f13860c53ee83b0e2dd6d15d8f..45eead6086475b3e7c5386f6008912f2aad963b6 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
  #define STRIPE_SHIFT           (PAGE_SHIFT - 9)
  #define STRIPE_SECTORS         (STRIPE_SIZE>>9)
  #define        IO_THRESHOLD            1
+#define BYPASS_THRESHOLD       1
  #define NR_HASH                        (PAGE_SIZE / sizeof(struct hlist_head))
  #define HASH_MASK              (NR_HASH - 1)
  
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh)
  
         might_sleep();
  
+       set_bit(STRIPE_IO_STARTED, &sh->state);
         for (i = disks; i--; ) {
                 int rw;
                 struct bio *bi;
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
                                 locked++;
                         }
                 }
+               if (locked + 1 == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&sh->raid_conf->pending_full_writes);
         } else {
                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
                                         STRIPE_SECTORS, 0, 0);
         }
  
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
                                                         0);
                         }
                 }
+
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
                                 s->locked++;
                                 set_bit(R5_Wantwrite, &sh->dev[i].flags);
                         }
+               if (s->locked == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&conf->pending_full_writes);
                 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
                 set_bit(STRIPE_INSYNC, &sh->state);
  
@@ -3094,6 +3109,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 else
                         continue;
  
+               set_bit(STRIPE_IO_STARTED, &sh->state);
+
                 bi = &sh->dev[i].req;
  
                 bi->bi_rw = rw;
@@ -3164,7 +3181,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                         clear_bit(STRIPE_DELAYED, &sh->state);
                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                 atomic_inc(&conf->preread_active_stripes);
-                       list_add_tail(&sh->lru, &conf->handle_list);
+                       list_add_tail(&sh->lru, &conf->hold_list);
                 }
         } else
                 blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3459,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
         }
  }
  
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded.  In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in flight i/o.  The bypass_count drops by bypass_threshold (min
+ * zero) when the hold_list head changes or a hold_list stripe is returned, and
+ * is zeroed if the hold_list is empty when a handle_list stripe is taken.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+       struct stripe_head *sh;
+
+       pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+                 __func__,
+                 list_empty(&conf->handle_list) ? "empty" : "busy",
+                 list_empty(&conf->hold_list) ? "empty" : "busy",
+                 atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+       if (!list_empty(&conf->handle_list)) { /* handle_list always goes first */
+               sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+
+               if (list_empty(&conf->hold_list))
+                       conf->bypass_count = 0; /* nothing is being bypassed */
+               else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+                       if (conf->hold_list.next == conf->last_hold)
+                               conf->bypass_count++; /* same held head passed over again */
+                       else {
+                               conf->last_hold = conf->hold_list.next; /* track the new head */
+                               conf->bypass_count -= conf->bypass_threshold;
+                               if (conf->bypass_count < 0)
+                                       conf->bypass_count = 0;
+                       }
+               }
+       } else if (!list_empty(&conf->hold_list) &&
+                  ((conf->bypass_threshold && /* a threshold of 0 disables this test */
+                    conf->bypass_count > conf->bypass_threshold) ||
+                   atomic_read(&conf->pending_full_writes) == 0)) { /* or no full writes pending */
+               sh = list_entry(conf->hold_list.next,
+                               typeof(*sh), lru);
+               conf->bypass_count -= conf->bypass_threshold;
+               if (conf->bypass_count < 0)
+                       conf->bypass_count = 0;
+       } else
+               return NULL; /* no stripe is eligible right now */
+
+       list_del_init(&sh->lru); /* detach from whichever list supplied it */
+       atomic_inc(&sh->count);
+       BUG_ON(atomic_read(&sh->count) != 1); /* count must have been 0 before the increment */
+       return sh;
+}
  
  static int make_request(struct request_queue *q, struct bio * bi)
  {
@@ -3914,7 +3983,6 @@ static void raid5d(mddev_t *mddev)
         handled = 0;
         spin_lock_irq(&conf->device_lock);
         while (1) {
-               struct list_head *first;
                 struct bio *bio;
  
                 if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +4004,12 @@ static void raid5d(mddev_t *mddev)
                         handled++;
                 }
  
-               if (list_empty(&conf->handle_list)) {
+               sh = __get_priority_stripe(conf);
+
+               if (!sh) {
                         async_tx_issue_pending_all();
                         break;
                 }
-
-               first = conf->handle_list.next;
-               sh = list_entry(first, struct stripe_head, lru);
-
-               list_del_init(first);
-               atomic_inc(&sh->count);
-               BUG_ON(atomic_read(&sh->count)!= 1);
                 spin_unlock_irq(&conf->device_lock);
                 
                 handled++;
@@ -4010,6 +4073,42 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                 raid5_show_stripe_cache_size,
                                 raid5_store_stripe_cache_size);
  
+static ssize_t
+raid5_show_preread_threshold(mddev_t *mddev, char *page)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       if (!conf)
+               return 0;
+       /* emit the current bypass threshold as one decimal line */
+       return sprintf(page, "%d\n", conf->bypass_threshold);
+}
+
+static ssize_t
+raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       char *end;
+       unsigned long new; /* full width: an int would truncate or wrap large input */
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+       if (!conf)
+               return -ENODEV;
+       /* parse a decimal threshold; one trailing newline is tolerated */
+       new = simple_strtoul(page, &end, 10);
+       if (!*page || (*end && *end != '\n')) /* empty input or trailing junk */
+               return -EINVAL;
+       if (new > conf->max_nr_stripes) /* accepted range is 0..max_nr_stripes */
+               return -EINVAL;
+       conf->bypass_threshold = new;
+       return len; /* report the whole write as consumed */
+}
+
+static struct md_sysfs_entry
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
+                                       S_IRUGO | S_IWUSR,
+                                       raid5_show_preread_threshold, /* formats the current value */
+                                       raid5_store_preread_threshold); /* parses and stores a new value */
+
  static ssize_t
  stripe_cache_active_show(mddev_t *mddev, char *page)
  {
@@ -4026,6 +4125,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
  static struct attribute *raid5_attrs[] =  {
         &raid5_stripecache_size.attr,
         &raid5_stripecache_active.attr,
+       &raid5_preread_bypass_threshold.attr,
         NULL,
  };
  static struct attribute_group raid5_attrs_group = {
@@ -4130,12 +4230,14 @@ static int run(mddev_t *mddev)
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->hold_list);
         INIT_LIST_HEAD(&conf->delayed_list);
         INIT_LIST_HEAD(&conf->bitmap_list);
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
         atomic_set(&conf->active_aligned_reads, 0);
+       conf->bypass_threshold = BYPASS_THRESHOLD;
  
         pr_debug("raid5: run(%s) called.\n", mdname(mddev));
  
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h

index 93678f57ccbe280d3b681aa5c31a2c5049c32a6a..f0827d31ae6fab718158499c10aaa50979527b17 100644 (file)
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -252,6 +252,8 @@ struct r6_state {
  #define        STRIPE_EXPANDING        9
  #define        STRIPE_EXPAND_SOURCE    10
  #define        STRIPE_EXPAND_READY     11
+#define        STRIPE_IO_STARTED       12 /* do not count towards 'bypass_count' */
+#define        STRIPE_FULL_WRITE       13 /* all blocks are set to be overwritten */
  /*
   * Operations flags (in issue order)
   */
@@ -316,12 +318,17 @@ struct raid5_private_data {
         int                     previous_raid_disks;
  
         struct list_head        handle_list; /* stripes needing handling */
+       struct list_head        hold_list; /* preread ready stripes */
         struct list_head        delayed_list; /* stripes that have plugged requests */
         struct list_head        bitmap_list; /* stripes delaying awaiting bitmap update */
         struct bio              *retry_read_aligned; /* currently retrying aligned bios   */
         struct bio              *retry_read_aligned_list; /* aligned bios retry list  */
         atomic_t                preread_active_stripes; /* stripes with scheduled io */
         atomic_t                active_aligned_reads;
+       atomic_t                pending_full_writes; /* full write backlog */
+       int                     bypass_count; /* bypassed prereads */
+       int                     bypass_threshold; /* preread nice */
+       struct list_head        *last_hold; /* detect hold_list promotions */
  
         atomic_t                reshape_stripes; /* stripes with pending writes for reshape */
         /* unfortunately we need two cache names as we temporarily have
author	Dan Williams <dan.j.williams@intel.com>
	Mon, 28 Apr 2008 09:15:53 +0000 (02:15 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 28 Apr 2008 15:58:42 +0000 (08:58 -0700)
Documentation/md.txt		patch \| blob \| history
drivers/md/raid5.c		patch \| blob \| history
include/linux/raid/raid5.h		patch \| blob \| history