]> pilppa.com Git - linux-2.6-omap-h63xx.git/commitdiff
ext4: New inode/block allocation algorithms for flex_bg filesystems
authorTheodore Ts'o <tytso@mit.edu>
Thu, 12 Mar 2009 16:18:34 +0000 (12:18 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Thu, 12 Mar 2009 16:18:34 +0000 (12:18 -0400)
The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option.  It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled).  The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks.  This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs:  stat /home/tytso/Maildir/cur
Inode: 1844562   Type: directory    Mode:  0700   Flags: 0x81000
Generation: 1132745781    Version: 0x00000000:0000ad71
User: 15806   Group: 15806   Size: 1060864
File ACL: 0    Directory ACL: 0
Links: 2   Blockcount: 2072
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/ext4_i.h
fs/ext4/extents.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/mballoc.c

index 2cbfc0b04d370192d9c950868059b7366d4e88ee..096456c8559b48ed9536963fc51cb757f53a95e9 100644 (file)
@@ -827,6 +827,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MIN_BATCH_TIME        0
 #define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */
 
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4
+
 /*
  * Structure of a directory entry
  */
index 2d516c0a22afe8d1eb2c70929e9398d76660c3e8..4ce2187123aad818de707f4e82a988fdb8ee14a3 100644 (file)
@@ -122,6 +122,9 @@ struct ext4_inode_info {
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
 
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+
        /* allocation reservation info for delalloc */
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
index e0aa4fe4f596fcb11549de818436edced6887f3b..aa3431856c9a9eba5aeccdd8194e783e42d528c2 100644 (file)
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+       ext4_group_t block_group;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        int depth;
 
        if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        }
 
        /* OK. use inode's group */
-       bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+       block_group = ei->i_block_group;
+       if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+               /*
+                * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+                * block groups per flexgroup, reserve the first block 
+                * group for directories and special files.  Regular 
+                * files will start at the second block group.  This
+                * tends to speed up directory access and improves 
+                * fsck times.
+                */
+               block_group &= ~(flex_size-1);
+               if (S_ISREG(inode->i_mode))
+                       block_group++;
+       }
+       bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
                le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+       /*
+        * If we are doing delayed allocation, we don't need take
+        * colour into account.
+        */
+       if (test_opt(inode->i_sb, DELALLOC))
+               return bg_start;
+
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
index ae3eb57dccdd09a949972107ec3db40d0a852016..617f5a2d800a2e84c4c97ee21f5e609e1423e9af 100644 (file)
@@ -410,6 +410,43 @@ out:
        return 0;
 }
 
+struct orlov_stats {
+       __u32 free_inodes;
+       __u32 free_blocks;
+       __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                      int flex_size, struct orlov_stats *stats)
+{
+       struct ext4_group_desc *desc;
+       ext4_group_t            ngroups = EXT4_SB(sb)->s_groups_count;
+       int                     i;
+
+       stats->free_inodes = 0;
+       stats->free_blocks = 0;
+       stats->used_dirs = 0;
+
+       g *= flex_size;
+
+       for (i = 0; i < flex_size; i++) {
+               if (g >= ngroups)
+                       break;
+               desc = ext4_get_group_desc(sb, g++, NULL);
+               if (!desc)
+                       continue;
+
+               stats->free_inodes += ext4_free_inodes_count(sb, desc);
+               stats->free_blocks += ext4_free_blks_count(sb, desc);
+               stats->used_dirs += ext4_used_dirs_count(sb, desc);
+       }
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -425,35 +462,34 @@ out:
  * it has too many directories already (max_dirs) or
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
  * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
  */
 
-#define INODE_COST 64
-#define BLOCK_COST 256
-
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-                               ext4_group_t *group)
+                           ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_super_block *es = sbi->s_es;
        ext4_group_t ngroups = sbi->s_groups_count;
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
-       ext4_fsblk_t blocks_per_dir;
        unsigned int ndirs;
-       int max_debt, max_dirs, min_inodes;
+       int max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-       ext4_group_t i;
+       ext4_group_t i, grp, g;
        struct ext4_group_desc *desc;
+       struct orlov_stats stats;
+       int flex_size = ext4_flex_bg_size(sbi);
+
+       if (flex_size > 1) {
+               ngroups = (ngroups + flex_size - 1) >>
+                       sbi->s_log_groups_per_flex;
+               parent_group >>= sbi->s_log_groups_per_flex;
+       }
 
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
@@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        do_div(avefreeb, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
-       if ((parent == sb->s_root->d_inode) ||
-           (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+       if (S_ISDIR(mode) &&
+           ((parent == sb->s_root->d_inode) ||
+            (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
                int best_ndir = inodes_per_group;
-               ext4_group_t grp;
                int ret = -1;
 
                get_random_bytes(&grp, sizeof(grp));
                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
-                       grp = (parent_group + i) % ngroups;
-                       desc = ext4_get_group_desc(sb, grp, NULL);
-                       if (!desc || !ext4_free_inodes_count(sb, desc))
+                       g = (parent_group + i) % ngroups;
+                       get_orlov_stats(sb, g, flex_size, &stats);
+                       if (!stats.free_inodes)
                                continue;
-                       if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+                       if (stats.used_dirs >= best_ndir)
                                continue;
-                       if (ext4_free_inodes_count(sb, desc) < avefreei)
+                       if (stats.free_inodes < avefreei)
                                continue;
-                       if (ext4_free_blks_count(sb, desc) < avefreeb)
+                       if (stats.free_blocks < avefreeb)
                                continue;
-                       *group = grp;
+                       grp = g;
                        ret = 0;
-                       best_ndir = ext4_used_dirs_count(sb, desc);
+                       best_ndir = stats.used_dirs;
+               }
+               if (ret)
+                       goto fallback;
+       found_flex_bg:
+               if (flex_size == 1) {
+                       *group = grp;
+                       return 0;
+               }
+
+               /*
+                * We pack inodes at the beginning of the flexgroup's
+                * inode tables.  Block allocation decisions will do
+                * something similar, although regular files will
+                * start at 2nd block group of the flexgroup.  See
+                * ext4_ext_find_goal() and ext4_find_near().
+                */
+               grp *= flex_size;
+               for (i = 0; i < flex_size; i++) {
+                       if (grp+i >= sbi->s_groups_count)
+                               break;
+                       desc = ext4_get_group_desc(sb, grp+i, NULL);
+                       if (desc && ext4_free_inodes_count(sb, desc)) {
+                               *group = grp+i;
+                               return 0;
+                       }
                }
-               if (ret == 0)
-                       return ret;
                goto fallback;
        }
 
-       blocks_per_dir = ext4_blocks_count(es) - freeb;
-       do_div(blocks_per_dir, ndirs);
-
        max_dirs = ndirs / ngroups + inodes_per_group / 16;
-       min_inodes = avefreei - inodes_per_group / 4;
-       min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
-       max_debt = EXT4_BLOCKS_PER_GROUP(sb);
-       max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-       if (max_debt * INODE_COST > inodes_per_group)
-               max_debt = inodes_per_group / INODE_COST;
-       if (max_debt > 255)
-               max_debt = 255;
-       if (max_debt == 0)
-               max_debt = 1;
+       min_inodes = avefreei - inodes_per_group*flex_size / 4;
+       if (min_inodes < 1)
+               min_inodes = 1;
+       min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+       /*
+        * Start looking in the flex group where we last allocated an
+        * inode for this parent directory
+        */
+       if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+               parent_group = EXT4_I(parent)->i_last_alloc_group;
+               if (flex_size > 1)
+                       parent_group >>= sbi->s_log_groups_per_flex;
+       }
 
        for (i = 0; i < ngroups; i++) {
-               *group = (parent_group + i) % ngroups;
-               desc = ext4_get_group_desc(sb, *group, NULL);
-               if (!desc || !ext4_free_inodes_count(sb, desc))
-                       continue;
-               if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+               grp = (parent_group + i) % ngroups;
+               get_orlov_stats(sb, grp, flex_size, &stats);
+               if (stats.used_dirs >= max_dirs)
                        continue;
-               if (ext4_free_inodes_count(sb, desc) < min_inodes)
+               if (stats.free_inodes < min_inodes)
                        continue;
-               if (ext4_free_blks_count(sb, desc) < min_blocks)
+               if (stats.free_blocks < min_blocks)
                        continue;
-               return 0;
+               goto found_flex_bg;
        }
 
 fallback:
+       ngroups = sbi->s_groups_count;
+       avefreei = freei / ngroups;
+       parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
-               *group = (parent_group + i) % ngroups;
-               desc = ext4_get_group_desc(sb, *group, NULL);
+               grp = (parent_group + i) % ngroups;
+               desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
-                       ext4_free_inodes_count(sb, desc) >= avefreei)
+                   ext4_free_inodes_count(sb, desc) >= avefreei) {
+                       *group = grp;
                        return 0;
+               }
        }
 
        if (avefreei) {
@@ -542,12 +604,51 @@ fallback:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-                               ext4_group_t *group)
+                           ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        struct ext4_group_desc *desc;
-       ext4_group_t i;
+       ext4_group_t i, last;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+       /*
+        * Try to place the inode is the same flex group as its
+        * parent.  If we can't find space, use the Orlov algorithm to
+        * find another flex group, and store that information in the
+        * parent directory's inode information so that use that flex
+        * group for future allocations.
+        */
+       if (flex_size > 1) {
+               int retry = 0;
+
+       try_again:
+               parent_group &= ~(flex_size-1);
+               last = parent_group + flex_size;
+               if (last > ngroups)
+                       last = ngroups;
+               for  (i = parent_group; i < last; i++) {
+                       desc = ext4_get_group_desc(sb, i, NULL);
+                       if (desc && ext4_free_inodes_count(sb, desc)) {
+                               *group = i;
+                               return 0;
+                       }
+               }
+               if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+                       retry = 1;
+                       parent_group = EXT4_I(parent)->i_last_alloc_group;
+                       goto try_again;
+               }
+               /*
+                * If this didn't work, use the Orlov search algorithm
+                * to find a new flex group; we pass in the mode to
+                * avoid the topdir algorithms.
+                */
+               *group = parent_group + flex_size;
+               if (*group > ngroups)
+                       *group = 0;
+               return find_group_orlov(sb, parent, group, mode);
+       }
 
        /*
         * Try to place the inode in its parent directory
@@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
 
-       if (sbi->s_log_groups_per_flex) {
+       if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
                if (ret2 == -1) {
-                       ret2 = find_group_other(sb, dir, &group);
+                       ret2 = find_group_other(sb, dir, &group, mode);
                        if (ret2 == 0 && once)
                                once = 0;
                                printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                if (test_opt(sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
                else
-                       ret2 = find_group_orlov(sb, dir, &group);
+                       ret2 = find_group_orlov(sb, dir, &group, mode);
        } else
-               ret2 = find_group_other(sb, dir, &group);
+               ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
+       EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;
@@ -894,6 +996,7 @@ got:
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
+       ei->i_last_alloc_group = ~0;
 
        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
index 71d3ecd5db798ce6bcab06331ca7ec4819f62ea1..25811507d2b01ea87b398aed96a3e63ebf611401 100644 (file)
@@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+       ext4_group_t block_group;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
-       bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+       block_group = ei->i_block_group;
+       if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+               block_group &= ~(flex_size-1);
+               if (S_ISREG(inode->i_mode))
+                       block_group++;
+       }
+       bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+       /*
+        * If we are doing delayed allocation, we don't need take
+        * colour into account.
+        */
+       if (test_opt(inode->i_sb, DELALLOC))
+               return bg_start;
+
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -4287,6 +4302,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
+       ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
index b038188bd039e8f8c2762c13318c078fd353b17a..b0d6022eaa67ff32aadf81f91916a1617d730da0 100644 (file)
@@ -1726,6 +1726,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 {
        unsigned free, fragments;
        unsigned i, bits;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_desc *desc;
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
@@ -1747,6 +1748,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
                        return 0;
 
+               /* Avoid using the first bg of a flexgroup for data files */
+               if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+                   (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+                   ((group % flex_size) == 0))
+                       return 0;
+
                bits = ac->ac_sb->s_blocksize_bits + 1;
                for (i = ac->ac_2order; i <= bits; i++)
                        if (grp->bb_counters[i] > 0)