mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
(unsigned long long)block, nr, flags, inode);
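+ /* Read-ahead requests are only valid when they are also cached
+ * requests and an inode is provided to track completion against,
+ * so catch bad callers before any I/O is issued. */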
+ BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+ (!inode || !(flags & OCFS2_BH_CACHED)));
+
if (osb == NULL || osb->sb == NULL || bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
bh = bhs[i];
ignore_cache = 0;
+ /* There are three read-ahead cases here which we need to
+ * be concerned with. All three assume a buffer has
+ * previously been submitted with OCFS2_BH_READAHEAD
+ * and it hasn't yet completed I/O.
+ *
+ * 1) The current request is sync to disk. This rarely
+ * happens these days, and never when performance
+ * matters - the code can just wait on the buffer
+ * lock and re-submit.
+ *
+ * 2) The current request is cached, but not
+ * readahead. ocfs2_buffer_uptodate() will return
+ * false anyway, so we'll wind up waiting on the
+ * buffer lock to do I/O. We re-check the request
+ * after getting the lock to avoid a re-submit.
+ *
+ * 3) The current request is readahead (and so must
+ * also be a caching one). We short circuit if the
+ * buffer is locked (under I/O) and if it's in the
+ * uptodate cache. The re-check from #2 catches the
+ * case where the previous read-ahead completes just
+ * before our is-it-in-flight check.
+ */
+
if (flags & OCFS2_BH_CACHED &&
!ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
continue;
}
+ /* A read-ahead request was made - if the
+ * buffer is already under read-ahead from a
+ * previously submitted request, then we are
+ * done here. */
+ if ((flags & OCFS2_BH_READAHEAD)
+ && ocfs2_buffer_read_ahead(inode, bh))
+ continue;
+
lock_buffer(bh);
if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
continue;
#endif
}
+
+ /* Re-check ocfs2_buffer_uptodate() as a
+ * previously read-ahead buffer may have
+ * completed I/O while we were waiting for the
+ * buffer lock. */
+ if ((flags & OCFS2_BH_CACHED)
+ && !(flags & OCFS2_BH_READAHEAD)
+ && ocfs2_buffer_uptodate(inode, bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
- if (flags & OCFS2_BH_READAHEAD)
- submit_bh(READA, bh);
- else
- submit_bh(READ, bh);
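+ /* Even read-ahead requests go out as plain READs now; the
+ * read-ahead distinction is handled by the short circuit above
+ * and by skipping the wait below. */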
+ submit_bh(READ, bh);
continue;
}
}
for (i = (nr - 1); i >= 0; i--) {
bh = bhs[i];
- /* We know this can't have changed as we hold the
- * inode sem. Avoid doing any work on the bh if the
- * journal has it. */
- if (!buffer_jbd(bh))
- wait_on_buffer(bh);
-
- if (!buffer_uptodate(bh)) {
- /* Status won't be cleared from here on out,
- * so we can safely record this and loop back
- * to cleanup the other buffers. Don't need to
- * remove the clustered uptodate information
- * for this bh as it's not marked locally
- * uptodate. */
- status = -EIO;
- brelse(bh);
- bhs[i] = NULL;
- continue;
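+ /* Read-ahead I/O completes asynchronously, so don't block
+ * on it here. */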
+ if (!(flags & OCFS2_BH_READAHEAD)) {
+ /* We know this can't have changed as we hold the
+ * inode sem. Avoid doing any work on the bh if the
+ * journal has it. */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh)) {
+ /* Status won't be cleared from here on out,
+ * so we can safely record this and loop back
+ * to cleanup the other buffers. Don't need to
+ * remove the clustered uptodate information
+ * for this bh as it's not marked locally
+ * uptodate. */
+ status = -EIO;
+ brelse(bh);
+ bhs[i] = NULL;
+ continue;
+ }
}
+ /* Always set the buffer in the cache, even if it was
+ * a forced read or a read-ahead which hasn't yet
+ * completed. */
if (inode)
ocfs2_set_buffer_uptodate(inode, bh);
}
if (inode)
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
- mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n",
+ mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
(unsigned long long)block, nr,
- (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+ (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
bail:
int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
int error = 0;
- unsigned long offset, blk;
- int i, num, stored;
+ unsigned long offset, blk, last_ra_blk = 0;
+ int i, stored;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
int err;
struct inode *inode = filp->f_dentry->d_inode;
struct super_block * sb = inode->i_sb;
- int have_disk_lock = 0;
+ unsigned int ra_sectors = 16;
mlog_entry("dirino=%llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog_errno(error);
/* we haven't got any yet, so propagate the error. */
stored = error;
- goto bail;
+ goto bail_nolock;
}
- have_disk_lock = 1;
offset = filp->f_pos & (sb->s_blocksize - 1);
continue;
}
- /*
- * Do the readahead (8k)
- */
- if (!offset) {
- for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+ /* The idea here is to begin with 8k read-ahead and to stay
+ * 4k ahead of our current position.
+ *
+ * TODO: Use the pagecache for this. We just need to
+ * make sure it's cluster-safe... */
+ if (!last_ra_blk
+ || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
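+ /* ra_sectors is in 512-byte sectors; the shift converts it
+ * to filesystem blocks for ocfs2_bread(). With a 4k block
+ * size (s_blocksize_bits - 9 == 3), the initial 16 sectors
+ * prime 16 >> 3 == 2 blocks (8k) and the steady-state 8
+ * sectors refill 8 >> 3 == 1 block (4k) at a time. */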
+ for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
tmp = ocfs2_bread(inode, ++blk, &err, 1);
if (tmp)
brelse(tmp);
}
+ last_ra_blk = blk;
+ ra_sectors = 8;
}
revalidate:
stored = 0;
bail:
- if (have_disk_lock)
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_meta_unlock(inode, 0);
+bail_nolock:
mlog_exit(stored);
return stored;
}
/* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache. */
+ * the block is stored in our inode metadata cache.
+ *
+ * This can be called under lock_buffer()
+ */
int ocfs2_buffer_uptodate(struct inode *inode,
struct buffer_head *bh)
{
return ocfs2_buffer_cached(OCFS2_I(inode), bh);
}
+/*
+ * Determine whether a buffer is currently out on a read-ahead request.
+ * ip_io_mutex should be held to serialize submitters with the logic here.
+ */
+int ocfs2_buffer_read_ahead(struct inode *inode,
+ struct buffer_head *bh)
+{
+ return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
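+
+/* A rough caller-side sketch (illustrative only, assuming the
+ * ocfs2_read_blocks() signature shown at the top of this change):
+ * prime a block asynchronously, then read it for real later.
+ *
+ *	ocfs2_read_blocks(osb, blkno, 1, &bh,
+ *			  OCFS2_BH_CACHED | OCFS2_BH_READAHEAD, inode);
+ *	...
+ *	ocfs2_read_blocks(osb, blkno, 1, &bh, OCFS2_BH_CACHED, inode);
+ *
+ * If the read-ahead has already completed, the second call is
+ * satisfied without another submit; otherwise it waits on the
+ * buffer lock and the re-check there avoids a duplicate submit.
+ */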
+
/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
sector_t block)
*
* Note that this function may actually fail to insert the block if
* memory cannot be allocated. This is not fatal however (but may
- * result in a performance penalty) */
+ * result in a performance penalty).
+ *
+ * Readahead buffers can be passed in here before the I/O request is
+ * completed.
+ */
void ocfs2_set_buffer_uptodate(struct inode *inode,
struct buffer_head *bh)
{