From: Chuck Lever Date: Wed, 30 Nov 2005 23:09:02 +0000 (-0500) Subject: NFS: support large reads and writes on the wire X-Git-Tag: v2.6.16-rc1~935^2~4^2~28 X-Git-Url: http://pilppa.com/gitweb/?a=commitdiff_plain;h=40859d7ee64ed6bfad8a4e93f9bb5c1074afadff;p=linux-2.6-omap-h63xx.git NFS: support large reads and writes on the wire Most NFS server implementations allow up to 64KB reads and writes on the wire. The Solaris NFS server allows up to a megabyte, for instance. Now the Linux NFS client supports transfer sizes up to 1MB, too. This will help reduce protocol and context switch overhead on read/write intensive NFS workloads, and support larger atomic read and write operations on servers that support them. Test-plan: Connectathon and iozone on mount point with wsize=rsize>32768 over TCP. Tests with NFS over UDP to verify the maximum RPC payload size cap. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f69d95aa78b..fd7ac5e841c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -154,6 +154,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int struct list_head *list; struct nfs_direct_req *dreq; unsigned int reads = 0; + unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); if (!dreq) @@ -167,7 +168,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int list = &dreq->list; for(;;) { - struct nfs_read_data *data = nfs_readdata_alloc(); + struct nfs_read_data *data = nfs_readdata_alloc(rpages); if (unlikely(!data)) { while (!list_empty(list)) { @@ -431,7 +432,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode, struct nfs_writeverf first_verf; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!wdata) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4e6558df54b..acde2c5725b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -221,10 +221,10 @@ nfs_calc_block_size(u64 tsize) static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { - if (bsize < 1024) - bsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE) - bsize = NFS_MAX_FILE_IO_BUFFER_SIZE; + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } @@ -307,20 +307,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; - if (server->wsize > max_rpc_payload) - server->wsize = max_rpc_payload; - + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } if (sb->s_blocksize == 0) sb->s_blocksize = nfs_block_bits(server->wsize, diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 1b272a135a3..985cc53b8dd 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -296,8 +296,8 @@ static int __init root_nfs_name(char *name) nfs_port = -1; nfs_data.version = NFS_MOUNT_VERSION; nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE; + nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; + nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; nfs_data.acregmin = 3; nfs_data.acregmax = 60; nfs_data.acdirmin = 30; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 21486242c3d..05eb43fadf8 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -83,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, int result; struct nfs_read_data *rdata; - rdata = nfs_readdata_alloc(); + rdata = nfs_readdata_alloc(1); if (!rdata) return -ENOMEM; @@ -283,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode) nbytes = req->wb_bytes; for(;;) { - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(1); if (!data) goto out_bad; INIT_LIST_HEAD(&data->pages); @@ -339,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode) if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) return nfs_pagein_multi(head, inode); - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); if (!data) goto out_bad; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80bc4ea1b82..1ce0c200df1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -89,18 +89,33 @@ static mempool_t *nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -static inline struct nfs_write_data *nfs_commit_alloc(void) +static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); + if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_commit_mempool); + p = NULL; + } + } } return p; } static inline void nfs_commit_free(struct nfs_write_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } @@ -167,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, int result, written = 0; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(1); if (!wdata) return -ENOMEM; @@ -909,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how) nbytes = req->wb_bytes; for (;;) { - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(1); if (!data) goto out_bad; list_add(&data->pages, &list); @@ -973,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE) return nfs_flush_multi(head, inode, how); - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!data) goto out_bad; @@ -1241,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head, * Commit dirty pages */ static int -nfs_commit_list(struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(); + data = nfs_commit_alloc(NFS_SERVER(inode)->wpages); if (!data) goto out_bad; @@ -1351,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&nfsi->req_lock); if (res) { - error = nfs_commit_list(&head, how); + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4dff705d2ff..d38010ba647 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -38,9 +38,6 @@ # define NFS_DEBUG #endif -#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768 -#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096 - /* Default timeout values */ #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) @@ -462,18 +459,33 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page) */ extern mempool_t *nfs_wdata_mempool; -static inline struct nfs_write_data *nfs_writedata_alloc(void) +static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); + if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_wdata_mempool); + p = NULL; + } + } } return p; } static inline void nfs_writedata_free(struct nfs_write_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } @@ -492,16 +504,33 @@ extern void nfs_readdata_release(void *data); */ extern mempool_t *nfs_rdata_mempool; -static inline struct nfs_read_data *nfs_readdata_alloc(void) +static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); - if (p) + + if (p) { memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_rdata_mempool); + p = NULL; + } + } + } return p; } static inline void nfs_readdata_free(struct nfs_read_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b8b0eed98ec..9f422fd8767 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -4,6 +4,16 @@ #include #include +/* + * To change the maximum rsize and wsize supported by the NFS client, adjust + * NFS_MAX_FILE_IO_SIZE. 64KB is a typical maximum, but some servers can + * support a megabyte or more. The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +#define NFS_MAX_FILE_IO_SIZE (1048576U) +#define NFS_DEF_FILE_IO_SIZE (4096U) +#define NFS_MIN_FILE_IO_SIZE (1024U) + struct nfs4_fsid { __u64 major; __u64 minor; @@ -215,12 +225,6 @@ struct nfs4_delegreturnargs { /* * Arguments to the read call. */ - -#define NFS_READ_MAXIOV (9U) -#if (NFS_READ_MAXIOV > (MAX_IOVEC -2)) -#error "NFS_READ_MAXIOV is too large" -#endif - struct nfs_readargs { struct nfs_fh * fh; struct nfs_open_context *context; @@ -239,11 +243,6 @@ struct nfs_readres { /* * Arguments to the write call. */ -#define NFS_WRITE_MAXIOV (9U) -#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2)) -#error "NFS_WRITE_MAXIOV is too large" -#endif - struct nfs_writeargs { struct nfs_fh * fh; struct nfs_open_context *context; @@ -674,6 +673,8 @@ struct nfs4_server_caps_res { struct nfs_page; +#define NFS_PAGEVEC_SIZE (8U) + struct nfs_read_data { int flags; struct rpc_task task; @@ -682,13 +683,14 @@ struct nfs_read_data { struct nfs_fattr fattr; /* fattr storage */ struct list_head pages; /* Coalesced read requests */ struct nfs_page *req; /* multi ops per nfs_page */ - struct page *pagevec[NFS_READ_MAXIOV]; + struct page **pagevec; struct nfs_readargs args; struct nfs_readres res; #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif void (*complete) (struct nfs_read_data *, int); + struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; struct nfs_write_data { @@ -700,13 +702,14 @@ struct nfs_write_data { struct nfs_writeverf verf; struct list_head pages; /* Coalesced requests we wish to flush */ struct nfs_page *req; /* multi ops per nfs_page */ - struct page *pagevec[NFS_WRITE_MAXIOV]; + struct page **pagevec; struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif void (*complete) (struct nfs_write_data *, int); + struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; struct nfs_access_entry; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5da968729cf..5676794ee34 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -134,11 +134,6 @@ xdr_adjust_iovec(struct kvec *iov, u32 *p) return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base); } -/* - * Maximum number of iov's we use. - */ -#define MAX_IOVEC (12) - /* * XDR buffer helper functions */