hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork()

author Mel Gorman <mel@csn.ul.ie>

Thu, 24 Jul 2008 04:27:23 +0000 (21:27 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
author Mel Gorman <mel@csn.ul.ie>
Thu, 24 Jul 2008 04:27:23 +0000 (21:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c

index aeabf80f81a5d38561768ea95d1b544605e5dcc4..1576bbecd084575e2cb97fffe6846c2039309dd5 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -103,9 +103,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         ret = -ENOMEM;
         len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
  
-       if (vma->vm_flags & VM_MAYSHARE &&
-           hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-                                 len >> HPAGE_SHIFT))
+       if (hugetlb_reserve_pages(inode,
+                               vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+                               len >> HPAGE_SHIFT, vma))
                 goto out;
  
         ret = 0;
@@ -942,7 +942,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
                 goto out_dentry;
  
         error = -ENOMEM;
-       if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+       if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL))
                 goto out_inode;
  
         d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index a79e80b689d803d161a4ca694370aeec1ba25629..185b14c9f021fb9d3fb76ae6a1296f8e71206a73 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -17,6 +17,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
         return vma->vm_flags & VM_HUGETLB;
  }
  
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
  int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
  int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
  int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
@@ -30,7 +31,8 @@ int hugetlb_report_node_meminfo(int, char *);
  unsigned long hugetlb_total_pages(void);
  int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, int write_access);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+                                               struct vm_area_struct *vma);
  void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
  
  extern unsigned long max_huge_pages;
@@ -58,6 +60,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
  {
         return 0;
  }
+
+static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
  static inline unsigned long hugetlb_total_pages(void)
  {
         return 0;
diff --git a/kernel/fork.c b/kernel/fork.c

index adefc1131f274082960de7f375a67d4949cdfe27..552c8d8e77ad5e3fb43f606081c57023bb25e4c6 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -33,6 +33,7 @@
  #include <linux/cpu.h>
  #include <linux/cgroup.h>
  #include <linux/security.h>
+#include <linux/hugetlb.h>
  #include <linux/swap.h>
  #include <linux/syscalls.h>
  #include <linux/jiffies.h>
@@ -306,6 +307,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                         spin_unlock(&file->f_mapping->i_mmap_lock);
                 }
  
+               /*
+                * Clear hugetlb-related page reserves for children. This only
+                * affects MAP_PRIVATE mappings. Faults generated by the child
+                * are not guaranteed to succeed, even if read-only
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       reset_vma_resv_huge_pages(tmp);
+
                 /*
                  * Link in the new vma and copy the page table entries.
                  */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index a4dbba8965f3a19abb88b356e3da4b8f61d26895..0af500db3632e809956b4acc8f8580724efa628d 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,69 @@ static int hugetlb_next_nid;
   */
  static DEFINE_SPINLOCK(hugetlb_lock);
  
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ */
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               return (unsigned long)vma->vm_private_data;
+       return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+                                                       unsigned long reserve)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       vma->vm_private_data = (void *)reserve;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED) {
+               /* Shared mappings always use reserves */
+               resv_huge_pages--;
+       } else {
+               /*
+                * Only the process that called mmap() has reserves for
+                * private mappings.
+                */
+               if (vma_resv_huge_pages(vma)) {
+                       resv_huge_pages--;
+                       reserve = (unsigned long)vma->vm_private_data - 1;
+                       vma->vm_private_data = (void *)reserve;
+               }
+       }
+}
+
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED)
+               return 0;
+       if (!vma_resv_huge_pages(vma))
+               return 0;
+       return 1;
+}
+
  static void clear_huge_page(struct page *page, unsigned long addr)
  {
         int i;
@@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
         struct zone *zone;
         struct zoneref *z;
  
+       /*
+        * A child process with MAP_PRIVATE mappings created by their parent
+        * have no page reserves. This check ensures that reservations are
+        * not "stolen". The child may still get SIGKILLed
+        */
+       if (!vma_has_private_reserves(vma) &&
+                       free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                 MAX_NR_ZONES - 1, nodemask) {
                 nid = zone_to_nid(zone);
@@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
                         list_del(&page->lru);
                         free_huge_pages--;
                         free_huge_pages_node[nid]--;
-                       if (vma && vma->vm_flags & VM_MAYSHARE)
-                               resv_huge_pages--;
+                       decrement_hugepage_resv_vma(vma);
+
                         break;
                 }
         }
@@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
         }
  }
  
-
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-                                               unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                   unsigned long addr)
  {
         struct page *page;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned int chg = 0;
+
+       /*
+        * Processes that did not create the mapping will have no reserves and
+        * will not have accounted against quota. Check that the quota can be
+        * made before satisfying the allocation
+        */
+       if (!vma_has_private_reserves(vma)) {
+               chg = 1;
+               if (hugetlb_get_quota(inode->i_mapping, chg))
+                       return ERR_PTR(-ENOSPC);
+       }
  
         spin_lock(&hugetlb_lock);
         page = dequeue_huge_page_vma(vma, addr);
         spin_unlock(&hugetlb_lock);
-       return page ? page : ERR_PTR(-VM_FAULT_OOM);
-}
  
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-                                               unsigned long addr)
-{
-       struct page *page = NULL;
-
-       if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-               return ERR_PTR(-VM_FAULT_SIGBUS);
-
-       spin_lock(&hugetlb_lock);
-       if (free_huge_pages > resv_huge_pages)
-               page = dequeue_huge_page_vma(vma, addr);
-       spin_unlock(&hugetlb_lock);
         if (!page) {
                 page = alloc_buddy_huge_page(vma, addr);
                 if (!page) {
-                       hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+                       hugetlb_put_quota(inode->i_mapping, chg);
                         return ERR_PTR(-VM_FAULT_OOM);
                 }
         }
-       return page;
-}
  
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                   unsigned long addr)
-{
-       struct page *page;
-       struct address_space *mapping = vma->vm_file->f_mapping;
-
-       if (vma->vm_flags & VM_MAYSHARE)
-               page = alloc_huge_page_shared(vma, addr);
-       else
-               page = alloc_huge_page_private(vma, addr);
+       set_page_refcounted(page);
+       set_page_private(page, (unsigned long) mapping);
  
-       if (!IS_ERR(page)) {
-               set_page_refcounted(page);
-               set_page_private(page, (unsigned long) mapping);
-       }
         return page;
  }
  
@@ -757,6 +814,13 @@ out:
         return ret;
  }
  
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+       unsigned long reserve = vma_resv_huge_pages(vma);
+       if (reserve)
+               hugetlb_acct_memory(-reserve);
+}
+
  /*
   * We cannot handle pagefaults against hugetlb pages at all.  They cause
   * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  
  struct vm_operations_struct hugetlb_vm_ops = {
         .fault = hugetlb_vm_op_fault,
+       .close = hugetlb_vm_op_close,
  };
  
  static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end)
         return chg;
  }
  
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+int hugetlb_reserve_pages(struct inode *inode,
+                                       long from, long to,
+                                       struct vm_area_struct *vma)
  {
         long ret, chg;
  
-       chg = region_chg(&inode->i_mapping->private_list, from, to);
+       /*
+        * Shared mappings base their reservation on the number of pages that
+        * are already allocated on behalf of the file. Private mappings need
+        * to reserve the full area even if read-only as mprotect() may be
+        * called to make the mapping read-write. Assume !vma is a shm mapping
+        */
+       if (!vma || vma->vm_flags & VM_SHARED)
+               chg = region_chg(&inode->i_mapping->private_list, from, to);
+       else {
+               chg = to - from;
+               set_vma_resv_huge_pages(vma, chg);
+       }
+
         if (chg < 0)
                 return chg;
  
@@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
                 hugetlb_put_quota(inode->i_mapping, chg);
                 return ret;
         }
-       region_add(&inode->i_mapping->private_list, from, to);
+       if (!vma || vma->vm_flags & VM_SHARED)
+               region_add(&inode->i_mapping->private_list, from, to);
         return 0;
  }
author	Mel Gorman <mel@csn.ul.ie>
	Thu, 24 Jul 2008 04:27:23 +0000 (21:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
fs/hugetlbfs/inode.c		patch \| blob \| history
include/linux/hugetlb.h		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history