From 5da7ca86078964cbfe6c83efc1205904587706fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:10:46 -0800 Subject: [PATCH] [PATCH] Add NUMA policy support for huge pages. The huge_zonelist() function in the memory policy layer provides an list of zones ordered by NUMA distance. The hugetlb layer will walk that list looking for a zone that has available huge pages but is also in the nodeset of the current cpuset. This patch does not contain the folding of find_or_alloc_huge_page() that was controversial in the earlier discussion. Signed-off-by: Christoph Lameter Cc: Andi Kleen Acked-by: William Lee Irwin III Cc: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++-- include/linux/mempolicy.h | 8 ++++++++ mm/hugetlb.c | 24 ++++++++++++++---------- mm/mempolicy.c | 39 ++++++++++++++++++++++++++++++--------- 4 files changed, 54 insertions(+), 21 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1056717ee50..68d82ad6b17 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(void); +struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ do { } while (0) -#define alloc_huge_page() ({ NULL; }) +#define alloc_huge_page(vma, addr) ({ NULL; }) #define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 8b67cf837ca..817db642711 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -156,6 +156,8 @@ extern void numa_default_policy(void); extern void numa_policy_init(void); extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); extern struct mempolicy default_policy; +extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, + unsigned long addr); #else @@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old, { } +static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, + unsigned long addr) +{ + return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); +} + #endif /* CONFIG_NUMA */ #endif /* __KERNEL__ */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e93bd63462f..eb405565949 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include @@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page) free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct vm_area_struct *vma, + unsigned long address) { int nid = numa_node_id(); struct page *page = NULL; - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; + struct zonelist *zonelist = huge_zonelist(vma, address); struct zone **z; for (z = zonelist->zones; *z; z++) { @@ -87,13 +90,13 @@ void free_huge_page(struct page *page) spin_unlock(&hugetlb_lock); } -struct page *alloc_huge_page(void) +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(); + page = dequeue_huge_page(vma, addr); if (!page) { spin_unlock(&hugetlb_lock); return NULL; @@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count) spin_lock(&hugetlb_lock); try_to_free_low(count); while (count < nr_huge_pages) { - struct page *page = dequeue_huge_page(); + struct page *page = dequeue_huge_page(NULL, 0); if (!page) break; update_and_free_page(page); @@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx, int shared) +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, struct address_space *mapping, + unsigned long idx, int shared) { struct page *page; int err; @@ -378,7 +382,7 @@ retry: if (hugetlb_get_quota(mapping)) goto out; - page = alloc_huge_page(); + page = alloc_huge_page(vma, addr); if (!page) { hugetlb_put_quota(mapping); goto out; @@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_get(old_page); - new_page = alloc_huge_page(); + new_page = alloc_huge_page(vma, address); if (!new_page) { page_cache_release(old_page); @@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx, + page = find_or_alloc_huge_page(vma, address, mapping, idx, vma->vm_flags & VM_SHARED); if (!page) goto out; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72f402cc9c9..45c51ac6344 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol, return nid; } +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +/* Return a zonelist suitable for a huge page allocation. */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + if (pol->policy == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + } + return zonelist_policy(GFP_HIGHUSER, pol); +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; - if (vma) { - unsigned long off; - off = vma->vm_pgoff; - off += (addr - vma->vm_start) >> PAGE_SHIFT; - nid = offset_il_node(pol, vma, off); - } else { - /* fall back to process interleaving */ - nid = interleave_nodes(pol); - } + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); -- 2.41.1