From: David Gibson Date: Wed, 22 Jun 2005 00:14:44 +0000 (-0700) Subject: [PATCH] Hugepage consolidation X-Git-Tag: v2.6.13-rc4~130^2~202^2~153 X-Git-Url: http://pilppa.com/gitweb/?a=commitdiff_plain;h=63551ae0feaaa23807ebea60de1901564bbef32e;p=linux-2.6-omap-h63xx.git [PATCH] Hugepage consolidation A lot of the code in arch/*/mm/hugetlbpage.c is quite similar. This patch attempts to consolidate a lot of the code across the arch's, putting the combined version in mm/hugetlb.c. There are a couple of uglyish hacks in order to covert all the hugepage archs, but the result is a very large reduction in the total amount of code. It also means things like hugepage lazy allocation could be implemented in one place, instead of six. Tested, at least a little, on ppc64, i386 and x86_64. Notes: - this patch changes the meaning of set_huge_pte() to be more analagous to set_pte() - does SH4 need s special huge_ptep_get_and_clear()?? Acked-by: William Lee Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 171fc925e1e..5aa06001a4b 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -18,7 +18,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -30,7 +30,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -42,21 +42,6 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); - set_pte(page_table, entry); -} - /* * This function checks for proper alignment of input addr and len parameters. */ @@ -69,77 +54,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - set_pte(dst_pte, entry); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vpfn, vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - vpfn = vaddr/PAGE_SIZE; - while (vaddr < vma->vm_end && remainder) { - - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++vpfn; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) @@ -204,83 +118,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, } #endif -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +void hugetlb_clean_stale_pgtable(pte_t *pte) { - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t pte, *ptep; + pmd_t *pmd = (pmd_t *) pte; struct page *page; - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - ptep = huge_pte_offset(mm, address); - if (!ptep) - continue; - pte = ptep_get_and_clear(mm, address, ptep); - if (pte_none(pte)) - continue; - page = pte_page(pte); - put_page(page); - } - add_mm_counter(mm ,rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; + page = pmd_page(*pmd); + pmd_clear(pmd); + dec_page_state(nr_page_table_pages); + page_cache_release(page); } /* x86_64 also uses this file */ diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index df08ae7634b..e0a776a3044 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; -static pte_t * +pte_t * huge_pte_alloc (struct mm_struct *mm, unsigned long addr) { unsigned long taddr = htlbpage_to_page(addr); @@ -43,7 +43,7 @@ huge_pte_alloc (struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t * +pte_t * huge_pte_offset (struct mm_struct *mm, unsigned long addr) { unsigned long taddr = htlbpage_to_page(addr); @@ -67,23 +67,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } -static void -set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); - set_pte(page_table, entry); - return; -} /* * This function checks for proper alignment of input addr and len parameters. */ @@ -99,68 +82,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - set_pte(dst_pte, entry); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; -nomem: - return -ENOMEM; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *st, int *length, int i) -{ - pte_t *ptep, pte; - unsigned long start = *st; - unsigned long pstart; - int len = *length; - struct page *page; - - do { - pstart = start & HPAGE_MASK; - ptep = huge_pte_offset(mm, start); - pte = *ptep; - -back1: - page = pte_page(pte); - if (pages) { - page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT); - get_page(page); - pages[i] = page; - } - if (vmas) - vmas[i] = vma; - i++; - len--; - start += PAGE_SIZE; - if (((start & HPAGE_MASK) == pstart) && len && - (start < vma->vm_end)) - goto back1; - } while (len && start < vma->vm_end); - *length = len; - *st = start; - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write) { struct page *page; @@ -212,81 +133,6 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, free_pgd_range(tlb, addr, end, floor, ceiling); } -void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - pte_clear(mm, address, pte); - } - add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - page_cache_release(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} - unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index d3bf86a5c1a..b4ab766f598 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -121,7 +121,7 @@ static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr return hugepte_offset(dir, addr); } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pud_t *pud; @@ -134,7 +134,7 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return hugepte_offset(pud, addr); } -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pud_t *pud; @@ -147,25 +147,6 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return hugepte_alloc(mm, pud, addr); } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, - pte_t *ptep, int write_access) -{ - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - if (write_access) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - } else { - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - } - entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); - - set_pte_at(mm, addr, ptep, entry); -} - /* * This function checks for proper alignment of input addr and len parameters. */ @@ -259,80 +240,6 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len) return -EINVAL; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int err = -ENOMEM; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto out; - - src_pte = huge_pte_offset(src, addr); - entry = *src_pte; - - ptepage = pte_page(entry); - get_page(ptepage); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - set_pte_at(dst, addr, dst_pte, entry); - - addr += HPAGE_SIZE; - } - - err = 0; - out: - return err; -} - -int -follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vpfn, vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - vpfn = vaddr/PAGE_SIZE; - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++vpfn; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -363,89 +270,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long addr; - pte_t *ptep; - struct page *page; - - WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON((start % HPAGE_SIZE) != 0); - BUG_ON((end % HPAGE_SIZE) != 0); - - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t pte; - - ptep = huge_pte_offset(mm, addr); - if (!ptep || pte_none(*ptep)) - continue; - - pte = *ptep; - page = pte_page(pte); - pte_clear(mm, addr, ptep); - - put_page(page); - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_pending(); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON((vma->vm_start % HPAGE_SIZE) != 0); - BUG_ON((vma->vm_end % HPAGE_SIZE) != 0); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (! pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} - /* Because we have an exclusive hugepage region which lies within the * normal user address space, we have to take special measures to make * non-huge mmap()s evade the hugepage reserved regions. */ diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 1f897bab231..95bb1a6c606 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -24,7 +24,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -56,28 +56,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; + int i; - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; + pte_val(entry) += PAGE_SIZE; + } +} - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t entry; + int i; - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(page_table, entry); - page_table++; + entry = *ptep; - pte_val(entry) += PAGE_SIZE; + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(mm, addr, ptep); + addr += PAGE_SIZE; + ptep++; } + + return entry; } /* @@ -92,79 +98,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -181,84 +114,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index bcad2aefa4e..dcd9c8a8baf 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -80,6 +80,20 @@ static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, } } +pte_t huge_ptep_get_and_clear(pte_t *ptep) +{ + pte_t entry; + + entry = *ptep; + + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(pte); + pte++; + } + + return entry; +} + /* * This function checks for proper alignment of input addr and len parameters. */ diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 5a1f831b2de..625cbb336a2 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -22,7 +22,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -41,7 +41,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,30 +62,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; + int i; + + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; + pte_val(entry) += PAGE_SIZE; + } +} - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t entry; + int i; - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); + entry = *ptep; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte_at(mm, addr, page_table, entry); - page_table++; + pte_clear(mm, addr, ptep); addr += PAGE_SIZE; - - pte_val(entry) += PAGE_SIZE; + ptep++; } + + return entry; } /* @@ -100,79 +104,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte_at(dst, addr, dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - addr += PAGE_SIZE; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -190,34 +121,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - static void context_reload(void *__data) { struct mm_struct *mm = __data; @@ -226,12 +129,8 @@ static void context_reload(void *__data) load_secondary_context(mm); } -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +void hugetlb_prefault_arch_hook(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - /* On UltraSPARC-III+ and later, configure the second half of * the Data-TLB for huge pages. */ @@ -261,50 +160,4 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) } spin_unlock(&ctx_alloc_lock); } - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; } diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index ed13969fa2d..41400d342d4 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -68,6 +68,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #endif #define pgd_val(x) ((x).pgd) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 8d60c2b4b00..e9efe148fdf 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -236,6 +236,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PRESENT | _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include @@ -275,7 +276,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index fcc9c3344ab..48586e08f43 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -283,6 +283,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) +#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) /* * Macro to a page protection value as "uncacheable". Note that "protection" is really a diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h index 4c6d129e7d9..180467be8e7 100644 --- a/include/asm-sh/page.h +++ b/include/asm-sh/page.h @@ -31,6 +31,7 @@ #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE-1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT-PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE #endif #ifdef __KERNEL__ diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h index cd847a47a9a..ecb909572d3 100644 --- a/include/asm-sh/pgtable.h +++ b/include/asm-sh/pgtable.h @@ -196,6 +196,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } /* * Macro and implementation to make a page protection as uncachable. diff --git a/include/asm-sh64/page.h b/include/asm-sh64/page.h index e1f7f5a4121..d6167f1c0e9 100644 --- a/include/asm-sh64/page.h +++ b/include/asm-sh64/page.h @@ -41,6 +41,7 @@ #define HPAGE_SIZE (1UL << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE-1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT-PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE #endif #ifdef __KERNEL__ diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h index 525e1523ef5..78ac6be2d9e 100644 --- a/include/asm-sh64/pgtable.h +++ b/include/asm-sh64/pgtable.h @@ -430,6 +430,8 @@ extern inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | extern inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_EXECUTE)); return pte; } extern inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } +extern inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; } + /* * Conversion functions: convert a page and protection to a page entry. diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h index 219ea043a14..b87dbbd64bc 100644 --- a/include/asm-sparc64/page.h +++ b/include/asm-sparc64/page.h @@ -95,6 +95,8 @@ typedef unsigned long pgprot_t; #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1UL)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define ARCH_HAS_SETCLEAR_HUGE_PTE +#define ARCH_HAS_HUGETLB_PREFAULT_HOOK #endif #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_32BIT) ? \ diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index ae2cd5b09a7..1ae00c5087f 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -286,6 +286,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_ACCESSED | _PAGE_R)) #define pte_mkwrite(pte) (__pte(pte_val(pte) | _PAGE_WRITE)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_MODIFIED | _PAGE_W)) +#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_SZHUGE)) /* to find an entry in a page-table-directory. */ #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index f43048035a0..9ce338c3a71 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -28,6 +28,7 @@ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index db2a0efbf57..4eec176c3c3 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -253,6 +253,7 @@ extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; extern inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) extern inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } extern inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } extern inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; } @@ -263,6 +264,7 @@ extern inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ extern inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } extern inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } +extern inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } struct vm_area_struct; @@ -290,7 +292,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, */ #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) static inline int pmd_large(pmd_t pte) { return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6af1ae4a821..f529d144281 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -4,6 +4,7 @@ #ifdef CONFIG_HUGETLB_PAGE #include +#include struct ctl_table; @@ -22,12 +23,6 @@ int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); -int pmd_huge(pmd_t pmd); struct page *alloc_huge_page(void); void free_huge_page(struct page *); @@ -35,6 +30,17 @@ extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; +/* arch callbacks */ + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write); +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write); +int is_aligned_hugepage_range(unsigned long addr, unsigned long len); +int pmd_huge(pmd_t pmd); + #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ @@ -48,6 +54,28 @@ extern int sysctl_hugetlb_shm_group; int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif +#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE +#define set_huge_pte_at(mm, addr, ptep, pte) set_pte_at(mm, addr, ptep, pte) +#define huge_ptep_get_and_clear(mm, addr, ptep) ptep_get_and_clear(mm, addr, ptep) +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); +#endif + +#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK +#define hugetlb_prefault_arch_hook(mm) do { } while (0) +#else +void hugetlb_prefault_arch_hook(struct mm_struct *mm); +#endif + +#ifndef ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE +#define hugetlb_clean_stale_pgtable(pte) BUG() +#else +void hugetlb_clean_stale_pgtable(pte_t *pte); +#endif + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4eb5ae3fbe1..fbd1111ea11 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,10 +7,14 @@ #include #include #include -#include #include #include #include +#include +#include +#include + +#include const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +{ + pte_t entry; + + if (vma->vm_flags & VM_WRITE) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else { + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + + return entry; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */ + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(dst, addr, dst_pte, entry); + addr += HPAGE_SIZE; + } + return 0; + +nomem: + return -ENOMEM; +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t pte; + struct page *page; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~HPAGE_MASK); + BUG_ON(end & ~HPAGE_MASK); + + for (address = start; address < end; address += HPAGE_SIZE) { + pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address)); + if (pte_none(pte)) + continue; + page = pte_page(pte); + put_page(page); + } + add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length) { @@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma, unmap_hugepage_range(vma, start, start + length); spin_unlock(&mm->page_table_lock); } + +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + hugetlb_prefault_arch_hook(mm); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + if (! pte_none(*pte)) + hugetlb_clean_stale_pgtable(pte); + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i) +{ + unsigned long vpfn, vaddr = *position; + int remainder = *length; + + BUG_ON(!is_vm_hugetlb_page(vma)); + + vpfn = vaddr/PAGE_SIZE; + while (vaddr < vma->vm_end && remainder) { + + if (pages) { + pte_t *pte; + struct page *page; + + /* Some archs (sparc64, sh*) have multiple + * pte_ts to each hugepage. We have to make + * sure we get the first, for the page + * indexing below to work. */ + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + + get_page(page); + pages[i] = page; + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++vpfn; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + + return i; +}