
Commit b99a342

yhuang-intel authored and torvalds committed
NUMA balancing: reduce TLB flush via delaying mapping on hint page fault
With NUMA balancing, the hint page fault handler migrates the faulting page to the accessing node if necessary. During the migration, the TLB is shot down on all CPUs that the process has recently run on, because the hint page fault handler makes the PTE accessible before the migration is tried. The overhead of the TLB shootdown can be high, so it is better avoided when possible; in fact, if we delay mapping the page until the migration has been tried, it can be avoided entirely. That is what this patch does.

For multi-threaded applications, it is possible that a page is accessed by multiple threads at almost the same time. In the original implementation, because the first thread installs the accessible PTE before migrating the page, the other threads may access the page directly before it is made inaccessible again during migration. With this patch, the second thread goes through the page fault handler too, and because of the PageLRU() check in the following code path (see the sketch below),

  migrate_misplaced_page()
    numamigrate_isolate_page()
      isolate_lru_page()

migrate_misplaced_page() returns 0 and the PTE is made accessible in the second thread as well. This introduces a little more overhead, but we think the probability of a page being accessed by multiple threads at the same time is low, and the overhead difference is not too large. If this becomes a problem for some workloads, we will need to consider how to reduce the overhead.

To test the patch, we ran the following test case on a 2-socket Intel server (1 NUMA node per socket) with 128GB of DRAM (64GB per socket):

1. Run a memory eater on NUMA node 1 to use 40GB of memory before running pmbench.

2. Run pmbench (normal access pattern) with 8 processes and 8 threads per process, 64 threads in total. The working-set size of each process is 8960MB, so the total working-set size is 8 * 8960MB = 70GB. All pmbench processes are CPU-bound to node 1, so they will access some DRAM on node 0.

3. After the pmbench processes have run for 10 seconds, kill the memory eater. Some pages will now be migrated from node 0 to node 1 via NUMA balancing.

Test results show that, with the patch, pmbench throughput (page accesses/s) increases by 5.5%. The number of TLB shootdown interrupts drops by 98% (from ~4.7e7 to ~9.7e5), with about 9.2e6 pages (35.8GB) migrated. The perf profile shows that the CPU cycles spent in try_to_unmap() and its callees drop from 6.02% to 0.47%; that is, the CPU cycles spent on TLB shootdowns decrease greatly.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: "Huang, Ying" <[email protected]>
Reviewed-by: Mel Gorman <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: "Matthew Wilcox" <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Michel Lespinasse <[email protected]>
Cc: Arjun Roy <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
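The PageLRU() behaviour referenced above comes from isolate_lru_page() failing when the page has already been taken off the LRU list. The sketch below is a condensed illustration of that path, abbreviated from the mm/migrate.c of this kernel generation: the function names and the return-value convention match the source, but the bodies are shortened and the elided checks and accounting are only summarized in comments.

/* Condensed illustration, not a verbatim mm/migrate.c excerpt. */
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
	/* ... THP and "target node nearly full" checks elided ... */

	/*
	 * isolate_lru_page() fails if the page is no longer on an LRU
	 * list, e.g. because another thread that faulted on the same
	 * page has already isolated it for migration.
	 */
	if (isolate_lru_page(page))
		return 0;

	/* ... account the isolated page, drop the caller's reference ... */
	return 1;
}

int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
			   int node)
{
	if (!numamigrate_isolate_page(NODE_DATA(node), page))
		return 0;	/* the concurrent faulting thread ends up here */

	/* ... perform the actual migration ... */
	return 1;	/* simplified: non-zero means the page was migrated */
}

The thread that loses the race therefore simply falls back to making its own PTE accessible, which is exactly what the new out_map path in the diff below does.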
1 parent b12d691 commit b99a342

1 file changed: 32 additions, 22 deletions

mm/memory.c

Lines changed: 32 additions & 22 deletions
@@ -4125,29 +4125,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 		goto out;
 	}
 
-	/*
-	 * Make it present again, Depending on how arch implementes non
-	 * accessible ptes, some can allow access by kernel mode.
-	 */
-	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+	/* Get the normal PTE */
+	old_pte = ptep_get(vmf->pte);
 	pte = pte_modify(old_pte, vma->vm_page_prot);
-	pte = pte_mkyoung(pte);
-	if (was_writable)
-		pte = pte_mkwrite(pte);
-	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-	update_mmu_cache(vma, vmf->address, vmf->pte);
 
 	page = vm_normal_page(vma, vmf->address, pte);
-	if (!page) {
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
-		return 0;
-	}
+	if (!page)
+		goto out_map;
 
 	/* TODO: handle PTE-mapped THP */
-	if (PageCompound(page)) {
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
-		return 0;
-	}
+	if (PageCompound(page))
+		goto out_map;
 
 	/*
 	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4157,7 +4145,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * pte_dirty has unpredictable behaviour between PTE scan updates,
 	 * background writeback, dirty balancing and application behaviour.
 	 */
-	if (!pte_write(pte))
+	if (!was_writable)
 		flags |= TNF_NO_GROUP;
 
 	/*
@@ -4171,23 +4159,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	page_nid = page_to_nid(page);
 	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
 			&flags);
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	if (target_nid == NUMA_NO_NODE) {
 		put_page(page);
-		goto out;
+		goto out_map;
 	}
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
 
 	/* Migrate to the requested node */
 	if (migrate_misplaced_page(page, vma, target_nid)) {
 		page_nid = target_nid;
 		flags |= TNF_MIGRATED;
-	} else
+	} else {
 		flags |= TNF_MIGRATE_FAIL;
+		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			goto out;
+		}
+		goto out_map;
+	}
 
 out:
 	if (page_nid != NUMA_NO_NODE)
 		task_numa_fault(last_cpupid, page_nid, 1, flags);
 	return 0;
+out_map:
+	/*
+	 * Make it present again, depending on how arch implements
+	 * non-accessible ptes, some can allow access by kernel mode.
+	 */
+	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+	pte = pte_modify(old_pte, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+	if (was_writable)
+		pte = pte_mkwrite(pte);
+	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+	update_mmu_cache(vma, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	goto out;
 }
 
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
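For readability, this is the changed region of do_numa_page() as it reads with the patch applied, reassembled from the hunks above; unchanged code between the hunks is elided with comments rather than reproduced.

		goto out;
	}

	/* Get the normal PTE */
	old_pte = ptep_get(vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);

	page = vm_normal_page(vma, vmf->address, pte);
	if (!page)
		goto out_map;

	/* TODO: handle PTE-mapped THP */
	if (PageCompound(page))
		goto out_map;

	/* ... unchanged "avoid grouping on RO pages" comment elided ... */
	if (!was_writable)
		flags |= TNF_NO_GROUP;

	/* ... unchanged code elided ... */
	page_nid = page_to_nid(page);
	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
			&flags);
	if (target_nid == NUMA_NO_NODE) {
		put_page(page);
		goto out_map;
	}
	pte_unmap_unlock(vmf->pte, vmf->ptl);

	/* Migrate to the requested node */
	if (migrate_misplaced_page(page, vma, target_nid)) {
		page_nid = target_nid;
		flags |= TNF_MIGRATED;
	} else {
		flags |= TNF_MIGRATE_FAIL;
		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
		spin_lock(vmf->ptl);
		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			goto out;
		}
		goto out_map;
	}

out:
	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, 1, flags);
	return 0;
out_map:
	/*
	 * Make it present again, depending on how arch implements
	 * non-accessible ptes, some can allow access by kernel mode.
	 */
	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (was_writable)
		pte = pte_mkwrite(pte);
	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
	update_mmu_cache(vma, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	goto out;
}

The net effect of the reordering is that ptep_modify_prot_start()/ptep_modify_prot_commit(), which install the accessible PTE, are now reached only through out_map, i.e. only when the page ends up not being migrated. A page that is successfully migrated never carries a transiently accessible mapping, so its migration no longer has to shoot down TLBs on the other CPUs the process has recently run on.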
