Skip to content

Commit 1d7715c

Browse files
Vladimir Davydov, torvalds
Vladimir Davydov
authored and committed
mmu-notifier: add clear_young callback
In the scope of the idle memory tracking feature, which is introduced by the following patch, we need to clear the referenced/accessed bit not only in primary, but also in secondary ptes. The latter is required in order to estimate wss of KVM VMs. At the same time we want to avoid flushing tlb, because it is quite expensive and it won't really affect the final result.

Currently, there is no function for clearing pte young bit that would meet our requirements, so this patch introduces one. To achieve that we have to add a new mmu-notifier callback, clear_young, since there is no method for testing-and-clearing a secondary pte w/o flushing tlb. The new method is not mandatory and currently only implemented by KVM.

Signed-off-by: Vladimir Davydov <[email protected]>
Reviewed-by: Andres Lagar-Cavilla <[email protected]>
Acked-by: Paolo Bonzini <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Raghavendra K T <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Greg Thelen <[email protected]>
Cc: Michel Lespinasse <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Pavel Emelyanov <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 80ae2fd commit 1d7715c

File tree

3 files changed

+92
-0
lines changed

3 files changed

+92
-0
lines changed

include/linux/mmu_notifier.h

+44
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
6565
unsigned long start,
6666
unsigned long end);
6767

68+
/*
69+
* clear_young is a lightweight version of clear_flush_young. Like the
70+
* latter, it is supposed to test-and-clear the young/accessed bitflag
71+
* in the secondary pte, but it may omit flushing the secondary tlb.
72+
*/
73+
int (*clear_young)(struct mmu_notifier *mn,
74+
struct mm_struct *mm,
75+
unsigned long start,
76+
unsigned long end);
77+
6878
/*
6979
* test_young is called to check the young/accessed bitflag in
7080
* the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
203213
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
204214
unsigned long start,
205215
unsigned long end);
216+
/*
 * Out-of-line counterpart of the inline mmu_notifier_clear_young()
 * wrapper, which calls this with the same mm, start and end arguments
 * when mm_has_notifiers(mm) is true.
 */
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
217+
unsigned long start,
218+
unsigned long end);
206219
extern int __mmu_notifier_test_young(struct mm_struct *mm,
207220
unsigned long address);
208221
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
231244
return 0;
232245
}
233246

247+
/*
 * Inline front end for the clear_young notifier path.
 *
 * If mm_has_notifiers(mm) is true, forwards mm, start and end to
 * __mmu_notifier_clear_young() and returns its result. Otherwise returns 0
 * without making the out-of-line call.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
248+
unsigned long start,
249+
unsigned long end)
250+
{
251+
if (mm_has_notifiers(mm))
252+
return __mmu_notifier_clear_young(mm, start, end);
253+
return 0;
254+
}
255+
234256
static inline int mmu_notifier_test_young(struct mm_struct *mm,
235257
unsigned long address)
236258
{
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
311333
__young; \
312334
})
313335

336+
/*
 * ptep_clear_young_notify - statement-expression macro that combines
 * ptep_test_and_clear_young() on __ptep with mmu_notifier_clear_young()
 * on the PAGE_SIZE-long range starting at __address.
 *
 * __vma and __address are copied into the locals ___vma and ___address
 * before use, so each of those arguments is evaluated only once.
 *
 * The macro evaluates to the bitwise OR of the two results.
 *
 * NOTE(review): __address is used as given; it is not masked with
 * PAGE_MASK here. Presumably callers pass a suitably aligned address --
 * confirm against call sites.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep) \
337+
({ \
338+
int __young; \
339+
struct vm_area_struct *___vma = __vma; \
340+
unsigned long ___address = __address; \
341+
__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
342+
__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
343+
___address + PAGE_SIZE); \
344+
__young; \
345+
})
346+
347+
/*
 * pmdp_clear_young_notify - statement-expression macro that combines
 * pmdp_test_and_clear_young() on __pmdp with mmu_notifier_clear_young()
 * on the PMD_SIZE-long range starting at __address.
 *
 * __vma and __address are copied into the locals ___vma and ___address
 * before use, so each of those arguments is evaluated only once.
 *
 * The macro evaluates to the bitwise OR of the two results.
 *
 * NOTE(review): __address is used as given, with no alignment applied
 * here. Presumably callers pass a PMD-aligned address -- confirm against
 * call sites.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
348+
({ \
349+
int __young; \
350+
struct vm_area_struct *___vma = __vma; \
351+
unsigned long ___address = __address; \
352+
__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
353+
__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
354+
___address + PMD_SIZE); \
355+
__young; \
356+
})
357+
314358
#define ptep_clear_flush_notify(__vma, __address, __ptep) \
315359
({ \
316360
unsigned long ___addr = __address & PAGE_MASK; \

mm/mmu_notifier.c

+17
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
123123
return young;
124124
}
125125

126+
/*
 * Invoke the clear_young callback of every notifier on
 * mm->mmu_notifier_mm->list, passing each one mm, start and end.
 *
 * The list walk runs between srcu_read_lock() and srcu_read_unlock() on
 * srcu. Notifiers whose ops leave clear_young NULL are skipped.
 *
 * Returns the bitwise OR of all callback return values, or 0 if no
 * callback was invoked.
 */
int __mmu_notifier_clear_young(struct mm_struct *mm,
127+
unsigned long start,
128+
unsigned long end)
129+
{
130+
struct mmu_notifier *mn;
131+
int young = 0, id;
132+
133+
id = srcu_read_lock(&srcu);
134+
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
135+
/* clear_young is optional: only call it when the notifier sets it */
if (mn->ops->clear_young)
136+
young |= mn->ops->clear_young(mn, mm, start, end);
137+
}
138+
srcu_read_unlock(&srcu, id);
139+
140+
return young;
141+
}
142+
126143
int __mmu_notifier_test_young(struct mm_struct *mm,
127144
unsigned long address)
128145
{

virt/kvm/kvm_main.c

+31
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
387387
return young;
388388
}
389389

390+
/*
 * clear_young handler installed in kvm_mmu_notifier_ops.
 *
 * Resolves the struct kvm that owns mn, then calls kvm_age_hva() on the
 * range given by start and end while holding an SRCU read lock on
 * kvm->srcu and the kvm->mmu_lock spinlock. Both are released, in reverse
 * order of acquisition, before returning. The mm argument is not used in
 * this function.
 *
 * Returns the value produced by kvm_age_hva().
 */
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
391+
struct mm_struct *mm,
392+
unsigned long start,
393+
unsigned long end)
394+
{
395+
struct kvm *kvm = mmu_notifier_to_kvm(mn);
396+
int young, idx;
397+
398+
idx = srcu_read_lock(&kvm->srcu);
399+
spin_lock(&kvm->mmu_lock);
400+
/*
401+
* Even though we do not flush TLB, this will still adversely
402+
* affect performance on pre-Haswell Intel EPT, where there is
403+
* no EPT Access Bit to clear so that we have to tear down EPT
404+
* tables instead. If we find this unacceptable, we can always
405+
* add a parameter to kvm_age_hva so that it effectively doesn't
406+
* do anything on clear_young.
407+
*
408+
* Also note that currently we never issue secondary TLB flushes
409+
* from clear_young, leaving this job up to the regular system
410+
* cadence. If we find this inaccurate, we might come up with a
411+
* more sophisticated heuristic later.
412+
*/
413+
young = kvm_age_hva(kvm, start, end);
414+
spin_unlock(&kvm->mmu_lock);
415+
srcu_read_unlock(&kvm->srcu, idx);
416+
417+
return young;
418+
}
419+
390420
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
391421
struct mm_struct *mm,
392422
unsigned long address)
@@ -419,6 +449,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
419449
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
420450
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
421451
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
452+
.clear_young = kvm_mmu_notifier_clear_young,
422453
.test_young = kvm_mmu_notifier_test_young,
423454
.change_pte = kvm_mmu_notifier_change_pte,
424455
.release = kvm_mmu_notifier_release,

0 commit comments

Comments
 (0)