
Commit f91f2ee

Merge branch 'akpm' (rest of patches from Andrew)
Merge the left-over patches from Andrew Morton.

This merges the remaining two patches from Andrew's pile of "little bit more MM".

I mulled it over, and we emailed back and forth with Josef, and he pointed out where I was wrong.

Rule #51 of kernel maintenance: when somebody makes it clear that they know the code better than you did, stop arguing and just apply the damn patch.

Add a third patch by me to add a comment for the case that I had thought was buggy and Josef corrected me on.

* emailed patches from Andrew Morton <[email protected]>:
  filemap: add a comment about FAULT_FLAG_RETRY_NOWAIT behavior
  filemap: drop the mmap_sem for all blocking operations
  filemap: kill page_cache_read usage in filemap_fault
2 parents f261c4e + 8b0f9fa commit f91f2ee
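The behavior these patches build on is the existing retry protocol: when handle_mm_fault() returns VM_FAULT_RETRY, the mmap_sem has normally already been dropped, and the arch fault handler re-takes it, re-finds the vma, and repeats the fault. A condensed sketch of that caller-side loop, for orientation only (modeled on the fault handlers of this era; the exact code varies per architecture and is not part of this commit):

        /* Hedged sketch of an arch page-fault handler's retry loop. */
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        vm_fault_t fault;

retry:
        down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        /* ... access checks elided ... */
        fault = handle_mm_fault(vma, address, flags);

        if (unlikely(fault & VM_FAULT_RETRY)) {
                /* mmap_sem was already released for us (unless NOWAIT) */
                if (flags & FAULT_FLAG_ALLOW_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
                        flags |= FAULT_FLAG_TRIED;
                        goto retry;
                }
        }

With this series, filemap_fault() itself drops the mmap_sem before blocking on readahead, page locks, or ->readpage(), pins the file while it does so, and then returns VM_FAULT_RETRY so the loop above redoes the fault.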

File tree

2 files changed, +131 -72 lines changed


include/linux/pagemap.h

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_WRITE               0x00000008
 #define FGP_NOFS                0x00000010
 #define FGP_NOWAIT              0x00000020
+#define FGP_FOR_MMAP            0x00000040
 
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                int fgp_flags, gfp_t cache_gfp_mask);
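For orientation, a minimal sketch (not part of the patch) of how a fault path is meant to use the new flag: FGP_FOR_MMAP makes pagecache_get_page() hand the page back unlocked even when it had to be freshly added to the page cache, so the caller can apply its own lock-or-drop-mmap_sem policy. The mapping, offset and gfp_mask below stand in for whatever the caller already has:

        struct page *page;

        /* Find or create the page, but do not return it locked. */
        page = pagecache_get_page(mapping, offset,
                                  FGP_CREAT | FGP_FOR_MMAP, gfp_mask);
        if (!page)
                return -ENOMEM;

        /* Caller-specific locking dance, e.g. drop mmap_sem before sleeping. */
        if (!trylock_page(page)) {
                /* ... */
        }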

mm/filemap.c

Lines changed: 130 additions & 72 deletions
@@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry);
  * @gfp_mask and added to the page cache and the VM's LRU
  * list. The page is returned locked and with an increased
  * refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ *   its own locking dance if the page is already in cache, or unlock the page
+ *   before returning if we had to add the page to pagecache.
  *
  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
  * if the GFP flags specified for FGP_CREAT are atomic.
@@ -1641,7 +1644,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                if (!page)
                        return NULL;
 
-               if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+               if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                        fgp_flags |= FGP_LOCK;
 
                /* Init accessed so avoid atomic mark_page_accessed later */
@@ -1655,6 +1658,13 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                        if (err == -EEXIST)
                                goto repeat;
                }
+
+               /*
+                * add_to_page_cache_lru locks the page, and for mmap we expect
+                * an unlocked page.
+                */
+               if (page && (fgp_flags & FGP_FOR_MMAP))
+                       unlock_page(page);
        }
 
        return page;
@@ -2379,64 +2389,98 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file: file to read
- * @offset: page index
- * @gfp_mask: memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+                                             struct file *fpin)
 {
-       struct address_space *mapping = file->f_mapping;
-       struct page *page;
-       int ret;
+       int flags = vmf->flags;
 
-       do {
-               page = __page_cache_alloc(gfp_mask);
-               if (!page)
-                       return -ENOMEM;
+       if (fpin)
+               return fpin;
 
-               ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
-               if (ret == 0)
-                       ret = mapping->a_ops->readpage(file, page);
-               else if (ret == -EEXIST)
-                       ret = 0; /* losing race to add is OK */
+       /*
+        * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+        * anything, so we only pin the file and drop the mmap_sem if only
+        * FAULT_FLAG_ALLOW_RETRY is set.
+        */
+       if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+           FAULT_FLAG_ALLOW_RETRY) {
+               fpin = get_file(vmf->vma->vm_file);
+               up_read(&vmf->vma->vm_mm->mmap_sem);
+       }
+       return fpin;
+}
 
-               put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+                                     struct file **fpin)
+{
+       if (trylock_page(page))
+               return 1;
 
-       } while (ret == AOP_TRUNCATED_PAGE);
+       /*
+        * NOTE! This will make us return with VM_FAULT_RETRY, but with
+        * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+        * is supposed to work. We have way too many special cases..
+        */
+       if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+               return 0;
 
-       return ret;
+       *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+       if (vmf->flags & FAULT_FLAG_KILLABLE) {
+               if (__lock_page_killable(page)) {
+                       /*
+                        * We didn't have the right flags to drop the mmap_sem,
+                        * but all fault_handlers only check for fatal signals
+                        * if we return VM_FAULT_RETRY, so we need to drop the
+                        * mmap_sem here and return 0 if we don't have a fpin.
+                        */
+                       if (*fpin == NULL)
+                               up_read(&vmf->vma->vm_mm->mmap_sem);
+                       return 0;
+               }
+       } else
+               __lock_page(page);
+       return 1;
 }
 
-#define MMAP_LOTSAMISS (100)
 
 /*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
  */
-static void do_sync_mmap_readahead(struct vm_fault *vmf)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 {
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
+       struct file *fpin = NULL;
        pgoff_t offset = vmf->pgoff;
 
        /* If we don't want any read-ahead, don't bother */
        if (vmf->vma->vm_flags & VM_RAND_READ)
-               return;
+               return fpin;
        if (!ra->ra_pages)
-               return;
+               return fpin;
 
        if (vmf->vma->vm_flags & VM_SEQ_READ) {
+               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_sync_readahead(mapping, ra, file, offset,
                                          ra->ra_pages);
-               return;
+               return fpin;
        }
 
        /* Avoid banging the cache line if not needed */
@@ -2448,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf)
         * stop bothering with read-ahead. It will only hurt.
         */
        if (ra->mmap_miss > MMAP_LOTSAMISS)
-               return;
+               return fpin;
 
        /*
        * mmap read-around
        */
+       fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
+       return fpin;
 }
 
 /*
  * Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
  */
-static void do_async_mmap_readahead(struct vm_fault *vmf,
-                                   struct page *page)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+                                            struct page *page)
 {
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
+       struct file *fpin = NULL;
        pgoff_t offset = vmf->pgoff;
 
        /* If we don't want any read-ahead, don't bother */
        if (vmf->vma->vm_flags & VM_RAND_READ)
-               return;
+               return fpin;
        if (ra->mmap_miss > 0)
               ra->mmap_miss--;
-       if (PageReadahead(page))
+       if (PageReadahead(page)) {
+               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_async_readahead(mapping, ra, file,
                                           page, offset, ra->ra_pages);
+       }
+       return fpin;
 }
 
 /**
@@ -2510,6 +2561,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
        int error;
        struct file *file = vmf->vma->vm_file;
+       struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
@@ -2531,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
-               do_async_mmap_readahead(vmf, page);
+               fpin = do_async_mmap_readahead(vmf, page);
        } else if (!page) {
                /* No page in the page cache at all */
-               do_sync_mmap_readahead(vmf);
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
+               fpin = do_sync_mmap_readahead(vmf);
 retry_find:
-               page = find_get_page(mapping, offset);
-               if (!page)
-                       goto no_cached_page;
+               page = pagecache_get_page(mapping, offset,
+                                         FGP_CREAT|FGP_FOR_MMAP,
+                                         vmf->gfp_mask);
+               if (!page) {
+                       if (fpin)
+                               goto out_retry;
+                       return vmf_error(-ENOMEM);
+               }
        }
 
-       if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
-               put_page(page);
-               return ret | VM_FAULT_RETRY;
-       }
+       if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+               goto out_retry;
 
        /* Did it get truncated? */
        if (unlikely(page->mapping != mapping)) {
@@ -2564,6 +2619,16 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;
 
+       /*
+        * We've made it this far and we had to drop our mmap_sem, now is the
+        * time to return to the upper layer and have it re-find the vma and
+        * redo the fault.
+        */
+       if (fpin) {
+               unlock_page(page);
+               goto out_retry;
+       }
+
        /*
        * Found the page and have a reference on it.
        * We must recheck i_size under page lock.
@@ -2578,28 +2643,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;
 
-no_cached_page:
-       /*
-        * We're only likely to ever get here if MADV_RANDOM is in
-        * effect.
-        */
-       error = page_cache_read(file, offset, vmf->gfp_mask);
-
-       /*
-        * The page we want has now been added to the page cache.
-        * In the unlikely event that someone removed it in the
-        * meantime, we'll just come back here and read it again.
-        */
-       if (error >= 0)
-               goto retry_find;
-
-       /*
-        * An error return from page_cache_read can result if the
-        * system is low on memory, or a problem occurs while trying
-        * to schedule I/O.
-        */
-       return vmf_error(error);
-
 page_not_uptodate:
        /*
        * Umm, take care of errors if the page isn't up-to-date.
@@ -2608,12 +2651,15 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
        * and we need to check for errors.
        */
        ClearPageError(page);
+       fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = mapping->a_ops->readpage(file, page);
        if (!error) {
                wait_on_page_locked(page);
                if (!PageUptodate(page))
                        error = -EIO;
        }
+       if (fpin)
+               goto out_retry;
        put_page(page);
 
        if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2622,6 +2668,18 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
        /* Things didn't work out. Return zero to tell the mm layer so. */
        shrink_readahead_size_eio(file, ra);
        return VM_FAULT_SIGBUS;
+
+out_retry:
+       /*
+        * We dropped the mmap_sem, we need to return to the fault handler to
+        * re-find the vma and come back and find our hopefully still populated
+        * page.
+        */
+       if (page)
+               put_page(page);
+       if (fpin)
+               fput(fpin);
+       return ret | VM_FAULT_RETRY;
 }
 EXPORT_SYMBOL(filemap_fault);
 
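Reading the series as a whole (a summary, not text from the commit): every helper that may drop the mmap_sem first pins vma->vm_file with get_file(), since the vma itself can be unmapped once the semaphore is released; the pinned file keeps the mapping and readahead state usable for the blocking I/O, and filemap_fault() fput()s it on the out_retry path before returning VM_FAULT_RETRY. Schematically (illustrative only, using the helpers added above):

        struct file *fpin = NULL;

        fpin = maybe_unlock_mmap_for_io(vmf, fpin);     /* may up_read(mmap_sem) */
        /* ... blocking readahead or ->readpage() without mmap_sem held ... */
        if (fpin)
                fput(fpin);                             /* drop the pin */
        return ret | VM_FAULT_RETRY;                    /* upper layer redoes the fault */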
