@@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry);
  *   @gfp_mask and added to the page cache and the VM's LRU
  *   list. The page is returned locked and with an increased
  *   refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ *   its own locking dance if the page is already in cache, or unlock the page
+ *   before returning if we had to add the page to pagecache.
  *
  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
  * if the GFP flags specified for FGP_CREAT are atomic.
@@ -1641,7 +1644,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                 if (!page)
                         return NULL;

-                if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                         fgp_flags |= FGP_LOCK;

                 /* Init accessed so avoid atomic mark_page_accessed later */
@@ -1655,6 +1658,13 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                         if (err == -EEXIST)
                                 goto repeat;
                 }
+
+                /*
+                 * add_to_page_cache_lru locks the page, and for mmap we expect
+                 * an unlocked page.
+                 */
+                if (page && (fgp_flags & FGP_FOR_MMAP))
+                        unlock_page(page);
         }

         return page;
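
[Annotation, not part of the patch] The hunks above give FGP_FOR_MMAP a simple contract: pagecache_get_page() returns the page unlocked whether it was already in the cache or had to be freshly added, leaving the locking policy entirely to the caller. A minimal, hypothetical caller might look like the sketch below; the helper name and surrounding context (mapping, index, gfp) are made up for illustration, and it assumes the usual <linux/pagemap.h> environment.

/* Hypothetical helper, for illustration only -- not part of this patch. */
static struct page *get_unlocked_then_lock(struct address_space *mapping,
                                           pgoff_t index, gfp_t gfp)
{
        struct page *page;

        page = pagecache_get_page(mapping, index, FGP_CREAT | FGP_FOR_MMAP, gfp);
        if (!page)
                return NULL;            /* allocation or add-to-cache failed */

        /*
         * Unlike FGP_LOCK, the page comes back unlocked even if it was just
         * created, so the caller picks its own locking strategy: here a
         * trylock first, falling back to a blocking lock_page().
         */
        if (!trylock_page(page))
                lock_page(page);

        return page;                    /* locked, with an elevated refcount */
}
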
@@ -2379,64 +2389,98 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 EXPORT_SYMBOL(generic_file_read_iter);

 #ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file:      file to read
- * @offset:    page index
- * @gfp_mask:  memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS  (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+                                             struct file *fpin)
 {
-        struct address_space *mapping = file->f_mapping;
-        struct page *page;
-        int ret;
+        int flags = vmf->flags;

-        do {
-                page = __page_cache_alloc(gfp_mask);
-                if (!page)
-                        return -ENOMEM;
+        if (fpin)
+                return fpin;

-                ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
-                if (ret == 0)
-                        ret = mapping->a_ops->readpage(file, page);
-                else if (ret == -EEXIST)
-                        ret = 0; /* losing race to add is OK */
+        /*
+         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+         * anything, so we only pin the file and drop the mmap_sem if only
+         * FAULT_FLAG_ALLOW_RETRY is set.
+         */
+        if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+            FAULT_FLAG_ALLOW_RETRY) {
+                fpin = get_file(vmf->vma->vm_file);
+                up_read(&vmf->vma->vm_mm->mmap_sem);
+        }
+        return fpin;
+}

-                put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page.  If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+                                     struct file **fpin)
+{
+        if (trylock_page(page))
+                return 1;

-        } while (ret == AOP_TRUNCATED_PAGE);
+        /*
+         * NOTE! This will make us return with VM_FAULT_RETRY, but with
+         * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+         * is supposed to work. We have way too many special cases..
+         */
+        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+                return 0;

-        return ret;
+        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+        if (vmf->flags & FAULT_FLAG_KILLABLE) {
+                if (__lock_page_killable(page)) {
+                        /*
+                         * We didn't have the right flags to drop the mmap_sem,
+                         * but all fault_handlers only check for fatal signals
+                         * if we return VM_FAULT_RETRY, so we need to drop the
+                         * mmap_sem here and return 0 if we don't have a fpin.
+                         */
+                        if (*fpin == NULL)
+                                up_read(&vmf->vma->vm_mm->mmap_sem);
+                        return 0;
+                }
+        } else
+                __lock_page(page);
+        return 1;
 }

-#define MMAP_LOTSAMISS  (100)

 /*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all.  We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that.  If we didn't pin a file then we return NULL.  The file that is
+ * returned needs to be fput()'ed when we're done with it.
  */
-static void do_sync_mmap_readahead(struct vm_fault *vmf)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 {
         struct file *file = vmf->vma->vm_file;
         struct file_ra_state *ra = &file->f_ra;
         struct address_space *mapping = file->f_mapping;
+        struct file *fpin = NULL;
         pgoff_t offset = vmf->pgoff;

         /* If we don't want any read-ahead, don't bother */
         if (vmf->vma->vm_flags & VM_RAND_READ)
-                return;
+                return fpin;
         if (!ra->ra_pages)
-                return;
+                return fpin;

         if (vmf->vma->vm_flags & VM_SEQ_READ) {
+                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                 page_cache_sync_readahead(mapping, ra, file, offset,
                                           ra->ra_pages);
-                return;
+                return fpin;
         }

         /* Avoid banging the cache line if not needed */
@@ -2448,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_fault *vmf)
          * stop bothering with read-ahead. It will only hurt.
          */
         if (ra->mmap_miss > MMAP_LOTSAMISS)
-                return;
+                return fpin;

         /*
          * mmap read-around
          */
+        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
         ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
         ra->size = ra->ra_pages;
         ra->async_size = ra->ra_pages / 4;
         ra_submit(ra, mapping, file);
+        return fpin;
 }

 /*
  * Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further.  We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
  */
-static void do_async_mmap_readahead(struct vm_fault *vmf,
-                                    struct page *page)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+                                            struct page *page)
 {
         struct file *file = vmf->vma->vm_file;
         struct file_ra_state *ra = &file->f_ra;
         struct address_space *mapping = file->f_mapping;
+        struct file *fpin = NULL;
         pgoff_t offset = vmf->pgoff;

         /* If we don't want any read-ahead, don't bother */
         if (vmf->vma->vm_flags & VM_RAND_READ)
-                return;
+                return fpin;
         if (ra->mmap_miss > 0)
                 ra->mmap_miss--;
-        if (PageReadahead(page))
+        if (PageReadahead(page)) {
+                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                 page_cache_async_readahead(mapping, ra, file,
                                            page, offset, ra->ra_pages);
+        }
+        return fpin;
 }

 /**
@@ -2510,6 +2561,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
         int error;
         struct file *file = vmf->vma->vm_file;
+        struct file *fpin = NULL;
         struct address_space *mapping = file->f_mapping;
         struct file_ra_state *ra = &file->f_ra;
         struct inode *inode = mapping->host;
@@ -2531,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
                  * We found the page, so try async readahead before
                  * waiting for the lock.
                  */
-                do_async_mmap_readahead(vmf, page);
+                fpin = do_async_mmap_readahead(vmf, page);
         } else if (!page) {
                 /* No page in the page cache at all */
-                do_sync_mmap_readahead(vmf);
                 count_vm_event(PGMAJFAULT);
                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                 ret = VM_FAULT_MAJOR;
+                fpin = do_sync_mmap_readahead(vmf);
 retry_find:
-                page = find_get_page(mapping, offset);
-                if (!page)
-                        goto no_cached_page;
+                page = pagecache_get_page(mapping, offset,
+                                          FGP_CREAT|FGP_FOR_MMAP,
+                                          vmf->gfp_mask);
+                if (!page) {
+                        if (fpin)
+                                goto out_retry;
+                        return vmf_error(-ENOMEM);
+                }
         }

-        if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
-                put_page(page);
-                return ret | VM_FAULT_RETRY;
-        }
+        if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+                goto out_retry;

         /* Did it get truncated? */
         if (unlikely(page->mapping != mapping)) {
@@ -2564,6 +2619,16 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
         if (unlikely(!PageUptodate(page)))
                 goto page_not_uptodate;

+        /*
+         * We've made it this far and we had to drop our mmap_sem, now is the
+         * time to return to the upper layer and have it re-find the vma and
+         * redo the fault.
+         */
+        if (fpin) {
+                unlock_page(page);
+                goto out_retry;
+        }
+
         /*
          * Found the page and have a reference on it.
          * We must recheck i_size under page lock.
@@ -2578,28 +2643,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
         vmf->page = page;
         return ret | VM_FAULT_LOCKED;

-no_cached_page:
-        /*
-         * We're only likely to ever get here if MADV_RANDOM is in
-         * effect.
-         */
-        error = page_cache_read(file, offset, vmf->gfp_mask);
-
-        /*
-         * The page we want has now been added to the page cache.
-         * In the unlikely event that someone removed it in the
-         * meantime, we'll just come back here and read it again.
-         */
-        if (error >= 0)
-                goto retry_find;
-
-        /*
-         * An error return from page_cache_read can result if the
-         * system is low on memory, or a problem occurs while trying
-         * to schedule I/O.
-         */
-        return vmf_error(error);
-
 page_not_uptodate:
         /*
          * Umm, take care of errors if the page isn't up-to-date.
@@ -2608,12 +2651,15 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
          * and we need to check for errors.
          */
         ClearPageError(page);
+        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
         error = mapping->a_ops->readpage(file, page);
         if (!error) {
                 wait_on_page_locked(page);
                 if (!PageUptodate(page))
                         error = -EIO;
         }
+        if (fpin)
+                goto out_retry;
         put_page(page);

         if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2622,6 +2668,18 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
         /* Things didn't work out. Return zero to tell the mm layer so. */
         shrink_readahead_size_eio(file, ra);
         return VM_FAULT_SIGBUS;
+
+out_retry:
+        /*
+         * We dropped the mmap_sem, we need to return to the fault handler to
+         * re-find the vma and come back and find our hopefully still populated
+         * page.
+         */
+        if (page)
+                put_page(page);
+        if (fpin)
+                fput(fpin);
+        return ret | VM_FAULT_RETRY;
 }
 EXPORT_SYMBOL(filemap_fault);

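
[Annotation, not part of the commit] The reason filemap_fault() can drop mmap_sem, kick off I/O and return VM_FAULT_RETRY is that the architecture page-fault handlers already contain a retry loop for exactly this case: they re-take the semaphore, re-find the VMA and fault again, by which time the readahead or ->readpage() started above has hopefully filled the page cache. Below is a much-simplified sketch of that loop, loosely modeled on the handlers of this kernel generation; the function name is made up, and real handlers carry many more checks and differ per architecture.

static void fault_retry_loop_sketch(struct mm_struct *mm, unsigned long address)
{
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        struct vm_area_struct *vma;
        vm_fault_t fault;

retry:
        down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        /* ... VMA sanity and access checks elided ... */

        fault = handle_mm_fault(vma, address, flags);

        if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
                /*
                 * The fault path (e.g. maybe_unlock_mmap_for_io()) has
                 * already dropped mmap_sem for us; retry once more, this
                 * time waiting for the page instead of bailing out early.
                 */
                flags &= ~FAULT_FLAG_ALLOW_RETRY;
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        up_read(&mm->mmap_sem);
        /* ... error and signal handling elided ... */
}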