 * equal to request size using our average fragment size group lists (data
 * structure 2) in O(1) time.
 *
+ * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied
+ * in CR1. The fact that we couldn't find a group in CR1 suggests that there is
+ * no BG that has average fragment size > goal length. So before falling to the
+ * slower CR2, in CR1.5 we proactively trim goal length and then use the same
+ * fragment lists as CR1 to find a BG with a big enough average fragment size.
+ * This increases the chances of finding a suitable block group in O(1) time and
+ * results in faster allocation at the cost of reduced size of allocation.
+ *
 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 * linear order which requires O(N) search time for each CR0 and CR1 phase.
 *
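The comment above is the core of CR1.5: trim the goal length one order at a time and look the trimmed length up in the per-order average-fragment-size lists. Below is a minimal userspace sketch of that idea, not kernel code; the bucket array, helper names and trim limit are made-up stand-ins for mballoc's real data structures.

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 16

/* one flag per order: "some group has average fragment size >= 1 << order" */
static bool bucket_has_group[MAX_ORDER] = { [4] = true };

/* smallest order such that (1 << order) >= len, clamped to the array size */
static int order_of(unsigned int len)
{
	int order = 0;

	while ((1u << order) < len && order < MAX_ORDER - 1)
		order++;
	return order;
}

static int find_order_by_trimming(unsigned int goal_len, int max_trim)
{
	int order = order_of(goal_len);
	int min_order = order - max_trim;

	if (min_order < 0)
		min_order = 0;

	/* trim the goal one order at a time, probing each bucket in O(1) */
	for (int i = order; i >= min_order; i--)
		if (bucket_has_group[i])
			return i;

	return -1;	/* nothing suitable: fall back to a slower CR2-style scan */
}

int main(void)
{
	/* goal of 128 clusters, allowed to trim by up to 3 orders -> order 4 here */
	printf("usable order: %d\n", find_order_by_trimming(128, 3));
	return 0;
}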
@@ -962,6 +970,91 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
		*group = grp->bb_group;
		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
	} else {
+		*new_cr = CR1_5;
+	}
+}
+
+/*
+ * We couldn't find a group in CR1, so try to find the highest free fragment
+ * order we have and proactively trim the goal request length to that order to
+ * find a suitable group faster.
+ *
+ * This optimizes allocation speed at the cost of slightly reduced
+ * preallocations. However, we make sure that we don't trim the request too
+ * much and instead fall to CR2 in that case.
+ */
+static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac,
+		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_group_info *grp = NULL;
+	int i, order, min_order;
+	unsigned long num_stripe_clusters = 0;
+
+	if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) {
+		if (sbi->s_mb_stats)
+			atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions);
+	}
+
+	/*
+	 * mb_avg_fragment_size_order() returns order in a way that makes
+	 * retrieving back the length using (1 << order) inaccurate. Hence, use
+	 * fls() instead since we need to know the actual length while modifying
+	 * goal length.
+	 */
+	order = fls(ac->ac_g_ex.fe_len);
+	min_order = order - sbi->s_mb_cr1_5_max_trim_order;
+	if (min_order < 0)
+		min_order = 0;
+
+	if (1 << min_order < ac->ac_o_ex.fe_len)
+		min_order = fls(ac->ac_o_ex.fe_len) + 1;
+
+	if (sbi->s_stripe > 0) {
+		/*
+		 * We are assuming that stripe size is always a multiple of
+		 * cluster ratio, otherwise __ext4_fill_super exits early.
+		 */
+		num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
+		if (1 << min_order < num_stripe_clusters)
+			min_order = fls(num_stripe_clusters);
+	}
+
+	for (i = order; i >= min_order; i--) {
+		int frag_order;
+		/*
+		 * Scale down goal len to make sure we find something
+		 * in the free fragments list. Basically, reduce
+		 * preallocations.
+		 */
+		ac->ac_g_ex.fe_len = 1 << i;
+
+		if (num_stripe_clusters > 0) {
+			/*
+			 * Try to round up the adjusted goal to a multiple of
+			 * stripe size (in cluster units) for efficiency.
+			 *
+			 * XXX: Is s->stripe always a power of 2? In that case
+			 * we can use the faster round_up() variant.
+			 */
+			ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
+						     num_stripe_clusters);
+		}
+
+		frag_order = mb_avg_fragment_size_order(ac->ac_sb,
+							ac->ac_g_ex.fe_len);
+
+		grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
+		if (grp)
+			break;
+	}
+
+	if (grp) {
+		*group = grp->bb_group;
+		ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED;
+	} else {
+		/* Reset goal length to original goal length before falling into CR2 */
+		ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
		*new_cr = CR2;
	}
}
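One detail worth calling out in the function above: when a stripe is configured, the trimmed goal is rounded up to a multiple of the stripe size in clusters with roundup(), which handles non-power-of-two multiples. A small standalone sketch of that arithmetic follows; the values are invented, and ROUNDUP simply mirrors the generic ((x + y - 1) / y) * y form.

#include <stdio.h>

/* same ((x + y - 1) / y) * y arithmetic that roundup() uses for a possibly
 * non-power-of-two multiple */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long num_stripe_clusters = 16;	/* e.g. a 64K stripe with 4K clusters */
	unsigned long trimmed_goal = 1UL << 3;	/* goal after trimming to order 3 */

	/* align the trimmed goal to the stripe so aligned scanning still applies */
	unsigned long goal = ROUNDUP(trimmed_goal, num_stripe_clusters);

	printf("trimmed=%lu aligned=%lu\n", trimmed_goal, goal);	/* 8 -> 16 */
	return 0;
}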
@@ -1028,6 +1121,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
		ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
	} else if (*new_cr == CR1) {
		ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+	} else if (*new_cr == CR1_5) {
+		ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups);
	} else {
		/*
		 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -2351,7 +2446,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,

		if (ac->ac_criteria < CR2) {
			/*
-			 * In CR1, we are sure that this group will
+			 * In CR1 and CR1_5, we are sure that this group will
			 * have a large enough continuous free extent, so skip
			 * over the smaller free extents
			 */
@@ -2483,6 +2578,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,

		return true;
	case CR1:
+	case CR1_5:
		if ((free / fragments) >= ac->ac_g_ex.fe_len)
			return true;
		break;
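For CR1 and CR1.5 the good-group test above reduces to the average-fragment-size check: free clusters divided by the number of free extents must be at least the (possibly trimmed) goal length. A tiny standalone example of that arithmetic with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned int free_clusters = 1024;	/* free clusters in the group */
	unsigned int fragments = 8;		/* number of free extents */
	unsigned int goal_len = 96;		/* (possibly trimmed) goal length */

	/* same shape as the CR1/CR1_5 case above: average fragment size >= goal */
	printf("good group: %d\n", free_clusters / fragments >= goal_len);	/* 128 >= 96 -> 1 */
	return 0;
}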
@@ -2747,7 +2843,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
			 * spend a lot of time loading imperfect groups
			 */
			if ((prefetch_grp == group) &&
-			    (cr > CR1 ||
+			    (cr > CR1_5 ||
			     prefetch_ios < sbi->s_mb_prefetch_limit)) {
				nr = sbi->s_mb_prefetch;
				if (ext4_has_feature_flex_bg(sb)) {
@@ -2787,7 +2883,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
			ac->ac_groups_scanned++;
			if (cr == CR0)
				ext4_mb_simple_scan_group(ac, &e4b);
-			else if (cr == CR1 && sbi->s_stripe &&
+			else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe &&
				 !(ac->ac_g_ex.fe_len %
				   EXT4_B2C(sbi, sbi->s_stripe)))
				ext4_mb_scan_aligned(ac, &e4b);
@@ -2803,6 +2899,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
		/* Processed all groups and haven't found blocks */
		if (sbi->s_mb_stats && i == ngroups)
			atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+		if (i == ngroups && ac->ac_criteria == CR1_5)
+			/* Reset goal length to original goal length before
+			 * falling into CR2 */
+			ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
	}

	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2972,6 +3073,16 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
	seq_printf(seq, "\t\tbad_suggestions: %u\n",
		   atomic_read(&sbi->s_bal_cr1_bad_suggestions));

+	seq_puts(seq, "\tcr1.5_stats:\n");
+	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5]));
+	seq_printf(seq, "\t\tgroups_considered: %llu\n",
+		   atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5]));
+	seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5]));
+	seq_printf(seq, "\t\tuseless_loops: %llu\n",
+		   atomic64_read(&sbi->s_bal_cX_failed[CR1_5]));
+	seq_printf(seq, "\t\tbad_suggestions: %u\n",
+		   atomic_read(&sbi->s_bal_cr1_5_bad_suggestions));
+
	seq_puts(seq, "\tcr2_stats:\n");
	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2]));
	seq_printf(seq, "\t\tgroups_considered: %llu\n",
@@ -3489,6 +3600,8 @@ int ext4_mb_init(struct super_block *sb)
	sbi->s_mb_stats = MB_DEFAULT_STATS;
	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+	sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER;
+
	/*
	 * The default group preallocation is 512, which for 4k block
	 * sizes translates to 2 megabytes. However for bigalloc file
@@ -4392,6 +4505,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
	 * placement or satisfy big request as is */
	ac->ac_g_ex.fe_logical = start;
	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+	ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;

	/* define goal start in order to merge */
	if (ar->pright && (ar->lright == (start + size)) &&
@@ -4435,8 +4549,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
		    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
			atomic_inc(&sbi->s_bal_goals);
-		if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len)
+		/* did we allocate as much as normalizer originally wanted? */
+		if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
			atomic_inc(&sbi->s_bal_len_goals);
+
		if (ac->ac_found > sbi->s_mb_max_to_scan)
			atomic_inc(&sbi->s_bal_breaks);
	}
@@ -4921,7 +5037,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)

	pa = ac->ac_pa;

-	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+	if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
		int new_bex_start;
		int new_bex_end;

@@ -4936,14 +5052,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
		 * fragmentation in check while ensuring logical range of best
		 * extent doesn't overflow out of goal extent:
		 *
-		 * 1. Check if best ex can be kept at end of goal and still
-		 *    cover original start
+		 * 1. Check if best ex can be kept at end of goal (before
+		 *    CR1.5 trimmed it) and still cover original start
		 * 2. Else, check if best ex can be kept at start of goal and
		 *    still cover original start
		 * 3. Else, keep the best ex at start of original request.
		 */
		new_bex_end = ac->ac_g_ex.fe_logical +
-			EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+			EXT4_C2B(sbi, ac->ac_orig_goal_len);
		new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
		if (ac->ac_o_ex.fe_logical >= new_bex_start)
			goto adjust_bex;
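The 1/2/3 cases in the comment above decide where the best-found extent sits inside the original (untrimmed) goal range. Below is a standalone sketch of that placement logic, not kernel code; the names are illustrative and all offsets and lengths are in file-logical blocks.

#include <stdio.h>

static void place_best_extent(unsigned long goal_start, unsigned long orig_goal_len,
			      unsigned long orig_start, unsigned long best_len,
			      unsigned long *bex_start, unsigned long *bex_end)
{
	/* 1. try keeping the best extent at the end of the untrimmed goal range */
	unsigned long end = goal_start + orig_goal_len;
	unsigned long start = end - best_len;

	if (orig_start >= start)
		goto done;

	/* 2. else try keeping it at the start of the goal range */
	start = goal_start;
	end = start + best_len;
	if (orig_start < end)
		goto done;

	/* 3. else anchor it at the start of the original request */
	start = orig_start;
	end = start + best_len;
done:
	*bex_start = start;
	*bex_end = end;
}

int main(void)
{
	unsigned long s, e;

	/* goal [0, 64), original request starts at 50, best extent is 16 blocks */
	place_best_extent(0, 64, 50, 16, &s, &e);
	printf("best extent placed at [%lu, %lu)\n", s, e);	/* case 1: [48, 64) */
	return 0;
}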
@@ -4964,7 +5080,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
		BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
-				      EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
+				      EXT4_C2B(sbi, ac->ac_orig_goal_len)));
	}

	pa->pa_lstart = ac->ac_b_ex.fe_logical;
@@ -5584,6 +5700,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
	ac->ac_o_ex.fe_start = block;
	ac->ac_o_ex.fe_len = len;
	ac->ac_g_ex = ac->ac_o_ex;
+	ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
	ac->ac_flags = ar->flags;

	/* we have to define context: we'll work with a file or