@@ -1006,24 +1006,28 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
 	 * fls() instead since we need to know the actual length while modifying
 	 * goal length.
 	 */
-	order = fls(ac->ac_g_ex.fe_len);
+	order = fls(ac->ac_g_ex.fe_len) - 1;
 	min_order = order - sbi->s_mb_best_avail_max_trim_order;
 	if (min_order < 0)
 		min_order = 0;
 
-	if (1 << min_order < ac->ac_o_ex.fe_len)
-		min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
 	if (sbi->s_stripe > 0) {
 		/*
 		 * We are assuming that stripe size is always a multiple of
 		 * cluster ratio otherwise __ext4_fill_super exists early.
 		 */
 		num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
 		if (1 << min_order < num_stripe_clusters)
-			min_order = fls(num_stripe_clusters);
+			/*
+			 * We consider 1 order less because later we round
+			 * up the goal len to num_stripe_clusters
+			 */
+			min_order = fls(num_stripe_clusters) - 1;
 	}
 
+	if (1 << min_order < ac->ac_o_ex.fe_len)
+		min_order = fls(ac->ac_o_ex.fe_len);
+
 	for (i = order; i >= min_order; i--) {
 		int frag_order;
 		/*
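For readers following the first hunk: `fls()` returns the 1-based index of the most significant set bit (and 0 for an input of 0), so `fls(len) - 1` is the largest order with `1 << order <= len`. The old `order = fls(ac->ac_g_ex.fe_len)` therefore started the scan one order too high, and the old placement of the `ac_o_ex.fe_len` check could likewise push `min_order` one order past what the original request needs. The small userspace sketch below is not part of the patch; `fls_demo()` is only a stand-in for the kernel helper to illustrate the arithmetic.

#include <stdio.h>

/*
 * Userspace stand-in for the kernel's fls(): returns the 1-based
 * position of the most significant set bit, or 0 for an input of 0.
 * (The kernel provides fls() in its bitops headers; this is only a
 * demo of the arithmetic, not the kernel implementation.)
 */
static int fls_demo(unsigned int x)
{
	int pos = 0;

	while (x) {
		pos++;
		x >>= 1;
	}
	return pos;
}

int main(void)
{
	unsigned int goal_len = 8;	/* e.g. a goal of 8 clusters */

	/* Old code: order = fls(8) = 4, but 1 << 4 = 16 overshoots the goal. */
	printf("fls(%u)     = %d\n", goal_len, fls_demo(goal_len));
	/* New code: order = fls(8) - 1 = 3, and 1 << 3 = 8 fits exactly. */
	printf("fls(%u) - 1 = %d\n", goal_len, fls_demo(goal_len) - 1);
	return 0;
}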
@@ -4761,56 +4765,160 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
-	struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
-	ext4_lblk_t tmp_pa_start, tmp_pa_end;
+	struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+	loff_t tmp_pa_end;
 	struct rb_node *iter;
 	ext4_fsblk_t goal_block;
 
 	/* only data can be preallocated */
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return false;
 
-	/* first, try per-file preallocation */
+	/*
+	 * first, try per-file preallocation by searching the inode pa rbtree.
+	 *
+	 * Here, we can't do a direct traversal of the tree because
+	 * ext4_mb_discard_group_preallocation() can paralelly mark the pa
+	 * deleted and that can cause direct traversal to skip some entries.
+	 */
 	read_lock(&ei->i_prealloc_lock);
+
+	if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+		goto try_group_pa;
+	}
+
+	/*
+	 * Step 1: Find a pa with logical start immediately adjacent to the
+	 * original logical start. This could be on the left or right.
+	 *
+	 * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+	 */
 	for (iter = ei->i_prealloc_node.rb_node; iter;
 	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
-					    tmp_pa_start, iter)) {
+					    tmp_pa->pa_lstart, iter)) {
 		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
 				  pa_node.inode_node);
+	}
 
-		/* all fields in this condition don't change,
-		 * so we can skip locking for them */
-		tmp_pa_start = tmp_pa->pa_lstart;
-		tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
-		/* original request start doesn't lie in this PA */
-		if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
-		    ac->ac_o_ex.fe_logical >= tmp_pa_end)
-			continue;
+	/*
+	 * Step 2: The adjacent pa might be to the right of logical start, find
+	 * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+	 * logical start is towards the left of original request's logical start
+	 */
+	if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+		struct rb_node *tmp;
+		tmp = rb_prev(&tmp_pa->pa_node.inode_node);
 
-		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
-		    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
-		     EXT4_MAX_BLOCK_FILE_PHYS)) {
+		if (tmp) {
+			tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+					    pa_node.inode_node);
+		} else {
 			/*
-			 * Since PAs don't overlap, we won't find any
-			 * other PA to satisfy this.
+			 * If there is no adjacent pa to the left then finding
+			 * an overlapping pa is not possible hence stop searching
+			 * inode pa tree
 			 */
-			break;
+			goto try_group_pa;
 		}
+	}
+
+	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
 
-		/* found preallocated blocks, use them */
+	/*
+	 * Step 3: If the left adjacent pa is deleted, keep moving left to find
+	 * the first non deleted adjacent pa. After this step we should have a
+	 * valid tmp_pa which is guaranteed to be non deleted.
+	 */
+	for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+		if (!iter) {
+			/*
+			 * no non deleted left adjacent pa, so stop searching
+			 * inode pa tree
+			 */
+			goto try_group_pa;
+		}
+		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+				  pa_node.inode_node);
 		spin_lock(&tmp_pa->pa_lock);
-		if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
-		    likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
-			atomic_inc(&tmp_pa->pa_count);
-			ext4_mb_use_inode_pa(ac, tmp_pa);
+		if (tmp_pa->pa_deleted == 0) {
+			/*
+			 * We will keep holding the pa_lock from
+			 * this point on because we don't want group discard
+			 * to delete this pa underneath us. Since group
+			 * discard is anyways an ENOSPC operation it
+			 * should be okay for it to wait a few more cycles.
+			 */
+			break;
+		} else {
 			spin_unlock(&tmp_pa->pa_lock);
-			read_unlock(&ei->i_prealloc_lock);
-			return true;
 		}
+	}
+
+	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+	BUG_ON(tmp_pa->pa_deleted == 1);
+
+	/*
+	 * Step 4: We now have the non deleted left adjacent pa. Only this
+	 * pa can possibly satisfy the request hence check if it overlaps
+	 * original logical start and stop searching if it doesn't.
+	 */
+	tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+	if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
 		spin_unlock(&tmp_pa->pa_lock);
+		goto try_group_pa;
+	}
+
+	/* non-extent files can't have physical blocks past 2^32 */
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+	    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+	     EXT4_MAX_BLOCK_FILE_PHYS)) {
+		/*
+		 * Since PAs don't overlap, we won't find any other PA to
+		 * satisfy this.
+		 */
+		spin_unlock(&tmp_pa->pa_lock);
+		goto try_group_pa;
+	}
+
+	if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+		atomic_inc(&tmp_pa->pa_count);
+		ext4_mb_use_inode_pa(ac, tmp_pa);
+		spin_unlock(&tmp_pa->pa_lock);
+		read_unlock(&ei->i_prealloc_lock);
+		return true;
+	} else {
+		/*
+		 * We found a valid overlapping pa but couldn't use it because
+		 * it had no free blocks. This should ideally never happen
+		 * because:
+		 *
+		 * 1. When a new inode pa is added to rbtree it must have
+		 *    pa_free > 0 since otherwise we won't actually need
+		 *    preallocation.
+		 *
+		 * 2. An inode pa that is in the rbtree can only have it's
+		 *    pa_free become zero when another thread calls:
+		 *      ext4_mb_new_blocks
+		 *       ext4_mb_use_preallocated
+		 *        ext4_mb_use_inode_pa
+		 *
+		 * 3. Further, after the above calls make pa_free == 0, we will
+		 *    immediately remove it from the rbtree in:
+		 *      ext4_mb_new_blocks
+		 *       ext4_mb_release_context
+		 *        ext4_mb_put_pa
+		 *
+		 * 4. Since the pa_free becoming 0 and pa_free getting removed
+		 *    from tree both happen in ext4_mb_new_blocks, which is always
+		 *    called with i_data_sem held for data allocations, we can be
+		 *    sure that another process will never see a pa in rbtree with
+		 *    pa_free == 0.
+		 */
+		WARN_ON_ONCE(tmp_pa->pa_free == 0);
 	}
+	spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
 	read_unlock(&ei->i_prealloc_lock);
 
 	/* can we use group allocation? */
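To make the control flow of the rewritten search easier to follow, here is a simplified, hypothetical userspace model of the four steps above: find the preallocation whose logical start is left-adjacent to the request, skip entries already marked deleted by a concurrent group discard, and only then test the surviving candidate for overlap. A sorted array stands in for the pa_lstart-keyed rbtree, and all names (`demo_pa`, `find_left_adjacent`) are illustrative, not kernel APIs.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical, simplified stand-in for struct ext4_prealloc_space. */
struct demo_pa {
	unsigned long lstart;	/* logical start of the preallocation  */
	unsigned long len;	/* length in blocks                    */
	bool deleted;		/* set asynchronously by group discard */
};

/*
 * Steps 1+2: find the right-most PA whose lstart is <= goal.
 * Step 3:    keep moving left while that PA is marked deleted.
 * Step 4:    the caller still has to check that goal < lstart + len.
 * Returns an index into pas[], or -1 if no candidate exists.
 */
static int find_left_adjacent(const struct demo_pa *pas, int n, unsigned long goal)
{
	int i;

	for (i = n - 1; i >= 0; i--)
		if (pas[i].lstart <= goal && !pas[i].deleted)
			return i;
	return -1;	/* nothing usable to the left: fall back to group PA */
}

int main(void)
{
	struct demo_pa pas[] = {
		{ .lstart = 0,   .len = 32, .deleted = false },
		{ .lstart = 100, .len = 16, .deleted = true  },	/* being discarded */
		{ .lstart = 200, .len = 64, .deleted = false },
	};
	unsigned long goal = 110;
	int i = find_left_adjacent(pas, 3, goal);

	if (i >= 0 && goal < pas[i].lstart + pas[i].len)
		printf("goal %lu overlaps PA starting at %lu\n", goal, pas[i].lstart);
	else
		printf("goal %lu: no usable inode PA, try group PA\n", goal);
	return 0;
}

With this sample data the only non-deleted PA to the left of the goal does not overlap it, so the search falls through to the group preallocation path, mirroring the `goto try_group_pa` exits in the patch.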