
Commit 15b593b

Merge tag 'ext4_for_linus-6.5-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 fixes from Ted Ts'o:
 "Bug and regression fixes for 6.5-rc3 for ext4's mballoc and jbd2's
  checkpoint code"

* tag 'ext4_for_linus-6.5-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix rbtree traversal bug in ext4_mb_use_preallocated
  ext4: fix off by one issue in ext4_mb_choose_next_group_best_avail()
  ext4: correct inline offset when handling xattrs in inode body
  jbd2: remove __journal_try_to_free_buffer()
  jbd2: fix a race when checking checkpoint buffer busy
  jbd2: Fix wrongly judgement for buffer head removing while doing checkpoint
  jbd2: remove journal_clean_one_cp_list()
  jbd2: remove t_checkpoint_io_list
  jbd2: recheck chechpointing non-dirty buffer
2 parents 8266f53 + 9d3de7e

File tree

7 files changed: +262 -263 lines


fs/ext4/mballoc.c

Lines changed: 140 additions & 32 deletions
```diff
@@ -1006,24 +1006,28 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
 	 * fls() instead since we need to know the actual length while modifying
 	 * goal length.
 	 */
-	order = fls(ac->ac_g_ex.fe_len);
+	order = fls(ac->ac_g_ex.fe_len) - 1;
 	min_order = order - sbi->s_mb_best_avail_max_trim_order;
 	if (min_order < 0)
 		min_order = 0;
 
-	if (1 << min_order < ac->ac_o_ex.fe_len)
-		min_order = fls(ac->ac_o_ex.fe_len) + 1;
-
 	if (sbi->s_stripe > 0) {
 		/*
 		 * We are assuming that stripe size is always a multiple of
 		 * cluster ratio otherwise __ext4_fill_super exists early.
 		 */
 		num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
 		if (1 << min_order < num_stripe_clusters)
-			min_order = fls(num_stripe_clusters);
+			/*
+			 * We consider 1 order less because later we round
+			 * up the goal len to num_stripe_clusters
+			 */
+			min_order = fls(num_stripe_clusters) - 1;
 	}
 
+	if (1 << min_order < ac->ac_o_ex.fe_len)
+		min_order = fls(ac->ac_o_ex.fe_len);
+
 	for (i = order; i >= min_order; i--) {
 		int frag_order;
 		/*
```
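For context on the off-by-one being fixed: fls() returns the 1-based index of the most significant set bit, so for a power-of-two goal length the matching buddy order is fls(len) - 1; the old fls(len) overshot by one order. Below is a minimal userspace sketch of that arithmetic, not kernel code: the stand-in fls() is built on a GCC builtin and goal_len is an illustrative value.

```c
/*
 * Sketch of the off-by-one fixed above. For a power-of-two length len,
 * fls(len) - 1 is the order with 1 << order == len; fls(len) is one too
 * many. The kernel's fls() is approximated with __builtin_clz() here.
 */
#include <assert.h>
#include <stdio.h>

static int fls(unsigned int n)
{
	return n ? 32 - __builtin_clz(n) : 0;
}

int main(void)
{
	unsigned int goal_len = 64;	/* stand-in for ac->ac_g_ex.fe_len */

	int old_order = fls(goal_len);		/* 7: 1 << 7 == 128, too big */
	int new_order = fls(goal_len) - 1;	/* 6: 1 << 6 == 64, exact    */

	assert(1u << new_order == goal_len);
	printf("old=%d new=%d\n", old_order, new_order);
	return 0;
}
```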
```diff
@@ -4761,56 +4765,160 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
-	struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
-	ext4_lblk_t tmp_pa_start, tmp_pa_end;
+	struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
+	loff_t tmp_pa_end;
 	struct rb_node *iter;
 	ext4_fsblk_t goal_block;
 
 	/* only data can be preallocated */
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return false;
 
-	/* first, try per-file preallocation */
+	/*
+	 * first, try per-file preallocation by searching the inode pa rbtree.
+	 *
+	 * Here, we can't do a direct traversal of the tree because
+	 * ext4_mb_discard_group_preallocation() can paralelly mark the pa
+	 * deleted and that can cause direct traversal to skip some entries.
+	 */
 	read_lock(&ei->i_prealloc_lock);
+
+	if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
+		goto try_group_pa;
+	}
+
+	/*
+	 * Step 1: Find a pa with logical start immediately adjacent to the
+	 * original logical start. This could be on the left or right.
+	 *
+	 * (tmp_pa->pa_lstart never changes so we can skip locking for it).
+	 */
 	for (iter = ei->i_prealloc_node.rb_node; iter;
 	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
-					    tmp_pa_start, iter)) {
+					    tmp_pa->pa_lstart, iter)) {
 		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
 				  pa_node.inode_node);
+	}
 
-		/* all fields in this condition don't change,
-		 * so we can skip locking for them */
-		tmp_pa_start = tmp_pa->pa_lstart;
-		tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
-		/* original request start doesn't lie in this PA */
-		if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
-		    ac->ac_o_ex.fe_logical >= tmp_pa_end)
-			continue;
+	/*
+	 * Step 2: The adjacent pa might be to the right of logical start, find
+	 * the left adjacent pa. After this step we'd have a valid tmp_pa whose
+	 * logical start is towards the left of original request's logical start
+	 */
+	if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+		struct rb_node *tmp;
+		tmp = rb_prev(&tmp_pa->pa_node.inode_node);
 
-		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
-		    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
-		     EXT4_MAX_BLOCK_FILE_PHYS)) {
+		if (tmp) {
+			tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
+					    pa_node.inode_node);
+		} else {
 			/*
-			 * Since PAs don't overlap, we won't find any
-			 * other PA to satisfy this.
+			 * If there is no adjacent pa to the left then finding
+			 * an overlapping pa is not possible hence stop searching
+			 * inode pa tree
 			 */
-			break;
+			goto try_group_pa;
 		}
+	}
+
+	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
 
-		/* found preallocated blocks, use them */
+	/*
+	 * Step 3: If the left adjacent pa is deleted, keep moving left to find
+	 * the first non deleted adjacent pa. After this step we should have a
+	 * valid tmp_pa which is guaranteed to be non deleted.
+	 */
+	for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
+		if (!iter) {
+			/*
+			 * no non deleted left adjacent pa, so stop searching
+			 * inode pa tree
+			 */
+			goto try_group_pa;
+		}
+		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+				  pa_node.inode_node);
 		spin_lock(&tmp_pa->pa_lock);
-		if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
-		    likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
-			atomic_inc(&tmp_pa->pa_count);
-			ext4_mb_use_inode_pa(ac, tmp_pa);
+		if (tmp_pa->pa_deleted == 0) {
+			/*
+			 * We will keep holding the pa_lock from
+			 * this point on because we don't want group discard
+			 * to delete this pa underneath us. Since group
+			 * discard is anyways an ENOSPC operation it
+			 * should be okay for it to wait a few more cycles.
+			 */
+			break;
+		} else {
 			spin_unlock(&tmp_pa->pa_lock);
-			read_unlock(&ei->i_prealloc_lock);
-			return true;
 		}
+	}
+
+	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
+	BUG_ON(tmp_pa->pa_deleted == 1);
+
+	/*
+	 * Step 4: We now have the non deleted left adjacent pa. Only this
+	 * pa can possibly satisfy the request hence check if it overlaps
+	 * original logical start and stop searching if it doesn't.
+	 */
+	tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+	if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
 		spin_unlock(&tmp_pa->pa_lock);
+		goto try_group_pa;
+	}
+
+	/* non-extent files can't have physical blocks past 2^32 */
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+	    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+	     EXT4_MAX_BLOCK_FILE_PHYS)) {
+		/*
+		 * Since PAs don't overlap, we won't find any other PA to
+		 * satisfy this.
+		 */
+		spin_unlock(&tmp_pa->pa_lock);
+		goto try_group_pa;
+	}
+
+	if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
+		atomic_inc(&tmp_pa->pa_count);
+		ext4_mb_use_inode_pa(ac, tmp_pa);
+		spin_unlock(&tmp_pa->pa_lock);
+		read_unlock(&ei->i_prealloc_lock);
+		return true;
+	} else {
+		/*
+		 * We found a valid overlapping pa but couldn't use it because
+		 * it had no free blocks. This should ideally never happen
+		 * because:
+		 *
+		 * 1. When a new inode pa is added to rbtree it must have
+		 *    pa_free > 0 since otherwise we won't actually need
+		 *    preallocation.
+		 *
+		 * 2. An inode pa that is in the rbtree can only have it's
+		 *    pa_free become zero when another thread calls:
+		 *      ext4_mb_new_blocks
+		 *       ext4_mb_use_preallocated
+		 *        ext4_mb_use_inode_pa
+		 *
+		 * 3. Further, after the above calls make pa_free == 0, we will
+		 *    immediately remove it from the rbtree in:
+		 *      ext4_mb_new_blocks
+		 *       ext4_mb_release_context
+		 *        ext4_mb_put_pa
+		 *
+		 * 4. Since the pa_free becoming 0 and pa_free getting removed
+		 *    from tree both happen in ext4_mb_new_blocks, which is always
+		 *    called with i_data_sem held for data allocations, we can be
+		 *    sure that another process will never see a pa in rbtree with
+		 *    pa_free == 0.
+		 */
+		WARN_ON_ONCE(tmp_pa->pa_free == 0);
 	}
+	spin_unlock(&tmp_pa->pa_lock);
+try_group_pa:
 	read_unlock(&ei->i_prealloc_lock);
 
 	/* can we use group allocation? */
```
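The rewritten lookup replaces a single guarded tree walk with four explicit steps: find an entry adjacent to the request's logical start, step to the left-adjacent entry if needed, skip leftward past concurrently deleted entries (start offsets never change, so that part is safe without per-entry locks), and only then test for overlap. Below is a userspace sketch of that pattern with a sorted array standing in for the rbtree (rb_prev() becomes i - 1) and all locking omitted; the names and layout are illustrative, not ext4's.

```c
/*
 * Sketch of the step-wise search above. A concurrent discard may mark
 * entries deleted, but start offsets are immutable, so we: (1+2) find
 * the rightmost entry starting at or before the request, (3) walk left
 * past deleted entries, (4) check whether the survivor covers the
 * requested offset. Since entries are disjoint, only the nearest
 * non-deleted left neighbor can possibly overlap.
 */
#include <stdbool.h>
#include <stdio.h>

struct pa {
	long start, len;
	bool deleted;
};

/* returns index of a usable overlapping pa, or -1 ("try_group_pa") */
static int find_inode_pa(const struct pa *tree, int n, long logical)
{
	int i;

	/* Step 1+2: rightmost entry with start <= logical */
	for (i = n - 1; i >= 0 && tree[i].start > logical; i--)
		;
	/* Step 3: skip deleted entries; starts never change */
	while (i >= 0 && tree[i].deleted)
		i--;
	if (i < 0)
		return -1;	/* no candidate to our left */
	/* Step 4: only this entry can overlap, so test it directly */
	return logical < tree[i].start + tree[i].len ? i : -1;
}

int main(void)
{
	struct pa tree[] = {
		{ .start = 0,  .len = 16, .deleted = true  },
		{ .start = 16, .len = 16, .deleted = false },
		{ .start = 48, .len = 16, .deleted = true  },
	};

	printf("%d\n", find_inode_pa(tree, 3, 20));	/* 1: covered  */
	printf("%d\n", find_inode_pa(tree, 3, 40));	/* -1: in gap  */
	return 0;
}
```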

fs/ext4/xattr.c

Lines changed: 14 additions & 0 deletions
```diff
@@ -1782,6 +1782,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 			memmove(here, (void *)here + size,
 				(void *)last - (void *)here + sizeof(__u32));
 			memset(last, 0, size);
+
+			/*
+			 * Update i_inline_off - moved ibody region might contain
+			 * system.data attribute. Handling a failure here won't
+			 * cause other complications for setting an xattr.
+			 */
+			if (!is_block && ext4_has_inline_data(inode)) {
+				ret = ext4_find_inline_data_nolock(inode);
+				if (ret) {
+					ext4_warning_inode(inode,
+						"unable to update i_inline_off");
+					goto out;
+				}
+			}
 		} else if (s->not_found) {
 			/* Insert new name. */
 			size_t size = EXT4_XATTR_LEN(name_len);
```
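The underlying bug is a stale cached offset: removing an xattr entry compacts the in-inode region with memmove(), so i_inline_off, which points at the system.data entry, can be left pointing into the wrong bytes until ext4_find_inline_data_nolock() re-derives it. Here is a small userspace sketch of the failure mode, using an invented fixed-width two-entry layout rather than ext4's real xattr format.

```c
/*
 * Sketch of why the fix is needed: compacting a region with memmove()
 * invalidates any byte offset cached before the move. The layout below
 * is an illustrative stand-in, not ext4's on-disk format.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* two fixed-width 16-byte "entries"; system.data is the second */
	char ibody[32] = { 0 };
	size_t inline_off = 16;		/* cached offset of system.data */

	memcpy(ibody, "user.foo", 8);
	memcpy(ibody + 16, "system.data", 11);
	printf("before: '%s'\n", ibody + inline_off);	/* system.data */

	/* delete the first entry: compact the region and zero the tail,
	 * like the memmove()/memset() pair in ext4_xattr_set_entry() */
	memmove(ibody, ibody + 16, 16);
	memset(ibody + 16, 0, 16);
	printf("stale : '%s'\n", ibody + inline_off);	/* now empty */

	/* the fix: re-locate the entry, as the new call to
	 * ext4_find_inline_data_nolock() does for i_inline_off */
	inline_off = 0;
	printf("fixed : '%s'\n", ibody + inline_off);	/* system.data */
	return 0;
}
```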
