Skip to content

Commit 353e6be

Browse files
biger410jfvogel
authored and committed
md: fix deadlock caused by sysfs_notify
The following deadlock was captured. The first process is holding 'kernfs_mutex' and is hung waiting on io. The io was staged in 'r1conf.pending_bio_list' of the raid1 device; this pending bio list would be flushed by the second process, 'md127_raid1', but that process was itself hung waiting for 'kernfs_mutex'.

Replacing sysfs_notify() with sysfs_notify_dirent_safe() fixes it. As the second trace shows, sysfs_notify() looks the attribute up through kernfs_find_and_get_ns(), which takes 'kernfs_mutex'; sysfs_notify_dirent_safe() instead notifies a kernfs node obtained in advance with sysfs_get_dirent_safe(), so no lookup happens in the io path. There were other sysfs_notify() calls invoked from the io path; all of them are removed as well.

PID: 40430 TASK: ffff8ee9c8c65c40 CPU: 29 COMMAND: "probe_file"
#0 [ffffb87c4df37260] __schedule at ffffffff9a8678ec
#1 [ffffb87c4df372f8] schedule at ffffffff9a867f06
#2 [ffffb87c4df37310] io_schedule at ffffffff9a0c73e6
#3 [ffffb87c4df37328] __dta___xfs_iunpin_wait_3443 at ffffffffc03a4057 [xfs]
#4 [ffffb87c4df373a0] xfs_iunpin_wait at ffffffffc03a6c79 [xfs]
#5 [ffffb87c4df373b0] __dta_xfs_reclaim_inode_3357 at ffffffffc039a46c [xfs]
#6 [ffffb87c4df37400] xfs_reclaim_inodes_ag at ffffffffc039a8b6 [xfs]
#7 [ffffb87c4df37590] xfs_reclaim_inodes_nr at ffffffffc039bb33 [xfs]
#8 [ffffb87c4df375b0] xfs_fs_free_cached_objects at ffffffffc03af0e9 [xfs]
#9 [ffffb87c4df375c0] super_cache_scan at ffffffff9a287ec7
#10 [ffffb87c4df37618] shrink_slab at ffffffff9a1efd93
#11 [ffffb87c4df37700] shrink_node at ffffffff9a1f5968
#12 [ffffb87c4df37788] do_try_to_free_pages at ffffffff9a1f5ea2
#13 [ffffb87c4df377f0] try_to_free_mem_cgroup_pages at ffffffff9a1f6445
#14 [ffffb87c4df37880] try_charge at ffffffff9a26cc5f
#15 [ffffb87c4df37920] memcg_kmem_charge_memcg at ffffffff9a270f6a
#16 [ffffb87c4df37958] new_slab at ffffffff9a251430
#17 [ffffb87c4df379c0] ___slab_alloc at ffffffff9a251c85
#18 [ffffb87c4df37a80] __slab_alloc at ffffffff9a25635d
#19 [ffffb87c4df37ac0] kmem_cache_alloc at ffffffff9a251f89
#20 [ffffb87c4df37b00] alloc_inode at ffffffff9a2a2b10
#21 [ffffb87c4df37b20] iget_locked at ffffffff9a2a4854
#22 [ffffb87c4df37b60] kernfs_get_inode at ffffffff9a311377
#23 [ffffb87c4df37b80] kernfs_iop_lookup at ffffffff9a311e2b
#24 [ffffb87c4df37ba8] lookup_slow at ffffffff9a290118
#25 [ffffb87c4df37c10] walk_component at ffffffff9a291e83
#26 [ffffb87c4df37c78] path_lookupat at ffffffff9a293619
#27 [ffffb87c4df37cd8] filename_lookup at ffffffff9a2953af
#28 [ffffb87c4df37de8] user_path_at_empty at ffffffff9a295566
#29 [ffffb87c4df37e10] vfs_statx at ffffffff9a289787
#30 [ffffb87c4df37e70] SYSC_newlstat at ffffffff9a289d5d
#31 [ffffb87c4df37f18] sys_newlstat at ffffffff9a28a60e
#32 [ffffb87c4df37f28] do_syscall_64 at ffffffff9a003949
#33 [ffffb87c4df37f50] entry_SYSCALL_64_after_hwframe at ffffffff9aa001ad
RIP: 00007f617a5f2905 RSP: 00007f607334f838 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: 00007f6064044b20 RCX: 00007f617a5f2905
RDX: 00007f6064044b20 RSI: 00007f6064044b20 RDI: 00007f6064005890
RBP: 00007f6064044aa0 R8: 0000000000000030 R9: 000000000000011c
R10: 0000000000000013 R11: 0000000000000246 R12: 00007f606417e6d0
R13: 00007f6064044aa0 R14: 00007f6064044b10 R15: 00000000ffffffff
ORIG_RAX: 0000000000000006 CS: 0033 SS: 002b

PID: 927 TASK: ffff8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1"
#0 [ffffb87c4df07b28] __schedule at ffffffff9a8678ec
#1 [ffffb87c4df07bc0] schedule at ffffffff9a867f06
#2 [ffffb87c4df07bd8] schedule_preempt_disabled at ffffffff9a86825e
#3 [ffffb87c4df07be8] __mutex_lock at ffffffff9a869bcc
#4 [ffffb87c4df07ca0] __mutex_lock_slowpath at ffffffff9a86a013
#5 [ffffb87c4df07cb0] mutex_lock at ffffffff9a86a04f
#6 [ffffb87c4df07cc8] kernfs_find_and_get_ns at ffffffff9a311d83
#7 [ffffb87c4df07cf0] sysfs_notify at ffffffff9a314b3a
#8 [ffffb87c4df07d18] md_update_sb at ffffffff9a688696
#9 [ffffb87c4df07d98] md_update_sb at ffffffff9a6886d5
#10 [ffffb87c4df07da8] md_check_recovery at ffffffff9a68ad9c
#11 [ffffb87c4df07dd0] raid1d at ffffffffc01f0375 [raid1]
#12 [ffffb87c4df07ea0] md_thread at ffffffff9a680348
#13 [ffffb87c4df07f08] kthread at ffffffff9a0b8005
#14 [ffffb87c4df07f50] ret_from_fork at ffffffff9aa00344

Signed-off-by: Junxiao Bi <[email protected]>
Signed-off-by: Song Liu <[email protected]>
(cherry picked from commit e1a86db)
Orabug: 31682031
Signed-off-by: Junxiao Bi <[email protected]>
Reviewed-by: Joe Jin <[email protected]>
1 parent 68f57ee commit 353e6be

File tree

5 files changed

+42
-20
lines changed

5 files changed

+42
-20
lines changed

drivers/md/md-bitmap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1636,7 +1636,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
16361636
s += blocks;
16371637
}
16381638
bitmap->last_end_sync = jiffies;
1639-
sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
1639+
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
16401640
}
16411641
EXPORT_SYMBOL(md_bitmap_cond_end_sync);
16421642

drivers/md/md.c

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2346,6 +2346,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
23462346
if (sysfs_create_link(&rdev->kobj, ko, "block"))
23472347
/* failure here is OK */;
23482348
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2349+
rdev->sysfs_unack_badblocks =
2350+
sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2351+
rdev->sysfs_badblocks =
2352+
sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
23492353

23502354
list_add_rcu(&rdev->same_set, &mddev->disks);
23512355
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
@@ -2379,7 +2383,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
23792383
rdev->mddev = NULL;
23802384
sysfs_remove_link(&rdev->kobj, "block");
23812385
sysfs_put(rdev->sysfs_state);
2386+
sysfs_put(rdev->sysfs_unack_badblocks);
2387+
sysfs_put(rdev->sysfs_badblocks);
23822388
rdev->sysfs_state = NULL;
2389+
rdev->sysfs_unack_badblocks = NULL;
2390+
rdev->sysfs_badblocks = NULL;
23832391
rdev->badblocks.count = 0;
23842392
/* We need to delay this, otherwise we can deadlock when
23852393
* writing to 'remove' to "dev/state". We also need
@@ -2724,7 +2732,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
27242732
goto repeat;
27252733
wake_up(&mddev->sb_wait);
27262734
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2727-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2735+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
27282736

27292737
rdev_for_each(rdev, mddev) {
27302738
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
@@ -3978,7 +3986,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
39783986
mddev_resume(mddev);
39793987
if (!mddev->thread)
39803988
md_update_sb(mddev, 1);
3981-
sysfs_notify(&mddev->kobj, NULL, "level");
3989+
sysfs_notify_dirent_safe(mddev->sysfs_level);
39823990
md_new_event(mddev);
39833991
rv = len;
39843992
out_unlock:
@@ -4717,7 +4725,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
47174725
}
47184726
if (err)
47194727
return err;
4720-
sysfs_notify(&mddev->kobj, NULL, "degraded");
4728+
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
47214729
} else {
47224730
if (cmd_match(page, "check"))
47234731
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -5371,6 +5379,13 @@ static void md_free(struct kobject *ko)
53715379

53725380
if (mddev->sysfs_state)
53735381
sysfs_put(mddev->sysfs_state);
5382+
if (mddev->sysfs_completed)
5383+
sysfs_put(mddev->sysfs_completed);
5384+
if (mddev->sysfs_degraded)
5385+
sysfs_put(mddev->sysfs_degraded);
5386+
if (mddev->sysfs_level)
5387+
sysfs_put(mddev->sysfs_level);
5388+
53745389

53755390
if (mddev->gendisk)
53765391
del_gendisk(mddev->gendisk);
@@ -5534,6 +5549,9 @@ static int md_alloc(dev_t dev, char *name)
55345549
if (!error && mddev->kobj.sd) {
55355550
kobject_uevent(&mddev->kobj, KOBJ_ADD);
55365551
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5552+
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
5553+
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
5554+
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
55375555
}
55385556
mddev_put(mddev);
55395557
return error;
@@ -5888,7 +5906,7 @@ static int do_md_run(struct mddev *mddev)
58885906
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
58895907
sysfs_notify_dirent_safe(mddev->sysfs_state);
58905908
sysfs_notify_dirent_safe(mddev->sysfs_action);
5891-
sysfs_notify(&mddev->kobj, NULL, "degraded");
5909+
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
58925910
out:
58935911
clear_bit(MD_NOT_READY, &mddev->flags);
58945912
return err;
@@ -8605,7 +8623,7 @@ void md_do_sync(struct md_thread *thread)
86058623
} else
86068624
mddev->curr_resync = 3; /* no longer delayed */
86078625
mddev->curr_resync_completed = j;
8608-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8626+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
86098627
md_new_event(mddev);
86108628
update_time = jiffies;
86118629

@@ -8633,7 +8651,7 @@ void md_do_sync(struct md_thread *thread)
86338651
mddev->recovery_cp = j;
86348652
update_time = jiffies;
86358653
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8636-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8654+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
86378655
}
86388656

86398657
while (j >= mddev->resync_max &&
@@ -8740,7 +8758,7 @@ void md_do_sync(struct md_thread *thread)
87408758
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
87418759
mddev->curr_resync > 3) {
87428760
mddev->curr_resync_completed = mddev->curr_resync;
8743-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8761+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
87448762
}
87458763
mddev->pers->sync_request(mddev, max_sectors, &skipped);
87468764

@@ -8870,7 +8888,7 @@ static int remove_and_add_spares(struct mddev *mddev,
88708888
}
88718889

88728890
if (removed && mddev->kobj.sd)
8873-
sysfs_notify(&mddev->kobj, NULL, "degraded");
8891+
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
88748892

88758893
if (this && removed)
88768894
goto no_add;
@@ -9154,8 +9172,7 @@ void md_reap_sync_thread(struct mddev *mddev)
91549172
/* success...*/
91559173
/* activate any spares */
91569174
if (mddev->pers->spare_active(mddev)) {
9157-
sysfs_notify(&mddev->kobj, NULL,
9158-
"degraded");
9175+
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
91599176
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
91609177
}
91619178
}
@@ -9245,8 +9262,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
92459262
if (rv == 0) {
92469263
/* Make sure they get written out promptly */
92479264
if (test_bit(ExternalBbl, &rdev->flags))
9248-
sysfs_notify(&rdev->kobj, NULL,
9249-
"unacknowledged_bad_blocks");
9265+
sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
92509266
sysfs_notify_dirent_safe(rdev->sysfs_state);
92519267
set_mask_bits(&mddev->sb_flags, 0,
92529268
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
@@ -9267,7 +9283,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
92679283
s += rdev->data_offset;
92689284
rv = badblocks_clear(&rdev->badblocks, s, sectors);
92699285
if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9270-
sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9286+
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
92719287
return rv;
92729288
}
92739289
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
@@ -9491,7 +9507,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
94919507
if (rdev->recovery_offset == MaxSector &&
94929508
!test_bit(In_sync, &rdev->flags) &&
94939509
mddev->pers->spare_active(mddev))
9494-
sysfs_notify(&mddev->kobj, NULL, "degraded");
9510+
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
94959511

94969512
put_page(swapout);
94979513
return 0;

drivers/md/md.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,10 @@ struct md_rdev {
121121

122122
struct kernfs_node *sysfs_state; /* handle for 'state'
123123
* sysfs entry */
124-
124+
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */
125+
struct kernfs_node *sysfs_unack_badblocks;
126+
/* handle for 'bad_blocks' sysfs dentry */
127+
struct kernfs_node *sysfs_badblocks;
125128
struct badblocks badblocks;
126129

127130
struct {
@@ -414,6 +417,9 @@ struct mddev {
414417
* file in sysfs.
415418
*/
416419
struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
420+
struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
421+
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
422+
struct kernfs_node *sysfs_level; /*handle for 'level' */
417423

418424
struct work_struct del_work; /* used for delayed sysfs removal */
419425

drivers/md/raid10.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4454,7 +4454,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
44544454
sector_nr = conf->reshape_progress;
44554455
if (sector_nr) {
44564456
mddev->curr_resync_completed = sector_nr;
4457-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4457+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
44584458
*skipped = 1;
44594459
return sector_nr;
44604460
}

drivers/md/raid5.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5796,7 +5796,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
57965796
sector_div(sector_nr, new_data_disks);
57975797
if (sector_nr) {
57985798
mddev->curr_resync_completed = sector_nr;
5799-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5799+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
58005800
*skipped = 1;
58015801
retn = sector_nr;
58025802
goto finish;
@@ -5910,7 +5910,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
59105910
conf->reshape_safe = mddev->reshape_position;
59115911
spin_unlock_irq(&conf->device_lock);
59125912
wake_up(&conf->wait_for_overlap);
5913-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5913+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
59145914
}
59155915

59165916
INIT_LIST_HEAD(&stripes);
@@ -6017,7 +6017,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
60176017
conf->reshape_safe = mddev->reshape_position;
60186018
spin_unlock_irq(&conf->device_lock);
60196019
wake_up(&conf->wait_for_overlap);
6020-
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6020+
sysfs_notify_dirent_safe(mddev->sysfs_completed);
60216021
}
60226022
ret:
60236023
return retn;

0 commit comments

Comments
 (0)