Skip to content

Commit b092991

Browse files
dingtianhongdavem330
authored andcommitted
bonding: Fix RTNL: assertion failed at net/core/rtnetlink.c for ab arp monitor
Veaceslav has reported and fix this problem by commit f2ebd47 (bonding: restructure locking of bond_ab_arp_probe()). According Jay's opinion, the current solution is not very well, because the notification is to indicate that the interface has actually changed state in a meaningful way, but these calls in the ab ARP monitor are internal settings of the flags to allow the ARP monitor to search for a slave to become active when there are no active slaves. The flag setting to active or backup is to permit the ARP monitor's response logic to do the right thing when deciding if the test slave (current_arp_slave) is up or not. So the best way to fix the problem is that we should not send a notification when the slave is in testing state, and check the state at the end of the monitor, if the slave's state recover, avoid to send pointless notification twice. And RTNL is really a big lock, hold it regardless the slave's state changed or not when the current_active_slave is null will loss performance (every 100ms), so we should hold it only when the slave's state changed and need to notify. I revert the old commit and add new modifications. Cc: Jay Vosburgh <[email protected]> Cc: Veaceslav Falico <[email protected]> Cc: Andy Gospodarek <[email protected]> Signed-off-by: Ding Tianhong <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 5e5b066 commit b092991

File tree

3 files changed

+55
-46
lines changed

3 files changed

+55
-46
lines changed

drivers/net/bonding/bond_3ad.c

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2130,13 +2130,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
21302130
read_unlock(&bond->lock);
21312131

21322132
if (should_notify_rtnl && rtnl_trylock()) {
2133-
bond_for_each_slave(bond, slave, iter) {
2134-
if (slave->should_notify) {
2135-
rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0,
2136-
GFP_KERNEL);
2137-
slave->should_notify = 0;
2138-
}
2139-
}
2133+
bond_slave_state_notify(bond);
21402134
rtnl_unlock();
21412135
}
21422136
queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks);

drivers/net/bonding/bond_main.c

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2623,17 +2623,17 @@ static void bond_ab_arp_commit(struct bonding *bond)
26232623

26242624
/*
26252625
* Send ARP probes for active-backup mode ARP monitor.
2626+
*
2627+
* Called with rcu_read_lock hold.
26262628
*/
26272629
static bool bond_ab_arp_probe(struct bonding *bond)
26282630
{
26292631
struct slave *slave, *before = NULL, *new_slave = NULL,
2630-
*curr_arp_slave, *curr_active_slave;
2632+
*curr_arp_slave = rcu_dereference(bond->current_arp_slave),
2633+
*curr_active_slave = rcu_dereference(bond->curr_active_slave);
26312634
struct list_head *iter;
26322635
bool found = false;
2633-
2634-
rcu_read_lock();
2635-
curr_arp_slave = rcu_dereference(bond->current_arp_slave);
2636-
curr_active_slave = rcu_dereference(bond->curr_active_slave);
2636+
bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;
26372637

26382638
if (curr_arp_slave && curr_active_slave)
26392639
pr_info("PROBE: c_arp %s && cas %s BAD\n",
@@ -2642,32 +2642,23 @@ static bool bond_ab_arp_probe(struct bonding *bond)
26422642

26432643
if (curr_active_slave) {
26442644
bond_arp_send_all(bond, curr_active_slave);
2645-
rcu_read_unlock();
2646-
return true;
2645+
return should_notify_rtnl;
26472646
}
2648-
rcu_read_unlock();
26492647

26502648
/* if we don't have a curr_active_slave, search for the next available
26512649
* backup slave from the current_arp_slave and make it the candidate
26522650
* for becoming the curr_active_slave
26532651
*/
26542652

2655-
if (!rtnl_trylock())
2656-
return false;
2657-
/* curr_arp_slave might have gone away */
2658-
curr_arp_slave = ACCESS_ONCE(bond->current_arp_slave);
2659-
26602653
if (!curr_arp_slave) {
2661-
curr_arp_slave = bond_first_slave(bond);
2662-
if (!curr_arp_slave) {
2663-
rtnl_unlock();
2664-
return true;
2665-
}
2654+
curr_arp_slave = bond_first_slave_rcu(bond);
2655+
if (!curr_arp_slave)
2656+
return should_notify_rtnl;
26662657
}
26672658

2668-
bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_NOW);
2659+
bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER);
26692660

2670-
bond_for_each_slave(bond, slave, iter) {
2661+
bond_for_each_slave_rcu(bond, slave, iter) {
26712662
if (!found && !before && IS_UP(slave->dev))
26722663
before = slave;
26732664

@@ -2686,7 +2677,7 @@ static bool bond_ab_arp_probe(struct bonding *bond)
26862677
slave->link_failure_count++;
26872678

26882679
bond_set_slave_inactive_flags(slave,
2689-
BOND_SLAVE_NOTIFY_NOW);
2680+
BOND_SLAVE_NOTIFY_LATER);
26902681

26912682
pr_info("%s: backup interface %s is now down.\n",
26922683
bond->dev->name, slave->dev->name);
@@ -2698,26 +2689,31 @@ static bool bond_ab_arp_probe(struct bonding *bond)
26982689
if (!new_slave && before)
26992690
new_slave = before;
27002691

2701-
if (!new_slave) {
2702-
rtnl_unlock();
2703-
return true;
2704-
}
2692+
if (!new_slave)
2693+
goto check_state;
27052694

27062695
new_slave->link = BOND_LINK_BACK;
2707-
bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
2696+
bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
27082697
bond_arp_send_all(bond, new_slave);
27092698
new_slave->jiffies = jiffies;
27102699
rcu_assign_pointer(bond->current_arp_slave, new_slave);
2711-
rtnl_unlock();
27122700

2713-
return true;
2701+
check_state:
2702+
bond_for_each_slave_rcu(bond, slave, iter) {
2703+
if (slave->should_notify) {
2704+
should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
2705+
break;
2706+
}
2707+
}
2708+
return should_notify_rtnl;
27142709
}
27152710

27162711
static void bond_activebackup_arp_mon(struct work_struct *work)
27172712
{
27182713
struct bonding *bond = container_of(work, struct bonding,
27192714
arp_work.work);
2720-
bool should_notify_peers = false, should_commit = false;
2715+
bool should_notify_peers = false;
2716+
bool should_notify_rtnl = false;
27212717
int delta_in_ticks;
27222718

27232719
delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
@@ -2726,11 +2722,12 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
27262722
goto re_arm;
27272723

27282724
rcu_read_lock();
2725+
27292726
should_notify_peers = bond_should_notify_peers(bond);
2730-
should_commit = bond_ab_arp_inspect(bond);
2731-
rcu_read_unlock();
27322727

2733-
if (should_commit) {
2728+
if (bond_ab_arp_inspect(bond)) {
2729+
rcu_read_unlock();
2730+
27342731
/* Race avoidance with bond_close flush of workqueue */
27352732
if (!rtnl_trylock()) {
27362733
delta_in_ticks = 1;
@@ -2739,23 +2736,28 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
27392736
}
27402737

27412738
bond_ab_arp_commit(bond);
2739+
27422740
rtnl_unlock();
2741+
rcu_read_lock();
27432742
}
27442743

2745-
if (!bond_ab_arp_probe(bond)) {
2746-
/* rtnl locking failed, re-arm */
2747-
delta_in_ticks = 1;
2748-
should_notify_peers = false;
2749-
}
2744+
should_notify_rtnl = bond_ab_arp_probe(bond);
2745+
rcu_read_unlock();
27502746

27512747
re_arm:
27522748
if (bond->params.arp_interval)
27532749
queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
27542750

2755-
if (should_notify_peers) {
2751+
if (should_notify_peers || should_notify_rtnl) {
27562752
if (!rtnl_trylock())
27572753
return;
2758-
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
2754+
2755+
if (should_notify_peers)
2756+
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
2757+
bond->dev);
2758+
if (should_notify_rtnl)
2759+
bond_slave_state_notify(bond);
2760+
27592761
rtnl_unlock();
27602762
}
27612763
}

drivers/net/bonding/bonding.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,19 @@ static inline void bond_slave_state_change(struct bonding *bond)
335335
}
336336
}
337337

338+
static inline void bond_slave_state_notify(struct bonding *bond)
339+
{
340+
struct list_head *iter;
341+
struct slave *tmp;
342+
343+
bond_for_each_slave(bond, tmp, iter) {
344+
if (tmp->should_notify) {
345+
rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_KERNEL);
346+
tmp->should_notify = 0;
347+
}
348+
}
349+
}
350+
338351
static inline int bond_slave_state(struct slave *slave)
339352
{
340353
return slave->backup;

0 commit comments

Comments
 (0)