Skip to content

Commit 82a678e

Browse files
committed
Merge branch 'bridge-mdb-events'
Tobias Waldekranz says: ==================== net: bridge: switchdev: Ensure MDB events are delivered exactly once When a device is attached to a bridge, drivers will request a replay of objects that were created before the device joined the bridge, that are still of interest to the joining port. Typical examples include FDB entries and MDB memberships on other ports ("foreign interfaces") or on the bridge itself. Conversely when a device is detached, the bridge will synthesize deletion events for all those objects that are still live, but no longer applicable to the device in question. This series eliminates two races related to the synching and unsynching phases of a bridge's MDB with a joining or leaving device, that would cause notifications of such objects to be either delivered twice (1/2), or not at all (2/2). A similar race to the one solved by 1/2 still remains for the FDB. This is much harder to solve, due to the lockless operation of the FDB's rhashtable, and is therefore knowingly left out of this series. v1 -> v2: - Squash the previously separate addition of switchdev_port_obj_act_is_deferred into first consumer. - Use ether_addr_equal to compare MAC addresses. - Document switchdev_port_obj_act_is_deferred (renamed from switchdev_port_obj_is_deferred in v1, to indicate that we also match on the action). - Delay allocations of MDB objects until we know they're needed. - Use non-RCU version of the hash list iterator, now that the MDB is not scanned while holding the RCU read lock. - Add Fixes tag to commit message v2 -> v3: - Fix unlocking in error paths - Access RCU protected port list via mlock_dereference, since MDB is guaranteed to remain constant for the duration of the scan. v3 -> v4: - Limit the search for existing deferred events in 1/2 to only apply to additions, since the problem does not exist in the deletion case. - Add 2/2, to plug a related race when unoffloading an indirectly associated device. 
v4 -> v5: - Fix grammatical errors in kerneldoc of switchdev_port_obj_act_is_deferred ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents b4ea9b6 + f7a70d6 commit 82a678e

File tree

3 files changed

+132
-28
lines changed

3 files changed

+132
-28
lines changed

include/net/switchdev.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,9 @@ void switchdev_deferred_process(void);
308308
int switchdev_port_attr_set(struct net_device *dev,
309309
const struct switchdev_attr *attr,
310310
struct netlink_ext_ack *extack);
311+
bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
312+
enum switchdev_notifier_type nt,
313+
const struct switchdev_obj *obj);
311314
int switchdev_port_obj_add(struct net_device *dev,
312315
const struct switchdev_obj *obj,
313316
struct netlink_ext_ack *extack);

net/bridge/br_switchdev.c

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -595,21 +595,40 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
595595
}
596596

597597
static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
598+
struct net_device *dev,
599+
unsigned long action,
598600
enum switchdev_obj_id id,
599601
const struct net_bridge_mdb_entry *mp,
600602
struct net_device *orig_dev)
601603
{
602-
struct switchdev_obj_port_mdb *mdb;
604+
struct switchdev_obj_port_mdb mdb = {
605+
.obj = {
606+
.id = id,
607+
.orig_dev = orig_dev,
608+
},
609+
};
610+
struct switchdev_obj_port_mdb *pmdb;
603611

604-
mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
605-
if (!mdb)
606-
return -ENOMEM;
612+
br_switchdev_mdb_populate(&mdb, mp);
613+
614+
if (action == SWITCHDEV_PORT_OBJ_ADD &&
615+
switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
616+
/* This event is already in the deferred queue of
617+
* events, so this replay must be elided, lest the
618+
* driver receive duplicate events for it. This can
619+
* only happen when replaying additions, since
620+
* modifications are always immediately visible in
621+
* br->mdb_list, whereas actual event delivery may be
622+
* delayed.
623+
*/
624+
return 0;
625+
}
607626

608-
mdb->obj.id = id;
609-
mdb->obj.orig_dev = orig_dev;
610-
br_switchdev_mdb_populate(mdb, mp);
611-
list_add_tail(&mdb->obj.list, mdb_list);
627+
pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
628+
if (!pmdb)
629+
return -ENOMEM;
612630

631+
list_add_tail(&pmdb->obj.list, mdb_list);
613632
return 0;
614633
}
615634

@@ -677,51 +696,50 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
677696
if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
678697
return 0;
679698

680-
/* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
681-
* because the write-side protection is br->multicast_lock. But we
682-
* need to emulate the [ blocking ] calling context of a regular
683-
* switchdev event, so since both br->multicast_lock and RCU read side
684-
* critical sections are atomic, we have no choice but to pick the RCU
685-
* read side lock, queue up all our events, leave the critical section
686-
* and notify switchdev from blocking context.
699+
if (adding)
700+
action = SWITCHDEV_PORT_OBJ_ADD;
701+
else
702+
action = SWITCHDEV_PORT_OBJ_DEL;
703+
704+
/* br_switchdev_mdb_queue_one() will take care to not queue a
705+
* replay of an event that is already pending in the switchdev
706+
* deferred queue. In order to safely determine that, there
707+
* must be no new deferred MDB notifications enqueued for the
708+
* duration of the MDB scan. Therefore, grab the write-side
709+
* lock to avoid racing with any concurrent IGMP/MLD snooping.
687710
*/
688-
rcu_read_lock();
711+
spin_lock_bh(&br->multicast_lock);
689712

690-
hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
713+
hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
691714
struct net_bridge_port_group __rcu * const *pp;
692715
const struct net_bridge_port_group *p;
693716

694717
if (mp->host_joined) {
695-
err = br_switchdev_mdb_queue_one(&mdb_list,
718+
err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
696719
SWITCHDEV_OBJ_ID_HOST_MDB,
697720
mp, br_dev);
698721
if (err) {
699-
rcu_read_unlock();
722+
spin_unlock_bh(&br->multicast_lock);
700723
goto out_free_mdb;
701724
}
702725
}
703726

704-
for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
727+
for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
705728
pp = &p->next) {
706729
if (p->key.port->dev != dev)
707730
continue;
708731

709-
err = br_switchdev_mdb_queue_one(&mdb_list,
732+
err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
710733
SWITCHDEV_OBJ_ID_PORT_MDB,
711734
mp, dev);
712735
if (err) {
713-
rcu_read_unlock();
736+
spin_unlock_bh(&br->multicast_lock);
714737
goto out_free_mdb;
715738
}
716739
}
717740
}
718741

719-
rcu_read_unlock();
720-
721-
if (adding)
722-
action = SWITCHDEV_PORT_OBJ_ADD;
723-
else
724-
action = SWITCHDEV_PORT_OBJ_DEL;
742+
spin_unlock_bh(&br->multicast_lock);
725743

726744
list_for_each_entry(obj, &mdb_list, list) {
727745
err = br_switchdev_mdb_replay_one(nb, dev,
@@ -786,6 +804,16 @@ static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
786804
br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
787805

788806
br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
807+
808+
/* Make sure that the device leaving this bridge has seen all
809+
* relevant events before it is disassociated. In the normal
810+
* case, when the device is directly attached to the bridge,
811+
* this is covered by del_nbp(). If the association was indirect
812+
* however, e.g. via a team or bond, and the device is leaving
813+
* that intermediate device, then the bridge port remains in
814+
* place.
815+
*/
816+
switchdev_deferred_process();
789817
}
790818

791819
/* Let the bridge know that this port is offloaded, so that it can assign a

net/switchdev/switchdev.c

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,35 @@
1919
#include <linux/rtnetlink.h>
2020
#include <net/switchdev.h>
2121

22+
static bool switchdev_obj_eq(const struct switchdev_obj *a,
23+
const struct switchdev_obj *b)
24+
{
25+
const struct switchdev_obj_port_vlan *va, *vb;
26+
const struct switchdev_obj_port_mdb *ma, *mb;
27+
28+
if (a->id != b->id || a->orig_dev != b->orig_dev)
29+
return false;
30+
31+
switch (a->id) {
32+
case SWITCHDEV_OBJ_ID_PORT_VLAN:
33+
va = SWITCHDEV_OBJ_PORT_VLAN(a);
34+
vb = SWITCHDEV_OBJ_PORT_VLAN(b);
35+
return va->flags == vb->flags &&
36+
va->vid == vb->vid &&
37+
va->changed == vb->changed;
38+
case SWITCHDEV_OBJ_ID_PORT_MDB:
39+
case SWITCHDEV_OBJ_ID_HOST_MDB:
40+
ma = SWITCHDEV_OBJ_PORT_MDB(a);
41+
mb = SWITCHDEV_OBJ_PORT_MDB(b);
42+
return ma->vid == mb->vid &&
43+
ether_addr_equal(ma->addr, mb->addr);
44+
default:
45+
break;
46+
}
47+
48+
BUG();
49+
}
50+
2251
static LIST_HEAD(deferred);
2352
static DEFINE_SPINLOCK(deferred_lock);
2453

@@ -307,6 +336,50 @@ int switchdev_port_obj_del(struct net_device *dev,
307336
}
308337
EXPORT_SYMBOL_GPL(switchdev_port_obj_del);
309338

339+
/**
340+
* switchdev_port_obj_act_is_deferred - Is object action pending?
341+
*
342+
* @dev: port device
343+
* @nt: type of action; add or delete
344+
* @obj: object to test
345+
*
346+
* Returns true if a deferred item is pending, which is
347+
* equivalent to the action @nt on an object @obj.
348+
*
349+
* rtnl_lock must be held.
350+
*/
351+
bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
352+
enum switchdev_notifier_type nt,
353+
const struct switchdev_obj *obj)
354+
{
355+
struct switchdev_deferred_item *dfitem;
356+
bool found = false;
357+
358+
ASSERT_RTNL();
359+
360+
spin_lock_bh(&deferred_lock);
361+
362+
list_for_each_entry(dfitem, &deferred, list) {
363+
if (dfitem->dev != dev)
364+
continue;
365+
366+
if ((dfitem->func == switchdev_port_obj_add_deferred &&
367+
nt == SWITCHDEV_PORT_OBJ_ADD) ||
368+
(dfitem->func == switchdev_port_obj_del_deferred &&
369+
nt == SWITCHDEV_PORT_OBJ_DEL)) {
370+
if (switchdev_obj_eq((const void *)dfitem->data, obj)) {
371+
found = true;
372+
break;
373+
}
374+
}
375+
}
376+
377+
spin_unlock_bh(&deferred_lock);
378+
379+
return found;
380+
}
381+
EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred);
382+
310383
static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
311384
static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
312385

0 commit comments

Comments
 (0)