Skip to content

Commit 232c001

Browse files
hadarhenzionSaeed Mahameed
authored and
Saeed Mahameed
committed
net/mlx5e: Add support to neighbour update flow
In order to offload TC encap rules, the driver does a lookup for the IP tunnel neighbour according to the output device and the destination IP given by the user. To keep track of the validity state of such neighbours, we keep the neighbours' information (pair of device pointer and destination IP) in a hash table maintained at the relevant egress representor and register to get NETEVENT_NEIGH_UPDATE events. When getting a neighbour update netevent, we search for a match among the cached neighbour entries used for encapsulation. In case the neighbour isn't valid, we can't offload the flow into the HW. We cache the flow (requested matching and actions) in the driver and offload the rule later, when the neighbour is resolved and becomes valid. When a flow is only cached in the driver and not offloaded into HW yet, we use the EAGAIN return value to mark it internally; the TC ndo still returns success. Listen to kernel neighbour update netevents to track the relevant neighbours' validity state: 1. If a neighbour becomes valid, offload the related rules to HW. 2. If the neighbour becomes invalid, remove the related rules from HW. 3. If the neighbour mac address was changed, update the encap header. Remove all the offloaded rules using the old encap header from the HW and insert new rules to HW with updated encap header. Access to the neighbours hash table is protected by RTNL lock of its caller or by the table's spinlock. Details of the locking/synchronization among the different actions applied on the neighbour table: Add/remove operations - protected by RTNL lock of its caller (all TC commands are protected by RTNL lock). Add and remove operations are initiated only when the user inserts/removes a TC rule into/from the driver. Lookup/remove operations - since the lookup operation is done from netevent notifier block, RTNL lock can't be used (atomic context). Use the table's spin lock to protect lookups from TC user removal operation. 
bh is used since netevent can be called from a softirq context. Lookup/add operations - The hash table access functions are taking care of the protection between lookup and add operations. When adding/removing encap headers and rules to/from the HW, RTNL lock is used. It can happen when: 1. The user inserts/removes a TC rule into/from the driver (TC commands are protected by RTNL lock of its caller). 2. The driver gets neighbour notification event, which reports about neighbour validity status change. Before adding/removing encap headers and rules to/from the HW, RTNL lock is taken. A neighbour hash table entry should be freed when its encap list is empty. Since the neighbour update netevent notification schedules a neighbour update work that uses the neighbour hash entry, it can't be freed unconditionally when the encap list becomes empty during TC delete rule flow. Use reference count to protect from freeing neighbour hash table entry while it's still in use. When the user asks to unregister a netdevice used by one of the neighbours, neighbour removal notification is received. Then we take a reference on the neighbour and don't free it until the relevant encap entries (and flows) are marked as invalid (not offloaded) and removed from HW. As long as the encap entry is still valid (checked under RTNL lock) we can safely access the neighbour device saved on mlx5e_neigh struct. Signed-off-by: Hadar Hen Zion <[email protected]> Reviewed-by: Or Gerlitz <[email protected]> Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 37b498f commit 232c001

File tree

5 files changed

+434
-43
lines changed

5 files changed

+434
-43
lines changed

drivers/net/ethernet/mellanox/mlx5/core/en_rep.c

Lines changed: 229 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
#include <linux/mlx5/fs.h>
3535
#include <net/switchdev.h>
3636
#include <net/pkt_cls.h>
37+
#include <net/netevent.h>
38+
#include <net/arp.h>
3739

3840
#include "eswitch.h"
3941
#include "en.h"
@@ -224,6 +226,140 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
224226
mlx5_eswitch_sqs2vport_stop(esw, rep);
225227
}
226228

229+
/* Take one additional reference on the neigh hash entry by incrementing
 * nhe->refcnt. Paired with mlx5e_rep_neigh_entry_release().
 */
static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
230+
{
231+
refcount_inc(&nhe->refcnt);
232+
}
233+
234+
/* Drop one reference on the neigh hash entry. When the reference count
 * reaches zero the entry is freed with kfree(); nothing else is torn down
 * here, so the entry must already be unlinked by whoever drops the last
 * reference.
 */
static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
235+
{
236+
if (refcount_dec_and_test(&nhe->refcnt))
237+
kfree(nhe);
238+
}
239+
240+
/* Bring the flows of one encap entry in line with the neighbour state.
 *
 * @priv:            driver private data passed to the TC encap helpers
 * @e:               encap entry whose flows are removed and/or re-added
 * @neigh_connected: true when the neighbour is currently usable
 * @ha:              current hardware (MAC) address of the neighbour
 *
 * Must be called with RTNL held (checked by ASSERT_RTNL()).
 */
static void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
241+
struct mlx5e_encap_entry *e,
242+
bool neigh_connected,
243+
unsigned char ha[ETH_ALEN])
244+
{
245+
/* The start of the encap header buffer is accessed as an Ethernet header */
struct ethhdr *eth = (struct ethhdr *)e->encap_header;
246+
247+
ASSERT_RTNL();
248+
249+
/* Remove the flows when the neighbour went away while the entry is
 * marked valid, or whenever the cached destination MAC differs from ha.
 */
if ((!neigh_connected && (e->flags & MLX5_ENCAP_ENTRY_VALID)) ||
250+
!ether_addr_equal(e->h_dest, ha))
251+
mlx5e_tc_encap_flows_del(priv, e);
252+
253+
/* NOTE(review): e->flags is re-read here after the delete above;
 * presumably mlx5e_tc_encap_flows_del() clears MLX5_ENCAP_ENTRY_VALID so
 * that a MAC change falls through to the re-add below - confirm.
 */
if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
254+
/* Update both the cached MAC and the one inside the encap header */
ether_addr_copy(e->h_dest, ha);
255+
ether_addr_copy(eth->h_dest, ha);
256+
257+
mlx5e_tc_encap_flows_add(priv, e);
258+
}
259+
}
260+
261+
/* Work handler for nhe->neigh_update_work.
 *
 * Under RTNL, snapshots the hardware address, nud_state and dead flag of
 * the neighbour stored in nhe->n, then calls mlx5e_rep_update_flows() for
 * every encap entry on nhe->encap_list whose validity flag or cached
 * destination MAC disagrees with that snapshot.
 *
 * Before returning it drops one reference on the neigh hash entry and one
 * on the neighbour; mlx5e_rep_netevent_event() takes both before queueing
 * this work.
 */
static void mlx5e_rep_neigh_update(struct work_struct *work)
262+
{
263+
struct mlx5e_neigh_hash_entry *nhe =
264+
container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
265+
struct neighbour *n = nhe->n;
266+
struct mlx5e_encap_entry *e;
267+
unsigned char ha[ETH_ALEN];
268+
struct mlx5e_priv *priv;
269+
bool neigh_connected;
270+
bool encap_connected;
271+
u8 nud_state, dead;
272+
273+
rtnl_lock();
274+
275+
/* If these parameters are changed after we release the lock,
276+
* we'll receive another event letting us know about it.
277+
* We use this lock to avoid inconsistency between the neigh validity
278+
* and its hw address.
279+
*/
280+
read_lock_bh(&n->lock);
281+
memcpy(ha, n->ha, ETH_ALEN);
282+
nud_state = n->nud_state;
283+
dead = n->dead;
284+
read_unlock_bh(&n->lock);
285+
286+
/* Usable only if some NUD_VALID bit is set and the neigh is not dead */
neigh_connected = (nud_state & NUD_VALID) && !dead;
287+
288+
list_for_each_entry(e, &nhe->encap_list, encap_list) {
289+
encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
290+
priv = netdev_priv(e->out_dev);
291+
292+
/* Touch the flows only when validity or the MAC actually changed */
if (encap_connected != neigh_connected ||
293+
!ether_addr_equal(e->h_dest, ha))
294+
mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
295+
}
296+
/* nhe must not be touched after this release: it may free the entry */
mlx5e_rep_neigh_entry_release(nhe);
297+
rtnl_unlock();
298+
neigh_release(n);
299+
}
300+
301+
static struct mlx5e_neigh_hash_entry *
302+
mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
303+
struct mlx5e_neigh *m_neigh);
304+
305+
/* Netevent notifier callback of a representor.
 *
 * Only NETEVENT_NEIGH_UPDATE is handled, in which case @ptr is used as a
 * struct neighbour. Neighbours that belong to neither arp_tbl nor (with
 * CONFIG_IPV6) the IPv6 nd_tbl are ignored. Otherwise the (device, primary
 * key) pair is looked up in the representor's neigh hash table under
 * encap_lock; on a hit the neighbour is recorded in the entry, references
 * are taken on both, and neigh_update_work is queued on priv->wq.
 *
 * Returns NOTIFY_DONE on every path.
 */
static int mlx5e_rep_netevent_event(struct notifier_block *nb,
306+
unsigned long event, void *ptr)
307+
{
308+
struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
309+
neigh_update.netevent_nb);
310+
struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
311+
struct net_device *netdev = rpriv->rep->netdev;
312+
struct mlx5e_priv *priv = netdev_priv(netdev);
313+
struct mlx5e_neigh_hash_entry *nhe = NULL;
314+
struct mlx5e_neigh m_neigh = {};
315+
struct neighbour *n;
316+
317+
switch (event) {
318+
case NETEVENT_NEIGH_UPDATE:
319+
n = ptr;
320+
#if IS_ENABLED(CONFIG_IPV6)
321+
if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl)
322+
#else
323+
if (n->tbl != &arp_tbl)
324+
#endif
325+
return NOTIFY_DONE;
326+
327+
/* Build the hash key: neighbour device + its primary key bytes */
m_neigh.dev = n->dev;
328+
memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
329+
330+
/* We are in atomic context and can't take RTNL mutex, so use
331+
* spin_lock_bh to lookup the neigh table. bh is used since
332+
* netevent can be called from a softirq context.
333+
*/
334+
spin_lock_bh(&neigh_update->encap_lock);
335+
nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
336+
if (!nhe) {
337+
spin_unlock_bh(&neigh_update->encap_lock);
338+
return NOTIFY_DONE;
339+
}
340+
341+
/* This assignment is valid as long as the neigh reference
342+
* is taken
343+
*/
344+
nhe->n = n;
345+
346+
/* Take a reference to ensure the neighbour and mlx5 encap
347+
* entry won't be destructed until we drop the reference in
348+
* delayed work.
349+
*/
350+
neigh_hold(n);
351+
mlx5e_rep_neigh_entry_hold(nhe);
352+
353+
/* queue_work() returned false: this call did not queue the work, so
 * give back the two references taken just above.
 */
if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
354+
mlx5e_rep_neigh_entry_release(nhe);
355+
neigh_release(n);
356+
}
357+
spin_unlock_bh(&neigh_update->encap_lock);
358+
break;
359+
}
360+
return NOTIFY_DONE;
361+
}
362+
227363
static const struct rhashtable_params mlx5e_neigh_ht_params = {
228364
.head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
229365
.key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
@@ -234,14 +370,34 @@ static const struct rhashtable_params mlx5e_neigh_ht_params = {
234370
/* Set up the representor's neighbour-update state: the neigh hash table,
 * the neigh list, encap_lock, and the netevent notifier whose callback is
 * mlx5e_rep_netevent_event().
 *
 * Returns 0 on success, or the error from rhashtable_init() or
 * register_netevent_notifier(). If registering the notifier fails, the
 * hash table created here is destroyed before returning.
 *
 * The line under the "239-" marker is the removed side of the diff.
 */
static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
235371
{
236372
struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
373+
int err;
374+
375+
err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
376+
if (err)
377+
return err;
237378

238379
INIT_LIST_HEAD(&neigh_update->neigh_list);
239-
return rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
380+
spin_lock_init(&neigh_update->encap_lock);
381+
382+
rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
383+
err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
384+
if (err)
385+
goto out_err;
386+
return 0;
387+
388+
out_err:
389+
rhashtable_destroy(&neigh_update->neigh_ht);
390+
return err;
240391
}
241392

242393
/* Tear down the representor's neighbour-update state in this order:
 * unregister the netevent notifier, flush priv->wq, destroy the neigh
 * hash table.
 */
static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
243394
{
244395
struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
396+
struct mlx5e_priv *priv = netdev_priv(rpriv->rep->netdev);
397+
398+
/* Unregister first; presumably so no new neigh update work gets queued
 * before the flush below - confirm.
 */
unregister_netevent_notifier(&neigh_update->netevent_nb);
399+
400+
flush_workqueue(priv->wq); /* flush neigh update works */
245401

246402
rhashtable_destroy(&neigh_update->neigh_ht);
247403
}
@@ -268,13 +424,19 @@ static void mlx5e_rep_neigh_entry_remove(struct mlx5e_priv *priv,
268424
{
269425
struct mlx5e_rep_priv *rpriv = priv->ppriv;
270426

427+
spin_lock_bh(&rpriv->neigh_update.encap_lock);
428+
271429
list_del(&nhe->neigh_list);
272430

273431
rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
274432
&nhe->rhash_node,
275433
mlx5e_neigh_ht_params);
434+
spin_unlock_bh(&rpriv->neigh_update.encap_lock);
276435
}
277436

437+
/* This function must only be called under RTNL lock or under the
438+
* representor's encap_lock in case RTNL mutex can't be held.
439+
*/
278440
static struct mlx5e_neigh_hash_entry *
279441
mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
280442
struct mlx5e_neigh *m_neigh)
@@ -286,6 +448,72 @@ mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
286448
mlx5e_neigh_ht_params);
287449
}
288450

451+
/* Allocate a neigh hash entry for the neighbour described by e->m_neigh
 * and insert it via mlx5e_rep_neigh_entry_insert().
 *
 * @priv: driver private data handed to the insert helper
 * @e:    encap entry whose m_neigh is copied in as the new entry's key
 * @nhe:  output; set to the new entry on success
 *
 * The entry is zero-allocated (GFP_KERNEL, so the caller must be able to
 * sleep), its update work and encap list are initialised and its reference
 * count starts at 1.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * the insert helper.
 * NOTE(review): on insert failure the entry is freed but *nhe still holds
 * the stale pointer; callers must not use *nhe when an error is returned.
 */
static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
452+
struct mlx5e_encap_entry *e,
453+
struct mlx5e_neigh_hash_entry **nhe)
454+
{
455+
int err;
456+
457+
*nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
458+
if (!*nhe)
459+
return -ENOMEM;
460+
461+
memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
462+
INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
463+
INIT_LIST_HEAD(&(*nhe)->encap_list);
464+
refcount_set(&(*nhe)->refcnt, 1);
465+
466+
err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
467+
if (err)
468+
goto out_free;
469+
return 0;
470+
471+
out_free:
472+
kfree(*nhe);
473+
return err;
474+
}
475+
476+
/* Unlink the neigh hash entry via mlx5e_rep_neigh_entry_remove() and drop
 * one reference on it. The memory is freed only once the last reference
 * is gone, which may be later if a queued neigh update work still holds
 * one.
 */
static void mlx5e_rep_neigh_entry_destroy(struct mlx5e_priv *priv,
477+
struct mlx5e_neigh_hash_entry *nhe)
478+
{
479+
/* The neigh hash entry must be removed from the hash table regardless
480+
* of the reference count value, so it won't be found by the next
481+
* neigh notification call. The neigh hash entry reference count is
482+
* incremented only during creation and neigh notification calls and
483+
* protects from freeing the nhe struct.
484+
*/
485+
mlx5e_rep_neigh_entry_remove(priv, nhe);
486+
mlx5e_rep_neigh_entry_release(nhe);
487+
}
488+
489+
/* Link an encap entry to the neigh hash entry that matches e->m_neigh,
 * creating that hash entry first when the lookup finds none.
 *
 * Returns 0 on success or the error from mlx5e_rep_neigh_entry_create();
 * on error the encap entry is not linked anywhere.
 *
 * No lock is taken here. NOTE(review): the lookup helper is documented as
 * requiring RTNL or encap_lock, so the caller is presumably holding RTNL -
 * confirm against the callers.
 */
int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
490+
struct mlx5e_encap_entry *e)
491+
{
492+
struct mlx5e_neigh_hash_entry *nhe;
493+
int err;
494+
495+
nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
496+
if (!nhe) {
497+
err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
498+
if (err)
499+
return err;
500+
}
501+
list_add(&e->encap_list, &nhe->encap_list);
502+
return 0;
503+
}
504+
505+
/* Unlink an encap entry from its neigh hash entry and, if that leaves the
 * hash entry's encap list empty, destroy the hash entry via
 * mlx5e_rep_neigh_entry_destroy().
 *
 * NOTE(review): the lookup result is dereferenced without a NULL check, so
 * this relies on an earlier successful mlx5e_rep_encap_entry_attach() for
 * the same entry - confirm that every caller guarantees it.
 */
void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
506+
struct mlx5e_encap_entry *e)
507+
{
508+
struct mlx5e_neigh_hash_entry *nhe;
509+
510+
list_del(&e->encap_list);
511+
nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
512+
513+
if (list_empty(&nhe->encap_list))
514+
mlx5e_rep_neigh_entry_destroy(priv, nhe);
515+
}
516+
289517
static int mlx5e_rep_open(struct net_device *dev)
290518
{
291519
struct mlx5e_priv *priv = netdev_priv(dev);

drivers/net/ethernet/mellanox/mlx5/core/en_rep.h

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ struct mlx5e_neigh_update_table {
4545
* Used for stats query.
4646
*/
4747
struct list_head neigh_list;
48+
/* protect lookup/remove operations */
49+
spinlock_t encap_lock;
50+
struct notifier_block netevent_nb;
4851
};
4952

5053
struct mlx5e_rep_priv {
@@ -69,18 +72,46 @@ struct mlx5e_neigh_hash_entry {
6972
* neighbour entries. Used for stats query.
7073
*/
7174
struct list_head neigh_list;
75+
76+
/* encap list sharing the same neigh */
77+
struct list_head encap_list;
78+
79+
/* valid only when the neigh reference is taken during
80+
* neigh_update_work workqueue callback.
81+
*/
82+
struct neighbour *n;
83+
struct work_struct neigh_update_work;
84+
85+
/* neigh hash entry can be deleted only when the refcount is zero.
86+
* refcount is needed to avoid neigh hash entry removal by TC, while
87+
* it's used by the neigh notification call.
88+
*/
89+
refcount_t refcnt;
90+
};
91+
92+
/* Bit flags tested against the flags field of struct mlx5e_encap_entry */
enum {
93+
/* set when the encap entry is successfully offloaded into HW */
94+
MLX5_ENCAP_ENTRY_VALID = BIT(0),
7295
};
7396

7497
/* One encapsulation entry. It is linked on the encap list of the neigh
 * hash entry that matches m_neigh. The line under the "78-" marker is the
 * removed side of the diff.
 */
struct mlx5e_encap_entry {
98+
/* neigh hash entry list of encaps sharing the same neigh */
99+
struct list_head encap_list;
100+
/* key used to look up the matching neigh hash entry */
struct mlx5e_neigh m_neigh;
101+
/* a node of the eswitch encap hash table which keeps all the encap
102+
* entries
103+
*/
75104
struct hlist_node encap_hlist;
76105
struct list_head flows;
77106
u32 encap_id;
78-
struct neighbour *n;
79107
struct ip_tunnel_info tun_info;
80108
unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
81109

82110
struct net_device *out_dev;
83111
int tunnel_type;
112+
/* MLX5_ENCAP_ENTRY_* bits */
u8 flags;
113+
/* encap header buffer; its start is accessed as a struct ethhdr */
char *encap_header;
114+
/* presumably the length of encap_header in bytes - TODO confirm */
int encap_size;
84115
};
85116

86117
void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
@@ -95,4 +126,9 @@ bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id);
95126
int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr);
96127
void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
97128

129+
int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
130+
struct mlx5e_encap_entry *e);
131+
void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
132+
struct mlx5e_encap_entry *e);
133+
98134
#endif /* __MLX5E_REP_H__ */

0 commit comments

Comments
 (0)