Skip to content

Commit 8c8ecc9

Browse files
ecsvsimonwunderlich
authored andcommitted
batman-adv: Drop unmanaged ELP metric worker
The ELP worker needs to calculate new metric values for all neighbors "reachable" over an interface. Some of the used metric sources require locks which might need to sleep. This sleep is incompatible with the RCU list iterator used for the recorded neighbors. The initial approach to work around of this problem was to queue another work item per neighbor and then run this in a new context. Even when this solved the RCU vs might_sleep() conflict, it has a major problems: Nothing was stopping the work item in case it is not needed anymore - for example because one of the related interfaces was removed or the batman-adv module was unloaded - resulting in potential invalid memory accesses. Directly canceling the metric worker also has various problems: * cancel_work_sync for a to-be-deactivated interface is called with rtnl_lock held. But the code in the ELP metric worker also tries to use rtnl_lock() - which will never return in this case. This also means that cancel_work_sync would never return because it is waiting for the worker to finish. * iterating over the neighbor list for the to-be-deactivated interface is currently done using the RCU specific methods. Which means that it is possible to miss items when iterating over it without the associated spinlock - a behaviour which is acceptable for a periodic metric check but not for a cleanup routine (which must "stop" all still running workers) The better approch is to get rid of the per interface neighbor metric worker and handle everything in the interface worker. The original problems are solved by: * creating a list of neighbors which require new metric information inside the RCU protected context, gathering the metric according to the new list outside the RCU protected context * only use rcu_trylock inside metric gathering code to avoid a deadlock when the cancel_delayed_work_sync is called in the interface removal code (which is called with the rtnl_lock held) Cc: stable@vger.kernel.org Fixes: c833484 ("batman-adv: ELP - compute the metric based on the estimated throughput") Signed-off-by: Sven Eckelmann <sven@narfation.org> Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
1 parent e7e34ff commit 8c8ecc9

File tree

4 files changed

+48
-30
lines changed

4 files changed

+48
-30
lines changed

net/batman-adv/bat_v.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,6 @@ static void
113113
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
114114
{
115115
ewma_throughput_init(&hardif_neigh->bat_v.throughput);
116-
INIT_WORK(&hardif_neigh->bat_v.metric_work,
117-
batadv_v_elp_throughput_metric_update);
118116
}
119117

120118
/**

net/batman-adv/bat_v_elp.c

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <linux/if_ether.h>
1919
#include <linux/jiffies.h>
2020
#include <linux/kref.h>
21+
#include <linux/list.h>
2122
#include <linux/minmax.h>
2223
#include <linux/netdevice.h>
2324
#include <linux/nl80211.h>
@@ -26,6 +27,7 @@
2627
#include <linux/rcupdate.h>
2728
#include <linux/rtnetlink.h>
2829
#include <linux/skbuff.h>
30+
#include <linux/slab.h>
2931
#include <linux/stddef.h>
3032
#include <linux/string.h>
3133
#include <linux/types.h>
@@ -41,6 +43,18 @@
4143
#include "routing.h"
4244
#include "send.h"
4345

46+
/**
47+
* struct batadv_v_metric_queue_entry - list of hardif neighbors which require
48+
* and metric update
49+
*/
50+
struct batadv_v_metric_queue_entry {
51+
/** @hardif_neigh: hardif neighbor scheduled for metric update */
52+
struct batadv_hardif_neigh_node *hardif_neigh;
53+
54+
/** @list: list node for metric_queue */
55+
struct list_head list;
56+
};
57+
4458
/**
4559
* batadv_v_elp_start_timer() - restart timer for ELP periodic work
4660
* @hard_iface: the interface for which the timer has to be reset
@@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
137151
goto default_throughput;
138152
}
139153

154+
/* only use rtnl_trylock because the elp worker will be cancelled while
155+
* the rntl_lock is held. the cancel_delayed_work_sync() would otherwise
156+
* wait forever when the elp work_item was started and it is then also
157+
* trying to rtnl_lock
158+
*/
159+
if (!rtnl_trylock())
160+
return false;
161+
140162
/* if not a wifi interface, check if this device provides data via
141163
* ethtool (e.g. an Ethernet adapter)
142164
*/
143-
rtnl_lock();
144165
ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
145166
rtnl_unlock();
146167
if (ret == 0) {
@@ -175,31 +196,19 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
175196
/**
176197
* batadv_v_elp_throughput_metric_update() - worker updating the throughput
177198
* metric of a single hop neighbour
178-
* @work: the work queue item
199+
* @neigh: the neighbour to probe
179200
*/
180-
void batadv_v_elp_throughput_metric_update(struct work_struct *work)
201+
static void
202+
batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh)
181203
{
182-
struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
183-
struct batadv_hardif_neigh_node *neigh;
184204
u32 throughput;
185205
bool valid;
186206

187-
neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
188-
metric_work);
189-
neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
190-
bat_v);
191-
192207
valid = batadv_v_elp_get_throughput(neigh, &throughput);
193208
if (!valid)
194-
goto put_neigh;
209+
return;
195210

196211
ewma_throughput_add(&neigh->bat_v.throughput, throughput);
197-
198-
put_neigh:
199-
/* decrement refcounter to balance increment performed before scheduling
200-
* this task
201-
*/
202-
batadv_hardif_neigh_put(neigh);
203212
}
204213

205214
/**
@@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
273282
*/
274283
static void batadv_v_elp_periodic_work(struct work_struct *work)
275284
{
285+
struct batadv_v_metric_queue_entry *metric_entry;
286+
struct batadv_v_metric_queue_entry *metric_safe;
276287
struct batadv_hardif_neigh_node *hardif_neigh;
277288
struct batadv_hard_iface *hard_iface;
278289
struct batadv_hard_iface_bat_v *bat_v;
279290
struct batadv_elp_packet *elp_packet;
291+
struct list_head metric_queue;
280292
struct batadv_priv *bat_priv;
281293
struct sk_buff *skb;
282294
u32 elp_interval;
283-
bool ret;
284295

285296
bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
286297
hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
@@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
316327

317328
atomic_inc(&hard_iface->bat_v.elp_seqno);
318329

330+
INIT_LIST_HEAD(&metric_queue);
331+
319332
/* The throughput metric is updated on each sent packet. This way, if a
320333
* node is dead and no longer sends packets, batman-adv is still able to
321334
* react timely to its death.
@@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
340353

341354
/* Reading the estimated throughput from cfg80211 is a task that
342355
* may sleep and that is not allowed in an rcu protected
343-
* context. Therefore schedule a task for that.
356+
* context. Therefore add it to metric_queue and process it
357+
* outside rcu protected context.
344358
*/
345-
ret = queue_work(batadv_event_workqueue,
346-
&hardif_neigh->bat_v.metric_work);
347-
348-
if (!ret)
359+
metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC);
360+
if (!metric_entry) {
349361
batadv_hardif_neigh_put(hardif_neigh);
362+
continue;
363+
}
364+
365+
metric_entry->hardif_neigh = hardif_neigh;
366+
list_add(&metric_entry->list, &metric_queue);
350367
}
351368
rcu_read_unlock();
352369

370+
list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) {
371+
batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh);
372+
373+
batadv_hardif_neigh_put(metric_entry->hardif_neigh);
374+
list_del(&metric_entry->list);
375+
kfree(metric_entry);
376+
}
377+
353378
restart_timer:
354379
batadv_v_elp_start_timer(hard_iface);
355380
out:

net/batman-adv/bat_v_elp.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include "main.h"
1111

1212
#include <linux/skbuff.h>
13-
#include <linux/workqueue.h>
1413

1514
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
1615
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
@@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
1918
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
2019
int batadv_v_elp_packet_recv(struct sk_buff *skb,
2120
struct batadv_hard_iface *if_incoming);
22-
void batadv_v_elp_throughput_metric_update(struct work_struct *work);
2321

2422
#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */

net/batman-adv/types.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v {
596596
* neighbor
597597
*/
598598
unsigned long last_unicast_tx;
599-
600-
/** @metric_work: work queue callback item for metric update */
601-
struct work_struct metric_work;
602599
};
603600

604601
/**

0 commit comments

Comments
 (0)