Commit b56d48c

allowedips: remove nodes in O(1)
Previously, deleting peers would require traversing the entire trie in order to rebalance nodes and safely free them. This meant that removing 1000 peers from a trie with a half million nodes would take an extremely long time, during which we're holding the rtnl lock. Large-scale users were reporting 200ms latencies added to the networking stack as a whole every time their userspace software would queue up significant removals. That's a serious situation.

This commit fixes that by maintaining a double pointer to the parent's bit pointer for each node, and then using the already existing node list belonging to each peer to go directly to the node, fix up its pointers, and free it with RCU. This means removal is O(1) instead of O(n), and we don't use gobs of stack.

The removal algorithm has the same downside as the code that it fixes: it won't collapse needlessly long runs of fillers. We can enhance that in the future if it ever becomes a problem; the limitation is documented with a TODO comment in the code.

Currently the biggest flaw, which the next commit addresses, is that this increases the node size on 64-bit machines from 60 bytes to 68 bytes. 60 rounds up to 64, but 68 rounds up to 128, so we wind up using twice as much memory per node because of power-of-two allocations, which is a big bummer. We'll need to figure something out there.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Parent: 3c14c4b
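For intuition, the following is a minimal userspace sketch of the double-pointer technique the message describes. It is not the module's code: RCU, locking, and the real trie layout are stripped out, and the struct and helper names (node, node_new, node_remove) are hypothetical.

#include <stdio.h>
#include <stdlib.h>

/* Each node records the address of the pointer that refers to it
 * (the parent's bit[] slot, or the trie's root pointer), so it can
 * be unlinked without walking the trie.
 */
struct node {
	struct node *bit[2];
	struct node **parent_bit;
};

static struct node *node_new(struct node **slot)
{
	struct node *n = calloc(1, sizeof(*n));

	if (!n)
		abort();
	n->parent_bit = slot;	/* remember which pointer refers to us */
	*slot = n;
	return n;
}

/* O(1) removal: a node with two children must stay behind as a filler,
 * but a node with at most one child is spliced out through parent_bit.
 * The kernel defers the free with kfree_rcu() instead of freeing directly.
 */
static void node_remove(struct node *n)
{
	struct node *child;

	if (n->bit[0] && n->bit[1])
		return;
	child = n->bit[!n->bit[0]];	/* the lone child, or NULL */
	if (child)
		child->parent_bit = n->parent_bit;
	*n->parent_bit = child;	/* parent now points past this node */
	free(n);
}

int main(void)
{
	struct node *root = NULL;
	struct node *a = node_new(&root);
	struct node *b = node_new(&a->bit[0]);
	struct node *c = node_new(&b->bit[1]);

	node_remove(b);	/* splices c directly into a->bit[0], no walk */
	printf("%s\n", a->bit[0] == c ? "c took b's place" : "bug");
	node_remove(c);
	node_remove(a);
	return 0;
}

The trade-off is one extra pointer per node in exchange for a removal that touches only the node, its lone child, and the parent's slot, instead of the 128-entry traversal stack the old walk_remove_by_peer() needed.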

2 files changed: +57 −84

src/allowedips.c

Lines changed: 54 additions & 78 deletions
@@ -71,60 +71,6 @@ static void root_remove_peer_lists(struct allowedips_node *root)
 	}
 }
 
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
-				struct wg_peer *peer, struct mutex *lock)
-{
-#define REF(p) rcu_access_pointer(p)
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
-#define PUSH(p) ({ \
-		WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
-		stack[len++] = p; \
-	})
-
-	struct allowedips_node __rcu **stack[128], **nptr;
-	struct allowedips_node *node, *prev;
-	unsigned int len;
-
-	if (unlikely(!peer || !REF(*top)))
-		return;
-
-	for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
-		nptr = stack[len - 1];
-		node = DEREF(nptr);
-		if (!node) {
-			--len;
-			continue;
-		}
-		if (!prev || REF(prev->bit[0]) == node ||
-		    REF(prev->bit[1]) == node) {
-			if (REF(node->bit[0]))
-				PUSH(&node->bit[0]);
-			else if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else if (REF(node->bit[0]) == prev) {
-			if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else {
-			if (rcu_dereference_protected(node->peer,
-				lockdep_is_held(lock)) == peer) {
-				RCU_INIT_POINTER(node->peer, NULL);
-				list_del_init(&node->peer_list);
-				if (!node->bit[0] || !node->bit[1]) {
-					rcu_assign_pointer(*nptr, DEREF(
-					       &node->bit[!REF(node->bit[0])]));
-					call_rcu(&node->rcu, node_free_rcu);
-					node = DEREF(nptr);
-				}
-			}
-			--len;
-		}
-	}
-
-#undef REF
-#undef DEREF
-#undef PUSH
-}
-
 static unsigned int fls128(u64 a, u64 b)
 {
 	return a ? fls64(a) + 64U : fls64(b);
@@ -229,6 +175,7 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
 		RCU_INIT_POINTER(node->peer, peer);
 		list_add_tail(&node->peer_list, &peer->allowedips_list);
 		copy_and_assign_cidr(node, key, cidr, bits);
+		rcu_assign_pointer(node->parent_bit, trie);
 		rcu_assign_pointer(*trie, node);
 		return 0;
 	}
@@ -248,9 +195,9 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
 	if (!node) {
 		down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
 	} else {
-		down = rcu_dereference_protected(CHOOSE_NODE(node, key),
-						 lockdep_is_held(lock));
+		down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
 		if (!down) {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
 			rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
 			return 0;
 		}
@@ -259,29 +206,37 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
 	parent = node;
 
 	if (newnode->cidr == cidr) {
+		rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
 		rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
-		if (!parent)
+		if (!parent) {
+			rcu_assign_pointer(newnode->parent_bit, trie);
 			rcu_assign_pointer(*trie, newnode);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
-					   newnode);
-	} else {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (unlikely(!node)) {
-			list_del(&newnode->peer_list);
-			kfree(newnode);
-			return -ENOMEM;
+		} else {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
 		}
-		INIT_LIST_HEAD(&node->peer_list);
-		copy_and_assign_cidr(node, newnode->bits, cidr, bits);
-
-		rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
-		rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
-		if (!parent)
-			rcu_assign_pointer(*trie, node);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
-					   node);
+		return 0;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (unlikely(!node)) {
+		list_del(&newnode->peer_list);
+		kfree(newnode);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&node->peer_list);
+	copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+
+	rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+	rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+	if (!parent) {
+		rcu_assign_pointer(node->parent_bit, trie);
+		rcu_assign_pointer(*trie, node);
+	} else {
+		rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+		rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
 	}
 	return 0;
 }
@@ -340,9 +295,30 @@ int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip,
 void wg_allowedips_remove_by_peer(struct allowedips *table,
 				  struct wg_peer *peer, struct mutex *lock)
 {
+	struct allowedips_node *node, *child, *tmp;
+
+	if (list_empty(&peer->allowedips_list))
+		return;
 	++table->seq;
-	walk_remove_by_peer(&table->root4, peer, lock);
-	walk_remove_by_peer(&table->root6, peer, lock);
+	list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
+		list_del_init(&node->peer_list);
+		RCU_INIT_POINTER(node->peer, NULL);
+		if (node->bit[0] && node->bit[1])
+			continue;
+		child = rcu_dereference_protected(
+				node->bit[!rcu_access_pointer(node->bit[0])],
+				lockdep_is_held(lock));
+		if (child)
+			child->parent_bit = node->parent_bit;
+		*rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+		kfree_rcu(node, rcu);
+
+		/* TODO: Note that we currently don't walk up and down in order to
+		 * free any potential filler nodes. This means that this function
+		 * doesn't free up as much as it could, which could be revisited
+		 * at some point.
+		 */
+	}
 }
 
 int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)

src/allowedips.h

Lines changed: 3 additions & 6 deletions
@@ -15,14 +15,11 @@ struct wg_peer;
 struct allowedips_node {
 	struct wg_peer __rcu *peer;
 	struct allowedips_node __rcu *bit[2];
-	/* While it may seem scandalous that we waste space for v4,
-	 * we're alloc'ing to the nearest power of 2 anyway, so this
-	 * doesn't actually make a difference.
-	 */
-	u8 bits[16] __aligned(__alignof(u64));
 	u8 cidr, bit_at_a, bit_at_b, bitlen;
+	u8 bits[16] __aligned(__alignof(u64));
 
-	/* Keep rarely used list at bottom to be beyond cache line. */
+	/* Keep rarely used members at bottom to be beyond cache line. */
+	struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
 	union {
 		struct list_head peer_list;
 		struct rcu_head rcu;
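The XXX note above is the size regression called out in the commit message: allocation sizes get rounded up, so the one extra 8-byte pointer doubles per-node memory. A toy model of the power-of-two rounding follows; bucket() is a hypothetical helper for illustration, not any real allocator API.

#include <stdio.h>

/* Round an object size up to the next power-of-two allocation size,
 * modeling the rounding behavior the commit message refers to.
 */
static size_t bucket(size_t n)
{
	size_t b = 8;

	while (b < n)
		b <<= 1;
	return b;
}

int main(void)
{
	printf("60-byte node -> %zu-byte allocation\n", bucket(60)); /* 64 */
	printf("68-byte node -> %zu-byte allocation\n", bucket(68)); /* 128 */
	return 0;
}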
