Skip to content

Commit 3e91b0e

Browse files
committed
Merge tag 'nf-23-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
Pablo Neira Ayuso says: ==================== Netfilter fixes for net The existing attempt to resolve races between control plane and GC work is error prone, as reported by Bien Pham <phamnnb@sea.com>, some places forgot to call nft_set_elem_mark_busy(), leading to double-deactivation of elements. This series contains the following patches: 1) Do not skip expired elements during walk otherwise elements might never decrement the reference counter on data, leading to memleak. 2) Add a GC transaction API to replace the former attempt to deal with races between control plane and GC. GC worker sets on NFT_SET_ELEM_DEAD_BIT on elements and it creates a GC transaction to remove the expired elements, GC transaction could abort in case of interference with control plane and retried later (GC async). Set backends such as rbtree and pipapo also perform GC from control plane (GC sync), in such case, element deactivation and removal is safe because mutex is held then collected elements are released via call_rcu(). 3) Adapt existing set backends to use the GC transaction API. 4) Update rhash set backend to set on _DEAD bit to report deleted elements from datapath for GC. 5) Remove old GC batch API and the NFT_SET_ELEM_BUSY_BIT. * tag 'nf-23-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: netfilter: nf_tables: remove busy mark and gc batch API netfilter: nft_set_hash: mark set element as dead when deleting from packet path netfilter: nf_tables: adapt set backend to use GC transaction API netfilter: nf_tables: GC transaction API to avoid race with control plane netfilter: nf_tables: don't skip expired elements during walk ==================== Link: https://lore.kernel.org/r/20230810070830.24064-1-pablo@netfilter.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 62d02fc + a2dd023 commit 3e91b0e

File tree

5 files changed

+476
-248
lines changed

5 files changed

+476
-248
lines changed

include/net/netfilter/nf_tables.h

Lines changed: 45 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,7 @@ struct nft_set_elem_expr {
512512
*
513513
* @list: table set list node
514514
* @bindings: list of set bindings
515+
* @refs: internal refcounting for async set destruction
515516
* @table: table this set belongs to
516517
* @net: netnamespace this set belongs to
517518
* @name: name of the set
@@ -541,6 +542,7 @@ struct nft_set_elem_expr {
541542
struct nft_set {
542543
struct list_head list;
543544
struct list_head bindings;
545+
refcount_t refs;
544546
struct nft_table *table;
545547
possible_net_t net;
546548
char *name;
@@ -562,7 +564,8 @@ struct nft_set {
562564
struct list_head pending_update;
563565
/* runtime data below here */
564566
const struct nft_set_ops *ops ____cacheline_aligned;
565-
u16 flags:14,
567+
u16 flags:13,
568+
dead:1,
566569
genmask:2;
567570
u8 klen;
568571
u8 dlen;
@@ -596,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
596599

597600
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
598601
const struct nft_set *set);
599-
void *nft_set_catchall_gc(const struct nft_set *set);
600602

601603
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
602604
{
@@ -813,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
813815
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
814816
const struct nft_set *set, void *elem);
815817

816-
/**
817-
* struct nft_set_gc_batch_head - nf_tables set garbage collection batch
818-
*
819-
* @rcu: rcu head
820-
* @set: set the elements belong to
821-
* @cnt: count of elements
822-
*/
823-
struct nft_set_gc_batch_head {
824-
struct rcu_head rcu;
825-
const struct nft_set *set;
826-
unsigned int cnt;
827-
};
828-
829-
#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \
830-
sizeof(struct nft_set_gc_batch_head)) / \
831-
sizeof(void *))
832-
833-
/**
834-
* struct nft_set_gc_batch - nf_tables set garbage collection batch
835-
*
836-
* @head: GC batch head
837-
* @elems: garbage collection elements
838-
*/
839-
struct nft_set_gc_batch {
840-
struct nft_set_gc_batch_head head;
841-
void *elems[NFT_SET_GC_BATCH_SIZE];
842-
};
843-
844-
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
845-
gfp_t gfp);
846-
void nft_set_gc_batch_release(struct rcu_head *rcu);
847-
848-
static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
849-
{
850-
if (gcb != NULL)
851-
call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
852-
}
853-
854-
static inline struct nft_set_gc_batch *
855-
nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
856-
gfp_t gfp)
857-
{
858-
if (gcb != NULL) {
859-
if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
860-
return gcb;
861-
nft_set_gc_batch_complete(gcb);
862-
}
863-
return nft_set_gc_batch_alloc(set, gfp);
864-
}
865-
866-
static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
867-
void *elem)
868-
{
869-
gcb->elems[gcb->head.cnt++] = elem;
870-
}
871-
872818
struct nft_expr_ops;
873819
/**
874820
* struct nft_expr_type - nf_tables expression type
@@ -1557,39 +1503,30 @@ static inline void nft_set_elem_change_active(const struct net *net,
15571503

15581504
#endif /* IS_ENABLED(CONFIG_NF_TABLES) */
15591505

1560-
/*
1561-
* We use a free bit in the genmask field to indicate the element
1562-
* is busy, meaning it is currently being processed either by
1563-
* the netlink API or GC.
1564-
*
1565-
* Even though the genmask is only a single byte wide, this works
1566-
* because the extension structure if fully constant once initialized,
1567-
* so there are no non-atomic write accesses unless it is already
1568-
* marked busy.
1569-
*/
1570-
#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
1506+
#define NFT_SET_ELEM_DEAD_MASK (1 << 2)
15711507

15721508
#if defined(__LITTLE_ENDIAN_BITFIELD)
1573-
#define NFT_SET_ELEM_BUSY_BIT 2
1509+
#define NFT_SET_ELEM_DEAD_BIT 2
15741510
#elif defined(__BIG_ENDIAN_BITFIELD)
1575-
#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
1511+
#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
15761512
#else
15771513
#error
15781514
#endif
15791515

1580-
static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext)
1516+
static inline void nft_set_elem_dead(struct nft_set_ext *ext)
15811517
{
15821518
unsigned long *word = (unsigned long *)ext;
15831519

15841520
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
1585-
return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word);
1521+
set_bit(NFT_SET_ELEM_DEAD_BIT, word);
15861522
}
15871523

1588-
static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
1524+
static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
15891525
{
15901526
unsigned long *word = (unsigned long *)ext;
15911527

1592-
clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
1528+
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
1529+
return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
15931530
}
15941531

15951532
/**
@@ -1732,6 +1669,38 @@ struct nft_trans_flowtable {
17321669
#define nft_trans_flowtable_flags(trans) \
17331670
(((struct nft_trans_flowtable *)trans->data)->flags)
17341671

1672+
#define NFT_TRANS_GC_BATCHCOUNT 256
1673+
1674+
struct nft_trans_gc {
1675+
struct list_head list;
1676+
struct net *net;
1677+
struct nft_set *set;
1678+
u32 seq;
1679+
u8 count;
1680+
void *priv[NFT_TRANS_GC_BATCHCOUNT];
1681+
struct rcu_head rcu;
1682+
};
1683+
1684+
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
1685+
unsigned int gc_seq, gfp_t gfp);
1686+
void nft_trans_gc_destroy(struct nft_trans_gc *trans);
1687+
1688+
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
1689+
unsigned int gc_seq, gfp_t gfp);
1690+
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
1691+
1692+
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
1693+
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
1694+
1695+
void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
1696+
1697+
struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
1698+
unsigned int gc_seq);
1699+
1700+
void nft_setelem_data_deactivate(const struct net *net,
1701+
const struct nft_set *set,
1702+
struct nft_set_elem *elem);
1703+
17351704
int __init nft_chain_filter_init(void);
17361705
void nft_chain_filter_fini(void);
17371706

@@ -1758,6 +1727,7 @@ struct nftables_pernet {
17581727
struct mutex commit_mutex;
17591728
u64 table_handle;
17601729
unsigned int base_seq;
1730+
unsigned int gc_seq;
17611731
};
17621732

17631733
extern unsigned int nf_tables_net_id;

0 commit comments

Comments
 (0)