diff --git a/.github/buildomat/jobs/image.sh b/.github/buildomat/jobs/image.sh index 57dcb18..929960d 100755 --- a/.github/buildomat/jobs/image.sh +++ b/.github/buildomat/jobs/image.sh @@ -101,11 +101,11 @@ pfexec chown "$UID" /out banner "P4 Codegen" # Add gcc-12 so the p4 compiler can find cpp -# The tofino2 has 20 stages, but the current sidecar.p4 will fit into 14. We -# add the "--stages 14" here to detect if/when the program grows beyond that +# The tofino2 has 20 stages, but the current sidecar.p4 will fit into 18. We +# add the "--stages 18" here to detect if/when the program grows beyond that # limit. It's not necessarily a problem if we grow, but given the limited space # on the ASIC, we want to grow deliberatately and thoughtfully. -PATH=/opt/gcc-12/bin:$PATH cargo xtask codegen --stages 14 +PATH=/opt/gcc-12/bin:$PATH cargo xtask codegen --stages 18 # Preserve all the diagnostics spit out by the compiler mkdir -p /out/p4c-diags diff --git a/.gitignore b/.gitignore index 6f19fd5..eb3ea08 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ p4_artifacts* # Editor config .vscode .dir-locals.el +bacon.toml # OS artifacts .DS_Store diff --git a/Cargo.lock b/Cargo.lock index d51c021..4e9c672 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2755,7 +2755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -3751,9 +3751,9 @@ dependencies = [ [[package]] name = "oxnet" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95f58698da06f0f57b1ea4a8f1b0ca5741ee17927729d2e87dcfcb682266d21d" +checksum = "80e1dc143c5a701f879552428910f357df8bd725575087cc713088fdfeafe812" dependencies = [ "ipnetwork", "schemars", @@ -4936,9 +4936,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -4963,9 +4963,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", @@ -6846,7 +6846,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 58b616c..35ea0b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ omicron-common = { git = "https://github.com/oxidecomputer/omicron", branch= "ma oximeter = { git = "https://github.com/oxidecomputer/omicron", branch = "main" } oximeter-producer = { git = "https://github.com/oxidecomputer/omicron", branch = "main" } oximeter-instruments = { git = "https://github.com/oxidecomputer/omicron", branch = "main", default-features = false, features = ["kstat"] } -oxnet = { version = "0.1.1", default-features = false, features = ["schemars", "serde"] } +oxnet = { version = "0.1.2", default-features = 
false, features = ["schemars", "serde"] } propolis = { git = "https://github.com/oxidecomputer/propolis" } sled-agent-client = { git = "https://github.com/oxidecomputer/omicron", branch = "main" } smf = { git = "https://github.com/illumos/smf-rs" } diff --git a/aal/src/lib.rs b/aal/src/lib.rs index 1ece1d8..eec2c41 100644 --- a/aal/src/lib.rs +++ b/aal/src/lib.rs @@ -202,9 +202,15 @@ pub trait AsicOps { /// For a given multicast group, return the number of ports assigned to it. fn mc_port_count(&self, group_id: u16) -> AsicResult<usize>; - /// Add a port to a multicast group. The port is identified using its ASIC + /// Add a port to a multicast group. The port is identified using its ASIC /// identifier. - fn mc_port_add(&self, group_id: u16, port: AsicId) -> AsicResult<()>; + fn mc_port_add( + &self, + group_id: u16, + port: AsicId, + rid: u16, + level_1_excl_id: u16, + ) -> AsicResult<()>; /// Remove a port from a multicast group. The port is identified using its ASIC /// identifier. @@ -216,6 +222,21 @@ /// Destroy a multicast group. fn mc_group_destroy(&self, group_id: u16) -> AsicResult<()>; + /// Check if a multicast group exists. + fn mc_group_exists(&self, group_id: u16) -> bool { + self.mc_domains().contains(&group_id) + } + + /// Get the total number of multicast groups. + fn mc_groups_count(&self) -> AsicResult<usize>; + + /// Set the maximum number of multicast nodes. + fn mc_set_max_nodes( + &self, + max_nodes: u32, + max_link_aggregated_nodes: u32, + ) -> AsicResult<()>; + /// Get sidecar identifiers of the device being managed. fn get_sidecar_identifiers(&self) -> AsicResult<Identifiers>; diff --git a/aal/src/match_action.rs b/aal/src/match_action.rs index 1cfa45e..e713a22 100644 --- a/aal/src/match_action.rs +++ b/aal/src/match_action.rs @@ -72,7 +72,7 @@ impl MatchData { /// The MatchParse trait defines the behavior needed to convert a high-level /// Match field into our intermediate representation. 
pub trait MatchParse { - /// Return all the name sand values of the key fields as strings + /// Return all the names and values of the key fields as strings fn key_values(&self) -> BTreeMap; /// Convert the key Struct to a MatchData struct fn key_to_ir(&self) -> AsicResult; @@ -452,6 +452,27 @@ impl From for ValueTypes { } } +impl TryFrom<&ValueTypes> for bool { + type Error = &'static str; + + fn try_from(v: &ValueTypes) -> Result { + match v { + ValueTypes::U64(v) => { + if *v == 0 { + Ok(false) + } else if *v == 1 { + Ok(true) + } else { + Err("value not a boolean") + } + } + _ => Err("value not a boolean"), + } + } +} + +unwrap_value_entry!(bool); + #[derive(Debug, Hash, Clone)] pub enum ValueTypes { U64(u64), diff --git a/asic/src/chaos/mod.rs b/asic/src/chaos/mod.rs index feb1cbe..ab87da1 100644 --- a/asic/src/chaos/mod.rs +++ b/asic/src/chaos/mod.rs @@ -94,7 +94,18 @@ impl TableChaos { (table::SWITCH_IPV4_ADDR, v), (table::SWITCH_IPV6_ADDR, v), (table::NAT_INGRESS_IPV4, v), - (table::NAT_INGRESS_IPV6, v) + (table::NAT_INGRESS_IPV6, v), + (table::MCAST_NAT_INGRESS_IPV4, v), + (table::MCAST_NAT_INGRESS_IPV6, v), + (table::MCAST_REPLICATION_IPV4, v), + (table::MCAST_REPLICATION_IPV6, v), + (table::MCAST_SRC_FILTER_IPV4, v), + (table::MCAST_SRC_FILTER_IPV6, v), + (table::MCAST_ROUTE_IPV4, v), + (table::MCAST_ROUTE_IPV6, v), + (table::MCAST_MAC_REWRITE, v), + (table::MCAST_DECAP_PORTS, v), + (table::MCAST_PORT_ID_MAPPING, v) ) } @@ -141,6 +152,8 @@ pub struct AsicConfig { pub mc_port_remove: Chaos, pub mc_group_create: Chaos, pub mc_group_destroy: Chaos, + pub mc_groups_count: Chaos, + pub mc_set_max_nodes: Chaos, pub get_sidecar_identifiers: Chaos, pub table_new: TableChaos, pub table_clear: TableChaos, @@ -177,6 +190,8 @@ impl AsicConfig { mc_port_remove: Chaos::new(v), mc_group_create: Chaos::new(v), mc_group_destroy: Chaos::new(v), + mc_groups_count: Chaos::new(v), + mc_set_max_nodes: Chaos::new(v), get_sidecar_identifiers: Chaos::new(v), table_new: TableChaos::uniform(v), table_clear: TableChaos::uniform(v), @@ -203,6 +218,7 @@ impl AsicConfig { port_enable_get: Chaos::new(v), connector_avail_channels: Chaos::new(v), mc_port_count: Chaos::new(v), + mc_groups_count: Chaos::new(v), get_sidecar_identifiers: Chaos::new(v), ..Default::default() } @@ -224,6 +240,7 @@ impl AsicConfig { mc_port_remove: Chaos::new(v), mc_group_create: Chaos::new(v), mc_group_destroy: Chaos::new(v), + mc_set_max_nodes: Chaos::new(v), // TODO this can cause dpd to fail to start //table_clear: TableChaos::uniform(v), table_default_set: TableChaos::uniform(v), @@ -476,7 +493,13 @@ impl AsicOps for Handle { Ok(self.ports.lock().unwrap().len()) } - fn mc_port_add(&self, _group_id: u16, _port: u16) -> AsicResult<()> { + fn mc_port_add( + &self, + _group_id: u16, + _port: u16, + _rid: u16, + _level1_excl_id: u16, + ) -> AsicResult<()> { unfurl!(self, mc_port_add); Err(AsicError::OperationUnsupported) } @@ -496,6 +519,20 @@ impl AsicOps for Handle { Ok(()) } + fn mc_groups_count(&self) -> AsicResult { + unfurl!(self, mc_groups_count); + Ok(self.ports.lock().unwrap().len()) + } + + fn mc_set_max_nodes( + &self, + _max_nodes: u32, + _max_link_aggregated_nodes: u32, + ) -> AsicResult<()> { + unfurl!(self, mc_set_max_nodes); + Ok(()) + } + fn get_sidecar_identifiers(&self) -> AsicResult { unfurl!(self, get_sidecar_identifiers); Ok(Identifiers::default()) diff --git a/asic/src/chaos/table.rs b/asic/src/chaos/table.rs index 19ae33a..24699d8 100644 --- a/asic/src/chaos/table.rs +++ b/asic/src/chaos/table.rs @@ -25,6 +25,28 
@@ pub const SWITCH_IPV4_ADDR: &str = "pipe.Ingress.filter.switch_ipv4_addr"; pub const SWITCH_IPV6_ADDR: &str = "pipe.Ingress.filter.switch_ipv6_addr"; pub const NAT_INGRESS_IPV4: &str = "pipe.Ingress.nat_ingress.ingress_ipv4"; pub const NAT_INGRESS_IPV6: &str = "pipe.Ingress.nat_ingress.ingress_ipv6"; +pub(crate) const MCAST_NAT_INGRESS_IPV4: &str = + "pipe.Ingress.nat_ingress.ingress_ipv4_mcast"; +pub(crate) const MCAST_NAT_INGRESS_IPV6: &str = + "pipe.Ingress.nat_ingress.ingress_ipv6_mcast"; +pub(crate) const MCAST_REPLICATION_IPV4: &str = + "pipe.Ingress.mcast_ingress.mcast_replication_ipv4"; +pub(crate) const MCAST_REPLICATION_IPV6: &str = + "pipe.Ingress.mcast_ingress.mcast_replication_ipv6"; +pub(crate) const MCAST_SRC_FILTER_IPV4: &str = + "pipe.Ingress.mcast_ingress.mcast_source_filter_ipv4"; +pub(crate) const MCAST_SRC_FILTER_IPV6: &str = + "pipe.Ingress.mcast_ingress.mcast_source_filter_ipv6"; +pub(crate) const MCAST_ROUTE_IPV4: &str = + "pipe.Ingress.l3_router.MulticastRouter4.tbl"; +pub(crate) const MCAST_ROUTE_IPV6: &str = + "pipe.Ingress.l3_router.MulticastRouter6.tbl"; +pub(crate) const MCAST_MAC_REWRITE: &str = + "pipe.Egress.mac_rewrite.mac_rewrite"; +pub(crate) const MCAST_DECAP_PORTS: &str = + "pipe.Egress.mcast_egress.tbl_decap_ports"; +pub(crate) const MCAST_PORT_ID_MAPPING: &str = + "pipe.Egress.mcast_egress.asic_id_to_port"; pub struct Table { name: String, diff --git a/asic/src/softnpu/mod.rs b/asic/src/softnpu/mod.rs index 8edd1bc..73d2095 100644 --- a/asic/src/softnpu/mod.rs +++ b/asic/src/softnpu/mod.rs @@ -349,7 +349,13 @@ impl AsicOps for Handle { Ok(self.ports.lock().unwrap().len()) } - fn mc_port_add(&self, _group_id: u16, _port: u16) -> AsicResult<()> { + fn mc_port_add( + &self, + _group_id: u16, + _port: u16, + _rid: u16, + _level1_excl_id: u16, + ) -> AsicResult<()> { Err(AsicError::OperationUnsupported) } @@ -365,6 +371,18 @@ impl AsicOps for Handle { Ok(()) } + fn mc_groups_count(&self) -> AsicResult { + Ok(self.ports.lock().unwrap().len()) + } + + fn mc_set_max_nodes( + &self, + _max_nodes: u32, + _max_link_aggregated_nodes: u32, + ) -> AsicResult<()> { + Ok(()) + } + fn get_sidecar_identifiers(&self) -> AsicResult { Ok(Identifiers { id: Uuid::new_v4(), diff --git a/asic/src/tofino_asic/imported_bf_functions b/asic/src/tofino_asic/imported_bf_functions index 5b1c993..54775dd 100644 --- a/asic/src/tofino_asic/imported_bf_functions +++ b/asic/src/tofino_asic/imported_bf_functions @@ -22,11 +22,13 @@ bf_mc_create_session bf_mc_destroy_session bf_mc_mgrp_create bf_mc_mgrp_destroy +bf_mc_mgrp_get_count bf_mc_node_create bf_mc_node_destroy bf_mc_node_update bf_mc_associate_node bf_mc_dissociate_node +bf_mc_set_max_node_threshold # bf_rt calls bf_rt_table_from_name_get diff --git a/asic/src/tofino_asic/multicast.rs b/asic/src/tofino_asic/mcast.rs similarity index 82% rename from asic/src/tofino_asic/multicast.rs rename to asic/src/tofino_asic/mcast.rs index 7458d02..214b801 100644 --- a/asic/src/tofino_asic/multicast.rs +++ b/asic/src/tofino_asic/mcast.rs @@ -80,6 +80,19 @@ fn mgrp_destroy( Ok(()) } +fn mgrp_get_count( + mcast_hdl: &Handle, + dev_id: bf_dev_id_t, + mut count: u32, +) -> AsicResult { + unsafe { + bf_mc_mgrp_get_count(mcast_hdl.bf_get().mcast_hdl, dev_id, &mut count) + .check_error("getting total count of multicast groups")?; + } + + Ok(count as usize) +} + fn associate_node( mcast_hdl: bf_mc_session_hdl_t, dev_id: bf_dev_id_t, @@ -93,7 +106,7 @@ fn associate_node( dev_id, mgrp_hdl, node_hdl, - true, + exclusion_id != 0, exclusion_id, ) 
.check_error("associating multicast node")?; @@ -166,6 +179,26 @@ fn cleanup_node( node_destroy(bf.mcast_hdl, bf.dev_id, port_state.node_hdl) } +fn set_max_node_threshold( + mcast_hdl: bf_mc_session_hdl_t, + dev_id: bf_dev_id_t, + node_count: i32, + node_port_lag_count: i32, +) -> AsicResult<()> { + unsafe { + bf_mc_set_max_node_threshold( + mcast_hdl, + dev_id, + node_count, + node_port_lag_count, + ) + .check_error("setting max node threshold")?; + } + + Ok(()) +} + +/// All multicast domains. pub fn domains(hdl: &Handle) -> Vec { let mut list = Vec::new(); let domains = hdl.domains.lock().unwrap(); @@ -190,6 +223,7 @@ fn domain_ports(domain: &DomainState) -> Vec { list } +/// Get the number of ports in a multicast domain. pub fn domain_port_count(hdl: &Handle, group_id: u16) -> AsicResult { let mut domains = hdl.domains.lock().unwrap(); match domains.get_mut(&group_id) { @@ -198,15 +232,19 @@ pub fn domain_port_count(hdl: &Handle, group_id: u16) -> AsicResult { } } +/// Add a port to a multicast domain. pub fn domain_add_port( hdl: &Handle, group_id: u16, port: u16, + rid: u16, + level_1_excl_id: u16, ) -> AsicResult<()> { debug!( hdl.log, "adding port {} to multicast domain {}", port, group_id ); + let mut domains = hdl.domains.lock().unwrap(); let domain = match domains.get_mut(&group_id) { Some(d) => Ok(d), @@ -231,7 +269,7 @@ pub fn domain_add_port( mc.node_hdl = node_create( bf.mcast_hdl, bf.dev_id, - port, // Use port_id as the replication ID + rid, &mut mc.portmap, &mut mc.lagmap, )?; @@ -241,7 +279,7 @@ pub fn domain_add_port( bf.dev_id, domain.mgrp_hdl, mc.node_hdl, - port, // use the port number as the l1 exclusion ID + level_1_excl_id, ) { Ok(_) => { domain.ports.insert(port, mc); @@ -259,6 +297,7 @@ pub fn domain_add_port( } } +/// Remove a port from a multicast domain. pub fn domain_remove_port( hdl: &Handle, group_id: u16, @@ -287,6 +326,7 @@ pub fn domain_remove_port( Ok(()) } +/// Create a multicast domain. pub fn domain_create(hdl: &Handle, group_id: u16) -> AsicResult<()> { info!(hdl.log, "creating multicast domain {}", group_id); let mut domains = hdl.domains.lock().unwrap(); @@ -308,6 +348,7 @@ pub fn domain_create(hdl: &Handle, group_id: u16) -> AsicResult<()> { Ok(()) } +/// Destroy a multicast domain. pub fn domain_destroy(hdl: &Handle, group_id: u16) -> AsicResult<()> { info!(hdl.log, "destroying multicast domain {}", group_id); let mut domains = hdl.domains.lock().unwrap(); @@ -333,3 +374,30 @@ pub fn domain_destroy(hdl: &Handle, group_id: u16) -> AsicResult<()> { mgrp_destroy(bf.mcast_hdl, bf.dev_id, domain.mgrp_hdl) } + +/// Domain exists. +pub fn domain_exists(hdl: &Handle, group_id: u16) -> bool { + let domains = hdl.domains.lock().unwrap(); + domains.contains_key(&group_id) +} + +/// Get the total number of multicast domains. +pub fn domains_count(hdl: &Handle) -> AsicResult { + let bf = hdl.bf_get(); + mgrp_get_count(hdl, bf.dev_id, 0) +} + +/// Set the maximum number of multicast nodes. 
+pub fn set_max_nodes( + hdl: &Handle, + node_count: u32, + node_port_lag_count: u32, +) -> AsicResult<()> { + let bf = hdl.bf_get(); + set_max_node_threshold( + bf.mcast_hdl, + bf.dev_id, + node_count as i32, + node_port_lag_count as i32, + ) +} diff --git a/asic/src/tofino_asic/mod.rs b/asic/src/tofino_asic/mod.rs index 98da4f1..90c4464 100644 --- a/asic/src/tofino_asic/mod.rs +++ b/asic/src/tofino_asic/mod.rs @@ -19,7 +19,7 @@ mod bf_wrapper; mod genpd; mod link_fsm; -pub mod multicast; +pub mod mcast; pub mod ports; pub mod qsfp; mod sde_log; @@ -154,27 +154,49 @@ impl AsicOps for Handle { } fn mc_domains(&self) -> Vec { - multicast::domains(self) + mcast::domains(self) } fn mc_port_count(&self, group_id: u16) -> AsicResult { - multicast::domain_port_count(self, group_id) + mcast::domain_port_count(self, group_id) } - fn mc_port_add(&self, group_id: u16, port: u16) -> AsicResult<()> { - multicast::domain_add_port(self, group_id, port) + fn mc_port_add( + &self, + group_id: u16, + port: u16, + rid: u16, + level_1_excl_id: u16, + ) -> AsicResult<()> { + mcast::domain_add_port(self, group_id, port, rid, level_1_excl_id) } fn mc_port_remove(&self, group_id: u16, port: u16) -> AsicResult<()> { - multicast::domain_remove_port(self, group_id, port) + mcast::domain_remove_port(self, group_id, port) } fn mc_group_create(&self, group_id: u16) -> AsicResult<()> { - multicast::domain_create(self, group_id) + mcast::domain_create(self, group_id) } fn mc_group_destroy(&self, group_id: u16) -> AsicResult<()> { - multicast::domain_destroy(self, group_id) + mcast::domain_destroy(self, group_id) + } + + fn mc_group_exists(&self, group_id: u16) -> bool { + mcast::domain_exists(self, group_id) + } + + fn mc_groups_count(&self) -> AsicResult { + mcast::domains_count(self) + } + + fn mc_set_max_nodes( + &self, + max_nodes: u32, + max_link_aggregated_nodes: u32, + ) -> AsicResult<()> { + mcast::set_max_nodes(self, max_nodes, max_link_aggregated_nodes) } // Ideally we would get some sort of sidecar-level ID from the FRUID. @@ -242,7 +264,7 @@ pub struct Handle { rt: tofino_common::BfRt, log: slog::Logger, phys_ports: Mutex, - domains: Mutex>, + domains: Mutex>, eth_connector_id: Option, } @@ -326,7 +348,7 @@ impl Handle { // Note: we assume that bf_mc_init() has been called as part of the // bf_switch_init() operation. - bf.mcast_hdl = multicast::create_session()?; + bf.mcast_hdl = mcast::create_session()?; Ok(Handle { dev_id, diff --git a/asic/src/tofino_stub/multicast.rs b/asic/src/tofino_stub/mcast.rs similarity index 73% rename from asic/src/tofino_stub/multicast.rs rename to asic/src/tofino_stub/mcast.rs index e5d74c9..ec2ae3b 100644 --- a/asic/src/tofino_stub/multicast.rs +++ b/asic/src/tofino_stub/mcast.rs @@ -17,9 +17,12 @@ fn no_group(group_id: u16) -> AsicError { } impl McGroupData { + /// Get the list of multicast domains. pub fn domains(&self) -> Vec { self.groups.keys().copied().collect() } + + /// Get the number of ports in a multicast domain. pub fn domain_port_count(&self, group_id: u16) -> AsicResult { match self.groups.get(&group_id) { Some(g) => Ok(g.len()), @@ -27,10 +30,13 @@ impl McGroupData { } } + /// Add a port to a multicast domain. pub fn domain_port_add( &mut self, group_id: u16, port: u16, + _rid: u16, + _level1_excl_id: u16, ) -> AsicResult<()> { let group = match self.groups.get_mut(&group_id) { Some(g) => Ok(g), @@ -45,6 +51,7 @@ impl McGroupData { } } + /// Remove a port from a multicast domain. 
pub fn domain_port_remove( &mut self, group_id: u16, @@ -63,6 +70,7 @@ impl McGroupData { } } + /// Create a multicast domain. #[allow(clippy::map_entry)] pub fn domain_create(&mut self, group_id: u16) -> AsicResult<()> { if self.groups.contains_key(&group_id) { @@ -75,12 +83,34 @@ impl McGroupData { } } + /// Destroy a multicast domain. pub fn domain_destroy(&mut self, group_id: u16) -> AsicResult<()> { match self.groups.remove(&group_id) { Some(_) => Ok(()), None => Err(no_group(group_id)), } } + + /// Get the total number of multicast domains. + pub fn domains_count(&self) -> usize { + self.groups.len() + } + + /// Set the maximum number of nodes in a multicast group. + pub fn set_max_nodes( + &mut self, + max_nodes: u32, + _max_link_aggregated_nodes: u32, + ) -> AsicResult<()> { + let total = self.domains_count(); + if total as u32 > max_nodes { + return Err(AsicError::InvalidArg(format!( + "number of multicast groups {total} exceeds max nodes {max_nodes}" + ))); + } + + Ok(()) + } } pub fn init() -> McGroupData { diff --git a/asic/src/tofino_stub/mod.rs b/asic/src/tofino_stub/mod.rs index 8279686..9191fba 100644 --- a/asic/src/tofino_stub/mod.rs +++ b/asic/src/tofino_stub/mod.rs @@ -22,7 +22,7 @@ pub use crate::faux_fsm::FsmState; pub use crate::faux_fsm::FsmType; pub use crate::faux_fsm::PortFsmState; -pub mod multicast; +pub mod mcast; pub mod ports; pub mod table; @@ -168,13 +168,19 @@ impl AsicOps for StubHandle { let mc_data = self.mc_data.lock().unwrap(); mc_data.domain_port_count(group_id) } - fn mc_port_add(&self, group_id: u16, port: u16) -> AsicResult<()> { + fn mc_port_add( + &self, + group_id: u16, + port: u16, + rid: u16, + level1_excl_id: u16, + ) -> AsicResult<()> { info!( self.log, "adding port {} to multicast group {}", port, group_id ); let mut mc_data = self.mc_data.lock().unwrap(); - mc_data.domain_port_add(group_id, port) + mc_data.domain_port_add(group_id, port, rid, level1_excl_id) } fn mc_port_remove(&self, group_id: u16, port: u16) -> AsicResult<()> { info!( @@ -196,6 +202,27 @@ impl AsicOps for StubHandle { mc_data.domain_destroy(group_id) } + fn mc_groups_count(&self) -> AsicResult { + info!(self.log, "number of multicast groups"); + let mc_data = self.mc_data.lock().unwrap(); + Ok(mc_data.domains().len()) + } + + fn mc_set_max_nodes( + &self, + max_nodes: u32, + max_link_aggregated_nodes: u32, + ) -> AsicResult<()> { + info!( + self.log, + "setting max nodes to {} and max link aggregated nodes to {}", + max_nodes, + max_link_aggregated_nodes + ); + let mut mc_data = self.mc_data.lock().unwrap(); + mc_data.set_max_nodes(max_nodes, max_link_aggregated_nodes) + } + fn get_sidecar_identifiers(&self) -> AsicResult { Ok(Identifiers { id: uuid::Uuid::parse_str(SIDECAR_UUID).unwrap(), @@ -222,8 +249,7 @@ pub struct StubHandle { log: slog::Logger, phys_ports: Mutex, port_state: Mutex>, - mc_data: Mutex, - + mc_data: Mutex, update_tx: Mutex>>, } @@ -233,7 +259,7 @@ impl StubHandle { let rt = BfRt::init(&p4_dir)?; let phys_ports = Mutex::new(ports::init()?); let port_state = Mutex::new(BTreeMap::new()); - let mc_data = Mutex::new(multicast::init()); + let mc_data = Mutex::new(mcast::init()); let log = log.new(o!()); Ok(StubHandle { diff --git a/common/src/network.rs b/common/src/network.rs index fe500a0..6bf8b8d 100644 --- a/common/src/network.rs +++ b/common/src/network.rs @@ -94,6 +94,11 @@ impl MacAddr { self == EMPTY } + /// Return `true` if `self` is a multicast MAC address. 
+ pub fn is_multicast(self) -> bool { + (self.a[0] & 0x01) == 0x01 + } + /// Generate a random MAC address. pub fn random() -> MacAddr { let mut rng = rand::thread_rng(); diff --git a/dpd-client/.cargo/config b/dpd-client/.cargo/config.toml similarity index 100% rename from dpd-client/.cargo/config rename to dpd-client/.cargo/config.toml diff --git a/dpd-client/tests/integration_tests/common.rs b/dpd-client/tests/integration_tests/common.rs index a653737..a8950e1 100644 --- a/dpd-client/tests/integration_tests/common.rs +++ b/dpd-client/tests/integration_tests/common.rs @@ -571,6 +571,19 @@ impl Switch { } } + /// Return the port label for the given physical port, useful for + /// counter information. + pub fn port_label(&self, phys_port: PhysPort) -> Option { + let idx: usize = phys_port.into(); + if phys_port == NO_PORT { + None + } else if let Some(port) = &self.ports[idx] { + format!("{}/{}", port.port_id, port.link_id).parse().ok() + } else { + panic!("request for missing port: {phys_port}"); + } + } + /// Return an iterator over all links. pub fn iter_links( &self, @@ -780,6 +793,38 @@ impl Switch { .join(", "))), } } + + /// If no client_name is given, this returns the number of packets dropped + /// for a given reason. + /// + /// If a client_name is given, we look up the counter for that client + /// name and the given counter name. + /// + /// If that counter isn't in the set returned by dpd, we return + /// an error to the caller. + pub async fn get_counter( + &self, + counter: &str, + client_name: Option<&str>, + ) -> anyhow::Result { + let client_name = if let Some(client_name) = client_name { + client_name.to_string() + } else { + "ingress_drop_reason".to_string() + }; + + self.client + .counter_get(&client_name, true) + .await + .map_err(|e| anyhow::anyhow!("failed to fetch counters: {e:?}")) + .and_then(|entries| { + entries + .iter() + .find(|e| e.keys.get("label").unwrap().as_str() == counter) + .map(|e| e.data.pkts.unwrap()) + .ok_or(anyhow::anyhow!("no such counter: {counter}")) + }) + } } // Construct a single TCP packet with an optional payload diff --git a/dpd-client/tests/integration_tests/counters.rs b/dpd-client/tests/integration_tests/counters.rs index 8b78b9f..bd2db32 100644 --- a/dpd-client/tests/integration_tests/counters.rs +++ b/dpd-client/tests/integration_tests/counters.rs @@ -12,8 +12,6 @@ use std::net::Ipv4Addr; use std::sync::Arc; -use anyhow::anyhow; - use packet::Endpoint; use crate::integration_tests::common; @@ -23,23 +21,6 @@ use ::common::network::MacAddr; use dpd_client::types::Ipv4Entry; use dpd_client::types::Ipv6Entry; -// Returns the number of packets dropped for the given reason. If that -// counter isn't in the set returned by dpd, we return an error to the caller. -async fn get_counter(switch: &Switch, counter: &str) -> anyhow::Result { - switch - .client - .counter_get("drop_reason", true) - .await - .map_err(|e| anyhow!("failed to fetch counters: {e:?}")) - .and_then(|entries| { - entries - .iter() - .find(|e| e.keys.get("label").unwrap().as_str() == counter) - .map(|e| e.data.pkts.unwrap()) - .ok_or(anyhow!("no such counter: {counter}")) - }) -} - // Run a single drop test. This sends a packet that we expect to be dropped, // and verifies that the expected drop counter is bumped by one. 
If the test // runs to completion we return the counter evaluation as a boolean rather than @@ -56,7 +37,7 @@ async fn one_drop_test( port, }; - let old = get_counter(switch, counter).await?; + let old = switch.get_counter(counter, None).await?; switch.packet_test(vec![send], Vec::new())?; let mut new = 0; @@ -67,7 +48,7 @@ async fn one_drop_test( // avoid a long pointless delay here, we try multiple times with a short // sleep rather than once with a long sleep. std::thread::sleep(std::time::Duration::from_millis(100)); - new = get_counter(switch, counter).await?; + new = switch.get_counter(counter, None).await?; if old + 1 == new { break; } diff --git a/dpd-client/tests/integration_tests/mcast.rs b/dpd-client/tests/integration_tests/mcast.rs new file mode 100644 index 0000000..1dfe6ca --- /dev/null +++ b/dpd-client/tests/integration_tests/mcast.rs @@ -0,0 +1,4632 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +use std::{ + collections::HashSet, + net::{IpAddr, Ipv4Addr, Ipv6Addr}, + sync::Arc, +}; + +use crate::integration_tests::{ + common::{self, get_switch, prelude::*}, + nat::{gen_geneve_packet, gen_geneve_packet_with_mcast_tag}, +}; +use ::common::network::MacAddr; +use anyhow::anyhow; +use dpd_client::{types, Error}; +use futures::TryStreamExt; +use oxnet::Ipv4Net; +use packet::{eth, geneve, ipv4, ipv6, udp, Endpoint}; + +const MULTICAST_TEST_IPV4: Ipv4Addr = Ipv4Addr::new(224, 0, 1, 0); +const MULTICAST_TEST_IPV6: Ipv6Addr = + Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 1, 0x1010); +const MULTICAST_TEST_IPV4_SSM: Ipv4Addr = Ipv4Addr::new(232, 123, 45, 67); +const MULTICAST_TEST_IPV6_SSM: Ipv6Addr = + Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0x1111); +const MULTICAST_NAT_IP: Ipv6Addr = Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 1); +const GIMLET_MAC: &str = "11:22:33:44:55:66"; +const GIMLET_IP: Ipv6Addr = + Ipv6Addr::new(0xfd00, 0x1122, 0x7788, 0x0101, 0, 0, 0, 4); + +// Bifurcated Multicast Design: +// +// The multicast implementation uses a bifurcated design that separates external +// (customer) and (internal) underlay traffic: +// +// 1. External-only groups (IPv4 and non-admin-scoped IPv6): +// - Created from API control plane IPs for customer traffic +// - Handle customer traffic to/from outside the rack +// - Use the external multicast API (/multicast/external-groups) +// - Must have NAT targets pointing to internal groups for proper forwarding +// +// 2. Internal groups (admin-scoped IPv6 multicast): +// - Admin-scoped = admin-local, site-local, or organization-local scope (RFC 7346, RFC 4291) +// - Geneve encapsulated multicast traffic (NAT targets of external-only groups) +// - Use the internal multicast API (/multicast/groups) +// - Can replicate to: +// a) External group members (customer traffic) +// b) Underlay-only members (infrastructure traffic) +// c) Both external and underlay members (bifurcated replication) +// - Don't require NAT targets (they serve as targets for external-only groups) +// +// This design ensures proper traffic separation and enables flexible multicast forwarding +// policies between external networks and internal rack infrastructure. 
+ +fn derive_ipv6_mcast_mac(ipv6_addr: &Ipv6Addr) -> MacAddr { + // Get the octets of the IPv6 address + let ip_octets = ipv6_addr.octets(); + + // Create the MAC address + // First 2 bytes: 0x33, 0x33 (fixed prefix for IPv6 multicast) + // Last 4 bytes: Take the last 4 bytes of the IPv6 address + let mac_bytes = [ + 0x33, // First byte: 33 + 0x33, // Second byte: 33 + ip_octets[12], // Third byte: 13th octet of IPv6 address + ip_octets[13], // Fourth byte: 14th octet of IPv6 address + ip_octets[14], // Fifth byte: 15th octet of IPv6 address + ip_octets[15], // Sixth byte: 16th octet of IPv6 address + ]; + + MacAddr::from(mac_bytes) +} + +async fn check_counter_incremented( + switch: &Switch, + counter_name: &str, + baseline: u64, + expected_increment: u64, + client_name: Option<&str>, +) -> anyhow::Result { + let mut new_value = 0; + + // Poll for the counter value (with timeout) + for _i in 0..20 { + std::thread::sleep(std::time::Duration::from_millis(100)); + new_value = + switch.get_counter(counter_name, client_name).await.unwrap(); + + if new_value == baseline + expected_increment { + return Ok(new_value); + } + } + + // Counter didn't increment as expected + Err(anyhow!( + "Counter '{}' expected to increase by {} (from {} to {}), but only reached {}", + counter_name, + expected_increment, + baseline, + baseline + expected_increment, + new_value + )) +} + +fn create_nat_target_ipv4() -> types::NatTarget { + types::NatTarget { + internal_ip: MULTICAST_NAT_IP.into(), + inner_mac: MacAddr::new(0x01, 0x00, 0x5e, 0x00, 0x00, 0x01).into(), + vni: 100.into(), + } +} + +fn create_nat_target_ipv6() -> types::NatTarget { + types::NatTarget { + internal_ip: MULTICAST_NAT_IP.into(), + inner_mac: MacAddr::new(0x33, 0x33, 0x00, 0x00, 0x00, 0x01).into(), + vni: 100.into(), + } +} + +/// Create a multicast group for testing. 
+async fn create_test_multicast_group( + switch: &Switch, + group_ip: IpAddr, + tag: Option<&str>, + ports: &[(PhysPort, types::Direction)], + vlan_id: Option, + create_nat: bool, + sources: Option>, +) -> types::MulticastGroupResponse { + let members = ports + .iter() + .map(|(port, dir)| { + let (port_id, link_id) = switch.link_id(*port).unwrap(); + types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: *dir, + } + }) + .collect(); + + let nat_target = if create_nat { + if group_ip.is_ipv4() { + Some(create_nat_target_ipv4()) + } else { + Some(create_nat_target_ipv6()) + } + } else { + None + }; + + match group_ip { + IpAddr::V4(_) => { + // IPv4 groups are always external and require NAT targets + let nat_target = + nat_target.expect("IPv4 external groups require NAT targets"); + let external_entry = types::MulticastGroupCreateExternalEntry { + group_ip, + tag: tag.map(String::from), + nat_target, + vlan_id, + sources, + }; + switch + .client + .multicast_group_create_external(&external_entry) + .await + .expect("Failed to create external multicast group") + .into_inner() + } + IpAddr::V6(ipv6) => { + if oxnet::Ipv6Net::new_unchecked(ipv6, 128) + .is_admin_scoped_multicast() + { + // Admin-scoped IPv6 groups are internal + let internal_entry = types::MulticastGroupCreateEntry { + group_ip: match group_ip { + IpAddr::V6(ipv6) => ipv6, + _ => panic!("Expected IPv6 address"), + }, + tag: tag.map(String::from), + sources, + members, + }; + switch + .client + .multicast_group_create(&internal_entry) + .await + .expect("Failed to create internal multicast group") + .into_inner() + } else { + // Non-admin-scoped IPv6 groups are external-only and require NAT targets + let nat_target = nat_target + .expect("IPv6 external groups require NAT targets"); + let external_entry = types::MulticastGroupCreateExternalEntry { + group_ip, + tag: tag.map(String::from), + nat_target, + vlan_id, + sources, + }; + switch + .client + .multicast_group_create_external(&external_entry) + .await + .expect("Failed to create external multicast group") + .into_inner() + } + } + } +} + +/// Clean up a test group. +async fn cleanup_test_group(switch: &Switch, group_ip: IpAddr) { + let _ = switch.client.multicast_group_delete(&group_ip).await; +} + +/// Create an IPv4 multicast packet for testing. +fn create_ipv4_multicast_packet( + multicast_ip_addr: IpAddr, + src_mac: MacAddr, + src_ip: &str, + src_port: u16, + dst_port: u16, +) -> packet::Packet { + let multicast_ip = match multicast_ip_addr { + IpAddr::V4(addr) => addr, + _ => panic!("Expected IPv4 address"), + }; + + // Create the multicast MAC address following RFC 1112 + let mac_bytes = [ + 0x01, + 0x00, + 0x5e, + multicast_ip.octets()[1] & 0x7f, + multicast_ip.octets()[2], + multicast_ip.octets()[3], + ]; + let multicast_mac = MacAddr::from(mac_bytes); + + let src_endpoint = + Endpoint::parse(&src_mac.to_string(), src_ip, src_port).unwrap(); + + let dst_endpoint = Endpoint::parse( + &multicast_mac.to_string(), + &multicast_ip.to_string(), + dst_port, + ) + .unwrap(); + + // Generate a UDP packet + common::gen_udp_packet(src_endpoint, dst_endpoint) +} + +/// Create an IPv6 multicast packet for testing. 
+fn create_ipv6_multicast_packet( + multicast_ip_addr: IpAddr, + src_mac: MacAddr, + src_ip: &str, + src_port: u16, + dst_port: u16, +) -> packet::Packet { + let multicast_ip = match multicast_ip_addr { + IpAddr::V6(addr) => addr, + _ => panic!("Expected IPv6 address"), + }; + + // Create the multicast MAC address following RFC 2464 + // IPv6 multicast addresses use the prefix 33:33 followed by + // the last 32 bits of the IPv6 address + let mac_bytes = [ + 0x33, + 0x33, + multicast_ip.octets()[12], + multicast_ip.octets()[13], + multicast_ip.octets()[14], + multicast_ip.octets()[15], + ]; + let multicast_mac = MacAddr::from(mac_bytes); + + let src_endpoint = + Endpoint::parse(&src_mac.to_string(), src_ip, src_port).unwrap(); + + let dst_endpoint = Endpoint::parse( + &multicast_mac.to_string(), + &multicast_ip.to_string(), + dst_port, + ) + .unwrap(); + + // Generate a UDP packet + common::gen_udp_packet(src_endpoint, dst_endpoint) +} + +/// Prepare the expected packet for multicast testing that either goes +/// through NAT or is forwarded directly. +fn prepare_expected_pkt( + switch: &Switch, + send_pkt: &packet::Packet, + vlan: Option, + nat_target: Option<&types::NatTarget>, + switch_port: Option, +) -> packet::Packet { + match nat_target { + Some(nat) => { + // Deparse the incoming packet so we can copy it into the encapsulated + // packet + let ingress_payload = { + let mut encapped = send_pkt.clone(); + let eth = encapped.hdrs.eth_hdr.as_mut().unwrap(); + eth.eth_smac = MacAddr::new(0, 0, 0, 0, 0, 0); + eth.eth_dmac = nat.inner_mac.clone().into(); + encapped.deparse().unwrap().to_vec() + }; + + let switch_port_mac = switch + .get_port_mac(switch_port.unwrap()) + .unwrap() + .to_string(); + + let mut forward_pkt = gen_geneve_packet( + Endpoint::parse( + &switch_port_mac, + "::0", + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + Endpoint::parse( + &derive_ipv6_mcast_mac(&nat.internal_ip).to_string(), + &nat.internal_ip.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + eth::ETHER_ETHER, + *nat.vni, + true, + &ingress_payload, + ); + + ipv6::Ipv6Hdr::adjust_hlim(&mut forward_pkt, -1); + udp::UdpHdr::update_checksum(&mut forward_pkt); + + forward_pkt + } + None => { + // For non-NAT case, just forward the packet with proper TTL/hop limit adjustment + let mut recv_pkt = send_pkt.clone(); + + if let Some(_) = recv_pkt.hdrs.ipv4_hdr.as_mut() { + ipv4::Ipv4Hdr::adjust_ttl(&mut recv_pkt, -1); + } else if let Some(_) = recv_pkt.hdrs.ipv6_hdr.as_mut() { + ipv6::Ipv6Hdr::adjust_hlim(&mut recv_pkt, -1); + } + + // Add VLAN tag if required + if let Some(vlan_id) = vlan { + recv_pkt.hdrs.eth_hdr.as_mut().unwrap().eth_8021q = + Some(eth::EthQHdr { + eth_pcp: 0, + eth_dei: 0, + eth_vlan_tag: vlan_id, + }); + } + + // Rewrite src mac + if let Some(port) = switch_port { + let port_mac = switch.get_port_mac(port).unwrap(); + recv_pkt.hdrs.eth_hdr.as_mut().unwrap().eth_smac = + port_mac.clone(); + } + + recv_pkt + } + } +} + +#[tokio::test] +#[ignore] +async fn test_nonexisting_group() { + let switch = &*get_switch().await; + + // Test retrieving by IP address + let group_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let res = switch + .client + .multicast_group_get(&group_ip) + .await + .expect_err("Should not be able to get non-existent group by IP"); + + match res { + Error::ErrorResponse(inner) => { + assert_eq!(inner.status(), 404, "Expected 404 Not Found status code"); + }, + _ => panic!("Expected ErrorResponse when getting a non-existent multicast group"), + } +} + +#[tokio::test] +#[ignore] +async 
fn test_group_creation_with_validation() { + let switch = &*get_switch().await; + + // Test the bifurcated multicast design: + // - IPv4 external groups require NAT targets pointing to internal groups + // - Internal groups handle Geneve encapsulated replication infrastructure + let nat_target = create_nat_target_ipv4(); + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + + let egress1 = PhysPort(28); + let internal_group = create_test_multicast_group( + switch, + internal_multicast_ip, + Some("valid_internal_group"), + &[(egress1, types::Direction::Underlay)], + None, + false, + None, + ) + .await; + + assert!(internal_group.underlay_group_id.is_some()); + + // 1. Test creating a group with invalid parameters (e.g., invalid VLAN ID) + // IPv4 groups are always external + let external_invalid = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4(MULTICAST_TEST_IPV4), + tag: Some("test_invalid".to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(4096), // Invalid: VLAN ID must be 1-4095 + sources: None, + }; + + let res = switch + .client + .multicast_group_create_external(&external_invalid) + .await + .expect_err("Should fail with invalid VLAN ID"); + + match res { + Error::ErrorResponse(inner) => { + assert_eq!( + inner.status(), + 400, + "Expected 400 Bad Request status code" + ); + } + _ => panic!("Expected ErrorResponse for invalid VLAN ID"), + } + + // 2. Test with valid parameters + // IPv4 groups are always external + let external_valid = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4(MULTICAST_TEST_IPV4_SSM), + tag: Some("test_valid".to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(10), + sources: Some(vec![types::IpSrc::Exact( + "192.168.1.1".parse::().unwrap(), + )]), + }; + + let created = switch + .client + .multicast_group_create_external(&external_valid) + .await + .expect("Should successfully create valid group") + .into_inner(); + + assert_eq!(created.group_ip, MULTICAST_TEST_IPV4_SSM); + assert!(created.external_group_id.is_none()); + assert!(created.underlay_group_id.is_none()); + assert_eq!(created.tag, Some("test_valid".to_string())); + assert_eq!(created.int_fwding.nat_target, Some(nat_target.clone())); + assert_eq!(created.ext_fwding.vlan_id, Some(10)); + assert_eq!( + created.sources, + Some(vec![types::IpSrc::Exact( + "192.168.1.1".parse::().unwrap(), + )]) + ); + assert_eq!(created.members.len(), 0); // External groups don't have members + + switch + .client + .multicast_group_delete(&created.group_ip) + .await + .expect("Failed to delete test group"); +} + +#[tokio::test] +#[ignore] +async fn test_internal_ipv6_validation() { + let switch = &*get_switch().await; + + let (port_id, link_id) = switch.link_id(PhysPort(26)).unwrap(); + + // Test 1: IPv4-mapped IPv6 addresses should be rejected as invalid multicast + let ipv4_mapped_internal = types::MulticastGroupCreateEntry { + group_ip: "::ffff:224.1.1.1".parse().unwrap(), // IPv4-mapped IPv6 + tag: Some("test_ipv4_mapped_internal".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::External, + }], + }; + + let ipv4_mapped_res = switch + .client + .multicast_group_create(&ipv4_mapped_internal) + .await; + + assert!( + ipv4_mapped_res.is_err(), + "Should reject IPv4-mapped IPv6 addresses" + ); + let ipv4_mapped_error_msg = format!("{:?}", ipv4_mapped_res.unwrap_err()); + assert!( + ipv4_mapped_error_msg.contains("is not a multicast address"), + "Error 
message should indicate invalid multicast address: {}", + ipv4_mapped_error_msg + ); + + // Test 2: Non-admin-scoped IPv6 groups should be rejected from internal API + let non_admin_ipv6 = types::MulticastGroupCreateEntry { + group_ip: "ff0e::1".parse().unwrap(), // Global scope, not admin-scoped + tag: Some("test_non_admin".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::External, + }], + }; + + let non_admin_res = + switch.client.multicast_group_create(&non_admin_ipv6).await; + + assert!( + non_admin_res.is_err(), + "Should reject non-admin-scoped IPv6 groups from internal API" + ); + let non_admin_error_msg = format!("{:?}", non_admin_res.unwrap_err()); + assert!( + non_admin_error_msg.contains( + "Non-admin-scoped IPv6 multicast groups must use the external API" + ), + "Error message should direct to external API: {}", + non_admin_error_msg + ); + + // Test 3: Admin-scoped IPv6 groups work correctly (no VLAN IDs supported) + let internal_group = types::MulticastGroupCreateEntry { + group_ip: "ff04::2".parse().unwrap(), // Admin-scoped IPv6 + tag: Some("test_admin_scoped".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::Underlay, + }], + }; + + let created = switch + .client + .multicast_group_create(&internal_group) + .await + .expect("Should create internal IPv6 group") + .into_inner(); + + assert_eq!(created.ext_fwding.vlan_id, None); + assert!(created.underlay_group_id.is_some()); + + // Test update works correctly + let update_entry = types::MulticastGroupUpdateEntry { + tag: Some("updated_tag".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id, + link_id, + direction: types::Direction::Underlay, + }], + }; + + let updated = switch + .client + .multicast_group_update(&created.group_ip, &update_entry) + .await + .expect("Should update internal IPv6 group") + .into_inner(); + + assert_eq!(updated.tag, Some("updated_tag".to_string())); + assert_eq!(updated.ext_fwding.vlan_id, None); + + cleanup_test_group(switch, created.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_vlan_propagation_to_internal() { + let switch = &*get_switch().await; + + let (port_id, link_id) = switch.link_id(PhysPort(30)).unwrap(); + + // Step 1: Create internal IPv6 group first + let internal_group_entry = types::MulticastGroupCreateEntry { + group_ip: "ff04::200".parse().unwrap(), // Admin-scoped IPv6 + tag: Some("test_vlan_propagation".to_string()), + sources: None, + members: vec![ + types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::External, // External member for bifurcation + }, + types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::Underlay, // Underlay member for bifurcation + }, + ], + }; + + let created_admin = switch + .client + .multicast_group_create(&internal_group_entry) + .await + .expect("Should create admin-scoped group") + .into_inner(); + + assert!(created_admin.external_group_id.is_some()); + assert_eq!(created_admin.ext_fwding.vlan_id, None); // No VLAN initially + + // Step 2: Create external group that references the admin-scoped group + let nat_target = types::NatTarget { + internal_ip: "ff04::200".parse().unwrap(), // References admin-scoped group + inner_mac: MacAddr::new(0x03, 0x00, 0x00, 0x00, 0x00, 0x03).into(), + vni: 200.into(), + }; + + let 
external_group = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4("224.1.2.3".parse().unwrap()), + tag: Some("test_external_with_vlan".to_string()), + nat_target, + vlan_id: Some(42), // This VLAN should be used by admin-scoped group + sources: None, + }; + + let created_external = switch + .client + .multicast_group_create_external(&external_group) + .await + .expect("Should create external group with NAT target") + .into_inner(); + + assert_eq!(created_external.ext_fwding.vlan_id, Some(42)); + assert_eq!( + created_external.int_fwding.nat_target.unwrap().internal_ip, + "ff04::200".parse::().unwrap() + ); + + // Step 3: Verify the admin-scoped group now has access to the VLAN via NAT target reference + // Check the bitmap table to see if VLAN 42 is properly set (this is where VLAN matters for P4) + let bitmap_table = switch + .client + .table_dump("pipe.Egress.mcast_egress.tbl_decap_ports") + .await + .expect("Should clean up internal group") + .into_inner(); + + // Verify the admin-scoped group's bitmap entry has VLAN 42 from external group propagation + assert!( + bitmap_table + .entries + .iter() + .any(|entry| entry.action_args.values().any(|v| v.contains("42"))), + "Admin-scoped group bitmap should have VLAN 42 from external group" + ); + + cleanup_test_group(switch, created_admin.group_ip).await; + cleanup_test_group(switch, created_external.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_group_api_lifecycle() { + let switch = &*get_switch().await; + + let egress1 = PhysPort(28); + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let underlay_group = create_test_multicast_group( + switch, + internal_multicast_ip, + Some("valid_underlay_group"), + &[(egress1, types::Direction::Underlay)], + None, + false, + None, + ) + .await; + + assert!(underlay_group.underlay_group_id.is_some()); + + // Create IPv4 external group with NAT target referencing the underlay group + let group_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan_id = 10; + let nat_target = create_nat_target_ipv4(); + let external_create = types::MulticastGroupCreateExternalEntry { + group_ip, + tag: Some("test_lifecycle".to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(vlan_id), + sources: None, + }; + + let created = switch + .client + .multicast_group_create_external(&external_create) + .await + .expect("Should be able to create group") + .into_inner(); + + let external_group_id = created.external_group_id; + + assert_eq!(created.group_ip, MULTICAST_TEST_IPV4); + assert!(created.external_group_id.is_none()); + assert!(created.underlay_group_id.is_none()); + assert_eq!(created.tag, Some("test_lifecycle".to_string())); + assert_eq!(created.int_fwding.nat_target, Some(nat_target.clone())); + assert_eq!(created.ext_fwding.vlan_id, Some(vlan_id)); + assert_eq!(created.members.len(), 0); // External groups don't have members + + // 3. Get all groups and verify our group is included + let groups = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to list groups"); + + let found_in_list = groups + .iter() + .any(|g| g.external_group_id == external_group_id); + assert!(found_in_list, "Created group should be in the list"); + + // 4. 
Get groups by tag + let tagged_groups = switch + .client + .multicast_groups_list_by_tag_stream("test_lifecycle", None) + .try_collect::>() + .await + .expect("Should be able to get groups by tag"); + + assert!( + !tagged_groups.is_empty(), + "Tagged group list should not be empty" + ); + let found_by_tag = tagged_groups + .iter() + .any(|g| g.external_group_id == external_group_id); + assert!(found_by_tag, "Created group should be found by tag"); + + // 5. Get the specific group + let group = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to get group by ID"); + + assert_eq!(group[0].external_group_id, external_group_id); + assert_eq!(group[0].tag, Some("test_lifecycle".to_string())); + + // Also test getting by IP address + let group_by_ip = switch + .client + .multicast_group_get(&group_ip) + .await + .expect("Should be able to get group by IP"); + + assert_eq!(group_by_ip.external_group_id, external_group_id); + + // 6. Update the group + let updated_nat_target = types::NatTarget { + internal_ip: MULTICAST_NAT_IP.into(), + inner_mac: MacAddr::new(0xe0, 0xd5, 0x5e, 0x00, 0x11, 0x22).into(), + vni: 200.into(), + }; + + let external_update = types::MulticastGroupUpdateExternalEntry { + tag: Some("updated_lifecycle".to_string()), + nat_target: updated_nat_target.clone(), + vlan_id: Some(20), + sources: Some(vec![types::IpSrc::Exact( + "192.168.1.5".parse::().unwrap(), + )]), + }; + + let updated = switch + .client + .multicast_group_update_external(&group_ip, &external_update) + .await + .expect("Should be able to update group") + .into_inner(); + + assert_eq!(updated.external_group_id, external_group_id); + assert!(updated.underlay_group_id.is_none()); + assert_eq!(updated.tag, Some("updated_lifecycle".to_string())); + assert_eq!(updated.int_fwding.nat_target, Some(updated_nat_target)); + assert_eq!(updated.ext_fwding.vlan_id, Some(20)); + assert_eq!( + updated.sources, + Some(vec![types::IpSrc::Exact( + "192.168.1.5".parse::().unwrap(), + )]) + ); + assert_eq!(updated.members.len(), 0); // External groups don't have members + + // 7. Delete the group + switch + .client + .multicast_group_delete(&group_ip) + .await + .expect("Should be able to delete group"); + + // 8. Verify group was deleted + let result = switch + .client + .multicast_group_get(&group_ip) + .await + .expect_err("Should not be able to get deleted group"); + + match result { + Error::ErrorResponse(inner) => { + assert_eq!( + inner.status(), + 404, + "Expected 404 Not Found status code" + ); + } + _ => panic!("Expected ErrorResponse when getting a deleted group"), + } + + // 9. 
Verify group no longer appears in the list + let groups_after_delete = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to list groups"); + + // Check if the specific deleted group is still in the list + let deleted_group_still_in_list = + groups_after_delete.iter().any(|g| g.group_ip == group_ip); + assert!( + !deleted_group_still_in_list, + "Deleted group should not be in the list" + ); +} + +#[tokio::test] +#[ignore] +async fn test_multicast_tagged_groups_management() { + let switch = &*get_switch().await; + + // Create multiple groups with the same tag + let tag = "test_tag_management"; + + // Step 1: Create admin-scoped IPv6 internal group for actual replication + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some(&format!("{}_internal", tag)), + &[(PhysPort(11), types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + let nat_target = create_nat_target_ipv4(); + let group_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + // Step 2: Create first IPv4 external group (entry point only, no members) + let external_group1 = types::MulticastGroupCreateExternalEntry { + group_ip, + tag: Some(tag.to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(10), + sources: None, + }; + + let created1 = switch + .client + .multicast_group_create_external(&external_group1) + .await + .expect("Should create first group") + .into_inner(); + + // Step 3: Create second IPv4 external group (same tag, different IP) + let external_group2 = types::MulticastGroupCreateExternalEntry { + group_ip: "224.0.1.2".parse().unwrap(), // Different IP + tag: Some(tag.to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(10), + sources: None, + }; + + let created2 = switch + .client + .multicast_group_create_external(&external_group2) + .await + .expect("Should create second group") + .into_inner(); + + // Step 4: Create third IPv4 external group (different tag) + let external_group3 = types::MulticastGroupCreateExternalEntry { + group_ip: "224.0.1.3".parse().unwrap(), // Different IP + tag: Some("different_tag".to_string()), + nat_target: nat_target.clone(), + vlan_id: Some(10), + sources: None, + }; + + let created3 = switch + .client + .multicast_group_create_external(&external_group3) + .await + .expect("Should create third group") + .into_inner(); + + // List groups by tag + let tagged_groups = switch + .client + .multicast_groups_list_by_tag_stream(tag, None) + .try_collect::>() + .await + .expect("Should list groups by tag"); + + assert_eq!(tagged_groups.len(), 2, "Should find 2 groups with the tag"); + + let group_ips: HashSet<_> = + tagged_groups.iter().map(|g| g.group_ip).collect(); + assert!(group_ips.contains(&created1.group_ip)); + assert!(group_ips.contains(&created2.group_ip)); + assert!(!group_ips.contains(&created3.group_ip)); + + // Delete all groups with the tag + switch + .client + .multicast_reset_by_tag(tag) + .await + .expect("Should delete all groups with tag"); + + // Verify the groups with the tag are gone + let remaining_groups = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should list remaining groups"); + + let remaining_ips: HashSet<_> = + remaining_groups.iter().map(|g| g.group_ip).collect(); + assert!(!remaining_ips.contains(&created1.group_ip)); + assert!(!remaining_ips.contains(&created2.group_ip)); + 
assert!(remaining_ips.contains(&created3.group_ip)); + + // Clean up the remaining group and underlay group + switch + .client + .multicast_group_delete(&created3.group_ip) + .await + .expect("Should delete the remaining group"); + + switch + .client + .multicast_group_delete(&internal_multicast_ip) + .await + .expect("Should delete the remaining group"); +} + +#[tokio::test] +#[ignore] +async fn test_multicast_untagged_groups() { + let switch = &*get_switch().await; + + // First create the internal admin-scoped group that will be the NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + None, // No tag for NAT target + &[(PhysPort(26), types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Create a group without a tag + let group_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + // IPv4 groups are always external - create external entry directly + let external_untagged = types::MulticastGroupCreateExternalEntry { + group_ip, + tag: None, // No tag + nat_target: create_nat_target_ipv4(), + vlan_id: Some(10), + sources: None, + }; + + let created_untagged = switch + .client + .multicast_group_create_external(&external_untagged) + .await + .expect("Should create untagged group") + .into_inner(); + + // Create a group with a tag + // IPv4 groups are always external - create external entry directly + let tagged_group = types::MulticastGroupCreateExternalEntry { + group_ip: "224.0.2.2".parse().unwrap(), // Different IP + tag: Some("some_tag".to_string()), + nat_target: create_nat_target_ipv4(), + vlan_id: Some(10), + sources: None, + }; + + let created_tagged = switch + .client + .multicast_group_create_external(&tagged_group) + .await + .expect("Should create tagged group") + .into_inner(); + + // Delete all untagged groups + switch + .client + .multicast_reset_untagged() + .await + .expect("Should delete all untagged groups"); + + // Verify only the untagged group is gone + let remaining_groups = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should list remaining groups"); + + let remaining_ips: HashSet<_> = + remaining_groups.iter().map(|g| g.group_ip).collect(); + assert!(!remaining_ips.contains(&created_untagged.group_ip)); + assert!(remaining_ips.contains(&created_tagged.group_ip)); + + // Clean up the remaining tagged group + // (NAT target group was already deleted by multicast_reset_untagged since it had no tag) + switch + .client + .multicast_group_delete(&created_tagged.group_ip) + .await + .expect("Should delete remaining tagged group"); +} + +#[tokio::test] +#[ignore] +async fn test_api_internal_ipv6_bifurcated_replication() { + let switch = &*get_switch().await; + + let (port_id1, link_id1) = switch.link_id(PhysPort(11)).unwrap(); + let (port_id2, link_id2) = switch.link_id(PhysPort(12)).unwrap(); + + // Create admin-scoped IPv6 group with both external and underlay members + let admin_scoped_group = types::MulticastGroupCreateEntry { + group_ip: "ff04::100".parse().unwrap(), // Admin-scoped IPv6 + tag: Some("test_bifurcated".to_string()), + sources: None, + members: vec![ + types::MulticastGroupMember { + port_id: port_id1.clone(), + link_id: link_id1, + direction: types::Direction::External, + }, + types::MulticastGroupMember { + port_id: port_id2.clone(), + link_id: link_id2, + direction: types::Direction::Underlay, + }, + ], + }; + + let created = switch + .client + 
.multicast_group_create(&admin_scoped_group) + .await + .expect("Should create bifurcated admin-scoped group") + .into_inner(); + + // Verify both group IDs are populated + assert!( + created.external_group_id.is_some(), + "Should have external group ID" + ); + assert!( + created.underlay_group_id.is_some(), + "Should have underlay group ID" + ); + assert_ne!( + created.external_group_id, created.underlay_group_id, + "Group IDs should be different" + ); + + // Verify group has external_group_id (replication is handled internally) + assert!( + created.external_group_id.is_some(), + "Bifurcated group should have external_group_id" + ); + + // Verify members are preserved + assert_eq!(created.members.len(), 2); + let external_members: Vec<_> = created + .members + .iter() + .filter(|m| m.direction == types::Direction::External) + .collect(); + let underlay_members: Vec<_> = created + .members + .iter() + .filter(|m| m.direction == types::Direction::Underlay) + .collect(); + + assert_eq!(external_members.len(), 1); + assert_eq!(underlay_members.len(), 1); + + cleanup_test_group(switch, created.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_api_internal_ipv6_underlay_only() { + let switch = &*get_switch().await; + + let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); + + // Create admin-scoped IPv6 group with only underlay members + let underlay_only_group = types::MulticastGroupCreateEntry { + group_ip: "ff05::200".parse().unwrap(), // Site-local admin-scoped + tag: Some("test_underlay_only".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::Underlay, + }], + }; + + let created = switch + .client + .multicast_group_create(&underlay_only_group) + .await + .expect("Should create underlay-only admin-scoped group") + .into_inner(); + + // Should have underlay group ID but no external group ID + assert!( + created.underlay_group_id.is_some(), + "Should have underlay group ID" + ); + assert!( + created.external_group_id.is_none(), + "Should NOT have external group ID" + ); + + // Verify group has underlay_group_id (replication is handled internally) + assert!( + created.underlay_group_id.is_some(), + "Underlay-only group should have underlay_group_id" + ); + + // Verify only underlay members + assert_eq!(created.members.len(), 1); + assert_eq!(created.members[0].direction, types::Direction::Underlay); + + cleanup_test_group(switch, created.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_api_internal_ipv6_external_only() { + let switch = &*get_switch().await; + + let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); + + // Create admin-scoped IPv6 group with only external members + let external_only_group = types::MulticastGroupCreateEntry { + group_ip: "ff08::300".parse().unwrap(), // Org-local admin-scoped + tag: Some("test_external_only".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::External, + }], + }; + + let created = switch + .client + .multicast_group_create(&external_only_group) + .await + .expect("Should create external-only admin-scoped group") + .into_inner(); + + // Should have external group ID but no underlay group ID + assert!( + created.external_group_id.is_some(), + "Should have external group ID" + ); + assert!( + created.underlay_group_id.is_none(), + "Should NOT have underlay group ID" + ); + + // Verify group has external_group_id 
(replication is handled internally) + assert!( + created.external_group_id.is_some(), + "External-only group should have external_group_id" + ); + + // Verify only external members + assert_eq!(created.members.len(), 1); + assert_eq!(created.members[0].direction, types::Direction::External); + + cleanup_test_group(switch, created.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_api_invalid_combinations() { + let switch = &*get_switch().await; + + // First create the internal admin-scoped group that will be the NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("nat_target_for_invalid_combos"), + &[(PhysPort(26), types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Test 1: IPv4 with underlay members should fail + let ipv4_with_underlay = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4("224.1.0.200".parse().unwrap()), // Avoid 224.0.0.0/24 reserved range + tag: Some("test_invalid_ipv4".to_string()), + nat_target: create_nat_target_ipv4(), + vlan_id: Some(10), + sources: None, + }; + + // This should succeed via external API (IPv4 groups are external-only) + let created_ipv4 = switch + .client + .multicast_group_create_external(&ipv4_with_underlay) + .await + .expect("IPv4 external group should be created") + .into_inner(); + + // But it should not have underlay group ID or replication info + assert!(created_ipv4.underlay_group_id.is_none()); + + // Test 2: Non-admin-scoped IPv6 should use external API + let non_admin_ipv6 = types::MulticastGroupCreateExternalEntry { + group_ip: "ff0e::400".parse().unwrap(), // Global scope, not admin-scoped + tag: Some("test_non_admin_ipv6".to_string()), + nat_target: create_nat_target_ipv6(), + vlan_id: Some(20), + sources: None, + }; + + let created_non_admin = switch + .client + .multicast_group_create_external(&non_admin_ipv6) + .await + .expect("Non-admin-scoped IPv6 should use external API") + .into_inner(); + + // Should not have underlay group ID or replication info + assert!(created_non_admin.underlay_group_id.is_none()); + + // Test 3: Admin-scoped IPv6 with underlay members should fail via external API + let admin_scoped_external_entry = + types::MulticastGroupCreateExternalEntry { + group_ip: "ff04::500".parse().unwrap(), // Admin-scoped + tag: Some("test_admin_external".to_string()), + nat_target: create_nat_target_ipv6(), + vlan_id: Some(30), + sources: None, + }; + + // This should fail because admin-scoped groups must use internal API + let result = switch + .client + .multicast_group_create_external(&admin_scoped_external_entry) + .await + .expect_err("Admin-scoped IPv6 should fail via external API"); + + // Verify it's the expected validation error + match result { + Error::ErrorResponse(inner) => { + assert_eq!(inner.status(), 400); + assert!(inner.message.contains("admin-scoped multicast address")); + } + _ => panic!( + "Expected ErrorResponse for admin-scoped external group creation" + ), + } + + cleanup_test_group(switch, created_ipv4.group_ip).await; + cleanup_test_group(switch, created_non_admin.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_ipv4_multicast_invalid_destination_mac() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + + // Create admin-scoped IPv6 multicast group for 
underlay replication + // This group handles replication within the rack infrastructure + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let vlan = Some(10); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_invalid_mac_underlay"), + &[(egress1, types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Create IPv4 external multicast group with NAT target + // This group handles external traffic and references the underlay group via NAT target + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_invalid_mac"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped underlay group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let ipv4_addr = match multicast_ip { + IpAddr::V4(addr) => addr, + _ => panic!("Expected IPv4 address"), + }; + + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + // Create an INVALID multicast MAC address - doesn't follow RFC 1112 + // Using a unicast MAC instead of the proper multicast MAC + let invalid_mac = MacAddr::new(0x00, 0x11, 0x22, 0x33, 0x44, 0x55); + + let src_endpoint = + Endpoint::parse(&src_mac.to_string(), src_ip, src_port).unwrap(); + + let dst_endpoint = Endpoint::parse( + &invalid_mac.to_string(), + &ipv4_addr.to_string(), + dst_port, + ) + .unwrap(); + + // Generate packet with invalid MAC + let to_send = common::gen_udp_packet(src_endpoint, dst_endpoint); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (invalid MAC should be dropped) + let expected_pkts = vec![]; + + let ctr_baseline = switch + .get_counter("multicast_invalid_mac", None) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + "multicast_invalid_mac", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + // Cleanup: Remove both external IPv4 group and underlay IPv6 group + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_invalid_destination_mac() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + + // Create admin-scoped IPv6 multicast group + let multicast_ip = IpAddr::V6("ff04::300".parse().unwrap()); // Admin-scoped + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv6_invalid_mac"), + &[(egress1, types::Direction::External)], + vlan, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let ipv6_addr = match multicast_ip { + IpAddr::V6(addr) => addr, + _ => panic!("Expected IPv6 address"), + }; + + // Create an INVALID multicast MAC address - doesn't follow RFC 2464 + // Using a unicast MAC instead of the proper 33:33:xx:xx:xx:xx format + let invalid_mac = MacAddr::new(0x00, 0x11, 0x22, 0x33, 0x44, 0x55); + + let src_endpoint = + Endpoint::parse(&src_mac.to_string(), "2001:db8::1", 3333).unwrap(); + + let dst_endpoint = + Endpoint::parse(&invalid_mac.to_string(), &ipv6_addr.to_string(), 4444) + .unwrap(); + + // Generate packet with invalid MAC + 
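// [Editorial aside] For contrast with the deliberately invalid unicast MAC used
// above, a sketch of the conformant address mappings (assumes only the
// MacAddr::new(u8, ...) constructor already used in this file; the crate's own
// derive_ipv6_mcast_mac helper serves the same purpose for IPv6):
//
//     // RFC 1112: 01:00:5e prefix + low 23 bits of the IPv4 group address
//     fn rfc1112_mcast_mac(ip: std::net::Ipv4Addr) -> MacAddr {
//         let o = ip.octets();
//         MacAddr::new(0x01, 0x00, 0x5e, o[1] & 0x7f, o[2], o[3])
//     }
//
//     // RFC 2464: 33:33 prefix + low 32 bits of the IPv6 group address
//     fn rfc2464_mcast_mac(ip: std::net::Ipv6Addr) -> MacAddr {
//         let o = ip.octets();
//         MacAddr::new(0x33, 0x33, o[12], o[13], o[14], o[15])
//     }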
let to_send = common::gen_udp_packet(src_endpoint, dst_endpoint); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (invalid MAC should be dropped) + let expected_pkts = vec![]; + + let ctr_baseline = switch + .get_counter("multicast_invalid_mac", None) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + "multicast_invalid_mac", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_ttl_zero() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + + // First create the internal admin-scoped group that will be the NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("nat_target_for_ttl"), + &[(egress1, types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Create IPv4 multicast group with two egress ports + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ttl_drop"), + &[], // External groups have no members + vlan, + true, // IPv4 groups need NAT targets + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let src_ip = "192.168.1.20"; + let src_port = 4444; + let dst_port = 5555; + + let mut to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Set TTL to 0 (should be dropped) + ipv4::Ipv4Hdr::adjust_ttl(&mut to_send, -255); // Set to 0 + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (should be dropped due to TTL=0) + let expected_pkts = vec![]; + + let ctr_baseline = + switch.get_counter("ipv4_ttl_invalid", None).await.unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + "ipv4_ttl_invalid", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_ttl_one() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + + // First create the internal admin-scoped group that will be the NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("nat_target_for_ttl_one"), + &[(egress1, types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Create IPv4 multicast group with two egress ports + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ttl_one_drop"), + &[], // External groups have no members + vlan, + true, // IPv4 groups need NAT targets + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.20"; + let src_port = 4444; + let dst_port = 5555; + + let mut to_send = 
create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Set TTL to 1 - this should be dropped for multicast + // because the switch decrements it to 0 during processing + ipv4::Ipv4Hdr::adjust_ttl(&mut to_send, -254); // Set to 1 + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (should be dropped due to TTL=1) + let expected_pkts = vec![]; + + let ctr_baseline = + switch.get_counter("ipv4_ttl_invalid", None).await.unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + "ipv4_ttl_invalid", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv4_multicast_basic_replication_nat_no_admin_ula() -> TestResult +{ + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + + // Step 1: Create admin-scoped IPv6 multicast group for underlay replication + // This group handles replication within the rack infrastructure + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let vlan = Some(10); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_replication_underlay"), + &[(egress1, types::Direction::Underlay)], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 external multicast group with NAT target + // This group handles external traffic and references the underlay group via NAT target + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_replication"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + (egress3, types::Direction::External), + ], + vlan, + true, // Create NAT target that points to the admin-scoped underlay group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts = vec![]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv4_multicast_basic_replication_nat_ingress() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + + // Step 1: Create admin-scoped IPv6 multicast group for underlay replication + // This handles the actual packet replication within the rack infrastructure + // after NAT ingress processing + let 
internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let underlay_members = [ + (egress1, types::Direction::Underlay), + (egress3, types::Direction::Underlay), + ]; + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_replication_internal"), + &underlay_members, + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 external multicast group with NAT target + // This group handles external traffic and references the underlay group via NAT target + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let external_members = [ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ]; + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv4_replication"), + &external_members, + vlan, + true, // Create NAT target that points to the admin-scoped underlay group + None, + ) + .await; + + let (port_id1, link_id1) = switch.link_id(egress1).unwrap(); + let (port_id3, link_id3) = switch.link_id(egress3).unwrap(); + let port_mac1 = switch.get_port_mac(egress1).unwrap(); + let port_mac3 = switch.get_port_mac(egress3).unwrap(); + + // Set MAC addresses for rewriting + switch + .client + .link_mac_set(&port_id1, &link_id1, &port_mac1.into()) + .await + .expect("Should set link MAC"); + switch + .client + .link_mac_set(&port_id3, &link_id3, &port_mac3.into()) + .await + .expect("Should set link MAC"); + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + let to_recv1 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + let to_recv2 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress3), + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(to_recv1), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv2), + port: egress3, + }, + ]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_encapped_multicast_geneve_mcast_tag_to_external_members( +) -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // Step 1: Create admin-scoped IPv6 group for actual replication first + // This group uses the MULTICAST_NAT_IP address that the external group will reference + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let replication_members = [ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ]; + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_geneve_mcast_tag_underlay"), + &replication_members, + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; 
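// [Editorial summary] This test and the next two exercise the three Geneve
// mcast_tag cases described in the comments below. A reading aid only; the
// authoritative behavior is the mcast_tag_check table in the P4 program, and
// the enum/function here are hypothetical:
//
//     enum TagAction {
//         DecapToExternal,    // tag 0: drop the underlay copy, decap for external members
//         EncappedToUnderlay, // tag 1: drop the external copy, forward encapped to underlay
//         Bifurcated,         // tag 2: replicate to both; only external copies are decapped
//     }
//     fn tag_action(mcast_tag: u8) -> TagAction {
//         match mcast_tag {
//             0 => TagAction::DecapToExternal,
//             1 => TagAction::EncappedToUnderlay,
//             2 => TagAction::Bifurcated,
//             _ => unreachable!("tags other than 0..=2 are not exercised here"),
//         }
//     }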
+ + // Step 2: Create IPv4 external multicast group with NAT target (no members) + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_geneve_mcast_tag_0"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + // Create the original packet + let og_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Modify the original packet to set the multicast tag, VLAN ID, + // decrement the ttl/hlim, and update the destination MAC to the + // egress port MAC + let expected_pkt1 = + prepare_expected_pkt(switch, &og_pkt, vlan, None, Some(egress1)); + + let expected_pkt2 = + prepare_expected_pkt(switch, &og_pkt, vlan, None, Some(egress2)); + + // Use same NAT target as the one used in the original packet + let nat_target = create_nat_target_ipv4(); + + // Skip Ethernet header as it will be added by gen_geneve_packet + let eth_hdr_len = 14; // Standard Ethernet header length + let payload = og_pkt.deparse().unwrap()[eth_hdr_len..].to_vec(); + + // Create the Geneve packet with mcast_tag = 0 + // According to the mcast_tag_check table, when geneve.isValid() is true and + // mcast_tag is 0, it should invalidate the underlay group and set decap + let geneve_pkt = gen_geneve_packet_with_mcast_tag( + Endpoint::parse( + GIMLET_MAC, + &GIMLET_IP.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + Endpoint::parse( + &derive_ipv6_mcast_mac(&nat_target.internal_ip).to_string(), + &nat_target.internal_ip.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + eth::ETHER_IPV4, + nat_target.vni.clone().into(), + true, // tag_ingress = true to enable option setting + Some(0), // mcast_tag = 0 + &payload, + ); + + let test_pkt = TestPacket { + packet: Arc::new(geneve_pkt), + port: ingress, + }; + + // We expect the packet to be decapsulated and forwarded to both egress + // ports + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(expected_pkt1), + port: egress1, + }, + TestPacket { + packet: Arc::new(expected_pkt2), + port: egress2, + }, + ]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, MULTICAST_NAT_IP.into()).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_encapped_multicast_geneve_mcast_tag_to_underlay_members( +) -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress3 = PhysPort(19); + let egress4 = PhysPort(20); + + // Step 1: Create admin-scoped IPv6 group for underlay replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_geneve_mcast_tag_underlay"), + &[ + (egress3, types::Direction::Underlay), + (egress4, types::Direction::Underlay), + ], + None, + false, // Admin-scoped groups don't need NAT targets +
None, + ) + .await; + + // Step 2: Create IPv4 external multicast group with NAT target (no members) + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_geneve_mcast_tag_1"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + // Create the original packet + let og_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Emulate Nat Target from previous packet in the chain. + let nat_target = create_nat_target_ipv6(); + + // Skip Ethernet header as it will be added by gen_geneve_packet + let eth_hdr_len = 14; // Standard Ethernet header length + let payload = og_pkt.deparse().unwrap()[eth_hdr_len..].to_vec(); + + let geneve_src = Endpoint::parse( + GIMLET_MAC, + &GIMLET_IP.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(); + let geneve_dst = Endpoint::parse( + &derive_ipv6_mcast_mac(&nat_target.internal_ip).to_string(), + &nat_target.internal_ip.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(); + + // Create the Geneve packet with mcast_tag = 1 + // According to mcast_tag_check table, when geneve.isValid() is true and + // mcast_tag is 1, it should invalidate the external group and not decap + let geneve_pkt = gen_geneve_packet_with_mcast_tag( + geneve_src, + geneve_dst, + eth::ETHER_IPV4, + nat_target.vni.clone().into(), + true, // tag_ingress = true to enable option setting + Some(1), // mcast_tag = 1 + &payload, + ); + + let test_pkt = TestPacket { + packet: Arc::new(geneve_pkt.clone()), + port: ingress, + }; + + // Vlan should be stripped and we only replicate to underlay ports + let recv_pkt1 = + prepare_expected_pkt(switch, &geneve_pkt, None, None, Some(egress3)); + let recv_pkt2 = + prepare_expected_pkt(switch, &geneve_pkt, None, None, Some(egress4)); + + // We expect the packet not be decapped and forwarded to both egress + // ports + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(recv_pkt1), + port: egress3, + }, + TestPacket { + packet: Arc::new(recv_pkt2), + port: egress4, + }, + ]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, MULTICAST_NAT_IP.into()).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_encapped_multicast_geneve_mcast_tag_to_underlay_and_external_members( +) -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + let egress4 = PhysPort(20); + + // Step 1: Create admin-scoped IPv6 group for bifurcated replication first + // This group has both External and Underlay direction members + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_geneve_mcast_tag_bifurcated"), + &[ + (egress1, 
types::Direction::External), + (egress2, types::Direction::External), + (egress3, types::Direction::Underlay), + (egress4, types::Direction::Underlay), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 external multicast group with NAT target (no members) + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_geneve_mcast_tag_1"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + // Create the original packet + let og_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Emulate Nat Target from previous packet in the chain. + let nat_target = create_nat_target_ipv6(); + + // Skip Ethernet header as it will be added by gen_geneve_packet + let eth_hdr_len = 14; // Standard Ethernet header length + let payload = og_pkt.deparse().unwrap()[eth_hdr_len..].to_vec(); + + // Create the Geneve packet with mcast_tag = 2 + // According to mcast_tag_check table, when geneve.isValid() is true and + // mcast_tag is 2, it should not invalidate any group, decapping only the + // external group(s) + let geneve_pkt = gen_geneve_packet_with_mcast_tag( + Endpoint::parse( + GIMLET_MAC, + &GIMLET_IP.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + Endpoint::parse( + &derive_ipv6_mcast_mac(&nat_target.internal_ip).to_string(), + &nat_target.internal_ip.to_string(), + geneve::GENEVE_UDP_PORT, + ) + .unwrap(), + eth::ETHER_IPV4, + nat_target.vni.clone().into(), + true, // tag_ingress = true to enable option setting + Some(2), // mcast_tag = 2 + &payload, + ); + + let test_pkt = TestPacket { + packet: Arc::new(geneve_pkt.clone()), + port: ingress, + }; + + // External ports should be replicated with Vlan information + let recv_pkt1 = + prepare_expected_pkt(switch, &og_pkt, vlan, None, Some(egress1)); + + let recv_pkt2 = + prepare_expected_pkt(switch, &og_pkt, vlan, None, Some(egress2)); + + // Vlan should be stripped when we replicate to underlay ports + let recv_pkt3 = + prepare_expected_pkt(switch, &geneve_pkt, None, None, Some(egress3)); + let recv_pkt4 = + prepare_expected_pkt(switch, &geneve_pkt, None, None, Some(egress4)); + + // We expect 2 packets to be decapped and forwarded to external ports + // and 2 packets to be forwarded to underlay ports (still encapped) + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(recv_pkt1), + port: egress1, + }, + TestPacket { + packet: Arc::new(recv_pkt2), + port: egress2, + }, + TestPacket { + packet: Arc::new(recv_pkt3), + port: egress3, + }, + TestPacket { + packet: Arc::new(recv_pkt4), + port: egress4, + }, + ]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, MULTICAST_NAT_IP.into()).await; + + result +} + +#[tokio::test] +#[ignore] +async fn 
test_ipv4_multicast_drops_ingress_is_egress_port() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + + // First create the underlay admin-scoped IPv6 group for NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_drops_underlay"), + &[(ingress, types::Direction::Underlay)], + None, + false, // No NAT target for admin-scoped group + None, + ) + .await; + + // Create IPv4 external multicast group with NAT target (no members) + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_replication"), + &[], // External groups have no members + None, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "192.168.1.10"; + let src_port = 3333; + let dst_port = 4444; + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts = vec![]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_hop_limit_zero() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // Step 1: Create admin-scoped IPv6 group for actual replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_ipv6_hop_limit_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create external IPv6 group with NAT target (no members) + let multicast_ip = IpAddr::V6(MULTICAST_TEST_IPV6); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv6_hop_limit_zero"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let mut to_send = create_ipv6_multicast_packet( + multicast_ip, + src_mac, + "2001:db8::1", + 3333, + 4444, + ); + + // Set Hop Limit to 0 (should be dropped) + ipv6::Ipv6Hdr::adjust_hlim(&mut to_send, -255); // Set to 0 + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (should be dropped due to Hop Limit=0) + let expected_pkts = vec![]; + + let ctr_baseline = + switch.get_counter("ipv6_ttl_invalid", None).await.unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + cleanup_test_group(switch, created_group.group_ip).await; + + check_counter_incremented( + switch, 
+ "ipv6_ttl_invalid", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_hop_limit_one() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // Step 1: Create admin-scoped IPv6 group for actual replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_ipv6_hop_limit_one_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create external IPv6 group with NAT target (no members) + let multicast_ip = IpAddr::V6(MULTICAST_TEST_IPV6); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv6_hop_limit_one"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + let src_ip = "2001:db8::1"; + let src_port = 4444; + let dst_port = 5555; + + let mut to_send = create_ipv6_multicast_packet( + multicast_ip, + src_mac, + src_ip, + src_port, + dst_port, + ); + + // Set Hop Limit to 1 - this should be dropped for multicast + // because the switch decrements it to 0 during processing + ipv6::Ipv6Hdr::adjust_hlim(&mut to_send, -254); // Set to 1 (255 - 254 = 1) + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets (should be dropped due to Hop Limit=1) + let expected_pkts = vec![]; + + let ctr_baseline = + switch.get_counter("ipv6_ttl_invalid", None).await.unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + "ipv6_ttl_invalid", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_basic_replication_nat_ingress() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + + // Step 1: Create admin-scoped IPv6 group for underlay replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let underlay_members = [(egress1, types::Direction::Underlay)]; + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_replication_internal"), + &underlay_members, + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create external IPv6 group with NAT target (no members) + let multicast_ip = IpAddr::V6(MULTICAST_TEST_IPV6); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv6_replication"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let (port_id1, link_id1) = switch.link_id(egress1).unwrap(); + let port_mac1 = switch.get_port_mac(egress1).unwrap(); + + // Set MAC addresses for rewriting + switch + .client + .link_mac_set(&port_id1, &link_id1, &port_mac1.into()) + .await + .expect("Should set link MAC"); + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let to_send = 
create_ipv6_multicast_packet( + multicast_ip, + src_mac, + "2001:db8::1", + 3333, + 4444, + ); + + let to_recv1 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts = vec![TestPacket { + packet: Arc::new(to_recv1), + port: egress1, + }]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv4_multicast_source_filtering_exact_match() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress1 = PhysPort(10); + let ingress2 = PhysPort(11); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // First create the underlay admin-scoped IPv6 group for NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let vlan = Some(10); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_source_filtering_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 SSM external group with source filtering and NAT target (no members) + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4_SSM); + let allowed_src_ip = "192.168.1.5".parse().unwrap(); + let filtered_src_ip: IpAddr = "192.168.1.6".parse().unwrap(); + let allowed_src = types::IpSrc::Exact(allowed_src_ip); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_source_filtering"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + Some(vec![allowed_src]), + ) + .await; + + let src_mac1 = switch.get_port_mac(ingress1).unwrap(); + let src_mac2 = switch.get_port_mac(ingress2).unwrap(); + + // Create test packets - one from allowed source, one from filtered source + let allowed_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac1, + &allowed_src_ip.to_string(), + 3333, + 4444, + ); + + let filtered_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac2, + &filtered_src_ip.to_string(), + 3333, + 4444, + ); + + let to_recv11 = prepare_expected_pkt( + switch, + &allowed_pkt, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv12 = prepare_expected_pkt( + switch, + &allowed_pkt, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let test_pkts = vec![ + TestPacket { + packet: Arc::new(allowed_pkt), + port: ingress1, + }, + TestPacket { + packet: Arc::new(filtered_pkt), + port: ingress2, + }, + ]; + + // Only expect packets from the allowed source + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(to_recv11), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv12), + port: egress2, + }, + ]; + + let ctr_baseline = switch + .get_counter("multicast_src_filtered", None) + .await + .unwrap(); + + let result = switch.packet_test(test_pkts, expected_pkts); + + check_counter_incremented( + switch, + 
"multicast_src_filtered", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv4_multicast_source_filtering_prefix_match() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress1 = PhysPort(10); + let ingress2 = PhysPort(11); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // First create the underlay admin-scoped IPv6 group for NAT target + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let vlan = Some(10); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_source_filtering_prefix_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Create multicast group with two egress ports and source filtering + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4_SSM); + + let allowed_src_ip1 = "192.168.1.5".parse().unwrap(); + let allowed_src_ip2: IpAddr = "192.168.1.10".parse().unwrap(); + let filtered_src_ip: IpAddr = "10.0.0.5".parse().unwrap(); + + let allowed_src = + types::IpSrc::Subnet(Ipv4Net::new(allowed_src_ip1, 24).unwrap()); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_source_filtering"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + Some(vec![allowed_src]), + ) + .await; + + let src_mac1 = switch.get_port_mac(ingress1).unwrap(); + let src_mac2 = switch.get_port_mac(ingress2).unwrap(); + + // Create test packets - two from allowed source, one from filtered source + let allowed_pkt1 = create_ipv4_multicast_packet( + multicast_ip, + src_mac1, + &allowed_src_ip1.to_string(), + 3333, + 4444, + ); + + let allowed_pkt2 = create_ipv4_multicast_packet( + multicast_ip, + src_mac1, + &allowed_src_ip2.to_string(), + 3333, + 4444, + ); + + let filtered_pkt = create_ipv4_multicast_packet( + multicast_ip, + src_mac2, + &filtered_src_ip.to_string(), + 3333, + 4444, + ); + + let to_recv11 = prepare_expected_pkt( + switch, + &allowed_pkt1, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv22 = prepare_expected_pkt( + switch, + &allowed_pkt2, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv12 = prepare_expected_pkt( + switch, + &allowed_pkt1, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv21 = prepare_expected_pkt( + switch, + &allowed_pkt2, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let test_pkts = vec![ + TestPacket { + packet: Arc::new(allowed_pkt1), + port: ingress1, + }, + TestPacket { + packet: Arc::new(allowed_pkt2), + port: ingress2, + }, + TestPacket { + packet: Arc::new(filtered_pkt), + port: ingress2, + }, + ]; + + // Only expect packets from the allowed sources + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(to_recv11), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv22), + port: egress2, + }, + TestPacket { + packet: Arc::new(to_recv12), + port: egress2, + }, + TestPacket { + packet: Arc::new(to_recv21), + port: egress1, + }, + ]; + + let ctr_baseline = switch + .get_counter("multicast_src_filtered", None) + .await + .unwrap(); + + let result = 
switch.packet_test(test_pkts, expected_pkts); + + check_counter_incremented( + switch, + "multicast_src_filtered", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_multiple_source_filtering() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress1 = PhysPort(10); + let ingress2 = PhysPort(11); + let ingress3 = PhysPort(12); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // Step 1: Create admin-scoped IPv6 group for actual replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_ipv6_source_filtering_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create external IPv6 SSM group with source filtering and NAT target (no members) + let multicast_ip = IpAddr::V6(MULTICAST_TEST_IPV6_SSM); + let vlan = Some(10); + + let allowed_src_ip1 = "2001:db8::1".parse().unwrap(); + let allowed_src_ip2 = "2001:db8::2".parse().unwrap(); + + let sources = vec![ + types::IpSrc::Exact(allowed_src_ip1), + types::IpSrc::Exact(allowed_src_ip2), + ]; + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_ipv6_source_filtering"), + &[], // External groups have no members + vlan, + true, // Create NAT target that points to the admin-scoped group + Some(sources), + ) + .await; + + let src_mac1 = switch.get_port_mac(ingress1).unwrap(); + let src_mac2 = switch.get_port_mac(ingress2).unwrap(); + let src_mac3 = switch.get_port_mac(ingress3).unwrap(); + + // Create test packets from different sources and a filtered source + let allowed_pkt1 = create_ipv6_multicast_packet( + multicast_ip, + src_mac1, + &allowed_src_ip1.to_string(), + 3333, + 4444, + ); + + let allowed_pkt2 = create_ipv6_multicast_packet( + multicast_ip, + src_mac2, + &allowed_src_ip2.to_string(), + 3333, + 4444, + ); + + let filtered_pkt = create_ipv6_multicast_packet( + multicast_ip, + src_mac3, + "2001:db8::3", // Not in the allowed sources list + 3333, + 4444, + ); + + let to_recv11 = prepare_expected_pkt( + switch, + &allowed_pkt1, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv22 = prepare_expected_pkt( + switch, + &allowed_pkt2, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv12 = prepare_expected_pkt( + switch, + &allowed_pkt1, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv21 = prepare_expected_pkt( + switch, + &allowed_pkt2, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let test_pkts = vec![ + TestPacket { + packet: Arc::new(allowed_pkt1), + port: ingress1, + }, + TestPacket { + packet: Arc::new(allowed_pkt2), + port: ingress2, + }, + TestPacket { + packet: Arc::new(filtered_pkt), + port: ingress3, + }, + ]; + + // Only expect packets from the allowed sources + let expected_pkts = vec![ + // First allowed source + TestPacket { + packet: Arc::new(to_recv11), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv12), + port: egress2, + }, + // Second allowed source + TestPacket { + packet: Arc::new(to_recv21), + port: egress1, + }, + 
TestPacket { + packet: Arc::new(to_recv22), + port: egress2, + }, + ]; + + let ctr_baseline = switch + .get_counter("multicast_src_filtered", None) + .await + .unwrap(); + + let result = switch.packet_test(test_pkts, expected_pkts); + + check_counter_incremented( + switch, + "multicast_src_filtered", + ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_dynamic_membership() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + + // Step 1: Create admin-scoped IPv6 internal group with initial replication members + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_dynamic_membership_internal"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 external group as entry point with NAT target + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_dynamic_membership"), + &[], // External groups have no members + vlan, + true, // Create NAT target pointing to underlay group + None, + ) + .await; + + // Get port and link IDs (not used in this test since external groups don't have members) + + // First test with initial configuration + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + "192.168.1.10", + 3333, + 4444, + ); + let to_recv1 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv2 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send.clone()), + port: ingress, + }; + + let expected_pkts = vec![ + TestPacket { + packet: Arc::new(to_recv1), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv2), + port: egress2, + }, + ]; + + let result1 = switch.packet_test(vec![test_pkt], expected_pkts); + assert!(result1.is_ok(), "Initial test failed: {:?}", result1); + + // Now update the external group - external groups don't have members to update, + // but we can update their NAT target, tag, vlan, and sources + let external_update_entry = types::MulticastGroupUpdateExternalEntry { + tag: None, + nat_target: create_nat_target_ipv4(), // Keep the same NAT target + vlan_id: None, + sources: None, + }; + + let updated = switch + .client + .multicast_group_update_external( + &created_group.group_ip, + &external_update_entry, + ) + .await + .expect("Should be able to update group"); + + assert_eq!(updated.members.len(), 0); // External groups don't have members + + // Update the admin-scoped group membership to demonstrate dynamic membership + let (port_id2, link_id2) = switch.link_id(egress2).unwrap(); + let (port_id3, link_id3) = switch.link_id(egress3).unwrap(); + + let internal_update_entry = types::MulticastGroupUpdateEntry { + tag: None, + members: vec![ + types::MulticastGroupMember { + port_id: port_id2, + link_id: 
link_id2, + direction: types::Direction::External, + }, + types::MulticastGroupMember { + port_id: port_id3, + link_id: link_id3, + direction: types::Direction::External, + }, + ], + sources: None, + }; + + switch + .client + .multicast_group_update(&internal_multicast_ip, &internal_update_entry) + .await + .expect("Should be able to update admin-scoped group membership"); + + // Test with updated configuration + let to_recv1_new = prepare_expected_pkt( + switch, + &to_send, + None, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + let to_recv2_new = prepare_expected_pkt( + switch, + &to_send, + None, + created_group.int_fwding.nat_target.as_ref(), + Some(egress3), + ); + + let test_pkt_new = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts_new = vec![ + TestPacket { + packet: Arc::new(to_recv1_new), + port: egress2, + }, + TestPacket { + packet: Arc::new(to_recv2_new), + port: egress3, + }, + ]; + + let result2 = switch.packet_test(vec![test_pkt_new], expected_pkts_new); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result2 +} + +#[tokio::test] +#[ignore] +async fn test_multicast_multiple_groups() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + let egress4 = PhysPort(21); + + // Step 1: Create admin-scoped IPv6 group for actual replication first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_multi_group_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + (egress3, types::Direction::External), + (egress4, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create first IPv4 external group with NAT target (no members) + let multicast_ip1 = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan1 = Some(10); + + let created_group1 = create_test_multicast_group( + switch, + multicast_ip1, + Some("test_multi_group_1"), + &[], // External groups have no members + vlan1, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + // Step 3: Create second IPv4 external group with NAT target (no members) + let multicast_ip2 = IpAddr::V4(Ipv4Addr::new(224, 1, 2, 0)); // Changed to valid range + let vlan2 = Some(20); + + let created_group2 = create_test_multicast_group( + switch, + multicast_ip2, + Some("test_multi_group_2"), + &[], // External groups have no members + vlan2, + true, // Create NAT target that points to the admin-scoped group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let to_send1 = create_ipv4_multicast_packet( + multicast_ip1, + src_mac, + "192.168.1.10", + 3333, + 4444, + ); + + let to_send2 = create_ipv4_multicast_packet( + multicast_ip2, + src_mac, + "192.168.1.10", + 3333, + 4444, + ); + + let to_recv1_1 = prepare_expected_pkt( + switch, + &to_send1, + vlan1, + created_group1.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv1_2 = prepare_expected_pkt( + switch, + &to_send1, + vlan1, + created_group1.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv2_1 = prepare_expected_pkt( + switch, + &to_send2, + vlan2, + created_group2.int_fwding.nat_target.as_ref(), + Some(egress3), + ); 
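// [Editorial note] Both external groups point their NAT targets at the same
// admin-scoped group, so each ingress packet fans out to all four member ports
// created above. A quick sanity check on the expected packet count (assumes the
// shared four-member group; illustrative only):
//
//     let input_pkts = 2;   // one packet per external group address
//     let member_ports = 4; // egress1, egress2, egress3, egress4
//     assert_eq!(input_pkts * member_ports, 8); // matches expected_pkts below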
+ + let to_recv2_2 = prepare_expected_pkt( + switch, + &to_send2, + vlan2, + created_group2.int_fwding.nat_target.as_ref(), + Some(egress4), + ); + + // Since both groups NAT to the same admin-scoped group, they both replicate to all ports + let to_recv1_3 = prepare_expected_pkt( + switch, + &to_send1, + vlan1, + created_group1.int_fwding.nat_target.as_ref(), + Some(egress3), + ); + + let to_recv1_4 = prepare_expected_pkt( + switch, + &to_send1, + vlan1, + created_group1.int_fwding.nat_target.as_ref(), + Some(egress4), + ); + + let to_recv2_3 = prepare_expected_pkt( + switch, + &to_send2, + vlan2, + created_group2.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv2_4 = prepare_expected_pkt( + switch, + &to_send2, + vlan2, + created_group2.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let test_pkts = vec![ + TestPacket { + packet: Arc::new(to_send1), + port: ingress, + }, + TestPacket { + packet: Arc::new(to_send2), + port: ingress, + }, + ]; + + let expected_pkts = vec![ + // First multicast group - replicates to all ports since both groups share same NAT target + TestPacket { + packet: Arc::new(to_recv1_1), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv1_2), + port: egress2, + }, + TestPacket { + packet: Arc::new(to_recv1_3), + port: egress3, + }, + TestPacket { + packet: Arc::new(to_recv1_4), + port: egress4, + }, + // Second multicast group - also replicates to all ports + TestPacket { + packet: Arc::new(to_recv2_3), + port: egress1, + }, + TestPacket { + packet: Arc::new(to_recv2_4), + port: egress2, + }, + TestPacket { + packet: Arc::new(to_recv2_1), + port: egress3, + }, + TestPacket { + packet: Arc::new(to_recv2_2), + port: egress4, + }, + ]; + + let result = switch.packet_test(test_pkts, expected_pkts); + + cleanup_test_group(switch, created_group1.group_ip).await; + cleanup_test_group(switch, created_group2.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_reset_all_tables() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + + // Create multicast groups with different configurations to populate all tables + + // Step 1: Create admin-scoped IPv6 groups for NAT targets first + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_reset_all_underlay"), + &[ + (egress1, types::Direction::External), + (egress2, types::Direction::External), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: IPv4 external group with NAT and VLAN + let multicast_ip1 = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan1 = Some(10); + + let created_group1 = create_test_multicast_group( + switch, + multicast_ip1, + Some("test_reset_all_1"), + &[], // External groups have no members + vlan1, + true, // Create NAT target + None, + ) + .await; + + // 2. IPv6 external group (non-admin-scoped must use external API) + let multicast_ip2 = IpAddr::V6(MULTICAST_TEST_IPV6); + + let created_group2 = create_test_multicast_group( + switch, + multicast_ip2, + Some("test_reset_all_2"), + &[], // External groups have no members + Some(20), // Add VLAN for this external group + true, // Create NAT target + None, // No sources for this group + ) + .await; + + // 2b. 
Admin-scoped IPv6 group to test internal API with custom replication parameters + let group_entry2b = types::MulticastGroupCreateEntry { + group_ip: Ipv6Addr::new(0xff04, 0, 0, 0, 0, 0, 0, 2), + tag: Some("test_reset_all_2b".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: switch.link_id(egress1).unwrap().0, + link_id: switch.link_id(egress1).unwrap().1, + direction: types::Direction::Underlay, + }], + }; + + let created_group2b = switch + .client + .multicast_group_create(&group_entry2b) + .await + .expect("Failed to create admin-scoped IPv6 multicast group") + .into_inner(); + + // 3. IPv4 SSM group with source filters + let multicast_ip3 = IpAddr::V4(MULTICAST_TEST_IPV4_SSM); + let vlan3 = Some(30); + let sources = Some(vec![ + types::IpSrc::Exact("192.168.1.5".parse().unwrap()), + types::IpSrc::Subnet( + Ipv4Net::new("192.168.2.0".parse().unwrap(), 24).unwrap(), + ), + ]); + + let created_group3 = create_test_multicast_group( + switch, + multicast_ip3, + Some("test_reset_all_3"), + &[], // External groups have no members + vlan3, + true, // Create NAT target + sources.clone(), + ) + .await; + + // 4. IPv6 SSM external group with source filters + let multicast_ip4 = IpAddr::V6(MULTICAST_TEST_IPV6_SSM); + let vlan4 = Some(40); + let ipv6_sources = + Some(vec![types::IpSrc::Exact("2001:db8::1".parse().unwrap())]); + + let created_group4 = create_test_multicast_group( + switch, + multicast_ip4, + Some("test_reset_all_4"), + &[], // External groups have no members + vlan4, + true, // IPv6 SSM external groups need NAT targets + ipv6_sources.clone(), + ) + .await; + + // Verify all tables have entries before reset + + // 1. Check replication tables + // Note: Only IPv6 has a replication table; IPv4 uses different mechanisms + let ipv6_repl_table_before = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_replication_ipv6") + .await + .expect("Should be able to dump IPv6 replication table"); + + assert!( + !ipv6_repl_table_before.entries.is_empty(), + "IPv6 replication table should have entries before reset" + ); + + // 2. Check route tables + let ipv4_route_table_before = switch + .client + .table_dump("pipe.Ingress.l3_router.MulticastRouter4.tbl") + .await + .expect("Should be able to dump IPv4 route table"); + + let ipv6_route_table_before = switch + .client + .table_dump("pipe.Ingress.l3_router.MulticastRouter6.tbl") + .await + .expect("Should be able to dump IPv6 route table"); + + assert!( + !ipv4_route_table_before.entries.is_empty(), + "IPv4 route table should have entries before reset" + ); + assert!( + !ipv6_route_table_before.entries.is_empty(), + "IPv6 route table should have entries before reset" + ); + + // 3. Check NAT tables + let ipv4_nat_table_before = switch + .client + .table_dump("pipe.Ingress.nat_ingress.ingress_ipv4_mcast") + .await + .expect("Should be able to dump IPv4 NAT table"); + + let ipv6_nat_table_before = switch + .client + .table_dump("pipe.Ingress.nat_ingress.ingress_ipv6_mcast") + .await + .expect("Should be able to dump IPv6 NAT table"); + + assert!( + !ipv4_nat_table_before.entries.is_empty(), + "IPv4 NAT table should have entries before reset" + ); + assert!( + !ipv6_nat_table_before.entries.is_empty(), + "IPv6 NAT table should have entries before reset" + ); + + // 4. 
Check source filter tables + let ipv4_src_filter_table_before = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_source_filter_ipv4") + .await + .expect("Should be able to dump IPv4 source filter table"); + + let ipv6_src_filter_table_before = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_source_filter_ipv6") + .await + .expect("Should be able to dump IPv6 source filter table"); + + assert!( + !ipv4_src_filter_table_before.entries.is_empty(), + "IPv4 source filter table should have entries before reset" + ); + assert!( + !ipv6_src_filter_table_before.entries.is_empty(), + "IPv6 source filter table should have entries before reset" + ); + + // Perform full reset + switch + .client + .multicast_reset() + .await + .expect("Should be able to reset all multicast groups"); + + // Verify all tables are empty after reset + + // 1. Check replication tables after reset + // Note: Only IPv6 has a replication table; IPv4 uses different mechanisms + let ipv6_repl_table_after = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_replication_ipv6") + .await + .expect("Should be able to dump IPv6 replication table"); + + assert!( + ipv6_repl_table_after.entries.is_empty(), + "IPv6 replication table should be empty after reset" + ); + + // 2. Check route tables after reset + let ipv4_route_table_after = switch + .client + .table_dump("pipe.Ingress.l3_router.MulticastRouter4.tbl") + .await + .expect("Should be able to dump IPv4 route table"); + + let ipv6_route_table_after = switch + .client + .table_dump("pipe.Ingress.l3_router.MulticastRouter6.tbl") + .await + .expect("Should be able to dump IPv6 route table"); + + assert!( + ipv4_route_table_after.entries.is_empty(), + "IPv4 route table should be empty after reset" + ); + assert!( + ipv6_route_table_after.entries.is_empty(), + "IPv6 route table should be empty after reset" + ); + + // 3. Check NAT tables after reset + let ipv4_nat_table_after = switch + .client + .table_dump("pipe.Ingress.nat_ingress.ingress_ipv4_mcast") + .await + .expect("Should be able to dump IPv4 NAT table"); + + let ipv6_nat_table_after = switch + .client + .table_dump("pipe.Ingress.nat_ingress.ingress_ipv6_mcast") + .await + .expect("Should be able to dump IPv6 NAT table"); + + assert!( + ipv4_nat_table_after.entries.is_empty(), + "IPv4 NAT table should be empty after reset" + ); + assert!( + ipv6_nat_table_after.entries.is_empty(), + "IPv6 NAT table should be empty after reset" + ); + + // 4. 
Check source filter tables after reset + let ipv4_src_filter_table_after = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_source_filter_ipv4") + .await + .expect("Should be able to dump IPv4 source filter table"); + + let ipv6_src_filter_table_after = switch + .client + .table_dump("pipe.Ingress.mcast_ingress.mcast_source_filter_ipv6") + .await + .expect("Should be able to dump IPv6 source filter table"); + + assert!( + ipv4_src_filter_table_after.entries.is_empty(), + "IPv4 source filter table should be empty after reset" + ); + assert!( + ipv6_src_filter_table_after.entries.is_empty(), + "IPv6 source filter table should be empty after reset" + ); + + // Verify that all groups no longer exist + let groups_after = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to list groups"); + + assert!( + groups_after.is_empty(), + "No groups should exist after reset" + ); + + // Try to get each group specifically + for group_ip in [ + created_group1.group_ip, + created_group2.group_ip, + created_group2b.group_ip, + created_group3.group_ip, + created_group4.group_ip, + internal_multicast_ip, + ] { + let result = switch.client.multicast_group_get(&group_ip).await; + + assert!( + result.is_err(), + "Group {} should be deleted after reset", + group_ip + ); + } + Ok(()) +} + +#[tokio::test] +#[ignore] +async fn test_multicast_vlan_translation_not_possible() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + + // Step 1: Create admin-scoped IPv6 underlay group that will handle actual replication + // Must have at least one member to satisfy validation requirements + let egress1 = PhysPort(15); + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_vlan_underlay"), + &[(egress1, types::Direction::External)], // Need at least one member for admin-scoped groups + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create external group with VLAN + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let output_vlan = Some(20); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_vlan_behavior"), + &[], // External groups have no members + output_vlan, + true, // Create NAT target + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + // Create test packet with input VLAN + let input_vlan = 10; + let mut to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + "192.168.1.20", + 4444, + 5555, + ); + + // Add input VLAN tag + to_send.hdrs.eth_hdr.as_mut().unwrap().eth_8021q = Some(eth::EthQHdr { + eth_pcp: 0, + eth_dei: 0, + eth_vlan_tag: input_vlan, + }); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect NO packets - this test demonstrates that VLAN translation + // is not possible for multicast packets + let expected_pkts = vec![]; + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_multiple_packets() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(17); + let egress3 = PhysPort(19); + + // Step 1: Create 
admin-scoped IPv6 underlay group for actual replication + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + create_test_multicast_group( + switch, + internal_multicast_ip, + Some("test_performance_underlay"), + &[ + (egress1, types::Direction::Underlay), + (egress2, types::Direction::Underlay), + (egress3, types::Direction::Underlay), + ], + None, + false, // Admin-scoped groups don't need NAT targets + None, + ) + .await; + + // Step 2: Create IPv4 external group as entry point with NAT target + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + let vlan = Some(10); + + let created_group = create_test_multicast_group( + switch, + multicast_ip, + Some("test_performance"), + &[], // External groups have no members + vlan, + true, // Create NAT target pointing to underlay group + None, + ) + .await; + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + // Number of packets to send + const NUM_PACKETS: usize = 10; + + let mut test_pkts = Vec::with_capacity(NUM_PACKETS); + let mut expected_pkts = Vec::with_capacity(NUM_PACKETS * 3); // 3 egress ports + + for i in 0..NUM_PACKETS { + // Create a unique source port for each packet to differentiate them + let src_port = 3000 + i as u16; + let dst_port = 4444; + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + "192.168.1.10", + src_port, + dst_port, + ); + + let to_recv1 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress1), + ); + + let to_recv2 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress2), + ); + + let to_recv3 = prepare_expected_pkt( + switch, + &to_send, + vlan, + created_group.int_fwding.nat_target.as_ref(), + Some(egress3), + ); + + test_pkts.push(TestPacket { + packet: Arc::new(to_send), + port: ingress, + }); + + expected_pkts.push(TestPacket { + packet: Arc::new(to_recv1), + port: egress1, + }); + expected_pkts.push(TestPacket { + packet: Arc::new(to_recv2), + port: egress2, + }); + expected_pkts.push(TestPacket { + packet: Arc::new(to_recv3), + port: egress3, + }); + } + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(test_pkts, expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + NUM_PACKETS as u64, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_multicast_no_group_configured() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + + // Use unique multicast IP addresses that we will NOT configure any group for + let unconfigured_multicast_ipv4 = IpAddr::V4(Ipv4Addr::new(224, 1, 255, 1)); // Unique IPv4 multicast + let unconfigured_multicast_ipv6 = + IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 255, 1)); // Unique IPv6 multicast + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + // Get baseline counter before any test packets + let initial_ctr_baseline = switch + .get_counter("multicast_no_group", None) + .await + .unwrap(); + + // Test IPv4 multicast with no configured group + { + let to_send = create_ipv4_multicast_packet( + unconfigured_multicast_ipv4, + src_mac, + "192.168.1.10", + 3333, + 4444, + ); + 
+ + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + let expected_pkts = vec![]; + + switch + .packet_test(vec![test_pkt], expected_pkts) + .expect("No packets should be sent"); + + // Verify counter incremented for IPv4 + check_counter_incremented( + switch, + "multicast_no_group", + initial_ctr_baseline, + 1, + None, + ) + .await + .unwrap(); + } + + // Test IPv6 multicast with no configured group + { + let to_send = create_ipv6_multicast_packet( + unconfigured_multicast_ipv6, + src_mac, + "2001:db8::1", + 3333, + 4444, + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send), + port: ingress, + }; + + // Expect no output packets - should be dropped + let expected_pkts = vec![]; + + switch + .packet_test(vec![test_pkt], expected_pkts) + .expect("No packets should be sent"); + + // Verify counter incremented for IPv6 - expect 2 total drops now + check_counter_incremented( + switch, + "multicast_no_group", + initial_ctr_baseline, + 2, + None, + ) + .await + .unwrap(); + } + + Ok(()) +} + +#[tokio::test] +#[ignore] +async fn test_multicast_level1_exclusion_group_pruned() -> TestResult { + let switch = &*get_switch().await; + + // Define test ports + let ingress = PhysPort(10); + let egress1 = PhysPort(15); + let egress2 = PhysPort(22); + + let src_mac = switch.get_port_mac(ingress).unwrap(); + + let multicast_ip = IpAddr::V4(MULTICAST_TEST_IPV4); + + // Step 1: Create admin-scoped IPv6 internal group with replication members and exclusion + let internal_multicast_ip = IpAddr::V6(MULTICAST_NAT_IP); + let underlay_group = types::MulticastGroupCreateEntry { + group_ip: MULTICAST_NAT_IP, + tag: Some("test_level1_excl_underlay".to_string()), + sources: None, + members: vec![ + types::MulticastGroupMember { + port_id: switch.link_id(egress1).unwrap().0, + link_id: switch.link_id(egress1).unwrap().1, + direction: types::Direction::Underlay, + }, + types::MulticastGroupMember { + port_id: switch.link_id(egress2).unwrap().0, + link_id: switch.link_id(egress2).unwrap().1, + direction: types::Direction::Underlay, + }, + ], + }; + + let _underlay_created = switch + .client + .multicast_group_create(&underlay_group) + .await + .expect("Should create underlay group") + .into_inner(); + + // Step 2: Create IPv4 external group as entry point with NAT target + let external_group = types::MulticastGroupCreateExternalEntry { + group_ip: multicast_ip, + tag: Some("test_level1_excl_group1".to_string()), + nat_target: create_nat_target_ipv4(), + vlan_id: Some(10), + sources: None, + }; + + let created_group = switch + .client + .multicast_group_create_external(&external_group) + .await + .expect("Should create first exclusion group") + .into_inner(); + + let to_send = create_ipv4_multicast_packet( + multicast_ip, + src_mac, + "192.168.1.10", + 3333, + 4444, + ); + + let test_pkt = TestPacket { + packet: Arc::new(to_send.clone()), + port: ingress, + }; + + // Each node also has a "prune" condition, which if true causes the PRE to + // make no copies of the packet for that node. Because we exclude egress2, + // there will not be any multicast copies made for either egress1 or egress2.
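As a point of reference, the prune rule described in the comment above can be sketched in Rust. This is an assumed model of the PRE's level-1 exclusion-ID behavior for illustration only; it is not dpd or SDE code, and the function name is hypothetical.

    // Assumed PRE semantics, for illustration: a node whose level-1 exclusion
    // ID matches the ID carried by the packet is pruned, so none of its member
    // ports receive a copy.
    fn ports_receiving_copies(
        pkt_level1_excl_id: u16,
        node_level1_excl_id: u16,
        node_ports: &[u16],
    ) -> Vec<u16> {
        if pkt_level1_excl_id == node_level1_excl_id {
            Vec::new() // pruned: the PRE makes no copies for this node
        } else {
            node_ports.to_vec() // otherwise, one copy per member port
        }
    }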
+ let expected_pkts = vec![]; + + let port_label_ingress = switch.port_label(ingress).unwrap(); + + let ctr_baseline_ingress = switch + .get_counter(&port_label_ingress, Some("ingress")) + .await + .unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected_pkts); + + check_counter_incremented( + switch, + &port_label_ingress, + ctr_baseline_ingress, + 1, + Some("ingress"), + ) + .await + .unwrap(); + + cleanup_test_group(switch, created_group.group_ip).await; + cleanup_test_group(switch, internal_multicast_ip).await; + + result +} + +#[tokio::test] +#[ignore] +async fn test_external_group_nat_target_validation() { + let switch = &*get_switch().await; + + let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); + + // Test 1: Creating external group with NAT target referencing non-existent group should fail + let nonexistent_nat_target = types::NatTarget { + internal_ip: "ff04::1".parse().unwrap(), // Admin-scoped IPv6 that does not exist + inner_mac: MacAddr::new(0x03, 0x00, 0x00, 0x00, 0x00, 0x01).into(), + vni: 100.into(), + }; + + let group_with_invalid_nat = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4("224.1.0.101".parse().unwrap()), + tag: Some("test_invalid_nat".to_string()), + nat_target: nonexistent_nat_target.clone(), + vlan_id: Some(10), + sources: None, + }; + + let res = switch + .client + .multicast_group_create_external(&group_with_invalid_nat) + .await + .expect_err("Should fail with non-existent NAT target"); + + match res { + Error::ErrorResponse(inner) => { + assert_eq!(inner.status(), 400, "Expected 400 Bad Request"); + } + _ => panic!("Expected ErrorResponse for invalid NAT target"), + } + + // Test 2: Create admin-scoped IPv6 group first, then external group with valid NAT target + let admin_scoped_group = types::MulticastGroupCreateEntry { + group_ip: "ff04::1".parse().unwrap(), // Admin-scoped IPv6 + tag: Some("test_admin_scoped".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: port_id.clone(), + link_id, + direction: types::Direction::Underlay, + }], + }; + + let created_admin = switch + .client + .multicast_group_create(&admin_scoped_group) + .await + .expect("Should create admin-scoped group") + .into_inner(); + + assert!(created_admin.underlay_group_id.is_some()); + + // Test 3: Now create external group with valid NAT target + let valid_nat_target = types::NatTarget { + internal_ip: "ff04::1".parse().unwrap(), // References the admin-scoped group we just created + inner_mac: MacAddr::new(0x03, 0x00, 0x00, 0x00, 0x00, 0x02).into(), + vni: 100.into(), + }; + + let group_with_valid_nat = types::MulticastGroupCreateExternalEntry { + group_ip: IpAddr::V4("224.1.0.102".parse().unwrap()), + tag: Some("test_valid_nat".to_string()), + nat_target: valid_nat_target, + vlan_id: Some(10), + sources: None, + }; + + let created_external = switch + .client + .multicast_group_create_external(&group_with_valid_nat) + .await + .expect("Should create external group with valid NAT target") + .into_inner(); + + // External groups created via external API don't have external_group_id unless + // there are external members in the referenced admin-scoped group + assert!( + created_external.external_group_id.is_none(), + "External API groups shouldn't have external_group_id without external members" + ); + assert!( + created_external.underlay_group_id.is_none(), + "External group should not have underlay_group_id" + ); + assert_eq!( + created_external.members.len(), + 0, + "External group should have no 
members" + ); + + cleanup_test_group(switch, created_admin.group_ip).await; + cleanup_test_group(switch, created_external.group_ip).await; +} + +#[tokio::test] +#[ignore] +async fn test_ipv6_multicast_scope_validation() { + let switch = &*get_switch().await; + let (egress_port, egress_link) = switch.link_id(PhysPort(11)).unwrap(); + + // Test all IPv6 multicast scope types for proper API routing + + // Admin-local scope (ff04::/16) - should work with internal API + let admin_local_group = types::MulticastGroupCreateEntry { + group_ip: "ff04::100".parse().unwrap(), + tag: Some("test_admin_local".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: egress_port.clone(), + link_id: egress_link, + direction: types::Direction::External, + }], + }; + + let admin_local_result = switch + .client + .multicast_group_create(&admin_local_group) + .await; + assert!( + admin_local_result.is_ok(), + "Admin-local scope (ff04::/16) should work with internal API" + ); + + // Site-local scope (ff05::/16) - should work with internal API + let site_local_group = types::MulticastGroupCreateEntry { + group_ip: "ff05::200".parse().unwrap(), + tag: Some("test_site_local".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: egress_port.clone(), + link_id: egress_link, + direction: types::Direction::External, + }], + }; + + let site_local_result = switch + .client + .multicast_group_create(&site_local_group) + .await; + assert!( + site_local_result.is_ok(), + "Site-local scope (ff05::/16) should work with internal API" + ); + + // Organization-local scope (ff08::/16) - should work with internal API + let org_local_group = types::MulticastGroupCreateEntry { + group_ip: "ff08::300".parse().unwrap(), + tag: Some("test_org_local".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: egress_port.clone(), + link_id: egress_link, + direction: types::Direction::External, + }], + }; + + let org_local_result = + switch.client.multicast_group_create(&org_local_group).await; + assert!( + org_local_result.is_ok(), + "Organization-local scope (ff08::/16) should work with internal API" + ); + + // Global scope (ff0e::/16) - should be rejected by internal API + let global_scope_group = types::MulticastGroupCreateEntry { + group_ip: "ff0e::400".parse().unwrap(), + tag: Some("test_global".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: egress_port.clone(), + link_id: egress_link, + direction: types::Direction::External, + }], + }; + + let global_scope_result = switch + .client + .multicast_group_create(&global_scope_group) + .await; + assert!( + global_scope_result.is_err(), + "Global scope (ff0e::/16) should be rejected by internal API" + ); + let error_msg = format!("{:?}", global_scope_result.unwrap_err()); + assert!( + error_msg.contains( + "Non-admin-scoped IPv6 multicast groups must use the external API" + ), + "Error should indicate external API required for global scope" + ); + + // Test the reverse: admin-scoped should be rejected by external API + // First create an admin-scoped group to reference + let admin_target_group = types::MulticastGroupCreateEntry { + group_ip: "ff04::1000".parse().unwrap(), + tag: Some("test_target".to_string()), + sources: None, + members: vec![types::MulticastGroupMember { + port_id: egress_port.clone(), + link_id: egress_link, + direction: types::Direction::External, + }], + }; + + let target_result = switch + .client + 
.multicast_group_create(&admin_target_group) + .await + .expect("Should create target group"); + + let admin_scoped_external = types::MulticastGroupCreateExternalEntry { + group_ip: "ff04::500".parse().unwrap(), + tag: Some("test_admin_external".to_string()), + nat_target: types::NatTarget { + internal_ip: "ff04::1000".parse().unwrap(), + inner_mac: MacAddr::new(0x02, 0x00, 0x00, 0x00, 0x00, 0x01).into(), + vni: 100.into(), + }, + vlan_id: Some(42), + sources: None, + }; + + let admin_external_result = switch + .client + .multicast_group_create_external(&admin_scoped_external) + .await; + assert!( + admin_external_result.is_err(), + "Admin-scoped addresses should be rejected by external API" + ); + let external_error_msg = + format!("{:?}", admin_external_result.unwrap_err()); + assert!( + external_error_msg.contains("admin-scoped multicast address"), + "Error should indicate admin-scoped addresses require internal API" + ); + + // Cleanup all created groups + let admin_local_group = admin_local_result.unwrap().into_inner(); + let site_local_group = site_local_result.unwrap().into_inner(); + let org_local_group = org_local_result.unwrap().into_inner(); + let target_group = target_result.into_inner(); + + switch + .client + .multicast_group_delete(&admin_local_group.group_ip) + .await + .ok(); + switch + .client + .multicast_group_delete(&site_local_group.group_ip) + .await + .ok(); + switch + .client + .multicast_group_delete(&org_local_group.group_ip) + .await + .ok(); + switch + .client + .multicast_group_delete(&target_group.group_ip) + .await + .ok(); +} + +#[tokio::test] +#[ignore] +async fn test_multicast_group_id_recycling() { + let switch = &*get_switch().await; + + // Use admin-scoped IPv6 addresses that get group IDs assigned + let group1_ip = IpAddr::V6(Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 10)); + let group2_ip = IpAddr::V6(Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 11)); + let group3_ip = IpAddr::V6(Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 12)); + + // Create first group and capture its group IDs + let group1 = create_test_multicast_group( + switch, + group1_ip, + Some("test_recycling_1"), + &[(PhysPort(11), types::Direction::External)], + None, + false, + None, + ) + .await; + + let group1_external_id = group1.external_group_id; + assert!(group1_external_id.is_some()); + + // Create second group and capture its group IDs + let group2 = create_test_multicast_group( + switch, + group2_ip, + Some("test_recycling_2"), + &[(PhysPort(12), types::Direction::External)], + None, + false, + None, + ) + .await; + + let group2_external_id = group2.external_group_id; + assert!(group2_external_id.is_some()); + assert_ne!(group1_external_id, group2_external_id); + + // Delete the first group + switch + .client + .multicast_group_delete(&group1_ip) + .await + .expect("Should be able to delete first group"); + + // Verify group1 was actually deleted + let groups_after_delete1 = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to list groups"); + assert!( + !groups_after_delete1.iter().any(|g| g.group_ip == group1_ip), + "Group1 should be deleted" + ); + + // Create third group - should reuse the first group's ID + let group3 = create_test_multicast_group( + switch, + group3_ip, + Some("test_recycling_3"), + &[(PhysPort(13), types::Direction::External)], + None, + false, + None, + ) + .await; + + let group3_external_id = group3.external_group_id; + assert!(group3_external_id.is_some()); + + // Verify that ID recycling is working - 
group3 should get an ID that was + // previously used + assert_ne!( + group2_external_id, group3_external_id, + "Third group should get a different ID than the active second group" + ); + + // Create a fourth group after deleting group2, it should reuse group2's ID + switch + .client + .multicast_group_delete(&group2_ip) + .await + .expect("Should be able to delete second group"); + + // Verify group2 was actually deleted + let groups_after_delete2 = switch + .client + .multicast_groups_list_stream(None) + .try_collect::>() + .await + .expect("Should be able to list groups"); + assert!( + !groups_after_delete2.iter().any(|g| g.group_ip == group2_ip), + "Group2 should be deleted" + ); + + let group4_ip = IpAddr::V6(Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 13)); + let group4 = create_test_multicast_group( + switch, + group4_ip, + Some("test_recycling_4"), + &[(PhysPort(14), types::Direction::External)], + None, + false, + None, + ) + .await; + + let group4_external_id = group4.external_group_id; + assert!(group4_external_id.is_some()); + + // Group4 should reuse group2's recently freed ID due to stack-like + // allocation + assert_eq!( + group2_external_id, group4_external_id, + "Fourth group should reuse second group's recycled ID" + ); + + // Cleanup - clean up remaining active groups + cleanup_test_group(switch, group3_ip).await; + cleanup_test_group(switch, group4_ip).await; +} diff --git a/dpd-client/tests/integration_tests/mod.rs b/dpd-client/tests/integration_tests/mod.rs index 1c219ce..b7554ef 100644 --- a/dpd-client/tests/integration_tests/mod.rs +++ b/dpd-client/tests/integration_tests/mod.rs @@ -8,6 +8,7 @@ mod common; mod counters; mod icmp_ipv4; mod loopback; +mod mcast; mod nat; mod port_api; mod route_ipv4; diff --git a/dpd-client/tests/integration_tests/nat.rs b/dpd-client/tests/integration_tests/nat.rs index b775b07..541aaea 100644 --- a/dpd-client/tests/integration_tests/nat.rs +++ b/dpd-client/tests/integration_tests/nat.rs @@ -30,6 +30,7 @@ use crate::integration_tests::common::prelude::*; use futures::TryStreamExt; +/// Build a Geneve packet with the given parameters. pub fn gen_geneve_packet( src: Endpoint, dst: Endpoint, @@ -37,6 +38,27 @@ pub fn gen_geneve_packet( vni: u32, tag_ingress: bool, payload: &[u8], +) -> Packet { + gen_geneve_packet_with_mcast_tag( + src, + dst, + inner_type, + vni, + tag_ingress, + None, // No multicast tag + payload, + ) +} + +/// Build a Geneve packet with a possible multicast tag. +pub fn gen_geneve_packet_with_mcast_tag( + src: Endpoint, + dst: Endpoint, + inner_type: u16, + vni: u32, + tag_ingress: bool, + mcast_tag: Option, // New parameter for multicast tag + payload: &[u8], ) -> Packet { let udp_stack = match src.get_ip("src").unwrap() { IpAddr::V4(_) => { @@ -51,34 +73,79 @@ pub fn gen_geneve_packet( let geneve = pkt.hdrs.geneve_hdr.as_mut().unwrap(); geneve.vni = vni; - if tag_ingress { - // XXX: Consider adding `push_option` to GeneveHdr and defining - // option enums. 
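For orientation, a hypothetical call to the new helper might look like the following. The endpoints, VNI, and payload are illustrative, and the multicast-tag parameter is assumed to be a small unsigned integer (its concrete type is elided in this diff).

    // Hypothetical usage sketch: build a Geneve packet whose Oxide option
    // carries multicast tag 1 (replicate to underlay ports).
    let pkt = gen_geneve_packet_with_mcast_tag(
        Endpoint::parse("e0:d5:5e:67:89:ab", "fd00::1", 3333).unwrap(),
        Endpoint::parse("a8:40:25:00:00:01", "fd00::2", 6081).unwrap(), // 6081 = Geneve
        0x86dd,  // inner packet is IPv6
        100,     // VNI
        true,    // tag_ingress: emit the Oxide Geneve option
        Some(1), // tag 1 = underlay replication
        &[0u8; 32],
    );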
- geneve.opt_len = 1; - #[rustfmt::skip] - geneve.options.extend_from_slice(&[ - // class - 0x01, 0x29, - // crit + type - 0x00, - // reserved + body len - 0x00, - ]); - - let extra_bytes = geneve.options.len() as u16; - - match src.get_ip("src").unwrap() { - IpAddr::V4(_) => { - pkt.hdrs.ipv4_hdr.as_mut().unwrap().ipv4_total_len += - extra_bytes - } - IpAddr::V6(_) => { - pkt.hdrs.ipv6_hdr.as_mut().unwrap().ipv6_payload_len += - extra_bytes + match (tag_ingress, mcast_tag) { + (true, Some(tag)) if tag < 3 => { + geneve.opt_len = 2; + // Multicast tag option + #[rustfmt::skip] + geneve.options.extend_from_slice(&[ + // First 2 bytes: Geneve option class (0x0129) + // The OXIDE vendor-specific class identifier + 0x01, 0x29, + // Third byte: Critical bit (0) + Option type (1) + // Type 1 represents multicast tagged packets + 0x01, + // Fourth byte: Option(s) length + 0x01, + // Fifth byte: Tag value (encoded in the data) + (tag & 0x03) << 6, + // Sixth byte: reserved + 0x00, + // Seventh byte + 0x00, + // Eighth byte + 0x00, + ]); + + let extra_bytes = geneve.options.len() as u16; + + match src.get_ip("src").unwrap() { + IpAddr::V4(_) => { + pkt.hdrs.ipv4_hdr.as_mut().unwrap().ipv4_total_len += + extra_bytes + } + IpAddr::V6(_) => { + pkt.hdrs.ipv6_hdr.as_mut().unwrap().ipv6_payload_len += + extra_bytes + } } + + pkt.hdrs.udp_hdr.as_mut().unwrap().udp_len += extra_bytes; } + (true, Some(_)) => { + // Multicast tag is not valid + panic!("Multicast tag must be less than 3"); + } + (true, None) => { + // External packet option + geneve.opt_len = 1; + #[rustfmt::skip] + geneve.options.extend_from_slice(&[ + // First 2 bytes: Geneve option class (0x0129) + // The OXIDE vendor-specific class identifier + 0x01, 0x29, + // Third byte: Critical bit (0) + Option type (1) + 0x00, + // reserved + body len + 0x00, + ]); + + let extra_bytes = geneve.options.len() as u16; + + match src.get_ip("src").unwrap() { + IpAddr::V4(_) => { + pkt.hdrs.ipv4_hdr.as_mut().unwrap().ipv4_total_len += + extra_bytes + } + IpAddr::V6(_) => { + pkt.hdrs.ipv6_hdr.as_mut().unwrap().ipv6_payload_len += + extra_bytes + } + } - pkt.hdrs.udp_hdr.as_mut().unwrap().udp_len += extra_bytes; + pkt.hdrs.udp_hdr.as_mut().unwrap().udp_len += extra_bytes; + } + _ => {} } pkt diff --git a/dpd-client/tests/integration_tests/route_ipv6.rs b/dpd-client/tests/integration_tests/route_ipv6.rs index fe52d5e..77e2a01 100644 --- a/dpd-client/tests/integration_tests/route_ipv6.rs +++ b/dpd-client/tests/integration_tests/route_ipv6.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use oxnet::Ipv6Net; use ::common::network::MacAddr; -use packet::{sidecar, Endpoint}; +use packet::{ipv6, sidecar, Endpoint}; use crate::integration_tests::common; use crate::integration_tests::common::prelude::*; @@ -458,6 +458,61 @@ async fn test_link_local_multicast_outbound() -> TestResult { switch.packet_test(vec![send], expected) } +#[tokio::test] +#[ignore] +async fn test_ipv6_link_local_multicast_hop_limit_one() -> TestResult { + let switch = &*get_switch().await; + + let ingress = PhysPort(10); + + let src = Endpoint::parse("e0:d5:5e:67:89:ab", "fe80::1", 3333).unwrap(); + let dst = Endpoint::parse("33:33:00:00:00:01", "ff02::1", 4444).unwrap(); + + let mut send = common::gen_udp_packet(src, dst); + + // Set hop limit to 1 - this should be ALLOWED for link-local multicast + ipv6::Ipv6Hdr::adjust_hlim(&mut send, -254); // Set to 1 (255 - 254 = 1) + + let test_pkt = TestPacket { + packet: Arc::new(send.clone()), + port: ingress, + }; + + // Link-local multicast packets should be 
forwarded to userspace with sidecar header + let mut recv = send.clone(); + common::add_sidecar_hdr( + switch, + &mut recv, + sidecar::SC_FWD_TO_USERSPACE, + ingress, + NO_PORT, + None, + ); + + let expected = vec![TestPacket { + packet: Arc::new(recv), + port: SERVICE_PORT, + }]; + + // Verify that the hop limit invalid counter does NOT increment + let ctr_baseline_hop_limit = + switch.get_counter("ipv6_ttl_invalid", None).await.unwrap(); + + let result = switch.packet_test(vec![test_pkt], expected); + + // Verify hop limit invalid counter did NOT increment (packet was not dropped) + let ctr_final_hop_limit = + switch.get_counter("ipv6_ttl_invalid", None).await.unwrap(); + + assert_eq!( + ctr_final_hop_limit, + ctr_baseline_hop_limit, + "Hop limit invalid counter should not increment for link-local multicast with hop limit 1" + ); + + result +} + #[tokio::test] #[ignore] async fn test_reset() -> TestResult { diff --git a/dpd-client/tests/integration_tests/table_tests.rs b/dpd-client/tests/integration_tests/table_tests.rs index d1ce0ce..7fb3eeb 100644 --- a/dpd-client/tests/integration_tests/table_tests.rs +++ b/dpd-client/tests/integration_tests/table_tests.rs @@ -4,10 +4,11 @@ // // Copyright 2025 Oxide Computer Company +use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; -use async_trait::async_trait; +use ::common::network::MacAddr; use futures::TryStreamExt; use oxnet::IpNet; use oxnet::Ipv4Net; @@ -35,19 +36,23 @@ use dpd_client::ResponseValue; // This table has further shrunk to 4022 entries with the open source // compiler. That is being tracked as issue #1092, which will presumably // subsume #1013. -// update: with the move to 8192 entries we're now at 8124 -const IPV4_LPM_SIZE: usize = 8125; // ipv4 forwarding table -const IPV6_LPM_SIZE: usize = 1023; // ipv6 forwarding table +// update: with the move to 8192 entries we're now at 8190 entries. +const IPV4_LPM_SIZE: usize = 8191; // ipv4 forwarding table +const IPV6_LPM_SIZE: usize = 1025; // ipv6 forwarding table const SWITCH_IPV4_ADDRS_SIZE: usize = 511; // ipv4 addrs assigned to our ports const SWITCH_IPV6_ADDRS_SIZE: usize = 511; // ipv6 addrs assigned to our ports const IPV4_NAT_TABLE_SIZE: usize = 1024; // nat routing table const IPV6_NAT_TABLE_SIZE: usize = 1024; // nat routing table const IPV4_ARP_SIZE: usize = 512; // arp cache const IPV6_NEIGHBOR_SIZE: usize = 512; // ipv6 neighbor cache +/// The size of the multicast table related to replication on +/// admin-scoped (internal) multicast groups. +const MULTICAST_TABLE_SIZE: usize = 1024; +const MCAST_TAG: &str = "mcast_table_test"; // multicast group tag // The result of a table insert or delete API operation. -type OpResult = - Result, dpd_client::Error>; +type OpResult = + Result, dpd_client::Error>; fn gen_ipv4_addr(idx: usize) -> Ipv4Addr { let base_addr: u32 = Ipv4Addr::new(192, 168, 0, 0).into(); @@ -68,18 +73,33 @@ fn gen_ipv6_cidr(idx: usize) -> Ipv6Net { Ipv6Net::new(gen_ipv6_addr(idx), 128).unwrap() } +// Generates valid IPv6 multicast addresses that are admin-scoped. +fn gen_ipv6_multicast_addr(idx: usize) -> Ipv6Addr { + // Use admin-scoped multicast addresses (ff04::/16, ff05::/16, ff08::/16) + // This ensures they will be created as internal groups + let scope = match idx % 3 { + 0 => 0xFF04, // admin-scoped + 1 => 0xFF05, // admin-scoped + _ => 0xFF08, // admin-scoped + }; + Ipv6Addr::new(scope, 0, 0, 0, 0, 0, 0, (1000 + idx) as u16) +} + // For each table we want to test, we define functions to insert, delete, and // count entries. 
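To make the shape of these capacity tests concrete, here is a minimal sketch of the fill-and-empty pattern the trait enables. It is deliberately simplified (the real helper's extra generic parameters and error plumbing are omitted) and the function name is hypothetical.

    // Simplified sketch of the fill/verify/empty pattern (not the actual
    // test_table_capacity body).
    async fn capacity_check<T: TableTest>(switch: &Switch, size: usize) {
        for idx in 0..size {
            T::insert_entry(switch, idx).await.expect("insert within capacity");
        }
        assert_eq!(T::count_entries(switch).await, size);
        for idx in 0..size {
            T::delete_entry(switch, idx).await.expect("delete existing entry");
        }
        assert_eq!(T::count_entries(switch).await, 0);
    }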
-#[async_trait] -trait TableTest { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult; - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult; +trait TableTest { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult; + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult; async fn count_entries(switch: &Switch) -> usize; } // Verify that we can fill and empty a table, and that it has exactly the // capacity that we expect. -async fn test_table_capacity(table_size: usize) -> TestResult { +async fn test_table_capacity(table_size: usize) -> TestResult +where + T: TableTest, + I: std::fmt::Debug, +{ let switch = &*get_switch().await; // Verify that the table is now empty @@ -118,9 +138,8 @@ async fn test_table_capacity(table_size: usize) -> TestResult { Ok(()) } -#[async_trait] impl TableTest for types::Ipv4Entry { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); switch .client @@ -132,7 +151,7 @@ impl TableTest for types::Ipv4Entry { .await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); switch .client @@ -158,12 +177,12 @@ async fn test_ipv4_full() -> TestResult { // The limit for the switch port addresses is half the size of the table // because each address consumes two table entries: one to "accept" on the // correct port and one to "drop" on all the other ports. - test_table_capacity::(SWITCH_IPV4_ADDRS_SIZE / 2).await + test_table_capacity::(SWITCH_IPV4_ADDRS_SIZE / 2) + .await } -#[async_trait] impl TableTest for types::Ipv6Entry { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); switch .client @@ -175,7 +194,7 @@ impl TableTest for types::Ipv6Entry { .await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); switch .client @@ -201,25 +220,22 @@ async fn test_ipv6_full() -> TestResult { // The limit for the switch port addresses is half the size of the table // because each address consumes two table entries: one to "accept" on the // correct port and one to "drop" on all the other ports. 
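Spelled out with the constants above (the derived constant name here is only for illustration):

    const SWITCH_IPV6_ADDRS_SIZE: usize = 511; // raw table entries
    // Each address needs an "accept" entry on its own port and a "drop" entry
    // covering the other ports, so the usable count is half the raw size.
    const MAX_IPV6_SWITCH_ADDRS: usize = SWITCH_IPV6_ADDRS_SIZE / 2; // 255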
- test_table_capacity::(SWITCH_IPV6_ADDRS_SIZE / 2).await + test_table_capacity::(SWITCH_IPV6_ADDRS_SIZE / 2) + .await } -#[async_trait] impl TableTest for types::ArpEntry { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let entry = types::ArpEntry { ip: gen_ipv4_addr(idx).into(), - mac: common::network::MacAddr::new( - 0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab, - ) - .into(), + mac: MacAddr::new(0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab).into(), tag: switch.client.inner().tag.clone(), update: String::new(), }; switch.client.arp_create(&entry).await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { switch.client.arp_delete(&gen_ipv4_addr(idx)).await } @@ -237,27 +253,23 @@ impl TableTest for types::ArpEntry { #[tokio::test] #[ignore] async fn test_arp_full() -> TestResult { - test_table_capacity::(IPV4_ARP_SIZE).await + test_table_capacity::(IPV4_ARP_SIZE).await } struct NdpEntry {} -#[async_trait] impl TableTest for NdpEntry { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let entry = types::ArpEntry { ip: gen_ipv6_addr(idx).into(), - mac: common::network::MacAddr::new( - 0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab, - ) - .into(), + mac: MacAddr::new(0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab).into(), tag: switch.client.inner().tag.clone(), update: String::new(), }; switch.client.ndp_create(&entry).await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { switch.client.ndp_delete(&gen_ipv6_addr(idx)).await } @@ -275,20 +287,16 @@ impl TableTest for NdpEntry { #[tokio::test] #[ignore] async fn test_ndp_full() -> TestResult { - test_table_capacity::(IPV6_NEIGHBOR_SIZE).await + test_table_capacity::(IPV6_NEIGHBOR_SIZE).await } -#[async_trait] impl TableTest for types::Ipv4Nat { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let external_ip = Ipv4Addr::new(192, 168, 0, 1); let tgt = types::NatTarget { - internal_ip: "fd00:1122:7788:0101::4".parse::().unwrap(), - inner_mac: common::network::MacAddr::new( - 0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab, - ) - .into(), + internal_ip: Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 1), + inner_mac: MacAddr::new(0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab).into(), vni: 0.into(), }; switch @@ -297,7 +305,7 @@ impl TableTest for types::Ipv4Nat { .await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let external_ip = Ipv4Addr::new(192, 168, 0, 1); switch .client @@ -320,20 +328,16 @@ impl TableTest for types::Ipv4Nat { #[tokio::test] #[ignore] async fn test_natv4_full() -> TestResult { - test_table_capacity::(IPV4_NAT_TABLE_SIZE).await + test_table_capacity::(IPV4_NAT_TABLE_SIZE).await } -#[async_trait] impl TableTest for types::Ipv6Nat { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let external_ip = "fd00:1122:1122:0101::4".parse::().unwrap(); let tgt = types::NatTarget { internal_ip: "fd00:1122:7788:0101::4".parse::().unwrap(), - inner_mac: common::network::MacAddr::new( - 0xe0, 0xd5, 0x5e, 0x67, 0x89, 0xab, - ) - .into(), + inner_mac: MacAddr::new(0xe0, 0xd5, 0x5e, 0x67, 0x89, 
0xab).into(), vni: 0.into(), }; switch @@ -342,7 +346,7 @@ impl TableTest for types::Ipv6Nat { .await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let external_ip = "fd00:1122:1122:0101::4".parse::().unwrap(); switch .client @@ -365,14 +369,13 @@ impl TableTest for types::Ipv6Nat { #[tokio::test] #[ignore] async fn test_natv6_full() -> TestResult { - test_table_capacity::(IPV6_NAT_TABLE_SIZE).await + test_table_capacity::(IPV6_NAT_TABLE_SIZE).await } struct RouteV4 {} -#[async_trait] impl TableTest for RouteV4 { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); let route = types::RouteSet { cidr: IpNet::V4(gen_ipv4_cidr(idx)), @@ -388,7 +391,7 @@ impl TableTest for RouteV4 { switch.client.route_ipv4_set(&route).await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let cidr = gen_ipv4_cidr(idx); switch.client.route_ipv4_delete(&cidr).await } @@ -407,14 +410,13 @@ impl TableTest for RouteV4 { #[tokio::test] #[ignore] async fn test_routev4_full() -> TestResult { - test_table_capacity::(IPV4_LPM_SIZE).await + test_table_capacity::(IPV4_LPM_SIZE).await } struct RouteV6 {} -#[async_trait] impl TableTest for RouteV6 { - async fn insert_entry(switch: &Switch, idx: usize) -> OpResult { + async fn insert_entry(switch: &Switch, idx: usize) -> OpResult<()> { let (port_id, link_id) = switch.link_id(PhysPort(11)).unwrap(); let route = types::RouteSet { cidr: IpNet::V6(gen_ipv6_cidr(idx)), @@ -433,7 +435,7 @@ impl TableTest for RouteV6 { switch.client.route_ipv6_set(&route).await } - async fn delete_entry(switch: &Switch, idx: usize) -> OpResult { + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { let cidr = gen_ipv6_cidr(idx); switch.client.route_ipv6_delete(&cidr).await } @@ -452,5 +454,69 @@ impl TableTest for RouteV6 { #[tokio::test] #[ignore] async fn test_routev6_full() -> TestResult { - test_table_capacity::(IPV6_LPM_SIZE).await + test_table_capacity::(IPV6_LPM_SIZE).await +} + +struct MulticastReplicationTableTest {} + +impl TableTest + for MulticastReplicationTableTest +{ + async fn insert_entry( + switch: &Switch, + idx: usize, + ) -> OpResult { + let (port_id1, link_id1) = switch.link_id(PhysPort(11)).unwrap(); + let (port_id2, link_id2) = switch.link_id(PhysPort(12)).unwrap(); + + // Only IPv6 admin-scoped multicast addresses for replication table testing + let group_ip = gen_ipv6_multicast_addr(idx); + + // Admin-scoped IPv6 groups are internal with replication info and members + let internal_entry = types::MulticastGroupCreateEntry { + group_ip, + tag: Some(MCAST_TAG.to_string()), + sources: None, + members: vec![ + types::MulticastGroupMember { + port_id: port_id1.clone(), + link_id: link_id1, + direction: types::Direction::External, + }, + types::MulticastGroupMember { + port_id: port_id2.clone(), + link_id: link_id2, + direction: types::Direction::External, + }, + ], + }; + switch.client.multicast_group_create(&internal_entry).await + } + + async fn delete_entry(switch: &Switch, idx: usize) -> OpResult<()> { + let ip = IpAddr::V6(gen_ipv6_multicast_addr(idx)); + switch.client.multicast_group_delete(&ip).await + } + + async fn count_entries(switch: &Switch) -> usize { + // Count all groups with our test tag + switch + .client + 
.multicast_groups_list_by_tag_stream(MCAST_TAG, None) + .try_collect::>() + .await + .expect("Should be able to list groups by tag paginated") + .len() + } +} + +#[tokio::test] +#[ignore] +async fn test_multicast_replication_table_full() -> TestResult { + test_table_capacity::< + MulticastReplicationTableTest, + types::MulticastGroupResponse, + (), + >(MULTICAST_TABLE_SIZE) + .await } diff --git a/dpd/p4/constants.p4 b/dpd/p4/constants.p4 index a6224c0..e893d37 100644 --- a/dpd/p4/constants.p4 +++ b/dpd/p4/constants.p4 @@ -1,36 +1,43 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + const bit<16> L2_ISOLATED_FLAG = 0x8000; -#define IS_SERVICE(p) ((p) == USER_SPACE_SERVICE_PORT) - -//TODO these all need to be bigger. Early experimentation is showing that this -//is going to need to come either through ATCAM/ALPM or code restructuring. -const int IPV4_NAT_TABLE_SIZE = 1024; // nat routing table -const int IPV6_NAT_TABLE_SIZE = 1024; // nat routing table -const int IPV4_LPM_SIZE = 8192; // ipv4 forwarding table -const int IPV6_LPM_SIZE = 1024; // ipv6 forwarding table - -const int IPV4_ARP_SIZE = 512; // arp cache -const int IPV6_NEIGHBOR_SIZE = 512; // ipv6 neighbor cache -const int SWITCH_IPV4_ADDRS_SIZE = 512; // ipv4 addrs assigned to our ports -const int SWITCH_IPV6_ADDRS_SIZE = 512; // ipv6 addrs assigned to our ports - -const bit<8> SC_FWD_FROM_USERSPACE = 0x00; -const bit<8> SC_FWD_TO_USERSPACE = 0x01; -const bit<8> SC_ICMP_NEEDED = 0x02; -const bit<8> SC_ARP_NEEDED = 0x03; -const bit<8> SC_NEIGHBOR_NEEDED = 0x04; -const bit<8> SC_INVALID = 0xff; + +// TODO: these all need to be bigger. Early experimentation is showing that this +// is going to need to come either through ATCAM/ALPM or code restructuring. 
+const int IPV4_NAT_TABLE_SIZE = 1024; // nat routing table +const int IPV6_NAT_TABLE_SIZE = 1024; // nat routing table +const int IPV4_LPM_SIZE = 8192; // ipv4 forwarding table +const int IPV6_LPM_SIZE = 1024; // ipv6 forwarding table +const int IPV4_ARP_SIZE = 512; // arp cache +const int IPV6_NEIGHBOR_SIZE = 512; // ipv6 neighbor cache +const int SWITCH_IPV4_ADDRS_SIZE = 512; // ipv4 addrs assigned to our ports +const int SWITCH_IPV6_ADDRS_SIZE = 512; // ipv6 addrs assigned to our ports +const int IPV4_MULTICAST_TABLE_SIZE = 1024; // multicast routing table(s) for IPv4 +const int IPV6_MULTICAST_TABLE_SIZE = 1024; // multicast routing table(s) for IPv6 + +const bit<8> SC_FWD_FROM_USERSPACE = 0x00; +const bit<8> SC_FWD_TO_USERSPACE = 0x01; +const bit<8> SC_ICMP_NEEDED = 0x02; +const bit<8> SC_ARP_NEEDED = 0x03; +const bit<8> SC_NEIGHBOR_NEEDED = 0x04; +const bit<8> SC_INVALID = 0xff; /* flags used for per-packet-type counters */ -const bit<10> PKT_ETHER = 0x200; -const bit<10> PKT_LLDP = 0x100; -const bit<10> PKT_VLAN = 0x080; -const bit<10> PKT_SIDECAR = 0x040; -const bit<10> PKT_ICMP = 0x020; -const bit<10> PKT_IPV4 = 0x010; -const bit<10> PKT_IPV6 = 0x008; -const bit<10> PKT_UDP = 0x004; -const bit<10> PKT_TCP = 0x002; -const bit<10> PKT_ARP = 0x001; +const bit<10> PKT_RESUBMIT = 0x300; +const bit<10> PKT_ETHER = 0x200; +const bit<10> PKT_LLDP = 0x100; +const bit<10> PKT_VLAN = 0x080; +const bit<10> PKT_SIDECAR = 0x040; +const bit<10> PKT_ICMP = 0x020; +const bit<10> PKT_IPV4 = 0x010; +const bit<10> PKT_IPV6 = 0x008; +const bit<10> PKT_UDP = 0x004; +const bit<10> PKT_TCP = 0x002; +const bit<10> PKT_ARP = 0x001; /* Indexes into the service_ctr table */ const bit<8> SVC_COUNTER_FW_TO_USER = 0; @@ -40,22 +47,43 @@ const bit<8> SVC_COUNTER_V6_PING_REPLY = 3; const bit<8> SVC_COUNTER_BAD_PING = 4; const bit<32> SVC_COUNTER_MAX = 5; +/* Encapped Multicast Tags */ +const bit<2> MULTICAST_TAG_EXTERNAL = 0; +const bit<2> MULTICAST_TAG_UNDERLAY = 1; +const bit<2> MULTICAST_TAG_UNDERLAY_EXTERNAL = 2; + +/* IPv6 Address Mask Constants */ +const bit<128> IPV6_SCOPE_MASK = 0xffff0000000000000000000000000000; // Match ff00::/16 +const bit<128> IPV6_ULA_MASK = 0xff000000000000000000000000000000; // Match fd00::/8 + +/* IPv6 Address Pattern Constants */ +const bit<128> IPV6_ADMIN_LOCAL_PATTERN = 0xff040000000000000000000000000000; // ff04::/16 +const bit<128> IPV6_SITE_LOCAL_PATTERN = 0xff050000000000000000000000000000; // ff05::/16 +const bit<128> IPV6_ORG_SCOPE_PATTERN = 0xff080000000000000000000000000000; // ff08::/16 +const bit<128> IPV6_ULA_PATTERN = 0xfd000000000000000000000000000000; // fd00::/8 + /* Reasons a packet may be dropped by the p4 pipeline */ -const bit<8> DROP_IPV4_SWITCH_ADDR_MISS = 0x01; -const bit<8> DROP_IPV6_SWITCH_ADDR_MISS = 0x02; -const bit<8> DROP_BAD_PING = 0x03; -const bit<8> DROP_NAT_HEADER_ERROR = 0x04; -const bit<8> DROP_ARP_NULL = 0x05; -const bit<8> DROP_ARP_MISS = 0x06; -const bit<8> DROP_NDP_NULL = 0x07; -const bit<8> DROP_NDP_MISS = 0x08; +const bit<8> DROP_IPV4_SWITCH_ADDR_MISS = 0x01; +const bit<8> DROP_IPV6_SWITCH_ADDR_MISS = 0x02; +const bit<8> DROP_BAD_PING = 0x03; +const bit<8> DROP_NAT_HEADER_ERROR = 0x04; +const bit<8> DROP_ARP_NULL = 0x05; +const bit<8> DROP_ARP_MISS = 0x06; +const bit<8> DROP_NDP_NULL = 0x07; +const bit<8> DROP_NDP_MISS = 0x08; const bit<8> DROP_MULTICAST_TO_LOCAL_INTERFACE = 0x09; -const bit<8> DROP_IPV4_CHECKSUM_ERR = 0x0A; -const bit<8> DROP_IPV4_TTL_INVALID = 0x0B; -const bit<8> DROP_IPV4_TTL_EXCEEDED = 0x0C; -const bit<8> 
DROP_IPV6_TTL_INVALID = 0x0D; -const bit<8> DROP_IPV6_TTL_EXCEEDED = 0x0E; -const bit<8> DROP_IPV4_UNROUTEABLE = 0x0F; -const bit<8> DROP_IPV6_UNROUTEABLE = 0x10; -const bit<8> DROP_NAT_INGRESS_MISS = 0x11; -const bit<32> DROP_REASON_MAX = 0x12; +const bit<8> DROP_IPV4_CHECKSUM_ERR = 0x0A; +const bit<8> DROP_IPV4_TTL_INVALID = 0x0B; +const bit<8> DROP_IPV4_TTL_EXCEEDED = 0x0C; +const bit<8> DROP_IPV6_TTL_INVALID = 0x0D; +const bit<8> DROP_IPV6_TTL_EXCEEDED = 0x0E; +const bit<8> DROP_IPV4_UNROUTEABLE = 0x0F; +const bit<8> DROP_IPV6_UNROUTEABLE = 0x10; +const bit<8> DROP_NAT_INGRESS_MISS = 0x11; +const bit<8> DROP_MULTICAST_NO_GROUP = 0x12; +const bit<8> DROP_MULTICAST_INVALID_MAC = 0x13; +const bit<8> DROP_MULTICAST_CPU_COPY = 0x14; +const bit<8> DROP_MULTICAST_SOURCE_FILTERED = 0x15; +const bit<8> DROP_MULTICAST_PATH_FILTERED = 0x16; +const bit<32> DROP_REASON_MAX = 0x17; + diff --git a/dpd/p4/headers.p4 b/dpd/p4/headers.p4 index 6c1e340..4f9cdc6 100644 --- a/dpd/p4/headers.p4 +++ b/dpd/p4/headers.p4 @@ -1,3 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + const bit<16> ETHERTYPE_IPV4 = 0x0800; const bit<16> ETHERTYPE_ARP = 0x0806; const bit<16> ETHERTYPE_VLAN = 0x8100; @@ -152,6 +158,7 @@ header geneve_h { const bit<16> GENEVE_OPT_CLASS_OXIDE = 0x0129; const bit<7> GENEVE_OPT_OXIDE_EXTERNAL = 0x00; +const bit<7> GENEVE_OPT_OXIDE_MCAST = 0x01; // Multicast tag header geneve_opt_h { bit<16> class; @@ -161,13 +168,30 @@ header geneve_opt_h { bit<5> opt_len; } +/* Geneve option for an `mcast_tag`. + * This is a 2-bit field that indicates the type of + * multicast traffic: + * 0 - Replicate packets to ports set for external multicast traffic + * 1 - Replicate packets to ports set for underlay multicast traffic + * 2 - Replicate packets to ports set for underlay and external multicast + traffic (bifurcated) + * + * The rest of the option is reserved. +*/ +header geneve_opt_mcast_h { + bit<2> mcast_tag; + bit<30> reserved; +} + // Since we're a TEP, we need to push and read Geneve options. // `varbit` only allows us to carry. // XXX: For parsing past one option, add `extern ParserCounter` // to oxidecomputer/p4/lang/p4rs/src/externs.rs, consider // storing via `header_union`s. 
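For reference, the eight bytes of this option as they appear on the wire can be sketched in Rust; the layout mirrors the bytes emitted by gen_geneve_packet_with_mcast_tag in the dpd-client test helper earlier in this diff (the function name below is illustrative).

    // Layout of the Oxide multicast Geneve option: a 4-byte option header
    // followed by a 4-byte body with the 2-bit tag in the top bits.
    fn oxide_mcast_option(tag: u8) -> [u8; 8] {
        assert!(tag < 3, "multicast tag must be 0, 1, or 2");
        [
            0x01, 0x29,        // option class: GENEVE_OPT_CLASS_OXIDE
            0x01,              // critical bit clear, type GENEVE_OPT_OXIDE_MCAST
            0x01,              // body length: one 4-byte word
            (tag & 0x03) << 6, // mcast_tag in the two most significant bits
            0x00, 0x00, 0x00,  // remaining reserved bits
        ]
    }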
struct geneve_opt_headers_t { - geneve_opt_h ox_external_tag; + geneve_opt_h ox_external_tag; + // Multicast-specific options + geneve_opt_mcast_h ox_mcast_tag; } struct sidecar_headers_t { @@ -190,42 +214,3 @@ struct sidecar_headers_t { tcp_h inner_tcp; udp_h inner_udp; } - -struct sidecar_ingress_meta_t { - PortId_t in_port; // ingress port for this packet - - bool ipv4_checksum_err; // failed ipv4 checksum - bool routed; // packet routed at layer 3 - bool is_switch_address; // destination IP was a switch port - bool multicast; // packet was multicast - bool service_routed; // routed to or from a service routine - bool nat_egress; // NATed packet from guest -> uplink - bool nat_ingress; // NATed packet from uplink -> guest - bool nat_ingress_port; // This port accepts only NAT traffic - ipv4_addr_t nexthop_ipv4; // ip address of next router - ipv6_addr_t nexthop_ipv6; // ip address of next router - bit<10> pkt_type; - bit<8> drop_reason; // reason a packet was dropped - - bit<16> l4_src_port; // tcp or udp destination port - bit<16> l4_dst_port; // tcp or udp destination port - ipv6_addr_t nat_ingress_tgt; - mac_addr_t nat_inner_mac; - geneve_vni_t nat_geneve_vni; - - // If we modify an ICMP header, we need to recalculate its checksum. - // To do the math, we need the original checksum. - bool icmp_recalc; - bit<16> icmp_csum; - - // Used when calculating outer UDP checksum for encapsulated NAT - // ingress packets - bit<16> body_checksum; // residual csum for packet body - bit<16> l4_length; - - // Used for responding to pings - mac_addr_t orig_src_mac; // source mac address before rewriting - ipv4_addr_t orig_src_ipv4; // original ipv4 source - - ipv4_addr_t orig_dst_ipv4; // original ipv4 target -} diff --git a/dpd/p4/metadata.p4 b/dpd/p4/metadata.p4 new file mode 100644 index 0000000..2b6ab5a --- /dev/null +++ b/dpd/p4/metadata.p4 @@ -0,0 +1,102 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +/* Flexible bridge header for passing metadata between ingress and egress + * pipelines. + */ +@flexible +header bridge_h { + PortId_t ingress_port; +} + +struct sidecar_ingress_meta_t { + bool ipv4_checksum_err; // failed ipv4 checksum + bool is_switch_address; // destination IP was a switch port + bool is_mcast; // packet is multicast + bool is_valid; // packet is valid + bool allow_source_mcast; // allowed to be sent from a source address for SSM + bool is_link_local_mcastv6; // packet is a IPv6 link-local multicast packet + bool service_routed; // routed to or from a service routine + bool nat_egress_hit; // NATed packet from guest -> uplink + bool nat_ingress_hit; // NATed packet from uplink -> guest + bool nat_ingress_port; // This port accepts only NAT traffic + ipv4_addr_t nexthop_ipv4; // ip address of next router + ipv6_addr_t nexthop_ipv6; // ip address of next router + bit<10> pkt_type; + bit<8> drop_reason; // reason a packet was dropped + bit<16> l4_src_port; // tcp or udp destination port + bit<16> l4_dst_port; // tcp or udp destination port + ipv6_addr_t nat_ingress_tgt; // target address for NAT ingress + mac_addr_t nat_inner_mac; // inner mac address for NAT ingress + geneve_vni_t nat_geneve_vni; // VNI for NAT ingress + + // If we modify an ICMP header, we need to recalculate its checksum. + // To do the math, we need the original checksum. 
+ bool icmp_recalc; + bit<16> icmp_csum; + + // Used when calculating outer UDP checksum for encapsulated NAT + // ingress packets + bit<16> body_checksum; // residual csum for packet body + bit<16> l4_length; + + mac_addr_t orig_src_mac; // source mac address before rewriting + ipv4_addr_t orig_src_ipv4; // original ipv4 source + ipv4_addr_t orig_dst_ipv4; // original ipv4 target + + bridge_h bridge_hdr; // bridge header +} + +struct sidecar_egress_meta_t { + bit<8> drop_reason; // reason a packet was dropped + bridge_h bridge_hdr; // bridge header + + // 256-bit port bitmap separated across 8 x 32-bit values + bit<32> decap_ports_0; // Ports 0-31 + bit<32> decap_ports_1; // Ports 32-63 + bit<32> decap_ports_2; // Ports 64-95 + bit<32> decap_ports_3; // Ports 96-127 + bit<32> decap_ports_4; // Ports 128-159 + bit<32> decap_ports_5; // Ports 160-191 + bit<32> decap_ports_6; // Ports 192-223 + bit<32> decap_ports_7; // Ports 224-255 + + bit<32> bitmap_result; // result of decap bitmap + bool ipv4_checksum_recalc; // recalc checksum for IPv4 + bit<12> vlan_id; // VLAN ID for the packet + bit<8> port_number; // Port number for the outgoing port (0-255) +} + +struct route4_result_t { + /* + * The result of the multistage route selection process is an egress + * port and a nexthop address + */ + ipv4_addr_t nexthop; + PortId_t port; + + /* Did we successfully look up the route in the table? */ + bool is_hit; + + /* + * A hash of the (address,port) fields, which is used to choose between + * multiple potential routes. + */ + bit<8> hash; + + /* Index into the target table of the first potential route */ + bit<16> idx; + /* Number of consecutive slots containing potential routes */ + bit<8> slots; + /* Which of those routes we should select, based the flow hash */ + bit<16> slot; +} + +struct route6_result_t { + ipv6_addr_t nexthop; + PortId_t port; + bool is_hit; +} diff --git a/dpd/p4/parser.p4 b/dpd/p4/parser.p4 index 1653854..6c47ac8 100644 --- a/dpd/p4/parser.p4 +++ b/dpd/p4/parser.p4 @@ -1,9 +1,15 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + parser IngressParser( packet_in pkt, out sidecar_headers_t hdr, out sidecar_ingress_meta_t meta, - out ingress_intrinsic_metadata_t ig_intr_md) -{ + out ingress_intrinsic_metadata_t ig_intr_md +) { Checksum() ipv4_checksum; Checksum() icmp_checksum; Checksum() nat_checksum; @@ -15,15 +21,15 @@ parser IngressParser( } state meta_init { - meta.in_port = ig_intr_md.ingress_port; - meta.ipv4_checksum_err = false; - meta.routed = false; - meta.multicast = false; - meta.service_routed = false; meta.is_switch_address = false; - meta.nat_egress = false; - meta.nat_ingress = false; + meta.is_mcast = false; + meta.is_valid = true; + meta.allow_source_mcast = false; + meta.is_link_local_mcastv6 = false; + meta.service_routed = false; + meta.nat_egress_hit = false; + meta.nat_ingress_hit = false; meta.nat_ingress_port = false; meta.nat_ingress_tgt = 0; meta.nat_inner_mac = 0; @@ -42,6 +48,9 @@ parser IngressParser( meta.pkt_type = 0; meta.drop_reason = 0; + meta.bridge_hdr.setValid(); + meta.bridge_hdr.ingress_port = ig_intr_md.ingress_port; + transition port_metadata; } @@ -122,9 +131,94 @@ parser IngressParser( nat_checksum.subtract({ hdr.ipv4.src_addr, hdr.ipv4.dst_addr, - (bit<16>)hdr.ipv4.protocol + 8w0, + hdr.ipv4.protocol, }); + transition select(hdr.ipv4.dst_addr[31:28]) { + 4w0xe: validate_ipv4_mcast1_2; + default: validate_ttl; + } + } + + state validate_ttl { + transition select(hdr.ipv4.ttl) { + 8w0: invalidate_ttl; + default: goto_proto_ipv4; + } + } + + // IPv4 Multicast Address Validation (RFC 1112, RFC 7042). + // + // This validates that IPv4 multicast packets (224.0.0.0/4) use the proper + // MAC address format: + // + // - First byte: Must be 0x01 (IANA-assigned OUI format) + // - Second byte: Must be 0x00 (with first bit of second byte being 0) + // - Third byte: Must be 0x5e + // - Last 3 bytes: Must contain the lower 23 bits of the IPv4 multicast + // address (with the 24th bit set to 0) + state validate_ipv4_mcast1_2 { + // Extract the first byte of the MAC address + bit<8> mac_byte1 = hdr.ethernet.dst_mac[47:40]; // First byte must be 0x01 + // Extract the second byte of the MAC address + bit<8> mac_byte2 = hdr.ethernet.dst_mac[39:32]; // Second byte must be 0x00 + + transition select(mac_byte1, mac_byte2) { + (8w0x01, 8w0x00): validate_ipv4_mcast_3; + default: invalidate_ipv4_mcast; + } + } + + // IPv4 Multicast Third Byte Validation + // + // Following the IANA mapping rules for IPv4 multicast MAC addresses: + // + // - The third byte must always be 0x5e + // - This completes the IANA OUI prefix (01:00:5e) + // - After this validation, the remaining 23 bits of the MAC will be + // verified against the multicast IPv4 address in the `Filter` control + state validate_ipv4_mcast_3 { + // Extract the 3rd byte of the MAC address + bit<8> mac_byte3 = hdr.ethernet.dst_mac[31:24]; // Third byte must be 0x5e + + transition select(mac_byte3) { + 8w0x5e: set_mcast_ipv4; + default: invalidate_ipv4_mcast; + } + } + + state set_mcast_ipv4 { + meta.is_mcast = true; + transition validate_mcast_ttl; + } + + state validate_mcast_ttl { + transition select(hdr.ipv4.ttl) { + 8w0: invalidate_ttl; + 8w1: invalidate_ttl; + default: goto_proto_ipv4; + } + } + + state invalidate_ttl { + meta.is_valid = false; + meta.drop_reason = DROP_IPV4_TTL_INVALID; + + // We don't reject here because we want to update our stats and reason + transition 
accept; + } + + state invalidate_ipv4_mcast { + meta.is_mcast = true; + meta.is_valid = false; + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + + // We don't reject here because we want to update our stats and reason + transition accept; + } + + state goto_proto_ipv4 { transition select(hdr.ipv4.protocol) { IPPROTO_ICMP: parse_icmp; IPPROTO_TCP: parse_tcp; @@ -140,15 +234,86 @@ parser IngressParser( nat_checksum.subtract({ hdr.ipv6.src_addr, hdr.ipv6.dst_addr, - hdr.ipv6.payload_len, - (bit<16>)hdr.ipv6.next_hdr + 8w0, + hdr.ipv6.next_hdr, + hdr.ipv6.payload_len }); + transition select(hdr.ipv6.dst_addr[127:112]) { + 16w0xff01: drop_interface_local_mcast; + 16w0xff02: set_link_local_mcast; + default: check_ipv6_mcast; + } + } + + state drop_interface_local_mcast { + meta.is_mcast = true; + meta.is_valid = false; + meta.drop_reason = DROP_MULTICAST_TO_LOCAL_INTERFACE; + + // We don't reject here because we want to update our stats and reason + transition accept; + } + + state set_link_local_mcast { + meta.is_link_local_mcastv6 = true; + meta.is_mcast = true; + transition validate_mcast_link_local_hop_limit; + } + + state check_ipv6_mcast { + // Check if the destination address is a multicast address + // (ff00::/8) and if the MAC address is in the correct format. + transition select(hdr.ipv6.dst_addr[127:120]) { + 8w0xff: set_mcast_ipv6; + default: validate_hop_limit; + } + } + + state validate_hop_limit { + transition select(hdr.ipv6.hop_limit) { + 8w0: invalidate_hop_limit; + default: goto_proto_ipv6; + } + } + + state validate_mcast_link_local_hop_limit { + // For link-local multicast, we allow a hop limit of 1. + // This is to ensure that link-local multicast packets + // are not forwarded beyond the local link. + transition select(hdr.ipv6.hop_limit) { + 8w0: invalidate_hop_limit; + 8w1: goto_proto_ipv6; + default: goto_proto_ipv6; + } + } + + state validate_mcast_hop_limit { + transition select(hdr.ipv6.hop_limit) { + 8w0: invalidate_hop_limit; + 8w1: invalidate_hop_limit; + default: goto_proto_ipv6; + } + } + + state set_mcast_ipv6 { + meta.is_mcast = true; + transition validate_mcast_hop_limit; + } + + state invalidate_hop_limit { + meta.is_valid = false; + meta.drop_reason = DROP_IPV6_TTL_INVALID; + + // We don't reject here because we want to update our stats and reason + transition accept; + } + + state goto_proto_ipv6 { transition select(hdr.ipv6.next_hdr) { IPPROTO_ICMPV6: parse_icmp; IPPROTO_TCP: parse_tcp; IPPROTO_UDP: parse_udp; - default: accept; } } @@ -220,10 +385,10 @@ parser IngressParser( // - ICRP forums suggest higher parse cost? // - Probably a lot of ugly states/branching on opt_len // to get a const value for counter decrement. 
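Taking stock of the TTL and hop-limit states added above (`validate_ttl`, `validate_mcast_ttl`, `validate_hop_limit`, `validate_mcast_link_local_hop_limit`, `validate_mcast_hop_limit`): the admission rule they encode is easier to see written flat. A small Rust sketch of that rule (illustrative only, not dpd code):

```rust
/// Mirror of the TTL / hop-limit admission encoded by the parser states
/// above. This is only a reading aid for the state machine, not dpd code.
fn ttl_ok(ttl: u8, is_mcast: bool, is_link_local_v6_mcast: bool) -> bool {
    if !is_mcast {
        ttl > 0 // unicast: only TTL / hop limit 0 is invalid
    } else if is_link_local_v6_mcast {
        ttl > 0 // link-local multicast may legitimately arrive with hop limit 1
    } else {
        ttl > 1 // routed multicast: 0 and 1 are both dropped
    }
}

fn main() {
    assert!(!ttl_ok(1, true, false)); // routed multicast at TTL 1 -> drop
    assert!(ttl_ok(1, true, true));   // link-local multicast at hop limit 1 -> keep
    assert!(ttl_ok(1, false, false)); // unicast at TTL 1 -> keep parsing
}
```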
- transition select(hdr.geneve.opt_len) { 0: geneve_parsed; 1: parse_geneve_opt; + 2: parse_geneve_opt; default: reject; } } @@ -239,10 +404,16 @@ parser IngressParser( state parse_geneve_ox_opt { transition select(hdr.geneve_opts.ox_external_tag.type) { GENEVE_OPT_OXIDE_EXTERNAL: geneve_parsed; + GENEVE_OPT_OXIDE_MCAST: parse_geneve_mcast_tag; default: reject; } } + state parse_geneve_mcast_tag { + pkt.extract(hdr.geneve_opts.ox_mcast_tag); + transition geneve_parsed; + } + state geneve_parsed { transition select(hdr.geneve.protocol) { GENEVE_ENCAP_ETH: parse_inner_eth; @@ -296,3 +467,188 @@ parser IngressParser( transition accept; } } + +parser EgressParser( + packet_in pkt, + out sidecar_headers_t hdr, + out sidecar_egress_meta_t meta, + out egress_intrinsic_metadata_t eg_intr_md +) { + + bridge_h bridge_hdr; + + state start { + pkt.extract(eg_intr_md); + transition meta_init; + } + + state meta_init { + meta.drop_reason = 0; + meta.bridge_hdr.setInvalid(); + + meta.decap_ports_0 = 0; + meta.decap_ports_1 = 0; + meta.decap_ports_2 = 0; + meta.decap_ports_3 = 0; + meta.decap_ports_4 = 0; + meta.decap_ports_5 = 0; + meta.decap_ports_6 = 0; + meta.decap_ports_7 = 0; + + meta.bitmap_result = 0; + meta.ipv4_checksum_recalc = false; + meta.vlan_id = 0; + meta.port_number = 0; + + + transition parse_bridge_hdr; + } + + state parse_bridge_hdr { + pkt.extract(bridge_hdr); + meta.bridge_hdr = bridge_hdr; + meta.bridge_hdr.setValid(); + + transition parse_ethernet; + } + + state parse_ethernet { + pkt.extract(hdr.ethernet); + + transition select(hdr.ethernet.ether_type) { + ETHERTYPE_VLAN: parse_vlan; + ETHERTYPE_IPV4: parse_ipv4; + ETHERTYPE_IPV6: parse_ipv6; + default: accept; + } + } + + state parse_vlan { + pkt.extract(hdr.vlan); + + transition select(hdr.vlan.ether_type) { + ETHERTYPE_IPV4: parse_ipv4; + ETHERTYPE_IPV6: parse_ipv6; + default: accept; + } + } + + state parse_ipv4 { + pkt.extract(hdr.ipv4); + + transition select(hdr.ipv4.protocol) { + IPPROTO_ICMP: parse_icmp; + IPPROTO_TCP: parse_tcp; + IPPROTO_UDP: parse_udp; + default: accept; + } + } + + state parse_ipv6 { + pkt.extract(hdr.ipv6); + + transition select(hdr.ipv6.next_hdr) { + IPPROTO_TCP: parse_tcp; + IPPROTO_UDP: parse_udp; + default: accept; + } + } + + state parse_icmp { + pkt.extract(hdr.icmp); + + transition accept; + } + + state parse_tcp { + pkt.extract(_); + + transition accept; + } + + state parse_udp { + pkt.extract(hdr.udp); + + transition select(hdr.udp.dst_port) { + GENEVE_UDP_PORT: parse_geneve; + default: accept; + } + } + + state parse_geneve { + pkt.extract(hdr.geneve); + + transition select(hdr.geneve.opt_len) { + 0: geneve_parsed; + 1: parse_geneve_opt; + 2: parse_geneve_opt; + default: reject; + } + } + + state parse_geneve_opt { + pkt.extract(hdr.geneve_opts.ox_external_tag); + transition select(hdr.geneve_opts.ox_external_tag.class) { + GENEVE_OPT_CLASS_OXIDE: parse_geneve_ox_opt; + default: reject; + } + } + + state parse_geneve_ox_opt { + transition select(hdr.geneve_opts.ox_external_tag.type) { + GENEVE_OPT_OXIDE_MCAST: parse_geneve_mcast_tag; + default: reject; + } + } + + state parse_geneve_mcast_tag { + pkt.extract(hdr.geneve_opts.ox_mcast_tag); + transition geneve_parsed; + } + + state geneve_parsed { + transition select(hdr.geneve.protocol) { + GENEVE_ENCAP_ETH: parse_inner_eth; + ETHERTYPE_IPV4: parse_inner_ipv4; + ETHERTYPE_IPV6: parse_inner_ipv6; + default: accept; + } + } + + state parse_inner_eth { + pkt.extract(hdr.inner_eth); + transition select(hdr.inner_eth.ether_type) { + 
ETHERTYPE_IPV4: parse_inner_ipv4; + ETHERTYPE_IPV6: parse_inner_ipv6; + default: accept; + } + } + + state parse_inner_ipv4 { + pkt.extract(hdr.inner_ipv4); + transition select(hdr.inner_ipv4.protocol) { + IPPROTO_TCP: parse_inner_tcp; + IPPROTO_UDP: parse_inner_udp; + default: accept; + } + } + + state parse_inner_ipv6 { + pkt.extract(hdr.inner_ipv6); + transition select(hdr.inner_ipv6.next_hdr) { + IPPROTO_TCP: parse_inner_tcp; + IPPROTO_UDP: parse_inner_udp; + default: accept; + } + } + + state parse_inner_tcp { + pkt.extract(hdr.inner_tcp); + transition accept; + } + + state parse_inner_udp { + pkt.extract(hdr.inner_udp); + transition accept; + } +} diff --git a/dpd/p4/port_bitmap_check.p4 b/dpd/p4/port_bitmap_check.p4 new file mode 100644 index 0000000..75a79ca --- /dev/null +++ b/dpd/p4/port_bitmap_check.p4 @@ -0,0 +1,321 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + + action check_port_bitmap_0(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_0 & bit_mask; + } + + action check_port_bitmap_1(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_1 & bit_mask; + } + + action check_port_bitmap_2(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_2 & bit_mask; + } + + action check_port_bitmap_3(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_3 & bit_mask; + } + + action check_port_bitmap_4(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_4 & bit_mask; + } + + action check_port_bitmap_5(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_5 & bit_mask; + } + + action check_port_bitmap_6(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_6 & bit_mask; + } + + action check_port_bitmap_7(bit<32> bit_mask) { + meta.bitmap_result = meta.decap_ports_7 & bit_mask; + } + + table port_bitmap_check { + key = { meta.port_number: exact; } + + actions = { + check_port_bitmap_0; + check_port_bitmap_1; + check_port_bitmap_2; + check_port_bitmap_3; + check_port_bitmap_4; + check_port_bitmap_5; + check_port_bitmap_6; + check_port_bitmap_7; + } + + const entries = { + // Ports 0-31 - Check against decap_ports_0 + 0 : check_port_bitmap_0(32w0x00000001); + 1 : check_port_bitmap_0(32w0x00000002); + 2 : check_port_bitmap_0(32w0x00000004); + 3 : check_port_bitmap_0(32w0x00000008); + 4 : check_port_bitmap_0(32w0x00000010); + 5 : check_port_bitmap_0(32w0x00000020); + 6 : check_port_bitmap_0(32w0x00000040); + 7 : check_port_bitmap_0(32w0x00000080); + 8 : check_port_bitmap_0(32w0x00000100); + 9 : check_port_bitmap_0(32w0x00000200); + 10 : check_port_bitmap_0(32w0x00000400); + 11 : check_port_bitmap_0(32w0x00000800); + 12 : check_port_bitmap_0(32w0x00001000); + 13 : check_port_bitmap_0(32w0x00002000); + 14 : check_port_bitmap_0(32w0x00004000); + 15 : check_port_bitmap_0(32w0x00008000); + 16 : check_port_bitmap_0(32w0x00010000); + 17 : check_port_bitmap_0(32w0x00020000); + 18 : check_port_bitmap_0(32w0x00040000); + 19 : check_port_bitmap_0(32w0x00080000); + 20 : check_port_bitmap_0(32w0x00100000); + 21 : check_port_bitmap_0(32w0x00200000); + 22 : check_port_bitmap_0(32w0x00400000); + 23 : check_port_bitmap_0(32w0x00800000); + 24 : check_port_bitmap_0(32w0x01000000); + 25 : check_port_bitmap_0(32w0x02000000); + 26 : check_port_bitmap_0(32w0x04000000); + 27 : check_port_bitmap_0(32w0x08000000); + 28 : check_port_bitmap_0(32w0x10000000); + 29 : 
check_port_bitmap_0(32w0x20000000); + 30 : check_port_bitmap_0(32w0x40000000); + 31 : check_port_bitmap_0(32w0x80000000); + // Ports 32-63 - Check against decap_ports_1 + 32 : check_port_bitmap_1(32w0x00000001); + 33 : check_port_bitmap_1(32w0x00000002); + 34 : check_port_bitmap_1(32w0x00000004); + 35 : check_port_bitmap_1(32w0x00000008); + 36 : check_port_bitmap_1(32w0x00000010); + 37 : check_port_bitmap_1(32w0x00000020); + 38 : check_port_bitmap_1(32w0x00000040); + 39 : check_port_bitmap_1(32w0x00000080); + 40 : check_port_bitmap_1(32w0x00000100); + 41 : check_port_bitmap_1(32w0x00000200); + 42 : check_port_bitmap_1(32w0x00000400); + 43 : check_port_bitmap_1(32w0x00000800); + 44 : check_port_bitmap_1(32w0x00001000); + 45 : check_port_bitmap_1(32w0x00002000); + 46 : check_port_bitmap_1(32w0x00004000); + 47 : check_port_bitmap_1(32w0x00008000); + 48 : check_port_bitmap_1(32w0x00010000); + 49 : check_port_bitmap_1(32w0x00020000); + 50 : check_port_bitmap_1(32w0x00040000); + 51 : check_port_bitmap_1(32w0x00080000); + 52 : check_port_bitmap_1(32w0x00100000); + 53 : check_port_bitmap_1(32w0x00200000); + 54 : check_port_bitmap_1(32w0x00400000); + 55 : check_port_bitmap_1(32w0x00800000); + 56 : check_port_bitmap_1(32w0x01000000); + 57 : check_port_bitmap_1(32w0x02000000); + 58 : check_port_bitmap_1(32w0x04000000); + 59 : check_port_bitmap_1(32w0x08000000); + 60 : check_port_bitmap_1(32w0x10000000); + 61 : check_port_bitmap_1(32w0x20000000); + 62 : check_port_bitmap_1(32w0x40000000); + 63 : check_port_bitmap_1(32w0x80000000); + // Ports 64-95 - Check against decap_ports_2 + 64 : check_port_bitmap_2(32w0x00000001); + 65 : check_port_bitmap_2(32w0x00000002); + 66 : check_port_bitmap_2(32w0x00000004); + 67 : check_port_bitmap_2(32w0x00000008); + 68 : check_port_bitmap_2(32w0x00000010); + 69 : check_port_bitmap_2(32w0x00000020); + 70 : check_port_bitmap_2(32w0x00000040); + 71 : check_port_bitmap_2(32w0x00000080); + 72 : check_port_bitmap_2(32w0x00000100); + 73 : check_port_bitmap_2(32w0x00000200); + 74 : check_port_bitmap_2(32w0x00000400); + 75 : check_port_bitmap_2(32w0x00000800); + 76 : check_port_bitmap_2(32w0x00001000); + 77 : check_port_bitmap_2(32w0x00002000); + 78 : check_port_bitmap_2(32w0x00004000); + 79 : check_port_bitmap_2(32w0x00008000); + 80 : check_port_bitmap_2(32w0x00010000); + 81 : check_port_bitmap_2(32w0x00020000); + 82 : check_port_bitmap_2(32w0x00040000); + 83 : check_port_bitmap_2(32w0x00080000); + 84 : check_port_bitmap_2(32w0x00100000); + 85 : check_port_bitmap_2(32w0x00200000); + 86 : check_port_bitmap_2(32w0x00400000); + 87 : check_port_bitmap_2(32w0x00800000); + 88 : check_port_bitmap_2(32w0x01000000); + 89 : check_port_bitmap_2(32w0x02000000); + 90 : check_port_bitmap_2(32w0x04000000); + 91 : check_port_bitmap_2(32w0x08000000); + 92 : check_port_bitmap_2(32w0x10000000); + 93 : check_port_bitmap_2(32w0x20000000); + 94 : check_port_bitmap_2(32w0x40000000); + 95 : check_port_bitmap_2(32w0x80000000); + // Ports 96-127 - Check against decap_ports_3 + 96 : check_port_bitmap_3(32w0x00000001); + 97 : check_port_bitmap_3(32w0x00000002); + 98 : check_port_bitmap_3(32w0x00000004); + 99 : check_port_bitmap_3(32w0x00000008); + 100 : check_port_bitmap_3(32w0x00000010); + 101 : check_port_bitmap_3(32w0x00000020); + 102 : check_port_bitmap_3(32w0x00000040); + 103 : check_port_bitmap_3(32w0x00000080); + 104 : check_port_bitmap_3(32w0x00000100); + 105 : check_port_bitmap_3(32w0x00000200); + 106 : check_port_bitmap_3(32w0x00000400); + 107 : check_port_bitmap_3(32w0x00000800); + 108 : 
check_port_bitmap_3(32w0x00001000); + 109 : check_port_bitmap_3(32w0x00002000); + 110 : check_port_bitmap_3(32w0x00004000); + 111 : check_port_bitmap_3(32w0x00008000); + 112 : check_port_bitmap_3(32w0x00010000); + 113 : check_port_bitmap_3(32w0x00020000); + 114 : check_port_bitmap_3(32w0x00040000); + 115 : check_port_bitmap_3(32w0x00080000); + 116 : check_port_bitmap_3(32w0x00100000); + 117 : check_port_bitmap_3(32w0x00200000); + 118 : check_port_bitmap_3(32w0x00400000); + 119 : check_port_bitmap_3(32w0x00800000); + 120 : check_port_bitmap_3(32w0x01000000); + 121 : check_port_bitmap_3(32w0x02000000); + 122 : check_port_bitmap_3(32w0x04000000); + 123 : check_port_bitmap_3(32w0x08000000); + 124 : check_port_bitmap_3(32w0x10000000); + 125 : check_port_bitmap_3(32w0x20000000); + 126 : check_port_bitmap_3(32w0x40000000); + 127 : check_port_bitmap_3(32w0x80000000); + // Ports 128-159 - Check against decap_ports_4 + 128 : check_port_bitmap_4(32w0x00000001); + 129 : check_port_bitmap_4(32w0x00000002); + 130 : check_port_bitmap_4(32w0x00000004); + 131 : check_port_bitmap_4(32w0x00000008); + 132 : check_port_bitmap_4(32w0x00000010); + 133 : check_port_bitmap_4(32w0x00000020); + 134 : check_port_bitmap_4(32w0x00000040); + 135 : check_port_bitmap_4(32w0x00000080); + 136 : check_port_bitmap_4(32w0x00000100); + 137 : check_port_bitmap_4(32w0x00000200); + 138 : check_port_bitmap_4(32w0x00000400); + 139 : check_port_bitmap_4(32w0x00000800); + 140 : check_port_bitmap_4(32w0x00001000); + 141 : check_port_bitmap_4(32w0x00002000); + 142 : check_port_bitmap_4(32w0x00004000); + 143 : check_port_bitmap_4(32w0x00008000); + 144 : check_port_bitmap_4(32w0x00010000); + 145 : check_port_bitmap_4(32w0x00020000); + 146 : check_port_bitmap_4(32w0x00040000); + 147 : check_port_bitmap_4(32w0x00080000); + 148 : check_port_bitmap_4(32w0x00100000); + 149 : check_port_bitmap_4(32w0x00200000); + 150 : check_port_bitmap_4(32w0x00400000); + 151 : check_port_bitmap_4(32w0x00800000); + 152 : check_port_bitmap_4(32w0x01000000); + 153 : check_port_bitmap_4(32w0x02000000); + 154 : check_port_bitmap_4(32w0x04000000); + 155 : check_port_bitmap_4(32w0x08000000); + 156 : check_port_bitmap_4(32w0x10000000); + 157 : check_port_bitmap_4(32w0x20000000); + 158 : check_port_bitmap_4(32w0x40000000); + 159 : check_port_bitmap_4(32w0x80000000); + // Ports 160-191 - Check against decap_ports_5 + 160 : check_port_bitmap_5(32w0x00000001); + 161 : check_port_bitmap_5(32w0x00000002); + 162 : check_port_bitmap_5(32w0x00000004); + 163 : check_port_bitmap_5(32w0x00000008); + 164 : check_port_bitmap_5(32w0x00000010); + 165 : check_port_bitmap_5(32w0x00000020); + 166 : check_port_bitmap_5(32w0x00000040); + 167 : check_port_bitmap_5(32w0x00000080); + 168 : check_port_bitmap_5(32w0x00000100); + 169 : check_port_bitmap_5(32w0x00000200); + 170 : check_port_bitmap_5(32w0x00000400); + 171 : check_port_bitmap_5(32w0x00000800); + 172 : check_port_bitmap_5(32w0x00001000); + 173 : check_port_bitmap_5(32w0x00002000); + 174 : check_port_bitmap_5(32w0x00004000); + 175 : check_port_bitmap_5(32w0x00008000); + 176 : check_port_bitmap_5(32w0x00010000); + 177 : check_port_bitmap_5(32w0x00020000); + 178 : check_port_bitmap_5(32w0x00040000); + 179 : check_port_bitmap_5(32w0x00080000); + 180 : check_port_bitmap_5(32w0x00100000); + 181 : check_port_bitmap_5(32w0x00200000); + 182 : check_port_bitmap_5(32w0x00400000); + 183 : check_port_bitmap_5(32w0x00800000); + 184 : check_port_bitmap_5(32w0x01000000); + 185 : check_port_bitmap_5(32w0x02000000); + 186 : 
check_port_bitmap_5(32w0x04000000); + 187 : check_port_bitmap_5(32w0x08000000); + 188 : check_port_bitmap_5(32w0x10000000); + 189 : check_port_bitmap_5(32w0x20000000); + 190 : check_port_bitmap_5(32w0x40000000); + 191 : check_port_bitmap_5(32w0x80000000); + // Ports 192-223 - Check against decap_ports_6 + 192 : check_port_bitmap_6(32w0x00000001); + 193 : check_port_bitmap_6(32w0x00000002); + 194 : check_port_bitmap_6(32w0x00000004); + 195 : check_port_bitmap_6(32w0x00000008); + 196 : check_port_bitmap_6(32w0x00000010); + 197 : check_port_bitmap_6(32w0x00000020); + 198 : check_port_bitmap_6(32w0x00000040); + 199 : check_port_bitmap_6(32w0x00000080); + 200 : check_port_bitmap_6(32w0x00000100); + 201 : check_port_bitmap_6(32w0x00000200); + 202 : check_port_bitmap_6(32w0x00000400); + 203 : check_port_bitmap_6(32w0x00000800); + 204 : check_port_bitmap_6(32w0x00001000); + 205 : check_port_bitmap_6(32w0x00002000); + 206 : check_port_bitmap_6(32w0x00004000); + 207 : check_port_bitmap_6(32w0x00008000); + 208 : check_port_bitmap_6(32w0x00010000); + 209 : check_port_bitmap_6(32w0x00020000); + 210 : check_port_bitmap_6(32w0x00040000); + 211 : check_port_bitmap_6(32w0x00080000); + 212 : check_port_bitmap_6(32w0x00100000); + 213 : check_port_bitmap_6(32w0x00200000); + 214 : check_port_bitmap_6(32w0x00400000); + 215 : check_port_bitmap_6(32w0x00800000); + 216 : check_port_bitmap_6(32w0x01000000); + 217 : check_port_bitmap_6(32w0x02000000); + 218 : check_port_bitmap_6(32w0x04000000); + 219 : check_port_bitmap_6(32w0x08000000); + 220 : check_port_bitmap_6(32w0x10000000); + 221 : check_port_bitmap_6(32w0x20000000); + 222 : check_port_bitmap_6(32w0x40000000); + 223 : check_port_bitmap_6(32w0x80000000); + // Ports 224-255 - Check against decap_ports_7 + 224 : check_port_bitmap_7(32w0x00000001); + 225 : check_port_bitmap_7(32w0x00000002); + 226 : check_port_bitmap_7(32w0x00000004); + 227 : check_port_bitmap_7(32w0x00000008); + 228 : check_port_bitmap_7(32w0x00000010); + 229 : check_port_bitmap_7(32w0x00000020); + 230 : check_port_bitmap_7(32w0x00000040); + 231 : check_port_bitmap_7(32w0x00000080); + 232 : check_port_bitmap_7(32w0x00000100); + 233 : check_port_bitmap_7(32w0x00000200); + 234 : check_port_bitmap_7(32w0x00000400); + 235 : check_port_bitmap_7(32w0x00000800); + 236 : check_port_bitmap_7(32w0x00001000); + 237 : check_port_bitmap_7(32w0x00002000); + 238 : check_port_bitmap_7(32w0x00004000); + 239 : check_port_bitmap_7(32w0x00008000); + 240 : check_port_bitmap_7(32w0x00010000); + 241 : check_port_bitmap_7(32w0x00020000); + 242 : check_port_bitmap_7(32w0x00040000); + 243 : check_port_bitmap_7(32w0x00080000); + 244 : check_port_bitmap_7(32w0x00100000); + 245 : check_port_bitmap_7(32w0x00200000); + 246 : check_port_bitmap_7(32w0x00400000); + 247 : check_port_bitmap_7(32w0x00800000); + 248 : check_port_bitmap_7(32w0x01000000); + 249 : check_port_bitmap_7(32w0x02000000); + 250 : check_port_bitmap_7(32w0x04000000); + 251 : check_port_bitmap_7(32w0x08000000); + 252 : check_port_bitmap_7(32w0x10000000); + 253 : check_port_bitmap_7(32w0x20000000); + 254 : check_port_bitmap_7(32w0x40000000); + 255 : check_port_bitmap_7(32w0x80000000); + } + + const size = 256; + } diff --git a/dpd/p4/route_selector.p4 b/dpd/p4/route_selector.p4 index 9b35313..7556e15 100644 --- a/dpd/p4/route_selector.p4 +++ b/dpd/p4/route_selector.p4 @@ -1,3 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + + action set_slot(bit<8> slot) { res.slot = (bit<16>) slot; } diff --git a/dpd/p4/sidecar.p4 b/dpd/p4/sidecar.p4 index ff70ab8..5604d12 100644 --- a/dpd/p4/sidecar.p4 +++ b/dpd/p4/sidecar.p4 @@ -1,3 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + #if __TARGET_TOFINO__ == 2 #include const bit<9> USER_SPACE_SERVICE_PORT = 0; @@ -7,17 +13,101 @@ const bit<9> USER_SPACE_SERVICE_PORT = 192; #endif #include +#include #include #include +// Check whether a port is the user space service port. +#define IS_SERVICE(p) ((p) == USER_SPACE_SERVICE_PORT) + +// Includes the checksum for the original data, the geneve header, the +// outer udp header, and the outer ipv6 pseudo-header. +// NOTE: safe to include geneve ox_external_tag here as it is filled +// on nat_ingress, and nat_checksum is only computed on nat_ingress. +#define COMMON_FIELDS \ + meta.body_checksum, \ + hdr.inner_eth, \ + hdr.geneve, \ + hdr.geneve_opts.ox_external_tag, \ + hdr.udp.src_port, \ + hdr.udp.dst_port, \ + hdr.udp.hdr_length, \ + (bit<16>)hdr.ipv6.next_hdr, \ + hdr.ipv6.src_addr, \ + hdr.ipv6.dst_addr, \ + hdr.ipv6.payload_len + +// Includes the final bit of the inner ipv4 pseudo-header and the inner ipv4 +// header +#define IPV4_FIELDS \ + meta.l4_length, \ + hdr.inner_ipv4 + +// Includes the inner ipv6 header +#define IPV6_FIELDS \ + hdr.inner_ipv6 + +// This control handles the calculation of Layer 4 payload length +// by subtracting the IPv4 header length from the total packet length. +// +// This is accomplished using a table-based approach due to P4/Tofino limitations: +// 1. We can't directly subtract a variable value (the IPv4 header length) +// 2. Instead, we use a table with IHL (IP Header Length) as the key +// 3.
For each IHL value, we add a negative constant that achieves the subtraction +// (e.g., adding 0xffec, which is -20 in two's complement, subtracts 20 bytes) +control CalculateIPv4Len( + inout sidecar_headers_t hdr, + inout sidecar_ingress_meta_t meta +) { + // Action to add (or effectively subtract) a value from IPv4 total length + action add(bit<16> a) { + meta.l4_length = hdr.ipv4.total_len + a; + } + + action invert() { + meta.l4_length = ~meta.l4_length; + } + + // Table maps IPv4 header length (IHL) to the appropriate "add" action + // with the correct negative constant + table ipv4_set_len { + key = { hdr.ipv4.ihl : exact; } + actions = { add; } + + const entries = { + (5) : add(0xffec); // Subtract 20 bytes (standard header) + (6) : add(0xffe8); // Subtract 24 bytes + (7) : add(0xffe4); // Subtract 28 bytes + (8) : add(0xffe0); // Subtract 32 bytes + (9) : add(0xffdc); // Subtract 36 bytes + (10): add(0xffd8); // Subtract 40 bytes + (11): add(0xffd4); // Subtract 44 bytes + (12): add(0xffd0); // Subtract 48 bytes + (13): add(0xffcc); // Subtract 52 bytes + (14): add(0xffc8); // Subtract 56 bytes + (15): add(0xffc4); // Subtract 60 bytes + } + + const size = 16; + } + + apply { + ipv4_set_len.apply(); + invert(); + } +} + control Filter( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, - inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) -{ + inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md +) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv4_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv6_ctr; - bit<16> multicast_scope; + Counter, PortId_t>(512, CounterType_t.PACKETS) drop_mcast_ctr; + Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; + bit<16> mcast_scope; action dropv4() { ig_dprsr_md.drop_ctl = 1; @@ -31,6 +121,27 @@ control Filter( ipv6_ctr.count(); } + action drop_with_reason(bit<8> reason) { + meta.is_valid = false; + ig_dprsr_md.drop_ctl = 1; + meta.drop_reason = reason; + } + + action drop_mcast_with_reason(bit<8> reason) { + meta.is_mcast = true; + meta.is_valid = false; + meta.is_link_local_mcastv6 = false; + ig_dprsr_md.drop_ctl = 1; + meta.drop_reason = reason; + } + + action drop_mcast() { + meta.is_mcast = true; + meta.is_valid = false; + meta.is_link_local_mcastv6 = false; + ig_dprsr_md.drop_ctl = 1; + } + action claimv4() { meta.is_switch_address = true; ipv4_ctr.count(); @@ -44,8 +155,8 @@ control Filter( // Table of the IPv4 addresses assigned to ports on the switch. table switch_ipv4_addr { key = { - meta.orig_dst_ipv4: exact; - meta.in_port: ternary; + meta.orig_dst_ipv4 : exact; + ig_intr_md.ingress_port : ternary; } actions = { claimv4; dropv4; } @@ -56,8 +167,8 @@ control Filter( // Table of the IPv6 addresses assigned to ports on the switch. 
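Stepping back to the `CalculateIPv4Len` control above: each constant in its `ipv4_set_len` table is just the 16-bit two's complement of the header length for that IHL, so the `add` action behaves as a subtraction. A quick Rust check of that arithmetic (illustrative only):

```rust
fn main() {
    // For IHL values 5..=15 the IPv4 header is ihl * 4 bytes long. The
    // table's "add" constant is the 16-bit two's complement of that
    // length, so adding it subtracts the header length from total_len.
    for ihl in 5u16..=15 {
        let hdr_len = ihl * 4;
        let add_const = hdr_len.wrapping_neg(); // 20 -> 0xffec, 24 -> 0xffe8, ...
        let total_len = 1500u16;
        assert_eq!(total_len.wrapping_add(add_const), total_len - hdr_len);
    }
    assert_eq!(20u16.wrapping_neg(), 0xffec);
}
```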
table switch_ipv6_addr { key = { - hdr.ipv6.dst_addr: exact; - meta.in_port: ternary; + hdr.ipv6.dst_addr : exact; + ig_intr_md.ingress_port : ternary; } actions = { claimv6; dropv6; } @@ -65,17 +176,90 @@ control Filter( counters = ipv6_ctr; } - apply { - if (hdr.ipv4.isValid() || hdr.arp.isValid()) { + if (hdr.arp.isValid()) { switch_ipv4_addr.apply(); - } else if (hdr.ipv6.isValid()) { - multicast_scope = (bit<16>)hdr.ipv6.dst_addr[127:112]; - if (multicast_scope == 16w0xff01) { - ig_dprsr_md.drop_ctl = 1; - meta.drop_reason = DROP_MULTICAST_TO_LOCAL_INTERFACE; + } else if (hdr.ipv4.isValid()) { + if (meta.is_mcast && !meta.is_valid) { + drop_mcast(); + drop_mcast_ctr.count(ig_intr_md.ingress_port); + drop_reason_ctr.count(meta.drop_reason); + return; + } else if (meta.is_mcast && meta.is_valid) { + // IPv4 Multicast Address Validation (RFC 1112, RFC 7042) + // + // We've already validated the first 3 bytes of the MAC in the parser. + // This cannot be checked by the parser statically. + // + // First, check that 4th byte of the MAC address is the lower 7 + // bits of the IPv4 address. + bit<8> mac_byte4 = hdr.ethernet.dst_mac[23:16]; + bit<7> ipv4_lower7 = hdr.ipv4.dst_addr[22:16]; // The lower 7 bits of the first byte + + // Check 5th byte of MAC against 3rd octet of IPv4 address. + bit<8> mac_byte5 = hdr.ethernet.dst_mac[15:8]; + bit<8> ipv4_byte3 = hdr.ipv4.dst_addr[15:8]; // Third byte + + // Check 6th byte of MAC against 4th octet of IPv4 address. + bit<8> mac_byte6 = hdr.ethernet.dst_mac[7:0]; + bit<8> ipv4_byte4 = hdr.ipv4.dst_addr[7:0]; + + // Check if MAC address follows the multicast mapping standard. + if (mac_byte4 != (bit<8>)ipv4_lower7 || + mac_byte5 != ipv4_byte3 || + mac_byte6 != ipv4_byte4) { + drop_mcast_with_reason(DROP_MULTICAST_INVALID_MAC); + drop_mcast_ctr.count(ig_intr_md.ingress_port); + return; + } + } else if (!meta.is_valid) { + drop_with_reason(meta.drop_reason); + return; } else { - meta.multicast = (multicast_scope == 16w0xff02); + switch_ipv4_addr.apply(); + } + } else if (hdr.ipv6.isValid()) { + if (meta.is_mcast && !meta.is_valid) { + drop_mcast(); + drop_mcast_ctr.count(ig_intr_md.ingress_port); + drop_reason_ctr.count(meta.drop_reason); + return; + } else if (meta.is_mcast && meta.is_valid) { + // Validate the IPv6 multicast MAC address format (RFC 2464, + // RFC 7042). + // + // IPv6 multicast addresses (ff00::/8) must use MAC addresses + // that follow the format 33:33:xxxx:xxxx where the last 32 bits + // are taken directly from the last 32 bits of the IPv6 address. + // + // Sadly, the first two conditions cannot be checked properly by + // the parser, as we reach the total available parser match + // registers on the device. + if (hdr.ethernet.dst_mac[47:40] != 8w0x33 || + hdr.ethernet.dst_mac[39:32] != 8w0x33) { + drop_mcast_with_reason(DROP_MULTICAST_INVALID_MAC); + drop_mcast_ctr.count(ig_intr_md.ingress_port); + drop_reason_ctr.count(meta.drop_reason); + return; + } + + // The last four conditions cannot be checked by the parser + // statically, so we have to do this in the control stage. + + // For a 128-bit IPv6 address, we need to check the last 32 bits + // against the last 32 bits of the MAC address. 
+ if (hdr.ethernet.dst_mac[31:24] != hdr.ipv6.dst_addr[31:24] || + hdr.ethernet.dst_mac[23:16] != hdr.ipv6.dst_addr[23:16] || + hdr.ethernet.dst_mac[15:8] != hdr.ipv6.dst_addr[15:8] || + hdr.ethernet.dst_mac[7:0] != hdr.ipv6.dst_addr[7:0]) { + drop_mcast_with_reason(DROP_MULTICAST_INVALID_MAC); + drop_mcast_ctr.count(ig_intr_md.ingress_port); + drop_reason_ctr.count(meta.drop_reason); + return; + } + } + + if (!meta.is_mcast || meta.is_link_local_mcastv6) { switch_ipv6_addr.apply(); } } @@ -83,7 +267,7 @@ control Filter( } // This control checks for packets that require special -// handling rather than being routed normally. These +// handling rather than being routed normally. These // fall into three categories: // - packets that need to be handed to user space for additional processing. // - packets that are coming from user space and include metadata telling us @@ -94,8 +278,9 @@ control Services( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, - inout ingress_intrinsic_metadata_for_tm_t ig_tm_md) -{ + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md +) { Counter, bit<8>>(SVC_COUNTER_MAX, CounterType_t.PACKETS) service_ctr; // We are replying to a ping to an IP address representing one of our @@ -116,7 +301,7 @@ control Services( hdr.icmp.type = ICMP_ECHOREPLY; meta.icmp_recalc = true; - ig_tm_md.ucast_egress_port = meta.in_port; + ig_tm_md.ucast_egress_port = ig_intr_md.ingress_port; meta.service_routed = true; } @@ -133,7 +318,7 @@ control Services( hdr.icmp.type = ICMP6_ECHOREPLY; meta.icmp_recalc = true; - ig_tm_md.ucast_egress_port = meta.in_port; + ig_tm_md.ucast_egress_port = ig_intr_md.ingress_port; meta.service_routed = true; } @@ -142,7 +327,7 @@ control Services( // sidecar tag, which indicates which port the request arrived on. action forward_to_userspace() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; - hdr.sidecar.sc_ingress = (bit<16>)meta.in_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; @@ -170,38 +355,39 @@ control Services( // In our implementation, there can be only two nodes on a link: // the switch and whatever is connected directly to it. This // simple model allows us to implement link-local "multicast" - // essentially like unicast. In particular, we don't need to engage - // the Tofino packet replication mechanism. "Inbound" multicast - // packets always go to the service port. "Outbound" multicast + // essentially like unicast. In particular, for these, we don't need to + // engage the Tofino packet replication mechanism. "Inbound" multicast + // packets always go to the service port. "Outbound" multicast // packets always go to the port indicated by the sidecar header. 
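Together, the parser prefix checks and the `Filter` comparisons above enforce the standard multicast MAC mappings from RFC 1112 (IPv4) and RFC 2464 (IPv6). A compact Rust sketch of those mappings, for reference (hypothetical helpers, not dpd code):

```rust
/// RFC 1112: an IPv4 group in 224.0.0.0/4 maps to 01:00:5e plus the low
/// 23 bits of the address (bit 23 is forced to zero).
fn ipv4_mcast_mac(group: [u8; 4]) -> [u8; 6] {
    [0x01, 0x00, 0x5e, group[1] & 0x7f, group[2], group[3]]
}

/// RFC 2464: an IPv6 group in ff00::/8 maps to 33:33 plus the last
/// 32 bits of the address.
fn ipv6_mcast_mac(dst: [u8; 16]) -> [u8; 6] {
    [0x33, 0x33, dst[12], dst[13], dst[14], dst[15]]
}

fn main() {
    // 224.1.2.3 and 239.129.2.3 both map to 01:00:5e:01:02:03, which is
    // why only the low 23 bits are compared against the MAC.
    assert_eq!(ipv4_mcast_mac([224, 1, 2, 3]), [0x01, 0x00, 0x5e, 0x01, 0x02, 0x03]);
    assert_eq!(ipv4_mcast_mac([239, 129, 2, 3]), [0x01, 0x00, 0x5e, 0x01, 0x02, 0x03]);

    // ff02::1:ff00:1 maps to 33:33:ff:00:00:01.
    let mut v6 = [0u8; 16];
    v6[0] = 0xff; v6[1] = 0x02; v6[11] = 0x01; v6[12] = 0xff; v6[15] = 0x01;
    assert_eq!(ipv6_mcast_mac(v6), [0x33, 0x33, 0xff, 0x00, 0x00, 0x01]);
}
```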
- action multicast_inbound() { + action mcast_inbound_link_local() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; - hdr.sidecar.sc_ingress = (bit<16>) meta.in_port; - hdr.sidecar.sc_egress = (bit<16>) ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; hdr.sidecar.sc_payload = 0; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.routed = false; meta.service_routed = true; ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - meta.multicast = true; + meta.is_mcast = true; + meta.is_link_local_mcastv6 = true; } table service { key = { - ig_dprsr_md.drop_ctl: exact; - meta.nat_ingress: exact; - meta.multicast: exact; - meta.is_switch_address: ternary; - meta.in_port: ternary; - hdr.sidecar.isValid(): ternary; - hdr.arp.isValid(): ternary; - hdr.icmp.isValid(): ternary; - hdr.ipv4.isValid(): ternary; - hdr.ipv6.isValid(): ternary; - hdr.icmp.type: ternary; - hdr.icmp.code: ternary; + ig_dprsr_md.drop_ctl : exact; + meta.nat_ingress_hit : exact; + meta.is_mcast : exact; + meta.is_link_local_mcastv6 : ternary; + meta.is_switch_address : ternary; + ig_intr_md.ingress_port : ternary; + hdr.sidecar.isValid() : ternary; + hdr.arp.isValid() : ternary; + hdr.icmp.isValid() : ternary; + hdr.ipv4.isValid() : ternary; + hdr.ipv6.isValid() : ternary; + hdr.icmp.type : ternary; + hdr.icmp.code : ternary; } actions = { @@ -210,27 +396,26 @@ control Services( drop_bad_ping; forward_from_userspace; forward_to_userspace; - multicast_inbound; - NoAction; + mcast_inbound_link_local; } const entries = { - ( 0, false, false, true, _, _, false, true, true, false, ICMP_ECHOREPLY, 0 ) : forward_to_userspace; - ( 0, false, false, true, _, _, false, true, true, false, ICMP_ECHOREPLY, _ ) : drop_bad_ping; - ( 0, false, false, true, _, _, false, true, true, false, ICMP_ECHO, 0 ) : ping4_reply; - ( 0, false, false, true, _, _, false, true, true, false, ICMP_ECHO, _ ) : drop_bad_ping; - ( 0, false, false, true, _, _, false, true, false, true, ICMP6_ECHOREPLY, 0 ) : forward_to_userspace; - ( 0, false, false, true, _, _, false, true, false, true, ICMP6_ECHOREPLY, _ ) : drop_bad_ping; - ( 0, false, false, true, _, _, false, true, false, true, ICMP6_ECHO, 0 ) : ping6_reply; - ( 0, false, false, true, _, _, false, true, false, true, ICMP6_ECHO, _ ) : drop_bad_ping; - ( 0, false, false, _, USER_SPACE_SERVICE_PORT, true, _, _, _, _, _, _ ) : forward_from_userspace; - ( 0, false, true, _, USER_SPACE_SERVICE_PORT, true, _, _, _, _, _, _ ) : forward_from_userspace; - ( 0, false, false, _, _, false, true, _, _, _, _, _ ) : forward_to_userspace; - ( 0, false, false, true, _, _, _, _, _, _, _, _ ) : forward_to_userspace; - ( 0, false, true, _, _, _, _, _, _, _, _, _ ) : multicast_inbound; - } - - default_action = NoAction; + ( 0, false, false, _, true, _, _, false, true, true, false, ICMP_ECHOREPLY, 0 ) : forward_to_userspace; + ( 0, false, false, _, true, _, _, false, true, true, false, ICMP_ECHOREPLY, _ ) : drop_bad_ping; + ( 0, false, false, _, true, _, _, false, true, true, false, ICMP_ECHO, 0 ) : ping4_reply; + ( 0, false, false, _, true, _, _, false, true, true, false, ICMP_ECHO, _ ) : drop_bad_ping; + ( 0, false, false, _, true, _, _, false, true, false, true, ICMP6_ECHOREPLY, 0 ) : forward_to_userspace; + ( 0, false, false, _, true, _, _, false, true, false, true, ICMP6_ECHOREPLY, _ ) : drop_bad_ping; + ( 0, false, false, _, true, _, _, false, true, 
false, true, ICMP6_ECHO, 0 ) : ping6_reply; + ( 0, false, false, _, true, _, _, false, true, false, true, ICMP6_ECHO, _ ) : drop_bad_ping; + ( 0, false, false, _, _, USER_SPACE_SERVICE_PORT, true, _, _, _, _, _, _ ) : forward_from_userspace; + ( 0, false, true, true, _, USER_SPACE_SERVICE_PORT, true, _, _, _, _, _, _ ) : forward_from_userspace; + ( 0, false, false, _, _, _, false, true, _, _, _, _, _ ) : forward_to_userspace; + ( 0, false, false, _, true, _, _, _, _, _, _, _, _ ) : forward_to_userspace; + // Link-local multicast + ( 0, false, true, true, _, _, _, _, _, _, _, _, _ ) : mcast_inbound_link_local; + } + const size = 16; } @@ -243,15 +428,17 @@ control Services( * as the restructuring is likely to have knock-on effects in * dpd and sidecar-lite. */ - if (!meta.is_switch_address && meta.nat_ingress_port && !meta.nat_ingress) { + if (!meta.is_switch_address && meta.nat_ingress_port && !meta.nat_ingress_hit) { // For packets that were not marked for NAT ingress, but which // arrived on an uplink port that only allows in traffic that // is meant to be NAT encapsulated. meta.drop_reason = DROP_NAT_INGRESS_MISS; ig_dprsr_md.drop_ctl = 1; - } else if (meta.is_switch_address && hdr.geneve.isValid() && hdr.geneve.vni != 0) { - meta.nat_egress = true; - } else { + } + else if (meta.is_switch_address && hdr.geneve.isValid() && hdr.geneve.vni != 0) { + meta.nat_egress_hit = true; + } + else { service.apply(); } } @@ -260,11 +447,13 @@ control Services( control NatIngress ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, - inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md + in ingress_intrinsic_metadata_t ig_intr_md ) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv4_ingress_counter; - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv6_ingress_counter; - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) nat_only_counter; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv4_ingress_ctr; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv6_ingress_ctr; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) nat_only_ctr; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) mcast_ipv4_ingress_ctr; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) mcast_ipv6_ingress_ctr; action add_encap_headers(bit<16> udp_len) { // 8 bytes with a 4 byte option @@ -336,13 +525,13 @@ control NatIngress ( add_encap_headers(payload_len); } - action forward_ipv4_to(ipv6_addr_t target, mac_addr_t inner_mac, - geneve_vni_t vni) { - meta.nat_ingress = true; + action forward_ipv4_to(ipv6_addr_t target, mac_addr_t inner_mac, geneve_vni_t vni) { + meta.nat_ingress_hit = true; meta.nat_ingress_tgt = target; meta.nat_inner_mac = inner_mac; meta.nat_geneve_vni = vni; - ipv4_ingress_counter.count(); + + ipv4_ingress_ctr.count(); } table ingress_ipv4 { @@ -353,16 +542,16 @@ control NatIngress ( actions = { forward_ipv4_to; } const size = IPV4_NAT_TABLE_SIZE; - counters = ipv4_ingress_counter; + counters = ipv4_ingress_ctr; } - action forward_ipv6_to(ipv6_addr_t target, mac_addr_t inner_mac, - geneve_vni_t vni) { - meta.nat_ingress = true; + action forward_ipv6_to(ipv6_addr_t target, mac_addr_t inner_mac, geneve_vni_t vni) { + meta.nat_ingress_hit = true; meta.nat_ingress_tgt = target; meta.nat_inner_mac = inner_mac; meta.nat_geneve_vni = vni; - ipv6_ingress_counter.count(); + + ipv6_ingress_ctr.count(); } table ingress_ipv6 { @@ -373,111 +562,159 @@ control NatIngress ( actions = { forward_ipv6_to; } const size = IPV6_NAT_TABLE_SIZE; - counters = ipv6_ingress_counter; - } - - // The following 
actions and table are used to generate the final - // "length" field in the ipv4 pseudo header, which needs to be backed - // out of the inner udp/tcp checksums to find the residual for the - // packet body. This seems ludicrously complicated, but it's the only - // way I've found to do the calculation without running afoul of - // limitations in p4 and/or tofino, governing exactly how much work - // can be done in each stage and which PHV fields you are allowed - // to access. We are using the 'add' action to subtract the size of - // the IPv4 header. Why? Because the p4 compiler will let me add a - // parameter in an action, but will only let me subtract a constant. - // So, I can create a single action that will add the negative - // parameter I've manually computed, or I can create 11 actions, each - // of which will subtract a hard-coded constant. Either seems stupid, - // but here we are. - // XXX: find a less stupid solution - action invert() { - meta.l4_length = ~meta.l4_length; + counters = ipv6_ingress_ctr; } - action add(bit<16> a) { - meta.l4_length = hdr.ipv4.total_len + a; + action nat_only_port() { + meta.nat_ingress_port = true; + nat_only_ctr.count(); } - table ipv4_set_len { - key = { hdr.ipv4.ihl : exact; } - actions = { add; } + table nat_only { + key = { ig_intr_md.ingress_port : exact; } + actions = { nat_only_port; } + + const size = 256; + counters = nat_only_ctr; + } + + action mcast_forward_ipv4_to(ipv6_addr_t target, mac_addr_t inner_mac, geneve_vni_t vni) { + meta.nat_ingress_hit = true; + meta.nat_ingress_tgt = target; + meta.nat_inner_mac = inner_mac; + meta.nat_geneve_vni = vni; + mcast_ipv4_ingress_ctr.count(); + } + + // Separate table for IPv4 multicast packets that need to be encapsulated. + table ingress_ipv4_mcast { + key = { hdr.ipv4.dst_addr : exact; } + actions = { mcast_forward_ipv4_to; } + const size = IPV4_MULTICAST_TABLE_SIZE; + counters = mcast_ipv4_ingress_ctr; + } + + action mcast_forward_ipv6_to(ipv6_addr_t target, mac_addr_t inner_mac, geneve_vni_t vni) { + meta.nat_ingress_hit = true; + meta.nat_ingress_tgt = target; + meta.nat_inner_mac = inner_mac; + meta.nat_geneve_vni = vni; + + mcast_ipv6_ingress_ctr.count(); + } + + // Separate table for IPv6 multicast packets that need to be encapsulated. 
+ table ingress_ipv6_mcast { + key = { hdr.ipv6.dst_addr : exact; } + actions = { mcast_forward_ipv6_to; } + const size = IPV6_MULTICAST_TABLE_SIZE; + counters = mcast_ipv6_ingress_ctr; + } + + action set_icmp_dst_port() { + meta.l4_dst_port = hdr.icmp.data[31:16]; + } + + table icmp_dst_port { + key = { + hdr.icmp.isValid(): ternary; + hdr.icmp.type: ternary; + } + + actions = { + set_icmp_dst_port; + } const entries = { - (5) : add(0xffec); - (6) : add(0xffe8); - (7) : add(0xffe4); - (8) : add(0xffe0); - (9) : add(0xffdc); - (10) : add(0xffd8); - (11) : add(0xffd4); - (12) : add(0xffd0); - (13) : add(0xffcc); - (14) : add(0xffc8); - (15) : add(0xffc4); + ( true, ICMP_ECHO ) : set_icmp_dst_port; + ( true, ICMP_ECHOREPLY ) : set_icmp_dst_port; + ( true, ICMP6_ECHO ) : set_icmp_dst_port; + ( true, ICMP6_ECHOREPLY ) : set_icmp_dst_port; } - const size = 16; + const size = 4; } - action nat_only_port() { - meta.nat_ingress_port = true; - nat_only_counter.count(); + action set_inner_tcp() { + hdr.inner_tcp = hdr.tcp; + hdr.inner_tcp.setValid(); + hdr.tcp.setInvalid(); } - table nat_only { + action set_inner_udp() { + hdr.inner_udp = hdr.udp; + hdr.inner_udp.setValid(); + hdr.udp.setInvalid(); + } + + action set_inner_icmp() { + hdr.inner_icmp = hdr.icmp; + hdr.inner_icmp.setValid(); + hdr.icmp.setInvalid(); + } + + table ingress_hit { key = { - meta.in_port : exact; + meta.nat_ingress_hit : exact; + meta.nat_egress_hit : ternary; + meta.is_mcast : ternary; + meta.is_link_local_mcastv6 : ternary; + hdr.tcp.isValid() : ternary; + hdr.udp.isValid() : ternary; + hdr.icmp.isValid() : ternary; + } + actions = { + set_inner_tcp; + set_inner_udp; + set_inner_icmp; + NoAction; } - actions = { nat_only_port; } - const size = 256; - counters = nat_only_counter; + const entries = { + ( true, _, _, _, true, false, false ) : set_inner_tcp; + ( true, _, _, _, false, true, false ) : set_inner_udp; + ( true, _, _, _, false, false, true ) : set_inner_icmp; + ( true, _, _, _, _, _, _ ) : NoAction; + } + + const size = 8; } + apply { - // TODO ideally we would do this during parsing, but the Intel compiler - // throws a fit. 
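The `icmp_dst_port` table above takes over from the open-coded conditional removed just below: it captures the ICMP echo identifier, i.e. the upper 16 bits of the rest-of-header word, for use as a pseudo L4 destination port in NAT matching. A one-line Rust equivalent of that extraction (illustrative):

```rust
/// hdr.icmp.data[31:16] in the P4 above: the echo identifier is the upper
/// 16 bits of the 32-bit word that follows type/code/checksum.
fn icmp_echo_ident(rest_of_header: u32) -> u16 {
    (rest_of_header >> 16) as u16
}

fn main() {
    // identifier 0x1234, sequence number 0x0001
    assert_eq!(icmp_echo_ident(0x1234_0001), 0x1234);
}
```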
- if (hdr.icmp.isValid()) { - if( hdr.icmp.type == ICMP_ECHO || hdr.icmp.type == ICMP_ECHOREPLY || - hdr.icmp.type == ICMP6_ECHO || hdr.icmp.type == ICMP6_ECHOREPLY - ) { - meta.l4_dst_port = hdr.icmp.data[31:16]; + icmp_dst_port.apply(); + + // Note: This whole conditional could be simpler as a set of + // `const entries`, but apply (on tables) cannot be called from actions + if (hdr.ipv4.isValid() && meta.is_valid) { + if (meta.is_mcast) { + ingress_ipv4_mcast.apply(); + } else { + ingress_ipv4.apply(); } - } - if (hdr.ipv4.isValid()) { - ingress_ipv4.apply(); - } else if (hdr.ipv6.isValid()) { - ingress_ipv6.apply(); - } - - if (meta.nat_ingress) { - if (hdr.tcp.isValid()) { - hdr.inner_tcp = hdr.tcp; - hdr.inner_tcp.setValid(); - hdr.tcp.setInvalid(); - } else if (hdr.udp.isValid()) { - hdr.inner_udp = hdr.udp; - hdr.inner_udp.setValid(); - hdr.udp.setInvalid(); - } else if (hdr.icmp.isValid()) { - hdr.inner_icmp = hdr.icmp; - hdr.inner_icmp.setValid(); - hdr.icmp.setInvalid(); + } else if (hdr.ipv6.isValid() && meta.is_valid) { + // If this is a multicast packet and not a link-local multicast, + // we need to check the multicast table + if (meta.is_mcast && !meta.is_link_local_mcastv6) { + ingress_ipv6_mcast.apply(); + } else { + ingress_ipv6.apply(); } + } + if (ingress_hit.apply().hit) { if (hdr.ipv4.isValid()) { - ipv4_set_len.apply(); - invert(); + CalculateIPv4Len.apply(hdr, meta); encap_ipv4(); } else if (hdr.ipv6.isValid()) { encap_ipv6(); } + if (hdr.vlan.isValid()) { // When setting up the inner headers above, we // copied the ether type from the outer to // the inner. If this is a vlan packet, we - actually want the ethertype of the payload. + actually want the ethertype of the payload hdr.inner_eth.ether_type = hdr.vlan.ether_type; hdr.vlan.setInvalid(); } @@ -512,6 +749,7 @@ control NatEgress ( // Should never be valid for outbound traffic, but no harm // in being careful.
hdr.geneve_opts.ox_external_tag.setInvalid(); + hdr.geneve_opts.ox_mcast_tag.setInvalid(); } action decap_ipv4() { @@ -521,13 +759,6 @@ control NatEgress ( hdr.inner_ipv4.setInvalid(); } - action decap_ipv6() { - hdr.ethernet.ether_type = ETHERTYPE_IPV6; - hdr.ipv6 = hdr.inner_ipv6; - hdr.ipv6.setValid(); - hdr.inner_ipv6.setInvalid(); - } - action decap_tcp() { hdr.tcp = hdr.inner_tcp; hdr.tcp.setValid(); @@ -540,6 +771,13 @@ control NatEgress ( hdr.inner_udp.setInvalid(); } + action decap_ipv6() { + hdr.ethernet.ether_type = ETHERTYPE_IPV6; + hdr.ipv6 = hdr.inner_ipv6; + hdr.ipv6.setValid(); + hdr.inner_ipv6.setInvalid(); + } + action decap_icmp() { hdr.icmp = hdr.inner_icmp; hdr.icmp.setValid(); @@ -584,11 +822,11 @@ control NatEgress ( table nat_egress { key = { - hdr.inner_ipv4.isValid(): exact; - hdr.inner_ipv6.isValid(): exact; - hdr.inner_tcp.isValid(): exact; - hdr.inner_udp.isValid(): exact; - hdr.inner_icmp.isValid(): exact; + hdr.inner_ipv4.isValid() : exact; + hdr.inner_ipv6.isValid() : exact; + hdr.inner_tcp.isValid() : exact; + hdr.inner_udp.isValid() : exact; + hdr.inner_icmp.isValid() : exact; } actions = { @@ -615,112 +853,84 @@ control NatEgress ( } apply { - if (meta.nat_egress) { + if (meta.nat_egress_hit) { nat_egress.apply(); } } } -struct route6_result_t { - ipv6_addr_t nexthop; - PortId_t port; - bool is_hit; -} - control RouterLookup6( inout sidecar_headers_t hdr, - inout sidecar_ingress_meta_t meta, out route6_result_t res ) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) counter; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; action unreachable() { res.port = 0; res.nexthop = 0; res.is_hit = false; - counter.count(); + ctr.count(); } action forward_vlan(PortId_t port, ipv6_addr_t nexthop, bit<12> vlan_id) { + hdr.vlan.setValid(); + hdr.vlan.pcp = 0; hdr.vlan.dei = 0; hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; - hdr.vlan.setValid(); hdr.ethernet.ether_type = ETHERTYPE_VLAN; res.port = port; res.nexthop = nexthop; res.is_hit = true; - counter.count(); + ctr.count(); } action forward(PortId_t port, ipv6_addr_t nexthop) { res.port = port; res.nexthop = nexthop; res.is_hit = true; - counter.count(); + ctr.count(); } table tbl { key = { hdr.ipv6.dst_addr: lpm; } actions = { forward; forward_vlan; unreachable; } default_action = unreachable; - const size = IPV6_LPM_SIZE; - counters = counter; + // The table size is incremented by one here just to allow the + // integration tests to pass, as this is used by the multicast + // implementation as well + const size = IPV6_LPM_SIZE + 1; + counters = ctr; } apply { tbl.apply(); } } -struct route4_result_t { - /* - * The result of the multistage route selection process is an egress - * port and a nexthop address - */ - ipv4_addr_t nexthop; - PortId_t port; - - /* Did we successfully look up the route in the table? */ - bool is_hit; - - /* - * A hash of the (address,port) fields, which is used to choose between - * multiple potential routes. 
- */ - bit<8> hash; - - /* Index into the target table of the first potential route */ - bit<16> idx; - /* Number of consecutive slots containing potential routes */ - bit<8> slots; - /* Which of those routes we should select, based the flow hash */ - bit<16> slot; -} - control RouterLookupIndex4( inout sidecar_headers_t hdr, - inout sidecar_ingress_meta_t meta, inout route4_result_t res ) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_counter; - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_counter; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_ctr; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_ctr; action forward_vlan(PortId_t port, ipv4_addr_t nexthop, bit<12> vlan_id) { + hdr.vlan.setValid(); + hdr.vlan.pcp = 0; hdr.vlan.dei = 0; hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; - hdr.vlan.setValid(); hdr.ethernet.ether_type = ETHERTYPE_VLAN; res.port = port; res.nexthop = nexthop; - forward_counter.count(); + forward_ctr.count(); } action forward(PortId_t port, ipv4_addr_t nexthop) { res.port = port; res.nexthop = nexthop; - forward_counter.count(); + forward_ctr.count(); } /* @@ -733,7 +943,7 @@ control RouterLookupIndex4( key = { res.idx: exact; } actions = { forward; forward_vlan; } const size = IPV4_LPM_SIZE - 1; - counters = forward_counter; + counters = forward_ctr; } action unreachable() { @@ -743,14 +953,14 @@ control RouterLookupIndex4( res.slot = 0; res.port = 0; res.nexthop = 0; - index_counter.count(); + index_ctr.count(); } /* * The select_route table contains 2048 pre-computed entries. * It lives in another file just to keep this one manageable. */ -#include + #include action index(bit<16> idx, bit<8> slots) { res.is_hit = true; @@ -763,7 +973,7 @@ control RouterLookupIndex4( // entry `res.idx`. res.port = 0; res.nexthop = 0; - index_counter.count(); + index_ctr.count(); } table lookup { @@ -771,15 +981,12 @@ control RouterLookupIndex4( actions = { index; unreachable; } default_action = unreachable; const size = IPV4_LPM_SIZE; - counters = index_counter; + counters = index_ctr; } apply { - /* - * If the route exists, find the index of its first target in - * the target table. - */ lookup.apply(); + if (res.is_hit) { /* * Select which of the possible targets to use for this @@ -799,36 +1006,36 @@ control Arp ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) counter; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; action drop() { ig_dprsr_md.drop_ctl = 1; // This happens if we have explicitly added an ipv4 -> NULL_MAC // entry. 
meta.drop_reason = DROP_ARP_NULL; - counter.count(); + ctr.count(); } action rewrite(mac_addr_t dst_mac) { hdr.ethernet.dst_mac = dst_mac; - counter.count(); + ctr.count(); } action request() { hdr.sidecar.sc_code = SC_ARP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>) meta.in_port; - hdr.sidecar.sc_egress = (bit<16>) ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>) meta.nexthop_ipv4; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv4; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.routed = false; meta.service_routed = true; meta.drop_reason = DROP_ARP_MISS; ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - counter.count(); + ctr.count(); } table tbl { @@ -836,7 +1043,7 @@ control Arp ( actions = { drop; request; rewrite; } default_action = request; const size = IPV4_ARP_SIZE; - counters = counter; + counters = ctr; } apply { tbl.apply(); } @@ -846,36 +1053,36 @@ control Ndp ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) counter; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; action drop() { ig_dprsr_md.drop_ctl = 1; // This happens if we have explicitly added an ipv6 -> NULL_MAC // entry. meta.drop_reason = DROP_NDP_NULL; - counter.count(); + ctr.count(); } action rewrite(mac_addr_t dst_mac) { hdr.ethernet.dst_mac = dst_mac; - counter.count(); + ctr.count(); } action request() { hdr.sidecar.sc_code = SC_NEIGHBOR_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>) meta.in_port; - hdr.sidecar.sc_egress = (bit<16>) ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>) meta.nexthop_ipv6; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv6; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.routed = false; meta.service_routed = true; meta.drop_reason = DROP_NDP_MISS; ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - counter.count(); + ctr.count(); } table tbl { @@ -883,7 +1090,7 @@ control Ndp ( actions = { drop; rewrite; request; } default_action = request; const size = IPV6_NEIGHBOR_SIZE; - counters = counter; + counters = ctr; } apply { tbl.apply(); } @@ -893,21 +1100,20 @@ control Router4 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { RouterLookupIndex4() lookup_idx; - Hash>(HashAlgorithm_t.CRC8) index_hash; action icmp_error(bit<8> type, bit<8> code) { hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>) meta.in_port; - hdr.sidecar.sc_egress = (bit<16>) ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.routed = false; 
meta.service_routed = true; ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; } @@ -917,10 +1123,6 @@ control Router4 ( ig_dprsr_md.drop_ctl = 1; meta.drop_reason = DROP_IPV4_CHECKSUM_ERR; return; - } else if (hdr.ipv4.ttl == 0) { - ig_dprsr_md.drop_ctl = 1; - meta.drop_reason = DROP_IPV4_TTL_INVALID; - return; } route4_result_t fwd; @@ -937,7 +1139,8 @@ control Router4 ( meta.l4_src_port }); - lookup_idx.apply(hdr, meta, fwd); + lookup_idx.apply(hdr, fwd); + if (!fwd.is_hit) { icmp_error(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); meta.drop_reason = DROP_IPV4_UNROUTEABLE; @@ -949,43 +1152,119 @@ control Router4 ( ig_tm_md.ucast_egress_port = fwd.port; meta.nexthop_ipv4 = fwd.nexthop; - meta.routed = true; - Arp.apply(hdr, meta, ig_dprsr_md, ig_tm_md); + Arp.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); } } } -control Router6 ( +control MulticastRouter4( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { - RouterLookup6() lookup; + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; action icmp_error(bit<8> type, bit<8> code) { hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>) meta.in_port; - hdr.sidecar.sc_egress = (bit<16>) ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.routed = false; meta.service_routed = true; ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; } + action unreachable() { + ctr.count(); + } + + action forward_vlan(bit<12> vlan_id) { + hdr.vlan.setValid(); + + hdr.vlan.pcp = 0; + hdr.vlan.dei = 0; + hdr.vlan.vlan_id = vlan_id; + hdr.vlan.ether_type = hdr.ethernet.ether_type; + hdr.ethernet.ether_type = ETHERTYPE_VLAN; + ctr.count(); + } + + action forward() { + ctr.count(); + } + + table tbl { + key = { + hdr.ipv4.dst_addr : exact; + } + actions = { forward; forward_vlan; unreachable; } + default_action = unreachable; + const size = IPV4_MULTICAST_TABLE_SIZE; + counters = ctr; + } + apply { - if (hdr.ipv6.hop_limit == 0) { + if (meta.ipv4_checksum_err) { ig_dprsr_md.drop_ctl = 1; - meta.drop_reason = DROP_IPV6_TTL_INVALID; + meta.drop_reason = DROP_IPV4_CHECKSUM_ERR; return; } + // If the packet came in with a VLAN tag, we need to invalidate + // the VLAN header before we do the lookup. The VLAN header + // will be re-attached if set in the forward_vlan action. 
+ if (hdr.vlan.isValid()) { + hdr.ethernet.ether_type = hdr.vlan.ether_type; + hdr.vlan.setInvalid(); + } + + if (!tbl.apply().hit) { + icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + meta.drop_reason = DROP_IPV6_UNROUTEABLE; + } else if (hdr.ipv4.ttl == 1 && !meta.service_routed) { + icmp_error(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV4_TTL_INVALID; + } else { + // Set the destination port to an invalid value + ig_tm_md.ucast_egress_port = (PortId_t)0x1ff; + + if (hdr.ipv4.isValid()) { + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + } + } + } +} + +control Router6 ( + inout sidecar_headers_t hdr, + inout sidecar_ingress_meta_t meta, + inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md +) { + RouterLookup6() lookup; + + action icmp_error(bit<8> type, bit<8> code) { + hdr.sidecar.sc_code = SC_ICMP_NEEDED; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; + hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; + hdr.sidecar.setValid(); + hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; + meta.service_routed = true; + ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; + } + + apply { route6_result_t fwd; fwd.nexthop = 0; - lookup.apply(hdr, meta, fwd); + lookup.apply(hdr, fwd); + if (!fwd.is_hit) { icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); meta.drop_reason = DROP_IPV6_UNROUTEABLE; @@ -996,41 +1275,119 @@ control Router6 ( ig_tm_md.ucast_egress_port = fwd.port; hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; meta.nexthop_ipv6 = fwd.nexthop; - meta.routed = true; - Ndp.apply(hdr, meta, ig_dprsr_md, ig_tm_md); + Ndp.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); } } } -control L3Router( +control MulticastRouter6 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { - apply { - if (hdr.ipv4.isValid()) { - Router4.apply(hdr, meta, ig_dprsr_md, ig_tm_md); - } else if (hdr.ipv6.isValid()) { - Router6.apply(hdr, meta, ig_dprsr_md, ig_tm_md); - } - } -} + DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; -control MacRewrite( - inout sidecar_headers_t hdr, - in sidecar_ingress_meta_t meta, - in PortId_t port) -{ - action rewrite(mac_addr_t mac) { - hdr.ethernet.src_mac = mac; + action icmp_error(bit<8> type, bit<8> code) { + hdr.sidecar.sc_code = SC_ICMP_NEEDED; + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; + hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; + hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; + hdr.sidecar.setValid(); + hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; + meta.service_routed = true; + ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; + } + + action unreachable() { + ctr.count(); + } + + action forward_vlan(bit<12> vlan_id) { + hdr.vlan.setValid(); + hdr.vlan.pcp = 0; + hdr.vlan.dei = 0; + hdr.vlan.vlan_id = vlan_id; + hdr.vlan.ether_type = hdr.ethernet.ether_type; + hdr.ethernet.ether_type = ETHERTYPE_VLAN; + ctr.count(); + } + + action forward() { + ctr.count(); + } + + table tbl { + key = { + hdr.ipv6.dst_addr : exact; + } + actions = { forward; forward_vlan; unreachable; } + default_action = unreachable; + const size = 
IPV6_MULTICAST_TABLE_SIZE; + counters = ctr; + } + + apply { + // If the packet came in with a VLAN tag, we need to invalidate + // the VLAN header before we do the lookup. The VLAN header + // will be re-attached if set in the forward_vlan action. + if (hdr.vlan.isValid()) { + hdr.ethernet.ether_type = hdr.vlan.ether_type; + hdr.vlan.setInvalid(); + } + + if (!tbl.apply().hit) { + icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + meta.drop_reason = DROP_IPV6_UNROUTEABLE; + } else if (hdr.ipv6.hop_limit == 1) { + icmp_error(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; + } else { + // Set the destination port to an invalid value + ig_tm_md.ucast_egress_port = (PortId_t)0x1ff; + hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; + } + } +} + +control L3Router( + inout sidecar_headers_t hdr, + inout sidecar_ingress_meta_t meta, + inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md +) { + apply { + if (hdr.ipv4.isValid()) { + if (meta.is_mcast && !meta.is_link_local_mcastv6) { + MulticastRouter4.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } else { + Router4.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } + } else if (hdr.ipv6.isValid()) { + if (meta.is_mcast && !meta.is_link_local_mcastv6) { + MulticastRouter6.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } else { + Router6.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } + } + } +} + +control MacRewrite( + inout sidecar_headers_t hdr, + in PortId_t port +) { + action rewrite(mac_addr_t mac) { + hdr.ethernet.src_mac = mac; } table mac_rewrite { key = { port: exact ; } - actions = { rewrite; NoAction; } + actions = { rewrite; } - default_action = NoAction; const size = 256; } @@ -1039,6 +1396,493 @@ control MacRewrite( } } +/* This control is used to rewrite the source and destination MAC addresses + * for multicast packets. The destination MAC address is derived from the + * destination IP address, and the source MAC address is set based on the + * egress port the packet is being sent out on. + */ +control MulticastMacRewrite( + inout sidecar_headers_t hdr, + in PortId_t port +) { + action rewrite(mac_addr_t mac) { + hdr.ethernet.src_mac = mac; + } + + table mac_rewrite { + key = { port: exact ; } + actions = { rewrite; } + + const size = 256; + } + + apply { + if (mac_rewrite.apply().hit) { + // Derive the destination MAC based on IP type. 
+ // IPV4: https://www.rfc-editor.org/rfc/rfc1112.html#section-6.4
+ // IPV6: https://www.rfc-editor.org/rfc/rfc2464.html
+ if (hdr.ipv4.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv4.isValid())) {
+ // IPv4 multicast MAC address (01:00:5e + low 23 bits of IP)
+ bit<48> mcast_mac = 0;
+ // Set the first three bytes to 01:00:5e (0x01005e)
+ mcast_mac = (bit<48>)0x01005e << 24;
+
+ bit<24> ip_suffix;
+ // Take the low 23 bits of the IPv4 address and append them.
+ // The mask clears the upper bits so only 23 bits remain.
+ if (hdr.ipv4.isValid()) {
+ ip_suffix = (bit<24>)(hdr.ipv4.dst_addr & 0x007fffff);
+ } else {
+ ip_suffix = (bit<24>)(hdr.inner_ipv4.dst_addr & 0x007fffff);
+ }
+
+ hdr.ethernet.dst_mac = mcast_mac | ((bit<48>)ip_suffix);
+ } else if (hdr.ipv6.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv6.isValid())) {
+ // IPv6 multicast MAC address (33:33 + last 32 bits of IPv6)
+ bit<48> mcast_mac = 0;
+ // Set the first two bytes to 33:33
+ mcast_mac = (bit<48>)0x3333 << 32;
+
+ bit<48> ip_suffix;
+ // Take the last 32 bits of the IPv6 address and append them
+ if (hdr.ipv6.isValid()) {
+ ip_suffix = (bit<48>)(hdr.ipv6.dst_addr[31:0]);
+ } else {
+ ip_suffix = (bit<48>)(hdr.inner_ipv6.dst_addr[31:0]);
+ }
+
+ hdr.ethernet.dst_mac = mcast_mac | ip_suffix;
+ }
+ }
+ }
+}
+
+/* This control is used to configure multicast packets for replication.
+ * It includes actions for dropping packets with no group, allowing
+ * source-specific multicast, and configuring multicast group IDs and hashes.
+ */
+control MulticastIngress (
+ inout sidecar_headers_t hdr,
+ inout sidecar_ingress_meta_t meta,
+ inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md,
+ in ingress_intrinsic_metadata_t ig_intr_md,
+ inout ingress_intrinsic_metadata_for_tm_t ig_tm_md)
+{
+ DirectCounter>(CounterType_t.PACKETS_AND_BYTES) mcast_ipv6_ctr;
+ DirectCounter>(CounterType_t.PACKETS_AND_BYTES) mcast_ipv4_ssm_ctr;
+ DirectCounter>(CounterType_t.PACKETS_AND_BYTES) mcast_ipv6_ssm_ctr;
+
+ Hash>(HashAlgorithm_t.CRC16) mcast_hashv6_level1;
+ Hash>(HashAlgorithm_t.CRC16) mcast_hashv6_level2;
+
+ // Drop action for IPv4 multicast packets with no group.
+ //
+ // At this point, we should only allow replication for IPv6 packets that
+ // are admin-scoped before possible decapping.
+ action drop_mcastv4_no_group() {
+ ig_dprsr_md.drop_ctl = 1;
+ meta.drop_reason = DROP_MULTICAST_NO_GROUP;
+ }
+
+ // Drop action for IPv6 multicast packets with no group.
+ //
+ // At this point, we should only allow replication for IPv6 packets that
+ // are admin-scoped before possible decapping.
+ action drop_mcastv6_no_group() {
+ ig_dprsr_md.drop_ctl = 1;
+ meta.drop_reason = DROP_MULTICAST_NO_GROUP;
+ }
+
+ // Drop action for admin-scoped IPv6 multicast packets that have no
+ // configured group.
+ action drop_mcastv6_admin_scoped_no_group() {
+ ig_dprsr_md.drop_ctl = 1;
+ meta.drop_reason = DROP_MULTICAST_NO_GROUP;
+ mcast_ipv6_ctr.count();
+ }
+
+ // Drop action for IPv4 multicast packets with no source-specific multicast
+ // group.
+ action drop_mcastv4_filtered_source() {
+ ig_dprsr_md.drop_ctl = 1;
+ meta.drop_reason = DROP_MULTICAST_SOURCE_FILTERED;
+ mcast_ipv4_ssm_ctr.count();
+ }
+
+ // Drop action for IPv6 multicast packets with no source-specific multicast
+ // group.
+ action drop_mcastv6_filtered_source() { + ig_dprsr_md.drop_ctl = 1; + meta.drop_reason = DROP_MULTICAST_SOURCE_FILTERED; + mcast_ipv6_ssm_ctr.count(); + } + + action allow_source_mcastv4() { + // Source is allowed for source-specific multicast + meta.allow_source_mcast = true; + mcast_ipv4_ssm_ctr.count(); + } + + action allow_source_mcastv6() { + // Source is allowed for source-specific multicast + meta.allow_source_mcast = true; + mcast_ipv6_ssm_ctr.count(); + } + + // Configure IPv6 multicast replication with bifurcated design: + // mcast_grp_a: external/customer replication group + // mcast_grp_b: underlay/infrastructure replication group + action configure_mcastv6( + MulticastGroupId_t mcast_grp_a, + MulticastGroupId_t mcast_grp_b, + bit<16> rid, + bit<16> level1_excl_id, + bit<9> level2_excl_id + ) { + ig_tm_md.mcast_grp_a = mcast_grp_a; + ig_tm_md.mcast_grp_b = mcast_grp_b; + ig_tm_md.rid = rid; + ig_tm_md.level1_exclusion_id = level1_excl_id; + ig_tm_md.level2_exclusion_id = level2_excl_id; + + // Set multicast hash based on IPv6 packet fields + ig_tm_md.level1_mcast_hash = (bit<13>)mcast_hashv6_level1.get({ + hdr.ipv6.src_addr, + hdr.ipv6.dst_addr, + hdr.ipv6.next_hdr, + meta.l4_src_port, + meta.l4_dst_port + }); + + // Set secondary multicast hash based on IPv6 packet fields + ig_tm_md.level2_mcast_hash = (bit<13>)mcast_hashv6_level2.get({ + hdr.ipv6.flow_label, + ig_intr_md.ingress_port + }); + + mcast_ipv6_ctr.count(); + } + + table mcast_source_filter_ipv4 { + key = { + hdr.inner_ipv4.src_addr: lpm; + hdr.inner_ipv4.dst_addr: exact; + } + actions = { + allow_source_mcastv4; + drop_mcastv4_filtered_source; + } + default_action = drop_mcastv4_filtered_source; + const size = IPV4_MULTICAST_TABLE_SIZE; + counters = mcast_ipv4_ssm_ctr; + } + + table mcast_replication_ipv6 { + key = { hdr.ipv6.dst_addr: exact; } + actions = { + configure_mcastv6; + drop_mcastv6_admin_scoped_no_group; + } + default_action = drop_mcastv6_admin_scoped_no_group; + const size = IPV6_MULTICAST_TABLE_SIZE; + counters = mcast_ipv6_ctr; + } + + table mcast_source_filter_ipv6 { + key = { + hdr.inner_ipv6.src_addr: exact; + hdr.inner_ipv6.dst_addr: exact; + } + actions = { + allow_source_mcastv6; + drop_mcastv6_filtered_source; + } + default_action = drop_mcastv6_filtered_source; + const size = IPV6_MULTICAST_TABLE_SIZE; + counters = mcast_ipv6_ssm_ctr; + } + + action invalidate_external_grp() { + invalidate(ig_tm_md.mcast_grp_a); + } + + action invalidate_underlay_grp() { + invalidate(ig_tm_md.mcast_grp_b); + } + + action invalidate_grps() { + invalidate_external_grp(); + invalidate_underlay_grp(); + } + + action invalidate_underlay_grp_and_set_decap() { + invalidate_underlay_grp(); + meta.nat_egress_hit = true; + } + + table mcast_tag_check { + key = { + ig_tm_md.mcast_grp_a : ternary; + ig_tm_md.mcast_grp_b : ternary; + hdr.geneve.isValid() : ternary; + hdr.geneve_opts.ox_mcast_tag.isValid() : ternary; + hdr.geneve_opts.ox_mcast_tag.mcast_tag : ternary; + } + actions = { + invalidate_external_grp; + invalidate_underlay_grp; + invalidate_underlay_grp_and_set_decap; + invalidate_grps; + NoAction; + } + + const entries = { + ( _, _, true, true, MULTICAST_TAG_EXTERNAL ) : invalidate_underlay_grp_and_set_decap; + ( _, _, true, true, MULTICAST_TAG_UNDERLAY ) : invalidate_external_grp; + ( _, _, true, true, MULTICAST_TAG_UNDERLAY_EXTERNAL ) : NoAction; + ( 0, _, _, _, _ ) : invalidate_external_grp; + ( _, 0, _, _, _ ) : invalidate_underlay_grp; + ( 0, 0, _, _, _ ) : invalidate_grps; + } + + const size = 6; + } + 
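The destination MAC derived by the MulticastMacRewrite control above is the standard IP-to-MAC mapping from RFC 1112 section 6.4 (IPv4: 01:00:5e plus the low 23 bits of the group address) and RFC 2464 (IPv6: 33:33 plus the low 32 bits). A minimal Rust sketch of the same arithmetic, using plain u64 values purely for illustration rather than anything from this codebase:

fn ipv4_mcast_mac(dst: u32) -> u64 {
    // RFC 1112 section 6.4: 01:00:5e followed by the low 23 bits of the group address.
    let prefix: u64 = 0x01_00_5e << 24; // 01:00:5e:00:00:00
    prefix | u64::from(dst & 0x007f_ffff) // keep only the low 23 bits
}

fn ipv6_mcast_mac(dst_low32: u32) -> u64 {
    // RFC 2464: 33:33 followed by the low 32 bits of the group address.
    let prefix: u64 = 0x33_33 << 32; // 33:33:00:00:00:00
    prefix | u64::from(dst_low32)
}

fn main() {
    // 224.0.0.251 (mDNS) maps to 01:00:5e:00:00:fb.
    assert_eq!(ipv4_mcast_mac(0xe000_00fb), 0x01_00_5e_00_00_fb);
    // ff02::fb maps to 33:33:00:00:00:fb.
    assert_eq!(ipv6_mcast_mac(0x0000_00fb), 0x33_33_00_00_00_fb);
}

Because only 23 bits of the IPv4 address survive, 32 distinct group addresses share each MAC; the P4 code inherits that ambiguity from the RFC.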
+ // Note: SSM tables currently take one extra stage in the pipeline (17->18). + apply { + if (hdr.geneve.isValid() && hdr.inner_ipv4.isValid()) { + // Check if the inner destination address is an IPv4 SSM multicast + // address. + if (hdr.inner_ipv4.dst_addr[31:24] == 8w0xe8) { + mcast_source_filter_ipv4.apply(); + } else { + meta.allow_source_mcast = true; + } + } else if (hdr.geneve.isValid() && hdr.inner_ipv6.isValid()) { + // Check if the inner destination address is an IPv6 SSM multicast + // address. + if ((hdr.inner_ipv6.dst_addr[127:120] == 8w0xff) + && ((hdr.inner_ipv6.dst_addr[119:116] == 4w0x3))) { + mcast_source_filter_ipv6.apply(); + } else { + meta.allow_source_mcast = true; + } + } else if (hdr.ipv4.isValid()) { + drop_mcastv4_no_group(); + } else if (hdr.ipv6.isValid()) { + drop_mcastv6_no_group(); + } + + if (hdr.ipv6.isValid() && meta.allow_source_mcast) { + mcast_replication_ipv6.apply(); + mcast_tag_check.apply(); + } + } +} + + +/* This control is used to configure the egress port for multicast packets. + * It includes actions for setting the decap ports bitmap and VLAN ID + * (if necessary), as well as stripping headers and decrementing TTL or hop + * limit. + */ +control MulticastEgress ( + inout sidecar_headers_t hdr, + inout sidecar_egress_meta_t meta, + in egress_intrinsic_metadata_t eg_intr_md, + in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md +) { + + action set_decap_ports( + bit<32> ports_0, bit<32> ports_1, bit<32> ports_2, bit<32> ports_3, + bit<32> ports_4, bit<32> ports_5, bit<32> ports_6, bit<32> ports_7) { + + // Store the decap port configuration in metadata + meta.decap_ports_0 = ports_0; + meta.decap_ports_1 = ports_1; + meta.decap_ports_2 = ports_2; + meta.decap_ports_3 = ports_3; + meta.decap_ports_4 = ports_4; + meta.decap_ports_5 = ports_5; + meta.decap_ports_6 = ports_6; + meta.decap_ports_7 = ports_7; + } + + action set_decap_ports_and_vlan( + bit<32> ports_0, bit<32> ports_1, bit<32> ports_2, bit<32> ports_3, + bit<32> ports_4, bit<32> ports_5, bit<32> ports_6, bit<32> ports_7, + bit<12> vlan_id) { + + set_decap_ports(ports_0, ports_1, ports_2, ports_3, + ports_4, ports_5, ports_6, ports_7); + + meta.vlan_id = vlan_id; + } + + + table mcast_tag_check { + key = { + hdr.ipv6.isValid(): exact; + hdr.ipv6.dst_addr: ternary; + hdr.geneve.isValid(): exact; + hdr.geneve_opts.ox_mcast_tag.isValid(): exact; + hdr.geneve_opts.ox_mcast_tag.mcast_tag: exact; + } + + actions = { NoAction; } + + const entries = { + // Admin-local (scope value 4): Matches IPv6 multicast addresses + // with scope ff04::/16 + ( true, IPV6_ADMIN_LOCAL_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; + // Site-local (scope value 5): Matches IPv6 multicast addresses with + // scope ff05::/16 + ( true, IPV6_SITE_LOCAL_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; + // Organization-local (scope value 8): Matches IPv6 multicast + // addresses with scope ff08::/16 + ( true, IPV6_ORG_SCOPE_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; + // ULA (Unique Local Address): Matches IPv6 addresses that start + // with fc00::/7. This is not a multicast address, but it is used + // for other internal routing purposes. + ( true, IPV6_ULA_PATTERN &&& IPV6_ULA_MASK, true, true, 2 ) : NoAction; + } + + const size = 4; + } + + table tbl_decap_ports { + key = { + // Matches the `external` multicast group ID. 
+ eg_intr_md.egress_rid: exact; + } + + actions = { + set_decap_ports; + set_decap_ports_and_vlan; + } + + // Group RIDs == Group IPs + const size = IPV6_MULTICAST_TABLE_SIZE; + } + + action set_port_number(bit<8> port_number) { + meta.port_number = port_number; + } + + table asic_id_to_port { + key = { eg_intr_md.egress_port: exact; } + + actions = { set_port_number; } + + const size = 256; + } + + action strip_outer_header() { + hdr.inner_eth.setInvalid(); + hdr.ipv4.setInvalid(); + hdr.ipv6.setInvalid(); + hdr.tcp.setInvalid(); + hdr.udp.setInvalid(); + hdr.geneve.setInvalid(); + hdr.geneve_opts.ox_external_tag.setInvalid(); + hdr.geneve_opts.ox_mcast_tag.setInvalid(); + } + + #include + + action strip_vlan_header() { + hdr.vlan.setInvalid(); + } + + action decrement_ttl() { + hdr.inner_ipv4.ttl = hdr.inner_ipv4.ttl - 1; + } + + action decrement_hop_limit() { + hdr.inner_ipv6.hop_limit = hdr.inner_ipv6.hop_limit - 1; + } + + action modify_ipv4() { + strip_outer_header(); + strip_vlan_header(); + hdr.ethernet.ether_type = ETHERTYPE_IPV4; + decrement_ttl(); + } + + action modify_ipv6() { + strip_outer_header(); + strip_vlan_header(); + hdr.ethernet.ether_type = ETHERTYPE_IPV6; + decrement_hop_limit(); + } + + action modify_vlan_ipv4() { + strip_outer_header(); + + hdr.vlan.setValid(); + + hdr.vlan.pcp = 0; + hdr.vlan.dei = 0; + hdr.vlan.vlan_id = meta.vlan_id; + hdr.vlan.ether_type = ETHERTYPE_IPV4; + hdr.ethernet.ether_type = ETHERTYPE_VLAN; + + decrement_ttl(); + } + + action modify_vlan_ipv6() { + strip_outer_header(); + + hdr.vlan.setValid(); + + hdr.vlan.pcp = 0; + hdr.vlan.dei = 0; + hdr.vlan.vlan_id = meta.vlan_id; + hdr.vlan.ether_type = ETHERTYPE_IPV6; + hdr.ethernet.ether_type = ETHERTYPE_VLAN; + + decrement_hop_limit(); + } + + table modify_hdr { + key = { + meta.vlan_id: ternary; + hdr.inner_ipv4.isValid(): exact; + hdr.inner_ipv6.isValid(): exact; + } + + actions = { + modify_vlan_ipv4; + modify_vlan_ipv6; + modify_ipv4; + modify_ipv6; + } + + const entries = { + (0, true, false) : modify_ipv4(); + (0, false, true) : modify_ipv6(); + (_, true, false) : modify_vlan_ipv4(); + (_, false, true) : modify_vlan_ipv6(); + } + + const size = 4; + } + + apply { + if (mcast_tag_check.apply().hit) { + if (tbl_decap_ports.apply().hit) { + if (asic_id_to_port.apply().hit) { + port_bitmap_check.apply(); + } + if (meta.bitmap_result != 0) { + meta.ipv4_checksum_recalc = true; + modify_hdr.apply(); + } + } + } + } +} + control Ingress( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, @@ -1052,6 +1896,7 @@ control Ingress( NatIngress() nat_ingress; NatEgress() nat_egress; L3Router() l3_router; + MulticastIngress() mcast_ingress; MacRewrite() mac_rewrite; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) ingress_ctr; @@ -1061,66 +1906,62 @@ control Ingress( Counter, bit<10>>(1024, CounterType_t.PACKETS) packet_ctr; apply { - ingress_ctr.count(meta.in_port); + ingress_ctr.count(ig_intr_md.ingress_port); packet_ctr.count(meta.pkt_type); - filter.apply(hdr, meta, ig_dprsr_md); - nat_ingress.apply(hdr, meta, ig_dprsr_md); - services.apply(hdr, meta, ig_dprsr_md, ig_tm_md); + // Always apply the filter first, as it may drop packets + // that are not valid for the rest of the pipeline or tag metadata + // accordingly. + // + // Additionally, it sets the `meta.is_valid` flag to indicate + // whether the packet is valid for further processing. 
+ filter.apply(hdr, meta, ig_dprsr_md, ig_intr_md); + + if (meta.is_valid && !hdr.geneve.isValid()) { + nat_ingress.apply(hdr, meta, ig_intr_md); + } + + if (meta.is_valid && (!meta.is_mcast || meta.is_link_local_mcastv6)) { + services.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } - if (!meta.service_routed && ig_dprsr_md.drop_ctl == 0) { - nat_egress.apply(hdr, meta, ig_dprsr_md); - l3_router.apply(hdr, meta, ig_dprsr_md, ig_tm_md); + // We perform NAT ingress before multicast replication to ensure that + // the NAT'd outer address is used for multicast replication to inbound + // groups + if (meta.is_valid && meta.is_mcast && !meta.is_link_local_mcastv6) { + mcast_ingress.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); + } + + if (meta.is_valid && !meta.service_routed && ig_dprsr_md.drop_ctl == 0) { + if (hdr.geneve.isValid()) { + nat_egress.apply(hdr, meta, ig_dprsr_md); + } + l3_router.apply(hdr, meta, ig_dprsr_md, ig_intr_md, ig_tm_md); } if (meta.drop_reason != 0) { - drop_port_ctr.count(meta.in_port); + // Handle dropped packets + drop_port_ctr.count(ig_intr_md.ingress_port); drop_reason_ctr.count(meta.drop_reason); - } else if (!meta.multicast) { + } else if (!meta.is_mcast) { egress_ctr.count(ig_tm_md.ucast_egress_port); if (ig_tm_md.ucast_egress_port != USER_SPACE_SERVICE_PORT) { - mac_rewrite.apply(hdr, meta, ig_tm_md.ucast_egress_port); + mac_rewrite.apply(hdr, ig_tm_md.ucast_egress_port); } + meta.bridge_hdr.setInvalid(); ig_tm_md.bypass_egress = 1w1; } } } -// Includes the checksum for the original data, the geneve header, the -// outer udp header, and the outer ipv6 pseudo-header. -// NOTE: safe to include geneve ox_external_tag here as it is filled -// on nat_ingress, and nat_checksum is only computer on nat_ingress. -#define COMMON_FIELDS \ - meta.body_checksum, \ - hdr.inner_eth, \ - hdr.geneve, \ - hdr.geneve_opts.ox_external_tag, \ - hdr.udp.src_port, \ - hdr.udp.dst_port, \ - hdr.udp.hdr_length, \ - (bit<16>)hdr.ipv6.next_hdr, \ - hdr.ipv6.src_addr, \ - hdr.ipv6.dst_addr, \ - hdr.ipv6.payload_len - -// Includes the final bit of the inner ipv4 pseudo-header and the inner ipv4 -// header -#define IPV4_FIELDS \ - meta.l4_length, \ - hdr.inner_ipv4 - -// Includes the inner ipv6 header -#define IPV6_FIELDS \ - hdr.inner_ipv6 - control IngressDeparser(packet_out pkt, inout sidecar_headers_t hdr, in sidecar_ingress_meta_t meta, - in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) -{ - Checksum() ipv4_checksum; + in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md +) { Checksum() icmp_checksum; Checksum() nat_checksum; + Checksum() ipv4_checksum; apply { // The following code would be more naturally (and, one @@ -1132,12 +1973,12 @@ control IngressDeparser(packet_out pkt, // the logic as seen below somehow makes the independence // apparent to the compiler. - if (meta.nat_ingress && hdr.inner_ipv4.isValid() && + if (meta.nat_ingress_hit && hdr.inner_ipv4.isValid() && hdr.inner_udp.isValid()) { hdr.udp.checksum = nat_checksum.update({ COMMON_FIELDS, IPV4_FIELDS, hdr.inner_udp}); } - if (meta.nat_ingress && hdr.inner_ipv4.isValid() && + if (meta.nat_ingress_hit && hdr.inner_ipv4.isValid() && hdr.inner_tcp.isValid()) { hdr.udp.checksum = nat_checksum.update({ COMMON_FIELDS, IPV4_FIELDS, hdr.inner_tcp}); @@ -1145,19 +1986,19 @@ control IngressDeparser(packet_out pkt, /* COMPILER BUG: I cannot convince the tofino to compute this correctly. * Conveniently, we dont actually need it, see RFC 6935. 
* - * if (meta.nat_ingress && hdr.inner_ipv4.isValid() && + * if (meta.nat_ingress_hit && hdr.inner_ipv4.isValid() && * hdr.inner_icmp.isValid()) { * hdr.udp.checksum = nat_checksum.update({ * COMMON_FIELDS, IPV4_FIELDS, hdr.inner_icmp}); * } * */ - if (meta.nat_ingress && hdr.inner_ipv6.isValid() && + if (meta.nat_ingress_hit && hdr.inner_ipv6.isValid() && hdr.inner_udp.isValid()) { hdr.udp.checksum = nat_checksum.update({ COMMON_FIELDS, IPV6_FIELDS, hdr.inner_udp}); } - if (meta.nat_ingress && hdr.inner_ipv6.isValid() && + if (meta.nat_ingress_hit && hdr.inner_ipv6.isValid() && hdr.inner_tcp.isValid()) { hdr.udp.checksum = nat_checksum.update({ COMMON_FIELDS, IPV6_FIELDS, hdr.inner_tcp}); @@ -1165,7 +2006,7 @@ control IngressDeparser(packet_out pkt, /* COMPILER BUG: I cannot convince the tofino to compute this correctly. * Conveniently, we dont actually need it, see RFC 6935. * - * if (meta.nat_ingress && hdr.inner_ipv6.isValid() && + * if (meta.nat_ingress_hit && hdr.inner_ipv6.isValid() && * hdr.inner_icmp.isValid()) { * hdr.udp.checksum = nat_checksum.update({ * COMMON_FIELDS, IPV6_FIELDS, hdr.inner_icmp}); @@ -1191,22 +2032,11 @@ control IngressDeparser(packet_out pkt, }); } + pkt.emit(meta.bridge_hdr); pkt.emit(hdr); } } -struct sidecar_egress_meta_t {} - -parser EgressParser(packet_in pkt, - out sidecar_headers_t hdr, - out sidecar_egress_meta_t meta, - out egress_intrinsic_metadata_t eg_intr_md -) { - state start { - transition accept; - } -} - control Egress( inout sidecar_headers_t hdr, inout sidecar_egress_meta_t meta, @@ -1215,15 +2045,89 @@ control Egress( inout egress_intrinsic_metadata_for_deparser_t eg_dprsr_md, inout egress_intrinsic_metadata_for_output_port_t eg_oport_md ) { - apply { } + MulticastMacRewrite() mac_rewrite; + MulticastEgress() mcast_egress; + + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) link_local_mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) external_mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) underlay_mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS) drop_port_ctr; + Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; + + apply { + // Check multicast egress packets by checking that RID is not 0. + bool is_egress_rid_mcast = eg_intr_md.egress_rid > 0; + // We track IPv6 multicast packets separately for counters. 
+ bool is_ipv6_mcast = false; + if (hdr.ipv6.isValid()) { + bit<16> ipv6_prefix = (bit<16>)hdr.ipv6.dst_addr[127:112]; + is_ipv6_mcast = (ipv6_prefix != 16w0xff02); + } + bool is_mcast = is_egress_rid_mcast || is_ipv6_mcast; + + if (is_egress_rid_mcast == true) { + if (meta.bridge_hdr.ingress_port == eg_intr_md.egress_port) { + // If the ingress port is the same as the egress port, drop + // the packet + meta.drop_reason = DROP_MULTICAST_PATH_FILTERED; + eg_dprsr_md.drop_ctl = 1; + } else { + mcast_egress.apply(hdr, meta, eg_intr_md, eg_dprsr_md); + mac_rewrite.apply(hdr, eg_intr_md.egress_port); + } + } else if (eg_intr_md.egress_rid == 0 && eg_intr_md.egress_rid_first == 1) { + // Drop CPU copies (RID=0) to prevent unwanted packets on port 0 + eg_dprsr_md.drop_ctl = 1; + meta.drop_reason = DROP_MULTICAST_CPU_COPY; + } + + if (meta.drop_reason != 0) { + // Handle dropped packets + drop_port_ctr.count(eg_intr_md.egress_port); + drop_reason_ctr.count(meta.drop_reason); + } else if (is_mcast == true) { + mcast_ctr.count(eg_intr_md.egress_port); + + if (is_ipv6_mcast) { + link_local_mcast_ctr.count(eg_intr_md.egress_port); + } else if (hdr.geneve.isValid()) { + external_mcast_ctr.count(eg_intr_md.egress_port); + } else if (hdr.geneve.isValid() && + hdr.geneve_opts.ox_mcast_tag.isValid() && + hdr.geneve_opts.ox_mcast_tag.mcast_tag == MULTICAST_TAG_UNDERLAY) { + underlay_mcast_ctr.count(eg_intr_md.egress_port); + } + } + } } -control EgressDeparser(packet_out pkt, +control EgressDeparser( + packet_out pkt, inout sidecar_headers_t hdr, in sidecar_egress_meta_t meta, in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md ) { + Checksum() ipv4_checksum; + apply { + // We only need to recalculate the checksum if the packet is + // modified in the case of replication to both external and + // underlay multicast ports, as the TTL and hop limit + // are decremented if packets headed toward external multicast + // subscribers are decapped/stripped. + if (meta.ipv4_checksum_recalc && hdr.inner_ipv4.isValid()) { + hdr.inner_ipv4.hdr_checksum = ipv4_checksum.update({ + hdr.inner_ipv4.version, hdr.inner_ipv4.ihl, hdr.inner_ipv4.diffserv, + hdr.inner_ipv4.total_len, + hdr.inner_ipv4.identification, + hdr.inner_ipv4.flags, hdr.inner_ipv4.frag_offset, + hdr.inner_ipv4.ttl, hdr.inner_ipv4.protocol, + hdr.inner_ipv4.src_addr, + hdr.inner_ipv4.dst_addr + }); + } + pkt.emit(hdr); } } diff --git a/dpd/src/api_server.rs b/dpd/src/api_server.rs index b5994a4..46ec95b 100644 --- a/dpd/src/api_server.rs +++ b/dpd/src/api_server.rs @@ -39,6 +39,7 @@ use crate::fault::Fault; use crate::link::LinkFsmCounters; use crate::link::LinkId; use crate::link::LinkUpCounter; +use crate::mcast; use crate::oxstats; use crate::port_map::BackplaneLink; use crate::route::Ipv4Route; @@ -2551,6 +2552,10 @@ async fn reset_all( error!(switch.log, "failed to reset ipv6 nat table: {:?}", e); err = Some(e); } + if let Err(e) = mcast::reset(switch) { + error!(switch.log, "failed to reset multicast state: {:?}", e); + err = Some(e); + } match err { Some(e) => Err(e.into()), @@ -3061,6 +3066,296 @@ async fn counter_get( .map_err(HttpError::from) } +/// Used to identify a multicast group by IP address, the main +/// identifier for a multicast group. +#[derive(Deserialize, Serialize, JsonSchema)] +pub struct MulticastGroupIpParam { + pub group_ip: IpAddr, +} + +/// Used to identify a multicast group by ID. +/// +/// If not provided, it will return all multicast groups. 
+#[derive(Deserialize, Serialize, JsonSchema)] +pub struct MulticastGroupIdParam { + pub group_id: Option, +} + +/** + * Create an external-only multicast group configuration. + * + * External-only groups are used for IPv4 and non-admin-scoped IPv6 multicast + * traffic that doesn't require replication infrastructure. These groups use + * simple forwarding tables and require a NAT target. + */ +#[endpoint { + method = POST, + path = "/multicast/external-groups", +}] +async fn multicast_group_create_external( + rqctx: RequestContext>, + group: TypedBody, +) -> Result, HttpError> { + let switch: &Switch = rqctx.context(); + let entry = group.into_inner(); + + mcast::add_group_external(switch, entry) + .map(HttpResponseCreated) + .map_err(HttpError::from) +} + +/** + * Create an internal multicast group configuration. + * + * Internal groups are used for admin-scoped IPv6 multicast traffic that + * requires replication infrastructure. These groups support both external + * and underlay members with full replication capabilities. + */ +#[endpoint { + method = POST, + path = "/multicast/groups", +}] +async fn multicast_group_create( + rqctx: RequestContext>, + group: TypedBody, +) -> Result, HttpError> { + let switch: &Switch = rqctx.context(); + let entry = group.into_inner(); + + mcast::add_group_internal(switch, entry) + .map(HttpResponseCreated) + .map_err(HttpError::from) +} + +/** + * Delete a multicast group configuration by IP address. + */ +#[endpoint { + method = DELETE, + path = "/multicast/groups/{group_ip}", +}] +async fn multicast_group_delete( + rqctx: RequestContext>, + path: Path, +) -> Result { + let switch: &Switch = rqctx.context(); + let ip = path.into_inner().group_ip; + + mcast::del_group(switch, ip) + .map(|_| HttpResponseDeleted()) + .map_err(HttpError::from) +} + +/** + * Reset all multicast group configurations. + */ +#[endpoint { + method = DELETE, + path = "/multicast/groups", +}] +async fn multicast_reset( + rqctx: RequestContext>, +) -> Result { + let switch: &Switch = rqctx.context(); + + mcast::reset(switch) + .map(|_| HttpResponseDeleted()) + .map_err(HttpError::from) +} + +/** + * Get the multicast group configuration for a given group IP address. + */ +#[endpoint { + method = GET, + path = "/multicast/groups/{group_ip}", +}] +async fn multicast_group_get( + rqctx: RequestContext>, + path: Path, +) -> Result, HttpError> { + let switch: &Switch = rqctx.context(); + let ip = path.into_inner().group_ip; + + // Get the multicast group + mcast::get_group(switch, ip) + .map(HttpResponseOk) + .map_err(HttpError::from) +} + +/** + * Update an internal multicast group configuration for a given group IP address. + * + * Internal groups are used for admin-scoped IPv6 multicast traffic that + * requires replication infrastructure with external and underlay members. + */ +#[endpoint { + method = PUT, + path = "/multicast/groups/{group_ip}", +}] +async fn multicast_group_update( + rqctx: RequestContext>, + path: Path, + group: TypedBody, +) -> Result, HttpError> { + let switch: &Switch = rqctx.context(); + let ip = path.into_inner().group_ip; + + let ipv6 = match ip { + IpAddr::V6(ipv6) => ipv6, + IpAddr::V4(_) => { + return Err(HttpError::for_bad_request( + None, + "Internal multicast groups must use IPv6 addresses".to_string(), + )); + } + }; + + mcast::modify_group_internal(switch, ipv6, group.into_inner()) + .map(HttpResponseOk) + .map_err(HttpError::from) +} + +/** + * Update an external-only multicast group configuration for a given group IP address. 
+ * + * External-only groups are used for IPv4 and non-admin-scoped IPv6 multicast + * traffic that doesn't require replication infrastructure. + */ +#[endpoint { + method = PUT, + path = "/multicast/external-groups/{group_ip}", +}] +async fn multicast_group_update_external( + rqctx: RequestContext>, + path: Path, + group: TypedBody, +) -> Result, HttpError> { + let switch: &Switch = rqctx.context(); + let entry = group.into_inner(); + let ip = path.into_inner().group_ip; + + mcast::modify_group_external(switch, ip, entry) + .map(HttpResponseCreated) + .map_err(HttpError::from) +} + +/** + * List all multicast groups. + */ +#[endpoint { + method = GET, + path = "/multicast/groups", +}] +async fn multicast_groups_list( + rqctx: RequestContext>, + query_params: Query< + PaginationParams, + >, +) -> Result>, HttpError> +{ + let switch: &Switch = rqctx.context(); + + // If a group ID is provided, get the group by ID + + // If no group ID is provided, paginate through the groups + let pag_params = query_params.into_inner(); + let Ok(limit) = usize::try_from(rqctx.page_limit(&pag_params)?.get()) + else { + return Err(DpdError::Invalid("Invalid page limit".to_string()).into()); + }; + + let last_addr = match &pag_params.page { + WhichPage::First(..) => None, + WhichPage::Next(MulticastGroupIpParam { group_ip }) => Some(*group_ip), + }; + + let entries = mcast::get_range(switch, last_addr, limit, None); + + Ok(HttpResponseOk(ResultsPage::new( + entries, + &EmptyScanParams {}, + |e: &mcast::MulticastGroupResponse, _| MulticastGroupIpParam { + group_ip: e.ip(), + }, + )?)) +} + +/** + * List all multicast groups with a given tag. + */ +#[endpoint { + method = GET, + path = "/multicast/tags/{tag}", +}] +async fn multicast_groups_list_by_tag( + rqctx: RequestContext>, + path: Path, + query_params: Query< + PaginationParams, + >, +) -> Result>, HttpError> +{ + let switch: &Switch = rqctx.context(); + let tag = path.into_inner().tag; + + let pag_params = query_params.into_inner(); + let Ok(limit) = usize::try_from(rqctx.page_limit(&pag_params)?.get()) + else { + return Err(DpdError::Invalid("Invalid page limit".to_string()).into()); + }; + + let last_addr = match &pag_params.page { + WhichPage::First(..) => None, + WhichPage::Next(MulticastGroupIpParam { group_ip }) => Some(*group_ip), + }; + + let entries = mcast::get_range(switch, last_addr, limit, Some(&tag)); + Ok(HttpResponseOk(ResultsPage::new( + entries, + &EmptyScanParams {}, + |e: &mcast::MulticastGroupResponse, _| MulticastGroupIpParam { + group_ip: e.ip(), + }, + )?)) +} + +/** + * Delete all multicast groups (and associated routes) with a given tag. + */ +#[endpoint { + method = DELETE, + path = "/multicast/tags/{tag}", +}] +async fn multicast_reset_by_tag( + rqctx: RequestContext>, + path: Path, +) -> Result { + let switch: &Switch = rqctx.context(); + let tag = path.into_inner().tag; + + mcast::reset_tag(switch, &tag) + .map(|_| HttpResponseDeleted()) + .map_err(HttpError::from) +} + +/** + * Delete all multicast groups (and associated routes) without a tag. 
+ */ +#[endpoint { + method = DELETE, + path = "/multicast/untagged", +}] +async fn multicast_reset_untagged( + rqctx: RequestContext>, +) -> Result { + let switch: &Switch = rqctx.context(); + + mcast::reset_untagged(switch) + .map(|_| HttpResponseDeleted()) + .map_err(HttpError::from) +} + pub fn http_api() -> dropshot::ApiDescription> { let mut api = dropshot::ApiDescription::new(); api.register(build_info).unwrap(); @@ -3189,6 +3484,18 @@ pub fn http_api() -> dropshot::ApiDescription> { api.register(ipv4_nat_generation).unwrap(); api.register(ipv4_nat_trigger_update).unwrap(); + api.register(multicast_group_create).unwrap(); + api.register(multicast_group_create_external).unwrap(); + api.register(multicast_reset).unwrap(); + api.register(multicast_group_delete).unwrap(); + api.register(multicast_group_update).unwrap(); + api.register(multicast_group_update_external).unwrap(); + api.register(multicast_group_get).unwrap(); + api.register(multicast_groups_list).unwrap(); + api.register(multicast_groups_list_by_tag).unwrap(); + api.register(multicast_reset_by_tag).unwrap(); + api.register(multicast_reset_untagged).unwrap(); + #[cfg(feature = "tofino_asic")] crate::tofino_api_server::init(&mut api); #[cfg(feature = "softnpu")] diff --git a/dpd/src/counters.rs b/dpd/src/counters.rs index a4c673e..280b81b 100644 --- a/dpd/src/counters.rs +++ b/dpd/src/counters.rs @@ -52,6 +52,7 @@ enum CounterId { Service, Ingress, Egress, + Multicast, Packet, DropPort, DropReason, @@ -77,7 +78,7 @@ struct CounterDescription { p4_name: &'static str, } -const COUNTERS: [CounterDescription; 6] = [ +const COUNTERS: [CounterDescription; 12] = [ CounterDescription { id: CounterId::Service, client_name: "Service", @@ -88,26 +89,56 @@ const COUNTERS: [CounterDescription; 6] = [ client_name: "Ingress", p4_name: "pipe.Ingress.ingress_ctr", }, - CounterDescription { - id: CounterId::Egress, - client_name: "Egress", - p4_name: "pipe.Ingress.egress_ctr", - }, CounterDescription { id: CounterId::Packet, client_name: "Packet", p4_name: "pipe.Ingress.packet_ctr", }, + CounterDescription { + id: CounterId::Egress, + client_name: "Egress", + p4_name: "pipe.Ingress.egress_ctr", + }, CounterDescription { id: CounterId::DropPort, - client_name: "Drop_Port", + client_name: "Ingress_Drop_Port", p4_name: "pipe.Ingress.drop_port_ctr", }, CounterDescription { id: CounterId::DropReason, - client_name: "Drop_Reason", + client_name: "Ingress_Drop_Reason", p4_name: "pipe.Ingress.drop_reason_ctr", }, + CounterDescription { + id: CounterId::DropPort, + client_name: "Egress_Drop_Port", + p4_name: "pipe.Egress.drop_port_ctr", + }, + CounterDescription { + id: CounterId::DropReason, + client_name: "Egress_Drop_Reason", + p4_name: "pipe.Egress.drop_reason_ctr", + }, + CounterDescription { + id: CounterId::Multicast, + client_name: "Multicast", + p4_name: "pipe.Egress.mcast_ctr", + }, + CounterDescription { + id: CounterId::Multicast, + client_name: "Multicast_Link_Local", + p4_name: "pipe.Egress.link_local_mcast_ctr", + }, + CounterDescription { + id: CounterId::Multicast, + client_name: "Multicast_External", + p4_name: "pipe.Egress.external_mcast_ctr", + }, + CounterDescription { + id: CounterId::Multicast, + client_name: "Multicast_Underlay", + p4_name: "pipe.Egress.underlay_mcast_ctr", + }, ]; /// Get the list of names by which end users can refer to a counter. 
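With the expanded COUNTERS table above, several entries now share a CounterId (the ingress/egress drop counters and the four multicast counters), so user-facing lookups presumably resolve a counter by its client_name rather than by id. A small illustrative sketch of that resolution in Rust, with a locally declared stand-in for CounterDescription; the actual lookup in counters.rs may differ:

// Stand-in for CounterDescription; only the fields needed for name
// resolution are modeled, and only a few of the entries are repeated here.
struct Ctr {
    client_name: &'static str,
    p4_name: &'static str,
}

const CTRS: &[Ctr] = &[
    Ctr { client_name: "Ingress_Drop_Port", p4_name: "pipe.Ingress.drop_port_ctr" },
    Ctr { client_name: "Egress_Drop_Port", p4_name: "pipe.Egress.drop_port_ctr" },
    Ctr { client_name: "Multicast", p4_name: "pipe.Egress.mcast_ctr" },
];

// Resolve a user-supplied counter name to the P4 counter it reads.
fn p4_name_for(name: &str) -> Option<&'static str> {
    CTRS.iter()
        .find(|c| c.client_name.eq_ignore_ascii_case(name))
        .map(|c| c.p4_name)
}

fn main() {
    assert_eq!(p4_name_for("multicast"), Some("pipe.Egress.mcast_ctr"));
    assert_eq!(p4_name_for("bogus"), None);
}

The client_name/p4_name pairs are copied from the table above.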
@@ -226,6 +257,11 @@ enum DropReason { Ipv4Unrouteable, Ipv6Unrouteable, NatIngressMiss, + MulticastNoGroup, + MulticastInvalidMac, + MulticastCpuCopy, + MulticastSrcFiltered, + MulticastPathFiltered, } impl TryFrom for DropReason { @@ -251,6 +287,11 @@ impl TryFrom for DropReason { 15 => Ok(DropReason::Ipv4Unrouteable), 16 => Ok(DropReason::Ipv6Unrouteable), 17 => Ok(DropReason::NatIngressMiss), + 18 => Ok(DropReason::MulticastNoGroup), + 19 => Ok(DropReason::MulticastInvalidMac), + 20 => Ok(DropReason::MulticastCpuCopy), + 21 => Ok(DropReason::MulticastSrcFiltered), + 22 => Ok(DropReason::MulticastPathFiltered), x => Err(format!("Unrecognized drop reason: {x}")), } } @@ -280,6 +321,15 @@ fn reason_label(ctr: u8) -> Result, String> { DropReason::Ipv4Unrouteable => "ipv6_unrouteable".to_string(), DropReason::Ipv6Unrouteable => "ipv4_unrouteable".to_string(), DropReason::NatIngressMiss => "nat_ingress_miss".to_string(), + DropReason::MulticastNoGroup => "multicast_no_group".to_string(), + DropReason::MulticastInvalidMac => "multicast_invalid_mac".to_string(), + DropReason::MulticastCpuCopy => "multicast_cpu_copy".to_string(), + DropReason::MulticastSrcFiltered => { + "multicast_src_filtered".to_string() + } + DropReason::MulticastPathFiltered => { + "multicast_path_filtered".to_string() + } }; Ok(Some(label)) } @@ -332,9 +382,10 @@ pub async fn get_values( let key = match counter_id { CounterId::Packet => packet_label(idx.idx), CounterId::Service => service_label(idx.idx as u8), - CounterId::Ingress | CounterId::Egress | CounterId::DropPort => { - port_label(switch, idx.idx).await - } + CounterId::Ingress + | CounterId::Egress + | CounterId::DropPort + | CounterId::Multicast => port_label(switch, idx.idx).await, CounterId::DropReason => reason_label(idx.idx as u8)?, }; diff --git a/dpd/src/link.rs b/dpd/src/link.rs index 9290900..35805a5 100644 --- a/dpd/src/link.rs +++ b/dpd/src/link.rs @@ -11,9 +11,11 @@ use crate::fault; use crate::fault::Faultable; use crate::ports::AdminEvent; use crate::ports::Event; +use crate::table::mcast; use crate::table::port_ip; use crate::table::port_mac; use crate::table::port_nat; +use crate::table::MacOps; use crate::types::DpdError; use crate::types::DpdResult; use crate::views; @@ -598,7 +600,7 @@ impl Switch { /// higher level link it corresponds to. Note, there is no guarantee /// that this link has been configured or plumbed - this function just /// performs an inter-namespace translation. 
- fn asic_id_to_port_link( + pub(crate) fn asic_id_to_port_link( &self, asic_id: AsicId, ) -> DpdResult<(PortId, LinkId)> { @@ -1688,8 +1690,24 @@ fn unplumb_link( } if link.plumbed.mac.is_some() { - if let Err(e) = port_mac::mac_clear(switch, link.asic_port_id) { - error!(log, "Failed to clear mac address: {e:?}"); + if let Err(e) = MacOps::::mac_clear( + switch, + link.asic_port_id, + ) + .and_then(|_| { + MacOps::::mac_clear( + switch, + link.asic_port_id, + ) + }) + .and_then(|_| { + // We tie this in here as ports and macs are 1:1 + mcast::mcast_egress::del_port_mapping_entry( + switch, + link.asic_port_id, + ) + }) { + error!(log, "Failed to clear mac address and port mapping: {e:?}"); return Err(e); } else { link.plumbed.mac = None; @@ -1950,13 +1968,25 @@ async fn reconcile_link( link.config.mac, link.plumbed.mac.unwrap() ); - if let Err(e) = port_mac::mac_clear(switch, asic_id) { + if let Err(e) = + MacOps::::mac_clear(switch, asic_id) + .and_then(|_| { + MacOps::::mac_clear( + switch, asic_id, + ) + }) + .and_then(|_| { + // We tie this in here as ports and macs are 1:1 + mcast::mcast_egress::del_port_mapping_entry(switch, asic_id) + }) + { record_plumb_failure( switch, &mut link, "clearing a stale MAC address", &e, ); + error!(log, "Failed to clear stale mac address: {e:?}"); return; } @@ -1965,14 +1995,29 @@ async fn reconcile_link( if link.plumbed.mac.is_none() { debug!(log, "Programming mac {}", link.config.mac); - if let Err(e) = port_mac::mac_set(switch, asic_id, link.config.mac) { + if let Err(e) = MacOps::::mac_set( + switch, + asic_id, + link.config.mac, + ) + .and_then(|_| { + MacOps::::mac_set( + switch, + asic_id, + link.config.mac, + ) + }) + .and_then(|_| { + // We tie this in here as ports and macs are 1:1 + mcast::mcast_egress::add_port_mapping_entry(switch, asic_id) + }) { record_plumb_failure( switch, &mut link, - "programming the MAC address", + "programming the MAC address and port mapping", &e, ); - error!(log, "Failed to program mac: {:?}", e); + error!(log, "Failed to program mac and port mapping: {:?}", e); return; } link.plumbed.mac = Some(link.config.mac); diff --git a/dpd/src/macaddrs.rs b/dpd/src/macaddrs.rs index e4c0ed8..71535b4 100644 --- a/dpd/src/macaddrs.rs +++ b/dpd/src/macaddrs.rs @@ -22,7 +22,10 @@ use common::ports::PORT_COUNT_REAR; cfg_if::cfg_if! { if #[cfg(feature = "tofino_asic")] { use std::convert::TryFrom; - use crate::api_server::LinkCreate; + use crate::api_server::LinkCreate; + use crate::table::mcast; + use crate::table::port_mac; + use crate::table::MacOps; use common::ports::PortFec; use common::ports::PortSpeed; use common::ports::InternalPort; @@ -422,7 +425,11 @@ impl Switch { let mut mgr = self.mac_mgmt.lock().unwrap(); assert_eq!(mgr.set_base_mac(temp_mac)?, None); } - crate::table::port_mac::reset(self)?; + + // Reset ingress and egress MAC tables and Port ID table(s). + MacOps::::reset(self)?; + MacOps::::reset(self)?; + mcast::mcast_egress::reset_bitmap_table(self)?; // Create the link on the CPU port. 
let link_id = self.create_link(port_id, ¶ms)?; diff --git a/dpd/src/main.rs b/dpd/src/main.rs index 49cb574..38f6acb 100644 --- a/dpd/src/main.rs +++ b/dpd/src/main.rs @@ -59,6 +59,7 @@ mod freemap; mod link; mod loopback; mod macaddrs; +mod mcast; mod nat; mod oxstats; mod port_map; @@ -202,8 +203,8 @@ pub struct Switch { pub identifiers: Mutex>, pub oximeter_producer: Mutex>, pub oximeter_meta: Mutex>, - pub reconciler: link::LinkReconciler, + pub mcast: Mutex, mac_mgmt: Mutex, @@ -311,6 +312,7 @@ impl Switch { oximeter_producer: Mutex::new(None), oximeter_meta: Mutex::new(None), reconciler: link::LinkReconciler::default(), + mcast: Mutex::new(mcast::MulticastGroupData::new()), mac_mgmt, port_history: Mutex::new(BTreeMap::new()), #[cfg(feature = "tofino_asic")] diff --git a/dpd/src/mcast/mod.rs b/dpd/src/mcast/mod.rs new file mode 100644 index 0000000..8d2e5bd --- /dev/null +++ b/dpd/src/mcast/mod.rs @@ -0,0 +1,2420 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Multicast group management and configuration. +//! +//! This is the entrypoint for managing multicast groups, including creating, +//! modifying, and deleting groups. + +use std::{ + collections::{BTreeMap, HashSet}, + fmt, + net::{IpAddr, Ipv4Addr, Ipv6Addr}, + ops::Bound, + sync::{Arc, Mutex, Weak}, +}; + +use crate::{ + link::LinkId, + table, + types::{DpdError, DpdResult}, + Switch, +}; +use aal::{AsicError, AsicOps}; +use common::{nat::NatTarget, ports::PortId}; +use oxnet::{Ipv4Net, Ipv6Net}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use slog::{debug, error}; + +mod validate; +use validate::{ + is_ssm, validate_multicast_address, validate_nat_target, + validate_not_admin_scoped_ipv6, +}; + +/// Type alias for multicast group IDs. +pub(crate) type MulticastGroupId = u16; + +#[derive(Debug)] +struct ScopedIdInner(MulticastGroupId, Weak>>); + +impl Drop for ScopedIdInner { + /// Only return to free pool if not taken and if the free pool still + /// exists + fn drop(&mut self) { + if self.0 != 0 { + if let Some(free_ids) = self.1.upgrade() { + if let Ok(mut pool) = free_ids.lock() { + pool.push(self.0); + } + } + } + } +} + +/// Wrapper for multicast group IDs during allocation that automatically +/// returns them to the free pool when dropped. This prevents group ID leaks +/// when operations fail during group creation. +#[derive(Clone, Debug)] +struct ScopedGroupId(Arc); + +impl ScopedGroupId { + /// Get the underlying group ID value. + fn id(&self) -> MulticastGroupId { + self.0 .0 + } +} + +impl From for ScopedGroupId { + fn from(value: ScopedIdInner) -> Self { + Self(value.into()) + } +} + +/// Source filter match key for multicast traffic. +#[derive( + Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize, JsonSchema, +)] +pub(crate) enum IpSrc { + /// Exact match for the source IP address. + Exact(IpAddr), + /// Subnet match for the source IP address. + Subnet(Ipv4Net), +} + +impl fmt::Display for IpSrc { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + IpSrc::Exact(ip) => write!(f, "{}", ip), + IpSrc::Subnet(subnet) => write!(f, "{}", subnet), + } + } +} + +/// Represents a member of a multicast group. 
+#[derive( + Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize, JsonSchema, +)] +pub(crate) struct MulticastGroupMember { + pub port_id: PortId, + pub link_id: LinkId, + pub direction: Direction, +} + +/// Represents the NAT target for multicast traffic for internal/underlay +/// forwarding. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub(crate) struct InternalForwarding { + pub nat_target: Option, +} + +/// Represents the forwarding configuration for external multicast traffic. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub(crate) struct ExternalForwarding { + pub vlan_id: Option, +} + +/// Multicast replication configuration (internal only). +#[derive(Clone, Debug, Default, PartialEq, Eq)] +struct MulticastReplicationInfo { + rid: u16, + level1_excl_id: u16, + level2_excl_id: u16, +} + +/// Represents a multicast group configuration. +/// +/// This structure is used to manage multicast groups, including their +/// replication information, forwarding settings, and associated members. +#[derive(Clone, Debug)] +pub(crate) struct MulticastGroup { + external_group_id: Option, + underlay_group_id: Option, + pub(crate) tag: Option, + pub(crate) int_fwding: InternalForwarding, + pub(crate) ext_fwding: ExternalForwarding, + pub(crate) sources: Option>, + replication_info: Option, + pub(crate) members: Vec, +} + +impl MulticastGroup { + fn external_group_id(&self) -> Option { + self.external_group_id.as_ref().map(ScopedGroupId::id) + } + + fn underlay_group_id(&self) -> Option { + self.underlay_group_id.as_ref().map(ScopedGroupId::id) + } +} + +/// A multicast group configuration for POST requests for internal (to the rack) +/// groups. +#[derive(Debug, Deserialize, Serialize, JsonSchema)] +pub(crate) struct MulticastGroupCreateEntry { + group_ip: Ipv6Addr, + tag: Option, + sources: Option>, + members: Vec, +} + +/// A multicast group configuration for POST requests for external (to the rack) +/// groups. +#[derive(Debug, Deserialize, Serialize, JsonSchema)] +pub(crate) struct MulticastGroupCreateExternalEntry { + group_ip: IpAddr, + tag: Option, + nat_target: NatTarget, + vlan_id: Option, + sources: Option>, +} + +/// Represents a multicast replication entry for PUT requests for internal +/// (to the rack) groups. +#[derive(Debug, Deserialize, Serialize, JsonSchema)] +pub(crate) struct MulticastGroupUpdateEntry { + tag: Option, + sources: Option>, + members: Vec, +} + +/// A multicast group update entry for PUT requests for external (to the rack) +/// groups. +#[derive(Debug, Deserialize, Serialize, JsonSchema)] +pub(crate) struct MulticastGroupUpdateExternalEntry { + tag: Option, + nat_target: NatTarget, + vlan_id: Option, + sources: Option>, +} + +/// Response structure for multicast group operations. 
+#[derive(Debug, Deserialize, Serialize, JsonSchema)] +pub struct MulticastGroupResponse { + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, + tag: Option, + int_fwding: InternalForwarding, + ext_fwding: ExternalForwarding, + sources: Option>, + members: Vec, +} + +impl MulticastGroupResponse { + fn new(group_ip: IpAddr, group: &MulticastGroup) -> Self { + Self { + group_ip, + external_group_id: group.external_group_id(), + underlay_group_id: group.underlay_group_id(), + tag: group.tag.clone(), + int_fwding: InternalForwarding { + nat_target: group.int_fwding.nat_target, + }, + ext_fwding: ExternalForwarding { + vlan_id: group.ext_fwding.vlan_id, + }, + sources: group.sources.clone(), + members: group.members.to_vec(), + } + } + + /// Get the multicast group IP address. + pub(crate) fn ip(&self) -> IpAddr { + self.group_ip + } +} + +/// Direction a multicast group member is reached by. +/// +/// `External` group members must have any packet encapsulation removed +/// before packet delivery. +#[derive( + Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize, JsonSchema, +)] +pub(crate) enum Direction { + Underlay, + External, +} + +/// Stores multicast group configurations. +#[derive(Debug)] +pub struct MulticastGroupData { + /// Multicast group configurations keyed by group IP. + groups: BTreeMap, + /// Stack of available group IDs for O(1) allocation. + /// Pre-populated with all IDs from GENERATOR_START to u16::MAX-1. + free_group_ids: Arc>>, + /// Mapping from admin-scoped group IP to external groups that use it as NAT + /// target (admin_scoped_ip -> set of external_group_ips) + nat_target_refs: BTreeMap>, +} + +impl MulticastGroupData { + const GENERATOR_START: u16 = 100; + + /// Creates a new instance of MulticastGroupData with pre-populated free + /// group IDs. + pub(crate) fn new() -> Self { + // Pre-populate with all available IDs from GENERATOR_START to u16::MAX-1 + // Using a Vec as a stack for O(1) push/pop operations + let free_group_ids = Arc::new(Mutex::new( + (Self::GENERATOR_START..MulticastGroupId::MAX).collect(), + )); + + Self { + groups: BTreeMap::new(), + free_group_ids, + nat_target_refs: BTreeMap::new(), + } + } + + /// Generates a unique multicast group ID with automatic cleanup on drop. + /// + /// O(1) allocation from pre-populated free list. Never allocates. + /// + /// IDs below GENERATOR_START (100) to avoid conflicts with reserved ranges. + /// + /// Returns a ScopedGroupId that will automatically return the ID to the + /// free pool when dropped. + fn generate_group_id(&mut self) -> DpdResult { + let mut pool = self.free_group_ids.lock().unwrap(); + let id = pool.pop().ok_or_else(|| { + DpdError::McastGroupFailure( + "no free multicast group IDs available (exhausted range 100-65534)".to_string(), + ) + })?; + + Ok(ScopedIdInner(id, Arc::downgrade(&self.free_group_ids)).into()) + } + + /// Add a NAT target reference from external group to admin-scoped group. + fn add_nat_target_ref( + &mut self, + external_group_ip: IpAddr, + admin_scoped_ip: IpAddr, + ) { + self.nat_target_refs + .entry(admin_scoped_ip) + .or_default() + .insert(external_group_ip); + } + + /// Remove a NAT target reference. 
+ fn remove_nat_target_ref(
+ &mut self,
+ external_group_ip: IpAddr,
+ admin_scoped_ip: IpAddr,
+ ) {
+ if let Some(refs) = self.nat_target_refs.get_mut(&admin_scoped_ip) {
+ refs.remove(&external_group_ip);
+ if refs.is_empty() {
+ self.nat_target_refs.remove(&admin_scoped_ip);
+ }
+ }
+ }
+
+ /// Get VLAN ID for an internal group from its referencing external groups.
+ fn get_vlan_for_internal_addr(&self, internal_ip: IpAddr) -> Option {
+ // Find the first external group that references this internal group
+ // and return its VLAN ID
+ if let Some(external_refs) = self.nat_target_refs.get(&internal_ip) {
+ for external_ip in external_refs {
+ if let Some(external_group) = self.groups.get(external_ip) {
+ if let Some(vlan_id) = external_group.ext_fwding.vlan_id {
+ return Some(vlan_id);
+ }
+ }
+ }
+ }
+ None
+ }
+}
+
+impl Default for MulticastGroupData {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+/// Add an external multicast group to the switch, which creates the group on
+/// the ASIC and associates it with a group IP address and updates associated
+/// tables for NAT and L3 routing.
+///
+/// If anything fails, the group is cleaned up and an error is returned.
+pub(crate) fn add_group_external(
+ s: &Switch,
+ group_info: MulticastGroupCreateExternalEntry,
+) -> DpdResult {
+ let group_ip = group_info.group_ip;
+
+ // Acquire the lock to the multicast data structure at the start to ensure
+ // deterministic operation order
+ let mut mcast = s.mcast.lock().unwrap();
+
+ validate_external_group_creation(&mcast, group_ip, &group_info)?;
+ validate_nat_target(group_info.nat_target)?;
+
+ // Validate that NAT target points to an existing group
+ if !mcast
+ .groups
+ .contains_key(&group_info.nat_target.internal_ip.into())
+ {
+ return Err(DpdError::Invalid(format!(
+ "multicast group for IP address {} must have a NAT target that is also a tracked multicast group",
+ group_ip
+ )));
+ }
+
+ let res = configure_external_tables(s, &group_info);
+
+ if let Err(e) = res {
+ // Use unified rollback with optional NAT for external groups
+ rollback_on_group_create(
+ s,
+ group_ip,
+ (None, None), // External groups don't create ASIC groups
+ &[], // No members added externally
+ &MulticastReplicationInfo::default(), // Dummy replication info
+ Some(group_info.nat_target), // External groups have NAT targets
+ group_info.sources.as_deref(),
+ )
+ .ok(); // Ignore rollback errors, log the original error
+ return Err(e);
+ }
+
+ let group = MulticastGroup {
+ external_group_id: None,
+ underlay_group_id: None,
+ tag: group_info.tag,
+ int_fwding: InternalForwarding {
+ nat_target: Some(group_info.nat_target),
+ },
+ ext_fwding: ExternalForwarding {
+ vlan_id: group_info.vlan_id,
+ },
+ sources: group_info.sources,
+ replication_info: None,
+ members: Vec::new(), // External groups have no members
+ };
+
+ mcast.groups.insert(group_ip, group.clone());
+
+ // Track NAT target reference for VLAN propagation
+ mcast
+ .add_nat_target_ref(group_ip, group_info.nat_target.internal_ip.into());
+
+ // Extract data needed for VLAN propagation to internal groups
+ let vlan_propagation_data = group_info.vlan_id.map(|vlan_id| {
+ let internal_ip = group_info.nat_target.internal_ip.into();
+ debug!(
+ s.log,
+ "External group {} with VLAN {} references internal group {}, propagating VLAN to existing internal group",
+ group_ip,
+ vlan_id,
+ internal_ip
+ );
+
+ let internal_group = mcast
+ .groups
+ .get(&internal_ip)
+ .ok_or_else(|| {
+ DpdError::Invalid(format!(
+ "Internal group {} not found",
internal_ip + )) + }) + .expect("Internal group must exist (validated above)"); + + ( + internal_ip, + vlan_id, + internal_group.external_group_id.clone(), + internal_group.underlay_group_id.clone(), + internal_group.members.clone(), + ) + }); + + // Update internal group's tables with the VLAN if necessary + if let Some(( + internal_ip, + vlan_id, + external_group_id, + underlay_group_id, + members, + )) = vlan_propagation_data + { + // Update external group bitmap if it exists + if let Some(external_id) = external_group_id { + let mut port_bitmap = table::mcast::mcast_egress::PortBitmap::new(); + for member in &members { + if member.direction == Direction::External { + port_bitmap.add_port(member.port_id.as_u8()); + } + } + if let Err(e) = table::mcast::mcast_egress::update_bitmap_entry( + s, + external_id.id(), + &port_bitmap, + Some(vlan_id), + ) { + error!( + s.log, + "Failed to update external bitmap for VLAN {} on internal group {}: {:?}", + vlan_id, + internal_ip, + e + ); + } + } + + // Update underlay group bitmap if it exists + if let Some(underlay_id) = underlay_group_id { + let mut port_bitmap = table::mcast::mcast_egress::PortBitmap::new(); + for member in &members { + if member.direction == Direction::Underlay { + port_bitmap.add_port(member.port_id.as_u8()); + } + } + if let Err(e) = table::mcast::mcast_egress::update_bitmap_entry( + s, + underlay_id.id(), + &port_bitmap, + Some(vlan_id), + ) { + error!( + s.log, + "Failed to update underlay bitmap for VLAN {} on internal group {}: {:?}", + vlan_id, + internal_ip, + e + ); + } + } + } + + Ok(MulticastGroupResponse::new(group_ip, &group)) +} + +/// Add an internal multicast group to the switch, which creates the group on +/// the ASIC and associates it with a group IP address and updates associated +/// tables for multicast replication and L3 routing. +/// +/// If anything fails, the group is cleaned up and an error is returned. 
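+///
+/// A minimal sketch of a call, with illustrative values (not taken from this
+/// change); only the fields this module reads from the entry are shown, and
+/// `port_id`/`link_id` stand in for a real switch port and link:
+///
+/// ```ignore
+/// let resp = add_group_internal(
+///     &switch,
+///     MulticastGroupCreateEntry {
+///         group_ip: "ff04::1:2".parse().unwrap(), // must be admin-scoped
+///         tag: Some("illustration".to_string()),
+///         sources: None,
+///         members: vec![MulticastGroupMember {
+///             port_id,
+///             link_id,
+///             direction: Direction::External,
+///         }],
+///     },
+/// )?;
+/// ```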
+pub(crate) fn add_group_internal( + s: &Switch, + group_info: MulticastGroupCreateEntry, +) -> DpdResult { + add_group_internal_only(s, group_info) +} + +fn add_group_internal_only( + s: &Switch, + group_info: MulticastGroupCreateEntry, +) -> DpdResult { + let group_ip = group_info.group_ip; + + // Acquire the lock to the multicast data structure at the start to ensure + // deterministic operation order + let mut mcast = s.mcast.lock().unwrap(); + + validate_internal_group_creation(&mcast, group_ip, &group_info)?; + + let (scoped_external_id, scoped_underlay_id) = + create_multicast_group_ids(s, &mut mcast, group_ip, &group_info)?; + + // Get VLAN ID from referencing external groups + let vlan_id = mcast.get_vlan_for_internal_addr(group_ip.into()); + let external_group_id = scoped_external_id.as_ref().map(ScopedGroupId::id); + let underlay_group_id = scoped_underlay_id.as_ref().map(ScopedGroupId::id); + let mut added_members = Vec::new(); + let replication_info = + configure_replication(external_group_id, underlay_group_id); + + add_ports_to_groups( + s, + group_ip.into(), + &group_info.members, + external_group_id, + underlay_group_id, + &replication_info, + &mut added_members, + )?; + + configure_internal_tables( + s, + group_ip.into(), + external_group_id, + underlay_group_id, + Some(&replication_info), + &group_info, + &added_members, + vlan_id, + )?; + + let group = MulticastGroup { + external_group_id: scoped_external_id, + underlay_group_id: scoped_underlay_id, + tag: group_info.tag, + int_fwding: InternalForwarding { + nat_target: None, // Internal groups don't have NAT targets + }, + ext_fwding: ExternalForwarding { + vlan_id: None, // Internal groups don't have VLANs + }, + sources: group_info.sources, + replication_info: Some(replication_info), + members: group_info.members, + }; + + mcast.groups.insert(group_ip.into(), group.clone()); + + Ok(MulticastGroupResponse::new(group_ip.into(), &group)) +} + +/// Delete a multicast group from the switch, including all associated tables +/// and port mappings. +pub(crate) fn del_group(s: &Switch, group_ip: IpAddr) -> DpdResult<()> { + let mut mcast = s.mcast.lock().unwrap(); + + let group = mcast.groups.remove(&group_ip).ok_or_else(|| { + DpdError::Missing(format!( + "Multicast group for IP {} not found", + group_ip + )) + })?; + + let nat_target_to_remove = group + .int_fwding + .nat_target + .map(|nat| nat.internal_ip.into()); + + debug!(s.log, "deleting multicast group for IP {}", group_ip); + delete_group_tables(s, group_ip, &group)?; + + delete_multicast_groups( + s, + group_ip, + group.external_group_id.clone(), + group.underlay_group_id.clone(), + )?; + + if let Some(internal_ip) = nat_target_to_remove { + mcast.remove_nat_target_ref(group_ip, internal_ip); + } + + Ok(()) +} + +/// Get a multicast group configuration. +pub(crate) fn get_group( + s: &Switch, + group_ip: IpAddr, +) -> DpdResult { + let mcast = s.mcast.lock().unwrap(); + + let group = mcast + .groups + .get(&group_ip) + .ok_or_else(|| { + DpdError::Missing(format!( + "multicast group for IP {} not found", + group_ip + )) + })? 
+ .clone(); + + Ok(MulticastGroupResponse::new(group_ip, &group)) +} + +pub(crate) fn modify_group_external( + s: &Switch, + group_ip: IpAddr, + new_group_info: MulticastGroupUpdateExternalEntry, +) -> DpdResult { + let mut mcast = s.mcast.lock().unwrap(); + + if !mcast.groups.contains_key(&group_ip) { + return Err(DpdError::Missing(format!( + "Multicast group for IP {} not found", + group_ip + ))); + } + + let group_entry = mcast.groups.remove(&group_ip).unwrap(); + let old_nat_target = group_entry.int_fwding.nat_target; + + let table_result = + update_external_tables(s, group_ip, &group_entry, &new_group_info); + + match table_result { + Ok(_) => { + let mut updated_group = group_entry; // Take ownership + + // Update NAT target references if NAT target changed + if let Some(old_nat) = old_nat_target { + if old_nat.internal_ip != new_group_info.nat_target.internal_ip + { + mcast.remove_nat_target_ref( + group_ip, + old_nat.internal_ip.into(), + ); + mcast.add_nat_target_ref( + group_ip, + new_group_info.nat_target.internal_ip.into(), + ); + } + } + + // Update the external group fields + updated_group.tag = new_group_info.tag.or(updated_group.tag); + updated_group.int_fwding.nat_target = + Some(new_group_info.nat_target); + updated_group.ext_fwding.vlan_id = + new_group_info.vlan_id.or(updated_group.ext_fwding.vlan_id); + updated_group.sources = + new_group_info.sources.or(updated_group.sources); + + let response = + MulticastGroupResponse::new(group_ip, &updated_group); + mcast.groups.insert(group_ip, updated_group); + Ok(response) + } + Err(e) => { + mcast.groups.insert(group_ip, group_entry); + + // Use unified rollback for external modify failures + rollback_on_group_update( + s, + group_ip, + &[], // External groups don't have member changes + &[], // External groups don't have member changes + mcast.groups.get_mut(&group_ip).unwrap(), + new_group_info.sources.as_deref(), // New sources that might need rollback + ) + .ok(); // Ignore rollback errors, return original error + + Err(e) + } + } +} + +pub(crate) fn modify_group_internal( + s: &Switch, + group_ip: Ipv6Addr, + new_group_info: MulticastGroupUpdateEntry, +) -> DpdResult { + modify_group_internal_only(s, group_ip, new_group_info) +} + +/// Modify an internal multicast group configuration. 
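+///
+/// Membership is reconciled against the existing set: members that are no
+/// longer listed are removed from the ASIC group, and newly listed members
+/// are added. If a direction (external or underlay) gains its first member,
+/// the corresponding ASIC group is created as part of the update.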
+fn modify_group_internal_only( + s: &Switch, + group_ip: Ipv6Addr, + new_group_info: MulticastGroupUpdateEntry, +) -> DpdResult { + let mut mcast = s.mcast.lock().unwrap(); + + if !mcast.groups.contains_key(&group_ip.into()) { + return Err(DpdError::Missing(format!( + "Multicast group for IP {} not found", + group_ip + ))); + } + + let mut group_entry = mcast.groups.remove(&group_ip.into()).unwrap(); + + // Validate sources + let (sources, sources_diff) = if let Some(new_srcs) = + new_group_info.sources.clone() + { + if is_ssm(group_ip.into()) && new_srcs.is_empty() { + mcast.groups.insert(group_ip.into(), group_entry.clone()); // Restore on error + return Err(DpdError::Invalid(format!( + "IP {} is a Source-Specific Multicast address and requires at least one source to be defined", + group_ip + ))); + } + (Some(new_srcs), true) + } else { + (group_entry.sources.clone(), false) + }; + + let replication_info = group_entry.replication_info.clone(); + + // Pre-allocate group IDs if needed (avoids nested locking later) + let new_members_set = new_group_info + .members + .iter() + .cloned() + .collect::>(); + + let external_scoped_id = if group_entry.external_group_id.is_none() + && new_members_set + .iter() + .any(|m| m.direction == Direction::External) + { + Some(mcast.generate_group_id()?) + } else { + None + }; + + let underlay_scoped_id = if group_entry.underlay_group_id.is_none() + && new_members_set + .iter() + .any(|m| m.direction == Direction::Underlay) + { + Some(mcast.generate_group_id()?) + } else { + None + }; + + let (added_members, removed_members) = + if let Some(ref repl_info) = replication_info { + process_membership_changes( + s, + group_ip.into(), + &new_group_info.members, + &mut group_entry, + repl_info, + external_scoped_id, + underlay_scoped_id, + )? + } else { + (Vec::new(), Vec::new()) + }; + + // Perform table updates + let table_update_result = if let Some(ref repl_info) = replication_info { + update_group_tables( + s, + group_ip.into(), + &group_entry, + repl_info, + &sources, + &group_entry.sources, + ) + } else { + Ok(()) + }; + + match table_update_result { + Ok(_) => { + group_entry.tag = new_group_info.tag.or(group_entry.tag.clone()); + group_entry.sources = sources; + group_entry.replication_info = replication_info; + group_entry.members = new_group_info.members; + + let response = + MulticastGroupResponse::new(group_ip.into(), &group_entry); + mcast.groups.insert(group_ip.into(), group_entry); + Ok(response) + } + Err(e) => { + mcast.groups.insert(group_ip.into(), group_entry.clone()); + + rollback_on_group_update( + s, + group_ip.into(), + &added_members, + &removed_members, + mcast.groups.get_mut(&group_ip.into()).unwrap(), + sources_diff.then_some(sources.as_ref().unwrap()), + )?; + + Err(e) + } + } +} + +/// List all multicast groups over a range. +pub(crate) fn get_range( + s: &Switch, + last: Option, + limit: usize, + tag: Option<&str>, +) -> Vec { + let mcast = s.mcast.lock().unwrap(); + + let lower_bound = match last { + None => Bound::Unbounded, + Some(last_ip) => Bound::Excluded(last_ip), + }; + + mcast + .groups + .range((lower_bound, Bound::Unbounded)) + .filter_map(|(ip, group)| { + if let Some(tag_filter) = tag { + if group.tag.as_deref() != Some(tag_filter) { + return None; + } + } + + Some(MulticastGroupResponse::new(*ip, group)) + }) + .take(limit) + .collect() +} + +/// Reset all multicast groups (and associated routes) for a given tag. 
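+///
+/// Groups that fail to delete are logged and skipped, so one failure does not
+/// abort the rest of the sweep.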
+pub(crate) fn reset_tag(s: &Switch, tag: &str) -> DpdResult<()> { + let groups_to_delete = { + let mcast = s.mcast.lock().unwrap(); + mcast + .groups + .iter() + .filter_map(|(ip, group)| { + if group.tag.as_deref() == Some(tag) { + Some(*ip) + } else { + None + } + }) + .collect::>() + }; + + for group_ip in groups_to_delete { + if let Err(e) = del_group(s, group_ip) { + error!( + s.log, + "failed to delete multicast group for IP {}: {:?}", group_ip, e + ); + } + } + + Ok(()) +} + +/// Reset all multicast groups (and associated routes) without a tag. +pub(crate) fn reset_untagged(s: &Switch) -> DpdResult<()> { + let groups_to_delete = { + let mcast = s.mcast.lock().unwrap(); + mcast + .groups + .iter() + .filter_map( + |(ip, group)| { + if group.tag.is_none() { + Some(*ip) + } else { + None + } + }, + ) + .collect::>() + }; + + for group_ip in groups_to_delete { + if let Err(e) = del_group(s, group_ip) { + error!( + s.log, + "failed to delete multicast group for IP {}: {:?}", group_ip, e + ); + } + } + + Ok(()) +} + +/// Reset all multicast groups (and associated routes). +pub(crate) fn reset(s: &Switch) -> DpdResult<()> { + let mut mcast = s.mcast.lock().unwrap(); + + // Destroy ASIC groups + let group_ids = s.asic_hdl.mc_domains(); + for group_id in group_ids { + if let Err(e) = s.asic_hdl.mc_group_destroy(group_id) { + error!( + s.log, + "failed to delete multicast group with ID {}: {:?}", + group_id, + e + ); + } + } + + // Reset all table entries + table::mcast::mcast_replication::reset_ipv6(s)?; + table::mcast::mcast_src_filter::reset_ipv4(s)?; + table::mcast::mcast_src_filter::reset_ipv6(s)?; + table::mcast::mcast_nat::reset_ipv4(s)?; + table::mcast::mcast_nat::reset_ipv6(s)?; + table::mcast::mcast_route::reset_ipv4(s)?; + table::mcast::mcast_route::reset_ipv6(s)?; + table::mcast::mcast_egress::reset_bitmap_table(s)?; + + // Clear data structures + mcast.groups.clear(); + mcast.nat_target_refs.clear(); + + Ok(()) +} + +fn remove_source_filters( + s: &Switch, + group_ip: IpAddr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + match group_ip { + IpAddr::V4(ipv4) => remove_ipv4_source_filters(s, ipv4, sources)?, + IpAddr::V6(ipv6) => remove_ipv6_source_filters(s, ipv6, sources)?, + } + + Ok(()) +} + +fn remove_ipv4_source_filters( + s: &Switch, + ipv4: Ipv4Addr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + if let Some(srcs) = sources { + for src in srcs { + match src { + IpSrc::Exact(IpAddr::V4(src)) => { + table::mcast::mcast_src_filter::del_ipv4_entry( + s, + Ipv4Net::new(*src, 32).unwrap(), + ipv4, + )?; + } + IpSrc::Subnet(src) => { + table::mcast::mcast_src_filter::del_ipv4_entry( + s, *src, ipv4, + )?; + } + _ => {} + } + } + } + + Ok(()) +} + +fn remove_ipv6_source_filters( + s: &Switch, + ipv6: Ipv6Addr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + if let Some(srcs) = sources { + for src in srcs { + if let IpSrc::Exact(IpAddr::V6(src)) = src { + table::mcast::mcast_src_filter::del_ipv6_entry(s, *src, ipv6)?; + } + } + } + + Ok(()) +} + +fn add_source_filters( + s: &Switch, + group_ip: IpAddr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + if let Some(srcs) = sources { + match group_ip { + IpAddr::V4(ipv4) => add_ipv4_source_filters(s, srcs, ipv4)?, + IpAddr::V6(ipv6) => add_ipv6_source_filters(s, srcs, ipv6)?, + } + } + + Ok(()) +} + +fn add_ipv4_source_filters( + s: &Switch, + sources: &[IpSrc], + dest_ip: Ipv4Addr, +) -> DpdResult<()> { + for src in sources { + match src { + IpSrc::Exact(IpAddr::V4(src)) => { + 
table::mcast::mcast_src_filter::add_ipv4_entry( + s, + Ipv4Net::new(*src, 32).unwrap(), + dest_ip, + ) + } + IpSrc::Subnet(subnet) => { + table::mcast::mcast_src_filter::add_ipv4_entry( + s, *subnet, dest_ip, + ) + } + _ => Ok(()), + }?; + } + + Ok(()) +} + +fn add_ipv6_source_filters( + s: &Switch, + sources: &[IpSrc], + dest_ip: Ipv6Addr, +) -> DpdResult<()> { + for src in sources { + if let IpSrc::Exact(IpAddr::V6(src)) = src { + table::mcast::mcast_src_filter::add_ipv6_entry(s, *src, dest_ip)?; + } + } + + Ok(()) +} + +fn validate_internal_group_creation( + mcast: &MulticastGroupData, + group_ip: Ipv6Addr, + group_info: &MulticastGroupCreateEntry, +) -> DpdResult<()> { + validate_group_exists(mcast, group_ip.into())?; + validate_multicast_address(group_ip.into(), group_info.sources.as_deref())?; + + if !Ipv6Net::new_unchecked(group_ip, 128).is_admin_scoped_multicast() { + return Err(DpdError::Invalid(format!( + "Non-admin-scoped IPv6 multicast groups must use the external API (/multicast/groups/external). Address {} is not admin-scoped (ff04::/16, ff05::/16, ff08::/16)", + group_ip + ))); + } + + Ok(()) +} + +fn validate_external_group_creation( + mcast: &MulticastGroupData, + group_ip: IpAddr, + group_info: &MulticastGroupCreateExternalEntry, +) -> DpdResult<()> { + validate_group_exists(mcast, group_ip)?; + validate_multicast_address(group_ip, group_info.sources.as_deref())?; + validate_not_admin_scoped_ipv6(group_ip)?; + Ok(()) +} + +fn validate_group_exists( + mcast: &MulticastGroupData, + group_ip: IpAddr, +) -> DpdResult<()> { + if mcast.groups.contains_key(&group_ip) { + return Err(DpdError::Invalid(format!( + "multicast group for IP {} already exists", + group_ip + ))); + } + Ok(()) +} + +fn configure_external_tables( + s: &Switch, + group_info: &MulticastGroupCreateExternalEntry, +) -> DpdResult<()> { + let group_ip = group_info.group_ip; + let nat_target = group_info.nat_target; + + // Add source filter entries if needed + let mut res = if let Some(srcs) = &group_info.sources { + match group_ip { + IpAddr::V4(ipv4) => add_ipv4_source_filters(s, srcs, ipv4), + IpAddr::V6(ipv6) => add_ipv6_source_filters(s, srcs, ipv6), + } + } else { + Ok(()) + }; + + // Add NAT entry + if res.is_ok() { + res = match group_ip { + IpAddr::V4(ipv4) => { + table::mcast::mcast_nat::add_ipv4_entry(s, ipv4, nat_target) + } + IpAddr::V6(ipv6) => { + table::mcast::mcast_nat::add_ipv6_entry(s, ipv6, nat_target) + } + }; + } + + // Add routing entry + if res.is_ok() { + res = match group_ip { + IpAddr::V4(ipv4) => table::mcast::mcast_route::add_ipv4_entry( + s, + ipv4, + group_info.vlan_id, + ), + IpAddr::V6(ipv6) => table::mcast::mcast_route::add_ipv6_entry( + s, + ipv6, + group_info.vlan_id, + ), + }; + } + + res +} + +fn create_multicast_group_ids( + s: &Switch, + mcast: &mut MulticastGroupData, + group_ip: Ipv6Addr, + group_info: &MulticastGroupCreateEntry, +) -> DpdResult<(Option, Option)> { + let has_external_member = group_info + .members + .iter() + .any(|m| m.direction == Direction::External); + let has_underlay_member = group_info + .members + .iter() + .any(|m| m.direction == Direction::Underlay); + + if !has_external_member && !has_underlay_member { + return Err(DpdError::Invalid(format!( + "multicast group for admin-scoped IP {} must have at least one external/underlay member", + group_ip + ))); + } + + debug!(s.log, "creating multicast group IDs for IP {}", group_ip); + + // Pre-allocate group IDs to avoid nested locking + let external_group_id = has_external_member + .then(|| 
mcast.generate_group_id()) + .transpose()?; + let underlay_group_id = has_underlay_member + .then(|| mcast.generate_group_id()) + .transpose()?; + + // Create ASIC groups without holding the lock + if let Some(scoped_id) = &external_group_id { + create_asic_group(s, scoped_id.id(), group_ip.into())?; + } + + if let Some(scoped_id) = &underlay_group_id { + create_asic_group(s, scoped_id.id(), group_ip.into())?; + } + + Ok((external_group_id, underlay_group_id)) +} + +fn delete_multicast_groups( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, +) -> DpdResult<()> { + if let Some(external_scoped) = external_group_id.as_ref() { + let external_id = external_scoped.id(); + s.asic_hdl.mc_group_destroy(external_id).map_err(|e| { + DpdError::McastGroupFailure(format!( + "failed to delete external multicast group for IP {} with ID {}: {:?}", + group_ip, external_id, e + )) + })?; + } + + if let Some(underlay_scoped) = underlay_group_id.as_ref() { + let underlay_id = underlay_scoped.id(); + s.asic_hdl.mc_group_destroy(underlay_id).map_err(|e| { + DpdError::McastGroupFailure(format!( + "failed to delete underlay multicast group for IP {} with ID {}: {:?}", + group_ip, underlay_id, e + )) + })?; + } + + Ok(()) +} + +fn create_asic_group( + s: &Switch, + group_id: MulticastGroupId, + group_ip: IpAddr, +) -> DpdResult<()> { + s.asic_hdl + .mc_group_create(group_id) + .map_err(|e: AsicError| { + DpdError::McastGroupFailure(format!( + "failed to create multicast group for IP {} with ID {}: {:?}", + group_ip, group_id, e + )) + }) +} + +fn add_ports_to_groups( + s: &Switch, + group_ip: IpAddr, + members: &[MulticastGroupMember], + external_group_id: Option, + underlay_group_id: Option, + replication_info: &MulticastReplicationInfo, + added_members: &mut Vec<(PortId, LinkId, Direction)>, +) -> DpdResult<()> { + for member in members { + let group_id = match member.direction { + Direction::External => external_group_id, + Direction::Underlay => underlay_group_id, + }; + + let Some(group_id) = group_id else { + continue; + }; + + let asic_id = s + .port_link_to_asic_id(member.port_id, member.link_id) + .inspect_err(|_e| { + rollback_on_group_create( + s, + group_ip, + (external_group_id, underlay_group_id), + added_members, + replication_info, + None, + None, + ) + .ok(); + })?; + + s.asic_hdl + .mc_port_add( + group_id, + asic_id, + replication_info.rid, + replication_info.level1_excl_id, + ) + .map_err(|e| { + rollback_on_group_create( + s, + group_ip, + (external_group_id, underlay_group_id), + added_members, + replication_info, + None, + None, + ) + .ok(); + + DpdError::McastGroupFailure(format!( + "failed to add port {} to group for IP {}: {:?}", + member.port_id, group_ip, e + )) + })?; + + added_members.push((member.port_id, member.link_id, member.direction)); + } + + Ok(()) +} + +fn process_membership_changes( + s: &Switch, + group_ip: IpAddr, + new_members: &[MulticastGroupMember], + group_entry: &mut MulticastGroup, + replication_info: &MulticastReplicationInfo, + external_scoped_id: Option, + underlay_scoped_id: Option, +) -> DpdResult<(Vec, Vec)> { + // First validate that IPv4 doesn't have underlay members + if group_ip.is_ipv4() + && new_members + .iter() + .any(|m| m.direction == Direction::Underlay) + { + return Err(DpdError::Invalid(format!( + "multicast group for IPv4 {} cannot have underlay members", + group_ip + ))); + } + + let prev_members = + group_entry.members.iter().cloned().collect::>(); + let new_members_set = 
new_members.iter().cloned().collect::>(); + + let mut added_members = Vec::new(); + let mut removed_members = Vec::new(); + + // Step 1: Ensure required groups exist (this can fail cleanly) + ensure_external_group_exists( + s, + group_ip, + &new_members_set, + group_entry, + external_scoped_id, + )?; + + if group_ip.is_ipv6() { + ensure_underlay_group_exists( + s, + group_ip, + &new_members_set, + group_entry, + underlay_scoped_id, + )?; + } + + // Step 2: Remove members from ASIC (only after group creation succeeds) + for member in prev_members.difference(&new_members_set) { + let group_id = match member.direction { + Direction::External => group_entry.external_group_id(), + Direction::Underlay => group_entry.underlay_group_id(), + }; + + let Some(group_id) = group_id else { + continue; + }; + + let asic_id = s.port_link_to_asic_id(member.port_id, member.link_id)?; + s.asic_hdl.mc_port_remove(group_id, asic_id)?; + + removed_members.push(member.clone()); + } + + // Step 3: Add new members to ASIC + for member in new_members_set.difference(&prev_members) { + if group_ip.is_ipv4() && member.direction == Direction::Underlay { + continue; + } + + let group_id = match member.direction { + Direction::External => group_entry.external_group_id(), + Direction::Underlay => group_entry.underlay_group_id(), + }; + + let Some(group_id) = group_id else { + continue; + }; + + let asic_id = s.port_link_to_asic_id(member.port_id, member.link_id)?; + s.asic_hdl.mc_port_add( + group_id, + asic_id, + replication_info.rid, + replication_info.level1_excl_id, + )?; + added_members.push(member.clone()); + } + + Ok((added_members, removed_members)) +} + +fn ensure_external_group_exists( + s: &Switch, + group_ip: IpAddr, + members: &HashSet, + group_entry: &mut MulticastGroup, + pre_allocated_id: Option, +) -> DpdResult<()> { + if group_entry.external_group_id.is_none() + && members.iter().any(|m| m.direction == Direction::External) + { + let scoped_group_id = pre_allocated_id.ok_or_else(|| { + DpdError::Other( + "external group ID should have been pre-allocated".to_string(), + ) + })?; + + create_asic_group(s, scoped_group_id.id(), group_ip)?; + group_entry.external_group_id = Some(scoped_group_id); + } + + Ok(()) +} + +fn ensure_underlay_group_exists( + s: &Switch, + group_ip: IpAddr, + members: &HashSet, + group_entry: &mut MulticastGroup, + pre_allocated_id: Option, +) -> DpdResult<()> { + if group_entry.underlay_group_id.is_none() + && members.iter().any(|m| m.direction == Direction::Underlay) + { + let scoped_group_id = pre_allocated_id.ok_or_else(|| { + DpdError::Other( + "underlay group ID should have been pre-allocated".to_string(), + ) + })?; + + create_asic_group(s, scoped_group_id.id(), group_ip)?; + group_entry.underlay_group_id = Some(scoped_group_id); + } + + Ok(()) +} + +fn configure_replication( + external_group_id: Option, + underlay_group_id: Option, +) -> MulticastReplicationInfo { + let rid = external_group_id.or(underlay_group_id).unwrap(); + + // We default level exclusion IDs to 0 for internal groups + // since they can only be configured internally without API calls. 
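+    // The replication ID (RID) reuses whichever ASIC group ID was allocated,
+    // preferring the external group's ID when both exist; the caller has
+    // already guaranteed that at least one of the two groups was created, so
+    // the unwrap above is safe.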
+ MulticastReplicationInfo { + rid, + level1_excl_id: 0, + level2_excl_id: 0, + } +} + +#[allow(clippy::too_many_arguments)] +fn configure_internal_tables( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, + replication_info: Option<&MulticastReplicationInfo>, + group_info: &MulticastGroupCreateEntry, + added_members: &[(PortId, LinkId, Direction)], + vlan_id: Option, // VLAN ID from referencing external group +) -> DpdResult<()> { + let res = match (group_ip, replication_info) { + // Note: There are no internal IPv4 groups, only external IPv4 groups + (IpAddr::V4(_), _) => { + return Err(DpdError::Invalid( + "IPv4 groups cannot be created as internal groups".to_string(), + )); + } + + (IpAddr::V6(ipv6), Some(replication_info)) => { + let mut res = table::mcast::mcast_replication::add_ipv6_entry( + s, + ipv6, + underlay_group_id, + external_group_id, + replication_info.rid, + replication_info.level1_excl_id, + replication_info.level2_excl_id, + ); + + if res.is_ok() { + if let Some(srcs) = &group_info.sources { + res = add_ipv6_source_filters(s, srcs, ipv6); + } + } + + if res.is_ok() { + res = table::mcast::mcast_route::add_ipv6_entry( + s, ipv6, + vlan_id, // VLAN from referencing external group + ); + } + + if res.is_ok() + && external_group_id.is_some() + && underlay_group_id.is_some() + { + let mut port_bitmap = + table::mcast::mcast_egress::PortBitmap::new(); + for (port_id, _link_id, direction) in added_members { + if *direction == Direction::External { + let port_number = port_id.as_u8(); + port_bitmap.add_port(port_number); + } + } + + res = table::mcast::mcast_egress::add_bitmap_entry( + s, + external_group_id.unwrap(), + &port_bitmap, + vlan_id, // VLAN from referencing external group + ); + } + + res + } + + (IpAddr::V6(_), None) => { + return Err(DpdError::Invalid( + "Internal, admin-scoped IPv6 groups must have replication info" + .to_string(), + )); + } + }; + + if let Err(e) = res { + if let Some(replication_info) = replication_info { + rollback_on_group_create( + s, + group_ip, + (external_group_id, underlay_group_id), + added_members, + replication_info, + None, // Internal groups don't have NAT targets + group_info.sources.as_deref(), + )?; + } + return Err(e); + } + + Ok(()) +} + +fn update_group_tables( + s: &Switch, + group_ip: IpAddr, + group_entry: &MulticastGroup, + replication_info: &MulticastReplicationInfo, + new_sources: &Option>, + old_sources: &Option>, +) -> DpdResult<()> { + if let Some(existing_replication) = &group_entry.replication_info { + if replication_info.rid != existing_replication.rid + || replication_info.level1_excl_id + != existing_replication.level1_excl_id + || replication_info.level2_excl_id + != existing_replication.level2_excl_id + { + update_replication_tables( + s, + group_ip, + group_entry.external_group_id(), + group_entry.underlay_group_id(), + replication_info, + )?; + } + } + + if new_sources != old_sources { + remove_source_filters(s, group_ip, old_sources.as_deref())?; + add_source_filters(s, group_ip, new_sources.as_deref())?; + } + + Ok(()) +} + +fn update_external_tables( + s: &Switch, + group_ip: IpAddr, + group_entry: &MulticastGroup, + new_group_info: &MulticastGroupUpdateExternalEntry, +) -> DpdResult<()> { + // Update sources if they changed + if new_group_info.sources != group_entry.sources { + remove_source_filters(s, group_ip, group_entry.sources.as_deref())?; + add_source_filters(s, group_ip, new_group_info.sources.as_deref())?; + } + + // Update NAT target - external groups 
always have NAT targets + if Some(new_group_info.nat_target) != group_entry.int_fwding.nat_target { + update_nat_tables( + s, + group_ip, + Some(new_group_info.nat_target), + group_entry.int_fwding.nat_target, + )?; + } + + // Update VLAN if it changed + if new_group_info.vlan_id != group_entry.ext_fwding.vlan_id { + match group_ip { + IpAddr::V4(ipv4) => table::mcast::mcast_route::update_ipv4_entry( + s, + ipv4, + new_group_info.vlan_id, + ), + IpAddr::V6(ipv6) => table::mcast::mcast_route::update_ipv6_entry( + s, + ipv6, + new_group_info.vlan_id, + ), + }?; + } + + Ok(()) +} + +fn delete_group_tables( + s: &Switch, + group_ip: IpAddr, + group: &MulticastGroup, +) -> DpdResult<()> { + match group_ip { + IpAddr::V4(ipv4) => { + remove_ipv4_source_filters(s, ipv4, group.sources.as_deref())?; + + if group.int_fwding.nat_target.is_some() { + table::mcast::mcast_nat::del_ipv4_entry(s, ipv4)?; + } + + table::mcast::mcast_route::del_ipv4_entry(s, ipv4)?; + } + IpAddr::V6(ipv6) => { + if group.external_group_id().is_some() + && group.underlay_group_id().is_some() + { + table::mcast::mcast_egress::del_bitmap_entry( + s, + group.external_group_id().unwrap(), + )?; + } + + table::mcast::mcast_replication::del_ipv6_entry(s, ipv6)?; + + remove_ipv6_source_filters(s, ipv6, group.sources.as_deref())?; + + if group.int_fwding.nat_target.is_some() { + table::mcast::mcast_nat::del_ipv6_entry(s, ipv6)?; + } + + table::mcast::mcast_route::del_ipv6_entry(s, ipv6)?; + } + } + + Ok(()) +} + +fn update_replication_tables( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, + replication_info: &MulticastReplicationInfo, +) -> DpdResult<()> { + match group_ip { + IpAddr::V4(_) => Ok(()), + IpAddr::V6(ipv6) => table::mcast::mcast_replication::update_ipv6_entry( + s, + ipv6, + underlay_group_id, + external_group_id, + replication_info.rid, + replication_info.level1_excl_id, + replication_info.level2_excl_id, + ), + } +} + +fn update_nat_tables( + s: &Switch, + group_ip: IpAddr, + new_nat_target: Option, + old_nat_target: Option, +) -> DpdResult<()> { + match (group_ip, new_nat_target, old_nat_target) { + (IpAddr::V4(ipv4), Some(nat), _) => { + table::mcast::mcast_nat::update_ipv4_entry(s, ipv4, nat) + } + (IpAddr::V6(ipv6), Some(nat), _) => { + table::mcast::mcast_nat::update_ipv6_entry(s, ipv6, nat) + } + (IpAddr::V4(ipv4), None, Some(_)) => { + table::mcast::mcast_nat::del_ipv4_entry(s, ipv4) + } + (IpAddr::V6(ipv6), None, Some(_)) => { + table::mcast::mcast_nat::del_ipv6_entry(s, ipv6) + } + _ => Ok(()), + } +} + +fn update_fwding_tables( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, + members: &[MulticastGroupMember], + vlan_id: Option, +) -> DpdResult<()> { + match group_ip { + IpAddr::V4(ipv4) => { + table::mcast::mcast_route::update_ipv4_entry(s, ipv4, vlan_id) + } + IpAddr::V6(ipv6) => { + let mut res = + table::mcast::mcast_route::update_ipv6_entry(s, ipv6, vlan_id); + + if res.is_ok() + && external_group_id.is_some() + && underlay_group_id.is_some() + { + let mut port_bitmap = + table::mcast::mcast_egress::PortBitmap::new(); + + for member in members { + if member.direction == Direction::External { + port_bitmap.add_port(member.port_id.as_u8()); + } + } + + res = table::mcast::mcast_egress::update_bitmap_entry( + s, + external_group_id.unwrap(), + &port_bitmap, + vlan_id, + ); + } + + res + } + } +} + +/// Rollback function for a multicast group creation failure. 
+///
+/// Cleans up all resources created during a failed multicast group creation.
+///
+/// This function is reused for both external and internal group failures.
+fn rollback_on_group_create(
+    s: &Switch,
+    group_ip: IpAddr,
+    group_ids: (Option<MulticastGroupId>, Option<MulticastGroupId>),
+    added_members: &[(PortId, LinkId, Direction)],
+    replication_info: &MulticastReplicationInfo,
+    nat_target: Option<NatTarget>,
+    sources: Option<&[IpSrc]>,
+) -> DpdResult<()> {
+    debug!(
+        s.log,
+        "rolling back multicast group creation for IP {}", group_ip
+    );
+
+    let (external_group_id, underlay_group_id) = group_ids;
+
+    let mut contains_errors = false;
+
+    let added_members_converted: Vec<MulticastGroupMember> = added_members
+        .iter()
+        .map(|(port_id, link_id, direction)| MulticastGroupMember {
+            port_id: *port_id,
+            link_id: *link_id,
+            direction: *direction,
+        })
+        .collect();
+
+    if let Err(e) = rollback_ports(
+        s,
+        &added_members_converted,
+        &[],
+        replication_info,
+        external_group_id,
+        underlay_group_id,
+    ) {
+        error!(s.log, "error removing ports during rollback: {:?}", e);
+        contains_errors = true;
+    }
+
+    if let Err(e) = rollback_remove_groups(
+        s,
+        group_ip,
+        external_group_id,
+        underlay_group_id,
+    ) {
+        error!(s.log, "error deleting groups during rollback: {:?}", e);
+        contains_errors = true;
+    }
+
+    if let Err(e) = rollback_remove_tables(
+        s,
+        group_ip,
+        external_group_id,
+        underlay_group_id,
+        nat_target,
+        sources,
+    ) {
+        error!(
+            s.log,
+            "error deleting table entries during rollback: {:?}", e
+        );
+        contains_errors = true;
+    }
+
+    if contains_errors {
+        error!(s.log, "rollback completed with errors for IP {}", group_ip);
+    } else {
+        debug!(
+            s.log,
+            "successfully rolled back multicast group creation for IP {}",
+            group_ip
+        );
+    }
+
+    Ok(())
+}
+
+/// Rollback function for a multicast group modification that fails partway
+/// through its updates.
+///
+/// Restores the group to its original state.
+///
+/// This function is reused for both external and internal group modifications.
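+///
+/// Rollback runs in three steps: added/removed ports are reverted first (when
+/// the group carries replication info), any newly supplied source filters are
+/// swapped back to the originals, and finally the replication, NAT, and
+/// routing table entries are restored. Failures in each step are logged and
+/// the remaining steps still run.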
+fn rollback_on_group_update( + s: &Switch, + group_ip: IpAddr, + added_ports: &[MulticastGroupMember], + removed_ports: &[MulticastGroupMember], + orig_group_info: &MulticastGroup, + new_sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + debug!( + s.log, + "rolling back multicast group update for IP {}", group_ip + ); + + let mut contains_errors = false; + + if let Some(replication_info) = &orig_group_info.replication_info { + if let Err(e) = rollback_ports( + s, + added_ports, + removed_ports, + replication_info, + orig_group_info.external_group_id(), + orig_group_info.underlay_group_id(), + ) { + error!( + s.log, + "error handling ports during update rollback: {:?}", e + ); + contains_errors = true; + } + } + + if new_sources.is_some() { + if let Err(e) = rollback_source_filters( + s, + group_ip, + new_sources, + orig_group_info.sources.as_deref(), + ) { + error!( + s.log, + "error restoring source filters during update rollback: {:?}", + e + ); + contains_errors = true; + } + } + + if let Err(e) = rollback_restore_tables(s, group_ip, orig_group_info) { + error!( + s.log, + "error restoring table entries during update rollback: {:?}", e + ); + contains_errors = true; + } + + if contains_errors { + error!( + s.log, + "update rollback completed with errors for IP {}", group_ip + ); + } else { + debug!( + s.log, + "successfully rolled back multicast group update for IP {}", + group_ip + ); + } + + Ok(()) +} + +fn rollback_ports( + s: &Switch, + added_ports: &[MulticastGroupMember], + removed_ports: &[MulticastGroupMember], + replication_info: &MulticastReplicationInfo, + external_group_id: Option, + underlay_group_id: Option, +) -> DpdResult<()> { + for member in added_ports { + let group_id = match member.direction { + Direction::External => external_group_id, + Direction::Underlay => underlay_group_id, + }; + + if group_id.is_none() { + continue; + } + + match s.port_link_to_asic_id(member.port_id, member.link_id) { + Ok(asic_id) => { + if let Err(e) = + s.asic_hdl.mc_port_remove(group_id.unwrap(), asic_id) + { + debug!( + s.log, + "failed to remove port during rollback: port={}, link={}, error={:?}", + member.port_id, member.link_id, e + ); + } + } + Err(e) => { + debug!( + s.log, + "failed to get ASIC ID for port during rollback: port={}, link={}, error={:?}", + member.port_id, member.link_id, e + ); + } + } + } + + for member in removed_ports { + let group_id = match member.direction { + Direction::External => external_group_id, + Direction::Underlay => underlay_group_id, + }; + + if group_id.is_none() { + continue; + } + + match s.port_link_to_asic_id(member.port_id, member.link_id) { + Ok(asic_id) => { + if let Err(e) = s.asic_hdl.mc_port_add( + group_id.unwrap(), + asic_id, + replication_info.rid, + replication_info.level1_excl_id, + ) { + debug!( + s.log, + "failed to restore port during rollback: port={}, link={}, error={:?}", + member.port_id, member.link_id, e + ); + } + } + Err(e) => { + debug!( + s.log, + "failed to get ASIC ID for port during rollback: port={}, link={}, error={:?}", + member.port_id, member.link_id, e + ); + } + } + } + + Ok(()) +} + +fn rollback_remove_groups( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, +) -> DpdResult<()> { + if let Some(external_id) = external_group_id { + if let Err(e) = s.asic_hdl.mc_group_destroy(external_id) { + debug!( + s.log, + "failed to remove external multicast group for IP {} with ID {} during rollback: {:?}", + group_ip, external_id, e + ); + } + } + + if let Some(underlay_id) = 
underlay_group_id { + if let Err(e) = s.asic_hdl.mc_group_destroy(underlay_id) { + debug!( + s.log, + "failed to remove underlay multicast group for IP {} with ID {} during rollback: {:?}", + group_ip, underlay_id, e + ); + } + } + + Ok(()) +} + +fn rollback_remove_tables( + s: &Switch, + group_ip: IpAddr, + external_group_id: Option, + underlay_group_id: Option, + nat_target: Option, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + match group_ip { + IpAddr::V4(ipv4) => { + if let Some(srcs) = sources { + for src in srcs { + match src { + IpSrc::Exact(IpAddr::V4(src)) => { + if let Err(e) = + table::mcast::mcast_src_filter::del_ipv4_entry( + s, + Ipv4Net::new(*src, 32).unwrap(), + ipv4, + ) + { + debug!(s.log, "failed to remove IPv4 source filter during rollback: {:?}", e); + } + } + IpSrc::Subnet(subnet) => { + if let Err(e) = + table::mcast::mcast_src_filter::del_ipv4_entry( + s, *subnet, ipv4, + ) + { + debug!(s.log, "failed to remove IPv4 subnet filter during rollback: {:?}", e); + } + } + _ => {} + } + } + } + + if nat_target.is_some() { + if let Err(e) = table::mcast::mcast_nat::del_ipv4_entry(s, ipv4) + { + debug!( + s.log, + "failed to remove IPv4 NAT entry during rollback: {:?}", + e + ); + } + } + + if let Err(e) = table::mcast::mcast_route::del_ipv4_entry(s, ipv4) { + debug!( + s.log, + "failed to remove IPv4 route entry during rollback: {:?}", + e + ); + } + } + IpAddr::V6(ipv6) => { + if external_group_id.is_some() && underlay_group_id.is_some() { + if let Err(e) = table::mcast::mcast_egress::del_bitmap_entry( + s, + external_group_id.unwrap(), + ) { + debug!(s.log, "failed to remove external egress entry during rollback: {:?}", e); + } + } + + if let Err(e) = + table::mcast::mcast_replication::del_ipv6_entry(s, ipv6) + { + debug!(s.log, "failed to remove IPv6 replication entry during rollback: {:?}", e); + } + + if let Some(srcs) = sources { + for src in srcs { + if let IpSrc::Exact(IpAddr::V6(src)) = src { + if let Err(e) = + table::mcast::mcast_src_filter::del_ipv6_entry( + s, *src, ipv6, + ) + { + debug!(s.log, "failed to remove IPv6 source filter during rollback: {:?}", e); + } + } + } + } + + if nat_target.is_some() { + if let Err(e) = table::mcast::mcast_nat::del_ipv6_entry(s, ipv6) + { + debug!( + s.log, + "failed to remove IPv6 NAT entry during rollback: {:?}", + e + ); + } + } + + if let Err(e) = table::mcast::mcast_route::del_ipv6_entry(s, ipv6) { + debug!( + s.log, + "failed to remove IPv6 route entry during rollback: {:?}", + e + ); + } + } + } + + Ok(()) +} + +fn rollback_source_filters( + s: &Switch, + group_ip: IpAddr, + new_sources: Option<&[IpSrc]>, + orig_sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + if let Err(e) = remove_source_filters(s, group_ip, new_sources) { + debug!( + s.log, + "failed to remove new source filters during rollback: {:?}", e + ); + } + + if let Err(e) = add_source_filters(s, group_ip, orig_sources) { + debug!( + s.log, + "failed to restore original source filters during rollback: {:?}", + e + ); + } + + Ok(()) +} + +fn rollback_restore_tables( + s: &Switch, + group_ip: IpAddr, + orig_group_info: &MulticastGroup, +) -> DpdResult<()> { + let external_group_id = orig_group_info.external_group_id(); + let underlay_group_id = orig_group_info.underlay_group_id(); + let replication_info = &orig_group_info.replication_info; + let vlan_id = orig_group_info.ext_fwding.vlan_id; + let nat_target = orig_group_info.int_fwding.nat_target; + let prev_members = orig_group_info.members.to_vec(); + + if let Some(replication_info) = 
replication_info { + if let Err(e) = update_replication_tables( + s, + group_ip, + external_group_id, + underlay_group_id, + replication_info, + ) { + debug!( + s.log, + "failed to restore replication settings during rollback: {:?}", + e + ); + } + } + + match group_ip { + IpAddr::V4(ipv4) => rollback_restore_nat_v4(s, ipv4, nat_target), + IpAddr::V6(ipv6) => rollback_restore_nat_v6(s, ipv6, nat_target), + } + + if let Err(e) = update_fwding_tables( + s, + group_ip, + external_group_id, + underlay_group_id, + &prev_members, + vlan_id, + ) { + debug!( + s.log, + "failed to restore VLAN settings during rollback: {:?}", e + ); + } + + Ok(()) +} + +fn rollback_restore_nat_v4( + s: &Switch, + ipv4: Ipv4Addr, + nat_target: Option, +) { + if let Some(nat) = nat_target { + if let Err(e) = table::mcast::mcast_nat::update_ipv4_entry(s, ipv4, nat) + { + debug!( + s.log, + "failed to restore IPv4 NAT settings during rollback: {:?}", e + ); + } + } else if let Err(e) = table::mcast::mcast_nat::del_ipv4_entry(s, ipv4) { + debug!( + s.log, + "failed to remove IPv4 NAT entry during rollback: {:?}", e + ); + } +} + +fn rollback_restore_nat_v6( + s: &Switch, + ipv6: Ipv6Addr, + nat_target: Option, +) { + if let Some(nat) = nat_target { + if let Err(e) = table::mcast::mcast_nat::update_ipv6_entry(s, ipv6, nat) + { + debug!( + s.log, + "failed to restore IPv6 NAT settings during rollback: {:?}", e + ); + } + } else if let Err(e) = table::mcast::mcast_nat::del_ipv6_entry(s, ipv6) { + debug!( + s.log, + "failed to remove IPv6 NAT entry during rollback: {:?}", e + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::thread; + + #[test] + fn test_scoped_group_id_drop_returns_to_pool() { + let free_ids = Arc::new(Mutex::new(vec![100, 101, 102])); + { + let scoped_id = ScopedGroupId::from(ScopedIdInner( + 101, + Arc::downgrade(&free_ids), + )); + assert_eq!(scoped_id.id(), 101); + } + + // ID should be returned to pool + let pool = free_ids.lock().unwrap(); + assert!(pool.contains(&101)); + assert_eq!(pool.len(), 4); // Original 3 + returned 1 + } + + #[test] + fn test_scoped_group_id_weak_reference_cleanup() { + let free_ids = Arc::new(Mutex::new(vec![100, 101, 102])); + let scoped_id = ScopedIdInner(101, Arc::downgrade(&free_ids)); + + // Drop the Arc, leaving only the weak reference + drop(free_ids); + + // When ScopedGroupId is dropped, it should handle the dead weak + // reference gracefully + drop(scoped_id); // Should not panic + } + + #[test] + fn test_multicast_group_data_generate_id_allocation() { + let mut mcast_data = MulticastGroupData::new(); + + // Generate first ID (Vec is used as stack, so pop() returns highest ID first) + let scoped_id1 = mcast_data.generate_group_id().unwrap(); + assert_eq!(scoped_id1.id(), MulticastGroupId::MAX - 1); // Should be highest available ID + + // Generate second ID + let scoped_id2 = mcast_data.generate_group_id().unwrap(); + assert_eq!(scoped_id2.id(), MulticastGroupId::MAX - 2); + + // Drop the second ID, it should return to pool + drop(scoped_id2); + + // Generate third ID, should reuse the returned ID + let scoped_id3 = mcast_data.generate_group_id().unwrap(); + assert_eq!(scoped_id3.id(), MulticastGroupId::MAX - 2); // Should reuse the returned ID + } + + #[test] + fn test_multicast_group_data_id_exhaustion() { + let mut mcast_data = MulticastGroupData::new(); + + // Exhaust the pool + { + let mut pool = mcast_data.free_group_ids.lock().unwrap(); + pool.clear(); + } + + // Should return error when no IDs available + let result = 
mcast_data.generate_group_id(); + assert!(result.is_err()); + + match result.unwrap_err() { + DpdError::McastGroupFailure(msg) => { + assert!(msg.contains("no free multicast group IDs available")); + } + _ => panic!("Expected McastGroupFailure error"), + } + } + + #[test] + fn test_concurrent_id_allocation() { + let mcast_data = Arc::new(Mutex::new(MulticastGroupData::new())); + let mut handles = Vec::new(); + + // Spawn multiple threads to allocate IDs concurrently + for _ in 0..10 { + let mcast_data_clone = Arc::clone(&mcast_data); + let handle = thread::spawn(move || { + let mut data = mcast_data_clone.lock().unwrap(); + data.generate_group_id().unwrap() + }); + handles.push(handle); + } + + // Collect all allocated IDs + let mut allocated_ids = Vec::new(); + for handle in handles { + allocated_ids.push(handle.join().unwrap()); + } + + let mut ids: Vec<_> = allocated_ids.iter().map(|v| v.id()).collect(); + + // All IDs should be unique + ids.sort(); + ids.dedup(); + assert_eq!(ids.len(), 10); + + // All IDs should be in valid range + for id in ids { + assert!(id >= MulticastGroupData::GENERATOR_START); + assert!(id < MulticastGroupId::MAX); + } + } + + #[test] + fn test_concurrent_allocation_and_deallocation() { + let mcast_data = Arc::new(Mutex::new(MulticastGroupData::new())); + let mut handles = Vec::new(); + + // Spawn threads that allocate and immediately drop (deallocate) + for _ in 0..5 { + let mcast_data_clone = Arc::clone(&mcast_data); + let handle = thread::spawn(move || { + for _ in 0..10 { + let scoped_id = { + let mut data = mcast_data_clone.lock().unwrap(); + data.generate_group_id().unwrap() + }; + drop(scoped_id); + } + }); + handles.push(handle); + } + + // Wait for all threads to complete + for handle in handles { + handle.join().unwrap(); + } + + // Pool should have all IDs back (minus any that might still be in use) + let pool_size = { + let data = mcast_data.lock().unwrap(); + let pool_len = data.free_group_ids.lock().unwrap().len(); + pool_len + }; + + // Should have close to the original number of IDs + let expected_size = (MulticastGroupId::MAX + - MulticastGroupData::GENERATOR_START) + as usize; + assert_eq!(pool_size, expected_size); + } + + #[test] + fn test_id_range_boundaries() { + let mcast_data = MulticastGroupData::new(); + + // Check that initial pool contains correct range + let pool = mcast_data.free_group_ids.lock().unwrap(); + let expected_size = (MulticastGroupId::MAX + - MulticastGroupData::GENERATOR_START) + as usize; + assert_eq!(pool.len(), expected_size); + + // Check that minimum and maximum IDs are in range + assert!(pool.contains(&MulticastGroupData::GENERATOR_START)); + assert!(pool.contains(&(MulticastGroupId::MAX - 1))); + assert!(!pool.contains(&(MulticastGroupData::GENERATOR_START - 1))); + assert!(!pool.contains(&MulticastGroupId::MAX)); + } +} diff --git a/dpd/src/mcast/validate.rs b/dpd/src/mcast/validate.rs new file mode 100644 index 0000000..e12f038 --- /dev/null +++ b/dpd/src/mcast/validate.rs @@ -0,0 +1,460 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + +use common::nat::NatTarget; +use oxnet::{Ipv4Net, Ipv6Net}; + +use super::IpSrc; +use crate::types::{DpdError, DpdResult}; + +/// Validates if a multicast address is allowed for group creation. 
+/// +/// Returns a [`DpdResult`] indicating whether the address is valid or not. +pub(crate) fn validate_multicast_address( + addr: IpAddr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + match addr { + IpAddr::V4(ipv4) => validate_ipv4_multicast(ipv4, sources), + IpAddr::V6(ipv6) => validate_ipv6_multicast(ipv6, sources), + } +} + +/// Validates the NAT target inner MAC address. +pub(crate) fn validate_nat_target(nat_target: NatTarget) -> DpdResult<()> { + if !nat_target.inner_mac.is_multicast() { + return Err(DpdError::Invalid(format!( + "NAT target inner MAC address {} is not a multicast MAC address", + nat_target.inner_mac + ))); + } + + let internal_nat_ip = Ipv6Net::new_unchecked(nat_target.internal_ip, 128); + + if !internal_nat_ip.is_admin_scoped_multicast() { + return Err(DpdError::Invalid(format!( + "NAT target internal IP address {} is not a valid site/admin-local or org-scoped multicast address", + nat_target.internal_ip + ))); + } + + Ok(()) +} + +/// Check if an IP address is a Source-Specific Multicast (SSM) address. +pub(crate) fn is_ssm(addr: IpAddr) -> bool { + match addr { + IpAddr::V4(ipv4) => { + let subnet = Ipv4Net::new_unchecked(Ipv4Addr::new(232, 0, 0, 0), 8); + subnet.contains(ipv4) + } + // Check for Source-Specific Multicast (ff3x::/32) + // In IPv6 multicast, the second nibble (flag field) indicates SSM when set to 3 + IpAddr::V6(ipv6) => { + let flag_field = (ipv6.octets()[1] & 0xF0) >> 4; + flag_field == 3 + } + } +} + +/// Validates IPv4 multicast addresses. +fn validate_ipv4_multicast( + addr: Ipv4Addr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + // Verify this is actually a multicast address + if !addr.is_multicast() { + return Err(DpdError::Invalid(format!( + "{} is not a multicast address", + addr + ))); + } + + // If this is SSM, require sources to be defined + if is_ssm(addr.into()) { + if sources.is_none() || sources.unwrap().is_empty() { + return Err(DpdError::Invalid(format!( + "{} is a Source-Specific Multicast address and requires at least one source to be defined", + addr + ))); + } + // If we have sources defined for an SSM address, it's valid + return Ok(()); + } else if sources.is_some() { + // If this is not SSM but sources are defined, it's invalid + return Err(DpdError::Invalid(format!( + "{} is not a Source-Specific Multicast address but sources were provided", + addr + ))); + } + + // Define reserved IPv4 multicast subnets + let reserved_subnets = [ + // Local network control block (link-local) + Ipv4Net::new_unchecked(Ipv4Addr::new(224, 0, 0, 0), 24), // 224.0.0.0/24 + // GLOP addressing + Ipv4Net::new_unchecked(Ipv4Addr::new(233, 0, 0, 0), 8), // 233.0.0.0/8 + // Administrative scoped addresses + Ipv4Net::new_unchecked(Ipv4Addr::new(239, 0, 0, 0), 8), // 239.0.0.0/8 (administratively scoped) + ]; + + // Check reserved subnets + for subnet in &reserved_subnets { + if subnet.contains(addr) { + return Err(DpdError::Invalid(format!( + "{} is in the reserved multicast subnet {}", + addr, subnet, + ))); + } + } + + // Check specific reserved addresses that may not fall within entire subnets + let specific_reserved = [ + Ipv4Addr::new(224, 0, 1, 1), // NTP (Network Time Protocol) + Ipv4Addr::new(224, 0, 1, 129), // Cisco Auto-RP-Announce + Ipv4Addr::new(224, 0, 1, 130), // Cisco Auto-RP-Discovery + ]; + + if specific_reserved.contains(&addr) { + return Err(DpdError::Invalid(format!( + "{} is a specifically reserved multicast address", + addr + ))); + } + + Ok(()) +} + +/// Validates IPv6 multicast addresses. 
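+///
+/// Rejects non-multicast addresses, SSM addresses (ff3x::/32) without
+/// sources, non-SSM addresses that specify sources, and addresses in the
+/// reserved interface-local, link-local, and (deprecated) node-local scopes
+/// (ff01::/16, ff02::/16, ff00::/16).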
+fn validate_ipv6_multicast( + addr: Ipv6Addr, + sources: Option<&[IpSrc]>, +) -> DpdResult<()> { + if !addr.is_multicast() { + return Err(DpdError::Invalid(format!( + "{} is not a multicast address", + addr + ))); + } + + // If this is SSM, require sources to be defined + if is_ssm(addr.into()) { + if sources.is_none() || sources.unwrap().is_empty() { + return Err(DpdError::Invalid(format!( + "{} is an IPv6 Source-Specific Multicast address (ff3x::/32) and requires at least one source to be defined", + addr + ))); + } + // If we have sources defined for an IPv6 SSM address, it's valid + return Ok(()); + } else if sources.is_some() { + // If this is not SSM but sources are defined, it's invalid + return Err(DpdError::Invalid(format!( + "{} is not a Source-Specific Multicast address but sources were provided", + addr + ))); + } + + // Define reserved IPv6 multicast subnets + let reserved_subnets = [ + // Link-local scope + Ipv6Net::new_unchecked(Ipv6Addr::new(0xff02, 0, 0, 0, 0, 0, 0, 0), 16), // ff02::/16 + // Interface-local scope + Ipv6Net::new_unchecked(Ipv6Addr::new(0xff01, 0, 0, 0, 0, 0, 0, 0), 16), // ff01::/16 + // Node-local scope (deprecated) + Ipv6Net::new_unchecked(Ipv6Addr::new(0xff00, 0, 0, 0, 0, 0, 0, 0), 16), // ff00::/16 + ]; + + // Check reserved subnets + for subnet in &reserved_subnets { + if subnet.contains(addr) { + return Err(DpdError::Invalid(format!( + "{} is in the reserved multicast subnet {}", + addr, subnet + ))); + } + } + + Ok(()) +} + +/// Validates that IPv6 addresses are not admin-scoped for external group creation. +pub(crate) fn validate_not_admin_scoped_ipv6(addr: IpAddr) -> DpdResult<()> { + if let IpAddr::V6(ipv6) = addr { + if oxnet::Ipv6Net::new_unchecked(ipv6, 128).is_admin_scoped_multicast() + { + return Err(DpdError::Invalid(format!( + "{} is an admin-scoped multicast address and must be created via the internal multicast API", + addr + ))); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use common::{nat::Vni, network::MacAddr}; + use oxnet::Ipv4Net; + + use std::str::FromStr; + + #[test] + fn test_ipv4_validation() { + // These should be allowed + assert!( + validate_ipv4_multicast(Ipv4Addr::new(224, 1, 0, 1), None).is_ok() + ); + assert!( + validate_ipv4_multicast(Ipv4Addr::new(224, 2, 2, 3), None).is_ok() + ); + assert!( + validate_ipv4_multicast(Ipv4Addr::new(231, 1, 2, 3), None).is_ok() + ); + + // These should be rejected + assert!( + validate_ipv4_multicast(Ipv4Addr::new(224, 0, 0, 1), None).is_err() + ); // Link-local + assert!( + validate_ipv4_multicast(Ipv4Addr::new(224, 0, 0, 5), None).is_err() + ); // Link-local + assert!(validate_ipv4_multicast(Ipv4Addr::new(192, 168, 1, 1), None) + .is_err()); // Not multicast + } + + #[test] + fn test_ipv6_validation() { + // These should be allowed + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 0x1234), + None + ) + .is_ok()); // Global + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0xff05, 0, 0, 0, 0, 0, 0, 0x1111), + None + ) + .is_ok()); // Site-local + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0xff08, 0, 0, 0, 0, 0, 0, 0x5678), + None + ) + .is_ok()); // Organization-local + + // These should be rejected + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0xff02, 0, 0, 0, 0, 0, 0, 0x1), + None + ) + .is_err()); // Link-local + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0xff01, 0, 0, 0, 0, 0, 0, 0x2,), + None + ) + .is_err()); // Interface-local + assert!(validate_ipv6_multicast( + Ipv6Addr::new(0x2001, 0xdb8, 0, 0, 0, 0, 0, 0x1), 
+ None + ) + .is_err()); // Not multicast + } + + #[test] + fn test_ipv4_ssm_with_sources() { + // Create test data for source specifications + let ssm_addr = Ipv4Addr::new(232, 1, 2, 3); + let non_ssm_addr = Ipv4Addr::new(224, 1, 2, 3); + + // Test with exact source IP + let exact_sources = + vec![IpSrc::Exact(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)))]; + + // Test with subnet source specification + let subnet_sources = + vec![IpSrc::Subnet(Ipv4Net::from_str("192.168.1.0/24").unwrap())]; + + // Test with mixed source specifications + let mixed_sources = vec![ + IpSrc::Exact(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))), + IpSrc::Subnet(Ipv4Net::from_str("10.0.0.0/8").unwrap()), + ]; + + // Empty sources - should fail for SSM + assert!(validate_ipv4_multicast(ssm_addr, Some(&[])).is_err()); + + // SSM address with exact source - should pass + assert!(validate_ipv4_multicast(ssm_addr, Some(&exact_sources)).is_ok()); + + // SSM address with subnet source - should pass + assert!( + validate_ipv4_multicast(ssm_addr, Some(&subnet_sources)).is_ok() + ); + + // SSM address with mixed sources - should pass + assert!(validate_ipv4_multicast(ssm_addr, Some(&mixed_sources)).is_ok()); + + // Non-SSM address with sources - should fail as source specs only allowed for SSM + assert!(validate_ipv4_multicast(non_ssm_addr, Some(&exact_sources)) + .is_err()); + assert!(validate_ipv4_multicast(non_ssm_addr, Some(&subnet_sources)) + .is_err()); + assert!(validate_ipv4_multicast(non_ssm_addr, Some(&mixed_sources)) + .is_err()); + } + + #[test] + fn test_ipv6_ssm_with_sources() { + // IPv6 SSM addresses (ff3x::/32) + let ssm_global = Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0x1234); // Global scope (e) + let non_ssm_global = Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 0x1234); // Non-SSM global + + // Create test sources for IPv6 + let ip6_sources = vec![IpSrc::Exact(IpAddr::V6(Ipv6Addr::new( + 0x2001, 0xdb8, 0, 0, 0, 0, 0, 0x1, + )))]; + + // Empty sources - should fail for SSM + assert!(validate_ipv6_multicast(ssm_global, Some(&[])).is_err()); + + // SSM address with IPv6 source - should pass + assert!(validate_ipv6_multicast(ssm_global, Some(&ip6_sources)).is_ok()); + + // Non-SSM address with IPv6 source - should fail + assert!(validate_ipv6_multicast(non_ssm_global, Some(&ip6_sources)) + .is_err()); + } + + #[test] + fn test_is_ssm_function() { + // Test IPv4 SSM detection + assert!(is_ssm(IpAddr::V4(Ipv4Addr::new(232, 0, 0, 1)))); + assert!(is_ssm(IpAddr::V4(Ipv4Addr::new(232, 255, 255, 255)))); + assert!(!is_ssm(IpAddr::V4(Ipv4Addr::new(224, 0, 0, 1)))); + assert!(!is_ssm(IpAddr::V4(Ipv4Addr::new(231, 0, 0, 1)))); + + // Test IPv6 SSM detection (ff3x::/32) + assert!(is_ssm(IpAddr::V6(Ipv6Addr::new( + 0xff30, 0, 0, 0, 0, 0, 0, 0x1 + )))); // With 0 scope + assert!(is_ssm(IpAddr::V6(Ipv6Addr::new( + 0xff3e, 0, 0, 0, 0, 0, 0, 0x1 + )))); // Global scope (e) + assert!(is_ssm(IpAddr::V6(Ipv6Addr::new( + 0xff35, 0, 0, 0, 0, 0, 0, 0x1 + )))); // Site-local scope (5) + + // Not SSM + assert!(!is_ssm(IpAddr::V6(Ipv6Addr::new( + 0xff0e, 0, 0, 0, 0, 0, 0, 0x1 + )))); // Flag bit not 3 + assert!(!is_ssm(IpAddr::V6(Ipv6Addr::new( + 0xff1e, 0, 0, 0, 0, 0, 0, 0x1 + )))); // Flag bit not 3 + } + + #[test] + fn test_address_validation_integrated() { + // Test the main validate_multicast_address function + + // Valid IPv4 non-SSM address, no sources + assert!(validate_multicast_address( + IpAddr::V4(Ipv4Addr::new(224, 1, 0, 1)), + None + ) + .is_ok()); + + // Valid IPv4 SSM address with sources + let sources = vec![ + 
IpSrc::Exact(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))), + IpSrc::Subnet(Ipv4Net::from_str("10.0.0.0/8").unwrap()), + ]; + assert!(validate_multicast_address( + IpAddr::V4(Ipv4Addr::new(232, 1, 2, 3)), + Some(&sources) + ) + .is_ok()); + + // Valid IPv6 non-SSM address, no sources + assert!(validate_multicast_address( + IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 0x1234)), + None + ) + .is_ok()); + + // Valid IPv6 SSM address with sources + let ip6_sources = vec![IpSrc::Exact(IpAddr::V6(Ipv6Addr::new( + 0x2001, 0xdb8, 0, 0, 0, 0, 0, 0x1, + )))]; + assert!(validate_multicast_address( + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0x1234)), + Some(&ip6_sources) + ) + .is_ok()); + + // Error cases + + // Not a multicast address + assert!(validate_multicast_address( + IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1)), + None + ) + .is_err()); + + // IPv4 SSM without sources + assert!(validate_multicast_address( + IpAddr::V4(Ipv4Addr::new(232, 1, 2, 3)), + None + ) + .is_err()); + + // IPv4 non-SSM with sources + assert!(validate_multicast_address( + IpAddr::V4(Ipv4Addr::new(224, 1, 2, 3)), + Some(&sources) + ) + .is_err()); + + // IPv6 SSM without sources + assert!(validate_multicast_address( + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0x1234)), + None + ) + .is_err()); + + // IPv6 non-SSM with sources + assert!(validate_multicast_address( + IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 0x1234)), + Some(&ip6_sources) + ) + .is_err()); + } + + #[test] + fn test_validate_nat_target() { + let ucast_nat_target = NatTarget { + internal_ip: Ipv6Addr::new(0x2001, 0xdb8, 0, 0, 0, 0, 0, 1), + // Not a multicast MAC + inner_mac: MacAddr::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x01), + vni: Vni::new(100).unwrap(), + }; + + assert!(validate_nat_target(ucast_nat_target).is_err()); + + let mcast_nat_target = NatTarget { + // org-scoped multicast + internal_ip: Ipv6Addr::new(0xff08, 0, 0, 0, 0, 0, 0, 0x1234), + // Multicast MAC + inner_mac: MacAddr::new(0x01, 0x00, 0x5e, 0x00, 0x00, 0x01), + vni: Vni::new(100).unwrap(), + }; + + assert!(validate_nat_target(mcast_nat_target).is_ok()); + } +} diff --git a/dpd/src/port_settings.rs b/dpd/src/port_settings.rs index 23aecb8..cab00c1 100644 --- a/dpd/src/port_settings.rs +++ b/dpd/src/port_settings.rs @@ -4,10 +4,6 @@ // // Copyright 2025 Oxide Computer Company -// This Source Code Form is subject to the terms of the Mozilla Source -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - use crate::api_server::LinkSettings; use crate::api_server::PortSettings; use crate::link::Link; diff --git a/dpd/src/route.rs b/dpd/src/route.rs index c386ec7..9ac54eb 100644 --- a/dpd/src/route.rs +++ b/dpd/src/route.rs @@ -86,6 +86,7 @@ // 6 (1, 172.17.10.1, 1) // 7 (5, 172.17.14.1, 0) // +// // Still todo: // - Implement this multipath support for IPv6. This should be a simple // copy-and-paste of the IPv4 implementation. This is currently blocked on @@ -153,6 +154,7 @@ impl RouteEntry { } } +/// A Vlan identifier, made up of a port, link, and vlan tag. #[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)] struct VlanId { // The switch port out which routed traffic is sent. 
@@ -164,11 +166,7 @@ struct VlanId { } impl VlanId { - pub fn new( - port_id: PortId, - link_id: LinkId, - vlan_id: u16, - ) -> DpdResult { + fn new(port_id: PortId, link_id: LinkId, vlan_id: u16) -> DpdResult { if vlan_id > 0 { common::network::validate_vlan(vlan_id)?; } @@ -454,6 +452,7 @@ fn replace_route_targets_ipv4( // Insert all the entries into the table let mut idx = new_entry.index; + for target in targets { if let Err(e) = table::route_ipv4::add_route_target( switch, @@ -742,6 +741,7 @@ pub fn delete_route_ipv4_locked( .v4 .remove(&subnet) .ok_or(DpdError::Missing("no such route".into()))?; + cleanup_route_ipv4(switch, route_data, Some(subnet), entry) } diff --git a/dpd/src/table/mcast/mcast_egress.rs b/dpd/src/table/mcast/mcast_egress.rs new file mode 100644 index 0000000..107ac3d --- /dev/null +++ b/dpd/src/table/mcast/mcast_egress.rs @@ -0,0 +1,385 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast egress entries. + +use std::fmt; + +use crate::{mcast::MulticastGroupId, table::*, Switch}; + +use aal::{ActionParse, MatchParse}; +use aal_macros::*; +use slog::debug; + +/// Table for multicast egress entries matching the multicast group ID +/// and setting which ports to possibly decap. +pub(crate) const DECAP_PORTS_TABLE_NAME: &str = + "pipe.Egress.mcast_egress.tbl_decap_ports"; + +/// Table for multicast egress entries matching the replication group ID. +pub(crate) const PORT_ID_TABLE_NAME: &str = + "pipe.Egress.mcast_egress.asic_id_to_port"; + +#[derive(MatchParse, Hash)] +struct MatchKeyDecapPorts { + #[match_xlate(name = "egress_rid")] + mcast_external_grp: MulticastGroupId, +} + +impl MatchKeyDecapPorts { + fn new(mcast_external_grp: MulticastGroupId) -> Self { + Self { mcast_external_grp } + } +} + +impl fmt::Display for MatchKeyDecapPorts { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "egress_rid={}", self.mcast_external_grp) + } +} + +#[derive(MatchParse, Hash)] +struct MatchKeyPortId { + #[match_xlate(name = "egress_port")] + asic_port_id: u16, +} + +impl MatchKeyPortId { + fn new(asic_port_id: u16) -> Self { + Self { asic_port_id } + } +} + +impl fmt::Display for MatchKeyPortId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "egress_port={}", self.asic_port_id) + } +} + +#[derive(ActionParse, Debug)] +enum DecapPortsAction { + #[action_xlate(name = "set_decap_ports")] + SetDecapPorts { + ports_0: u32, + ports_1: u32, + ports_2: u32, + ports_3: u32, + ports_4: u32, + ports_5: u32, + ports_6: u32, + ports_7: u32, + }, + #[action_xlate(name = "set_decap_ports_and_vlan")] + SetDecapPortsAndVlan { + ports_0: u32, + ports_1: u32, + ports_2: u32, + ports_3: u32, + ports_4: u32, + ports_5: u32, + ports_6: u32, + ports_7: u32, + vlan_id: u16, + }, +} + +#[derive(ActionParse, Debug)] +enum PortIdAction { + #[action_xlate(name = "set_port_number")] + SetPortNumber { port_number: u8 }, +} + +/// Add a multicast entry to the decap table, keyed on +/// `mcast_external_grp` and setting the port bitmap. 
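+// A minimal usage sketch (hypothetical group id, ports, and VLAN), assuming the
+// caller already knows the external replication group:
+//
+//     let mut ports = PortBitmap::new();
+//     ports.add_port(3);
+//     ports.add_port(17);
+//     mcast_egress::add_bitmap_entry(s, external_group_id, &ports, Some(20))?;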
+pub(crate) fn add_bitmap_entry( + s: &Switch, + mcast_external_grp: MulticastGroupId, + port_bitmap: &PortBitmap, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = MatchKeyDecapPorts::new(mcast_external_grp); + + let action_data = match vlan_id { + None => port_bitmap.to_action(), + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + port_bitmap.to_action_vlan(vlan_id) + } + }; + debug!( + s.log, + "add multicast egress entry for decap {} -> {:?}", + match_key, + action_data + ); + + s.table_entry_add( + TableType::McastEgressDecapPorts, + &match_key, + &action_data, + ) +} + +/// Update a multicast entry in the decap table, keyed on +/// `mcast_external_grp` and setting the port bitmap. +pub(crate) fn update_bitmap_entry( + s: &Switch, + mcast_external_grp: MulticastGroupId, + port_bitmap: &PortBitmap, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = MatchKeyDecapPorts::new(mcast_external_grp); + let action_data = match vlan_id { + None => port_bitmap.to_action(), + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + port_bitmap.to_action_vlan(vlan_id) + } + }; + + debug!( + s.log, + "update multicast egress entry for decap {} -> {:?}", + match_key, + action_data + ); + + s.table_entry_update( + TableType::McastEgressDecapPorts, + &match_key, + &action_data, + ) +} + +/// Delete a multicast entry from the decap table, keyed on +/// `mcast_external_grp`. +pub(crate) fn del_bitmap_entry( + s: &Switch, + mcast_external_grp: MulticastGroupId, +) -> DpdResult<()> { + let match_key = MatchKeyDecapPorts::new(mcast_external_grp); + + debug!( + s.log, + "delete multicast egress entry for decap {} -> {}", + match_key, + mcast_external_grp + ); + + s.table_entry_del(TableType::McastEgressDecapPorts, &match_key) +} + +/// Dump the multicast decap table. +pub(crate) fn bitmap_table_dump(s: &Switch) -> DpdResult { + s.table_dump::( + TableType::McastEgressDecapPorts, + ) +} + +/// Fetch the multicast decap table counters. +pub(crate) fn bitmap_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::( + force_sync, + TableType::McastEgressDecapPorts, + ) +} + +/// Reset the multicast decap table. +pub(crate) fn reset_bitmap_table(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::McastEgressDecapPorts) +} + +/// Add a port ID entry to the port ID table for converting ASIC port IDs +/// to port numbers. +pub(crate) fn add_port_mapping_entry( + s: &Switch, + asic_port_id: u16, +) -> DpdResult<()> { + let match_key = MatchKeyPortId::new(asic_port_id); + + let (port, _) = s.asic_id_to_port_link(asic_port_id)?; + + let action_data = PortIdAction::SetPortNumber { + port_number: port.as_u8(), + }; + + debug!( + s.log, + "add port id entry {} -> {:?}", match_key, action_data + ); + + s.table_entry_add( + TableType::McastEgressPortMapping, + &match_key, + &action_data, + ) +} + +/// Update a port ID entry in the port ID table for converting ASIC port IDs +/// to port numbers. 
+#[allow(dead_code)] +pub(crate) fn update_port_mapping_entry( + s: &Switch, + asic_port_id: u16, +) -> DpdResult<()> { + let match_key = MatchKeyPortId::new(asic_port_id); + + let (port, _) = s.asic_id_to_port_link(asic_port_id)?; + + let action_data = PortIdAction::SetPortNumber { + port_number: port.as_u8(), + }; + + debug!( + s.log, + "update port id entry {} -> {:?}", match_key, action_data + ); + + s.table_entry_update( + TableType::McastEgressPortMapping, + &match_key, + &action_data, + ) +} + +/// Delete a port ID entry from the port ID table for converting ASIC port IDs +/// to port numbers. +pub(crate) fn del_port_mapping_entry( + s: &Switch, + asic_port_id: u16, +) -> DpdResult<()> { + let match_key = MatchKeyPortId::new(asic_port_id); + + debug!( + s.log, + "delete port id entry {} -> {}", match_key, asic_port_id + ); + + s.table_entry_del(TableType::McastEgressPortMapping, &match_key) +} + +/// Dump the multicast port mapping table. +pub(crate) fn port_mapping_table_dump(s: &Switch) -> DpdResult { + s.table_dump::( + TableType::McastEgressPortMapping, + ) +} + +/// Fetch the multicast port mapping table counters. +pub(crate) fn port_mapping_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::( + force_sync, + TableType::McastEgressPortMapping, + ) +} + +/// Structure to hold and manipulate the 256-bit port bitmap. +#[derive(Debug, Clone, Default)] +pub(crate) struct PortBitmap { + // 8 x 32-bit values representing all 256 ports + ports: [u32; 8], +} + +impl PortBitmap { + /// Create a new empty port bitmap. + pub(crate) fn new() -> Self { + Self { ports: [0; 8] } + } + + /// Add a port to the bitmap. + pub(crate) fn add_port(&mut self, port: u8) { + let array_idx = (port >> 5) as usize; // Divide by 32 to get array index + let bit_pos = port & 0x1F; // Modulo 32 to get bit position + let mask = 1u32 << bit_pos; // Create mask with the appropriate bit set + self.ports[array_idx] |= mask; // Set the bit + } + + /// Remove a port from the bitmap + #[allow(dead_code)] + pub(crate) fn remove_port(&mut self, port: u16) { + let array_idx = (port >> 5) as usize; + let bit_pos = port & 0x1F; + let mask = 1u32 << bit_pos; + + self.ports[array_idx] &= !mask; // Clear the bit + } + + /// Check if a port is in the bitmap + #[allow(dead_code)] + fn contains_port(&self, port: u16) -> bool { + if port >= 256 { + return false; + } + + let array_idx = (port >> 5) as usize; + let bit_pos = port & 0x1F; + let mask = 1u32 << bit_pos; + + (self.ports[array_idx] & mask) != 0 + } + + /// Convert to an action for the P4 table + fn to_action(&self) -> DecapPortsAction { + DecapPortsAction::SetDecapPorts { + ports_0: self.ports[0], + ports_1: self.ports[1], + ports_2: self.ports[2], + ports_3: self.ports[3], + ports_4: self.ports[4], + ports_5: self.ports[5], + ports_6: self.ports[6], + ports_7: self.ports[7], + } + } + + /// Convert to an action for the P4 table with Vlan ID + fn to_action_vlan(&self, vlan_id: u16) -> DecapPortsAction { + DecapPortsAction::SetDecapPortsAndVlan { + ports_0: self.ports[0], + ports_1: self.ports[1], + ports_2: self.ports[2], + ports_3: self.ports[3], + ports_4: self.ports[4], + ports_5: self.ports[5], + ports_6: self.ports[6], + ports_7: self.ports[7], + vlan_id, + } + } + + /// Get the raw port bitmap values + #[allow(dead_code)] + fn get_port_values(&self) -> &[u32; 8] { + &self.ports + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_port_bitmap() { + let mut bitmap = PortBitmap::new(); + bitmap.add_port(5); + 
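+        // add_port picks the word with (port >> 5) and the bit with (port & 0x1F),
+        // e.g. port 5 sets bit 5 of ports[0] and port 255 sets bit 31 of ports[7].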
bitmap.add_port(10); + bitmap.add_port(255); + + assert!(bitmap.contains_port(5)); + assert!(bitmap.contains_port(10)); + assert!(bitmap.contains_port(255)); + assert!(!bitmap.contains_port(256)); + + bitmap.remove_port(10); + assert!(!bitmap.contains_port(10)); + } +} diff --git a/dpd/src/table/mcast/mcast_nat.rs b/dpd/src/table/mcast/mcast_nat.rs new file mode 100644 index 0000000..d01c4f1 --- /dev/null +++ b/dpd/src/table/mcast/mcast_nat.rs @@ -0,0 +1,191 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast NAT entries. + +use std::net::{Ipv4Addr, Ipv6Addr}; + +use crate::{table::*, Switch}; + +use super::{Ipv4MatchKey, Ipv6MatchKey}; + +use aal::ActionParse; +use aal_macros::*; +use common::{nat::NatTarget, network::MacAddr}; +use slog::debug; + +/// IPv4 Table for multicast NAT entries. +pub(crate) const IPV4_TABLE_NAME: &str = + "pipe.Ingress.nat_ingress.ingress_ipv4_mcast"; +/// IPv6 Table for multicast NAT entries. +pub(crate) const IPV6_TABLE_NAME: &str = + "pipe.Ingress.nat_ingress.ingress_ipv6_mcast"; + +#[derive(ActionParse, Debug)] +enum Ipv4Action { + #[action_xlate(name = "mcast_forward_ipv4_to")] + Forward { + target: Ipv6Addr, + inner_mac: MacAddr, + vni: u32, + }, +} + +#[derive(ActionParse, Debug)] +enum Ipv6Action { + #[action_xlate(name = "mcast_forward_ipv6_to")] + Forward { + target: Ipv6Addr, + inner_mac: MacAddr, + vni: u32, + }, +} + +/// Add a NAT entry for IPv4 multicast traffic, keyed on `ip`. +pub(crate) fn add_ipv4_entry( + s: &Switch, + ip: Ipv4Addr, + tgt: NatTarget, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(ip); + let action_key = Ipv4Action::Forward { + target: tgt.internal_ip, + inner_mac: tgt.inner_mac, + vni: tgt.vni.as_u32(), + }; + + debug!( + s.log, + "add ingress mcast entry {} -> {:?}", match_key, action_key + ); + + s.table_entry_add(TableType::NatIngressIpv4Mcast, &match_key, &action_key) +} + +/// Update a NAT entry for IPv4 multicast traffic. +pub(crate) fn update_ipv4_entry( + s: &Switch, + ip: Ipv4Addr, + tgt: NatTarget, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(ip); + let action_key = Ipv4Action::Forward { + target: tgt.internal_ip, + inner_mac: tgt.inner_mac, + vni: tgt.vni.as_u32(), + }; + + debug!( + s.log, + "update ingress mcast entry {} -> {:?}", match_key, action_key + ); + + s.table_entry_update( + TableType::NatIngressIpv4Mcast, + &match_key, + &action_key, + ) +} + +/// Delete a NAT entry for IPv4 multicast traffic, keyed on `ip`. +pub(crate) fn del_ipv4_entry(s: &Switch, ip: Ipv4Addr) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(ip); + + debug!(s.log, "delete ingress mcast entry {}", match_key); + + s.table_entry_del(TableType::NatIngressIpv4Mcast, &match_key) +} + +/// Dump the IPv4 NAT table's contents. +pub(crate) fn ipv4_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::NatIngressIpv4Mcast) +} + +/// Fetch the IPv4 NAT table's counters. +pub(crate) fn ipv4_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::NatIngressIpv4Mcast) +} + +/// Reset the Ipv4 NAT table. +pub(crate) fn reset_ipv4(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::NatIngressIpv4Mcast) +} + +/// Add a NAT entry for IPv6 multicast traffic, keyed on `ip`. 
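+// Usage sketch (hypothetical addresses): map an external IPv6 group onto its
+// rack-internal group via the NAT target:
+//
+//     let tgt = NatTarget {
+//         internal_ip: "ff04::1:2".parse().unwrap(),
+//         inner_mac: MacAddr::new(0x01, 0x00, 0x5e, 0x00, 0x00, 0x01),
+//         vni: Vni::new(100).unwrap(),
+//     };
+//     mcast_nat::add_ipv6_entry(s, "ff0e::1".parse().unwrap(), tgt)?;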
+pub(crate) fn add_ipv6_entry( + s: &Switch, + ip: Ipv6Addr, + tgt: NatTarget, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(ip); + let action_key = Ipv6Action::Forward { + target: tgt.internal_ip, + inner_mac: tgt.inner_mac, + vni: tgt.vni.as_u32(), + }; + + debug!( + s.log, + "add ingress mcast entry {} -> {:?}", match_key, action_key + ); + + s.table_entry_add(TableType::NatIngressIpv6Mcast, &match_key, &action_key) +} + +/// Update a NAT entry for IPv6 multicast traffic. +pub(crate) fn update_ipv6_entry( + s: &Switch, + ip: Ipv6Addr, + tgt: NatTarget, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(ip); + let action_key = Ipv6Action::Forward { + target: tgt.internal_ip, + inner_mac: tgt.inner_mac, + vni: tgt.vni.as_u32(), + }; + + debug!( + s.log, + "update ingress mcast entry {} -> {:?}", match_key, action_key + ); + + s.table_entry_update( + TableType::NatIngressIpv6Mcast, + &match_key, + &action_key, + ) +} + +/// Delete a NAT entry for IPv6 multicast traffic, keyed on `ip`. +pub(crate) fn del_ipv6_entry(s: &Switch, ip: Ipv6Addr) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(ip); + + debug!(s.log, "delete ingress mcast entry {}", match_key); + + s.table_entry_del(TableType::NatIngressIpv6Mcast, &match_key) +} + +/// Dump the IPv6 NAT table's contents. +pub(crate) fn ipv6_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::NatIngressIpv6Mcast) +} + +/// Fetch the IPv6 NAT table's counters. +pub(crate) fn ipv6_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::NatIngressIpv6Mcast) +} + +/// Reset the Ipv6 NAT table. +pub(crate) fn reset_ipv6(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::NatIngressIpv6Mcast) +} diff --git a/dpd/src/table/mcast/mcast_port_mac.rs b/dpd/src/table/mcast/mcast_port_mac.rs new file mode 100644 index 0000000..687d35a --- /dev/null +++ b/dpd/src/table/mcast/mcast_port_mac.rs @@ -0,0 +1,25 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast port MAC entries. + +use crate::table::{MacTable, TableType}; + +/// Table for multicast port MAC entries. +pub const TABLE_NAME: &str = "pipe.Egress.mac_rewrite.mac_rewrite"; + +/// Table for multicast port MAC entries. +pub struct PortMacTable; + +impl MacTable for PortMacTable { + fn table_type() -> TableType { + TableType::PortMacMcast + } + + fn table_name() -> &'static str { + TABLE_NAME + } +} diff --git a/dpd/src/table/mcast/mcast_replication.rs b/dpd/src/table/mcast/mcast_replication.rs new file mode 100644 index 0000000..917dd1b --- /dev/null +++ b/dpd/src/table/mcast/mcast_replication.rs @@ -0,0 +1,157 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast replication information. + +use std::net::Ipv6Addr; + +use crate::{mcast::MulticastGroupId, table::*, Switch}; + +use super::Ipv6MatchKey; + +use aal::ActionParse; +use aal_macros::*; +use slog::debug; + +/// IPv6 Table for multicast replication entries and group membership. 
+pub(crate) const IPV6_TABLE_NAME: &str =
+    "pipe.Ingress.mcast_ingress.mcast_replication_ipv6";
+
+#[derive(ActionParse, Debug)]
+enum Ipv6Action {
+    #[action_xlate(name = "configure_mcastv6")]
+    ConfigureIpv6 {
+        mcast_grp_a: MulticastGroupId,
+        mcast_grp_b: MulticastGroupId,
+        rid: u16,
+        level1_excl_id: u16,
+        // This is a `bit<9>` in the P4 sidecar and tofino doc, but we can't
+        // represent that in Rust, so we validate in the caller.
+        level2_excl_id: u16,
+    },
+}
+
+/// Add an IPv6 multicast entry to the replication table:
+/// `dst_addr -> underlay_mcast_grp && external_mcast_grp, replication_id,
+/// level1_excl_id, level2_excl_id`.
+///
+/// The bifurcated replication supports:
+/// - external_mcast_grp: for replication to external/customer ports (mcast_grp_a)
+/// - underlay_mcast_grp: for replication to underlay/infrastructure ports (mcast_grp_b)
+///
+/// Both groups are optional depending on the group's member configuration.
+pub(crate) fn add_ipv6_entry(
+    s: &Switch,
+    dst_addr: Ipv6Addr,
+    underlay_mcast_grp: Option<MulticastGroupId>,
+    external_mcast_grp: Option<MulticastGroupId>,
+    replication_id: u16,
+    level1_excl_id: u16,
+    level2_excl_id: u16,
+) -> DpdResult<()> {
+    if level2_excl_id > 511 {
+        return Err(DpdError::Invalid(
+            "`level2 exclusion id` exceeds 9-bit range".to_string(),
+        ));
+    }
+
+    if underlay_mcast_grp.is_none() && external_mcast_grp.is_none() {
+        return Err(DpdError::McastGroupFailure(
+            "neither underlay nor external multicast group specified"
+                .to_string(),
+        ));
+    }
+
+    let match_key = Ipv6MatchKey::new(dst_addr);
+
+    let action_data = Ipv6Action::ConfigureIpv6 {
+        mcast_grp_a: external_mcast_grp.unwrap_or(0),
+        mcast_grp_b: underlay_mcast_grp.unwrap_or(0),
+        rid: replication_id,
+        level1_excl_id,
+        level2_excl_id,
+    };
+
+    debug!(
+        s.log,
+        "add mcast_ipv6 entry {} -> {:?}", dst_addr, action_data
+    );
+
+    s.table_entry_add(TableType::McastIpv6, &match_key, &action_data)
+}
+
+/// Update an IPv6 multicast entry in the replication table.
+///
+/// Updates the bifurcated replication configuration:
+/// - external_mcast_grp: for replication to external/customer ports (mcast_grp_a)
+/// - underlay_mcast_grp: for replication to underlay/infrastructure ports (mcast_grp_b)
+pub(crate) fn update_ipv6_entry(
+    s: &Switch,
+    dst_addr: Ipv6Addr,
+    underlay_mcast_grp: Option<MulticastGroupId>,
+    external_mcast_grp: Option<MulticastGroupId>,
+    replication_id: u16,
+    level1_excl_id: u16,
+    level2_excl_id: u16,
+) -> DpdResult<()> {
+    if level2_excl_id > 511 {
+        return Err(DpdError::Invalid(
+            "`level2 exclusion id` exceeds 9-bit range".to_string(),
+        ));
+    }
+
+    if underlay_mcast_grp.is_none() && external_mcast_grp.is_none() {
+        return Err(DpdError::McastGroupFailure(
+            "neither underlay nor external multicast group specified"
+                .to_string(),
+        ));
+    }
+
+    let match_key = Ipv6MatchKey::new(dst_addr);
+
+    let action_data = Ipv6Action::ConfigureIpv6 {
+        mcast_grp_a: external_mcast_grp.unwrap_or(0),
+        mcast_grp_b: underlay_mcast_grp.unwrap_or(0),
+        rid: replication_id,
+        level1_excl_id,
+        level2_excl_id,
+    };
+
+    debug!(
+        s.log,
+        "update mcast_ipv6 entry {} -> {:?}", dst_addr, action_data
+    );
+
+    s.table_entry_update(TableType::McastIpv6, &match_key, &action_data)
+}
+
+/// Delete an IPv6 multicast entry from the replication table, keyed on
+/// `dst_addr`.
+pub(crate) fn del_ipv6_entry(s: &Switch, dst_addr: Ipv6Addr) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(dst_addr); + + debug!(s.log, "delete mcast_ipv6 entry {}", match_key); + + s.table_entry_del(TableType::McastIpv6, &match_key) +} + +/// Dump the IPv6 multicast table's contents. +pub(crate) fn ipv6_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::McastIpv6) +} + +/// Fetch the IPv6 multicast table's counters. +pub(crate) fn ipv6_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::McastIpv6) +} + +/// Reset the IPv6 multicast replication table. +pub(crate) fn reset_ipv6(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::McastIpv6) +} diff --git a/dpd/src/table/mcast/mcast_route.rs b/dpd/src/table/mcast/mcast_route.rs new file mode 100644 index 0000000..f6abd95 --- /dev/null +++ b/dpd/src/table/mcast/mcast_route.rs @@ -0,0 +1,212 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast routing entries (on Ingress to the switch). + +use std::net::{Ipv4Addr, Ipv6Addr}; + +use crate::{table::*, Switch}; + +use super::{Ipv4MatchKey, Ipv6MatchKey}; + +use aal::ActionParse; +use aal_macros::*; +use oxnet::Ipv6Net; +use slog::debug; + +/// IPv4 Table for multicast routing entries. +pub(crate) const IPV4_TABLE_NAME: &str = + "pipe.Ingress.l3_router.MulticastRouter4.tbl"; +/// IPv6 Table for multicast routing entries. +pub(crate) const IPV6_TABLE_NAME: &str = + "pipe.Ingress.l3_router.MulticastRouter6.tbl"; + +#[derive(ActionParse, Debug)] +enum Ipv4Action { + #[action_xlate(name = "forward")] + Forward, + #[action_xlate(name = "forward_vlan")] + ForwardVLAN { vlan_id: u16 }, +} + +#[derive(ActionParse, Debug)] +enum Ipv6Action { + #[action_xlate(name = "forward")] + Forward, + #[action_xlate(name = "forward_vlan")] + ForwardVLAN { vlan_id: u16 }, +} + +/// Add an IPv4 multicast route entry to the routing table, keyed on +/// `route`, with an optional `vlan_id`. +pub(crate) fn add_ipv4_entry( + s: &Switch, + route: Ipv4Addr, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(route); + + let action_data = match vlan_id { + None => Ipv4Action::Forward, + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + Ipv4Action::ForwardVLAN { vlan_id } + } + }; + + debug!( + s.log, + "add multicast route entry {} -> {:?}", route, action_data + ); + + s.table_entry_add(TableType::RouteIpv4Mcast, &match_key, &action_data) +} + +/// Update an IPv4 multicast route entry in the routing table. +pub(crate) fn update_ipv4_entry( + s: &Switch, + route: Ipv4Addr, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(route); + let action_data = match vlan_id { + None => Ipv4Action::Forward, + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + Ipv4Action::ForwardVLAN { vlan_id } + } + }; + + debug!( + s.log, + "update multicast route entry {} -> {:?}", route, action_data + ); + + s.table_entry_update(TableType::RouteIpv4Mcast, &match_key, &action_data) +} + +/// Delete an IPv4 multicast route entry from table, keyed on +/// `route`. 
+pub(crate) fn del_ipv4_entry(s: &Switch, route: Ipv4Addr) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(route); + + debug!(s.log, "delete multicast route entry {}", match_key); + + s.table_entry_del(TableType::RouteIpv4Mcast, &match_key) +} + +/// Dump the IPv4 multicast routing table's contents. +pub(crate) fn ipv4_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::RouteIpv4Mcast) +} + +/// Fetch the IPv4 multicast routing table's counters. +pub(crate) fn ipv4_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::RouteIpv4Mcast) +} + +/// Reset the IPv4 multicast routing table. +pub(crate) fn reset_ipv4(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::RouteIpv4Mcast) +} + +/// Add an IPv6 multicast route entry to the routing table, keyed on +/// `route`, with an optional `vlan_id`. +pub(crate) fn add_ipv6_entry( + s: &Switch, + route: Ipv6Addr, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(route); + let internal_ip = Ipv6Net::new_unchecked(route, 128); + + // Admin-scoped multicast and unique local addresses are internal to the rack + // and don't require VLAN tagging, so always use Forward action + let action_data: Ipv6Action = if internal_ip.is_admin_scoped_multicast() + || internal_ip.is_unique_local() + { + Ipv6Action::Forward + } else { + match vlan_id { + None => Ipv6Action::Forward, + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + Ipv6Action::ForwardVLAN { vlan_id } + } + } + }; + + debug!( + s.log, + "add multicast route entry {} -> {:?}", route, action_data + ); + + s.table_entry_add(TableType::RouteIpv6Mcast, &match_key, &action_data) +} + +/// Update an IPv6 multicast route entry in the routing table. +pub(crate) fn update_ipv6_entry( + s: &Switch, + route: Ipv6Addr, + vlan_id: Option, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(route); + let internal_ip = Ipv6Net::new_unchecked(route, 128); + + // Admin-scoped multicast and unique local addresses are internal to the rack + // and don't require VLAN tagging, so always use Forward action + let action_data: Ipv6Action = if internal_ip.is_admin_scoped_multicast() + || internal_ip.is_unique_local() + { + Ipv6Action::Forward + } else { + match vlan_id { + None => Ipv6Action::Forward, + Some(vlan_id) => { + common::network::validate_vlan(vlan_id)?; + Ipv6Action::ForwardVLAN { vlan_id } + } + } + }; + + debug!( + s.log, + "update multicast route entry {} -> {:?}", route, action_data + ); + + s.table_entry_update(TableType::RouteIpv6Mcast, &match_key, &action_data) +} + +/// Delete an IPv6 multicast entry from routing table, keyed on +/// `route`. +pub(crate) fn del_ipv6_entry(s: &Switch, route: Ipv6Addr) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(route); + + debug!(s.log, "delete multicast route entry {}", match_key); + + s.table_entry_del(TableType::RouteIpv6Mcast, &match_key) +} + +/// Dump the IPv6 multicast routing table's contents. +pub(crate) fn ipv6_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::RouteIpv6Mcast) +} + +/// Fetch the IPv6 multicast routing table's counters. +pub(crate) fn ipv6_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::RouteIpv6Mcast) +} + +/// Reset the IPv6 multicast routing table. 
+pub(crate) fn reset_ipv6(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::RouteIpv6Mcast) +} diff --git a/dpd/src/table/mcast/mcast_src_filter.rs b/dpd/src/table/mcast/mcast_src_filter.rs new file mode 100644 index 0000000..c058640 --- /dev/null +++ b/dpd/src/table/mcast/mcast_src_filter.rs @@ -0,0 +1,181 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Table operations for multicast source filter entries. + +use std::{ + fmt, + net::{Ipv4Addr, Ipv6Addr}, +}; + +use crate::{table::*, Switch}; + +use aal::{ActionParse, MatchParse}; +use aal_macros::*; +use oxnet::Ipv4Net; +use slog::debug; + +/// IPv4 Table for multicast source filter entries. +pub(crate) const IPV4_TABLE_NAME: &str = + "pipe.Ingress.mcast_ingress.mcast_source_filter_ipv4"; +/// IPv6 Table for multicast source filter entries. +pub(crate) const IPV6_TABLE_NAME: &str = + "pipe.Ingress.mcast_ingress.mcast_source_filter_ipv6"; + +#[derive(MatchParse, Hash)] +struct Ipv4MatchKey { + #[match_xlate(name = "src_addr", type = "lpm")] + src_addr: Ipv4Net, + dst_addr: Ipv4Addr, +} + +impl Ipv4MatchKey { + fn new(src_addr: Ipv4Net, dst_addr: Ipv4Addr) -> Self { + Self { src_addr, dst_addr } + } +} + +impl fmt::Display for Ipv4MatchKey { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} -> {}", self.src_addr, self.dst_addr) + } +} + +#[derive(MatchParse, Hash)] +struct Ipv6MatchKey { + src_addr: Ipv6Addr, + dst_addr: Ipv6Addr, +} + +impl Ipv6MatchKey { + fn new(src_addr: Ipv6Addr, dst_addr: Ipv6Addr) -> Self { + Self { src_addr, dst_addr } + } +} + +impl fmt::Display for Ipv6MatchKey { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} -> {}", self.src_addr, self.dst_addr) + } +} + +#[derive(ActionParse, Debug)] +enum Ipv4Action { + #[action_xlate(name = "allow_source_mcastv4")] + AllowSrc, +} + +#[derive(ActionParse, Debug)] +enum Ipv6Action { + #[action_xlate(name = "allow_source_mcastv6")] + AllowSrc, +} + +/// Add a source filter entry for IPv4 multicast traffic: +/// `src_addr, dst_addr -> allow_source_mcastv4`. +pub(crate) fn add_ipv4_entry( + s: &Switch, + src_addr: Ipv4Net, + dst_addr: Ipv4Addr, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(src_addr, dst_addr); + let action_data = Ipv4Action::AllowSrc; + + debug!( + s.log, + "add source filter entry {} -> {:?}", src_addr, action_data + ); + + s.table_entry_add(TableType::McastIpv4SrcFilter, &match_key, &action_data) +} + +/// Delete a source filter entry for IPv4 multicast traffic, keyed on +/// `src_addr, dst_addr`. +pub(crate) fn del_ipv4_entry( + s: &Switch, + src_addr: Ipv4Net, + dst_addr: Ipv4Addr, +) -> DpdResult<()> { + let match_key = Ipv4MatchKey::new(src_addr, dst_addr); + + debug!( + s.log, + "delete source filter entry {} -> {}", src_addr, dst_addr + ); + + s.table_entry_del(TableType::McastIpv4SrcFilter, &match_key) +} + +/// Dump the IPv4 multicast source filter table's contents. +pub(crate) fn ipv4_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::McastIpv4SrcFilter) +} + +/// Fetch the IPv4 multicast source filter table's counters. +pub(crate) fn ipv4_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::McastIpv4SrcFilter) +} + +/// Reset the IPv4 multicast source filter table. 
+pub(crate) fn reset_ipv4(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::McastIpv4SrcFilter) +} + +/// Add a source filter entry for IPv6 multicast traffic: +/// `src_addr, dst_addr -> allow_source_mcastv6`. +pub(crate) fn add_ipv6_entry( + s: &Switch, + src_addr: Ipv6Addr, + dst_addr: Ipv6Addr, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(src_addr, dst_addr); + let action_data = Ipv6Action::AllowSrc; + + debug!( + s.log, + "add source filter entry {} -> {:?}", src_addr, action_data + ); + + s.table_entry_add(TableType::McastIpv6SrcFilter, &match_key, &action_data) +} + +/// Delete a source filter entry for IPv6 multicast traffic, keyed on +/// `src_addr, dst_addr`. +pub(crate) fn del_ipv6_entry( + s: &Switch, + src_addr: Ipv6Addr, + dst_addr: Ipv6Addr, +) -> DpdResult<()> { + let match_key = Ipv6MatchKey::new(src_addr, dst_addr); + + debug!( + s.log, + "delete source filter entry {} -> {}", src_addr, dst_addr + ); + + s.table_entry_del(TableType::McastIpv6SrcFilter, &match_key) +} + +/// Dump the IPv6 multicast source filter table's contents. +pub(crate) fn ipv6_table_dump(s: &Switch) -> DpdResult { + s.table_dump::(TableType::McastIpv6SrcFilter) +} + +/// Fetch the IPv6 multicast source filter table's counters. +pub(crate) fn ipv6_counter_fetch( + s: &Switch, + force_sync: bool, +) -> DpdResult> { + s.counter_fetch::(force_sync, TableType::McastIpv6SrcFilter) +} + +/// Reset the IPv6 multicast source filter table. +pub(crate) fn reset_ipv6(s: &Switch) -> DpdResult<()> { + s.table_clear(TableType::McastIpv6SrcFilter) +} diff --git a/dpd/src/table/mcast/mod.rs b/dpd/src/table/mcast/mod.rs new file mode 100644 index 0000000..ee3ff5b --- /dev/null +++ b/dpd/src/table/mcast/mod.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/ +// +// Copyright 2025 Oxide Computer Company + +//! Multicast table operations. 
+ +use std::{ + convert::TryInto, + fmt, + net::{Ipv4Addr, Ipv6Addr}, +}; + +use aal::MatchParse; +use aal_macros::*; + +pub(crate) mod mcast_egress; +pub(crate) mod mcast_nat; +pub(crate) mod mcast_port_mac; +pub(crate) mod mcast_replication; +pub(crate) mod mcast_route; +pub(crate) mod mcast_src_filter; + +#[derive(MatchParse, Hash)] +struct Ipv4MatchKey { + dst_addr: Ipv4Addr, +} + +impl Ipv4MatchKey { + fn new(dst_addr: Ipv4Addr) -> Self { + Self { dst_addr } + } +} + +impl fmt::Display for Ipv4MatchKey { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.dst_addr) + } +} + +#[derive(MatchParse, Hash)] +struct Ipv6MatchKey { + dst_addr: Ipv6Addr, +} + +impl Ipv6MatchKey { + pub(crate) fn new(dst_addr: Ipv6Addr) -> Self { + Self { dst_addr } + } +} + +impl fmt::Display for Ipv6MatchKey { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.dst_addr) + } +} diff --git a/dpd/src/table/mod.rs b/dpd/src/table/mod.rs index cd35113..fa1560e 100644 --- a/dpd/src/table/mod.rs +++ b/dpd/src/table/mod.rs @@ -7,7 +7,7 @@ use std::convert::TryFrom; use std::hash::Hash; -use slog::debug; +use slog::{debug, error, info}; use crate::types::*; use crate::views; @@ -15,8 +15,10 @@ use crate::Switch; use aal::ActionParse; use aal::MatchParse; use aal::TableOps; +use common::network::MacAddr; pub mod arp_ipv4; +pub mod mcast; pub mod nat; pub mod neighbor_ipv6; pub mod port_ip; @@ -25,7 +27,7 @@ pub mod port_nat; pub mod route_ipv4; pub mod route_ipv6; -const NAME_TO_TYPE: [(&str, TableType); 11] = [ +const NAME_TO_TYPE: [(&str, TableType); 21] = [ (route_ipv4::INDEX_TABLE_NAME, TableType::RouteIdxIpv4), (route_ipv4::FORWARD_TABLE_NAME, TableType::RouteFwdIpv4), (route_ipv6::TABLE_NAME, TableType::RouteIpv6), @@ -37,6 +39,43 @@ const NAME_TO_TYPE: [(&str, TableType); 11] = [ (nat::IPV4_TABLE_NAME, TableType::NatIngressIpv4), (nat::IPV6_TABLE_NAME, TableType::NatIngressIpv6), (port_nat::TABLE_NAME, TableType::NatOnly), + ( + mcast::mcast_replication::IPV6_TABLE_NAME, + TableType::McastIpv6, + ), + ( + mcast::mcast_src_filter::IPV4_TABLE_NAME, + TableType::McastIpv4SrcFilter, + ), + ( + mcast::mcast_src_filter::IPV6_TABLE_NAME, + TableType::McastIpv6SrcFilter, + ), + ( + mcast::mcast_nat::IPV4_TABLE_NAME, + TableType::NatIngressIpv4Mcast, + ), + ( + mcast::mcast_nat::IPV6_TABLE_NAME, + TableType::NatIngressIpv6Mcast, + ), + ( + mcast::mcast_route::IPV4_TABLE_NAME, + TableType::RouteIpv4Mcast, + ), + ( + mcast::mcast_route::IPV6_TABLE_NAME, + TableType::RouteIpv6Mcast, + ), + (mcast::mcast_port_mac::TABLE_NAME, TableType::PortMacMcast), + ( + mcast::mcast_egress::DECAP_PORTS_TABLE_NAME, + TableType::McastEgressDecapPorts, + ), + ( + mcast::mcast_egress::PORT_ID_TABLE_NAME, + TableType::McastEgressPortMapping, + ), ]; /// Basic statistics about p4 table usage @@ -79,7 +118,7 @@ impl TableUsage { } } -/// A p4 table +/// A P4 table. 
pub struct Table { /// Name of the table pub name: String, @@ -238,8 +277,40 @@ pub fn get_entries(switch: &Switch, name: String) -> DpdResult { TableType::NatIngressIpv6 => nat::ipv6_table_dump(switch), TableType::PortIpv4 => port_ip::ipv4_table_dump(switch), TableType::PortIpv6 => port_ip::ipv6_table_dump(switch), - TableType::PortMac => port_mac::table_dump(switch), + TableType::PortMac => { + MacOps::::table_dump(switch) + } TableType::NatOnly => port_nat::table_dump(switch), + TableType::McastIpv6 => { + mcast::mcast_replication::ipv6_table_dump(switch) + } + TableType::McastIpv4SrcFilter => { + mcast::mcast_src_filter::ipv4_table_dump(switch) + } + TableType::McastIpv6SrcFilter => { + mcast::mcast_src_filter::ipv6_table_dump(switch) + } + TableType::NatIngressIpv4Mcast => { + mcast::mcast_nat::ipv4_table_dump(switch) + } + TableType::NatIngressIpv6Mcast => { + mcast::mcast_nat::ipv6_table_dump(switch) + } + TableType::RouteIpv4Mcast => { + mcast::mcast_route::ipv4_table_dump(switch) + } + TableType::RouteIpv6Mcast => { + mcast::mcast_route::ipv6_table_dump(switch) + } + TableType::PortMacMcast => { + MacOps::::table_dump(switch) + } + TableType::McastEgressDecapPorts => { + mcast::mcast_egress::bitmap_table_dump(switch) + } + TableType::McastEgressPortMapping => { + mcast::mcast_egress::port_mapping_table_dump(switch) + } } } @@ -271,6 +342,34 @@ pub fn get_counters( TableType::PortIpv4 => port_ip::ipv4_counter_fetch(switch, force_sync), TableType::PortIpv6 => port_ip::ipv6_counter_fetch(switch, force_sync), TableType::NatOnly => port_nat::counter_fetch(switch, force_sync), + TableType::McastIpv6 => { + mcast::mcast_replication::ipv6_counter_fetch(switch, force_sync) + } + TableType::McastIpv4SrcFilter => { + mcast::mcast_src_filter::ipv4_counter_fetch(switch, force_sync) + } + TableType::McastIpv6SrcFilter => { + mcast::mcast_src_filter::ipv6_counter_fetch(switch, force_sync) + } + TableType::NatIngressIpv4Mcast => { + mcast::mcast_nat::ipv4_counter_fetch(switch, force_sync) + } + TableType::NatIngressIpv6Mcast => { + mcast::mcast_nat::ipv6_counter_fetch(switch, force_sync) + } + TableType::RouteIpv4Mcast => { + mcast::mcast_route::ipv4_counter_fetch(switch, force_sync) + } + TableType::RouteIpv6Mcast => { + mcast::mcast_route::ipv6_counter_fetch(switch, force_sync) + } + TableType::McastEgressDecapPorts => { + mcast::mcast_egress::bitmap_counter_fetch(switch, force_sync) + } + TableType::McastEgressPortMapping => { + mcast::mcast_egress::port_mapping_counter_fetch(switch, force_sync) + } + // There is no counter in the PortMac table, as it duplicates data // already available in the rmon egress counter. 
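+        // The multicast copy of that table (PortMacMcast) is likewise left without a
+        // counter arm, so it falls through to the NoSuchTable error below.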
_ => Err(DpdError::NoSuchTable(name)), @@ -281,7 +380,9 @@ pub fn get_counters( pub enum TableType { RouteIdxIpv4, RouteFwdIpv4, + RouteIpv4Mcast, RouteIpv6, + RouteIpv6Mcast, ArpIpv4, NeighborIpv6, PortMac, @@ -290,6 +391,14 @@ pub enum TableType { NatIngressIpv4, NatIngressIpv6, NatOnly, + McastIpv6, + McastIpv4SrcFilter, + McastIpv6SrcFilter, + NatIngressIpv4Mcast, + NatIngressIpv6Mcast, + PortMacMcast, + McastEgressDecapPorts, + McastEgressPortMapping, } impl TryFrom<&str> for TableType { @@ -316,3 +425,133 @@ pub fn init(switch: &mut Switch) -> anyhow::Result<()> { Ok(()) } + +// Common trait for Mac-related table rewriting +pub trait MacTable { + // The table type identifier + fn table_type() -> TableType; + fn table_name() -> &'static str; +} + +#[derive(aal_macros::MatchParse, Debug, Hash)] +struct MacMatchKey { + port: u16, +} + +#[derive(aal_macros::ActionParse, Debug)] +enum MacAction { + #[action_xlate(name = "rewrite")] + Rewrite { mac: MacAddr }, +} + +// Generic MAC operations that work with any table that implements MacTable +pub struct MacOps { + _phantom: std::marker::PhantomData, +} + +impl MacOps { + /// Update an _existing_ entry in the MAC table. + #[allow(dead_code)] + pub(crate) fn mac_update( + s: &Switch, + port: u16, + mac: MacAddr, + ) -> DpdResult<()> { + let match_key = MacMatchKey { port }; + let action_data = MacAction::Rewrite { mac }; + + match s.table_entry_update(T::table_type(), &match_key, &action_data) { + Ok(_) => { + info!( + s.log, + "update mac on {} in table {}: {}", + port, + T::table_name(), + mac + ); + Ok(()) + } + Err(e) => { + error!( + s.log, + "update mac on {} in table {}: {} failed: {:?}", + port, + T::table_name(), + mac, + e + ); + Err(e) + } + } + } + + /// Add a new entry to the MAC table. + /// + /// An error is returned if the entry already exists. Use `mac_update` instead. + pub fn mac_set(s: &Switch, port: u16, mac: MacAddr) -> DpdResult<()> { + let match_key = MacMatchKey { port }; + let action_data = MacAction::Rewrite { mac }; + + match s.table_entry_add(T::table_type(), &match_key, &action_data) { + Ok(_) => { + info!( + s.log, + "set mac on {} in table {}: {}", + port, + T::table_name(), + mac + ); + Ok(()) + } + Err(e) => { + error!( + s.log, + "set mac on {} in table {}: {} failed: {:?}", + port, + T::table_name(), + mac, + e + ); + Err(e) + } + } + } + + /// Remove an entry from the MAC table. + pub fn mac_clear(s: &Switch, port: u16) -> DpdResult<()> { + let match_key = MacMatchKey { port }; + + match s.table_entry_del(T::table_type(), &match_key) { + Ok(_) => { + info!( + s.log, + "cleared mac on {} in table {}", + port, + T::table_name() + ); + Ok(()) + } + Err(e) => { + error!( + s.log, + "clear mac on {} in table {} failed: {:?}", + port, + T::table_name(), + e + ); + Err(e) + } + } + } + + pub fn table_dump(s: &Switch) -> DpdResult { + s.table_dump::(T::table_type()) + } + + /// Remove all entries from the MAC table. 
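+    // The concrete table is selected via the type parameter, e.g. (usage sketch)
+    // MacOps::<port_mac::PortMacTable>::reset(s) clears the ingress rewrite table,
+    // while MacOps::<mcast::mcast_port_mac::PortMacTable>::reset(s) clears its
+    // egress multicast counterpart.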
+ #[cfg_attr(not(feature = "tofino_asic"), allow(dead_code))] + pub fn reset(s: &Switch) -> DpdResult<()> { + info!(s.log, "reset port macs in table {}", T::table_name()); + s.table_clear(T::table_type()) + } +} diff --git a/dpd/src/table/port_ip.rs b/dpd/src/table/port_ip.rs index 06cad94..0a3d322 100644 --- a/dpd/src/table/port_ip.rs +++ b/dpd/src/table/port_ip.rs @@ -25,15 +25,15 @@ pub const IPV6_TABLE_NAME: &str = "pipe.Ingress.filter.switch_ipv6_addr"; struct Ipv4MatchKey { #[match_xlate(name = "orig_dst_ipv4")] dst_addr: Ipv4Addr, - #[match_xlate(name = "in_port", type = "mask")] - port: MatchMask, + #[match_xlate(name = "ingress_port", type = "mask")] + in_port: MatchMask, } #[derive(MatchParse, Hash)] struct Ipv6MatchKey { dst_addr: Ipv6Addr, - #[match_xlate(name = "in_port", type = "mask")] - port: MatchMask, + #[match_xlate(name = "ingress_port", type = "mask")] + in_port: MatchMask, } #[derive(ActionParse)] @@ -72,14 +72,14 @@ const REPAIR_ATTEMPTS: usize = 3; fn match_keys_ipv4(ipv4: Ipv4Addr, port: u16) -> (Ipv4MatchKey, Ipv4MatchKey) { let claim_key = Ipv4MatchKey { dst_addr: ipv4, - port: MatchMask { + in_port: MatchMask { val: port.into(), mask: 0x1ffu16.into(), }, }; let drop_key = Ipv4MatchKey { dst_addr: ipv4, - port: MatchMask { + in_port: MatchMask { val: port.into(), mask: 0u16.into(), }, @@ -90,14 +90,14 @@ fn match_keys_ipv4(ipv4: Ipv4Addr, port: u16) -> (Ipv4MatchKey, Ipv4MatchKey) { fn match_keys_ipv6(ipv6: Ipv6Addr, port: u16) -> (Ipv6MatchKey, Ipv6MatchKey) { let claim_key = Ipv6MatchKey { dst_addr: ipv6, - port: MatchMask { + in_port: MatchMask { val: port.into(), mask: 0x1ffu16.into(), }, }; let drop_key = Ipv6MatchKey { dst_addr: ipv6, - port: MatchMask { + in_port: MatchMask { val: 0u16.into(), mask: 0u16.into(), }, @@ -108,7 +108,7 @@ fn match_keys_ipv6(ipv6: Ipv6Addr, port: u16) -> (Ipv6MatchKey, Ipv6MatchKey) { pub fn loopback_ipv4_add(s: &Switch, ipv4: Ipv4Addr) -> DpdResult<()> { let claim_key = Ipv4MatchKey { dst_addr: ipv4, - port: MatchMask { + in_port: MatchMask { val: 0u16.into(), mask: 0u16.into(), }, @@ -125,7 +125,7 @@ pub fn loopback_ipv4_add(s: &Switch, ipv4: Ipv4Addr) -> DpdResult<()> { pub fn loopback_ipv4_delete(s: &Switch, ipv4: Ipv4Addr) -> DpdResult<()> { let claim_key = Ipv4MatchKey { dst_addr: ipv4, - port: MatchMask { + in_port: MatchMask { val: 0u16.into(), mask: 0u16.into(), }, @@ -142,7 +142,7 @@ pub fn loopback_ipv4_delete(s: &Switch, ipv4: Ipv4Addr) -> DpdResult<()> { pub fn loopback_ipv6_add(s: &Switch, ipv6: Ipv6Addr) -> DpdResult<()> { let claim_key = Ipv6MatchKey { dst_addr: ipv6, - port: MatchMask { + in_port: MatchMask { val: 0u16.into(), mask: 0u16.into(), }, @@ -159,7 +159,7 @@ pub fn loopback_ipv6_add(s: &Switch, ipv6: Ipv6Addr) -> DpdResult<()> { pub fn loopback_ipv6_delete(s: &Switch, ipv6: Ipv6Addr) -> DpdResult<()> { let claim_key = Ipv6MatchKey { dst_addr: ipv6, - port: MatchMask { + in_port: MatchMask { val: 0u16.into(), mask: 0u16.into(), }, diff --git a/dpd/src/table/port_mac.rs b/dpd/src/table/port_mac.rs index ad97370..267a12c 100644 --- a/dpd/src/table/port_mac.rs +++ b/dpd/src/table/port_mac.rs @@ -4,89 +4,18 @@ // // Copyright 2025 Oxide Computer Company -use std::convert::TryInto; - -use slog::{error, info}; - -use crate::table::*; -use crate::Switch; -use aal::{ActionParse, MatchParse}; -use aal_macros::*; -use common::network::MacAddr; +use super::{MacTable, TableType}; pub const TABLE_NAME: &str = "pipe.Ingress.mac_rewrite.mac_rewrite"; -#[derive(MatchParse, Debug, Hash)] -struct MatchKey { - port: u16, 
-} - -#[derive(ActionParse, Debug)] -enum Action { - #[action_xlate(name = "rewrite")] - Rewrite { mac: MacAddr }, -} - -/// Update an _existing_ entry in the MAC table. -#[allow(dead_code)] -pub fn mac_update(s: &Switch, port: u16, mac: MacAddr) -> DpdResult<()> { - let match_key = MatchKey { port }; - let action_data = Action::Rewrite { mac }; - - match s.table_entry_update(TableType::PortMac, &match_key, &action_data) { - Ok(_) => { - info!(s.log, "update mac on {}: {}", port, mac); - Ok(()) - } - Err(e) => { - error!(s.log, "update mac on {}: {} failed: {:?}", port, mac, e); - Err(e) - } - } -} - -/// Add a new entry to the MAC table. -/// -/// An error is returned if the entry already exists. Use `mac_update` instead. -pub fn mac_set(s: &Switch, port: u16, mac: MacAddr) -> DpdResult<()> { - let match_key = MatchKey { port }; - let action_data = Action::Rewrite { mac }; +pub struct PortMacTable; - match s.table_entry_add(TableType::PortMac, &match_key, &action_data) { - Ok(_) => { - info!(s.log, "set mac on {}: {}", port, mac); - Ok(()) - } - Err(e) => { - error!(s.log, "set mac on {}: {} failed: {:?}", port, mac, e); - Err(e) - } +impl MacTable for PortMacTable { + fn table_type() -> TableType { + TableType::PortMac } -} -/// Remove an entry from the MAC table. -pub fn mac_clear(s: &Switch, port: u16) -> DpdResult<()> { - let match_key = MatchKey { port }; - match s.table_entry_del(TableType::PortMac, &match_key) { - Ok(_) => { - info!(s.log, "cleared mac on {}", port); - Ok(()) - } - Err(e) => { - error!(s.log, "clear mac on {} failed: {:?}", port, e); - Err(e) - } + fn table_name() -> &'static str { + TABLE_NAME } } - -pub fn table_dump(s: &Switch) -> DpdResult { - s.table_dump::(TableType::PortMac) -} - -/// Remove all entries from the MAC table. 
-#[cfg_attr(not(feature = "tofino_asic"), allow(dead_code))] -pub fn reset(s: &Switch) -> DpdResult<()> { - info!(s.log, "reset port macs"); - - s.table_clear(TableType::PortMac) -} diff --git a/dpd/src/table/port_nat.rs b/dpd/src/table/port_nat.rs index 9e74251..64b5a5c 100644 --- a/dpd/src/table/port_nat.rs +++ b/dpd/src/table/port_nat.rs @@ -17,6 +17,7 @@ pub const TABLE_NAME: &str = "pipe.Ingress.nat_ingress.nat_only"; #[derive(MatchParse, Debug, Hash)] struct MatchKey { + #[match_xlate(name = "ingress_port")] in_port: u16, } diff --git a/dpd/src/table/route_ipv4.rs b/dpd/src/table/route_ipv4.rs index 534bc5e..62d953e 100644 --- a/dpd/src/table/route_ipv4.rs +++ b/dpd/src/table/route_ipv4.rs @@ -55,22 +55,23 @@ enum IndexAction { Index { idx: u16, slots: u8 }, } -// Add an entry to the route->index table +/// Add an entry to the route->index table pub fn add_route_index( s: &Switch, cidr: &Ipv4Net, idx: u16, slots: u8, ) -> DpdResult<()> { - let match_key = RouteKey { dst_addr: *cidr }; let action_data = IndexAction::Index { idx, slots }; + let match_key = RouteKey { dst_addr: *cidr }; + match s.table_entry_add(TableType::RouteIdxIpv4, &match_key, &action_data) { Ok(()) => { info!(s.log, "added ipv4 route entry"; "route" => %cidr, "index" => %idx, - "slots" => %slots); + "slots" => %slots); Ok(()) } Err(e) => { @@ -84,7 +85,7 @@ pub fn add_route_index( } } -// Remove an entry from the route->index table +/// Remove an entry from the route->index table pub fn delete_route_index(s: &Switch, cidr: &Ipv4Net) -> DpdResult<()> { let match_key = RouteKey { dst_addr: *cidr }; diff --git a/dpd/src/types.rs b/dpd/src/types.rs index 773ebbc..96c9768 100644 --- a/dpd/src/types.rs +++ b/dpd/src/types.rs @@ -85,6 +85,10 @@ pub enum DpdError { Oximeter(String), #[error("No switch identifiers available")] NoSwitchIdentifiers, + #[error("Multicast group failure: {}", .0)] + McastGroupFailure(String), + #[error("Resource exhausted: {}", .0)] + ResourceExhausted(String), } impl From for DpdError { @@ -263,6 +267,12 @@ impl convert::From for dropshot::HttpError { e @ DpdError::NoSwitchIdentifiers => { dropshot::HttpError::for_unavail(None, format!("{e}")) } + DpdError::McastGroupFailure(e) => { + dropshot::HttpError::for_internal_error(e.to_string()) + } + DpdError::ResourceExhausted(e) => { + dropshot::HttpError::for_unavail(None, e) + } } } } diff --git a/openapi/dpd.json b/openapi/dpd.json index fcadc34..cea174d 100644 --- a/openapi/dpd.json +++ b/openapi/dpd.json @@ -1084,6 +1084,388 @@ } } }, + "/multicast/external-groups": { + "post": { + "summary": "Create an external-only multicast group configuration.", + "description": "External-only groups are used for IPv4 and non-admin-scoped IPv6 multicast traffic that doesn't require replication infrastructure. 
These groups use simple forwarding tables and require a NAT target.", + "operationId": "multicast_group_create_external", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupCreateExternalEntry" + } + } + }, + "required": true + }, + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast/external-groups/{group_ip}": { + "put": { + "summary": "Update an external-only multicast group configuration for a given group IP address.", + "description": "External-only groups are used for IPv4 and non-admin-scoped IPv6 multicast traffic that doesn't require replication infrastructure.", + "operationId": "multicast_group_update_external", + "parameters": [ + { + "in": "path", + "name": "group_ip", + "required": true, + "schema": { + "type": "string", + "format": "ip" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupUpdateExternalEntry" + } + } + }, + "required": true + }, + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast/groups": { + "get": { + "summary": "List all multicast groups.", + "operationId": "multicast_groups_list", + "parameters": [ + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponseResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "post": { + "summary": "Create an internal multicast group configuration.", + "description": "Internal groups are used for admin-scoped IPv6 multicast traffic that requires replication infrastructure. 
These groups support both external and underlay members with full replication capabilities.", + "operationId": "multicast_group_create", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupCreateEntry" + } + } + }, + "required": true + }, + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Reset all multicast group configurations.", + "operationId": "multicast_reset", + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast/groups/{group_ip}": { + "get": { + "summary": "Get the multicast group configuration for a given group IP address.", + "operationId": "multicast_group_get", + "parameters": [ + { + "in": "path", + "name": "group_ip", + "required": true, + "schema": { + "type": "string", + "format": "ip" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Update an internal multicast group configuration for a given group IP address.", + "description": "Internal groups are used for admin-scoped IPv6 multicast traffic that requires replication infrastructure with external and underlay members.", + "operationId": "multicast_group_update", + "parameters": [ + { + "in": "path", + "name": "group_ip", + "required": true, + "schema": { + "type": "string", + "format": "ip" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupUpdateEntry" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Delete a multicast group configuration by IP address.", + "operationId": "multicast_group_delete", + "parameters": [ + { + "in": "path", + "name": "group_ip", + "required": true, + "schema": { + "type": "string", + "format": "ip" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast/tags/{tag}": { + "get": { + "summary": "List all multicast groups with a given tag.", + "operationId": "multicast_groups_list_by_tag", + "parameters": [ + { + "in": "path", + "name": "tag", + "required": true, + "schema": { + "type": "string" + } + }, + { + "in": "query", + "name": "limit", + "description": "Maximum number of items returned by a single call", + "schema": { + "nullable": true, + "type": "integer", + "format": "uint32", + "minimum": 1 + } + }, + { + "in": "query", + "name": "page_token", + "description": "Token returned by previous call to retrieve 
the subsequent page", + "schema": { + "nullable": true, + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/MulticastGroupResponseResultsPage" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + }, + "x-dropshot-pagination": { + "required": [] + } + }, + "delete": { + "summary": "Delete all multicast groups (and associated routes) with a given tag.", + "operationId": "multicast_reset_by_tag", + "parameters": [ + { + "in": "path", + "name": "tag", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast/untagged": { + "delete": { + "summary": "Delete all multicast groups (and associated routes) without a tag.", + "operationId": "multicast_reset_untagged", + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/nat/ipv4": { "get": { "summary": "Get all of the external addresses in use for IPv4 NAT mappings.", @@ -5486,6 +5868,14 @@ "readapt_cnt" ] }, + "Direction": { + "description": "Direction a multicast group member is reached by.\n\n`External` group members must have any packet encapsulation removed before packet delivery.", + "type": "string", + "enum": [ + "Underlay", + "External" + ] + }, "ElectricalMode": { "description": "The electrical mode of a QSFP-capable port.\n\nQSFP ports can be broken out into one of several different electrical configurations or modes. This describes how the transmit/receive lanes are grouped into a single, logical link.\n\nNote that the electrical mode may only be changed if there are no links within the port, _and_ if the inserted QSFP module actually supports this mode.", "oneOf": [ @@ -5535,6 +5925,18 @@ "request_id" ] }, + "ExternalForwarding": { + "description": "Represents the forwarding configuration for external multicast traffic.", + "type": "object", + "properties": { + "vlan_id": { + "nullable": true, + "type": "integer", + "format": "uint16", + "minimum": 0 + } + } + }, "Fault": { "description": "A Fault represents a specific kind of failure, and carries some additional context. 
Currently Faults are only used to describe Link failures, but there is no reason they couldn't be used elsewhere.", "oneOf": [ @@ -5748,6 +6150,20 @@ "port_id" ] }, + "InternalForwarding": { + "description": "Represents the NAT target for multicast traffic for internal/underlay forwarding.", + "type": "object", + "properties": { + "nat_target": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/NatTarget" + } + ] + } + } + }, "IpNet": { "x-rust-type": { "crate": "oxnet", @@ -5773,6 +6189,38 @@ } ] }, + "IpSrc": { + "description": "Source filter match key for multicast traffic.", + "oneOf": [ + { + "description": "Exact match for the source IP address.", + "type": "object", + "properties": { + "Exact": { + "type": "string", + "format": "ip" + } + }, + "required": [ + "Exact" + ], + "additionalProperties": false + }, + { + "description": "Subnet match for the source IP address.", + "type": "object", + "properties": { + "Subnet": { + "$ref": "#/components/schemas/Ipv4Net" + } + }, + "required": [ + "Subnet" + ], + "additionalProperties": false + } + ] + }, "Ipv4Entry": { "description": "An IPv4 address assigned to a link.", "type": "object", @@ -7035,6 +7483,218 @@ } } }, + "MulticastGroupCreateEntry": { + "description": "A multicast group configuration for POST requests for internal (to the rack) groups.", + "type": "object", + "properties": { + "group_ip": { + "type": "string", + "format": "ipv6" + }, + "members": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastGroupMember" + } + }, + "sources": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/IpSrc" + } + }, + "tag": { + "nullable": true, + "type": "string" + } + }, + "required": [ + "group_ip", + "members" + ] + }, + "MulticastGroupCreateExternalEntry": { + "description": "A multicast group configuration for POST requests for external (to the rack) groups.", + "type": "object", + "properties": { + "group_ip": { + "type": "string", + "format": "ip" + }, + "nat_target": { + "$ref": "#/components/schemas/NatTarget" + }, + "sources": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/IpSrc" + } + }, + "tag": { + "nullable": true, + "type": "string" + }, + "vlan_id": { + "nullable": true, + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "group_ip", + "nat_target" + ] + }, + "MulticastGroupMember": { + "description": "Represents a member of a multicast group.", + "type": "object", + "properties": { + "direction": { + "$ref": "#/components/schemas/Direction" + }, + "link_id": { + "$ref": "#/components/schemas/LinkId" + }, + "port_id": { + "$ref": "#/components/schemas/PortId" + } + }, + "required": [ + "direction", + "link_id", + "port_id" + ] + }, + "MulticastGroupResponse": { + "description": "Response structure for multicast group operations.", + "type": "object", + "properties": { + "ext_fwding": { + "$ref": "#/components/schemas/ExternalForwarding" + }, + "external_group_id": { + "nullable": true, + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "group_ip": { + "type": "string", + "format": "ip" + }, + "int_fwding": { + "$ref": "#/components/schemas/InternalForwarding" + }, + "members": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastGroupMember" + } + }, + "sources": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/IpSrc" + } + }, + "tag": { + "nullable": true, + "type": "string" + }, + "underlay_group_id": { 
+ "nullable": true, + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "ext_fwding", + "group_ip", + "int_fwding", + "members" + ] + }, + "MulticastGroupResponseResultsPage": { + "description": "A single page of results", + "type": "object", + "properties": { + "items": { + "description": "list of items on this page of results", + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastGroupResponse" + } + }, + "next_page": { + "nullable": true, + "description": "token used to fetch the next page of results (if any)", + "type": "string" + } + }, + "required": [ + "items" + ] + }, + "MulticastGroupUpdateEntry": { + "description": "Represents a multicast replication entry for PUT requests for internal (to the rack) groups.", + "type": "object", + "properties": { + "members": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastGroupMember" + } + }, + "sources": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/IpSrc" + } + }, + "tag": { + "nullable": true, + "type": "string" + } + }, + "required": [ + "members" + ] + }, + "MulticastGroupUpdateExternalEntry": { + "description": "A multicast group update entry for PUT requests for external (to the rack) groups.", + "type": "object", + "properties": { + "nat_target": { + "$ref": "#/components/schemas/NatTarget" + }, + "sources": { + "nullable": true, + "type": "array", + "items": { + "$ref": "#/components/schemas/IpSrc" + } + }, + "tag": { + "nullable": true, + "type": "string" + }, + "vlan_id": { + "nullable": true, + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "nat_target" + ] + }, "NatTarget": { "description": "represents an internal NAT target", "type": "object", diff --git a/packet/src/ipv6.rs b/packet/src/ipv6.rs index 6a3f18a..3b84a42 100644 --- a/packet/src/ipv6.rs +++ b/packet/src/ipv6.rs @@ -62,9 +62,9 @@ impl Ipv6Hdr { v } - pub fn adjust_hlim(pkt: &mut Packet, delta: i8) { - let hdr = pkt.hdrs.ipv6_hdr.as_mut().unwrap(); - hdr.ipv6_hop_lim = (hdr.ipv6_hop_lim as i8 + delta) as u8; + pub fn adjust_hlim(pkt: &mut Packet, delta: i16) { + let hdr: &mut Ipv6Hdr = pkt.hdrs.ipv6_hdr.as_mut().unwrap(); + hdr.ipv6_hop_lim = (hdr.ipv6_hop_lim as i16 + delta) as u8; } } @@ -224,11 +224,11 @@ fn test_ipv6_parse() { " 3333 0000 00fb c869 cd3c 6917 86dd 6008 - 0d00 0010 11ff fe80 + 0d00 0010 11ff fe80 0000 0000 0000 0491 - 3609 ccb9 7632 ff02 + 3609 ccb9 7632 ff02 0000 0000 0000 0000 - 0000 0000 00fb 14e9 + 0000 0000 00fb 14e9 14e9 0000 e21a 0000 " ); diff --git a/tools/check_copyrights.sh b/tools/check_copyrights.sh index 07dfa97..caa5e35 100755 --- a/tools/check_copyrights.sh +++ b/tools/check_copyrights.sh @@ -37,7 +37,7 @@ function check_copyright { ANY="Copyright [0-9]+ Oxide Computer Company" grep -q "$CORRECT" $1 && return 0 - egrep -q "$ANY" $1 + egrep -q "$ANY" $1 if [ $? == 0 ]; then echo $1: Copyright with wrong year else @@ -46,7 +46,7 @@ function check_copyright { return 0 } -FILES=`git ls-files | egrep "\.(sh|xml|rs)$" | grep -v .github` +FILES=`git ls-files | egrep "\.(sh|xml|rs|p4)$" | grep -v .github` errs=0 for f in $FILES diff --git a/xtask/src/linux.rs b/xtask/src/linux.rs index cc6d0e4..966b18f 100644 --- a/xtask/src/linux.rs +++ b/xtask/src/linux.rs @@ -4,7 +4,6 @@ // // Copyright 2025 Oxide Computer Company -use std::collections::HashMap; use std::fs; use std::io::Write; use std::path::Path;