From 8ec33f2b56649ee22ca8ab70bb016ba9452c8fa8 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Mon, 2 Jun 2025 11:16:32 +0200 Subject: [PATCH 1/3] helm: add crawler_network_policy_additional_egress, resolves #2121 --- chart/templates/networkpolicies.yaml | 3 +++ chart/values.yaml | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/chart/templates/networkpolicies.yaml b/chart/templates/networkpolicies.yaml index 4c605a0b78..4f8897c537 100644 --- a/chart/templates/networkpolicies.yaml +++ b/chart/templates/networkpolicies.yaml @@ -11,6 +11,9 @@ spec: policyTypes: - Egress egress: + {{- if .Values.crawler_network_policy_additional_egress | default false -}} + {{- .Values.crawler_network_policy_additional_egress | toYaml | nindent 4 -}} + {{- end -}} {{- if .Values.crawler_network_policy_egress | default false -}} {{- .Values.crawler_network_policy_egress | toYaml | nindent 4 -}} {{- else }} diff --git a/chart/values.yaml b/chart/values.yaml index b2f320ce6f..235b40ed84 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -373,12 +373,15 @@ btrix-proxies: # crawler_fsgroup: 201407 -# optional: enable/disable crawler network policy +# optional: enable/disable crawler network policy, prevents crawler pods from accessing internal services crawler_enable_network_policy: true -# optional: replace the default crawler egress policy with your own +# optional: add additional egress rules to the default crawler network policy (See chart/templates/networkpolicies.yaml for an example) +# crawler_network_policy_additional_egress: [] + +# optional: replace the default crawler egress policy with your own egress rules (See chart/templates/networkpolicies.yaml for an example) # see chart/templates/networkpolicies.yaml for an example -# crawler_network_policy_egress: {} +# crawler_network_policy_egress: [] # time to wait for graceful stop grace_period: 1000 From 80a3da5cdb4a9de2160fdc80da47fe710b44aae9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: 
Tue, 10 Jun 2025 15:56:05 -0700 Subject: [PATCH 2/3] docs: update docs for 'crawler_network_policy_additional_egress' --- frontend/docs/docs/deploy/customization.md | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/frontend/docs/docs/deploy/customization.md b/frontend/docs/docs/deploy/customization.md index b4753a480b..4893b4c3fc 100644 --- a/frontend/docs/docs/deploy/customization.md +++ b/frontend/docs/docs/deploy/customization.md @@ -139,6 +139,8 @@ storages: When replica locations are set, the default behavior when a crawl, upload, or browser profile is deleted is that the replica files are deleted at the same time as the file in primary storage. To delay deletion of replicas, set `replica_deletion_delay_days` in the Helm chart to the number of days by which to delay replica file deletion. This feature gives Browsertrix administrators time in the event of files being deleted accidentally or maliciously to recover copies from configured replica locations. +??? info "If you are specifying a custom Minio deployment running in the same Kubernetes cluster, be sure to update the [network policy to allow access to your custom resource](#local-network-access-policy-and-custom-services)" + ## Horizontal Autoscaling Browsertrix also includes support for horizontal auto-scaling for both the backend and frontend pods. @@ -250,3 +252,37 @@ type btrixEvent = ( ``` Tracking is optional and will never expose personally identifiable information. + +## Local Network Access Policy and Custom Services + +By default, Browsertrix configures the crawlers with a network policy that restricts access to internal Kubernetes resources, to prevent the crawler from snooping around the internal network. This should be fine for crawling +public websites with the default configuration. 
+ +However, you may want to provide access to an internal IP (for example, if crawling a site deployed on a local server) or another Kubernetes service (such as a custom Minio deployment). + +To provide access, you can extend the existing network policy 'egress' with the `crawler_network_policy_additional_egress` setting. + +For example, to allow the crawler to access the `10.0.0.1/32` IP block on port 80, +and to access pods labeled `app: my-custom-minio` on port 9000 only, add: + +```yaml +crawler_network_policy_additional_egress: + - to: + - ipBlock: + cidr: 10.0.0.1/32 + ports: + - port: 80 + protocol: TCP + + - to: + - namespaceSelector: + - podSelector: + matchLabels: + app: my-custom-minio + + ports: + - port: 9000 + protocol: TCP +``` + +Refer to the [default networkpolicies.yaml](https://github.com/webrecorder/browsertrix/blob/main/chart/templates/networkpolicies.yaml) for additional examples and to the [official Kubernetes documentation for Network Policies](https://kubernetes.io/docs/concepts/services-networking/network-policies/) for more details. From 024e953241dd5d72a1ff35ad2e2d5ffc373d1993 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 10 Jun 2025 16:05:54 -0700 Subject: [PATCH 3/3] fix typo? --- frontend/docs/docs/deploy/customization.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/frontend/docs/docs/deploy/customization.md b/frontend/docs/docs/deploy/customization.md index 4893b4c3fc..5041aad8ff 100644 --- a/frontend/docs/docs/deploy/customization.md +++ b/frontend/docs/docs/deploy/customization.md @@ -275,10 +275,9 @@ crawler_network_policy_additional_egress: protocol: TCP - to: - - namespaceSelector: - - podSelector: - matchLabels: - app: my-custom-minio + - podSelector: + matchLabels: + app: my-custom-minio ports: - port: 9000