From f0e67c8bb2b885b0e3c44319b0848c6f5df49652 Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 30 Jul 2024 15:38:12 +0200
Subject: [PATCH 01/50] backend: add ssh proxies configuration

---
 backend/btrixcloud/crawlconfigs.py    | 44 +++++++++++++++++++++++++++
 backend/btrixcloud/crawlmanager.py    | 24 ++++++++++++++-
 backend/btrixcloud/crawls.py          |  5 +++
 backend/btrixcloud/k8sapi.py          |  7 ++++-
 backend/btrixcloud/main.py            |  2 ++
 backend/btrixcloud/main_op.py         |  2 ++
 backend/btrixcloud/models.py          | 32 +++++++++++++++++++
 backend/btrixcloud/operator/crawls.py | 15 ++++++++-
 backend/btrixcloud/operator/models.py |  3 ++
 chart/app-templates/crawl_job.yaml    |  3 ++
 chart/app-templates/crawler.yaml      | 35 ++++++++++++++++-----
 chart/templates/configmap.yaml        |  7 +++--
 chart/templates/secrets.yaml          |  1 +
 chart/templates/sshproxies.yaml       | 25 +++++++++++++++
 chart/values.yaml                     | 19 ++++++++++--
 15 files changed, 207 insertions(+), 17 deletions(-)
 create mode 100644 chart/templates/sshproxies.yaml

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 60fd36ea1f..706898d7b0 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -10,6 +10,7 @@
 import json
 import re
 import os
+import traceback
 from datetime import datetime
 from uuid import UUID, uuid4
 import urllib.parse
@@ -39,6 +40,8 @@
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
     CrawlConfigDeletedResponse,
+    CrawlerSSHProxy,
+    CrawlerSSHProxies,
 )
 from .utils import dt_now, slug_from_name
@@ -122,6 +125,28 @@ def __init__(
 
         self.crawler_channels = CrawlerChannels(channels=channels)
 
+        self.crawler_ssh_proxies_map = {}
+        with open(
+            os.environ["CRAWLER_SSH_PROXIES_JSON"], encoding="utf-8"
+        ) as fh:
+            ssh_proxy_list: list[dict] = json.loads(fh.read())
+            for ssh_proxy_data in ssh_proxy_list:
+                ssh_proxy = CrawlerSSHProxy(
+                    id=ssh_proxy_data["id"],
+                    country_code=ssh_proxy_data["country_code"],
+                    hostname=ssh_proxy_data["hostname"],
+                    port=ssh_proxy_data.get("port", 22),
+                    username=ssh_proxy_data["username"],
+                )
+
+                self.crawler_ssh_proxies_map[ssh_proxy.id] = (
+                    ssh_proxy
+                )
+
+        self.crawler_ssh_proxies = CrawlerSSHProxies(
+            servers=list(self.crawler_ssh_proxies_map.values())
+        )
+
         if "default" not in self.crawler_images_map:
             raise TypeError("The channel list must include a 'default' channel")
@@ -218,6 +243,7 @@ async def add_crawl_config(
             profileid=profileid,
             crawlerChannel=config_in.crawlerChannel,
             crawlFilenameTemplate=config_in.crawlFilenameTemplate,
+            crawlerSSHProxyId=config_in.crawlerSSHProxyId,
         )
 
         if config_in.runNow:
@@ -329,6 +355,10 @@ async def update_crawl_config(
             and ((not update.profileid) != (not orig_crawl_config.profileid))
         )
 
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "crawlerSSHProxy")
+        )
+
         metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
         metadata_changed = metadata_changed or self.check_attr_changed(
             orig_crawl_config, update, "description"
         )
@@ -849,6 +879,7 @@ async def run_now_internal(
         except Exception as exc:
             # pylint: disable=raise-missing-from
+            print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
     async def set_config_current_crawl_info(
@@ -898,6 +929,12 @@ def get_channel_crawler_image(
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_crawler_ssh_proxy(
+        self, ssh_proxy_id:str
+    ) -> Optional[CrawlerSSHProxy]:
+        """Get crawlerSSHProxy by id"""
+        return self.crawler_ssh_proxies_map.get(ssh_proxy_id)
+
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
         """Generate WARC prefix slug from org slug, name or url
         if no name is provided, hostname is used from url, otherwise
@@ -1068,6 +1105,13 @@ async def get_crawler_channels(
     ):
         return ops.crawler_channels
 
+    @router.get("/crawler-ssh-proxies", response_model=CrawlerSSHProxies)
+    async def get_crawler_ssh_proxies(
+        # pylint: disable=unused-argument
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return ops.crawler_ssh_proxies
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index 6bb3d18879..926876714b 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -4,7 +4,7 @@
 import asyncio
 import secrets
 
-from typing import Optional, Dict
+from typing import Optional, Dict, TYPE_CHECKING, cast
 from datetime import timedelta
 
 from fastapi import HTTPException
@@ -14,15 +14,27 @@
 
 from .models import StorageRef, CrawlConfig, BgJobType
 
+if TYPE_CHECKING:
+    from .crawlconfigs import CrawlConfigOps
+else:
+    CrawlConfigOps = object
+
 
 # ============================================================================
 class CrawlManager(K8sAPI):
     """abstract crawl manager"""
 
+    crawlconfigs: CrawlConfigOps
+
     def __init__(self):
         super().__init__()
 
         self.loop = asyncio.get_running_loop()
 
+        self.crawlconfigs = cast(CrawlConfigOps, None)
+
+    def set_crawlconfig(self, crawlconfigs):
+        """set crawlconfig ops"""
+        self.crawlconfigs = crawlconfigs
+
     # pylint: disable=too-many-arguments
     async def run_profile_browser(
@@ -125,6 +137,15 @@ async def create_crawl_job(
 
         await self.has_storage_secret(storage_secret)
 
+        crawler_ssh_proxy = self.crawlconfigs.get_crawler_ssh_proxy(
+            crawlconfig.crawlerSSHProxyId
+        )
+        if (
+            crawlconfig.crawlerSSHProxyId is not None
+            and len(crawlconfig.crawlerSSHProxyId) > 0
+        ):
+            assert crawler_ssh_proxy is not None
+
         return await self.new_crawl_job(
             cid,
             userid,
@@ -138,6 +159,7 @@ async def create_crawl_job(
             warc_prefix=warc_prefix,
             storage_filename=storage_filename,
             profile_filename=profile_filename,
+            crawler_ssh_proxy=crawler_ssh_proxy,
         )
 
     async def create_qa_crawl_job(
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 45da9fe562..0b6130631e 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -353,6 +353,9 @@ async def add_new_crawl(
         username = user.name
 
         image = self.crawl_configs.get_channel_crawler_image(crawlconfig.crawlerChannel)
+        ssh_proxy = self.crawl_configs.get_crawler_ssh_proxy(
+            crawlconfig.crawlerSSHProxyId
+        )
 
         crawl = Crawl(
             id=crawl_id,
@@ -374,6 +377,8 @@ async def add_new_crawl(
             tags=crawlconfig.tags,
             name=crawlconfig.name,
             crawlerChannel=crawlconfig.crawlerChannel,
+            crawlerSSHProxyId=crawlconfig.crawlerSSHProxyId,
+            sshProxy=ssh_proxy.dict() if ssh_proxy else None,
             image=image,
         )
diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py
index dc9175cc46..ddb3b71af0 100644
--- a/backend/btrixcloud/k8sapi.py
+++ b/backend/btrixcloud/k8sapi.py
@@ -2,8 +2,8 @@
 
 import os
 import traceback
-
 from typing import Optional
+
 import yaml
 
 from kubernetes_asyncio import client, config
@@ -19,6 +19,7 @@
 from fastapi.templating import Jinja2Templates
 
 from .utils import get_templates_dir, dt_now
+from .models import CrawlerSSHProxy
 
 
 # ============================================================================
@@ -93,6 +94,7 @@ def new_crawl_job_yaml(
         storage_filename: str = "",
         profile_filename: str = "",
         qa_source: str = "",
+        crawler_ssh_proxy:Optional[CrawlerSSHProxy] = None,
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -115,6 +117,7 @@ def new_crawl_job_yaml(
             "storage_filename": storage_filename,
             "profile_filename": profile_filename,
             "qa_source": qa_source,
+            "ssh_proxy_id": crawler_ssh_proxy.id if crawler_ssh_proxy else None
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -136,6 +139,7 @@ async def new_crawl_job(
         storage_filename: str = "",
         profile_filename: str = "",
         qa_source: str = "",
+        crawler_ssh_proxy: Optional[CrawlerSSHProxy] = None,
     ) -> str:
         """load and init crawl job via k8s api"""
         crawl_id, data = self.new_crawl_job_yaml(
@@ -153,6 +157,7 @@ async def new_crawl_job(
             storage_filename=storage_filename,
             profile_filename=profile_filename,
             qa_source=qa_source,
+            crawler_ssh_proxy=crawler_ssh_proxy,
         )
 
         # create job directly
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index d161d75d13..935704877b 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -246,6 +246,8 @@ def main() -> None:
 
     crawl_config_ops.set_coll_ops(coll_ops)
 
+    crawl_manager.set_crawlconfig(crawl_config_ops)
+
     # run only in first worker
     if run_once_lock("btrix-init-db"):
         asyncio.create_task(
diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py
index 573c3174b5..43ba8b1860 100644
--- a/backend/btrixcloud/main_op.py
+++ b/backend/btrixcloud/main_op.py
@@ -94,6 +94,8 @@ def main():
 
     background_job_ops.set_ops(crawl_ops, profile_ops)
 
+    crawl_manager.set_crawlconfig(crawl_config_ops)
+
     return init_operator_api(
         app_root,
         crawl_config_ops,
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 99cfadca1e..6b2b53a5c6 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -340,6 +340,7 @@ class CrawlConfigIn(BaseModel):
 
     profileid: Union[UUID, EmptyStr, None] = None
     crawlerChannel: str = "default"
+    crawlerSSHProxyId: Optional[str] = None
 
     autoAddCollections: Optional[List[UUID]] = []
     tags: Optional[List[str]] = []
@@ -363,6 +364,7 @@ class ConfigRevision(BaseMongoModel):
 
     profileid: Optional[UUID] = None
     crawlerChannel: Optional[str] = None
+    crawlerSSHProxy: Optional[str] = None
 
     crawlTimeout: Optional[int] = 0
     maxCrawlSize: Optional[int] = 0
@@ -393,6 +395,7 @@ class CrawlConfigCore(BaseMongoModel):
 
     profileid: Optional[UUID] = None
     crawlerChannel: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -490,6 +493,7 @@ class UpdateCrawlConfig(BaseModel):
     schedule: Optional[str] = None
     profileid: Union[UUID, EmptyStr, None] = None
     crawlerChannel: Optional[str] = None
+    crawlerSSHProxy: Optional[str] = None
     crawlTimeout: Optional[int] = None
     maxCrawlSize: Optional[int] = None
     scale: Scale = 1
@@ -567,6 +571,28 @@ class CrawlerChannels(BaseModel):
     channels: List[CrawlerChannel] = []
 
 
+# ============================================================================
+
+### SSH PROXIES ###
+
+
+class CrawlerSSHProxy(BaseModel):
+    """SSH proxy definition"""
+
+    id: str
+    country_code: str
+    hostname: str
+    port: int
+    username: str
+
+
+# ============================================================================
+class CrawlerSSHProxies(BaseModel):
+    """List of CrawlerSSHProxy instances for API"""
+
+    servers: List[CrawlerSSHProxy] = []
+
+
 # ============================================================================
 
 ### BASE CRAWLS ###
@@ -671,6 +697,8 @@ class CoreCrawlable(BaseModel):
 
     image: Optional[str] = None
 
+    sshProxy: Optional[CrawlerSSHProxy] = None
+
     stats: Optional[CrawlStats] = CrawlStats()
 
     files: List[CrawlFile] = []
@@ -762,6 +790,7 @@ class CrawlOut(BaseMongoModel):
     execMinutesQuotaReached: Optional[bool] = False
 
     crawlerChannel: str = "default"
+    crawlerSSHProxy: Optional[str] = None
     image: Optional[str] = None
 
     reviewStatus: ReviewStatus = None
@@ -1608,6 +1637,7 @@ class Profile(BaseMongoModel):
     baseid: Optional[UUID] = None
 
     crawlerChannel: Optional[str] = None
+    crawlerSSHProxy: Optional[str] = None
 
 
 # ============================================================================
@@ -1630,6 +1660,7 @@ class ProfileLaunchBrowserIn(UrlIn):
 
     profileId: Optional[UUID] = None
     crawlerChannel: str = "default"
+    crawlerSSHProxy: Optional[str]
 
 
 # ============================================================================
@@ -1647,6 +1678,7 @@ class ProfileCreate(BaseModel):
     name: str
     description: Optional[str] = ""
     crawlerChannel: str = "default"
+    crawlerSSHProxy: Optional[str]
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 0327717fb0..d89bf9e64b 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -120,7 +120,7 @@ async def sync_crawls(self, data: MCSyncData):
 
         status = CrawlStatus(**data.parent.get("status", {}))
 
-        spec = data.parent.get("spec", {})
+        spec = data.parent.get("spec", {}) # spec is the data from crawl_job.yaml
         crawl_id = spec["id"]
         cid = spec["cid"]
         oid = spec["oid"]
@@ -142,6 +142,7 @@ async def sync_crawls(self, data: MCSyncData):
             oid=oid,
             storage=StorageRef(spec["storageName"]),
             crawler_channel=spec.get("crawlerChannel"),
+            crawler_ssh_proxy_id=spec.get("crawlerSSHProxyId"),
             scale=spec.get("scale", 1),
             started=data.parent["metadata"]["creationTimestamp"],
             stopping=spec.get("stopping", False),
@@ -278,6 +279,18 @@ async def sync_crawls(self, data: MCSyncData):
 
         params["crawler_image"] = status.crawlerImage
 
+        if crawl.crawler_ssh_proxy_id:
+            status.crawlerSSHProxyId = crawl.crawler_ssh_proxy_id
+            ssh_proxy = self.crawl_config_ops.get_crawler_ssh_proxy(
+                status.crawlerSSHProxyId
+            )
+            assert ssh_proxy is not None
+            params["ssh_proxy_id"] = ssh_proxy.id
+            params["ssh_proxy_hostname"] = ssh_proxy.hostname
+            params["ssh_proxy_username"] = ssh_proxy.username
+            params["ssh_proxy_port"] = ssh_proxy.port if ssh_proxy.port else 22
+
+
         params["storage_filename"] = spec["storage_filename"]
         params["restart_time"] = spec.get("restartTime")
diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py
index 92edaa34a3..7ecf53d7d9 100644
--- a/backend/btrixcloud/operator/models.py
+++ b/backend/btrixcloud/operator/models.py
@@ -79,6 +79,7 @@ class CrawlSpec(BaseModel):
     timeout: int = 0
     max_crawl_size: int = 0
     qa_source_crawl_id: Optional[str] = ""
+    crawler_ssh_proxy_id: Optional[str] = None
 
     @property
     def db_crawl_id(self) -> str:
@@ -198,6 +199,8 @@ class CrawlStatus(BaseModel):
     stopReason: Optional[StopReason] = None
     initRedis: bool = False
     crawlerImage: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
+
     lastActiveTime: str = ""
     podStatus: DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] = (
         defaultdict(lambda: PodInfo())  # pylint: disable=unnecessary-lambda
diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml
index 16c19dbc86..255588eeda 100644
--- a/chart/app-templates/crawl_job.yaml
+++ b/chart/app-templates/crawl_job.yaml
@@ -34,3 +34,6 @@ spec:
 
   storageName: "{{ storage_name }}"
 
+  {% if ssh_proxy_id %}
+  crawlerSSHProxyId: "{{ ssh_proxy_id }}"
+  {% endif %}
diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml
index dead7f3b37..e3ac0e7002 100644
--- a/chart/app-templates/crawler.yaml
+++ b/chart/app-templates/crawler.yaml
@@ -71,7 +71,15 @@ spec:
     - name: crawl-data
       persistentVolumeClaim:
         claimName: {{ name }}
-
+    {% if ssh_proxy_hostname %}
+    - name: proxy-ssh-key
+      secret:
+        secretName: "crawler-ssh-proxy-key-{{ ssh_proxy_id }}"
+        defaultMode: 0600
+    - name: proxy-known-hosts
+      configMap:
+        name: crawler-ssh-proxy-known-hosts
+    {% endif %}
   affinity:
 
 {% if crawler_node_type %}
@@ -129,6 +137,13 @@ spec:
         {% elif profile_filename %}
         - --profile
         - "@{{ profile_filename }}"
+        {% elif ssh_proxy_hostname %}
+        - --sshProxyLogin
+        - "{{ssh_proxy_username }}@{{ ssh_proxy_hostname }}:{{ ssh_proxy_port }}"
+        - --sshProxyPrivateKeyFile
+        - /tmp/ssh-proxy-privatekey
+        - --sshProxyKnownHostsFile
+        - /tmp/ssh-proxy-known_hosts
         {% endif %}
       volumeMounts:
         - name: crawl-config
@@ -141,7 +156,15 @@ spec:
           mountPath: /tmp/qa/
           readOnly: True
         {% endif %}
-
+        {% if ssh_proxy_hostname %}
+        - name: proxy-ssh-key
+          mountPath: /tmp/ssh-proxy-privatekey
+          subPath: ssh-privatekey
+        - name: proxy-known-hosts
+          mountPath: /tmp/ssh-proxy-known_hosts
+          subPath: known_hosts
+          readOnly: true
+        {% endif %}
         - name: crawl-data
           mountPath: /crawls
       envFrom:
@@ -179,12 +202,8 @@ spec:
           value: "{{ warc_prefix }}"
 
         {% if crawler_socks_proxy_host %}
-        - name: SOCKS_HOST
-          value: "{{ crawler_socks_proxy_host }}"
-        {% if crawler_socks_proxy_port %}
-        - name: SOCKS_PORT
-          value: "{{ crawler_socks_proxy_port }}"
-        {% endif %}
+        - name: PROXY_SERVER
+          value: "socks5://{{ crawler_socks_proxy_host }}:{{ crawler_socks_proxy_port }}"
         {% endif %}
 
       resources:
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 3f8e2c6c2d..39748058d4 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -53,6 +53,7 @@ data:
 
   STORAGES_JSON: "/ops-configs/storages.json"
   CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"
+  CRAWLER_SSH_PROXIES_JSON: "/ops-configs/crawler_ssh_proxies.json"
 
   MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"
 
@@ -134,9 +135,9 @@ data:
   crawler_socks_proxy_host: "{{ .Values.crawler_socks_proxy_host }}"
   crawler_socks_proxy_port: "{{ .Values.crawler_socks_proxy_port }}"
 
-  crawler_uid: "{{ .Values.crawler_uid | default 201400007 }}"
-  crawler_gid: "{{ .Values.crawler_gid | default 201400007 }}"
-  crawler_fsgroup: "{{ .Values.crawler_fsgroup | default 201400007 }}"
+  crawler_uid: "{{ .Values.crawler_uid | default 201407 }}"
+  crawler_gid: "{{ .Values.crawler_gid | default 201407 }}"
+  crawler_fsgroup: "{{ .Values.crawler_fsgroup | default 201407 }}"
 
   profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}"
diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml
index 4ee89e9f50..6df02abe9d 100644
--- a/chart/templates/secrets.yaml
+++ b/chart/templates/secrets.yaml
@@ -32,6 +32,7 @@ type: Opaque
 data:
   storages.json: {{ .Values.storages | toJson | b64enc | quote }}
   crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }}
+  crawler_ssh_proxies.json: {{ .Values.crawler_ssh_proxies | toJson | b64enc | quote }}
 
 
 {{- range $storage := .Values.storages }}
diff --git a/chart/templates/sshproxies.yaml b/chart/templates/sshproxies.yaml
new file mode 100644
index 0000000000..1e4dc83d27
--- /dev/null
+++ b/chart/templates/sshproxies.yaml
@@ -0,0 +1,25 @@
+{{- if .Values.crawler_ssh_proxies }}
+{{- $crawler_namespace := .Values.crawler_namespace -}}
+{{- range .Values.crawler_ssh_proxies }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: crawler-ssh-proxy-key-{{ .id }}
+  namespace: {{ $crawler_namespace | quote }}
+type: kubernetes.io/ssh-auth
+data:
+  ssh-privatekey: {{ .private_key | b64enc | quote }}
+{{- end }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: crawler-ssh-proxy-known-hosts
+  namespace: {{ $crawler_namespace | quote }}
+data:
+  known_hosts: |
+    {{- range .Values.crawler_ssh_proxies }}
+    {{ .host_public_key | nindent 4 }}
+    {{- end }}
+{{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
index fbc6ae2bc3..444941e55f 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -301,10 +301,23 @@ crawler_liveness_port: 6065
 # crawler_socks_proxy_host: 192.0.2.1
 # crawler_socks_proxy_port: 9050
 
+# optional: configure a list of ssh servers to be used as a proxy
+crawler_ssh_proxies: []
+# - id: USA # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric
+#   label: "US Proxy" # optional: label to show instead of the id in the dropdown
+#   country_code: US # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
+#   hostname: example.com # IP or hostname of ssh server
+#   port: 22
+#   username: proxy
+#   private_key: secretkey # ssh-key needed to connect to server
+#   host_public_key: | # ssh public keys of the ssh server, use output of `ssh-keyscan $hostname -p $port` for best results
+#     # example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
+#     example.invalid ssh-rsa AAA[..]
+
 # optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
-# crawler_uid: 201400007
-# crawler_gid: 201400007
-# crawler_fsgroup: 201400007
+# crawler_uid: 201407
+# crawler_gid: 201407
+# crawler_fsgroup: 201407
 
 
 # optional: enable/disable crawler network policy
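For reference, patch 01 expects CRAWLER_SSH_PROXIES_JSON to point at the JSON list that
chart/templates/secrets.yaml renders from .Values.crawler_ssh_proxies. A minimal sketch of
that round-trip, assuming only the field names shown above ("port" is optional and defaults
to 22); the sample values and file path are illustrative, not part of the patch:

    import json
    import os

    # Shape of the list parsed in CrawlConfigOps.__init__ above.
    proxies = [
        {
            "id": "us-proxy",              # illustrative id
            "country_code": "US",
            "hostname": "example.invalid",
            "port": 22,
            "username": "proxy",
        }
    ]

    path = "/tmp/crawler_ssh_proxies.json"  # illustrative path
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(proxies, fh)
    os.environ["CRAWLER_SSH_PROXIES_JSON"] = path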
From d96fff445cb7a4793983eb3d1e3f9ab3493cdd67 Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 30 Jul 2024 15:42:57 +0200
Subject: [PATCH 02/50] frontend: add wip ssh proxy selection

---
 frontend/src/components/ui/config-details.ts  |   6 +
 frontend/src/components/ui/index.ts           |   1 +
 .../components/ui/select-crawler-ssh-proxy.ts | 194 ++++++++++++++++++
 frontend/src/pages/org/workflow-editor.ts     |  19 ++
 frontend/src/pages/org/workflows-new.ts       |   1 +
 frontend/src/types/crawler.ts                 |   9 +
 6 files changed, 230 insertions(+)
 create mode 100644 frontend/src/components/ui/select-crawler-ssh-proxy.ts

diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index 298d9ff5ad..1f8d15af52 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -256,6 +256,12 @@ export class ConfigDetails extends LiteElement {
                   ISO6391.getName(crawlConfig.config.lang),
                 )
               : nothing}
+            ${crawlConfig?.crawlerSSHProxyId
+              ? this.renderSetting(
+                  msg("SSH Proxy Server"),
+                  capitalize(crawlConfig.crawlerSSHProxyId),
+                )
+              : nothing}
           </btrix-desc-list>
diff --git a/frontend/src/components/ui/index.ts b/frontend/src/components/ui/index.ts
index 0a4cd1577e..30f0685682 100644
--- a/frontend/src/components/ui/index.ts
+++ b/frontend/src/components/ui/index.ts
@@ -30,6 +30,7 @@ import("./relative-duration");
 import("./search-combobox");
 import("./section-heading");
 import("./select-crawler");
+import("./select-crawler-ssh-proxy");
 import("./tab-group");
 import("./tab-list");
 import("./table");
diff --git a/frontend/src/components/ui/select-crawler-ssh-proxy.ts b/frontend/src/components/ui/select-crawler-ssh-proxy.ts
new file mode 100644
index 0000000000..ddb65d9da5
--- /dev/null
+++ b/frontend/src/components/ui/select-crawler-ssh-proxy.ts
@@ -0,0 +1,194 @@
+import { localized, msg } from "@lit/localize";
+import { type SlSelect } from "@shoelace-style/shoelace";
+import { html } from "lit";
+import { customElement, property, state } from "lit/decorators.js";
+import capitalize from "lodash/fp/capitalize";
+
+import type { crawlerSSHProxy } from "@/pages/org/types";
+import type { AuthState } from "@/utils/AuthService";
+import LiteElement from "@/utils/LiteElement";
+
+type SelectCrawlerSSHProxyChangeDetail = {
+  value: string | undefined;
+};
+
+export type SelectCrawlerSSHProxyChangeEvent =
+  CustomEvent<SelectCrawlerSSHProxyChangeDetail>;
+
+type SelectCrawlerSSHProxyUpdateDetail = {
+  show: boolean;
+};
+
+export type SelectCrawlerSSHProxyUpdateEvent =
+  CustomEvent<SelectCrawlerSSHProxyUpdateDetail>;
+
+type crawlerSSHProxiesAPIResponse = {
+  servers: crawlerSSHProxy[];
+};
+
+/**
+ * Crawler ssh proxy select dropdown
+ *
+ * Usage example:
+ * ```ts
+ * <btrix-select-crawler-ssh-proxy
+ *   on-change=${({ value }) =>
+ *     selectedcrawlerSSHProxy = value}
+ * >
+ * ```
+ *
+ * @event on-change
+ */
+@customElement("btrix-select-crawler-ssh-proxy")
+@localized()
+export class SelectCrawlerSSHProxy extends LiteElement {
+  @property({ type: Object })
+  authState!: AuthState;
+
+  @property({ type: String })
+  orgId!: string;
+
+  @property({ type: String })
+  crawlerSSHProxyId?: string | null;
+
+  @state()
+  private selectedSSHProxy?: crawlerSSHProxy;
+
+  @state()
+  private crawlerSSHProxies?: crawlerSSHProxy[];
+
+  protected firstUpdated() {
+    void this.fetchCrawlerSSHProxies();
+  }
+  // credit: https://dev.to/jorik/country-code-to-flag-emoji-a21
+  private countryCodeToFlagEmoji(countryCode: String): String {
+    return countryCode
+      .toUpperCase()
+      .split("")
+      .map((char) => String.fromCodePoint(char.charCodeAt(0) + 127397))
+      .join("");
+  }
+
+  render() {
+    /*if (this.crawlerSSHProxys && this.crawlerSSHProxys.length < 2) {
+      return html``;
+    }*/
+
+    return html`
+      <sl-select
+        name="crawlerSSHProxyId"
+        label=${msg("SSH Proxy Server")}
+        value=${this.selectedSSHProxy?.id || ""}
+        hoist
+        clearable
+        @sl-change=${this.onChange}
+        @sl-focus=${() => {
+          // Refetch to keep list up to date
+          void this.fetchCrawlerSSHProxies();
+        }}
+        @sl-hide=${this.stopProp}
+        @sl-after-hide=${this.stopProp}
+      >
+        ${this.crawlerSSHProxies?.map(
+          (server) =>
+            html`<sl-option value=${server.id}>
+              ${this.countryCodeToFlagEmoji(server.country_code)}
+              ${capitalize(server.id)}
+            </sl-option>`,
+        )}
+        ${this.selectedSSHProxy
+          ? html`
+ ${msg("Connection:")} + ${this.selectedSSHProxy.username}@${this.selectedSSHProxy + .hostname} +
+            `
+          : ``}
+      </sl-select>
+    `;
+  }
+
+  private onChange(e: Event) {
+    this.stopProp(e);
+
+    this.selectedSSHProxy = this.crawlerSSHProxies?.find(
+      ({ id }) => id === (e.target as SlSelect).value,
+    );
+
+    this.dispatchEvent(
+      new CustomEvent<SelectCrawlerSSHProxyChangeDetail>("on-change", {
+        detail: {
+          value: this.selectedSSHProxy?.id,
+        },
+      }),
+    );
+  }
+
+  /**
+   * Fetch crawler ssh proxies and update internal state
+   */
+  private async fetchCrawlerSSHProxies(): Promise<void> {
+    try {
+      const servers = await this.getCrawlerSSHProxies();
+      this.crawlerSSHProxies = servers;
+
+      if (this.crawlerSSHProxyId && !this.selectedSSHProxy?.id) {
+        this.selectedSSHProxy = this.crawlerSSHProxies.find(
+          ({ id }) => id === this.crawlerSSHProxyId,
+        );
+      }
+
+      if (!this.selectedSSHProxy) {
+        this.crawlerSSHProxyId = null;
+        this.dispatchEvent(
+          new CustomEvent<SelectCrawlerSSHProxyChangeDetail>("on-change", {
+            detail: {
+              value: null,
+            },
+          }),
+        );
+        this.selectedSSHProxy = this.crawlerSSHProxies.find(
+          ({ id }) => id === this.crawlerSSHProxyId,
+        );
+      }
+
+      this.dispatchEvent(
+        new CustomEvent<SelectCrawlerSSHProxyUpdateDetail>("on-update", {
+          detail: {
+            show: this.crawlerSSHProxies.length > 1,
+          },
+        }),
+      );
+    } catch (e) {
+      this.notify({
+        message: msg("Sorry, couldn't retrieve ssh proxies at this time."),
+        variant: "danger",
+        icon: "exclamation-octagon",
+      });
+    }
+  }
+
+  private async getCrawlerSSHProxies(): Promise<crawlerSSHProxy[]> {
+    const data: crawlerSSHProxiesAPIResponse =
+      await this.apiFetch<crawlerSSHProxiesAPIResponse>(
+        `/orgs/${this.orgId}/crawlconfigs/crawler-ssh-proxies`,
+        this.authState!,
+      );
+
+    return data.servers;
+  }
+
+  /**
+   * Stop propagation of sl-select events.
+   * Prevents bug where sl-dialog closes when dropdown closes
+   * https://github.com/shoelace-style/shoelace/issues/170
+   */
+  private stopProp(e: Event) {
+    e.stopPropagation();
+  }
+}
diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts
index 21e8f18851..25b4ca75d5 100644
--- a/frontend/src/pages/org/workflow-editor.ts
+++ b/frontend/src/pages/org/workflow-editor.ts
@@ -42,6 +42,7 @@ import type {
   SelectCrawlerChangeEvent,
   SelectCrawlerUpdateEvent,
 } from "@/components/ui/select-crawler";
+import type { SelectCrawlerSSHProxyChangeEvent } from "@/components/ui/select-crawler-ssh-proxy";
 import type { Tab } from "@/components/ui/tab-list";
 import type {
   TagInputEvent,
@@ -134,6 +135,7 @@ type FormState = {
   autoscrollBehavior: boolean;
   userAgent: string | null;
   crawlerChannel: string;
+  crawlerSSHProxyId: string | null;
 };
 
 const getDefaultProgressState = (hasConfigId = false): ProgressState => {
@@ -211,6 +213,7 @@ const getDefaultFormState = (): FormState => ({
   autoscrollBehavior: true,
   userAgent: null,
   crawlerChannel: "default",
+  crawlerSSHProxyId: null,
 });
 
 function getLocalizedWeekDays() {
@@ -616,6 +619,9 @@ export class CrawlConfigEditor extends LiteElement {
           this.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
         crawlerChannel:
           this.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
+        crawlerSSHProxyId:
+          this.initialWorkflow.crawlerSSHProxyId ||
+          defaultFormState.crawlerSSHProxyId,
         ...formState,
       };
     }
@@ -1755,6 +1761,18 @@ https://archiveweb.page/images/${"logo.svg"}`}
             msg(`Websites that observe the browser’s language setting may serve
             content in that language if available.`),
           )}
+        ${this.renderFormCol(html`
+          <btrix-select-crawler-ssh-proxy
+            orgId=${this.orgId}
+            .authState=${this.authState}
+            .crawlerSSHProxyId=${this.formState.crawlerSSHProxyId}
+            @on-change=${(e: SelectCrawlerSSHProxyChangeEvent) =>
+              this.updateFormState({
+                crawlerSSHProxyId: e.detail.value,
+              })}
+          ></btrix-select-crawler-ssh-proxy>
+        `)}
+        ${this.renderHelpTextCol(msg(`Choose a Browsertrix SSH Proxy`))}
@@ -2476,6 +2494,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
         ).join(","),
       },
       crawlerChannel: this.formState.crawlerChannel || "default",
+      crawlerSSHProxyId: this.formState.crawlerSSHProxyId,
     };
 
     return config;
diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts
index ddf307c8df..7213b803af 100644
--- a/frontend/src/pages/org/workflows-new.ts
+++ b/frontend/src/pages/org/workflows-new.ts
@@ -36,6 +36,7 @@ const defaultValue = {
   scale: 1,
   autoAddCollections: [],
   crawlerChannel: "default",
+  crawlerSSHProxyId: null,
 } as WorkflowParams;
 
 /**
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index dcabc8a674..60aa6ead4a 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -51,6 +51,7 @@ export type WorkflowParams = {
   description: string | null;
   autoAddCollections: string[];
   crawlerChannel: string;
+  crawlerSSHProxyId: string | null;
 };
 
 export type CrawlConfig = WorkflowParams & {
@@ -212,6 +213,14 @@ export type CrawlerChannel = {
   image: string;
 };
 
+export type crawlerSSHProxy = {
+  id: string;
+  country_code: string;
+  hostname: string;
+  port: number;
+  username: string;
+};
+
 export type ArchivedItem = Crawl | Upload;
 
 export type ArchivedItemPageComment = {
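The countryCodeToFlagEmoji helper in the new component maps each ASCII letter of a country
code to its Unicode regional indicator symbol; a pair of indicators renders as a flag. The
same arithmetic in Python, as a quick check of the 127397 offset (0x1F1E6 - ord("A")):

    def country_code_to_flag_emoji(country_code: str) -> str:
        # "A" (65) + 127397 = 0x1F1E6 (REGIONAL INDICATOR SYMBOL LETTER A),
        # so "US" becomes U+1F1FA U+1F1F8, which renders as the US flag.
        return "".join(chr(ord(char) + 127397) for char in country_code.upper())

    assert country_code_to_flag_emoji("us") == "\U0001f1fa\U0001f1f8"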
From 2d3e9ef23ac6b70907aa024b5c957c0905f12117 Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 30 Jul 2024 15:43:51 +0200
Subject: [PATCH 03/50] scripts: add minikube utilities

---
 scripts/minikube-build-and-deploy.sh | 16 ++++++++++++++++
 scripts/minikube-reset.sh            | 14 ++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 scripts/minikube-build-and-deploy.sh
 create mode 100644 scripts/minikube-reset.sh

diff --git a/scripts/minikube-build-and-deploy.sh b/scripts/minikube-build-and-deploy.sh
new file mode 100644
index 0000000000..259b05c6e0
--- /dev/null
+++ b/scripts/minikube-build-and-deploy.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+CURR=$(dirname "${BASH_SOURCE[0]}")
+
+eval $(minikube docker-env)
+for img in backend frontend;
+do
+  sh "${CURR}/build-${img}.sh"
+done
+
+echo "Deploying helm chart..."
+helm upgrade --wait --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/
+
+until kubectl port-forward service/browsertrix-cloud-frontend 8000:80; do
+  echo "Unable to forward service/browsertrix-cloud-frontend. Retrying.." >&2
+  sleep 1
+done
diff --git a/scripts/minikube-reset.sh b/scripts/minikube-reset.sh
new file mode 100644
index 0000000000..0bad515fa7
--- /dev/null
+++ b/scripts/minikube-reset.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+if [ "$(minikube status | grep -o Running | wc -l)" -lt 3 ]; then
+  echo "Error: Less than 3 components are running in Minikube"
+  exit 1
+fi
+
+if kubectl config get-contexts | grep -q minikube; then
+  kubectl config use-context minikube
+  # ~~~ DANGER ZONE ~~~
+  echo "Uninstalling helm deployment and deleting pvcs"
+  helm uninstall btrix
+  minikube kubectl delete pvc minio-storage-pvc
+  minikube kubectl delete pvc data-db-local-mongo-0
+fi
From fca588695bfc115ab90a63c0cf16155be24b61ec Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 30 Jul 2024 17:05:30 +0200
Subject: [PATCH 04/50] ssh proxy: fix changing proxy in workflow editor

---
 backend/btrixcloud/crawlconfigs.py            |  2 +-
 backend/btrixcloud/crawls.py                  |  4 ++--
 backend/btrixcloud/models.py                  | 14 +++++++-------
 .../src/components/ui/select-crawler-ssh-proxy.ts |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 706898d7b0..e866bd6196 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -356,7 +356,7 @@ async def update_crawl_config(
         )
 
         changed = changed or (
-            self.check_attr_changed(orig_crawl_config, update, "crawlerSSHProxy")
+            orig_crawl_config.crawlerSSHProxyId != update.crawlerSSHProxyId
         )
 
         metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 0b6130631e..f341e148c0 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -353,7 +353,7 @@ async def add_new_crawl(
         username = user.name
 
         image = self.crawl_configs.get_channel_crawler_image(crawlconfig.crawlerChannel)
-        ssh_proxy = self.crawl_configs.get_crawler_ssh_proxy(
+        crawler_ssh_proxy = self.crawl_configs.get_crawler_ssh_proxy(
             crawlconfig.crawlerSSHProxyId
         )
 
@@ -378,7 +378,7 @@ async def add_new_crawl(
             name=crawlconfig.name,
             crawlerChannel=crawlconfig.crawlerChannel,
             crawlerSSHProxyId=crawlconfig.crawlerSSHProxyId,
-            sshProxy=ssh_proxy.dict() if ssh_proxy else None,
+            crawlerSSHProxy=crawler_ssh_proxy.dict() if crawler_ssh_proxy else None,
             image=image,
         )
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 6b2b53a5c6..c79d23e5bb 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -364,7 +364,7 @@ class ConfigRevision(BaseMongoModel):
 
     profileid: Optional[UUID] = None
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxy: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
 
     crawlTimeout: Optional[int] = 0
     maxCrawlSize: Optional[int] = 0
@@ -493,7 +493,7 @@ class UpdateCrawlConfig(BaseModel):
     schedule: Optional[str] = None
     profileid: Union[UUID, EmptyStr, None] = None
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxy: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
     crawlTimeout: Optional[int] = None
     maxCrawlSize: Optional[int] = None
     scale: Scale = 1
@@ -697,7 +697,7 @@ class CoreCrawlable(BaseModel):
 
     image: Optional[str] = None
 
-    sshProxy: Optional[CrawlerSSHProxy] = None
+    crawlerSSHProxy: Optional[CrawlerSSHProxy] = None
 
     stats: Optional[CrawlStats] = CrawlStats()
 
@@ -790,7 +790,7 @@ class CrawlOut(BaseMongoModel):
     execMinutesQuotaReached: Optional[bool] = False
 
     crawlerChannel: str = "default"
-    crawlerSSHProxy: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
     image: Optional[str] = None
 
     reviewStatus: ReviewStatus = None
@@ -1637,7 +1637,7 @@ class Profile(BaseMongoModel):
     baseid: Optional[UUID] = None
 
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxy: Optional[str] = None
+    crawlerSSHProxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -1660,7 +1660,7 @@ class ProfileLaunchBrowserIn(UrlIn):
 
     profileId: Optional[UUID] = None
     crawlerChannel: str = "default"
-    crawlerSSHProxy: Optional[str]
+    crawlerSSHProxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -1678,7 +1678,7 @@ class ProfileCreate(BaseModel):
     name: str
     description: Optional[str] = ""
     crawlerChannel: str = "default"
-    crawlerSSHProxy: Optional[str]
+    crawlerSSHProxyId: Optional[str] = None
 
 
 # ============================================================================
diff --git a/frontend/src/components/ui/select-crawler-ssh-proxy.ts b/frontend/src/components/ui/select-crawler-ssh-proxy.ts
index ddb65d9da5..d9b95fac6e 100644
--- a/frontend/src/components/ui/select-crawler-ssh-proxy.ts
+++ b/frontend/src/components/ui/select-crawler-ssh-proxy.ts
@@ -9,7 +9,7 @@ import type { AuthState } from "@/utils/AuthService";
 import LiteElement from "@/utils/LiteElement";
 
 type SelectCrawlerSSHProxyChangeDetail = {
-  value: string | undefined;
+  value: string | null;
 };
 
 export type SelectCrawlerSSHProxyChangeEvent =
@@ -50,7 +50,7 @@ export class SelectCrawlerSSHProxy extends LiteElement {
   orgId!: string;
 
   @property({ type: String })
-  crawlerSSHProxyId?: string | null;
+  crawlerSSHProxyId: string | null = null;
 
   @state()
   private selectedSSHProxy?: crawlerSSHProxy;
@@ -123,7 +123,7 @@ export class SelectCrawlerSSHProxy extends LiteElement {
     this.dispatchEvent(
       new CustomEvent<SelectCrawlerSSHProxyChangeDetail>("on-change", {
         detail: {
-          value: this.selectedSSHProxy?.id,
+          value: this.selectedSSHProxy ? this.selectedSSHProxy.id : null,
         },
       }),
     );

From 25b813c8e6e65eceb80e1c4487715a6f8205005b Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 30 Jul 2024 17:08:19 +0200
Subject: [PATCH 05/50] formatting

---
 backend/btrixcloud/crawlconfigs.py    | 12 +++---------
 backend/btrixcloud/k8sapi.py          |  4 ++--
 backend/btrixcloud/operator/crawls.py |  3 +--
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index e866bd6196..58c368f7ed 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -126,9 +126,7 @@ def __init__(
         self.crawler_channels = CrawlerChannels(channels=channels)
 
         self.crawler_ssh_proxies_map = {}
-        with open(
-            os.environ["CRAWLER_SSH_PROXIES_JSON"], encoding="utf-8"
-        ) as fh:
+        with open(os.environ["CRAWLER_SSH_PROXIES_JSON"], encoding="utf-8") as fh:
             ssh_proxy_list: list[dict] = json.loads(fh.read())
             for ssh_proxy_data in ssh_proxy_list:
                 ssh_proxy = CrawlerSSHProxy(
@@ -139,9 +137,7 @@ def __init__(
                     username=ssh_proxy_data["username"],
                 )
 
-                self.crawler_ssh_proxies_map[ssh_proxy.id] = (
-                    ssh_proxy
-                )
+                self.crawler_ssh_proxies_map[ssh_proxy.id] = ssh_proxy
 
         self.crawler_ssh_proxies = CrawlerSSHProxies(
             servers=list(self.crawler_ssh_proxies_map.values())
@@ -929,9 +925,7 @@ def get_channel_crawler_image(
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
-    def get_crawler_ssh_proxy(
-        self, ssh_proxy_id:str
-    ) -> Optional[CrawlerSSHProxy]:
+    def get_crawler_ssh_proxy(self, ssh_proxy_id: str) -> Optional[CrawlerSSHProxy]:
         """Get crawlerSSHProxy by id"""
         return self.crawler_ssh_proxies_map.get(ssh_proxy_id)
 
diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py
index ddb3b71af0..112853db33 100644
--- a/backend/btrixcloud/k8sapi.py
+++ b/backend/btrixcloud/k8sapi.py
@@ -94,7 +94,7 @@ def new_crawl_job_yaml(
         storage_filename: str = "",
         profile_filename: str = "",
         qa_source: str = "",
-        crawler_ssh_proxy:Optional[CrawlerSSHProxy] = None,
+        crawler_ssh_proxy: Optional[CrawlerSSHProxy] = None,
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -117,7 +117,7 @@ def new_crawl_job_yaml(
             "storage_filename": storage_filename,
             "profile_filename": profile_filename,
             "qa_source": qa_source,
-            "ssh_proxy_id": crawler_ssh_proxy.id if crawler_ssh_proxy else None
+            "ssh_proxy_id": crawler_ssh_proxy.id if crawler_ssh_proxy else None,
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index d89bf9e64b..2ec69f541f 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -120,7 +120,7 @@ async def sync_crawls(self, data: MCSyncData):
 
         status = CrawlStatus(**data.parent.get("status", {}))
 
-        spec = data.parent.get("spec", {}) # spec is the data from crawl_job.yaml
+        spec = data.parent.get("spec", {})  # spec is the data from crawl_job.yaml
         crawl_id = spec["id"]
         cid = spec["cid"]
         oid = spec["oid"]
@@ -290,7 +290,6 @@ async def sync_crawls(self, data: MCSyncData):
             params["ssh_proxy_username"] = ssh_proxy.username
             params["ssh_proxy_port"] = ssh_proxy.port if ssh_proxy.port else 22
 
-
         params["storage_filename"] = spec["storage_filename"]
         params["restart_time"] = spec.get("restartTime")
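Patch 06 below collapses the separate hostname/port/username fields into a single 'auth'
login string of the form user@host[:port], which is passed straight to the crawler's
--sshProxyLogin flag. A sketch of how such a string decomposes, for illustration only
(parse_auth is a hypothetical helper, not part of the patch):

    def parse_auth(auth: str) -> tuple[str, str, int]:
        # Split "user@host[:port]"; the port falls back to SSH's default 22.
        username, _, host_port = auth.partition("@")
        hostname, _, port = host_port.partition(":")
        return username, hostname, int(port) if port else 22

    assert parse_auth("proxy@example.invalid:2222") == ("proxy", "example.invalid", 2222)
    assert parse_auth("proxy@example.invalid") == ("proxy", "example.invalid", 22)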
From 80542df6050cfca281b430fddcceded87e452a28 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 30 Jul 2024 19:18:34 -0700
Subject: [PATCH 06/50] cleanup: various renaming / simplifications, remove
 'ssh' from names, just use proxies

just pass proxyId to CrawlJob, lookup in operator
share single secret for all proxy configs, containing private keys and
public host keys
use single 'auth' field for 'user@host[:port]'
---
 backend/btrixcloud/crawlconfigs.py            | 53 +++++-----
 backend/btrixcloud/crawlmanager.py            | 25 +----
 backend/btrixcloud/crawls.py                  |  6 +-
 backend/btrixcloud/k8sapi.py                  |  9 +-
 backend/btrixcloud/main.py                    |  2 --
 backend/btrixcloud/main_op.py                 |  2 --
 backend/btrixcloud/models.py                  | 35 +++----
 backend/btrixcloud/operator/crawls.py         | 18 ++--
 backend/btrixcloud/operator/models.py         |  3 +-
 chart/app-templates/crawl_job.yaml            |  4 +-
 chart/app-templates/crawler.yaml              | 42 ++++----
 chart/templates/configmap.yaml                |  2 +-
 chart/templates/proxies.yaml                  | 18 ++++
 chart/templates/secrets.yaml                  |  2 +-
 chart/templates/sshproxies.yaml               | 25 -----
 chart/values.yaml                             |  8 +-
 frontend/src/components/ui/config-details.ts  |  4 +-
 frontend/src/components/ui/index.ts           |  2 +-
 ...r-ssh-proxy.ts => select-crawler-proxy.ts} | 99 +++++++++----------
 frontend/src/pages/org/workflow-editor.ts     | 24 +++--
 frontend/src/pages/org/workflows-new.ts       |  2 +-
 frontend/src/types/crawler.ts                 |  9 +-
 22 files changed, 166 insertions(+), 228 deletions(-)
 create mode 100644 chart/templates/proxies.yaml
 delete mode 100644 chart/templates/sshproxies.yaml
 rename frontend/src/components/ui/{select-crawler-ssh-proxy.ts => select-crawler-proxy.ts} (50%)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 58c368f7ed..54f882d22b 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -40,8 +40,8 @@
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
     CrawlConfigDeletedResponse,
-    CrawlerSSHProxy,
-    CrawlerSSHProxies,
+    CrawlerProxy,
+    CrawlerProxies,
 )
 from .utils import dt_now, slug_from_name
@@ -125,26 +125,27 @@ def __init__(
 
         self.crawler_channels = CrawlerChannels(channels=channels)
 
-        self.crawler_ssh_proxies_map = {}
-        with open(os.environ["CRAWLER_SSH_PROXIES_JSON"], encoding="utf-8") as fh:
-            ssh_proxy_list: list[dict] = json.loads(fh.read())
-            for ssh_proxy_data in ssh_proxy_list:
-                ssh_proxy = CrawlerSSHProxy(
-                    id=ssh_proxy_data["id"],
-                    country_code=ssh_proxy_data["country_code"],
-                    hostname=ssh_proxy_data["hostname"],
-                    port=ssh_proxy_data.get("port", 22),
-                    username=ssh_proxy_data["username"],
+        if "default" not in self.crawler_images_map:
+            raise TypeError("The channel list must include a 'default' channel")
+
+        self.crawler_proxies_map = {}
+        with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh:
+            proxy_list = json.loads(fh.read())
+            for proxy_data in proxy_list:
+                proxy = CrawlerProxy(
+                    id=proxy_data["id"],
+                    label=proxy_data["label"],
+                    country_code=proxy_data["country_code"],
+                    auth=proxy_data["auth"],
                 )
 
-                self.crawler_ssh_proxies_map[ssh_proxy.id] = ssh_proxy
+                self.crawler_proxies_map[proxy.id] = proxy
 
-        self.crawler_ssh_proxies = CrawlerSSHProxies(
-            servers=list(self.crawler_ssh_proxies_map.values())
+        self.crawler_proxies = CrawlerProxies(
+            servers=list(self.crawler_proxies_map.values())
         )
 
-        if "default" not in self.crawler_images_map:
-            raise TypeError("The channel list must include a 'default' channel")
+        print(self.crawler_proxies)
 
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
@@ -239,7 +240,7 @@ async def add_crawl_config(
             profileid=profileid,
             crawlerChannel=config_in.crawlerChannel,
             crawlFilenameTemplate=config_in.crawlFilenameTemplate,
-            crawlerSSHProxyId=config_in.crawlerSSHProxyId,
+            proxyId=config_in.proxyId,
         )
 
         if config_in.runNow:
@@ -351,9 +352,7 @@ async def update_crawl_config(
             and ((not update.profileid) != (not orig_crawl_config.profileid))
         )
 
-        changed = changed or (
-            orig_crawl_config.crawlerSSHProxyId != update.crawlerSSHProxyId
-        )
+        changed = changed or (orig_crawl_config.proxyId != update.proxyId)
 
         metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
         metadata_changed = metadata_changed or self.check_attr_changed(
@@ -925,9 +924,9 @@ def get_channel_crawler_image(
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
-    def get_crawler_ssh_proxy(self, ssh_proxy_id: str) -> Optional[CrawlerSSHProxy]:
-        """Get crawlerSSHProxy by id"""
-        return self.crawler_ssh_proxies_map.get(ssh_proxy_id)
+    def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]:
+        """Get crawlerProxy by id"""
+        return self.crawler_proxies_map.get(proxy_id)
 
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
         """Generate WARC prefix slug from org slug, name or url
@@ -1098,12 +1098,12 @@ async def get_crawler_channels(
     ):
         return ops.crawler_channels
 
-    @router.get("/crawler-ssh-proxies", response_model=CrawlerSSHProxies)
-    async def get_crawler_ssh_proxies(
+    @router.get("/crawler-proxies", response_model=CrawlerProxies)
+    async def get_crawler_proxies(
         # pylint: disable=unused-argument
         org: Organization = Depends(org_crawl_dep),
     ):
-        return ops.crawler_ssh_proxies
+        return ops.crawler_proxies
 
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index 926876714b..5c0c415ec8 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -4,7 +4,7 @@
 import asyncio
 import secrets
 
-from typing import Optional, Dict, TYPE_CHECKING, cast
+from typing import Optional, Dict
 from datetime import timedelta
 
 from fastapi import HTTPException
@@ -14,27 +14,15 @@
 
 from .models import StorageRef, CrawlConfig, BgJobType
 
-if TYPE_CHECKING:
-    from .crawlconfigs import CrawlConfigOps
-else:
-    CrawlConfigOps = object
-
 
 # ============================================================================
 class CrawlManager(K8sAPI):
     """abstract crawl manager"""
 
-    crawlconfigs: CrawlConfigOps
-
     def __init__(self):
         super().__init__()
 
         self.loop = asyncio.get_running_loop()
 
-        self.crawlconfigs = cast(CrawlConfigOps, None)
-
-    def set_crawlconfig(self, crawlconfigs):
-        """set crawlconfig ops"""
-        self.crawlconfigs = crawlconfigs
-
     # pylint: disable=too-many-arguments
     async def run_profile_browser(
@@ -137,15 +125,6 @@ async def create_crawl_job(
 
         await self.has_storage_secret(storage_secret)
 
-        crawler_ssh_proxy = self.crawlconfigs.get_crawler_ssh_proxy(
-            crawlconfig.crawlerSSHProxyId
-        )
-        if (
-            crawlconfig.crawlerSSHProxyId is not None
-            and len(crawlconfig.crawlerSSHProxyId) > 0
-        ):
-            assert crawler_ssh_proxy is not None
-
         return await self.new_crawl_job(
             cid,
             userid,
@@ -159,7 +138,7 @@ async def create_crawl_job(
             warc_prefix=warc_prefix,
             storage_filename=storage_filename,
             profile_filename=profile_filename,
-            crawler_ssh_proxy=crawler_ssh_proxy,
+            proxy_id=crawlconfig.proxyId,
         )
 
     async def create_qa_crawl_job(
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index f341e148c0..5fd4dbc384 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -353,9 +353,6 @@ async def add_new_crawl(
         username = user.name
 
         image = self.crawl_configs.get_channel_crawler_image(crawlconfig.crawlerChannel)
-        crawler_ssh_proxy = self.crawl_configs.get_crawler_ssh_proxy(
-            crawlconfig.crawlerSSHProxyId
-        )
 
         crawl = Crawl(
             id=crawl_id,
@@ -377,8 +374,7 @@ async def add_new_crawl(
             tags=crawlconfig.tags,
             name=crawlconfig.name,
             crawlerChannel=crawlconfig.crawlerChannel,
-            crawlerSSHProxyId=crawlconfig.crawlerSSHProxyId,
-            crawlerSSHProxy=crawler_ssh_proxy.dict() if crawler_ssh_proxy else None,
+            proxyId=crawlconfig.proxyId,
             image=image,
         )
diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py
index 112853db33..73beed6bfc 100644
--- a/backend/btrixcloud/k8sapi.py
+++ b/backend/btrixcloud/k8sapi.py
@@ -19,7 +19,6 @@
 from fastapi.templating import Jinja2Templates
 
 from .utils import get_templates_dir, dt_now
-from .models import CrawlerSSHProxy
 
 
 # ============================================================================
@@ -93,7 +93,7 @@ def new_crawl_job_yaml(
         storage_filename: str = "",
         profile_filename: str = "",
         qa_source: str = "",
-        crawler_ssh_proxy: Optional[CrawlerSSHProxy] = None,
+        proxy_id: Optional[str] = None,
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -116,7 +116,7 @@ def new_crawl_job_yaml(
             "storage_filename": storage_filename,
             "profile_filename": profile_filename,
             "qa_source": qa_source,
-            "ssh_proxy_id": crawler_ssh_proxy.id if crawler_ssh_proxy else None,
+            "proxy_id": proxy_id,
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -138,7 +138,7 @@ async def new_crawl_job(
         storage_filename: str = "",
         profile_filename: str = "",
         qa_source: str = "",
-        crawler_ssh_proxy: Optional[CrawlerSSHProxy] = None,
+        proxy_id: Optional[str] = None,
     ) -> str:
         """load and init crawl job via k8s api"""
         crawl_id, data = self.new_crawl_job_yaml(
@@ -156,7 +156,7 @@ async def new_crawl_job(
             storage_filename=storage_filename,
             profile_filename=profile_filename,
             qa_source=qa_source,
-            crawler_ssh_proxy=crawler_ssh_proxy,
+            proxy_id=proxy_id,
         )
 
         # create job directly
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 935704877b..d161d75d13 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -246,8 +246,6 @@ def main() -> None:
 
     crawl_config_ops.set_coll_ops(coll_ops)
 
-    crawl_manager.set_crawlconfig(crawl_config_ops)
-
     # run only in first worker
     if run_once_lock("btrix-init-db"):
         asyncio.create_task(
diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py
index 43ba8b1860..573c3174b5 100644
--- a/backend/btrixcloud/main_op.py
+++ b/backend/btrixcloud/main_op.py
@@ -94,8 +94,6 @@ def main():
 
     background_job_ops.set_ops(crawl_ops, profile_ops)
 
-    crawl_manager.set_crawlconfig(crawl_config_ops)
-
     return init_operator_api(
         app_root,
         crawl_config_ops,
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index c79d23e5bb..84dcbca9f2 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -340,7 +340,7 @@ class CrawlConfigIn(BaseModel):
 
     profileid: Union[UUID, EmptyStr, None] = None
     crawlerChannel: str = "default"
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
     autoAddCollections: Optional[List[UUID]] = []
     tags: Optional[List[str]] = []
@@ -364,7 +364,7 @@ class ConfigRevision(BaseMongoModel):
 
     profileid: Optional[UUID] = None
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
     crawlTimeout: Optional[int] = 0
     maxCrawlSize: Optional[int] = 0
@@ -395,7 +395,7 @@ class CrawlConfigCore(BaseMongoModel):
 
     profileid: Optional[UUID] = None
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -493,7 +493,7 @@ class UpdateCrawlConfig(BaseModel):
     schedule: Optional[str] = None
     profileid: Union[UUID, EmptyStr, None] = None
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
     crawlTimeout: Optional[int] = None
     maxCrawlSize: Optional[int] = None
     scale: Scale = 1
@@ -573,24 +573,23 @@ class CrawlerChannels(BaseModel):
 
 
 # ============================================================================
-### SSH PROXIES ###
+### PROXIES ###
 
 
-class CrawlerSSHProxy(BaseModel):
-    """SSH proxy definition"""
+class CrawlerProxy(BaseModel):
+    """proxy definition"""
 
     id: str
+    label: str
     country_code: str
-    hostname: str
-    port: int
-    username: str
+    auth: str
 
 
 # ============================================================================
-class CrawlerSSHProxies(BaseModel):
-    """List of CrawlerSSHProxy instances for API"""
+class CrawlerProxies(BaseModel):
+    """List of CrawlerProxy instances for API"""
 
-    servers: List[CrawlerSSHProxy] = []
+    servers: List[CrawlerProxy] = []
 
 
 # ============================================================================
@@ -695,8 +694,6 @@ class CoreCrawlable(BaseModel):
 
     image: Optional[str] = None
 
-    crawlerSSHProxy: Optional[CrawlerSSHProxy] = None
-
     stats: Optional[CrawlStats] = CrawlStats()
 
     files: List[CrawlFile] = []
@@ -788,7 +785,7 @@ class CrawlOut(BaseMongoModel):
     execMinutesQuotaReached: Optional[bool] = False
 
     crawlerChannel: str = "default"
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
     image: Optional[str] = None
 
     reviewStatus: ReviewStatus = None
@@ -1634,7 +1631,7 @@ class Profile(BaseMongoModel):
     baseid: Optional[UUID] = None
 
     crawlerChannel: Optional[str] = None
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -1657,7 +1654,7 @@ class ProfileLaunchBrowserIn(UrlIn):
 
     profileId: Optional[UUID] = None
     crawlerChannel: str = "default"
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
 
 # ============================================================================
@@ -1675,7 +1672,7 @@ class ProfileCreate(BaseModel):
     name: str
     description: Optional[str] = ""
     crawlerChannel: str = "default"
-    crawlerSSHProxyId: Optional[str] = None
+    proxyId: Optional[str] = None
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index b4af3996cf..da5b4fb21d 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -142,7 +142,7 @@ async def sync_crawls(self, data: MCSyncData):
             oid=oid,
             storage=StorageRef(spec["storageName"]),
             crawler_channel=spec.get("crawlerChannel"),
-            crawler_ssh_proxy_id=spec.get("crawlerSSHProxyId"),
+            proxy_id=spec.get("proxyId"),
             scale=spec.get("scale", 1),
             started=data.parent["metadata"]["creationTimestamp"],
             stopping=spec.get("stopping", False),
@@ -279,16 +279,12 @@ async def sync_crawls(self, data: MCSyncData):
 
         params["crawler_image"] = status.crawlerImage
 
-        if crawl.crawler_ssh_proxy_id:
-            status.crawlerSSHProxyId = crawl.crawler_ssh_proxy_id
-            ssh_proxy = self.crawl_config_ops.get_crawler_ssh_proxy(
-                status.crawlerSSHProxyId
-            )
-            assert ssh_proxy is not None
params["ssh_proxy_id"] = ssh_proxy.id - params["ssh_proxy_hostname"] = ssh_proxy.hostname - params["ssh_proxy_username"] = ssh_proxy.username - params["ssh_proxy_port"] = ssh_proxy.port if ssh_proxy.port else 22 + if crawl.proxy_id: + proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id) + params["proxy_id"] = crawl.proxy_id + params["ssh_proxy_auth"] = proxy.auth if proxy else "" + else: + params["proxy_id"] = None params["storage_filename"] = spec["storage_filename"] params["restart_time"] = spec.get("restartTime") diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 7ecf53d7d9..78bf322a85 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -79,7 +79,7 @@ class CrawlSpec(BaseModel): timeout: int = 0 max_crawl_size: int = 0 qa_source_crawl_id: Optional[str] = "" - crawler_ssh_proxy_id: Optional[str] = None + proxy_id: Optional[str] = None @property def db_crawl_id(self) -> str: @@ -199,7 +199,6 @@ class CrawlStatus(BaseModel): stopReason: Optional[StopReason] = None initRedis: bool = False crawlerImage: Optional[str] = None - crawlerSSHProxyId: Optional[str] = None lastActiveTime: str = "" podStatus: DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] = ( diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 255588eeda..002372c65d 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -34,6 +34,4 @@ spec: storageName: "{{ storage_name }}" - {% if ssh_proxy_id %} - crawlerSSHProxyId: "{{ ssh_proxy_id }}" - {% endif %} + proxyId: "{{ proxy_id }}" diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index e3ac0e7002..d2df3331a2 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -52,8 +52,8 @@ spec: securityContext: runAsNonRoot: true - runAsUser: {{ crawler_uid}} - runAsGroup: {{ crawler_gid}} + runAsUser: {{ crawler_uid }} + runAsGroup: {{ crawler_gid }} fsGroup: {{ crawler_fsgroup }} allowPrivilegeEscalation: false readOnlyRootFilesystem: true @@ -71,14 +71,11 @@ spec: - name: crawl-data persistentVolumeClaim: claimName: {{ name }} - {% if ssh_proxy_hostname %} - - name: proxy-ssh-key + {% if proxy_id %} + - name: proxy-ssh-keys secret: - secretName: "crawler-ssh-proxy-key-{{ ssh_proxy_id }}" + secretName: proxy-ssh-keys defaultMode: 0600 - - name: proxy-known-hosts - configMap: - name: crawler-ssh-proxy-known-hosts {% endif %} affinity: @@ -137,13 +134,14 @@ spec: {% elif profile_filename %} - --profile - "@{{ profile_filename }}" - {% elif ssh_proxy_hostname %} + {% endif %} + {% if proxy_id %} - --sshProxyLogin - - "{{ssh_proxy_username }}@{{ ssh_proxy_hostname }}:{{ ssh_proxy_port }}" + - "{{ ssh_proxy_auth }}" - --sshProxyPrivateKeyFile - - /tmp/ssh-proxy-privatekey + - /tmp/ssh-proxy/private-key - --sshProxyKnownHostsFile - - /tmp/ssh-proxy-known_hosts + - /tmp/ssh-proxy/known-hosts {% endif %} volumeMounts: - name: crawl-config @@ -156,13 +154,14 @@ spec: mountPath: /tmp/qa/ readOnly: True {% endif %} - {% if ssh_proxy_hostname %} - - name: proxy-ssh-key - mountPath: /tmp/ssh-proxy-privatekey - subPath: ssh-privatekey - - name: proxy-known-hosts - mountPath: /tmp/ssh-proxy-known_hosts - subPath: known_hosts + {% if proxy_id %} + - name: proxy-ssh-keys + mountPath: /tmp/ssh-proxy/private-key + subPath: {{ proxy_id }}-private-key + readOnly: true + - name: proxy-ssh-keys + mountPath: /tmp/ssh-proxy/known-hosts + subPath: {{ proxy_id 
           readOnly: true
         {% endif %}
         - name: crawl-data
           mountPath: /crawls
@@ -200,11 +197,6 @@ spec:
         - name: WARC_PREFIX
           value: "{{ warc_prefix }}"
 
-        {% if crawler_socks_proxy_host %}
-        - name: PROXY_SERVER
-          value: "socks5://{{ crawler_socks_proxy_host }}:{{ crawler_socks_proxy_port }}"
-        {% endif %}
-
       resources:
         limits:
           memory: "{{ memory_limit }}"
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 39748058d4..3f71c111a8 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -53,7 +53,7 @@ data:
 
   STORAGES_JSON: "/ops-configs/storages.json"
   CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"
-  CRAWLER_SSH_PROXIES_JSON: "/ops-configs/crawler_ssh_proxies.json"
+  CRAWLER_PROXIES_JSON: "/ops-configs/crawler_proxies.json"
 
   MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"
diff --git a/chart/templates/proxies.yaml b/chart/templates/proxies.yaml
new file mode 100644
index 0000000000..9510a4092f
--- /dev/null
+++ b/chart/templates/proxies.yaml
@@ -0,0 +1,18 @@
+{{- if .Values.proxies }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: proxy-ssh-keys
+  namespace: {{ .Values.crawler_namespace }}
+#type: kubernetes.io/ssh-auth
+type: Opaque
+stringData:
+{{- range .Values.proxies }}
+  {{ .id }}-private-key: |
+{{ .private_key | indent 4 }}
+  {{ .id }}-known-hosts: {{ .host_public_key }}
+{{- end }}
+
+{{- end }}
+
diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml
index 6df02abe9d..a1e7ced0ba 100644
--- a/chart/templates/secrets.yaml
+++ b/chart/templates/secrets.yaml
@@ -32,7 +32,7 @@ type: Opaque
 data:
   storages.json: {{ .Values.storages | toJson | b64enc | quote }}
   crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }}
-  crawler_ssh_proxies.json: {{ .Values.crawler_ssh_proxies | toJson | b64enc | quote }}
+  crawler_proxies.json: {{ .Values.proxies | toJson | b64enc | quote }}
 
 
 {{- range $storage := .Values.storages }}
diff --git a/chart/templates/sshproxies.yaml b/chart/templates/sshproxies.yaml
deleted file mode 100644
index 1e4dc83d27..0000000000
--- a/chart/templates/sshproxies.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-{{- if .Values.crawler_ssh_proxies }}
-{{- $crawler_namespace := .Values.crawler_namespace -}}
-{{- range .Values.crawler_ssh_proxies }}
----
-apiVersion: v1
-kind: Secret
-metadata:
-  name: crawler-ssh-proxy-key-{{ .id }}
-  namespace: {{ $crawler_namespace | quote }}
-type: kubernetes.io/ssh-auth
-data:
-  ssh-privatekey: {{ .private_key | b64enc | quote }}
-{{- end }}
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: crawler-ssh-proxy-known-hosts
-  namespace: {{ $crawler_namespace | quote }}
-data:
-  known_hosts: |
-    {{- range .Values.crawler_ssh_proxies }}
-    {{ .host_public_key | nindent 4 }}
-    {{- end }}
-{{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
index ca7b39673e..45ba6f5fb3 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -302,13 +302,11 @@ crawler_liveness_port: 6065
 #   crawler_socks_proxy_port: 9050
 
 # optional: configure a list of ssh servers to be used as a proxy
-crawler_ssh_proxies: []
-# - id: USA # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric
+proxies: []
+# - id: My Proxy # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric
 #   label: "US Proxy" # optional: label to show instead of the id in the dropdown
 #   country_code: US # Alpha-2 ISO 3166 country code, https://www.iso.org/obp/ui/#search
-#   hostname: example.com # IP or hostname of ssh server
-#   port: 22
username: proxy +# auth: user@example.com[:port] # login string for SSH, with optional port # private_key: secretkey # ssh-key needed to connect to server # host_public_key: | # ssh public keys of the ssh server, use output of `ssh-keyscan $hostname -p $port` for best results # # example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13 diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 1f8d15af52..abe10b2c4e 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -256,10 +256,10 @@ export class ConfigDetails extends LiteElement { ISO6391.getName(crawlConfig.config.lang), ) : nothing} - ${crawlConfig?.crawlerSSHProxyId + ${crawlConfig?.proxyId ? this.renderSetting( msg("SSH Proxy Server"), - capitalize(crawlConfig.crawlerSSHProxyId), + capitalize(crawlConfig.proxyId), ) : nothing} diff --git a/frontend/src/components/ui/index.ts b/frontend/src/components/ui/index.ts index 30f0685682..ef60ac36d9 100644 --- a/frontend/src/components/ui/index.ts +++ b/frontend/src/components/ui/index.ts @@ -30,7 +30,7 @@ import("./relative-duration"); import("./search-combobox"); import("./section-heading"); import("./select-crawler"); -import("./select-crawler-ssh-proxy"); +import("./select-crawler-proxy"); import("./tab-group"); import("./tab-list"); import("./table"); diff --git a/frontend/src/components/ui/select-crawler-ssh-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts similarity index 50% rename from frontend/src/components/ui/select-crawler-ssh-proxy.ts rename to frontend/src/components/ui/select-crawler-proxy.ts index d9b95fac6e..9f85f6bcda 100644 --- a/frontend/src/components/ui/select-crawler-ssh-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -4,45 +4,45 @@ import { html } from "lit"; import { customElement, property, state } from "lit/decorators.js"; import capitalize from "lodash/fp/capitalize"; -import type { crawlerSSHProxy } from "@/pages/org/types"; +import type { Proxy } from "@/pages/org/types"; import type { AuthState } from "@/utils/AuthService"; import LiteElement from "@/utils/LiteElement"; -type SelectCrawlerSSHProxyChangeDetail = { +type SelectCrawlerProxyChangeDetail = { value: string | null; }; -export type SelectCrawlerSSHProxyChangeEvent = - CustomEvent; +export type SelectCrawlerProxyChangeEvent = + CustomEvent; -type SelectCrawlerSSHProxyUpdateDetail = { +type SelectCrawlerProxyUpdateDetail = { show: boolean; }; -export type SelectCrawlerSSHProxyUpdateEvent = - CustomEvent; +export type SelectCrawlerProxyUpdateEvent = + CustomEvent; -type crawlerSSHProxiesAPIResponse = { - servers: crawlerSSHProxy[]; +type allProxiesAPIResponse = { + servers: Proxy[]; }; /** - * Crawler ssh proxy select dropdown + * Crawler proxy select dropdown * * Usage example: * ```ts - * selectedcrawlerSSHProxy = value} - * > + * on-change=${({value}) => selectedcrawlerProxy = value} + * > * ``` * * @event on-change */ -@customElement("btrix-select-crawler-ssh-proxy") +@customElement("btrix-select-crawler-proxy") @localized() -export class SelectCrawlerSSHProxy extends LiteElement { +export class SelectCrawlerProxy extends LiteElement { @property({ type: Object }) authState!: AuthState; @@ -50,16 +50,16 @@ export class SelectCrawlerSSHProxy extends LiteElement { orgId!: string; @property({ type: String }) - crawlerSSHProxyId: string | null = null; + proxyId: string | null = null; @state() - private selectedSSHProxy?: crawlerSSHProxy; + private 
selectedProxy?: Proxy; @state() - private crawlerSSHProxies?: crawlerSSHProxy[]; + private allProxies?: Proxy[]; protected firstUpdated() { - void this.fetchCrawlerSSHProxies(); + void this.fetchallProxies(); } // credit: https://dev.to/jorik/country-code-to-flag-emoji-a21 private countryCodeToFlagEmoji(countryCode: String): String { @@ -71,41 +71,38 @@ export class SelectCrawlerSSHProxy extends LiteElement { } render() { - /*if (this.crawlerSSHProxys && this.crawlerSSHProxys.length < 2) { + /*if (this.crawlerProxys && this.crawlerProxys.length < 2) { return html``; }*/ return html` { // Refetch to keep list up to date - void this.fetchCrawlerSSHProxies(); + void this.fetchallProxies(); }} @sl-hide=${this.stopProp} @sl-after-hide=${this.stopProp} > - ${this.crawlerSSHProxies?.map( + ${this.allProxies?.map( (server) => html` ${this.countryCodeToFlagEmoji(server.country_code)} - ${capitalize(server.id)} + ${capitalize(server.label)} `, )} - ${this.selectedSSHProxy + ${this.selectedProxy ? html`
${msg("Connection:")}
-                ${this.selectedSSHProxy.username}@${this.selectedSSHProxy
-                  .hostname}
+                ${this.selectedProxy.auth}
` : ``} @@ -116,35 +113,35 @@ export class SelectCrawlerSSHProxy extends LiteElement { private onChange(e: Event) { this.stopProp(e); - this.selectedSSHProxy = this.crawlerSSHProxies?.find( + this.selectedProxy = this.allProxies?.find( ({ id }) => id === (e.target as SlSelect).value, ); this.dispatchEvent( - new CustomEvent("on-change", { + new CustomEvent("on-change", { detail: { - value: this.selectedSSHProxy ? this.selectedSSHProxy.id : null, + value: this.selectedProxy ? this.selectedProxy.id : null, }, }), ); } /** - * Fetch crawler ssh proxies and update internal state + * Fetch crawler proxies and update internal state */ - private async fetchCrawlerSSHProxies(): Promise { + private async fetchallProxies(): Promise { try { - const servers = await this.getCrawlerSSHProxies(); - this.crawlerSSHProxies = servers; + const servers = await this.getallProxies(); + this.allProxies = servers; - if (this.crawlerSSHProxyId && !this.selectedSSHProxy?.id) { - this.selectedSSHProxy = this.crawlerSSHProxies.find( - ({ id }) => id === this.crawlerSSHProxyId, + if (this.proxyId && !this.selectedProxy?.id) { + this.selectedProxy = this.allProxies.find( + ({ id }) => id === this.proxyId, ); } - if (!this.selectedSSHProxy) { - this.crawlerSSHProxyId = null; + if (!this.selectedProxy) { + this.proxyId = null; this.dispatchEvent( new CustomEvent("on-change", { detail: { @@ -152,31 +149,31 @@ export class SelectCrawlerSSHProxy extends LiteElement { }, }), ); - this.selectedSSHProxy = this.crawlerSSHProxies.find( - ({ id }) => id === this.crawlerSSHProxyId, + this.selectedProxy = this.allProxies.find( + ({ id }) => id === this.proxyId, ); } this.dispatchEvent( - new CustomEvent("on-update", { + new CustomEvent("on-update", { detail: { - show: this.crawlerSSHProxies.length > 1, + show: this.allProxies.length > 1, }, }), ); } catch (e) { this.notify({ - message: msg("Sorry, couldn't retrieve ssh proxies at this time."), + message: msg("Sorry, couldn't retrieve proxies at this time."), variant: "danger", icon: "exclamation-octagon", }); } } - private async getCrawlerSSHProxies(): Promise { - const data: crawlerSSHProxiesAPIResponse = - await this.apiFetch( - `/orgs/${this.orgId}/crawlconfigs/crawler-ssh-proxies`, + private async getallProxies(): Promise { + const data: allProxiesAPIResponse = + await this.apiFetch( + `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, this.authState!, ); diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 25b4ca75d5..fc48f37a62 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -42,7 +42,7 @@ import type { SelectCrawlerChangeEvent, SelectCrawlerUpdateEvent, } from "@/components/ui/select-crawler"; -import type { SelectCrawlerSSHProxyChangeEvent } from "@/components/ui/select-crawler-ssh-proxy"; +import type { SelectCrawlerProxyChangeEvent } from "@/components/ui/select-crawler-proxy"; import type { Tab } from "@/components/ui/tab-list"; import type { TagInputEvent, @@ -135,7 +135,7 @@ type FormState = { autoscrollBehavior: boolean; userAgent: string | null; crawlerChannel: string; - crawlerSSHProxyId: string | null; + proxyId: string | null; }; const getDefaultProgressState = (hasConfigId = false): ProgressState => { @@ -213,7 +213,7 @@ const getDefaultFormState = (): FormState => ({ autoscrollBehavior: true, userAgent: null, crawlerChannel: "default", - crawlerSSHProxyId: null, + proxyId: null, }); function getLocalizedWeekDays() { @@ -619,9 +619,7 @@ export class 
CrawlConfigEditor extends LiteElement { this.initialWorkflow.config.userAgent ?? defaultFormState.userAgent, crawlerChannel: this.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel, - crawlerSSHProxyId: - this.initialWorkflow.crawlerSSHProxyId || - defaultFormState.crawlerSSHProxyId, + proxyId: this.initialWorkflow.proxyId || defaultFormState.proxyId, ...formState, }; } @@ -1762,17 +1760,17 @@ https://archiveweb.page/images/${"logo.svg"}`} content in that language if available.`), )} ${this.renderFormCol(html` - + @on-change=${(e: SelectCrawlerProxyChangeEvent) => this.updateFormState({ - crawlerSSHProxyId: e.detail.value, + proxyId: e.detail.value, })} - > + > `)} - ${this.renderHelpTextCol(msg(`Choose a Browsertrix SSH Proxy`))} + ${this.renderHelpTextCol(msg(`Choose a Browsertrix Proxy`))} `; } @@ -2494,7 +2492,7 @@ https://archiveweb.page/images/${"logo.svg"}`} ).join(","), }, crawlerChannel: this.formState.crawlerChannel || "default", - crawlerSSHProxyId: this.formState.crawlerSSHProxyId, + proxyId: this.formState.proxyId, }; return config; diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 7213b803af..7c8788ac6d 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -36,7 +36,7 @@ const defaultValue = { scale: 1, autoAddCollections: [], crawlerChannel: "default", - crawlerSSHProxyId: null, + proxyId: null, } as WorkflowParams; /** diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 60aa6ead4a..47bd38cb32 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -51,7 +51,7 @@ export type WorkflowParams = { description: string | null; autoAddCollections: string[]; crawlerChannel: string; - crawlerSSHProxyId: string | null; + proxyId: string | null; }; export type CrawlConfig = WorkflowParams & { @@ -213,12 +213,11 @@ export type CrawlerChannel = { image: string; }; -export type crawlerSSHProxy = { +export type Proxy = { id: string; + label: string; country_code: string; - hostname: string; - port: number; - username: string; + auth: string; }; export type ArchivedItem = Crawl | Upload; From eb4f9f1b6e7914989594c5c5bf4eb2640c9fea56 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 30 Jul 2024 20:25:55 -0700 Subject: [PATCH 07/50] fixes: ensure proxyId defaults to "" if none secrets: map /etc/passwd and /etc/group to ensure user/group are defined for ssh --- backend/btrixcloud/crawlmanager.py | 2 +- backend/btrixcloud/k8sapi.py | 4 ++-- backend/btrixcloud/operator/crawls.py | 2 +- chart/app-templates/crawler.yaml | 10 +++++++++- chart/templates/proxies.yaml | 9 +++++++++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 5c0c415ec8..bc1eeaf7df 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -138,7 +138,7 @@ async def create_crawl_job( warc_prefix=warc_prefix, storage_filename=storage_filename, profile_filename=profile_filename, - proxy_id=crawlconfig.proxyId, + proxy_id=crawlconfig.proxyId or "", ) async def create_qa_crawl_job( diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 73beed6bfc..238155d212 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -93,7 +93,7 @@ def new_crawl_job_yaml( storage_filename: str = "", profile_filename: str = "", qa_source: str = "", - proxy_id: Optional[str] = None, + proxy_id: str = "", ): """load 
job template from yaml""" if not crawl_id: @@ -138,7 +138,7 @@ async def new_crawl_job( storage_filename: str = "", profile_filename: str = "", qa_source: str = "", - proxy_id: Optional[str] = None, + proxy_id: str = "", ) -> str: """load and init crawl job via k8s api""" crawl_id, data = self.new_crawl_job_yaml( diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index da5b4fb21d..31e671f3e9 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -279,7 +279,7 @@ async def sync_crawls(self, data: MCSyncData): params["crawler_image"] = status.crawlerImage - if crawl.proxy_id: + if crawl.proxy_id and not crawl.is_qa: proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id) params["proxy_id"] = crawl.proxy_id params["ssh_proxy_auth"] = proxy.auth if proxy else "" diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index d2df3331a2..35ac242b92 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -163,7 +163,15 @@ spec: mountPath: /tmp/ssh-proxy/known-hosts subPath: {{ proxy_id }}-known-hosts readOnly: true - {% endif %} + - name: proxy-ssh-keys + mountPath: /etc/passwd + subPath: passwd + readOnly: true + - name: proxy-ssh-keys + mountPath: /etc/group + subPath: group + readOnly: true + {% endif %} - name: crawl-data mountPath: /crawls envFrom: diff --git a/chart/templates/proxies.yaml b/chart/templates/proxies.yaml index 9510a4092f..7117631b13 100644 --- a/chart/templates/proxies.yaml +++ b/chart/templates/proxies.yaml @@ -14,5 +14,14 @@ stringData: {{ .id }}-known-hosts: {{ .host_public_key }} {{- end }} + # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies + passwd: | + root:x:0:0:root:/root:/bin/bash + btrix:btrix:{{ .Values.crawler_uid | default 201407 }}:{{ .Values.crawler_gid | default 201407 }}::/tmp/btrix:/bin/sh + + group: | + root:x:0: + btrix:x:{{ .Values.crawler_gid | default 201407 }}: + {{- end }} From ba07896cd4064105f9a2ed32ca3c19595edf66e5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 30 Jul 2024 20:40:05 -0700 Subject: [PATCH 08/50] version: bump to 1.12.0-beta.0 --- backend/btrixcloud/version.py | 2 +- chart/Chart.yaml | 2 +- chart/values.yaml | 4 ++-- frontend/package.json | 2 +- version.txt | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index 62094c2b73..1607b7d9cf 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """ current version """ -__version__ = "1.11.1" +__version__ = "1.12.0-beta.0" diff --git a/chart/Chart.yaml b/chart/Chart.yaml index e0d6041a3e..700ec2863b 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.11.1 +version: v1.12.0-beta.0 dependencies: - name: btrix-admin-logging diff --git a/chart/values.yaml b/chart/values.yaml index 45ba6f5fb3..7125995a71 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -90,7 +90,7 @@ default_org: "My Organization" # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.11.1" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.12.0-beta.0" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" 
@@ -135,7 +135,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.11.1" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.12.0-beta.0" frontend_pull_policy: "Always" frontend_cpu: "10m" diff --git a/frontend/package.json b/frontend/package.json index 230beb0f98..4cd5555a70 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.11.1", + "version": "1.12.0-beta.0", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/version.txt b/version.txt index 720c7384c6..c338dd93b3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.11.1 +1.12.0-beta.0 From f0a3d11aebbf558c2f2e5e546233d191fd279c2e Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Wed, 31 Jul 2024 10:46:01 +0200 Subject: [PATCH 09/50] fixes: ssh proxy - allow multiline known_hosts file --- chart/templates/proxies.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chart/templates/proxies.yaml b/chart/templates/proxies.yaml index 7117631b13..fc05ddd106 100644 --- a/chart/templates/proxies.yaml +++ b/chart/templates/proxies.yaml @@ -11,7 +11,8 @@ stringData: {{- range .Values.proxies }} {{ .id }}-private-key: | {{ .private_key | indent 4 }} - {{ .id }}-known-hosts: {{ .host_public_key }} + {{ .id }}-known-hosts: | + {{ .host_public_key | nindent 4 }} {{- end }} # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies @@ -24,4 +25,3 @@ stringData: btrix:x:{{ .Values.crawler_gid | default 201407 }}: {{- end }} - From e893f892ae5fc3890e222ed9a13278235aa4de50 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 31 Jul 2024 11:26:12 -0700 Subject: [PATCH 10/50] add proxy support for profiles! 
pass proxy as ssh:// instead of separate --sshProxyLogin flag --- backend/btrixcloud/crawlmanager.py | 2 ++ backend/btrixcloud/operator/profiles.py | 6 ++++ backend/btrixcloud/profiles.py | 1 + chart/app-templates/crawler.yaml | 4 +-- chart/app-templates/profile_job.yaml | 2 ++ chart/app-templates/profilebrowser.yaml | 33 +++++++++++++++++++ chart/templates/proxies.yaml | 2 +- .../new-browser-profile-dialog.ts | 20 ++++++++++- .../src/pages/org/browser-profiles-new.ts | 9 +++++ 9 files changed, 75 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index bc1eeaf7df..e3fa3c4383 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -34,6 +34,7 @@ async def run_profile_browser( crawler_image: str, baseprofile: str = "", profile_filename: str = "", + proxy_id: str = "", ) -> str: """run browser for profile creation""" @@ -55,6 +56,7 @@ async def run_profile_browser( "vnc_password": secrets.token_hex(16), "expire_time": to_k8s_date(dt_now() + timedelta(seconds=30)), "crawler_image": crawler_image, + "proxy_id": proxy_id, } data = self.templates.env.get_template("profile_job.yaml").render(params) diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py index 713252d7c5..a213df7f79 100644 --- a/backend/btrixcloud/operator/profiles.py +++ b/backend/btrixcloud/operator/profiles.py @@ -49,6 +49,12 @@ async def sync_profile_browsers(self, data: MCSyncData): params["profile_filename"] = spec.get("profileFilename", "") params["crawler_image"] = spec["crawlerImage"] + proxy_id = spec.get("proxyId") + params["proxy_id"] = proxy_id + if proxy_id: + proxy = self.crawl_config_ops.get_crawler_proxy(proxy_id) + params["ssh_proxy_auth"] = proxy.auth if proxy else "" + params["url"] = spec.get("startUrl", "about:blank") params["vnc_password"] = spec.get("vncPassword") diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index fe16e80b9d..25a7e382e2 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -115,6 +115,7 @@ async def create_new_browser( crawler_image=crawler_image, baseprofile=prev_profile_id, profile_filename=prev_profile_path, + proxy_id=profile_launch.proxyId or "", ) if not browserid: diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 35ac242b92..0135798114 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -136,8 +136,8 @@ spec: - "@{{ profile_filename }}" {% endif %} {% if proxy_id %} - - --sshProxyLogin - - "{{ ssh_proxy_auth }}" + - --proxyServer + - "ssh://{{ ssh_proxy_auth }}" - --sshProxyPrivateKeyFile - /tmp/ssh-proxy/private-key - --sshProxyKnownHostsFile diff --git a/chart/app-templates/profile_job.yaml b/chart/app-templates/profile_job.yaml index 2f3f6f8865..fc6f61fbfc 100644 --- a/chart/app-templates/profile_job.yaml +++ b/chart/app-templates/profile_job.yaml @@ -28,6 +28,8 @@ spec: profileFilename: "{{ profile_filename }}" vncPassword: "{{ vnc_password }}" + proxyId: "{{ proxy_id }}" + {% if expire_time %} expireTime: "{{ expire_time }}" {% endif %} diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml index 8eda40c615..fdc0eb701d 100644 --- a/chart/app-templates/profilebrowser.yaml +++ b/chart/app-templates/profilebrowser.yaml @@ -26,6 +26,13 @@ spec: emptyDir: sizeLimit: {{ profile_browser_workdir_size }} + {% if proxy_id %} + - name: proxy-ssh-keys + secret: + secretName: 
proxy-ssh-keys + defaultMode: 0600 + {% endif %} + {% if priorityClassName %} priorityClassName: {{ priorityClassName }} {% endif %} @@ -73,10 +80,36 @@ spec: - --profile - "@{{ profile_filename }}" {%- endif %} + {% if proxy_id %} + - --proxyServer + - "ssh://{{ ssh_proxy_auth }}" + - --sshProxyPrivateKeyFile + - /tmp/ssh-proxy/private-key + - --sshProxyKnownHostsFile + - /tmp/ssh-proxy/known-hosts + {% endif %} volumeMounts: - name: crawler-workdir mountPath: /tmp/home + {% if proxy_id %} + - name: proxy-ssh-keys + mountPath: /tmp/ssh-proxy/private-key + subPath: {{ proxy_id }}-private-key + readOnly: true + - name: proxy-ssh-keys + mountPath: /tmp/ssh-proxy/known-hosts + subPath: {{ proxy_id }}-known-hosts + readOnly: true + - name: proxy-ssh-keys + mountPath: /etc/passwd + subPath: passwd + readOnly: true + - name: proxy-ssh-keys + mountPath: /etc/group + subPath: group + readOnly: true + {% endif %} envFrom: - secretRef: diff --git a/chart/templates/proxies.yaml b/chart/templates/proxies.yaml index fc05ddd106..72acb13648 100644 --- a/chart/templates/proxies.yaml +++ b/chart/templates/proxies.yaml @@ -12,7 +12,7 @@ stringData: {{ .id }}-private-key: | {{ .private_key | indent 4 }} {{ .id }}-known-hosts: | - {{ .host_public_key | nindent 4 }} +{{ .host_public_key | indent 4 }} {{- end }} # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies diff --git a/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts b/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts index fcd0b180aa..a959373927 100644 --- a/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts +++ b/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts @@ -10,7 +10,8 @@ import { import queryString from "query-string"; import type { Dialog } from "@/components/ui/dialog"; -import { type SelectCrawlerChangeEvent } from "@/components/ui/select-crawler"; +import type { SelectCrawlerChangeEvent } from "@/components/ui/select-crawler"; +import type { SelectCrawlerProxyChangeEvent } from "@/components/ui/select-crawler-proxy"; import type { AuthState } from "@/utils/AuthService"; import LiteElement, { html } from "@/utils/LiteElement"; @@ -32,6 +33,9 @@ export class NewBrowserProfileDialog extends LiteElement { @state() private crawlerChannel = "default"; + @state() + private proxyId: string | null = null; + @query("btrix-dialog") private readonly dialog?: Dialog; @@ -88,6 +92,15 @@ export class NewBrowserProfileDialog extends LiteElement { (this.crawlerChannel = e.detail.value!)} > +
+ + (this.proxyId = e.detail.value!)} + > +
@@ -135,6 +148,7 @@ export class NewBrowserProfileDialog extends LiteElement { const data = await this.createBrowser({ url: url, crawlerChannel: this.crawlerChannel, + proxyId: this.proxyId, }); this.notify({ @@ -150,6 +164,7 @@ export class NewBrowserProfileDialog extends LiteElement { url, name: msg("My Profile"), crawlerChannel: this.crawlerChannel, + proxyId: this.proxyId, })}`, ); } catch (e) { @@ -165,13 +180,16 @@ export class NewBrowserProfileDialog extends LiteElement { private async createBrowser({ url, crawlerChannel, + proxyId, }: { url: string; crawlerChannel: string; + proxyId: string | null; }) { const params = { url, crawlerChannel, + proxyId, }; return this.apiFetch<{ browserid: string }>( diff --git a/frontend/src/pages/org/browser-profiles-new.ts b/frontend/src/pages/org/browser-profiles-new.ts index dc76ded260..55cd9aa717 100644 --- a/frontend/src/pages/org/browser-profiles-new.ts +++ b/frontend/src/pages/org/browser-profiles-new.ts @@ -43,9 +43,11 @@ export class BrowserProfilesNew extends TailwindElement { crawlerChannel?: string; profileId?: string | null; navigateUrl?: string; + proxyId: string | null; } = { name: "", url: "", + proxyId: null, }; private readonly api = new APIController(this); @@ -288,9 +290,11 @@ export class BrowserProfilesNew extends TailwindElement { } const crawlerChannel = this.browserParams.crawlerChannel || "default"; + const proxyId = this.browserParams.proxyId; const data = await this.createBrowser({ url, crawlerChannel, + proxyId, }); this.nav.to( @@ -300,6 +304,7 @@ export class BrowserProfilesNew extends TailwindElement { url, name: this.browserParams.name || msg("My Profile"), crawlerChannel, + proxyId, })}`, ); } @@ -314,6 +319,7 @@ export class BrowserProfilesNew extends TailwindElement { name: formData.get("name"), description: formData.get("description"), crawlerChannel: this.browserParams.crawlerChannel, + proxyId: this.browserParams.proxyId, }; try { @@ -362,13 +368,16 @@ export class BrowserProfilesNew extends TailwindElement { private async createBrowser({ url, crawlerChannel, + proxyId, }: { url: string; crawlerChannel: string; + proxyId: string | null; }) { const params = { url, crawlerChannel, + proxyId, }; return this.api.fetch<{ browserid: string }>( From e59e1c8e30227551570a1e6b61660c949b2df79d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 31 Jul 2024 17:47:45 -0700 Subject: [PATCH 11/50] make proxies more generic, can support ssh://, socks5:// and http:// require ssh_private_key and ssh_host_public_key for ssh proxies only support description in frontend support 'default_proxy' which sets default proxy if none is specified --- backend/btrixcloud/crawlconfigs.py | 9 ++++--- backend/btrixcloud/crawlmanager.py | 14 +++++------ backend/btrixcloud/models.py | 7 ++++-- backend/btrixcloud/operator/crawls.py | 9 ++++--- backend/btrixcloud/operator/profiles.py | 7 ++++-- chart/app-templates/crawler.yaml | 22 ++++++++++------ chart/app-templates/profilebrowser.yaml | 22 ++++++++++------ chart/templates/configmap.yaml | 2 ++ chart/templates/proxies.yaml | 13 +++++++--- chart/values.yaml | 25 ++++++++++++------- .../src/components/ui/select-crawler-proxy.ts | 10 +++++--- frontend/src/types/crawler.ts | 4 +-- 12 files changed, 93 insertions(+), 51 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 54f882d22b..c350aec1a3 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -135,8 +135,11 @@ def __init__( proxy = CrawlerProxy( 
id=proxy_data["id"], label=proxy_data["label"], - country_code=proxy_data["country_code"], - auth=proxy_data["auth"], + description=proxy_data.get("description", ""), + country_code=proxy_data.get("country_code", ""), + url=proxy_data["url"], + has_host_public_key=bool(proxy_data.get("ssh_host_public_key")), + has_private_key=bool(proxy_data.get("ssh_private_key")), ) self.crawler_proxies_map[proxy.id] = proxy @@ -145,8 +148,6 @@ def __init__( servers=list(self.crawler_proxies_map.values()) ) - print(self.crawler_proxies) - def set_crawl_ops(self, ops): """set crawl ops reference""" self.crawl_ops = ops diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index e3fa3c4383..9907cc3604 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -1,7 +1,6 @@ """ shared crawl manager implementation """ import os -import asyncio import secrets from typing import Optional, Dict @@ -16,13 +15,12 @@ # ============================================================================ -class CrawlManager(K8sAPI): - """abstract crawl manager""" +DEFAULT_PROXY_ID: str = os.environ.get("DEFAULT_PROXY_ID", "") - def __init__(self): - super().__init__() - self.loop = asyncio.get_running_loop() +# ============================================================================ +class CrawlManager(K8sAPI): + """abstract crawl manager""" # pylint: disable=too-many-arguments async def run_profile_browser( @@ -56,7 +54,7 @@ async def run_profile_browser( "vnc_password": secrets.token_hex(16), "expire_time": to_k8s_date(dt_now() + timedelta(seconds=30)), "crawler_image": crawler_image, - "proxy_id": proxy_id, + "proxy_id": proxy_id or DEFAULT_PROXY_ID, } data = self.templates.env.get_template("profile_job.yaml").render(params) @@ -140,7 +138,7 @@ async def create_crawl_job( warc_prefix=warc_prefix, storage_filename=storage_filename, profile_filename=profile_filename, - proxy_id=crawlconfig.proxyId or "", + proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID, ) async def create_qa_crawl_job( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 84dcbca9f2..b51151ff9f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -580,9 +580,12 @@ class CrawlerProxy(BaseModel): """proxy definition""" id: str + url: str label: str - country_code: str - auth: str + description: str = "" + country_code: str = "" + has_host_public_key: bool + has_private_key: bool # ============================================================================ diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 31e671f3e9..23e789c73c 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -281,10 +281,11 @@ async def sync_crawls(self, data: MCSyncData): if crawl.proxy_id and not crawl.is_qa: proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id) - params["proxy_id"] = crawl.proxy_id - params["ssh_proxy_auth"] = proxy.auth if proxy else "" - else: - params["proxy_id"] = None + if proxy: + params["proxy_id"] = crawl.proxy_id + params["proxy_url"] = proxy.url + params["proxy_ssh_private_key"] = proxy.has_private_key + params["proxy_ssh_host_public_key"] = proxy.has_host_public_key params["storage_filename"] = spec["storage_filename"] params["restart_time"] = spec.get("restartTime") diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py index a213df7f79..115e23639e 100644 --- 
a/backend/btrixcloud/operator/profiles.py +++ b/backend/btrixcloud/operator/profiles.py @@ -50,10 +50,13 @@ async def sync_profile_browsers(self, data: MCSyncData): params["crawler_image"] = spec["crawlerImage"] proxy_id = spec.get("proxyId") - params["proxy_id"] = proxy_id if proxy_id: proxy = self.crawl_config_ops.get_crawler_proxy(proxy_id) - params["ssh_proxy_auth"] = proxy.auth if proxy else "" + if proxy: + params["proxy_id"] = proxy_id + params["proxy_url"] = proxy.url + params["proxy_ssh_private_key"] = proxy.has_private_key + params["proxy_ssh_host_public_key"] = proxy.has_host_public_key params["url"] = spec.get("startUrl", "about:blank") params["vnc_password"] = spec.get("vncPassword") diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 0135798114..256ef16cc6 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -72,9 +72,9 @@ spec: persistentVolumeClaim: claimName: {{ name }} {% if proxy_id %} - - name: proxy-ssh-keys + - name: proxies secret: - secretName: proxy-ssh-keys + secretName: proxies defaultMode: 0600 {% endif %} @@ -137,12 +137,16 @@ spec: {% endif %} {% if proxy_id %} - --proxyServer - - "ssh://{{ ssh_proxy_auth }}" + - "{{ proxy_url }}" + {% if proxy_ssh_private_key %} - --sshProxyPrivateKeyFile - /tmp/ssh-proxy/private-key + {% endif %} + {% if proxy_ssh_host_public_key %} - --sshProxyKnownHostsFile - /tmp/ssh-proxy/known-hosts {% endif %} + {% endif %} volumeMounts: - name: crawl-config mountPath: /tmp/crawl-config.json @@ -155,19 +159,23 @@ spec: readOnly: True {% endif %} {% if proxy_id %} - - name: proxy-ssh-keys + {% if proxy_ssh_private_key %} + - name: proxies mountPath: /tmp/ssh-proxy/private-key subPath: {{ proxy_id }}-private-key readOnly: true - - name: proxy-ssh-keys + {% endif %} + {% if proxy_ssh_host_public_key %} + - name: proxies mountPath: /tmp/ssh-proxy/known-hosts subPath: {{ proxy_id }}-known-hosts readOnly: true - - name: proxy-ssh-keys + {% endif %} + - name: proxies mountPath: /etc/passwd subPath: passwd readOnly: true - - name: proxy-ssh-keys + - name: proxies mountPath: /etc/group subPath: group readOnly: true diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml index fdc0eb701d..c2119b4f27 100644 --- a/chart/app-templates/profilebrowser.yaml +++ b/chart/app-templates/profilebrowser.yaml @@ -27,9 +27,9 @@ spec: sizeLimit: {{ profile_browser_workdir_size }} {% if proxy_id %} - - name: proxy-ssh-keys + - name: proxies secret: - secretName: proxy-ssh-keys + secretName: proxies defaultMode: 0600 {% endif %} @@ -82,30 +82,38 @@ spec: {%- endif %} {% if proxy_id %} - --proxyServer - - "ssh://{{ ssh_proxy_auth }}" + - "{{ proxy_url }}" + {% if proxy_ssh_private_key %} - --sshProxyPrivateKeyFile - /tmp/ssh-proxy/private-key + {% endif %} + {% if proxy_ssh_host_public_key %} - --sshProxyKnownHostsFile - /tmp/ssh-proxy/known-hosts {% endif %} + {% endif %} volumeMounts: - name: crawler-workdir mountPath: /tmp/home {% if proxy_id %} - - name: proxy-ssh-keys + {% if proxy_ssh_private_key %} + - name: proxies mountPath: /tmp/ssh-proxy/private-key subPath: {{ proxy_id }}-private-key readOnly: true - - name: proxy-ssh-keys + {% endif %} + {% if proxy_ssh_host_public_key %} + - name: proxies mountPath: /tmp/ssh-proxy/known-hosts subPath: {{ proxy_id }}-known-hosts readOnly: true - - name: proxy-ssh-keys + {% endif %} + - name: proxies mountPath: /etc/passwd subPath: passwd readOnly: true - - name: proxy-ssh-keys + - name: proxies mountPath: 
/etc/group
         subPath: group
         readOnly: true
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 3f71c111a8..001d712a82 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -55,6 +55,8 @@ data:
   CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"
   CRAWLER_PROXIES_JSON: "/ops-configs/crawler_proxies.json"
 
+  DEFAULT_PROXY_ID: "{{ .Values.default_proxy }}"
+
   MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"
 
   MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
 
diff --git a/chart/templates/proxies.yaml b/chart/templates/proxies.yaml
index 72acb13648..f10b6e3480 100644
--- a/chart/templates/proxies.yaml
+++ b/chart/templates/proxies.yaml
@@ -3,16 +3,23 @@
 apiVersion: v1
 kind: Secret
 metadata:
-  name: proxy-ssh-keys
+  name: proxies
   namespace: {{ .Values.crawler_namespace }}
 #type: kubernetes.io/ssh-auth
 type: Opaque
 stringData:
 {{- range .Values.proxies }}
+
+{{- if .ssh_private_key }}
   {{ .id }}-private-key: |
-{{ .private_key | indent 4 }}
+{{ .ssh_private_key | indent 4 }}
+{{- end }}
+
+{{- if .ssh_host_public_key }}
   {{ .id }}-known-hosts: |
-{{ .host_public_key | indent 4 }}
+{{ .ssh_host_public_key | indent 4 }}
+{{- end }}
+
 {{- end }}
 
 # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies
diff --git a/chart/values.yaml b/chart/values.yaml
index 7125995a71..b15f0112f8 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -297,22 +297,29 @@ crawler_session_time_limit_seconds: 18000
 
 crawler_liveness_port: 6065
 
-# optional: use socks5 proxy for crawler and profilebrowser
-# crawler_socks_proxy_host: 192.0.2.1
-# crawler_socks_proxy_port: 9050
-
 # optional: configure a list of ssh servers to be used as a proxy
 proxies: []
 # - id: My Proxy # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric
+# url: # proxy connection string, must be a ssh://, socks:// or http:// URL
 # label: "US Proxy" # optional: label to show instead of the id in the dropdown
 # country_code: US # Alpha-2 ISO 3991 country code, https://www.iso.org/obp/ui/#search
-# auth: user@example.com[:port] # login string for SSH, with optional port
-# private_key: secretkey # ssh-key needed to connect to server
-# host_public_key: | # ssh public keys of the ssh server, use output of `ssh-keyscan $hostname -p $port` for best results
-# # example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
-# example.invalid ssh-rsa AAA[..]
+#
+# ssh_private_key: | # required for ssh:// proxies
+#   # ssh-key needed to connect to the SSH server
+#
+
+# ssh_host_public_key: | # optional, for ssh:// proxies only
+#   # ssh public keys of the SSH server
+#   # use output of `ssh-keyscan $hostname -p $port` for best results
+#   example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
+#   example.invalid ssh-rsa AAA[..]
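+#
+# Illustrative sketch (not part of the original patch; the host, port, user
+# and key material below are placeholders): a filled-in ssh:// entry using
+# the fields documented above might look like:
+#
+# - id: usproxy1
+#   label: "US Proxy 1"
+#   country_code: US
+#   url: ssh://proxy-user@proxy1.example.invalid:2222
+#   ssh_private_key: |
+#     -----BEGIN OPENSSH PRIVATE KEY-----
+#     [..]
+#     -----END OPENSSH PRIVATE KEY-----
+#   ssh_host_public_key: |
+#     proxy1.example.invalid:2222 SSH-2.0-OpenSSH_9.6p1
+#     proxy1.example.invalid ssh-ed25519 AAAA[..]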
+ +# optional: always proxy with following id by default, if no other proxy settings are set +# must match one of the proxies in the 'proxies' list +# default_proxy: # optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods +# the following values are used by default: # crawler_uid: 201407 # crawler_gid: 201407 # crawler_fsgroup: 201407 diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index 9f85f6bcda..6d6ca231ca 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -94,15 +94,19 @@ export class SelectCrawlerProxy extends LiteElement { ${this.allProxies?.map( (server) => html` - ${this.countryCodeToFlagEmoji(server.country_code)} + ${server.country_code + ? this.countryCodeToFlagEmoji(server.country_code) + : ""} ${capitalize(server.label)} `, )} ${this.selectedProxy ? html`
-                ${msg("Connection:")}
-                ${this.selectedProxy.auth}
+                ${msg("Description:")}
+                ${this.selectedProxy.description || ""}
` : ``} diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 47bd38cb32..3112bb1e8c 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -216,8 +216,8 @@ export type CrawlerChannel = { export type Proxy = { id: string; label: string; - country_code: string; - auth: string; + country_code: string | null; + description: string | null; }; export type ArchivedItem = Crawl | Upload; From d575b87aaa8f83a2e9b116f67c025d8e29094607 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Wed, 7 Aug 2024 10:02:49 +0200 Subject: [PATCH 12/50] show default proxy in `select-crawler-proxy` + misc visual fixes --- backend/btrixcloud/crawlconfigs.py | 2 + backend/btrixcloud/models.py | 2 +- chart/values.yaml | 4 +- .../src/components/ui/select-crawler-proxy.ts | 47 ++++++++++++++----- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index c350aec1a3..4dc560f14d 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -66,6 +66,7 @@ "name", ) +DEFAULT_PROXY_ID: str | None = os.environ.get("DEFAULT_PROXY_ID") # ============================================================================ class CrawlConfigOps: @@ -145,6 +146,7 @@ def __init__( self.crawler_proxies_map[proxy.id] = proxy self.crawler_proxies = CrawlerProxies( + default_proxy_id=DEFAULT_PROXY_ID, servers=list(self.crawler_proxies_map.values()) ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index b51151ff9f..c81dfd807b 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -591,7 +591,7 @@ class CrawlerProxy(BaseModel): # ============================================================================ class CrawlerProxies(BaseModel): """List of CrawlerProxy instances for API""" - + default_proxy_id: Optional[str] = None servers: List[CrawlerProxy] = [] diff --git a/chart/values.yaml b/chart/values.yaml index b15f0112f8..c67b649332 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -301,9 +301,9 @@ crawler_liveness_port: 6065 proxies: [] # - id: My Proxy # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric # url: # proxy connection string, must be a ssh://, socks:// or http:// URL -# label: "US Proxy" # optional: label to show instead of the id in the dropdown +# label: "US Proxy" # label to show in dropdown # country_code: US # Alpha-2 ISO 3991 country code, https://www.iso.org/obp/ui/#search -# +# description: "Proxy" # optional: description to show for the proxy # ssh_private_key: | # requred for ssh:// proxies # # ssh-key needed to connect to the SSH server # diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index 6d6ca231ca..1f91705bd8 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -2,7 +2,6 @@ import { localized, msg } from "@lit/localize"; import { type SlSelect } from "@shoelace-style/shoelace"; import { html } from "lit"; import { customElement, property, state } from "lit/decorators.js"; -import capitalize from "lodash/fp/capitalize"; import type { Proxy } from "@/pages/org/types"; import type { AuthState } from "@/utils/AuthService"; @@ -23,6 +22,7 @@ export type SelectCrawlerProxyUpdateEvent = CustomEvent; type allProxiesAPIResponse = { + default_proxy_id: string | null; servers: Proxy[]; }; @@ -55,11 +55,14 @@ export class 
SelectCrawlerProxy extends LiteElement { @state() private selectedProxy?: Proxy; + @state() + private defaultProxy?: Proxy; + @state() private allProxies?: Proxy[]; protected firstUpdated() { - void this.fetchallProxies(); + void this.fetchAllProxies(); } // credit: https://dev.to/jorik/country-code-to-flag-emoji-a21 private countryCodeToFlagEmoji(countryCode: String): String { @@ -80,13 +83,15 @@ export class SelectCrawlerProxy extends LiteElement { name="crawlerProxy-select" label=${msg("Crawler Proxy Server")} value=${this.selectedProxy?.id || ""} - placeholder=${msg("No Proxy")} + placeholder=${this.defaultProxy + ? `${msg(`Default Proxy:`)} ${this.defaultProxy.label}` + : msg("No Proxy")} hoist clearable @sl-change=${this.onChange} @sl-focus=${() => { // Refetch to keep list up to date - void this.fetchallProxies(); + void this.fetchAllProxies(); }} @sl-hide=${this.stopProp} @sl-after-hide=${this.stopProp} @@ -95,9 +100,11 @@ export class SelectCrawlerProxy extends LiteElement { (server) => html` ${server.country_code - ? this.countryCodeToFlagEmoji(server.country_code) + ? html` + ${this.countryCodeToFlagEmoji(server.country_code)} + ` : ""} - ${capitalize(server.label)} + ${server.label} `, )} ${this.selectedProxy @@ -110,6 +117,16 @@ export class SelectCrawlerProxy extends LiteElement {
` : ``} + ${!this.selectedProxy && this.defaultProxy + ? html` +
+ ${msg("Description:")} + ${this.defaultProxy.description || ""} +
+ ` + : ``}
`; } @@ -133,10 +150,18 @@ export class SelectCrawlerProxy extends LiteElement { /** * Fetch crawler proxies and update internal state */ - private async fetchallProxies(): Promise { + private async fetchAllProxies(): Promise { try { - const servers = await this.getallProxies(); - this.allProxies = servers; + const data = await this.getAllProxies(); + const defaultProxyId = data.default_proxy_id; + + this.allProxies = data.servers; + + if (!this.defaultProxy) { + this.defaultProxy = this.allProxies.find( + ({ id }) => id === defaultProxyId, + ); + } if (this.proxyId && !this.selectedProxy?.id) { this.selectedProxy = this.allProxies.find( @@ -174,14 +199,14 @@ export class SelectCrawlerProxy extends LiteElement { } } - private async getallProxies(): Promise { + private async getAllProxies(): Promise { const data: allProxiesAPIResponse = await this.apiFetch( `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, this.authState!, ); - return data.servers; + return data; } /** From 3969513583fee5eed83b8662d08c7afa40741c66 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 7 Aug 2024 18:08:35 -0700 Subject: [PATCH 13/50] reformat --- backend/btrixcloud/crawlconfigs.py | 3 ++- backend/btrixcloud/models.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 4dc560f14d..286da0bd48 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -68,6 +68,7 @@ DEFAULT_PROXY_ID: str | None = os.environ.get("DEFAULT_PROXY_ID") + # ============================================================================ class CrawlConfigOps: """Crawl Config Operations""" @@ -147,7 +148,7 @@ def __init__( self.crawler_proxies = CrawlerProxies( default_proxy_id=DEFAULT_PROXY_ID, - servers=list(self.crawler_proxies_map.values()) + servers=list(self.crawler_proxies_map.values()), ) def set_crawl_ops(self, ops): diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 9e7c791be1..3277394e78 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -590,6 +590,7 @@ class CrawlerProxy(BaseModel): # ============================================================================ class CrawlerProxies(BaseModel): """List of CrawlerProxy instances for API""" + default_proxy_id: Optional[str] = None servers: List[CrawlerProxy] = [] From ce71535cda130cc4fd815d5b05e3ece3d9d7ee65 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Aug 2024 19:12:23 -0700 Subject: [PATCH 14/50] fix ui post frontend refactor, remove authstate --- .../src/components/ui/select-crawler-proxy.ts | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index 1f91705bd8..39acb7aea7 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -4,7 +4,6 @@ import { html } from "lit"; import { customElement, property, state } from "lit/decorators.js"; import type { Proxy } from "@/pages/org/types"; -import type { AuthState } from "@/utils/AuthService"; import LiteElement from "@/utils/LiteElement"; type SelectCrawlerProxyChangeDetail = { @@ -21,7 +20,7 @@ type SelectCrawlerProxyUpdateDetail = { export type SelectCrawlerProxyUpdateEvent = CustomEvent; -type allProxiesAPIResponse = { +type AllProxiesAPIResponse = { default_proxy_id: string | null; servers: Proxy[]; }; @@ -32,7 +31,6 @@ type 
allProxiesAPIResponse = { * Usage example: * ```ts * selectedcrawlerProxy = value} * > @@ -43,12 +41,6 @@ type allProxiesAPIResponse = { @customElement("btrix-select-crawler-proxy") @localized() export class SelectCrawlerProxy extends LiteElement { - @property({ type: Object }) - authState!: AuthState; - - @property({ type: String }) - orgId!: string; - @property({ type: String }) proxyId: string | null = null; @@ -199,9 +191,9 @@ export class SelectCrawlerProxy extends LiteElement { } } - private async getAllProxies(): Promise { - const data: allProxiesAPIResponse = - await this.apiFetch( + private async getAllProxies(): Promise { + const data: AllProxiesAPIResponse = + await this.apiFetch( `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, this.authState!, ); From c7b33fcb2ff71842e3901dced9eecd07b5192167 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Aug 2024 20:06:45 -0700 Subject: [PATCH 15/50] more removal of authstate, including from comments --- frontend/src/components/ui/select-crawler-proxy.ts | 1 - frontend/src/components/ui/select-crawler.ts | 1 - .../features/browser-profiles/new-browser-profile-dialog.ts | 1 - frontend/src/features/browser-profiles/profile-browser.ts | 1 - .../src/features/browser-profiles/select-browser-profile.ts | 1 - frontend/src/index.test.ts | 6 +++--- frontend/src/pages/org/browser-profiles-detail.ts | 1 - frontend/src/pages/org/browser-profiles-list.ts | 1 - frontend/src/pages/org/browser-profiles-new.ts | 1 - frontend/src/pages/org/workflow-editor.ts | 1 - 10 files changed, 3 insertions(+), 12 deletions(-) diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index 39acb7aea7..82b21a7b88 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -195,7 +195,6 @@ export class SelectCrawlerProxy extends LiteElement { const data: AllProxiesAPIResponse = await this.apiFetch( `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, - this.authState!, ); return data; diff --git a/frontend/src/components/ui/select-crawler.ts b/frontend/src/components/ui/select-crawler.ts index 9219db8248..f10d945fc6 100644 --- a/frontend/src/components/ui/select-crawler.ts +++ b/frontend/src/components/ui/select-crawler.ts @@ -29,7 +29,6 @@ type CrawlerChannelsAPIResponse = { * Usage example: * ```ts * selectedCrawler = value} * > * ``` diff --git a/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts b/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts index 842ef2c56f..62ef3844bb 100644 --- a/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts +++ b/frontend/src/features/browser-profiles/new-browser-profile-dialog.ts @@ -87,7 +87,6 @@ export class NewBrowserProfileDialog extends LiteElement { (this.proxyId = e.detail.value!)} > diff --git a/frontend/src/features/browser-profiles/profile-browser.ts b/frontend/src/features/browser-profiles/profile-browser.ts index e3b06723b8..24f755117b 100644 --- a/frontend/src/features/browser-profiles/profile-browser.ts +++ b/frontend/src/features/browser-profiles/profile-browser.ts @@ -27,7 +27,6 @@ export type BrowserConnectionChange = { * Usage example: * ```ts * selectedProfile = value} * > * ``` diff --git a/frontend/src/index.test.ts b/frontend/src/index.test.ts index b7bd3ce98e..ff8b3fe421 100644 --- a/frontend/src/index.test.ts +++ b/frontend/src/index.test.ts @@ -86,7 +86,7 @@ describe("browsertrix-app", () => { Promise.resolve(mockAPIUser), ); 
stub(AuthService.prototype, "startFreshnessCheck").callsFake(() => {}); - stub(AuthService, "initSessionStorage").callsFake(() => + stub(AuthService, "initSessionStorage").callsFake(async () => Promise.resolve({ headers: { Authorization: "_fake_headers_" }, tokenExpiresAt: 0, @@ -105,7 +105,7 @@ describe("browsertrix-app", () => { Promise.resolve(mockAPIUser), ); stub(AuthService.prototype, "startFreshnessCheck").callsFake(() => {}); - stub(AuthService, "initSessionStorage").callsFake(() => + stub(AuthService, "initSessionStorage").callsFake(async () => Promise.resolve({ headers: { Authorization: "_fake_headers_" }, tokenExpiresAt: 0, @@ -134,7 +134,7 @@ describe("browsertrix-app", () => { }), ); stub(AuthService.prototype, "startFreshnessCheck").callsFake(() => {}); - stub(AuthService, "initSessionStorage").callsFake(() => + stub(AuthService, "initSessionStorage").callsFake(async () => Promise.resolve({ headers: { Authorization: "_fake_headers_" }, tokenExpiresAt: 0, diff --git a/frontend/src/pages/org/browser-profiles-detail.ts b/frontend/src/pages/org/browser-profiles-detail.ts index 7d34ea0343..80039c6834 100644 --- a/frontend/src/pages/org/browser-profiles-detail.ts +++ b/frontend/src/pages/org/browser-profiles-detail.ts @@ -22,7 +22,6 @@ const DESCRIPTION_MAXLENGTH = 500; * Usage: * ```ts * * ``` diff --git a/frontend/src/pages/org/browser-profiles-list.ts b/frontend/src/pages/org/browser-profiles-list.ts index 9821361f51..efc7b0e94a 100644 --- a/frontend/src/pages/org/browser-profiles-list.ts +++ b/frontend/src/pages/org/browser-profiles-list.ts @@ -32,7 +32,6 @@ const INITIAL_PAGE_SIZE = 20; * Usage: * ```ts * * ``` */ diff --git a/frontend/src/pages/org/browser-profiles-new.ts b/frontend/src/pages/org/browser-profiles-new.ts index 8311ff1e7b..ef999bffee 100644 --- a/frontend/src/pages/org/browser-profiles-new.ts +++ b/frontend/src/pages/org/browser-profiles-new.ts @@ -13,7 +13,6 @@ import { isApiError } from "@/utils/api"; * Usage: * ```ts * * ``` diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index dfe06e6525..46e05aa50e 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -1746,7 +1746,6 @@ https://archiveweb.page/images/${"logo.svg"}`} this.updateFormState({ proxyId: e.detail.value, From 310b647ba2709bf3051b605522c6843e566a5fac Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Thu, 15 Aug 2024 12:55:00 +0200 Subject: [PATCH 16/50] move proxy config to subchart, allow updating proxies without re-deploying --- backend/btrixcloud/crawlconfigs.py | 65 +++++++++++++++------ chart/Chart.lock | 7 ++- chart/Chart.yaml | 4 ++ chart/charts/btrix-proxies-0.1.0.tgz | Bin 0 -> 820 bytes chart/proxies/Chart.yaml | 15 +++++ chart/{ => proxies}/templates/proxies.yaml | 12 +++- chart/proxies/values.yaml | 2 + chart/templates/backend.yaml | 11 ++++ chart/templates/configmap.yaml | 4 +- chart/templates/secrets.yaml | 2 - chart/values.yaml | 41 +++++++------ 11 files changed, 120 insertions(+), 43 deletions(-) create mode 100644 chart/charts/btrix-proxies-0.1.0.tgz create mode 100644 chart/proxies/Chart.yaml rename chart/{ => proxies}/templates/proxies.yaml (73%) create mode 100644 chart/proxies/values.yaml diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 286da0bd48..6ae7a93d91 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -130,25 +130,12 @@ def __init__( if "default" not in self.crawler_images_map: 
raise TypeError("The channel list must include a 'default' channel")
 
-        self.crawler_proxies_map = {}
-        with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh:
-            proxy_list = json.loads(fh.read())
-            for proxy_data in proxy_list:
-                proxy = CrawlerProxy(
-                    id=proxy_data["id"],
-                    label=proxy_data["label"],
-                    description=proxy_data.get("description", ""),
-                    country_code=proxy_data.get("country_code", ""),
-                    url=proxy_data["url"],
-                    has_host_public_key=bool(proxy_data.get("ssh_host_public_key")),
-                    has_private_key=bool(proxy_data.get("ssh_private_key")),
-                )
-
-                self.crawler_proxies_map[proxy.id] = proxy
+        self._crawler_proxies_last_updated = None
+        self._crawler_proxies_map = None
 
-        self.crawler_proxies = CrawlerProxies(
-            default_proxy_id=DEFAULT_PROXY_ID,
-            servers=list(self.crawler_proxies_map.values()),
-        )
+        if DEFAULT_PROXY_ID and DEFAULT_PROXY_ID not in self.crawler_proxies_map:
+            raise ValueError(
+                f"Configured proxies must include DEFAULT_PROXY_ID: {DEFAULT_PROXY_ID}"
+            )
 
     def set_crawl_ops(self, ops):
@@ -928,6 +915,48 @@ def get_channel_crawler_image(
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    @property
+    def crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
+        """Load CrawlerProxy mapping from config"""
+        proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]
+
+        if not os.path.isfile(proxies_last_update_path):
+            return {}
+
+        # return cached data, when last_update timestamp hasn't changed
+        if self._crawler_proxies_last_updated:
+            with open(proxies_last_update_path, encoding="utf-8") as fh:
+                proxies_last_update = int(fh.read().strip())
+                if proxies_last_update == self._crawler_proxies_last_updated:
+                    return self._crawler_proxies_map
+                self._crawler_proxies_last_updated = proxies_last_update
+
+        crawler_proxies_map = {}
+        with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh:
+            proxy_list = json.loads(fh.read())
+            for proxy_data in proxy_list:
+                proxy = CrawlerProxy(
+                    id=proxy_data["id"],
+                    label=proxy_data["label"],
+                    description=proxy_data.get("description", ""),
+                    country_code=proxy_data.get("country_code", ""),
+                    url=proxy_data["url"],
+                    has_host_public_key=bool(proxy_data.get("ssh_host_public_key")),
+                    has_private_key=bool(proxy_data.get("ssh_private_key")),
+                )
+
+                crawler_proxies_map[proxy.id] = proxy
+
+        self._crawler_proxies_map = crawler_proxies_map
+        return self._crawler_proxies_map
+
+    @property
+    def crawler_proxies(self):
+        """CrawlerProxies model listing all configured proxies"""
+        return CrawlerProxies(
+            default_proxy_id=DEFAULT_PROXY_ID,
+            servers=list(self.crawler_proxies_map.values()),
+        )
+
     def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]:
         """Get crawlerProxy by id"""
         return self.crawler_proxies_map.get(proxy_id)
diff --git a/chart/Chart.lock b/chart/Chart.lock
index 840747e9d1..fec463cfa0 100644
--- a/chart/Chart.lock
+++ b/chart/Chart.lock
@@ -8,5 +8,8 @@ dependencies:
 - name: metacontroller-helm
   repository: oci://ghcr.io/metacontroller
   version: 4.11.11
+- name: btrix-proxies
+  repository: file://./proxies/
+  version: 0.1.0
-digest: sha256:ae000dbd876ade6de33de1b8740f73683e2a783847f7e73e2cac4a0c2ee4d797
-generated: "2024-03-26T21:24:31.761944-07:00"
+digest: sha256:2fd9472f857e9e3eacdcc616a3cffac5bb2951411cc2d34aea84253092225ecf
+generated: "2024-08-15T11:19:17.884682494+02:00"
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
index 700ec2863b..e76d43f28c 100644
--- a/chart/Chart.yaml
+++ b/chart/Chart.yaml
@@ -18,3 +18,7 @@ dependencies:
   - name: metacontroller-helm
     version: 4.11.11
     repository: 
"oci://ghcr.io/metacontroller" + - name: btrix-proxies + version: 0.1.0 + condition: btrix-proxies.enabled + repository: file://./proxies/ diff --git a/chart/charts/btrix-proxies-0.1.0.tgz b/chart/charts/btrix-proxies-0.1.0.tgz new file mode 100644 index 0000000000000000000000000000000000000000..bf4173a63cd0ce73fcb1cbb6b99c2361e24e5846 GIT binary patch literal 820 zcmV-41Izp$iwG0|00000|0w_~VMtOiV@ORlOnEsqVl!4SWK%V1T2nbTPgYhoO;>Dc zVQyr3R8em|NM&qo0PI%JkLorM=Ip;>q<0Q6kmX6f_1C3`R_c@96`5p~xCGnlnZQQC z|6VzPZD83xsYUy?QochH$K#o?J>O?9Ihg0f2ls4{(w|Blc--hsHM%3DlyaVB?ORHD z{g$(b*=Ul@A3o2rY&Ml6ImzU7G6H$eB;G5+Qf?(X2MwgSG__<^h=?;8Se^7Vzi-)D(?R zWhMZ2hD-P{xE*1L6(~#|{g=$1RGRK^88CODz%aJRsU||EG#ZV)T?wlzJ*OPptwC^(4NaXsPy#ujK9vN$;UMO(azQ99y4Y^7=kGfQsg}KnrK(F*R8g_3a<81rB6#7<#&mo zmsP98QG(g^_NN;kGX($$&arw{Qc2Z*rrmRCY`WA`wl=L!sP1`VOKVfW0SbK7b;U4^ zH~7zRJl-;0ygLBzi(Yv-vI$R_D7x;F~^53EoV zj(_V{_GM|GeV#KsZN{R~#EaS&nmb?C?ce~lHP63zmzQt=c3)arzyX%?3~devc&Z&^ ySADAx&mZSj#1`HVR4)`ko0RR7d{N{)N6aWCyg_kq{ literal 0 HcmV?d00001 diff --git a/chart/proxies/Chart.yaml b/chart/proxies/Chart.yaml new file mode 100644 index 0000000000..4632adeb69 --- /dev/null +++ b/chart/proxies/Chart.yaml @@ -0,0 +1,15 @@ +apiVersion: v2 +name: btrix-proxies +description: A chart deploying the configmap and secrets required for using proxies with Browsertrix +type: application +icon: https://webrecorder.net/assets/icon.png + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. 
+appVersion: 0.1.0 diff --git a/chart/templates/proxies.yaml b/chart/proxies/templates/proxies.yaml similarity index 73% rename from chart/templates/proxies.yaml rename to chart/proxies/templates/proxies.yaml index f10b6e3480..ab53ca5827 100644 --- a/chart/templates/proxies.yaml +++ b/chart/proxies/templates/proxies.yaml @@ -5,7 +5,6 @@ kind: Secret metadata: name: proxies namespace: {{ .Values.crawler_namespace }} -#type: kubernetes.io/ssh-auth type: Opaque stringData: {{- range .Values.proxies }} @@ -31,4 +30,15 @@ stringData: root:x:0: btrix:x:{{ .Values.crawler_gid | default 201407 }}: +--- +apiVersion: v1 +kind: Secret +metadata: + name: ops-proxy-configs + namespace: {{ .Release.Namespace }} + +type: Opaque +data: + crawler_proxies_last_update: {{ now | unixEpoch | toString | b64enc | quote }} + crawler_proxies.json: {{ .Values.proxies | toJson | b64enc | quote }} {{- end }} diff --git a/chart/proxies/values.yaml b/chart/proxies/values.yaml new file mode 100644 index 0000000000..f0284e6c16 --- /dev/null +++ b/chart/proxies/values.yaml @@ -0,0 +1,2 @@ +proxies: [] # see proxies description in main helm chart +crawler_namespace: crawlers # namespace to deploy ssh keys to diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index a1577288ed..8f96fdd24f 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -43,6 +43,11 @@ spec: secret: secretName: ops-configs + - name: ops-proxy-configs + secret: + secretName: ops-proxy-configs + optional: true + - name: app-templates configMap: name: app-templates @@ -97,6 +102,9 @@ spec: - name: ops-configs mountPath: /ops-configs/ + - name: ops-proxy-configs + mountPath: /ops-proxy-configs/ + - name: app-templates mountPath: /app/btrixcloud/templates/ @@ -171,6 +179,9 @@ spec: - name: ops-configs mountPath: /ops-configs/ + - name: ops-proxy-configs + mountPath: /ops-proxy-configs/ + - name: app-templates mountPath: /app/btrixcloud/templates/ diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 001d712a82..e6b8a7a909 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -53,7 +53,9 @@ data: STORAGES_JSON: "/ops-configs/storages.json" CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json" - CRAWLER_PROXIES_JSON: "/ops-configs/crawler_proxies.json" + + CRAWLER_PROXIES_LAST_UPDATE: "/ops-proxy-configs/crawler_proxies_last_update" + CRAWLER_PROXIES_JSON: "/ops-proxy-configs/crawler_proxies.json" DEFAULT_PROXY_ID: "{{ .Values.default_proxy }}" diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index a1e7ced0ba..6968c7bcb8 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -32,8 +32,6 @@ type: Opaque data: storages.json: {{ .Values.storages | toJson | b64enc | quote }} crawler_channels.json: {{ .Values.crawler_channels | toJson | b64enc | quote }} - crawler_proxies.json: {{ .Values.proxies | toJson | b64enc | quote }} - {{- range $storage := .Values.storages }} --- diff --git a/chart/values.yaml b/chart/values.yaml index c67b649332..1f54d8d568 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -297,26 +297,29 @@ crawler_session_time_limit_seconds: 18000 crawler_liveness_port: 6065 -# optional: configure a list of ssh servers to be used as a proxy -proxies: [] -# - id: My Proxy # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric -# url: # proxy connection string, must be a ssh://, socks:// or http:// URL -# label: "US Proxy" # label to show in dropdown -# 
country_code: US # Alpha-2 ISO 3991 country code, https://www.iso.org/obp/ui/#search
-# description: "Proxy" # optional: description to show for the proxy
-# ssh_private_key: | # requred for ssh:// proxies
-# # ssh-key needed to connect to the SSH server
-#
-#
-# ssh_host_public_key: | # optional, for ssh:// proxies-only
-# # ssh public keys of the SSH server
-# # use output of `ssh-keyscan $hostname -p $port` for best results
-# example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
-# example.invalid ssh-rsa AAA[..]
-
 # optional: always proxy with following id by default, if no other proxy settings are set
-# must match one of the proxies in the 'proxies' list
-# default_proxy:
+# must match one of the proxies in the 'browsertrix-proxies' list
+# default_proxy: "proxy-id"
+
+# optional: configure a list of ssh servers to be used as a proxy
+btrix-proxies:
+  enabled: false # enable to deploy proxies configmap and secret
+  crawler_namespace: "crawlers"
+  proxies: []
+  # - id: proxy-id # name of the proxy, is shown in the dropdown, has to be lowercase, alphanumeric, can contain dashes
+  # url: # proxy connection string, must be a ssh://, socks:// or http:// URL
+  # label: "US Proxy" # label to show in dropdown
+  # country_code: US # Alpha-2 ISO 3166-1 country code, https://www.iso.org/obp/ui/#search
+  # description: "Proxy" # optional: description to show for the proxy
+  # ssh_private_key: | # required for ssh:// proxies
+  # # ssh-key needed to connect to the SSH server
+  #
+  #
+  # ssh_host_public_key: | # optional, for ssh:// proxies only
+  # # ssh public keys of the SSH server
+  # # use output of `ssh-keyscan $hostname -p $port` for best results
+  # example.invalid:22 SSH-2.0-OpenSSH_9.6p1 Ubuntu-3ubuntu13
+  # example.invalid ssh-rsa AAA[..]

 # optional: set the uid, gid and fsgroup for the crawler and profilebrowser pods
 # the following values are used by default:

From e48a0749b83e7844aee6e28c8bd6837658fd62cd Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Thu, 15 Aug 2024 14:27:05 +0200
Subject: [PATCH 17/50] move passwd hack to main chart

---
 chart/app-templates/crawler.yaml        |  8 ++++++--
 chart/app-templates/profilebrowser.yaml |  8 ++++++--
 chart/proxies/templates/proxies.yaml    | 10 ----------
 chart/templates/secrets.yaml            | 18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml
index 256ef16cc6..311c958afc 100644
--- a/chart/app-templates/crawler.yaml
+++ b/chart/app-templates/crawler.yaml
@@ -76,6 +76,10 @@ spec:
       secret:
         secretName: proxies
         defaultMode: 0600
+  - name: proxies-passwd-hack
+    secret:
+      secretName: proxies-passwd-hack
+      defaultMode: 0600
 {% endif %}

   affinity:
@@ -171,11 +175,11 @@ spec:
         subPath: {{ proxy_id }}-known-hosts
         readOnly: true
 {% endif %}
-      - name: proxies
+      - name: proxies-passwd-hack
        mountPath: /etc/passwd
        subPath: passwd
        readOnly: true
-      - name: proxies
+      - name: proxies-passwd-hack
        mountPath: /etc/group
        subPath: group
        readOnly: true
diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml
index c2119b4f27..e4f68ea3f8 100644
--- a/chart/app-templates/profilebrowser.yaml
+++ b/chart/app-templates/profilebrowser.yaml
@@ -31,6 +31,10 @@ spec:
       secret:
         secretName: proxies
         defaultMode: 0600
+  - name: proxies-passwd-hack
+    secret:
+      secretName: proxies-passwd-hack
+      defaultMode: 0600
 {% endif %}

 {% if priorityClassName %}
@@ -109,11 +113,11 @@ spec:
        subPath: {{ proxy_id }}-known-hosts
        readOnly: true
 {% endif %}
-      - name: proxies
+ - name: proxies-passwd-hack mountPath: /etc/passwd subPath: passwd readOnly: true - - name: proxies + - name: proxies-passwd-hack mountPath: /etc/group subPath: group readOnly: true diff --git a/chart/proxies/templates/proxies.yaml b/chart/proxies/templates/proxies.yaml index ab53ca5827..e9b6d68df3 100644 --- a/chart/proxies/templates/proxies.yaml +++ b/chart/proxies/templates/proxies.yaml @@ -20,16 +20,6 @@ stringData: {{- end }} {{- end }} - - # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies - passwd: | - root:x:0:0:root:/root:/bin/bash - btrix:btrix:{{ .Values.crawler_uid | default 201407 }}:{{ .Values.crawler_gid | default 201407 }}::/tmp/btrix:/bin/sh - - group: | - root:x:0: - btrix:x:{{ .Values.crawler_gid | default 201407 }}: - --- apiVersion: v1 kind: Secret diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index 6968c7bcb8..60de9e4776 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -59,3 +59,21 @@ stringData: STORE_S3_PROVIDER: {{ $storage.s3_provider | default "Other" }} {{- end }} + +--- +apiVersion: v1 +kind: Secret +metadata: + name: proxies-passwd-hack + namespace: {{ .Values.crawler_namespace }} +type: Opaque +stringData: + + # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies + passwd: | + root:x:0:0:root:/root:/bin/bash + btrix:btrix:{{ .Values.crawler_uid | default 201407 }}:{{ .Values.crawler_gid | default 201407 }}::/tmp/btrix:/bin/sh + + group: | + root:x:0: + btrix:x:{{ .Values.crawler_gid | default 201407 }}: From 7266d1dca27082bcf7724cfa17d120e0a23bf593 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Thu, 15 Aug 2024 14:39:37 +0200 Subject: [PATCH 18/50] add missing docstring --- backend/btrixcloud/crawlconfigs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 6ae7a93d91..f0616748d8 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -952,6 +952,7 @@ def crawler_proxies_map(self) -> dict[str, CrawlerProxy]: @property def crawler_proxies(self): + """Get CrawlerProxy configuration""" return CrawlerProxies( default_proxy_id=DEFAULT_PROXY_ID, servers=list(self.crawler_proxies_map.values()), From cfaa3b8194adc513bb66fa95625aa16f625d2c8f Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Thu, 29 Aug 2024 19:01:18 +0200 Subject: [PATCH 19/50] fix lint error --- backend/btrixcloud/crawlconfigs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index f0616748d8..f136352db4 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -924,14 +924,14 @@ def crawler_proxies_map(self) -> dict[str, CrawlerProxy]: return {} # return cached data, when last_update timestamp hasn't changed - if self._crawler_proxies_last_updated: + if self._crawler_proxies_last_updated and self._crawler_proxies_map: with open(proxies_last_update_path, encoding="utf-8") as fh: proxies_last_update = int(fh.read().strip()) if proxies_last_update == self._crawler_proxies_last_updated: return self._crawler_proxies_map self._crawler_proxies_last_updated = proxies_last_update - crawler_proxies_map = {} + crawler_proxies_map: dict[str, CrawlerProxy] = {} with open(os.environ["CRAWLER_PROXIES_JSON"], encoding="utf-8") as fh: proxy_list = json.loads(fh.read()) for proxy_data in proxy_list: From 
8663875ce75b69f892fa83c30b50094f99cf98f8 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Mon, 2 Sep 2024 13:12:38 +0200 Subject: [PATCH 20/50] proxies: add shared flag, org proxy settings --- backend/btrixcloud/crawlconfigs.py | 57 ++++++++++++++++++++++++------ backend/btrixcloud/models.py | 16 +++++++-- backend/btrixcloud/orgs.py | 25 +++++++++++++ backend/btrixcloud/profiles.py | 5 +++ chart/values.yaml | 6 ++-- 5 files changed, 95 insertions(+), 14 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index f136352db4..be59594f5c 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -43,7 +43,7 @@ CrawlerProxy, CrawlerProxies, ) -from .utils import dt_now, slug_from_name +from .utils import dt_now, slug_from_name, is_bool if TYPE_CHECKING: from .orgs import OrgOps @@ -133,7 +133,7 @@ def __init__( self._crawler_proxies_last_updated = None self._crawler_proxies_map = None - if DEFAULT_PROXY_ID and DEFAULT_PROXY_ID not in self.crawler_proxies_map: + if DEFAULT_PROXY_ID and DEFAULT_PROXY_ID not in self.get_crawler_proxies_map(): raise ValueError( f"Configured proxies must include DEFAULT_PROXY_ID: {DEFAULT_PROXY_ID}" ) @@ -208,6 +208,11 @@ async def add_crawl_config( if profileid: await self.profiles.get_profile(profileid, org) + # ensure proxyId is valid and available for org + if config_in.proxyId: + if not self.can_org_use_proxy(org, config_in.proxyId): + raise HTTPException(status_code=404, detail="proxy_not_found") + now = dt_now() crawlconfig = CrawlConfig( id=uuid4(), @@ -846,6 +851,9 @@ async def run_now_internal( if await self.get_running_crawl(crawlconfig): raise HTTPException(status_code=400, detail="crawl_already_running") + if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId): + raise HTTPException(status_code=404, detail="proxy_not_found") + profile_filename = await self.get_profile_filename(crawlconfig.profileid, org) storage_filename = ( crawlconfig.crawlFilenameTemplate or self.default_filename_template @@ -915,8 +923,7 @@ def get_channel_crawler_image( """Get crawler image name by id""" return self.crawler_images_map.get(crawler_channel or "") - @property - def crawler_proxies_map(self) -> dict[str, CrawlerProxy]: + def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]: """Load CrawlerProxy mapping from config""" proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"] @@ -943,6 +950,8 @@ def crawler_proxies_map(self) -> dict[str, CrawlerProxy]: url=proxy_data["url"], has_host_public_key=bool(proxy_data.get("ssh_host_public_key")), has_private_key=bool(proxy_data.get("ssh_private_key")), + shared=is_bool(proxy_data.get("shared")) + or proxy_data["id"] == DEFAULT_PROXY_ID, ) crawler_proxies_map[proxy.id] = proxy @@ -950,17 +959,30 @@ def crawler_proxies_map(self) -> dict[str, CrawlerProxy]: self._crawler_proxies_map = crawler_proxies_map return self._crawler_proxies_map - @property - def crawler_proxies(self): + def get_crawler_proxies(self): """Get CrawlerProxy configuration""" return CrawlerProxies( default_proxy_id=DEFAULT_PROXY_ID, - servers=list(self.crawler_proxies_map.values()), + servers=list(self.get_crawler_proxies_map().values()), ) def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]: """Get crawlerProxy by id""" - return self.crawler_proxies_map.get(proxy_id) + return self.get_crawler_proxies_map().get(proxy_id) + + def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool: + """Checks if 
org is able to use proxy""" + if isinstance(proxy, str): + _proxy = self.get_crawler_proxy(proxy) + else: + _proxy = proxy + + if _proxy is None: + return False + + return ( + _proxy.shared and org.allowSharedProxies + ) or _proxy.id in org.allowedProxies def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: """Generate WARC prefix slug from org slug, name or url @@ -1134,10 +1156,25 @@ async def get_crawler_channels( @router.get("/crawler-proxies", response_model=CrawlerProxies) async def get_crawler_proxies( - # pylint: disable=unused-argument org: Organization = Depends(org_crawl_dep), ): - return ops.crawler_proxies + return CrawlerProxies( + default_proxy_id=DEFAULT_PROXY_ID, + servers=[ + proxy + for proxy in ops.get_crawler_proxies_map().values() + if ops.can_org_use_proxy(org, proxy) + ], + ) + + @router.get("/crawler-proxies/all", response_model=CrawlerProxies) + async def get_all_crawler_proxies( + user: User = Depends(user_dep), + ): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + return ops.get_crawler_proxies @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse) async def get_crawl_config_seeds( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 3277394e78..1b371af406 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -583,8 +583,9 @@ class CrawlerProxy(BaseModel): label: str description: str = "" country_code: str = "" - has_host_public_key: bool - has_private_key: bool + has_host_public_key: bool = False + has_private_key: bool = False + shared: bool = False # ============================================================================ @@ -595,6 +596,14 @@ class CrawlerProxies(BaseModel): servers: List[CrawlerProxy] = [] +# ============================================================================ +class OrgProxies(BaseModel): + """Org proxy settings for API""" + + allowSharedProxies: bool + allowedProxies: list[str] + + # ============================================================================ ### BASE CRAWLS ### @@ -1418,6 +1427,9 @@ class Organization(BaseMongoModel): subscription: Optional[Subscription] = None + allowSharedProxies: bool = True + allowedProxies: list[str] = [] + def is_owner(self, user): """Check if user is owner""" return self._is_auth(user, UserRole.OWNER) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 7222cae936..94bc0aa2ba 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -38,6 +38,7 @@ OrgMetrics, OrgWebhookUrls, OrgCreate, + OrgProxies, Subscription, SubscriptionUpdate, SubscriptionCancel, @@ -512,6 +513,17 @@ async def update_custom_storages(self, org: Organization) -> bool: res = await self.orgs.find_one_and_update({"_id": org.id}, {"$set": set_dict}) return res is not None + async def update_proxies(self, org: Organization, proxies: OrgProxies) -> None: + await self.orgs.find_one_and_update( + {"_id": org.id}, + { + "$set": { + "allowSharedProxies": proxies.allowSharedProxies, + "allowedProxies": proxies.allowedProxies, + } + }, + ) + async def update_quotas(self, org: Organization, quotas: OrgQuotas) -> None: """update organization quotas""" @@ -1469,6 +1481,19 @@ async def update_quotas( return {"updated": True} + @router.post("/proxies", tags=["organizations"], response_model=UpdatedResponse) + async def update_proxies( + proxies: OrgProxies, + org: Organization = Depends(org_owner_dep), + user: User = Depends(user_dep), + ): + if not 
user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        await ops.update_quotas(org, proxies)

        return {"updated": True}

    @router.post("/read-only", tags=["organizations"], response_model=UpdatedResponse)
    async def update_read_only(
        update: OrgReadOnlyUpdate,
diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py
index 25a7e382e2..d3f70a4cfe 100644
--- a/backend/btrixcloud/profiles.py
+++ b/backend/btrixcloud/profiles.py
@@ -107,6 +107,11 @@ async def create_new_browser(
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")

+        if profile_launch.proxyId and not self.crawlconfigs.can_org_use_proxy(
+            org, profile_launch.proxyId
+        ):
+            raise HTTPException(status_code=404, detail="proxy_not_found")
+
         browserid = await self.crawl_manager.run_profile_browser(
             str(user.id),
             str(org.id),
diff --git a/chart/values.yaml b/chart/values.yaml
index 1f54d8d568..48fc99b9d6 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -297,8 +297,9 @@ crawler_session_time_limit_seconds: 18000

 crawler_liveness_port: 6065

-# optional: always proxy with following id by default, if no other proxy settings are set
-# must match one of the proxies in the 'browsertrix-proxies' list
+# optional: use this proxy by default, when no other proxy is set for the crawl
+# must match one of the proxy ids in the 'btrix-proxies.proxies' list
+# will set the proxy to shared
 # default_proxy: "proxy-id"

 # optional: configure a list of ssh servers to be used as a proxy
@@ -311,6 +312,7 @@ btrix-proxies:
   # label: "US Proxy" # label to show in dropdown
   # country_code: US # Alpha-2 ISO 3166-1 country code, https://www.iso.org/obp/ui/#search
   # description: "Proxy" # optional: description to show for the proxy
+  # shared: false # optional: set to true, to make proxy available for all orgs
   # ssh_private_key: | # required for ssh:// proxies
   # # ssh-key needed to connect to the SSH server
   #

From c702ba7e6f1b9dc84f0981528a76cd65c3593de8 Mon Sep 17 00:00:00 2001
From: Vinzenz Sinapius
Date: Tue, 3 Sep 2024 10:32:41 +0200
Subject: [PATCH 21/50] proxies: fix backend bugs

---
 backend/btrixcloud/crawlconfigs.py | 7 ++++---
 backend/btrixcloud/models.py       | 3 +++
 backend/btrixcloud/orgs.py         | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index be59594f5c..a7f227f764 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -43,7 +43,7 @@
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name, is_bool
+from .utils import dt_now, slug_from_name

 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -950,7 +950,7 @@ def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
                     url=proxy_data["url"],
                     has_host_public_key=bool(proxy_data.get("ssh_host_public_key")),
                     has_private_key=bool(proxy_data.get("ssh_private_key")),
-                    shared=is_bool(proxy_data.get("shared"))
+                    shared=proxy_data.get("shared", False)
                     or proxy_data["id"] == DEFAULT_PROXY_ID,
                 )

@@ -972,6 +972,7 @@ def get_crawler_proxy(self, proxy_id: str) -> Optional[CrawlerProxy]:

     def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
         """Checks if org is able to use proxy"""
+
         if isinstance(proxy, str):
             _proxy = self.get_crawler_proxy(proxy)
         else:
             _proxy = proxy
@@ -1174,7 +1175,7 @@ async def get_all_crawler_proxies(
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")

-        return ops.get_crawler_proxies
+        return ops.get_crawler_proxies()
@router.get("/{cid}/seeds", response_model=PaginatedSeedResponse) async def get_crawl_config_seeds( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 1b371af406..0f8bdcf1ee 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1376,6 +1376,9 @@ class OrgOut(BaseMongoModel): subscription: Optional[Subscription] = None + allowSharedProxies: bool = True + allowedProxies: list[str] = [] + # ============================================================================ class Organization(BaseMongoModel): diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 94bc0aa2ba..10105720c2 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -1490,7 +1490,7 @@ async def update_proxies( if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - await ops.update_quotas(org, proxies) + await ops.update_proxies(org, proxies) return {"updated": True} From b63322cac44569f95bbfcef4ed2fc7a426d40988 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Tue, 3 Sep 2024 10:33:30 +0200 Subject: [PATCH 22/50] frontend: add `proxy_not_found` error message --- frontend/src/pages/org/workflow-detail.ts | 4 ++++ frontend/src/pages/org/workflows-list.ts | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 8e6372b066..8db4f92892 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -1628,6 +1628,10 @@ export class WorkflowDetail extends LiteElement { } else { message = msg("You do not have permission to run crawls."); } + } else if (isApiError(e) && e.details == "proxy_not_found") { + message = msg( + "Your org doesn't have permission to use the proxy configured for this crawl.", + ); } this.notify({ message: message, diff --git a/frontend/src/pages/org/workflows-list.ts b/frontend/src/pages/org/workflows-list.ts index 301760cf32..2fbcc72e03 100644 --- a/frontend/src/pages/org/workflows-list.ts +++ b/frontend/src/pages/org/workflows-list.ts @@ -786,6 +786,10 @@ export class WorkflowsList extends LiteElement { } else { message = msg("You do not have permission to run crawls."); } + } else if (isApiError(e) && e.details == "proxy_not_found") { + message = msg( + "Your org doesn't have permission to use the proxy configured for this crawl.", + ); } this.notify({ message: message, From b3dbfe117e4b437fba579fdaf58544723ad4258d Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Tue, 3 Sep 2024 10:34:35 +0200 Subject: [PATCH 23/50] frontend: add wip admin proxy gui --- frontend/src/components/orgs-list.ts | 133 +++++++++++++++++- .../src/components/ui/select-crawler-proxy.ts | 26 ++-- frontend/src/pages/home.ts | 13 ++ frontend/src/types/crawler.ts | 6 + frontend/src/types/org.ts | 2 + 5 files changed, 160 insertions(+), 20 deletions(-) diff --git a/frontend/src/components/orgs-list.ts b/frontend/src/components/orgs-list.ts index 1d6198a797..cb65a1c12c 100644 --- a/frontend/src/components/orgs-list.ts +++ b/frontend/src/components/orgs-list.ts @@ -2,20 +2,24 @@ import { localized, msg, str } from "@lit/localize"; import type { SlButton, SlChangeEvent, + SlCheckbox, SlInput, + SlMenuItem, } from "@shoelace-style/shoelace"; import { serialize } from "@shoelace-style/shoelace/dist/utilities/form.js"; import { css, html, nothing } from "lit"; -import { customElement, property, query } from "lit/decorators.js"; +import { customElement, property, query, state } from 
"lit/decorators.js"; import { when } from "lit/directives/when.js"; import { BtrixElement } from "@/classes/BtrixElement"; import type { Dialog } from "@/components/ui/dialog"; +import type { ProxiesAPIResponse, Proxy } from "@/types/crawler"; import { formatNumber, getLocale } from "@/utils/localization"; import type { OrgData } from "@/utils/orgs"; /** * @fires update-quotas + * @fires update-proxies */ @localized() @customElement("btrix-orgs-list") @@ -35,9 +39,15 @@ export class OrgsList extends BtrixElement { @property({ type: Object }) currOrg?: OrgData | null = null; + @state() + private allProxies?: Proxy[]; + @query("#orgQuotaDialog") private readonly orgQuotaDialog?: Dialog | null; + @query("#orgProxiesDialog") + private readonly orgProxiesDialog?: Dialog | null; + @query("#orgReadOnlyDialog") private readonly orgReadOnlyDialog?: Dialog | null; @@ -79,8 +89,8 @@ export class OrgsList extends BtrixElement { - ${this.renderOrgQuotas()} ${this.renderOrgReadOnly()} - ${this.renderOrgDelete()} + ${this.renderOrgQuotas()} ${this.renderOrgProxies()} + ${this.renderOrgReadOnly()} ${this.renderOrgDelete()} `; } @@ -139,6 +149,67 @@ export class OrgsList extends BtrixElement { `; } + private renderOrgProxies() { + return html` + (this.currOrg = null)} + @sl-show=${() => { + if (this.currOrg) { + void this.fetchAllProxies(this.currOrg); + } + }} + > + ${msg("Allow shared proxies")} + + + ${this.allProxies + ?.filter((server) => server.shared) + .map( + (server) => + html` + ${server.id}: ${server.label} + `, + )} + ${this.allProxies + ?.filter((server) => !server.shared) + .map( + (server) => + html` + ${server.id}: ${server.label} + `, + )} + + +
+ ${msg("Update Proxy Settings")} + +
+
+ `; + } + private renderOrgReadOnly() { return html` ( + `/orgs/${org.id}/crawlconfigs/crawler-proxies/all`, + ); + this.allProxies = data.servers; + } catch (e) { + console.debug(e); + + this.notify.toast({ + message: msg("Sorry, couldn't get all proxies at this time."), + variant: "danger", + icon: "exclamation-octagon", + }); + } + } private async deleteOrg(org: OrgData) { try { await this.api.fetch(`/orgs/${org.id}`, { @@ -537,6 +655,15 @@ export class OrgsList extends BtrixElement { ${msg("Edit Quotas")} + { + this.currOrg = org; + void this.orgProxiesDialog?.show(); + }} + > + + ${msg("Edit Proxies")} + ${org.readOnly ? html` ; -type AllProxiesAPIResponse = { - default_proxy_id: string | null; - servers: Proxy[]; -}; - /** * Crawler proxy select dropdown * @@ -54,7 +49,7 @@ export class SelectCrawlerProxy extends LiteElement { private allProxies?: Proxy[]; protected firstUpdated() { - void this.fetchAllProxies(); + void this.fetchOrgProxies(); } // credit: https://dev.to/jorik/country-code-to-flag-emoji-a21 private countryCodeToFlagEmoji(countryCode: String): String { @@ -83,7 +78,7 @@ export class SelectCrawlerProxy extends LiteElement { @sl-change=${this.onChange} @sl-focus=${() => { // Refetch to keep list up to date - void this.fetchAllProxies(); + void this.fetchOrgProxies(); }} @sl-hide=${this.stopProp} @sl-after-hide=${this.stopProp} @@ -142,9 +137,9 @@ export class SelectCrawlerProxy extends LiteElement { /** * Fetch crawler proxies and update internal state */ - private async fetchAllProxies(): Promise { + private async fetchOrgProxies(): Promise { try { - const data = await this.getAllProxies(); + const data = await this.getOrgProxies(); const defaultProxyId = data.default_proxy_id; this.allProxies = data.servers; @@ -191,13 +186,10 @@ export class SelectCrawlerProxy extends LiteElement { } } - private async getAllProxies(): Promise { - const data: AllProxiesAPIResponse = - await this.apiFetch( - `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, - ); - - return data; + private async getOrgProxies(): Promise { + return this.apiFetch( + `/orgs/${this.orgId}/crawlconfigs/crawler-proxies`, + ); } /** diff --git a/frontend/src/pages/home.ts b/frontend/src/pages/home.ts index 0c39530ecd..1c0909f3bb 100644 --- a/frontend/src/pages/home.ts +++ b/frontend/src/pages/home.ts @@ -171,6 +171,7 @@ export class Home extends LiteElement {
@@ -402,6 +403,18 @@ export class Home extends LiteElement { }); } + async onUpdateOrgProxies(e: CustomEvent) { + const org = e.detail as OrgData; + + await this.apiFetch(`/orgs/${org.id}/proxies`, { + method: "POST", + body: JSON.stringify({ + allowSharedProxies: org.allowSharedProxies, + allowedProxies: org.allowedProxies, + }), + }); + } + async checkFormValidity(formEl: HTMLFormElement) { await this.updateComplete; return !formEl.querySelector("[data-invalid]"); diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 3112bb1e8c..aa7916e59b 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -218,6 +218,12 @@ export type Proxy = { label: string; country_code: string | null; description: string | null; + shared: boolean; +}; + +export type ProxiesAPIResponse = { + default_proxy_id: string | null; + servers: Proxy[]; }; export type ArchivedItem = Crawl | Upload; diff --git a/frontend/src/types/org.ts b/frontend/src/types/org.ts index c0404fe3bc..8890fcc2ce 100644 --- a/frontend/src/types/org.ts +++ b/frontend/src/types/org.ts @@ -73,6 +73,8 @@ export const orgDataSchema = z.object({ readOnlyReason: z.union([orgReadOnlyReasonSchema, z.string()]).nullable(), readOnlyOnCancel: z.boolean(), subscription: subscriptionSchema.nullable(), + allowSharedProxies: z.boolean(), + allowedProxies: z.array(z.string()), }); export type OrgData = z.infer; From 2e5fa5fabd2cc3126b6a55a583107a102783c6b4 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Tue, 3 Sep 2024 10:43:49 +0200 Subject: [PATCH 24/50] add missing docstring --- backend/btrixcloud/orgs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 10105720c2..e0e4055c33 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -514,6 +514,7 @@ async def update_custom_storages(self, org: Organization) -> bool: return res is not None async def update_proxies(self, org: Organization, proxies: OrgProxies) -> None: + """Update org proxy settings""" await self.orgs.find_one_and_update( {"_id": org.id}, { From 0cb5d0ebfd920a524f664d18ec0ae22940748486 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 12 Sep 2024 16:30:28 -0700 Subject: [PATCH 25/50] proxy UI fixes after merge --- .../src/features/crawl-workflows/workflow-editor.ts | 12 +++--------- frontend/src/strings/crawl-workflows/infoText.ts | 1 + frontend/xliff/es.xlf | 8 ++------ 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 52b27c79c0..6dabc3e0b9 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1420,12 +1420,8 @@ https://archiveweb.page/images/${"logo.svg"}`} ${msg("Language")} `)} - <<<<<<< HEAD:frontend/src/pages/org/workflow-editor.ts - ${this.renderHelpTextCol( - msg(`Websites that observe the browser’s language setting may serve - content in that language if available.`), - )} - ${this.renderFormCol(html` + ${this.renderHelpTextCol(infoTextStrings["lang"])} + ${inputCol(html` `)} - ${this.renderHelpTextCol(msg(`Choose a Browsertrix Proxy`))} ======= - ${this.renderHelpTextCol(infoTextStrings["lang"])} >>>>>>> - main:frontend/src/features/crawl-workflows/workflow-editor.ts + ${this.renderHelpTextCol(infoTextStrings["proxyId"])} `; } diff --git a/frontend/src/strings/crawl-workflows/infoText.ts 
b/frontend/src/strings/crawl-workflows/infoText.ts index 28c59eb200..697228a852 100644 --- a/frontend/src/strings/crawl-workflows/infoText.ts +++ b/frontend/src/strings/crawl-workflows/infoText.ts @@ -58,6 +58,7 @@ const infoText: Partial> = { ), lang: msg(`Websites that observe the browser’s language setting may serve content in that language if available.`), + proxyId: msg(`Choose a Proxy to crawl through`), }; export default infoText; diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index 5ffdb6ee7c..a4a319d59a 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3835,12 +3835,8 @@ Edit Proxies - - Websites that observe the browser’s language setting may serve - content in that language if available. - - - Choose a Browsertrix Proxy + + Choose a Proxy to crawl through From f591b4cc29d647a2581f22f7fd20d2d901250928 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Sep 2024 11:22:51 -0700 Subject: [PATCH 26/50] use proxyId from existing profile when running profile browser for existing profile --- backend/btrixcloud/crawlconfigs.py | 4 +++- backend/btrixcloud/profiles.py | 25 +++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 1420211ad6..41cb06abb3 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -181,7 +181,9 @@ async def get_profile_filename( if not profileid: return "" - profile_filename = await self.profiles.get_profile_storage_path(profileid, org) + profile_filename, _ = await self.profiles.get_profile_storage_path_and_proxy( + profileid, org + ) if not profile_filename: raise HTTPException(status_code=400, detail="invalid_profile_id") diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index d3f70a4cfe..7ed3b8bf16 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -91,9 +91,12 @@ async def create_new_browser( """Create new profile""" prev_profile_path = "" prev_profile_id = "" + prev_proxy_id = "" if profile_launch.profileId: - prev_profile_path = await self.get_profile_storage_path( - profile_launch.profileId, org + prev_profile_path, prev_proxy_id = ( + await self.get_profile_storage_path_and_proxy( + profile_launch.profileId, org + ) ) if not prev_profile_path: @@ -107,9 +110,10 @@ async def create_new_browser( if not crawler_image: raise HTTPException(status_code=404, detail="crawler_not_found") - if profile_launch.proxyId and not self.crawlconfigs.can_org_use_proxy( - org, profile_launch.proxyId - ): + # use either specified proxyId or if none, use proxyId from existing profile + proxy_id = profile_launch.proxyId or prev_proxy_id + + if proxy_id and not self.crawlconfigs.can_org_use_proxy(org, proxy_id): raise HTTPException(status_code=404, detail="proxy_not_found") browserid = await self.crawl_manager.run_profile_browser( @@ -120,7 +124,7 @@ async def create_new_browser( crawler_image=crawler_image, baseprofile=prev_profile_id, profile_filename=prev_profile_path, - proxy_id=profile_launch.proxyId or "", + proxy_id=proxy_id, ) if not browserid: @@ -368,18 +372,19 @@ async def get_profile_with_configs( return ProfileWithCrawlConfigs(crawlconfigs=crawlconfigs, **profile.dict()) - async def get_profile_storage_path( + async def get_profile_storage_path_and_proxy( self, profileid: UUID, org: Optional[Organization] = None - ) -> str: + ) -> tuple[str, str]: """return profile path filename (relative path) for given profile id and org""" 
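+        # returns a (storage_path, proxy_id) tuple so that callers relaunching a
+        # browser for an existing profile can reuse the profile's saved proxy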
try: profile = await self.get_profile(profileid, org) - return profile.resource.filename if profile.resource else "" + storage_path = profile.resource.filename if profile.resource else "" + return storage_path, profile.proxyId or "" # pylint: disable=bare-except except: pass - return "" + return "", "" async def get_profile_name( self, profileid: UUID, org: Optional[Organization] = None From e08500a52bbf4b1791492d3f4c847f0e29017544 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Sep 2024 23:09:18 -0700 Subject: [PATCH 27/50] proxies subchart: default to 'crawlers' namespace --- chart/charts/btrix-proxies-0.1.0.tgz | Bin 820 -> 706 bytes chart/proxies/templates/proxies.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/charts/btrix-proxies-0.1.0.tgz b/chart/charts/btrix-proxies-0.1.0.tgz index bf4173a63cd0ce73fcb1cbb6b99c2361e24e5846..33dca51c7d8b4244678dfb9ce10eb294f8d9686f 100644 GIT binary patch delta 658 zcmV;D0&V@Y2Eql9Jbz1%+At92>|Zh3JtsJj1u3__Ek%1B-zF zUKtV!S=v=qv=1rYCE&-*cxJxO9y==^0%P@qMCyLXg>~Ud%!|$i!8zxHINp6Z=glwg z-}F0CJQ()k!7v*1Iy{PbpLc*?GKfE+BqwZ#|F^H^;ob;@k$-n+NotkAqQ`_Whrc|G zLe4Uz)Jo%aL+>E<9ReAeLa(Hng3A$7ttN6h7Y2mNfH1Y_2rS;0(qaY^ZJ{K;xN0*7 zu#_%`_f{_nt?xi4Q*S?a&QQ|rE^%xz)iy&LDs){zgpRs?QE1eZDKW<^MtyCDNq@;3@t`gJF~ZywL|2eQ;2OuX9-tCpYCV&5S%YqrN3!ij_d?C;~>*ZYwutk(zQk0b*&kl{p>g@dbFJg&U&{ayami2D*5IV;ulAacIoJgvGuUo}YATrFDuE626U`kPkSasvAoiQRT~yg4 zAGszsGJoZ`km)%;Go_a*@Qdi!PjlDbeO15zy&Brkj@PST|2{iYmv300APm183gW$< zB)_&<*RD$Us1W2vrOAZb+fsT78z_~0_+)gN!v>tb-4zZtFdoFHQrN(KsU7aq)G!$S sB!8@NXJO?7w29cMdU-XY6TnPir@1l#MGz(&CT zUO9nnVA(#YMfel00(s9Q-a<{R!$|(;Sl5&LpMOAm^A!UbXBBLxLVN$x zl;g=*iUKJQ#&i4Y2gqZEK!Lt;jj=1>5+Qf?(X2MwgSG__<^h=?;8Se^7Vzi-)D(?R zWhMZ2hD-P{xE*1L6(~#|{g=$1RGRK^88CODz%aJRsU||EG#ZV)T?wlzJP01K6EJ0*FdjV?)4hpx}qYCI1;WzS4}8 zp8niL3ckVrshnP)|MM)H4g7x(?Dq+nM;L$Yl>bcr<9|^kNg^(82NSV2wove;J>bO# zSr?kM5&+$K{P*$R6NUZ$#rZ4Ah+(>8`**LOY81r5*wxRi*O*z*c7?Z+6X(2%%7qVR zs~Hz-Y+@g@F--x-V{+1f+-MEgLBzi(Yv-vI$R_D7x;F~^ z53EoVj(_V{_GM|GeV#KsZN{R~#EaS&nmb?C?ce~lHP63zmzQt=c3)arzyX%?3~dev zc&Z&^SADAx&mZSj#1`HVR4)`ko0RR7d{N{)N6aWCD CBZ}An diff --git a/chart/proxies/templates/proxies.yaml b/chart/proxies/templates/proxies.yaml index e9b6d68df3..33003c34fc 100644 --- a/chart/proxies/templates/proxies.yaml +++ b/chart/proxies/templates/proxies.yaml @@ -4,7 +4,7 @@ apiVersion: v1 kind: Secret metadata: name: proxies - namespace: {{ .Values.crawler_namespace }} + namespace: {{ .Values.crawler_namespace | default "crawlers" }} type: Opaque stringData: {{- range .Values.proxies }} From 95491235694426d14d8b7ed1d88793fe56096f5f Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 20 Sep 2024 08:18:47 +0200 Subject: [PATCH 28/50] backend: unpin motor dependency, fixes ImportError on backend start --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 3c6b868b8d..fc49b9506e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,7 +1,7 @@ gunicorn uvicorn[standard] fastapi==0.103.2 -motor==3.3.1 +motor passlib PyJWT==2.8.0 pydantic==2.8.2 From ca37b2b910f959d6df7d688c604f61e69c8ccf5f Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 20 Sep 2024 08:20:26 +0200 Subject: [PATCH 29/50] backend: improve `get_all_crawler_proxies` endpoint path --- backend/btrixcloud/crawlconfigs.py | 3 ++- backend/btrixcloud/main.py | 1 + frontend/src/components/orgs-list.ts | 8 +++----- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 41cb06abb3..5a0551f141 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1072,6 +1072,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID): # ============================================================================ # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments def init_crawl_config_api( + app, dbclient, mdb, user_dep, @@ -1162,7 +1163,7 @@ async def get_crawler_proxies( ], ) - @router.get("/crawler-proxies/all", response_model=CrawlerProxies) + @app.get("/orgs/all/crawlconfigs/crawler-proxies", response_model=CrawlerProxies) async def get_all_crawler_proxies( user: User = Depends(user_dep), ): diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index cb1610a98a..0bc3e48982 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -205,6 +205,7 @@ def main() -> None: ) crawl_config_ops = init_crawl_config_api( + app, dbclient, mdb, current_active_user, diff --git a/frontend/src/components/orgs-list.ts b/frontend/src/components/orgs-list.ts index c81464d29d..7e5c9450b1 100644 --- a/frontend/src/components/orgs-list.ts +++ b/frontend/src/components/orgs-list.ts @@ -156,9 +156,7 @@ export class OrgsList extends BtrixElement { .label=${msg(str`Proxy Settings for: ${this.currOrg?.name || ""}`)} @sl-after-hide=${() => (this.currOrg = null)} @sl-show=${() => { - if (this.currOrg) { - void this.fetchAllProxies(this.currOrg); - } + void this.fetchAllProxies(); }} > ( - `/orgs/${org.id}/crawlconfigs/crawler-proxies/all`, + `/orgs/all/crawlconfigs/crawler-proxies`, ); this.allProxies = data.servers; } catch (e) { From d958fa64ebcd915259defa33316becf29c57d9ec Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 20 Sep 2024 08:30:07 +0200 Subject: [PATCH 30/50] backend: disable org shared proxies by default --- backend/btrixcloud/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index b56058cb27..9d64b19e89 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1416,7 +1416,7 @@ class OrgOut(BaseMongoModel): subscription: Optional[Subscription] = None - allowSharedProxies: bool = True + allowSharedProxies: bool = False allowedProxies: list[str] = [] crawlingDefaults: Optional[CrawlConfigDefaults] = None @@ -1471,7 +1471,7 @@ class Organization(BaseMongoModel): subscription: Optional[Subscription] = None - allowSharedProxies: bool = True + allowSharedProxies: bool = False allowedProxies: list[str] = [] crawlingDefaults: Optional[CrawlConfigDefaults] = None From eaff24057d44e669ce85d020316990aa29cc326a Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 20 Sep 2024 08:50:06 +0200 Subject: [PATCH 31/50] frontend: few more labels to org proxy admin modal --- frontend/src/components/orgs-list.ts | 6 +++++- frontend/xliff/es.xlf | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/frontend/src/components/orgs-list.ts b/frontend/src/components/orgs-list.ts index 7e5c9450b1..faa421204a 100644 --- a/frontend/src/components/orgs-list.ts +++ b/frontend/src/components/orgs-list.ts @@ -164,10 +164,11 @@ export class OrgsList extends BtrixElement { name="allowSharedProxies" ?checked=${this.currOrg?.allowSharedProxies} @sl-input="${this.onUpdateAllowSharedProxies}" - >${msg("Allow shared proxies")}${msg("Enable all shared proxies")} + Enable selected shared 
proxies ${this.allProxies ?.filter((server) => server.shared) .map( @@ -181,6 +182,9 @@ export class OrgsList extends BtrixElement { ${server.id}: ${server.label} `, )} + + Enable selected private proxies + ${this.allProxies ?.filter((server) => !server.shared) .map( diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index a4a319d59a..5bc06e85de 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3823,9 +3823,6 @@ Proxy Settings for: - - Allow shared proxies - Update Proxy Settings @@ -3838,6 +3835,9 @@ Choose a Proxy to crawl through + + Enable all shared proxies + From 827023a83201dd85ec8ddf2789cf11c6e13f679f Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 20 Sep 2024 08:55:38 +0200 Subject: [PATCH 32/50] frontend: misc text changes --- frontend/src/components/ui/config-details.ts | 5 +---- frontend/src/strings/crawl-workflows/infoText.ts | 2 +- frontend/xliff/es.xlf | 12 ++++++------ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 131769cfd2..8b31a6f666 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -260,10 +260,7 @@ export class ConfigDetails extends LiteElement { ) : nothing} ${crawlConfig?.proxyId - ? this.renderSetting( - msg("SSH Proxy Server"), - capitalize(crawlConfig.proxyId), - ) + ? this.renderSetting(msg("Proxy"), capitalize(crawlConfig.proxyId)) : nothing} diff --git a/frontend/src/strings/crawl-workflows/infoText.ts b/frontend/src/strings/crawl-workflows/infoText.ts index 697228a852..bcc0721389 100644 --- a/frontend/src/strings/crawl-workflows/infoText.ts +++ b/frontend/src/strings/crawl-workflows/infoText.ts @@ -58,7 +58,7 @@ const infoText: Partial> = { ), lang: msg(`Websites that observe the browser’s language setting may serve content in that language if available.`), - proxyId: msg(`Choose a Proxy to crawl through`), + proxyId: msg(`Choose a proxy to crawl through`), }; export default infoText; diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index 5bc06e85de..4e85bfae1b 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3802,9 +3802,6 @@ Your org doesn't have permission to use the proxy configured for this crawl. 
- - SSH Proxy Server - Crawler Proxy Server @@ -3832,12 +3829,15 @@ Edit Proxies - - Choose a Proxy to crawl through - Enable all shared proxies + + Choose a proxy to crawl through + + + Proxy + From d0839b443a82fc92ebda0a3faf66483596ac3fa9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 20 Sep 2024 12:01:17 -0700 Subject: [PATCH 33/50] ensure proxyId saved on Profile --- backend/btrixcloud/profiles.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index 7ed3b8bf16..ab72422472 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -248,6 +248,7 @@ async def commit_to_profile( oid=org.id, baseid=baseid, crawlerChannel=browser_commit.crawlerChannel, + proxyId=browser_commit.proxyId, ) await self.profiles.find_one_and_update( @@ -572,6 +573,7 @@ async def commit_browser_to_existing( name=browser_commit.name, description=browser_commit.description or profile.description, crawlerChannel=profile.crawlerChannel, + proxyId=profile.proxyId, ), org=org, user=user, From ae3e9091ba3507eb1a3275f73bade8a1d7d5341c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 20 Sep 2024 13:13:32 -0700 Subject: [PATCH 34/50] ensure proxyId is passed through to profile creation --- frontend/src/pages/org/browser-profiles-detail.ts | 1 + frontend/src/pages/org/index.ts | 2 ++ frontend/src/types/crawler.ts | 1 + 3 files changed, 4 insertions(+) diff --git a/frontend/src/pages/org/browser-profiles-detail.ts b/frontend/src/pages/org/browser-profiles-detail.ts index ac118d30ea..045e0552d2 100644 --- a/frontend/src/pages/org/browser-profiles-detail.ts +++ b/frontend/src/pages/org/browser-profiles-detail.ts @@ -585,6 +585,7 @@ export class BrowserProfilesDetail extends BtrixElement { description: this.profile.description.slice(0, DESCRIPTION_MAXLENGTH), profileId: this.profile.id, crawlerChannel: this.profile.crawlerChannel, + proxyId: this.profile.proxyId, })}`, ); } catch (e) { diff --git a/frontend/src/pages/org/index.ts b/frontend/src/pages/org/index.ts index b5d6469f98..db2c7f219c 100644 --- a/frontend/src/pages/org/index.ts +++ b/frontend/src/pages/org/index.ts @@ -72,6 +72,7 @@ export type OrgParams = { crawlerChannel?: string; profileId?: string; navigateUrl?: string; + proxyId?: string; }; collections: ArchivedItemPageParams & { collectionTab?: string; @@ -574,6 +575,7 @@ export class Org extends LiteElement { crawlerChannel: params.crawlerChannel, profileId: params.profileId, navigateUrl: params.navigateUrl, + proxyId: params.proxyId, }} >`; } diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 6a16c73433..cbf5aabc7b 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -124,6 +124,7 @@ export type Profile = { replicas: ProfileReplica[] | null; }; crawlerChannel?: string; + proxyId?: string; }; // TODO maybe convert this to an enum? 
From 7b052b539cf61aa1778e0c7b1f67303febe38d37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 20 Sep 2024 18:02:56 -0700 Subject: [PATCH 35/50] add proxy selector to org defaults display 'Using Proxy' on profile selector, if proxyId is set --- backend/btrixcloud/models.py | 1 + .../src/features/browser-profiles/select-browser-profile.ts | 6 ++++++ .../src/pages/org/settings/components/crawling-defaults.ts | 5 +++++ frontend/src/types/org.ts | 1 + frontend/xliff/es.xlf | 3 +++ 5 files changed, 16 insertions(+) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 938eee29f8..e5d2e45941 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -527,6 +527,7 @@ class CrawlConfigDefaults(BaseModel): profileid: Optional[UUID] = None crawlerChannel: Optional[str] = None + proxyId: Optional[str] = None lang: Optional[str] = None diff --git a/frontend/src/features/browser-profiles/select-browser-profile.ts b/frontend/src/features/browser-profiles/select-browser-profile.ts index 443c3e49be..af13c17cbc 100644 --- a/frontend/src/features/browser-profiles/select-browser-profile.ts +++ b/frontend/src/features/browser-profiles/select-browser-profile.ts @@ -114,6 +114,12 @@ export class SelectBrowserProfile extends LiteElement { minute="2-digit" > + ${this.selectedProfile.proxyId + ? html` + ${msg("Using proxy: ")} + ${this.selectedProfile.proxyId} + ` + : ``} ${msg("Language")} `, + proxyId: html` `, }; return { @@ -287,6 +291,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement { blockAds: values.blockAds === "on", profileid: values.profileid, crawlerChannel: values.crawlerChannel, + proxyId: values.proxyId, userAgent: values.userAgent, lang: this.languageSelect?.value || undefined, exclude: this.exclusionTable?.exclusions?.filter((v) => v) || [], diff --git a/frontend/src/types/org.ts b/frontend/src/types/org.ts index 93bb8d099c..44e134e8b3 100644 --- a/frontend/src/types/org.ts +++ b/frontend/src/types/org.ts @@ -42,6 +42,7 @@ export const crawlingDefaultsSchema = z.object({ blockAds: z.boolean().optional(), profileid: z.string().optional(), crawlerChannel: z.string().optional(), + proxyId: z.string().optional(), lang: z.string().optional(), userAgent: z.string().optional(), exclude: z.array(z.string()), diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index 4e85bfae1b..afc4643bc2 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3838,6 +3838,9 @@ Proxy + + Using proxy: + From 81b07a6f4d410289000dc9fd024b2aed54b3625e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 20 Sep 2024 18:15:10 -0700 Subject: [PATCH 36/50] form name fix --- frontend/src/components/ui/select-crawler-proxy.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index b710a7aaac..8a5df7d480 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ b/frontend/src/components/ui/select-crawler-proxy.ts @@ -67,7 +67,7 @@ export class SelectCrawlerProxy extends LiteElement { return html` Date: Fri, 20 Sep 2024 18:31:16 -0700 Subject: [PATCH 37/50] fix proxy clearing --- frontend/src/components/ui/select-crawler-proxy.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frontend/src/components/ui/select-crawler-proxy.ts b/frontend/src/components/ui/select-crawler-proxy.ts index 8a5df7d480..5490c9594e 100644 --- a/frontend/src/components/ui/select-crawler-proxy.ts +++ 
b/frontend/src/components/ui/select-crawler-proxy.ts @@ -125,6 +125,10 @@ export class SelectCrawlerProxy extends LiteElement { ({ id }) => id === (e.target as SlSelect).value, ); + if (!this.selectedProxy) { + this.proxyId = null; + } + this.dispatchEvent( new CustomEvent("on-change", { detail: { From 4477a1f11a8158a3075c623ad4e63a4b14684a89 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 20 Sep 2024 21:15:17 -0700 Subject: [PATCH 38/50] misc tweaks: fix workflow default, EmailStr cast, add comments for btrix-proxies --- backend/btrixcloud/users.py | 2 +- chart/values.yaml | 2 +- frontend/src/pages/org/workflows-new.ts | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/users.py b/backend/btrixcloud/users.py index 6519d1cc9c..3e5b6fcc16 100644 --- a/backend/btrixcloud/users.py +++ b/backend/btrixcloud/users.py @@ -276,7 +276,7 @@ async def create_super_user(self) -> None: superuser = await self.get_superuser() if superuser: if str(superuser.email) != email: - await self.update_email_name(superuser, EmailStr(email), name) + await self.update_email_name(superuser, cast(EmailStr, email), name) print("Superuser email updated") if not await self.check_password(superuser, password): diff --git a/chart/values.yaml b/chart/values.yaml index 70ec08a09f..7d07cd05a8 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -315,7 +315,7 @@ crawler_liveness_port: 6065 # will set the proxy to shared # default_proxy: "proxy-id" -# optional: configure a list of ssh servers to be used as a proxy +# optional: enable the proxies subchart and configure a list of ssh servers to be used as crawler proxies btrix-proxies: enabled: false # enable to deploy proxies configmap and secret crawler_namespace: "crawlers" diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 6820c4aa1a..6b99a0de07 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -119,6 +119,7 @@ export class WorkflowsNew extends LiteElement { crawlTimeout: org.crawlingDefaults?.crawlTimeout, maxCrawlSize: org.crawlingDefaults?.maxCrawlSize, crawlerChannel: org.crawlingDefaults?.crawlerChannel, + proxyId: org.crawlingDefaults?.proxyId, }, this.initialWorkflow || {}, ); From 4dc72a9f549f3ba95155f317c7e3c8367f387dd7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 25 Sep 2024 18:06:41 -0400 Subject: [PATCH 39/50] reextract strings --- frontend/xliff/es.xlf | 208 +++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 106 deletions(-) diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index ff9cdc60e9..5b94f86f55 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3645,9 +3645,108 @@ Your plan will be canceled on + + Scope + + + Page Crawl + + + One or more page URLs + + + Entire website or directory + + + List of Pages + + + Pages in Same Directory + + + Pages on Same Domain + + + Pages on Same Domain + Subdomains + + + Single Page + + + Any Page + + + Scope options + + + New Crawl Workflow + + + Setup Guide + + + Page URLs + + + Max Depth in Scope + + + Additional Page URLs + + + Exclude Pages + + + The URL of the page to crawl. + + + The crawler will visit and record each URL listed here. You can enter up to URLs. + + + If checked, the crawler will visit pages one link away. + + + Will crawl hash anchor links as pages. For example, + #example-page + will be treated as a separate page. 
+ + + If the crawler finds pages outside of the Crawl Scope they + will only be saved if they begin with URLs listed here. + + + Limits how many hops away the crawler can visit while staying within the Crawl Scope. + + + If checked, the crawler will visit pages one link away outside of + Crawl Scope. + + + Additional Pages + + + There is an issue with this Crawl Workflow: + + + + required in + Scope. + + + Please fix to continue. + + + In-Page Links + + + Choose a proxy to crawl through + Your org doesn't have permission to use the proxy configured for this crawl. + + Proxy + Crawler Proxy Server @@ -3666,6 +3765,9 @@ Proxy Settings for: + + Enable all shared proxies + Update Proxy Settings @@ -3675,114 +3777,8 @@ Edit Proxies - - Enable all shared proxies - - - Choose a proxy to crawl through - - - Proxy - Using proxy: - - Scope - - - Page Crawl - - - One or more page URLs - - - Entire website or directory - - - List of Pages - - - Pages in Same Directory - - - Pages on Same Domain - - - Pages on Same Domain + Subdomains - - - Single Page - - - Any Page - - - Scope options - - - New Crawl Workflow - - - Setup Guide - - - Page URLs - - - Max Depth in Scope - - - Additional Page URLs - - - Exclude Pages - - - The URL of the page to crawl. - - - The crawler will visit and record each URL listed here. You can enter up to URLs. - - - If checked, the crawler will visit pages one link away. - - - Will crawl hash anchor links as pages. For example, - #example-page - will be treated as a separate page. - - - If the crawler finds pages outside of the Crawl Scope they - will only be saved if they begin with URLs listed here. - - - Limits how many hops away the crawler can visit while staying within the Crawl Scope. - - - If checked, the crawler will visit pages one link away outside of - Crawl Scope. - - - Additional Pages - - - There is an issue with this Crawl Workflow: - - - - required in - Scope. - - - Please fix to continue. - - - In-Page Links - - Using proxy: - Using proxy: - Using proxy: - Using proxy: From 4fd3631f8ab0a10a07585cb87a8d2ff4e1bb4eaa Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 26 Sep 2024 15:48:49 -0400 Subject: [PATCH 40/50] WIP: Start adding documentation --- docs/deploy/customization.md | 2 +- docs/deploy/index.md | 2 +- docs/deploy/proxies.md | 27 +++++++++++++++++++++++++++ docs/user-guide/workflow-setup.md | 6 ++++++ 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 docs/deploy/proxies.md diff --git a/docs/deploy/customization.md b/docs/deploy/customization.md index e48f9e84f9..8fc717a2bd 100644 --- a/docs/deploy/customization.md +++ b/docs/deploy/customization.md @@ -149,4 +149,4 @@ Browsertrix has the ability to cryptographically sign WACZ files with [Authsign] ## Enable Open Registration -You can enable sign-ups by setting `registration_enabled` to `"1"`. Once enabled, your users can register by visiting `/sign-up`. \ No newline at end of file +You can enable sign-ups by setting `registration_enabled` to `"1"`. Once enabled, your users can register by visiting `/sign-up`. 
diff --git a/docs/deploy/index.md b/docs/deploy/index.md index 3ea0077a74..e0693e4042 100644 --- a/docs/deploy/index.md +++ b/docs/deploy/index.md @@ -13,6 +13,6 @@ The main requirements for Browsertrix are: - A Kubernetes Cluster - [Helm 3](https://helm.sh/) (package manager for Kubernetes) -We have prepared a [Local Deployment Guide](local.md) which covers several options for testing Browsertrix locally on a single machine, as well as a [Production (Self-Hosted and Cloud) Deployment](remote.md) guide to help with setting up Browsertrix in different production scenarios. Information about configuring storage, crawler channels, and other details in local or production deployments is in the [Customizing Browsertrix Deployment Guide](customization.md). +We have prepared a [Local Deployment Guide](local.md) which covers several options for testing Browsertrix locally on a single machine, as well as a [Production (Self-Hosted and Cloud) Deployment](remote.md) guide to help with setting up Browsertrix in different production scenarios. Information about configuring storage, crawler channels, and other details in local or production deployments is in the [Customizing Browsertrix Deployment Guide](customization.md). Information about configuring proxies to use with Browsertrix can be found in the [Configuring Proxies](proxies.md) guide. Details on managing org export and import for existing clusters can be found in the [Org Import & Export](admin/org-import-export.md) guide. diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md new file mode 100644 index 0000000000..e47d241cfc --- /dev/null +++ b/docs/deploy/proxies.md @@ -0,0 +1,27 @@ +# Configuring Proxies + +Browsertrix can be configured to direct crawling traffic through dedicated proxy servers, so that websites can be crawled from a specific geographic location regardless of where Browsertrix itself is deployed. + +This guide covers how to set up proxy servers for use with Browsertrix, as well as how to configure Browsertrix to make those proxies available. + +## Proxy Configuration + +Browsertrix supports crawling through HTTP and SOCKS5 proxies, including through a SOCKS5 proxy over an SSH tunnel. For more information on what is supported in the underlying Browsertrix Crawler, see the [Browsertrix Crawler documentation](https://crawler.docs.browsertrix.com/user-guide/proxies/). + +Many commercial proxy services exist. If you are planning to use commercially-provided proxies, continue to [Browsertrix Configuration](#browsertrix-configuration) below. + +To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. Once you have access to this remote machine, you will need to add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to support ssh connections to the remote machine. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. + +(TODO: More technical setup details as needed) + +## Browsertrix Configuration + +Proxies are configured in Browsertrix through a separate deployment and subchart. This enables easier updates to available proxy servers without needing to redeploy the entire Browsertrix application. + +To add or update proxies to your Browsertrix Deployment, modify the `btrix-proxies` section of the main Helm chart or your local override. 
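+
+As a rough sketch, an entry in this section might look like the following (ids, hostnames, and key material here are placeholders; the field names follow the chart's `btrix-proxies` values described below):
+
+```yaml
+btrix-proxies:
+  enabled: true
+  crawler_namespace: "crawlers"
+  proxies:
+    # a SOCKS5-over-SSH proxy (hypothetical id, host, and user)
+    - id: us-east-ssh
+      label: US East (SSH)
+      country_code: US
+      shared: true
+      url: ssh://proxy-user@proxy-host.example.com
+      ssh_private_key: |
+        -----BEGIN OPENSSH PRIVATE KEY-----
+        ...
+        -----END OPENSSH PRIVATE KEY-----
+```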
+ +First, set `enabled` to `true`, which will enable deploying proxy servers. + +Next, provide the details of each proxy server that you want available within Browsertrix in the `proxies` list. Minimally, an id, connection string URL, label, and two-letter country code must be set for each proxy. If you want a particular proxy to be shared and potentially available to all organizations on a Browsertrix deployment, set `shared` to `true`. For SSH proxy servers, an `ssh_private_key` is required, and the contents of a known hosts file can additionally be provided to help secure a connection. + +Once all proxy details are set, deploy the proxies by (TODO: add these details) diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index a3fb93d8e0..174d2e902c 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -213,6 +213,12 @@ Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTT Sets the browser's language setting. Useful for crawling websites that detect the browser's language setting and serve content accordingly. +### Proxy + +Sets the proxy server that [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) will direct traffic through while crawling. When a proxy is selected, crawled websites will see traffic as coming from the IP address of the proxy rather than where the Browsertrix Crawler node is deployed. + +This setting will only be shown if proxies are available for use. + ## Scheduling Automatically start crawls periodically on a daily, weekly, or monthly schedule. From 27c753e46c98f7d311f0e1a11645fad94e4305e6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 30 Sep 2024 17:40:17 -0700 Subject: [PATCH 41/50] adjust placement of socks proxy to be below profiles --- frontend/src/__generated__/locale-codes.ts | 9 ++++++-- .../crawl-workflows/workflow-editor.ts | 22 +++++++++---------- .../settings/components/crawling-defaults.ts | 8 +++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/frontend/src/__generated__/locale-codes.ts b/frontend/src/__generated__/locale-codes.ts index 28be186a43..1a0d48c560 100644 --- a/frontend/src/__generated__/locale-codes.ts +++ b/frontend/src/__generated__/locale-codes.ts @@ -10,9 +10,14 @@ export const sourceLocale = `en`; * The other locale codes that this application is localized into. Sorted * lexicographically. */ -export const targetLocales = [] as const; +export const targetLocales = [ + `es`, +] as const; /** * All valid project locale codes. Sorted lexicographically. 
*/ -export const allLocales = [`en`] as const; +export const allLocales = [ + `en`, + `es`, +] as const; diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 7daf19d362..c661db817c 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1321,6 +1321,17 @@ https://archiveweb.page/images/${"logo.svg"}`} > `)} ${this.renderHelpTextCol(infoTextStrings["browserProfile"])} + ${inputCol(html` + + this.updateFormState({ + proxyId: e.detail.value, + })} + > + `)} + ${this.renderHelpTextCol(infoTextStrings["proxyId"])} ${inputCol(html` `)} ${this.renderHelpTextCol(infoTextStrings["lang"])} - ${inputCol(html` - - this.updateFormState({ - proxyId: e.detail.value, - })} - > - `)} - ${this.renderHelpTextCol(infoTextStrings["proxyId"])} `; } diff --git a/frontend/src/pages/org/settings/components/crawling-defaults.ts b/frontend/src/pages/org/settings/components/crawling-defaults.ts index 9c5b77f9c1..7000230a79 100644 --- a/frontend/src/pages/org/settings/components/crawling-defaults.ts +++ b/frontend/src/pages/org/settings/components/crawling-defaults.ts @@ -195,6 +195,10 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement { size="small" > `, + proxyId: html` `, crawlerChannel: html` ${msg("Language")} `, - proxyId: html` `, }; return { From 74fa4a8f41526ed7e0606ce917aabb9a9d1f4ab2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 1 Oct 2024 17:34:11 -0700 Subject: [PATCH 42/50] ensure proxyId included in cronjob, skip cronjob if proxy is missing --- backend/btrixcloud/operator/cronjobs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 74a43b0b44..978235d3ba 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -97,7 +97,15 @@ async def make_new_crawljob( ) return self.get_finished_response(metadata) - # if no db state, crawl crawl in the db + if crawlconfig.proxyId and not self.crawl_config_ops.get_crawler_proxy( + crawlconfig.proxyId + ): + print( + f"proxy {crawlconfig.proxyId} missing, skipping scheduled crawl for workflow {cid} in org {org.id}" + ) + return self.get_finished_response(metadata) + + # if no db state, add crawl in the db if not state: await self.crawl_config_ops.add_new_crawl( crawl_id, @@ -125,6 +133,7 @@ async def make_new_crawljob( warc_prefix=warc_prefix, storage_filename=self.crawl_config_ops.default_filename_template, profile_filename=profile_filename or "", + proxy_id=crawlconfig.proxyId or "", ) return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob))) From 68571db88df4d718f8bad0303e511ac9c24a8a4d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 1 Oct 2024 17:39:24 -0700 Subject: [PATCH 43/50] lint fixes --- backend/btrixcloud/operator/cronjobs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 978235d3ba..9d0367aec3 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -93,7 +93,7 @@ async def make_new_crawljob( if org.readOnly: print( - f"org {org.id} set to read-only. skipping scheduled crawl for workflow {cid}" + f'org "{org.slug}" set to read-only. 
skipping scheduled crawl for workflow {cid}' ) return self.get_finished_response(metadata) @@ -101,7 +101,8 @@ async def make_new_crawljob( crawlconfig.proxyId ): print( - f"proxy {crawlconfig.proxyId} missing, skipping scheduled crawl for workflow {cid} in org {org.id}" + f"proxy {crawlconfig.proxyId} missing, skipping scheduled crawl for " + + f'workflow {cid} in "{org.slug}"' ) return self.get_finished_response(metadata) From 1b3c5dc0ce76ab93c7ced3dfc79a0acd1cbcb625 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 2 Oct 2024 15:46:49 -0400 Subject: [PATCH 44/50] Update documentation based on review comments --- docs/deploy/proxies.md | 35 +++++++++++++++++++++++++++---- docs/user-guide/workflow-setup.md | 2 -- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md index e47d241cfc..66da07bbd4 100644 --- a/docs/deploy/proxies.md +++ b/docs/deploy/proxies.md @@ -10,18 +10,45 @@ Browsertrix supports crawling through HTTP and SOCKS5 proxies, including through Many commercial proxy services exist. If you are planning to use commercially-provided proxies, continue to [Browsertrix Configuration](#browsertrix-configuration) below. -To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. Once you have access to this remote machine, you will need to add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to support ssh connections to the remote machine. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. +To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. For security purposes, recommend creating a new user on this remote machine solely for proxy access. -(TODO: More technical setup details as needed) +Once the remote machine is ready for use as a proxy, add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to the remote machine under the proxy user to allow. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. + +Finally, modify the ssh configuration for the proxy user on the remote machine to secure the server and only allow public key authentication for this user. For instance: + +``` +Match User proxy-user + AllowTcpForwarding yes + X11Forwarding no + AllowAgentForwarding no + ForceCommand /bin/false + PubkeyAuthentication yes + PasswordAuthentication no +``` ## Browsertrix Configuration Proxies are configured in Browsertrix through a separate deployment and subchart. This enables easier updates to available proxy servers without needing to redeploy the entire Browsertrix application. -To add or update proxies to your Browsertrix Deployment, modify the `btrix-proxies` section of the main Helm chart or your local override. +Proxies can be configured in the `btrix-proxies` section of the main Helm chart or local override for the main Browsertrix deployment, or in a separate values file that only contains proxy information, for example `proxies.yaml`. First, set `enabled` to `true`, which will enable deploying proxy servers. Next, provide the details of each proxy server that you want available within Browsertrix in the `proxies` list. 
Minimally, an id, connection string URL, label, and two-letter country code must be set for each proxy. If you want a particular proxy to be shared and potentially available to all organizations on a Browsertrix deployment, set `shared` to `true`. For SSH proxy servers, an `ssh_private_key` is required, and the contents of a known hosts file can additionally be provided to help secure a connection. -Once all proxy details are set, deploy the proxies by (TODO: add these details) +The `default_proxy` field can optionally be set to the id for one of the proxies in the `proxies` list. If set, the default proxy will be used for all crawls that do not have an alternate proxy set in the workflow configuration. + +Once all proxy details are set, they are ready to be deployed. + +If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, deploy with the regular Helm upgrade command, e.g.: + +```sh +helm upgrade --wait --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/ +``` + +If `btrix-proxies` have been set in a distinct value file, deploy changes from this file directly. For instance, if the proxy configuration is located in a file named `proxies.yaml`, you can use the following Helm command: + +```sh +helm upgrade --wait --install -f ./chart/proxies.yaml proxies ./chart/proxies/ +``` + diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index 174d2e902c..3472a45d48 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -217,8 +217,6 @@ Sets the browser's language setting. Useful for crawling websites that detect th Sets the proxy server that [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) will direct traffic through while crawling. When a proxy is selected, crawled websites will see traffic as coming from the IP address of the proxy rather than where the Browsertrix Crawler node is deployed. -This setting will only be shown if proxies are available for use. - ## Scheduling Automatically start crawls periodically on a daily, weekly, or monthly schedule. From f192bbd1452e3e143de814a5333b224ed5f6ebe0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 2 Oct 2024 15:54:07 -0400 Subject: [PATCH 45/50] Wordsmith docs --- docs/deploy/proxies.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md index 66da07bbd4..fc0181db53 100644 --- a/docs/deploy/proxies.md +++ b/docs/deploy/proxies.md @@ -10,9 +10,9 @@ Browsertrix supports crawling through HTTP and SOCKS5 proxies, including through Many commercial proxy services exist. If you are planning to use commercially-provided proxies, continue to [Browsertrix Configuration](#browsertrix-configuration) below. -To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. For security purposes, recommend creating a new user on this remote machine solely for proxy access. +To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. For security purposes, we recommend creating a new user on this remote machine solely for proxy access. 
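+
+For illustration, the user and key setup might look like the following sketch (`proxy-user`, `proxy-host`, and the key path are placeholder names, not chart requirements):
+
+```sh
+# on the remote machine: create a dedicated user used only for proxying
+sudo useradd -m proxy-user
+
+# on your workstation: generate a dedicated ECDSA key pair for Browsertrix
+ssh-keygen -t ecdsa -f ./btrix-proxy-key -C "browsertrix-proxy"
+
+# append the public key to the proxy user's authorized keys on the remote machine
+sudo install -d -m 700 -o proxy-user ~proxy-user/.ssh
+cat btrix-proxy-key.pub | sudo tee -a ~proxy-user/.ssh/authorized_keys
+```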
-Once the remote machine is ready for use as a proxy, add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to the remote machine under the proxy user to allow. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. +Once the remote machine is ready and the new user created, add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to the remote machine under the proxy user to allow. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. Finally, modify the ssh configuration for the proxy user on the remote machine to secure the server and only allow public key authentication for this user. For instance: @@ -28,9 +28,7 @@ Match User proxy-user ## Browsertrix Configuration -Proxies are configured in Browsertrix through a separate deployment and subchart. This enables easier updates to available proxy servers without needing to redeploy the entire Browsertrix application. - -Proxies can be configured in the `btrix-proxies` section of the main Helm chart or local override for the main Browsertrix deployment, or in a separate values file that only contains proxy information, for example `proxies.yaml`. +Proxies are configured in Browsertrix through a separate subchart, and can be configured in the `btrix-proxies` section of the main Helm chart (or local override file) for the Browsertrix deployment, or in a separate values file that only contains proxy information, for example `proxies.yaml`. First, set `enabled` to `true`, which will enable deploying proxy servers. @@ -40,13 +38,13 @@ The `default_proxy` field can optionally be set to the id for one of the proxies Once all proxy details are set, they are ready to be deployed. -If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, deploy with the regular Helm upgrade command, e.g.: +If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, deploy with the regular Helm upgrade command. For isntance, if the proxy configuration is located in a local override file `local.yaml`, you can use the following Helm command to redeploy Browsertrix with the proxy configuration: ```sh helm upgrade --wait --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/ ``` -If `btrix-proxies` have been set in a distinct value file, deploy changes from this file directly. For instance, if the proxy configuration is located in a file named `proxies.yaml`, you can use the following Helm command: +If `btrix-proxies` have been set in a distinct value file, deploy changes from this file directly. This approach does not require redeploying the entire Browsertrix application to update the proxy configuration. 
For instance, if the proxy configuration is located in a file named `proxies.yaml`, you can use the following Helm command to deploy the proxy changes: ```sh helm upgrade --wait --install -f ./chart/proxies.yaml proxies ./chart/proxies/ From a07b4c6c19296b089d17b0daa45bd75317747a8f Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 2 Oct 2024 15:55:52 -0400 Subject: [PATCH 46/50] More wordsmithing --- docs/deploy/proxies.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md index fc0181db53..840c2f711e 100644 --- a/docs/deploy/proxies.md +++ b/docs/deploy/proxies.md @@ -12,7 +12,7 @@ Many commercial proxy services exist. If you are planning to use commercially-pr To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. For security purposes, we recommend creating a new user on this remote machine solely for proxy access. -Once the remote machine is ready and the new user created, add the public key of a public/private key pair (we recommend using a new ECDSA key pair) to the remote machine under the proxy user to allow. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. +Once the remote machine is ready and the new user created, add the public key of a public/private key pair to the remote machine under the proxy user to allow. We recommend using a new ECDSA key pair. You will need to supply the corresponding private key to Browsertrix in [Browsertrix Configuration](#browsertrix-configuration) below. Finally, modify the ssh configuration for the proxy user on the remote machine to secure the server and only allow public key authentication for this user. For instance: From 72148952a572f3e377af903ddd1e524cceb96dd8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 2 Oct 2024 17:03:14 -0700 Subject: [PATCH 47/50] update proxy docs --- docs/deploy/proxies.md | 110 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 11 deletions(-) diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md index 840c2f711e..fca4169b00 100644 --- a/docs/deploy/proxies.md +++ b/docs/deploy/proxies.md @@ -8,13 +8,20 @@ This guide covers how to set up proxy servers for use with Browsertrix, as well Browsertrix supports crawling through HTTP and SOCKS5 proxies, including through a SOCKS5 proxy over an SSH tunnel. For more information on what is supported in the underlying Browsertrix Crawler, see the [Browsertrix Crawler documentation](https://crawler.docs.browsertrix.com/user-guide/proxies/). -Many commercial proxy services exist. If you are planning to use commercially-provided proxies, continue to [Browsertrix Configuration](#browsertrix-configuration) below. +### Obtain an SSH Key-pair -To set up your own proxy server to use with Browsertrix as SOCKS5 over SSH, the first thing that is needed is a physical or virtual server that you intend to use as the proxy. For security purposes, we recommend creating a new user on this remote machine solely for proxy access. 
+To set up a proxy server to use with Browsertrix as SOCKS5 over SSH, you will need an SSH key-pair and:
+- The SSH public key configured on the remote machine
+- The SSH private key configured in Browsertrix
+- The public host key of the remote machine configured in Browsertrix (optional)
+
+We recommend creating a dedicated SSH key-pair (ideally an ECDSA key-pair) for use with Browsertrix, as well as a dedicated user, e.g. `proxy-user`, rather than reusing existing keys or users.
+
+For basic information on how to create a key-pair using `ssh-keygen`, see existing guides such as [this one from DigitalOcean](https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server) or [this one from ssh.com](https://www.ssh.com/academy/ssh/keygen).
+
+We recommend securing the SSH configuration for the proxy user with the following settings. This can be done by adding a file
+such as `/etc/ssh/sshd_config.d/99-ssh-proxy.conf`, where `proxy-user` is the user connecting to the machine.

-Finally, modify the ssh configuration for the proxy user on the remote machine to secure the server and only allow public key authentication for this user. For instance:

```
Match User proxy-user
@@ -28,25 +35,106 @@ Match User proxy-user

## Browsertrix Configuration

-Proxies are configured in Browsertrix through a separate subchart, and can be configured in the `btrix-proxies` section of the main Helm chart (or local override file) for the Browsertrix deployment, or in a separate values file that only contains proxy information, for example `proxies.yaml`.
+Proxies are configured in Browsertrix through a separate subchart, and can be configured in the `btrix-proxies` section of the main Helm chart (or local override file) for the Browsertrix deployment. Alternatively, they can be [configured as a separate subchart](#deploying-with-proxies-via-subchart).
+
+The proxy configuration will look like this, containing one or more proxy declarations.
+
+```yaml
+btrix-proxies:
+  enabled: true
+  proxies:
+    - id: proxy-id-1
+      shared: true
+      label: My Proxy
+      description: Proxy hosted in for Browsertrix
+      country_code: US
+      url: ssh://proxy-user@ssh-proxy-host
+      ssh_host_public_key: 
+      ssh_private_key: 
+
+    - id: proxy-id-2
+      shared: false
+      label: My SOCKS5 proxy
+      country_code: DE
+      url: socks5://username:password@proxy-host
+ ...
+```
+
+
+First, set `enabled` to `true`, which will enable these proxies in Browsertrix.
+
+Next, provide the details of each proxy server that you want available within Browsertrix in the `proxies` list. Minimally, the `id`, `url` connection string, `label` name, and `country_code` two-letter country code must be set for each proxy.
+
+### SSH Proxies
+
+For SSH proxy servers, the `url` should be of the form `ssh://proxy-user@ssh-proxy-host`.
+
+The `ssh_private_key` is required and is the private key of the key-pair created above.
+
+The `ssh_host_public_key` is recommended to help ensure a secure connection and can often be obtained by running `ssh-keyscan -p 22 proxy-host` against the remote machine, assuming a default SSH setup and a hostname of `proxy-host`. 
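+
+Before adding an SSH proxy to the chart, it can be worth verifying the tunnel manually. A minimal sanity check, reusing the placeholder `proxy-user`/`proxy-host` names and key from above (Browsertrix Crawler opens the equivalent SOCKS5-over-SSH tunnel itself at crawl time):
+
+```sh
+# open a local SOCKS5 tunnel over SSH using the dedicated key
+ssh -i ./btrix-proxy-key -N -D 1080 proxy-user@proxy-host &
+
+# confirm that traffic now exits from the proxy's IP address
+curl --proxy socks5h://localhost:1080 https://icanhazip.com
+```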
+
+Only key-based authentication is supported for SSH proxies; password-based authentication is not supported.

-First, set `enabled` to `true`, which will enable deploying proxy servers.
+### SOCKS5 Proxies

-Next, provide the details of each proxy server that you want available within Browsertrix in the `proxies` list. Minimally, an id, connection string URL, label, and two-letter country code must be set for each proxy. If you want a particular proxy to be shared and potentially available to all organizations on a Browsertrix deployment, set `shared` to `true`. For SSH proxy servers, an `ssh_private_key` is required, and the contents of a known hosts file can additionally be provided to help secure a connection.
+For SOCKS5 proxies, the `url` should be of the form `socks5://username:password@socks-proxy-host`.

-The `default_proxy` field can optionally be set to the id for one of the proxies in the `proxies` list. If set, the default proxy will be used for all crawls that do not have an alternate proxy set in the workflow configuration.
+This method is intended for dedicated SOCKS5 proxies (not tunneled over SSH), such as existing services that provide this feature.

-Once all proxy details are set, they are ready to be deployed.
+### Shared Proxies

-If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, deploy with the regular Helm upgrade command. For isntance, if the proxy configuration is located in a local override file `local.yaml`, you can use the following Helm command to redeploy Browsertrix with the proxy configuration:
+The `shared` field on each proxy object defines whether this proxy should be accessible to all organizations in a Browsertrix deployment
+that are allowed to access shared proxies. If false, the proxy must be added directly to each organization that will have access to the proxy.
+
+The proxy settings can be configured in the super-admin UI by clicking 'Edit Proxies...' next to each organization.
+
+### Default Proxies
+
+The `default_proxy` field can optionally be set to the id for one of the proxies in the `proxies` list. If set, the default proxy will be used for all crawls that do not have an alternate proxy set in the workflow configuration. This can be useful
+if Browsertrix is deployed on a private network and requires a proxy to access the outside world.
+
+## Deployment
+
+If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, proxies will be enabled on next deploy of the Browsertrix helm chart. For instance, if the proxy configuration is located in a local override file `local.yaml`, you can use the following Helm command to redeploy Browsertrix with the proxy configuration:

```sh
helm upgrade --wait --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/
```

-If `btrix-proxies` have been set in a distinct value file, deploy changes from this file directly. This approach does not require redeploying the entire Browsertrix application to update the proxy configuration. For instance, if the proxy configuration is located in a file named `proxies.yaml`, you can use the following Helm command to deploy the proxy changes:
+### Deploying with Proxies via Subchart
+
+Alternatively, the proxies can be configured with a separate proxies sub-chart.
+
+This allows for updating proxies without having to redeploy all of Browsertrix. 
+ +A separate proxies YAML file should contain just the `proxies` key: + +```yaml +proxies: + - id: proxy-id-1 + shared: true + label: My Proxy + description: Proxy hosted in for Browsertrix + country_code: US + url: ssh://proxy-user@ssh-proxy-host + ssh_host_public_key: + ssh_private_key: + + - id: proxy-id-2 + shared: false + label: My SOCKS5 proxy + country_code: DE + url: socks5://username:password@proxy-host +``` + + +If the above YAML is placed in `proxies.yaml`, the subchart can be deployed with ```sh helm upgrade --wait --install -f ./chart/proxies.yaml proxies ./chart/proxies/ ``` +(This layout assumes a local copy of Browsertrix repo.) + +The proxies can be updated without redeploying all of Browsertrix, and Browsertrix will pick up the updated proxies. + From 93feaf28aadb65b9c3ca3c28cfafb9cd27bd70a9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 2 Oct 2024 17:26:19 -0700 Subject: [PATCH 48/50] update docs, add proxies subchart to release --- .github/workflows/publish-helm-chart.yaml | 5 ++++- docs/deploy/proxies.md | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-helm-chart.yaml b/.github/workflows/publish-helm-chart.yaml index 5c4a7d692c..acd16db5a0 100644 --- a/.github/workflows/publish-helm-chart.yaml +++ b/.github/workflows/publish-helm-chart.yaml @@ -23,6 +23,7 @@ jobs: run: | mkdir .chart-out helm package chart/ --destination .chart-out + helm package chart/proxies/ --destination .chart-out - name: Get Version run: | @@ -49,7 +50,9 @@ jobs: See [the development guide](https://docs.browsertrix.com/deploy/) for more info how to deploy Browsertrix. - files: .chart-out/browsertrix-v${{ env.version }}.tgz + files: | + .chart-out/browsertrix-v${{ env.version }}.tgz + .chart-out/btrix-proxies-0.1.0.tgz tag_name: v${{ env.version }} draft: true fail_on_unmatched_files: true diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md index fca4169b00..91a9870301 100644 --- a/docs/deploy/proxies.md +++ b/docs/deploy/proxies.md @@ -98,7 +98,7 @@ if Browsertrix is deployed on a private network and requires a proxy to access t If `btrix-proxies` have been set in the main Helm chart or a local override file for your Browsertrix deployment, proxies will be enabled on next deploy of the Browsertrix helm chart. For instance, if the proxy configuration is located in a local override file `local.yaml`, you can use the following Helm command to redeploy Browsertrix with the proxy configuration: ```sh -helm upgrade --wait --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/ +helm upgrade --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart/ ``` ### Deploying with Proxies via Subchart @@ -131,10 +131,22 @@ proxies: If the above YAML is placed in `proxies.yaml`, the subchart can be deployed with ```sh -helm upgrade --wait --install -f ./chart/proxies.yaml proxies ./chart/proxies/ +helm upgrade --install -f ./chart/proxies.yaml proxies ./chart/proxies/ ``` -(This layout assumes a local copy of Browsertrix repo.) - The proxies can be updated without redeploying all of Browsertrix, and Browsertrix will pick up the updated proxies. +### GitHub Release for Subchart + +The above layout assumes a local copy of Browsertrix repo. 
+
+The proxies subchart can also be deployed from the latest GitHub release via:
+
+```sh
+helm upgrade --install proxies https://github.com/webrecorder/browsertrix/releases/download/RELEASE/btrix-proxies-VERSION.tgz
+```
+
+where `RELEASE` is the Browsertrix release and `VERSION` is the version of the proxies chart.
+
+See the [Browsertrix releases page](https://github.com/webrecorder/browsertrix/releases) for the latest available versions.
+

From c90bc0aff1c8923da914f506c1e464e1a76b5892 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 2 Oct 2024 17:47:07 -0700
Subject: [PATCH 49/50] more docs tweaks

---
 docs/deploy/proxies.md | 11 ++++++++---
 mkdocs.yml             |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/deploy/proxies.md b/docs/deploy/proxies.md
index 91a9870301..fe12045318 100644
--- a/docs/deploy/proxies.md
+++ b/docs/deploy/proxies.md
@@ -2,6 +2,8 @@

 Browsertrix can be configured to direct crawling traffic through dedicated proxy servers, so that websites can be crawled from a specific geographic location regardless of where Browsertrix itself is deployed.

+The Browsertrix superadmin can configure which proxy servers are available to which organizations (or whether they are shared across all organizations), and users can choose from one of the available proxies in each crawl workflow. Users can also configure the default crawling proxy that will be used for the organization in organization-wide [Crawling Defaults](/user-guide/org-settings/#crawling-defaults).
+
 This guide covers how to set up proxy servers for use with Browsertrix, as well as how to configure Browsertrix to make those proxies available.

 ## Proxy Configuration
@@ -40,6 +42,8 @@
 The proxy configuration will look like this, containing one or more proxy declarations.

 ```yaml
+#default_proxy: 
+
 btrix-proxies:
   enabled: true
   proxies:
@@ -88,7 +92,11 @@ that are allowed to access shared proxies. If false, the proxy must be added direc

 The proxy settings can be configured in the super-admin UI by clicking 'Edit Proxies...' next to each organization.

-### Default Proxies
+### Default Proxy
+
+The `default_proxy` field in the root of the Helm values file can optionally be set to the id of one of the available proxies. If set, the default proxy will be used for all crawls that do not have an alternate proxy set in the workflow configuration. This can be useful if Browsertrix is deployed on a private network and requires a proxy to access the outside world.

-The `default_proxy` field can optionally be set to the id for one of the proxies in the `proxies` list. If set, the default proxy will be used for all crawls that do not have an alternate proxy set in the workflow configuration. This can be useful
-if Browsertrix is deployed on a private network and requires a proxy to access the outside world.
+This is a deployment-wide setting that is not shown to users; it is designed for admins to route all traffic through a designated proxy. Browsertrix will fail to start if the default proxy is not listed in the available proxies. 
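+
+As a sketch, a values override combining the deployment-wide default with the proxy list might look like this (ids are placeholders; `default_proxy` must match the `id` of one of the configured proxies):
+
+```yaml
+default_proxy: "proxy-id-1"
+
+btrix-proxies:
+  enabled: true
+  proxies:
+    - id: proxy-id-1
+      shared: true
+      label: Egress Proxy
+      country_code: US
+      url: ssh://proxy-user@ssh-proxy-host
+      ssh_private_key: |
+        -----BEGIN OPENSSH PRIVATE KEY-----
+        ...
+        -----END OPENSSH PRIVATE KEY-----
+```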
## Deployment diff --git a/mkdocs.yml b/mkdocs.yml index d5c2d30910..a930b276a0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,6 +77,7 @@ nav: - deploy/local.md - deploy/remote.md - deploy/customization.md + - deploy/proxies.md - Ansible: - deploy/ansible/digitalocean.md - deploy/ansible/microk8s.md From 3e5302ce826c994f62d5681ae0d7ec32d2f15e3c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 2 Oct 2024 18:14:04 -0700 Subject: [PATCH 50/50] rename proxies-passwd-hack -> force-user-and-group-name for clarity --- chart/app-templates/crawler.yaml | 8 ++++---- chart/app-templates/profilebrowser.yaml | 8 ++++---- chart/templates/secrets.yaml | 5 +++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 0a4e864e8f..a00d4af332 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -76,9 +76,9 @@ spec: secret: secretName: proxies defaultMode: 0600 - - name: proxies-passwd-hack + - name: force-user-and-group-name secret: - secretName: proxies-passwd-hack + secretName: force-user-and-group-name defaultMode: 0600 {% endif %} @@ -175,11 +175,11 @@ spec: subPath: {{ proxy_id }}-known-hosts readOnly: true {% endif %} - - name: proxies-passwd-hack + - name: force-user-and-group-name mountPath: /etc/passwd subPath: passwd readOnly: true - - name: proxies-passwd-hack + - name: force-user-and-group-name mountPath: /etc/group subPath: group readOnly: true diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml index e4f68ea3f8..662fa5a17b 100644 --- a/chart/app-templates/profilebrowser.yaml +++ b/chart/app-templates/profilebrowser.yaml @@ -31,9 +31,9 @@ spec: secret: secretName: proxies defaultMode: 0600 - - name: proxies-passwd-hack + - name: force-user-and-group-name secret: - secretName: proxies-passwd-hack + secretName: force-user-and-group-name defaultMode: 0600 {% endif %} @@ -113,11 +113,11 @@ spec: subPath: {{ proxy_id }}-known-hosts readOnly: true {% endif %} - - name: proxies-passwd-hack + - name: force-user-and-group-name mountPath: /etc/passwd subPath: passwd readOnly: true - - name: proxies-passwd-hack + - name: force-user-and-group-name mountPath: /etc/group subPath: group readOnly: true diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index 60de9e4776..7c972b1a46 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -64,12 +64,13 @@ stringData: apiVersion: v1 kind: Secret metadata: - name: proxies-passwd-hack + name: force-user-and-group-name namespace: {{ .Values.crawler_namespace }} type: Opaque stringData: - # slightly hacky: override /etc/passwd and /etc/group in crawler to be able to ssh to proxies + # slightly hacky: override /etc/passwd and /etc/group in crawler + # this is needed to be able to use ssh to use proxies passwd: | root:x:0:0:root:/root:/bin/bash btrix:btrix:{{ .Values.crawler_uid | default 201407 }}:{{ .Values.crawler_gid | default 201407 }}::/tmp/btrix:/bin/sh
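
For context on the override above (a hedged sketch; exact messages vary by OpenSSH version): the crawler runs as a fixed non-root UID/GID that has no entry in the container image's /etc/passwd, and the OpenSSH client refuses to run for a UID it cannot resolve to a user name. Mounting the secret over /etc/passwd and /etc/group gives that UID the `btrix` name so the ssh tunnel to the proxy can start:

```sh
# inside a crawler container without the override (illustrative)
id -u                      # 201407
ssh proxy-user@proxy-host  # fails: "No user exists for uid 201407"

# with the force-user-and-group-name secret mounted over /etc/passwd
# and /etc/group, the uid resolves and ssh can connect
whoami                     # btrix
```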