From 8b9d4bdf7d5c50d7d54d7788d79acc424e8e004e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 19:45:01 -0400 Subject: [PATCH 01/57] Modify backend scale to be number of browser windows --- backend/btrixcloud/crawls.py | 6 ++- backend/btrixcloud/main.py | 3 +- backend/btrixcloud/models.py | 4 +- backend/btrixcloud/operator/crawls.py | 55 +++++++++++++++++++++------ backend/btrixcloud/orgs.py | 4 +- backend/btrixcloud/utils.py | 6 +++ chart/templates/configmap.yaml | 2 +- chart/values.yaml | 4 +- 8 files changed, 61 insertions(+), 23 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 2dabd59c6b..1b127b1e01 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -25,6 +25,7 @@ parse_jsonl_log_messages, stream_dict_list_as_csv, validate_regexes, + pod_count_from_browser_windows, ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager @@ -529,7 +530,7 @@ async def add_or_remove_exclusion( cid = crawl.cid - scale = crawl.scale or 1 + browser_windows = crawl.scale or 1 async with self.get_redis(crawl_id) as redis: query = { @@ -538,7 +539,8 @@ async def add_or_remove_exclusion( } query_str = json.dumps(query) - for i in range(0, scale): + pod_count = pod_count_from_browser_windows(browser_windows) + for i in range(0, pod_count): await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str) new_config = await self.crawl_configs.add_or_remove_exclusion( diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 0958c6a31f..2b17174708 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -149,8 +149,7 @@ def main() -> None: os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120) ), maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)), - numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)), - maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)), + maxBrowserWindows=int(os.environ.get("MAX_BROWSER_WINDOWS", 8)), billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")), signUpUrl=os.environ.get("SIGN_UP_URL", ""), salesEmail=os.environ.get("SALES_EMAIL", ""), diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 2e9d8da309..fd1355c48d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -31,7 +31,7 @@ from .db import BaseMongoModel # crawl scale for constraint -MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) +MAX_BROWSER_WINDOWS = int(os.environ.get("MAX_BROWSER_WINDOWS", 8)) # Presign duration must be less than 604800 seconds (one week), # so set this one minute short of a week @@ -52,7 +52,7 @@ EmptyStr = Annotated[str, Field(min_length=0, max_length=0)] -Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE)] +Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_BROWSER_WINDOWS)] ReviewStatus = Optional[Annotated[int, Field(strict=True, ge=1, le=5)]] any_http_url_adapter = TypeAdapter(AnyHttpUrlNonStr) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 8a7b5e0995..ad3ace3fbf 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -32,7 +32,12 @@ StorageRef, ) -from btrixcloud.utils import str_to_date, date_to_str, dt_now +from btrixcloud.utils import ( + str_to_date, + date_to_str, + dt_now, + pod_count_from_browser_windows, +) from .baseoperator import BaseOperator, Redis from .models import ( @@ -358,10 +363,14 @@ async def sync_crawls(self, data: MCSyncData): children.extend(await 
self._load_qa_configmap(params, data.children)) is_paused = bool(crawl.paused_at) and status.state == "paused" + crawler_pod_count = pod_count_from_browser_windows(status.scale) + last_pod_remainder = status.scale % crawler_pod_count - for i in range(0, status.scale): + for i in range(0, crawler_pod_count): children.extend( - self._load_crawler(params, i, status, data.children, is_paused) + self._load_crawler( + params, i, last_index, last_pod_remainder, status, data.children, is_paused + ) ) return { @@ -466,8 +475,16 @@ async def _load_qa_configmap(self, params, children): params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) return self.load_from_yaml("qa_configmap.yaml", params) - # pylint: disable=too-many-arguments - def _load_crawler(self, params, i, status: CrawlStatus, children, paused: bool): + def _load_crawler( + self, + params, + i: int, + last_pod_index: int, + last_pod_remainder: int, + status: CrawlStatus, + children, + is_paused: bool + ): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] @@ -492,7 +509,11 @@ def _load_crawler(self, params, i, status: CrawlStatus, children, paused: bool): else: params["memory_limit"] = self.k8s.max_crawler_memory_size params["storage"] = pod_info.newStorage or params.get("crawler_storage") - params["workers"] = params.get(worker_field) or 1 + if i == last_pod_index and last_pod_remainder: + params["workers"] = last_pod_remainder + else: + params["workers"] = params.get(worker_field) or 1 + params["init_crawler"] = not paused if has_pod and not paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) @@ -500,6 +521,10 @@ def _load_crawler(self, params, i, status: CrawlStatus, children, paused: bool): print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False + print(f"crawler pod {i + 1} of {last_index + 1}, index: {i}", flush=True) + worker_count = params["workers"] + print(f"num of workers in pod: {worker_count}", flush=True) + return self.load_from_yaml("crawler.yaml", params) def _qa_configmap_update_needed(self, name, configmap): @@ -522,7 +547,7 @@ def _qa_configmap_update_needed(self, name, configmap): async def _resolve_scale( self, crawl_id: str, - desired_scale: int, + desired_browser_windows: int, redis: Redis, status: CrawlStatus, pods: dict[str, dict], @@ -540,13 +565,15 @@ async def _resolve_scale( if pods.get(f"redis-{crawl_id}"): actual_scale -= 1 + desired_scale = pod_count_from_browser_windows(desired_browser_windows) + # ensure at least enough pages for the scale if status.pagesFound < desired_scale: desired_scale = max(1, status.pagesFound) # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: - return desired_scale + return desired_browser_windows new_scale = actual_scale for i in range(actual_scale - 1, desired_scale - 1, -1): @@ -1477,14 +1504,18 @@ async def update_crawl_state( ) # resolve scale - if crawl.scale != status.scale: - status.scale = await self._resolve_scale( + desired_pod_count = pod_count_from_browser_windows(crawl.scale) + current_pod_count = pod_count_from_browser_windows(status.scale) + + if desired_pod_count != current_pod_count: + current_pod_count = await self._resolve_scale( crawl.id, crawl.scale, redis, status, pods ) # check if done / failed status_count: dict[str, int] = {} - for i in range(status.scale): + + for i in range(current_pod_count): res = results.get(f"crawl-{crawl.id}-{i}") if res: status_count[res] = status_count.get(res, 0) + 1 @@ -1492,7 +1523,7 @@ 
async def update_crawl_state(
     num_done = status_count.get("done", 0)
     num_failed = status_count.get("failed", 0)
     # all expected pods are either done or failed
-    all_completed = (num_done + num_failed) >= status.scale
+    all_completed = (num_done + num_failed) >= current_pod_count
 
     # check paused
     if not all_completed and crawl.paused_at and status.stopReason == "paused":
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index 40130a313d..da9d275f21 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -105,7 +105,7 @@
 
 DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization")
 
-MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
+MAX_BROWSER_WINDOWS = int(os.environ.get("MAX_BROWSER_WINDOWS", 8))
 
 # number of items to delete at a time
 DEL_ITEMS = 1000
@@ -1268,7 +1268,7 @@ async def import_org(
 
             # Ensure scale isn't above max_scale
             workflow_scale = workflow.get("scale", 1)
-            workflow["scale"] = max(workflow_scale, MAX_CRAWL_SCALE)
+            workflow["scale"] = min(workflow_scale, MAX_BROWSER_WINDOWS)
 
             # Ensure crawlerChannel is set
             if not workflow.get("crawlerChannel"):
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 0c145c3164..1ec95cc65f 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -200,3 +200,9 @@ def validate_language_code(lang: str):
     """Validate ISO-639-1 language code, raise HTTPException if invalid"""
     if not is_language(lang, "pt1"):
         raise HTTPException(status_code=400, detail="invalid_lang")
+
+
+def pod_count_from_browser_windows(browser_windows: int) -> int:
+    """Return number of pods for given number of browser windows"""
+    browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
+    return math.ceil(browser_windows / browsers_per_pod)
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 9f01a474a7..75d0eef236 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -49,7 +49,7 @@ data:
 
   FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"
 
-  MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}"
+  MAX_BROWSER_WINDOWS: "{{ .Values.max_browser_windows | default 8 }}"
 
   LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}"
 
diff --git a/chart/values.yaml b/chart/values.yaml
index b2f320ce6f..a7ee00a4a6 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -50,8 +50,8 @@ default_crawl_filename_template: "@ts-@hostsuffix.wacz"
 
 crawler_extra_args: ""
 
-# max allowed crawl scale per crawl
-max_crawl_scale: 3
+# max allowed browser windows per crawl
+max_browser_windows: 8
 
 
 # Cluster Settings

From fa339c0849629b546480861db2cd50c11105fae5 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 20 May 2025 20:27:23 -0400
Subject: [PATCH 02/57] Fix SettingsResponse model

---
 backend/btrixcloud/main.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 2b17174708..af19f0d69c 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -112,8 +112,7 @@ class SettingsResponse(BaseModel):
 
     defaultPageLoadTimeSeconds: int
     maxPagesPerCrawl: int
-    numBrowsers: int
-    maxScale: int
+    maxBrowserWindows: int
 
     billingEnabled: bool
 

From 54d91f6fe85f3dfb6da1b35220bbcd4873c89ad4 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 20 May 2025 20:27:46 -0400
Subject: [PATCH 03/57] Update frontend for simplified maxBrowserWindows

---
 frontend/src/__mocks__/api/settings.js | 2 +-
 frontend/src/components/ui/config-details.ts 
| 4 +--- .../features/crawl-workflows/workflow-editor.ts | 14 ++++++-------- frontend/src/index.test.ts | 3 +-- frontend/src/index.ts | 3 ++- frontend/src/pages/org/index.ts | 4 ++-- frontend/src/pages/org/workflow-detail.ts | 9 ++++----- frontend/src/utils/app.ts | 6 ++---- frontend/src/utils/crawler.ts | 2 +- frontend/src/utils/workflow.ts | 8 ++++---- 10 files changed, 24 insertions(+), 31 deletions(-) diff --git a/frontend/src/__mocks__/api/settings.js b/frontend/src/__mocks__/api/settings.js index a47b266cb7..c1543c6736 100644 --- a/frontend/src/__mocks__/api/settings.js +++ b/frontend/src/__mocks__/api/settings.js @@ -4,7 +4,7 @@ export default { defaultBehaviorTimeSeconds: 0, defaultPageLoadTimeSeconds: 0, maxPagesPerCrawl: 0, - maxScale: 0, + maxBrowserWindows: 0, billingEnabled: true, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 45009e21a1..7a2a0f1a30 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -251,9 +251,7 @@ export class ConfigDetails extends BtrixElement { )} ${this.renderSetting( msg("Browser Windows"), - crawlConfig?.scale && this.appState.settings - ? `${crawlConfig.scale * this.appState.settings.numBrowsers}` - : "", + crawlConfig?.scale ? `${crawlConfig.scale}` : "", )} ${this.renderSetting( msg("Crawler Channel (Exact Crawler Version)"), diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index f0ea0d1a03..ccbcec1c2c 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1587,14 +1587,12 @@ https://archiveweb.page/images/${"logo.svg"}`} scale: +(e.target as SlCheckbox).value, })} > - ${when(this.appState.settings?.numBrowsers, (numBrowsers) => - map( - range(this.orgDefaults.maxScale), - (i: number) => - html` ${(i + 1) * numBrowsers}`, - ), + ${map( + range(this.appState.settings?.maxBrowserWindows || 1), + (i: number) => + html` ${i + 1}`, )} `)} diff --git a/frontend/src/index.test.ts b/frontend/src/index.test.ts index 3fdd863c88..e09bc96a8d 100644 --- a/frontend/src/index.test.ts +++ b/frontend/src/index.test.ts @@ -40,8 +40,7 @@ const mockAppSettings: AppSettings = { defaultBehaviorTimeSeconds: 300, defaultPageLoadTimeSeconds: 120, maxPagesPerCrawl: 50000, - numBrowsers: 2, - maxScale: 3, + maxBrowserWindows: 4, billingEnabled: false, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/index.ts b/frontend/src/index.ts index a5b0246218..e6ba75afed 100644 --- a/frontend/src/index.ts +++ b/frontend/src/index.ts @@ -885,7 +885,8 @@ export class App extends BtrixElement { class="w-full" .viewStateData=${this.viewState.data} .params=${this.viewState.params} - .maxScale=${this.appState.settings?.maxScale || DEFAULT_MAX_SCALE} + .maxBrowserWindows=${this.appState.settings?.maxBrowserWindows || + DEFAULT_MAX_SCALE} orgPath=${orgPath.split(slug)[1]} orgTab=${orgTab} >`; diff --git a/frontend/src/pages/org/index.ts b/frontend/src/pages/org/index.ts index e6dcdf2fb2..ef7f449baf 100644 --- a/frontend/src/pages/org/index.ts +++ b/frontend/src/pages/org/index.ts @@ -113,7 +113,7 @@ export class Org extends BtrixElement { orgTab?: OrgTab | string; @property({ type: Number }) - maxScale: number = DEFAULT_MAX_SCALE; + maxBrowserWindows: number = DEFAULT_MAX_SCALE; @state() private openDialogName?: ResourceName; @@ -536,7 +536,7 @@ export class Org extends 
BtrixElement { openDialogName=${this.viewStateData?.dialog} ?isEditing=${isEditing} ?isCrawler=${this.appState.isCrawler} - .maxScale=${this.maxScale} + .maxBrowserWindows=${this.maxBrowserWindows} > `; } diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index f90d6f0330..e7e82670d3 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -78,7 +78,7 @@ export class WorkflowDetail extends BtrixElement { | "deleteCrawl"; @property({ type: Number }) - maxScale = DEFAULT_MAX_SCALE; + maxBrowserWindows = DEFAULT_MAX_SCALE; @state() private lastCrawlId: Workflow["lastCrawlId"] = null; @@ -1591,8 +1591,7 @@ export class WorkflowDetail extends BtrixElement { if (!this.isCrawler) return; const enableEditBrowserWindows = !this.workflow.lastCrawlStopping; - const windowCount = - this.workflow.scale * (this.appState.settings?.numBrowsers || 1); + const windowCount = this.workflow.scale || 1; return html`
@@ -2002,10 +2001,10 @@ export class WorkflowDetail extends BtrixElement { const scaleOptions = []; if (this.appState.settings) { - for (let value = 1; value <= this.maxScale; value++) { + for (let value = 1; value <= this.maxBrowserWindows; value++) { scaleOptions.push({ value, - label: value * this.appState.settings.numBrowsers, + label: value, }); } } diff --git a/frontend/src/utils/app.ts b/frontend/src/utils/app.ts index d1bd85347a..a4a7088106 100644 --- a/frontend/src/utils/app.ts +++ b/frontend/src/utils/app.ts @@ -8,8 +8,7 @@ export type AppSettings = { defaultBehaviorTimeSeconds: number; defaultPageLoadTimeSeconds: number; maxPagesPerCrawl: number; - numBrowsers: number; - maxScale: number; + maxBrowserWindows: number; billingEnabled: boolean; signUpUrl: string; salesEmail: string; @@ -37,8 +36,7 @@ export async function getAppSettings(): Promise { defaultBehaviorTimeSeconds: 0, defaultPageLoadTimeSeconds: 0, maxPagesPerCrawl: 0, - numBrowsers: 1, - maxScale: 0, + maxBrowserWindows: 4, billingEnabled: false, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/utils/crawler.ts b/frontend/src/utils/crawler.ts index 2b2003aa0d..7d90fa3a05 100644 --- a/frontend/src/utils/crawler.ts +++ b/frontend/src/utils/crawler.ts @@ -19,7 +19,7 @@ export const activeCrawlStates = RUNNING_AND_WAITING_STATES; export const finishedCrawlStates = SUCCESSFUL_STATES; export const inactiveCrawlStates = SUCCESSFUL_AND_FAILED_STATES; -export const DEFAULT_MAX_SCALE = 3; +export const DEFAULT_MAX_SCALE = 8; export const DEPTH_SUPPORTED_SCOPES = [ "prefix", diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index 5946f5b090..edd25de1e7 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -140,11 +140,11 @@ export type WorkflowDefaults = { behaviorTimeoutSeconds?: number; pageLoadTimeoutSeconds?: number; maxPagesPerCrawl?: number; - maxScale: number; + maxBrowserWindows: number; }; export const appDefaults: WorkflowDefaults = { - maxScale: DEFAULT_MAX_SCALE, + maxBrowserWindows: DEFAULT_MAX_SCALE, }; export const getDefaultFormState = (): FormState => ({ @@ -365,8 +365,8 @@ export async function getServerDefaults(): Promise { if (data.maxPagesPerCrawl > 0) { defaults.maxPagesPerCrawl = data.maxPagesPerCrawl; } - if (data.maxScale) { - defaults.maxScale = data.maxScale; + if (data.maxBrowserWindows) { + defaults.maxBrowserWindows = data.maxBrowserWindows; } return defaults; From fc1eafb66dcd483812643980b665a84a99683b81 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 20:30:05 -0400 Subject: [PATCH 04/57] Import math --- backend/btrixcloud/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 1ec95cc65f..c7198b77b3 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -8,6 +8,7 @@ import os import sys import re +import math from datetime import datetime, timezone from typing import Optional, Dict, Union, List, Any From d18f500921a2a53c76f4078903e5c75ee04382ed Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 20:39:21 -0400 Subject: [PATCH 05/57] Operator fixups --- backend/btrixcloud/operator/crawls.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index ad3ace3fbf..0985970650 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -369,7 +369,13 @@ async def sync_crawls(self, 
data: MCSyncData): for i in range(0, crawler_pod_count): children.extend( self._load_crawler( - params, i, last_index, last_pod_remainder, status, data.children, is_paused + params, + i, + crawler_pod_count - 1, + last_pod_remainder, + status, + data.children, + is_paused ) ) @@ -521,7 +527,7 @@ def _load_crawler( print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False - print(f"crawler pod {i + 1} of {last_index + 1}, index: {i}", flush=True) + print(f"crawler pod {i + 1} of {last_pod_index + 1}, index: {i}", flush=True) worker_count = params["workers"] print(f"num of workers in pod: {worker_count}", flush=True) From 3958bfd11e4d754049691b34b1be727385edb564 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 20:47:16 -0400 Subject: [PATCH 06/57] Handle case where scale < workers per pod --- backend/btrixcloud/operator/crawls.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 0985970650..937d419a41 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -364,7 +364,12 @@ async def sync_crawls(self, data: MCSyncData): is_paused = bool(crawl.paused_at) and status.state == "paused" crawler_pod_count = pod_count_from_browser_windows(status.scale) - last_pod_remainder = status.scale % crawler_pod_count + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + + if status.scale < browsers_per_pod: + remainder = status.scale + else: + remainder = status.scale % crawler_pod_count for i in range(0, crawler_pod_count): children.extend( @@ -372,7 +377,7 @@ async def sync_crawls(self, data: MCSyncData): params, i, crawler_pod_count - 1, - last_pod_remainder, + remainder, status, data.children, is_paused From a7de12b432c5fc0287d018786e382a3750a45bb1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 23:32:21 -0400 Subject: [PATCH 07/57] Add pylint comment --- backend/btrixcloud/operator/crawls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 937d419a41..3e6bc900d4 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -486,6 +486,7 @@ async def _load_qa_configmap(self, params, children): params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) return self.load_from_yaml("qa_configmap.yaml", params) + # pylint: too-many-arguments def _load_crawler( self, params, From ece1bc7b8c4759ca95f46245c98d3a43138f8064 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 23:34:14 -0400 Subject: [PATCH 08/57] Update API settings test --- backend/test/test_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/test/test_api.py b/backend/test/test_api.py index 88cb0e3806..a3fba6c00e 100644 --- a/backend/test/test_api.py +++ b/backend/test/test_api.py @@ -43,8 +43,7 @@ def test_api_settings(): "jwtTokenLifetime": 1440, "defaultBehaviorTimeSeconds": 300, "maxPagesPerCrawl": 4, - "numBrowsers": 2, - "maxScale": 3, + "maxBrowserWindows": 8, "defaultPageLoadTimeSeconds": 120, "billingEnabled": True, "signUpUrl": "", From 31f5adef36818150753c26365cc1762d9c6bee7c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 23:34:56 -0400 Subject: [PATCH 09/57] Fix pylint comment --- backend/btrixcloud/operator/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py 
b/backend/btrixcloud/operator/crawls.py index 3e6bc900d4..dbeb643300 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -486,7 +486,7 @@ async def _load_qa_configmap(self, params, children): params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) return self.load_from_yaml("qa_configmap.yaml", params) - # pylint: too-many-arguments + # pylint: disable=too-many-arguments def _load_crawler( self, params, From 5cd019cc2ed7a600c2f1b1c554c0cf135c742a62 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 20 May 2025 23:59:32 -0400 Subject: [PATCH 10/57] Use crawl.scale, add lots of debug print logging --- backend/btrixcloud/operator/crawls.py | 30 ++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index dbeb643300..831781aa51 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -365,11 +365,26 @@ async def sync_crawls(self, data: MCSyncData): is_paused = bool(crawl.paused_at) and status.state == "paused" crawler_pod_count = pod_count_from_browser_windows(status.scale) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + print(f"status.scale: {status.scale}", flush=True) + print(f"crawl.scale: {crawl.scale}", flush=True) - if status.scale < browsers_per_pod: - remainder = status.scale + crawler_pod_count = pod_count_from_browser_windows(crawl.scale) + print(f"crawler pod count: {crawler_pod_count}", flush=True) + + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + print(f"browsers per pod: {browsers_per_pod}", flush=True) + + if crawl.scale < browsers_per_pod: + remainder = crawl.scale + print( + f"remainder (scale less than browsers per pod): {remainder}", flush=True + ) else: - remainder = status.scale % crawler_pod_count + remainder = crawl.scale % crawler_pod_count + print( + f"remainder (scale not less than browsers per pod): {remainder}", + flush=True, + ) for i in range(0, crawler_pod_count): children.extend( @@ -579,6 +594,10 @@ async def _resolve_scale( desired_scale = pod_count_from_browser_windows(desired_browser_windows) + print(f"actual scale (pods): {actual_scale}", flush=True) + print(f"desired browser windows: {desired_browser_windows}", flush=True) + print(f"desired scale (pods): {desired_scale}", flush=True) + # ensure at least enough pages for the scale if status.pagesFound < desired_scale: desired_scale = max(1, status.pagesFound) @@ -1519,10 +1538,14 @@ async def update_crawl_state( desired_pod_count = pod_count_from_browser_windows(crawl.scale) current_pod_count = pod_count_from_browser_windows(status.scale) + print(f"desired pod count: {desired_pod_count}", flush=True) + print(f"current pod count: {current_pod_count}", flush=True) + if desired_pod_count != current_pod_count: current_pod_count = await self._resolve_scale( crawl.id, crawl.scale, redis, status, pods ) + print(f"reset current pod count to: {current_pod_count}", flush=True) # check if done / failed status_count: dict[str, int] = {} @@ -1536,6 +1559,7 @@ async def update_crawl_state( num_failed = status_count.get("failed", 0) # all expected pods are either done or failed all_completed = (num_done + num_failed) >= current_pod_count + print(f"all completed: {all_completed}", flush=True) # check paused if not all_completed and crawl.paused_at and status.stopReason == "paused": From 2561701be6c51cd595b0c15314e4cb209bea32c6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: 
Wed, 21 May 2025 00:06:42 -0400 Subject: [PATCH 11/57] Fixups --- backend/btrixcloud/operator/crawls.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 831781aa51..347af73848 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -376,15 +376,8 @@ async def sync_crawls(self, data: MCSyncData): if crawl.scale < browsers_per_pod: remainder = crawl.scale - print( - f"remainder (scale less than browsers per pod): {remainder}", flush=True - ) else: remainder = crawl.scale % crawler_pod_count - print( - f"remainder (scale not less than browsers per pod): {remainder}", - flush=True, - ) for i in range(0, crawler_pod_count): children.extend( @@ -604,7 +597,7 @@ async def _resolve_scale( # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: - return desired_browser_windows + return desired_scale new_scale = actual_scale for i in range(actual_scale - 1, desired_scale - 1, -1): @@ -1536,21 +1529,20 @@ async def update_crawl_state( # resolve scale desired_pod_count = pod_count_from_browser_windows(crawl.scale) - current_pod_count = pod_count_from_browser_windows(status.scale) print(f"desired pod count: {desired_pod_count}", flush=True) - print(f"current pod count: {current_pod_count}", flush=True) + print(f"current pod count: {status.scale}", flush=True) - if desired_pod_count != current_pod_count: - current_pod_count = await self._resolve_scale( + if desired_pod_count != status.scale: + status.scale = await self._resolve_scale( crawl.id, crawl.scale, redis, status, pods ) - print(f"reset current pod count to: {current_pod_count}", flush=True) + print(f"reset current pod count to: {status.scale}", flush=True) # check if done / failed status_count: dict[str, int] = {} - for i in range(current_pod_count): + for i in range(status.scale): res = results.get(f"crawl-{crawl.id}-{i}") if res: status_count[res] = status_count.get(res, 0) + 1 @@ -1558,7 +1550,7 @@ async def update_crawl_state( num_done = status_count.get("done", 0) num_failed = status_count.get("failed", 0) # all expected pods are either done or failed - all_completed = (num_done + num_failed) >= current_pod_count + all_completed = (num_done + num_failed) >= status.scale print(f"all completed: {all_completed}", flush=True) # check paused From 0396c1cd173ae763e995af414a4b38c23349d5dc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 21 May 2025 00:16:39 -0400 Subject: [PATCH 12/57] Consolidate print logging lines --- backend/btrixcloud/operator/crawls.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 347af73848..9dd5ec8812 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -541,9 +541,11 @@ def _load_crawler( print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False - print(f"crawler pod {i + 1} of {last_pod_index + 1}, index: {i}", flush=True) worker_count = params["workers"] - print(f"num of workers in pod: {worker_count}", flush=True) + print( + f"crawler pod {i + 1} of {last_pod_index + 1}, index {i}, {worker_count} workers", + flush=True, + ) return self.load_from_yaml("crawler.yaml", params) From 6fd850a054de8c46f29b79c86a84ad53a240e691 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 21 May 2025 00:19:01 -0400 Subject: [PATCH 13/57] Remove some 
debug print logging --- backend/btrixcloud/operator/crawls.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 9dd5ec8812..495d9f29dd 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -369,10 +369,7 @@ async def sync_crawls(self, data: MCSyncData): print(f"crawl.scale: {crawl.scale}", flush=True) crawler_pod_count = pod_count_from_browser_windows(crawl.scale) - print(f"crawler pod count: {crawler_pod_count}", flush=True) - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - print(f"browsers per pod: {browsers_per_pod}", flush=True) if crawl.scale < browsers_per_pod: remainder = crawl.scale From bf8f1d0e26dde44d6706ca307d8905c599252499 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 21 May 2025 00:33:42 -0400 Subject: [PATCH 14/57] Temp: Debug print log sync_crawls return --- backend/btrixcloud/operator/crawls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 495d9f29dd..03f459d691 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -389,11 +389,14 @@ async def sync_crawls(self, data: MCSyncData): ) ) - return { + return_value = { "status": status.dict(exclude_none=True), "children": children, "resyncAfterSeconds": status.resync_after, } + print("sync_crawls return:", flush=True) + print(return_value, flush=True) + return return_value def _load_redis(self, params, status: CrawlStatus, children): name = f"redis-{params['id']}" From 551b18bba947020a6c901cad192c04a31820e5d6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 21 May 2025 18:44:24 -0700 Subject: [PATCH 15/57] rebase fix --- backend/btrixcloud/operator/crawls.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 03f459d691..c32071e9bd 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -385,18 +385,15 @@ async def sync_crawls(self, data: MCSyncData): remainder, status, data.children, - is_paused + is_paused, ) ) - return_value = { + return { "status": status.dict(exclude_none=True), "children": children, "resyncAfterSeconds": status.resync_after, } - print("sync_crawls return:", flush=True) - print(return_value, flush=True) - return return_value def _load_redis(self, params, status: CrawlStatus, children): name = f"redis-{params['id']}" @@ -503,7 +500,7 @@ def _load_crawler( last_pod_remainder: int, status: CrawlStatus, children, - is_paused: bool + is_paused: bool, ): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] @@ -534,8 +531,8 @@ def _load_crawler( else: params["workers"] = params.get(worker_field) or 1 - params["init_crawler"] = not paused - if has_pod and not paused: + params["init_crawler"] = not is_paused + if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) if restart_reason: print(f"Restarting {name}, reason: {restart_reason}") From 27d0fc192980676ed48fe97d5c4d335d1ef322ba Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 21 May 2025 23:35:47 -0700 Subject: [PATCH 16/57] work --- backend/btrixcloud/operator/crawls.py | 31 +++++++++++++++------------ backend/btrixcloud/operator/models.py | 1 + 2 files changed, 18 insertions(+), 14 deletions(-) diff --git 
a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index c32071e9bd..7d86966647 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -363,26 +363,25 @@ async def sync_crawls(self, data: MCSyncData): children.extend(await self._load_qa_configmap(params, data.children)) is_paused = bool(crawl.paused_at) and status.state == "paused" - crawler_pod_count = pod_count_from_browser_windows(status.scale) - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + print(f"status.scale: {status.scale}", flush=True) print(f"crawl.scale: {crawl.scale}", flush=True) crawler_pod_count = pod_count_from_browser_windows(crawl.scale) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - if crawl.scale < browsers_per_pod: - remainder = crawl.scale - else: - remainder = crawl.scale % crawler_pod_count + remainder = crawl.scale % browsers_per_pod + remainder_changed = (status.lastScale % browsers_per_pod) != remainder + print(f"remainder: {remainder}, changed: {remainder_changed}") + status.lastScale = crawl.scale for i in range(0, crawler_pod_count): children.extend( self._load_crawler( params, i, - crawler_pod_count - 1, remainder, + remainder_changed, status, data.children, is_paused, @@ -496,8 +495,8 @@ def _load_crawler( self, params, i: int, - last_pod_index: int, - last_pod_remainder: int, + first_pod_remainder: int, + remainder_changed: bool, status: CrawlStatus, children, is_paused: bool, @@ -526,21 +525,25 @@ def _load_crawler( else: params["memory_limit"] = self.k8s.max_crawler_memory_size params["storage"] = pod_info.newStorage or params.get("crawler_storage") - if i == last_pod_index and last_pod_remainder: - params["workers"] = last_pod_remainder + + if i == 0 and first_pod_remainder: + params["workers"] = first_pod_remainder else: params["workers"] = params.get(worker_field) or 1 params["init_crawler"] = not is_paused if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) + if not restart_reason and i == 0 and remainder_changed: + restart_reason = "pod_resized" + if restart_reason: print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False worker_count = params["workers"] print( - f"crawler pod {i + 1} of {last_pod_index + 1}, index {i}, {worker_count} workers", + f"crawler pod {i}, {worker_count} workers", flush=True, ) @@ -591,8 +594,8 @@ async def _resolve_scale( print(f"desired scale (pods): {desired_scale}", flush=True) # ensure at least enough pages for the scale - if status.pagesFound < desired_scale: - desired_scale = max(1, status.pagesFound) + #if status.pagesFound < desired_scale: + # desired_scale = max(1, status.pagesFound) # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 555f32deda..ec402dcbbf 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -206,6 +206,7 @@ class CrawlStatus(BaseModel): # human readable size string sizeHuman: str = "" scale: int = 1 + lastScale: int = 1 filesAdded: int = 0 filesAddedSize: int = 0 finished: Optional[str] = None From 9e1a86e9d7edbc2532537570f979a924df87508c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 24 May 2025 21:48:37 -0700 Subject: [PATCH 17/57] switch back to last pod --- backend/btrixcloud/operator/crawls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 7d86966647..d5ae0fb84c 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -380,7 +380,7 @@ async def sync_crawls(self, data: MCSyncData): self._load_crawler( params, i, - remainder, + remainder if i == crawler_pod_count - 1 else 0, remainder_changed, status, data.children, @@ -495,7 +495,7 @@ def _load_crawler( self, params, i: int, - first_pod_remainder: int, + pod_remainder: int, remainder_changed: bool, status: CrawlStatus, children, @@ -526,15 +526,15 @@ def _load_crawler( params["memory_limit"] = self.k8s.max_crawler_memory_size params["storage"] = pod_info.newStorage or params.get("crawler_storage") - if i == 0 and first_pod_remainder: - params["workers"] = first_pod_remainder + if pod_remainder: + params["workers"] = pod_remainder else: params["workers"] = params.get(worker_field) or 1 params["init_crawler"] = not is_paused if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) - if not restart_reason and i == 0 and remainder_changed: + if not restart_reason and pod_remainder and remainder_changed: restart_reason = "pod_resized" if restart_reason: @@ -594,7 +594,7 @@ async def _resolve_scale( print(f"desired scale (pods): {desired_scale}", flush=True) # ensure at least enough pages for the scale - #if status.pagesFound < desired_scale: + # if status.pagesFound < desired_scale: # desired_scale = max(1, status.pagesFound) # if desired_scale same or scaled up, return desired_scale From b7f855d517ab5b80b96bb1cf0c7218ada3f4694f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 24 May 2025 22:21:25 -0700 Subject: [PATCH 18/57] fix remainder check --- backend/btrixcloud/operator/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d5ae0fb84c..7f455bc43a 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -534,7 +534,7 @@ def _load_crawler( params["init_crawler"] = not is_paused if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) - if not restart_reason and pod_remainder and remainder_changed: + if not restart_reason and remainder_changed: restart_reason = "pod_resized" if restart_reason: From 92557ec4f60b4d614759718cb9b91f6444f8f01c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 24 May 2025 22:43:31 -0700 Subject: [PATCH 19/57] rename priorities to use max_browser_windows --- chart/templates/priorities.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 9e20d433ce..416b5a0319 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -1,5 +1,5 @@ -{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} +{{- range untilStep 0 (int .Values.max_browser_windows) 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass @@ -11,7 +11,7 @@ description: "Priority for crawl instance #{{ . 
}}" {{- end }} -{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} +{{- range untilStep 0 (int .Values.max_browser_windows) 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass From 1249ead177abebf2450101c4f8f08991198d6508 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 12:03:00 -0400 Subject: [PATCH 20/57] Fix screencast window count --- frontend/src/components/screencast.ts | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index bf63fec50f..94c9e03f4e 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -141,9 +141,6 @@ export class Screencast extends BtrixElement { // Websocket connections private readonly wsMap = new Map(); - // Number of available browsers. - // Multiply by scale to get available browser window count - private browsersCount = 1; private screenWidth = 640; private screenHeight = 480; private readonly timerIds: number[] = []; @@ -181,7 +178,7 @@ export class Screencast extends BtrixElement { } render() { - const screenCount = this.scale * this.browsersCount; + const screenCount = this.scale; return html`
= {}; - for (let i = 0; i < message.browsers * this.scale; i++) { + for (let i = 0; i < this.scale; i++) { dataMap[i] = null; } this.dataMap = dataMap; - this.browsersCount = message.browsers; + // this.browsersCount = message.browsers; this.screenWidth = message.width; this.screenHeight = message.height; } else { From 13c85be2d2a5f4191ea8dbf710d99872d5598739 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 14:26:17 -0400 Subject: [PATCH 21/57] Rename scale fields to distinguish pods from browser windows --- backend/btrixcloud/operator/crawls.py | 23 +++++++++-------------- backend/btrixcloud/operator/models.py | 5 +++-- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 7f455bc43a..7e28e99e7e 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -172,7 +172,7 @@ async def sync_crawls(self, data: MCSyncData): storage=StorageRef(spec["storageName"]), crawler_channel=spec.get("crawlerChannel", "default"), proxy_id=spec.get("proxyId"), - scale=spec.get("scale", 1), + browserWindows=spec.get("scale", 1), started=data.parent["metadata"]["creationTimestamp"], stopping=spec.get("stopping", False), paused_at=str_to_date(spec.get("pausedAt")), @@ -183,7 +183,7 @@ async def sync_crawls(self, data: MCSyncData): ) if crawl.qa_source_crawl_id: - crawl.scale = int(params.get("qa_scale", 1)) + crawl.browserWindows = int(params.get("qa_scale", 1)) # if finalizing, crawl is being deleted if data.finalizing: @@ -365,15 +365,15 @@ async def sync_crawls(self, data: MCSyncData): is_paused = bool(crawl.paused_at) and status.state == "paused" print(f"status.scale: {status.scale}", flush=True) - print(f"crawl.scale: {crawl.scale}", flush=True) + print(f"crawl.browserWindows: {crawl.browserWindows}", flush=True) - crawler_pod_count = pod_count_from_browser_windows(crawl.scale) + crawler_pod_count = pod_count_from_browser_windows(crawl.browserWindows) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - remainder = crawl.scale % browsers_per_pod - remainder_changed = (status.lastScale % browsers_per_pod) != remainder + remainder = crawl.browserWindows % browsers_per_pod + remainder_changed = (status.lastBrowserWindows % browsers_per_pod) != remainder print(f"remainder: {remainder}, changed: {remainder_changed}") - status.lastScale = crawl.scale + status.lastBrowserWindows = crawl.browserWindows for i in range(0, crawler_pod_count): children.extend( @@ -1530,16 +1530,12 @@ async def update_crawl_state( ) # resolve scale - desired_pod_count = pod_count_from_browser_windows(crawl.scale) - - print(f"desired pod count: {desired_pod_count}", flush=True) - print(f"current pod count: {status.scale}", flush=True) + desired_pod_count = pod_count_from_browser_windows(crawl.browserWindows) if desired_pod_count != status.scale: status.scale = await self._resolve_scale( - crawl.id, crawl.scale, redis, status, pods + crawl.id, crawl.browserWindows, redis, status, pods ) - print(f"reset current pod count to: {status.scale}", flush=True) # check if done / failed status_count: dict[str, int] = {} @@ -1553,7 +1549,6 @@ async def update_crawl_state( num_failed = status_count.get("failed", 0) # all expected pods are either done or failed all_completed = (num_done + num_failed) >= status.scale - print(f"all completed: {all_completed}", flush=True) # check paused if not all_completed and crawl.paused_at and status.stopReason == "paused": diff --git 
a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index ec402dcbbf..a4db93a29a 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -74,7 +74,7 @@ class CrawlSpec(BaseModel): cid: UUID oid: UUID org: Organization - scale: int = 1 + browserWindows: int = 1 storage: StorageRef started: str crawler_channel: str @@ -205,8 +205,9 @@ class CrawlStatus(BaseModel): size: int = 0 # human readable size string sizeHuman: str = "" + # number of pods scale: int = 1 - lastScale: int = 1 + lastBrowserWindows: int = 1 filesAdded: int = 0 filesAddedSize: int = 0 finished: Optional[str] = None From c5da074be1a44d4702f3162458da874086696d61 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 14:30:20 -0400 Subject: [PATCH 22/57] Fix linting --- backend/btrixcloud/operator/crawls.py | 17 +++++++++-------- backend/btrixcloud/operator/models.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 7e28e99e7e..d270b1a430 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -172,7 +172,7 @@ async def sync_crawls(self, data: MCSyncData): storage=StorageRef(spec["storageName"]), crawler_channel=spec.get("crawlerChannel", "default"), proxy_id=spec.get("proxyId"), - browserWindows=spec.get("scale", 1), + browser_windows=spec.get("scale", 1), started=data.parent["metadata"]["creationTimestamp"], stopping=spec.get("stopping", False), paused_at=str_to_date(spec.get("pausedAt")), @@ -183,7 +183,7 @@ async def sync_crawls(self, data: MCSyncData): ) if crawl.qa_source_crawl_id: - crawl.browserWindows = int(params.get("qa_scale", 1)) + crawl.browser_windows = int(params.get("qa_scale", 1)) # if finalizing, crawl is being deleted if data.finalizing: @@ -365,15 +365,15 @@ async def sync_crawls(self, data: MCSyncData): is_paused = bool(crawl.paused_at) and status.state == "paused" print(f"status.scale: {status.scale}", flush=True) - print(f"crawl.browserWindows: {crawl.browserWindows}", flush=True) + print(f"crawl.browser_windows: {crawl.browser_windows}", flush=True) - crawler_pod_count = pod_count_from_browser_windows(crawl.browserWindows) + crawler_pod_count = pod_count_from_browser_windows(crawl.browser_windows) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - remainder = crawl.browserWindows % browsers_per_pod + remainder = crawl.browser_windows % browsers_per_pod remainder_changed = (status.lastBrowserWindows % browsers_per_pod) != remainder print(f"remainder: {remainder}, changed: {remainder_changed}") - status.lastBrowserWindows = crawl.browserWindows + status.lastBrowserWindows = crawl.browser_windows for i in range(0, crawler_pod_count): children.extend( @@ -571,6 +571,7 @@ async def _resolve_scale( crawl_id: str, desired_browser_windows: int, redis: Redis, + # pylint: disable=unused-argument status: CrawlStatus, pods: dict[str, dict], ): @@ -1530,11 +1531,11 @@ async def update_crawl_state( ) # resolve scale - desired_pod_count = pod_count_from_browser_windows(crawl.browserWindows) + desired_pod_count = pod_count_from_browser_windows(crawl.browser_windows) if desired_pod_count != status.scale: status.scale = await self._resolve_scale( - crawl.id, crawl.browserWindows, redis, status, pods + crawl.id, crawl.browser_windows, redis, status, pods ) # check if done / failed diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 
a4db93a29a..a1f78dccfe 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -74,7 +74,7 @@ class CrawlSpec(BaseModel): cid: UUID oid: UUID org: Organization - browserWindows: int = 1 + browser_windows: int = 1 storage: StorageRef started: str crawler_channel: str From 92ed072f147628fc44fb83ba19b2a47941e5e94d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 15:27:28 -0400 Subject: [PATCH 23/57] Undo change to worker index calculation for screenshots --- frontend/src/components/screencast.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index 94c9e03f4e..73dcbfc804 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -301,11 +301,11 @@ export class Screencast extends BtrixElement { ) { if (message.msg === "init") { const dataMap: Record = {}; - for (let i = 0; i < this.scale; i++) { + for (let i = 0; i < message.browsers * this.scale; i++) { dataMap[i] = null; } this.dataMap = dataMap; - // this.browsersCount = message.browsers; + this.browsersCount = message.browsers; this.screenWidth = message.width; this.screenHeight = message.height; } else { From 9e67798537e7076fc01f65e17b5651dbc99191d7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 15:28:48 -0400 Subject: [PATCH 24/57] Remove unused variable --- frontend/src/components/screencast.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index 73dcbfc804..3aeddc98ea 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -305,7 +305,6 @@ export class Screencast extends BtrixElement { dataMap[i] = null; } this.dataMap = dataMap; - this.browsersCount = message.browsers; this.screenWidth = message.width; this.screenHeight = message.height; } else { From 5f2f8593b05b053fe9f22d0afbd1d4cde97ae58a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 16:33:57 -0400 Subject: [PATCH 25/57] Add separate browserWindows on backend alongside scale --- backend/btrixcloud/crawlconfigs.py | 26 ++++++++++++++++++++++++++ backend/btrixcloud/crawlmanager.py | 8 ++++++-- backend/btrixcloud/crawls.py | 29 +++++++++++++++++++++++++---- backend/btrixcloud/models.py | 20 ++++++++++++++++---- backend/btrixcloud/utils.py | 6 ++++++ 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 02606a75b4..2a3147e72d 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -52,6 +52,8 @@ validate_regexes, validate_language_code, is_url, + pod_count_from_browser_windows, + browser_windows_from_pod_count, ) if TYPE_CHECKING: @@ -222,6 +224,16 @@ async def add_crawl_config( ) -> CrawlConfigAddedResponse: """Add new crawl config""" + if config_in.scale is None and config_in.browserWindows is None: + raise HTTPException( + status_code=400, detail="browser_windows_or_scale_required" + ) + + if config_in.browserWindows: + config_in.scale = pod_count_from_browser_windows(config_in.browserWindows) + else: + config_in.browserWindows = browser_windows_from_pod_count(config_in.scale) + # ensure crawlChannel is valid if not self.get_channel_crawler_image(config_in.crawlerChannel): raise HTTPException(status_code=404, detail="crawler_not_found") @@ -273,6 +285,7 @@ async def add_crawl_config( 
crawlTimeout=config_in.crawlTimeout, maxCrawlSize=config_in.maxCrawlSize, scale=config_in.scale, + browserWindows=config_in.browserWindows, autoAddCollections=config_in.autoAddCollections, profileid=profileid, crawlerChannel=config_in.crawlerChannel, @@ -408,6 +421,16 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) + if update.scale is None and update.browserWindows is None: + raise HTTPException( + status_code=400, detail="browser_windows_or_scale_required" + ) + + if update.browserWindows: + update.scale = pod_count_from_browser_windows(update.browserWindows) + else: + update.browserWindows = browser_windows_from_pod_count(update.scale) + if update.config and update.config.exclude: exclude = update.config.exclude if isinstance(exclude, str): @@ -442,6 +465,9 @@ async def update_crawl_config( self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate") ) changed = changed or self.check_attr_changed(orig_crawl_config, update, "scale") + changed = changed or self.check_attr_changed( + orig_crawl_config, update, "browserWindows" + ) schedule_changed = self.check_attr_changed( orig_crawl_config, update, "schedule" diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 88999b28f7..47c30ec2d3 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -373,9 +373,13 @@ async def rollover_restart_crawl(self, crawl_id: str) -> dict: update = date_to_str(dt_now()) return await self._patch_job(crawl_id, {"restartTime": update}) - async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict: + async def scale_crawl( + self, crawl_id: str, scale: int = 1, browser_windows: int = 1 + ) -> dict: """Set the crawl scale (job parallelism) on the specified job""" - return await self._patch_job(crawl_id, {"scale": scale}) + return await self._patch_job( + crawl_id, {"scale": scale, "browserWindows": browser_windows} + ) async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict: """Request a crawl cancelation or stop by calling an API diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 1b127b1e01..914200c5ab 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -26,6 +26,7 @@ stream_dict_list_as_csv, validate_regexes, pod_count_from_browser_windows, + browser_windows_from_pod_count, ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager @@ -397,12 +398,20 @@ async def update_crawl_scale( ) -> bool: """Update crawl scale in the db""" crawl = await self.get_crawl(crawl_id, org) - update = UpdateCrawlConfig(scale=crawl_scale.scale) + + update = UpdateCrawlConfig( + scale=crawl_scale.scale, browserWindows=crawl_scale.browserWindows + ) await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, - {"$set": {"scale": crawl_scale.scale}}, + { + "$set": { + "scale": crawl_scale.scale, + "browserWindows": crawl_scale.browserWindows, + } + }, return_document=pymongo.ReturnDocument.AFTER, ) @@ -1531,15 +1540,27 @@ async def scale_crawl( user: User = Depends(user_dep), org: Organization = Depends(org_crawl_dep), ): + if scale.scale is None and scale.browserWindows is None: + raise HTTPException( + status_code=400, detail="browser_windows_or_scale_required" + ) + + if scale.browserWindows: + scale.scale = pod_count_from_browser_windows(scale.browserWindows) + else: + scale.browserWindows = 
browser_windows_from_pod_count(scale.scale) + await ops.update_crawl_scale(crawl_id, org, scale, user) - result = await ops.crawl_manager.scale_crawl(crawl_id, scale.scale) + result = await ops.crawl_manager.scale_crawl( + crawl_id, scale.scale, scale.browserWindows + ) if not result or not result.get("success"): raise HTTPException( status_code=400, detail=result.get("error") or "unknown" ) - return {"scaled": scale.scale} + return {"scaled": scale.scale, "browserWindows": scale.browserWindows} @app.get( "/orgs/{oid}/crawls/{crawl_id}/access", diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index fd1355c48d..31687076aa 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -31,6 +31,9 @@ from .db import BaseMongoModel # crawl scale for constraint +MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) + +# browser window for constraint (preferred over scale if provided) MAX_BROWSER_WINDOWS = int(os.environ.get("MAX_BROWSER_WINDOWS", 8)) # Presign duration must be less than 604800 seconds (one week), @@ -52,7 +55,8 @@ EmptyStr = Annotated[str, Field(min_length=0, max_length=0)] -Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_BROWSER_WINDOWS)] +Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE)] +BrowserWindowCount = Annotated[int, Field(strict=True, ge=1, le=MAX_BROWSER_WINDOWS)] ReviewStatus = Optional[Annotated[int, Field(strict=True, ge=1, le=5)]] any_http_url_adapter = TypeAdapter(AnyHttpUrlNonStr) @@ -369,7 +373,9 @@ class CrawlConfigIn(BaseModel): crawlTimeout: int = 0 maxCrawlSize: int = 0 - scale: Scale = 1 + + scale: Optional[Scale] = None + browserWindows: Optional[BrowserWindowCount] = None crawlFilenameTemplate: Optional[str] = None @@ -391,6 +397,7 @@ class ConfigRevision(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 scale: Scale = 1 + browserWindows: Optional[BrowserWindowCount] = 2 modified: datetime modifiedBy: Optional[UUID] = None @@ -412,6 +419,7 @@ class CrawlConfigCore(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 scale: Scale = 1 + browserWindows: BrowserWindowCount = 2 oid: UUID @@ -522,7 +530,8 @@ class UpdateCrawlConfig(BaseModel): proxyId: Optional[str] = None crawlTimeout: Optional[int] = None maxCrawlSize: Optional[int] = None - scale: Scale = 1 + scale: Optional[Scale] = None + browserWindows: Optional[BrowserWindowCount] = None crawlFilenameTemplate: Optional[str] = None config: Optional[RawCrawlConfig] = None @@ -875,6 +884,7 @@ class CrawlOut(BaseMongoModel): manual: bool = False cid_rev: Optional[int] = None scale: Scale = 1 + browserWindows: BrowserWindowCount = 2 storageQuotaReached: Optional[bool] = False execMinutesQuotaReached: Optional[bool] = False @@ -961,7 +971,8 @@ class MatchCrawlQueueResponse(BaseModel): class CrawlScale(BaseModel): """scale the crawl to N parallel containers""" - scale: Scale = 1 + scale: Optional[Scale] = None + browserWindows: Optional[BrowserWindowCount] = None # ============================================================================ @@ -1054,6 +1065,7 @@ class CrawlScaleResponse(BaseModel): """Response model for modifying crawl scale""" scaled: int + browserWindows: int # ============================================================================ diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index c7198b77b3..1e506bdff7 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -207,3 +207,9 @@ def 
pod_count_from_browser_windows(browser_windows: int) -> int: """Return number of pods for given number of browser windows""" browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) return math.ceil(browser_windows / browsers_per_pod) + + +def browser_windows_from_pod_count(pod_count: int) -> int: + """Return number of browser windows from specified scale""" + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + return pod_count * browsers_per_pod From 459e466985ffe0f0291f6678a6259d0aa025ff96 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 16:47:55 -0400 Subject: [PATCH 26/57] Update frontend to use browserWindows not scale --- backend/btrixcloud/crawls.py | 2 +- backend/btrixcloud/models.py | 2 +- frontend/src/components/screencast.ts | 18 +++++++------- frontend/src/components/ui/config-details.ts | 2 +- .../crawl-workflows/workflow-editor.ts | 6 ++--- frontend/src/pages/org/workflow-detail.ts | 24 +++++++++---------- frontend/src/pages/org/workflows-new.ts | 2 +- frontend/src/types/crawler.ts | 2 ++ frontend/src/utils/workflow.ts | 6 ++--- 9 files changed, 33 insertions(+), 31 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 914200c5ab..c572e15155 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1560,7 +1560,7 @@ async def scale_crawl( status_code=400, detail=result.get("error") or "unknown" ) - return {"scaled": scale.scale, "browserWindows": scale.browserWindows} + return {"scaled": True, "browserWindows": scale.browserWindows} @app.get( "/orgs/{oid}/crawls/{crawl_id}/access", diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 31687076aa..92e87f031f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1064,7 +1064,7 @@ class CrawlCompleteIn(BaseModel): class CrawlScaleResponse(BaseModel): """Response model for modifying crawl scale""" - scaled: int + scaled: bool browserWindows: int diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index 3aeddc98ea..47710cc020 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -130,7 +130,7 @@ export class Screencast extends BtrixElement { crawlId?: string; @property({ type: Number }) - scale = 1; + browserWindows = 1; // List of browser screens @state() @@ -161,9 +161,9 @@ export class Screencast extends BtrixElement { this.disconnectAll(); this.connectAll(); } - const prevScale = changedProperties.get("scale"); - if (prevScale !== undefined) { - if (this.scale > prevScale) { + const prevWindows = changedProperties.get("browserWindows"); + if (prevWindows !== undefined) { + if (this.browserWindows > prevWindows) { this.scaleUp(); } else { this.scaleDown(); @@ -178,7 +178,7 @@ export class Screencast extends BtrixElement { } render() { - const screenCount = this.scale; + const screenCount = this.browserWindows; return html`
this.scale - 1; idx--) { + for (let idx = this.wsMap.size - 1; idx > this.browserWindows - 1; idx--) { const ws = this.wsMap.get(idx); if (ws) { @@ -269,7 +269,7 @@ export class Screencast extends BtrixElement { return; } - for (let idx = 0; idx < this.scale; idx++) { + for (let idx = 0; idx < this.browserWindows; idx++) { if (!this.wsMap.get(idx)) { const ws = this.connectWs(idx); @@ -301,7 +301,7 @@ export class Screencast extends BtrixElement { ) { if (message.msg === "init") { const dataMap: Record = {}; - for (let i = 0; i < message.browsers * this.scale; i++) { + for (let i = 0; i < message.browsers * this.browserWindows; i++) { dataMap[i] = null; } this.dataMap = dataMap; @@ -360,7 +360,7 @@ export class Screencast extends BtrixElement { }): void { const { index, retries = 0, delaySec = 10 } = opts; - if (index >= this.scale) { + if (index >= this.browserWindows) { return; } diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 7a2a0f1a30..599c54b4eb 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -251,7 +251,7 @@ export class ConfigDetails extends BtrixElement { )} ${this.renderSetting( msg("Browser Windows"), - crawlConfig?.scale ? `${crawlConfig.scale}` : "", + crawlConfig?.browserWindows ? `${crawlConfig.browserWindows}` : "", )} ${this.renderSetting( msg("Crawler Channel (Exact Crawler Version)"), diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index ccbcec1c2c..16cdd4872f 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1581,10 +1581,10 @@ https://archiveweb.page/images/${"logo.svg"}`} this.updateFormState({ - scale: +(e.target as SlCheckbox).value, + browserWindows: +(e.target as SlCheckbox).value, })} > ${map( @@ -2548,7 +2548,7 @@ https://archiveweb.page/images/${"logo.svg"}`} jobType: "custom", name: this.formState.jobName || "", description: this.formState.description, - scale: this.formState.scale, + browserWindows: this.formState.browserWindows, profileid: this.formState.browserProfile?.id || "", schedule: this.formState.scheduleType === "cron" ? this.utcSchedule : "", crawlTimeout: this.formState.crawlTimeoutMinutes * 60, diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index e7e82670d3..1f02da4e55 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -1591,7 +1591,7 @@ export class WorkflowDetail extends BtrixElement { if (!this.isCrawler) return; const enableEditBrowserWindows = !this.workflow.lastCrawlStopping; - const windowCount = this.workflow.scale || 1; + const windowCount = this.workflow.browserWindows || 1; return html`
@@ -1788,7 +1788,7 @@ export class WorkflowDetail extends BtrixElement {
@@ -2016,7 +2016,7 @@ export class WorkflowDetail extends BtrixElement { "Change the number of browser windows crawling in parallel. This change will take effect immediately on the currently running crawl and update crawl workflow settings.", )}

- + ${scaleOptions.map( ({ value, label }) => html` ( - `/orgs/${this.orgId}/crawls/${this.lastCrawlId}/scale`, - { - method: "POST", - body: JSON.stringify({ scale: +value }), - signal, - }, - ); + const data = await this.api.fetch<{ + scaled: boolean; + browserWindows: int; + }>(`/orgs/${this.orgId}/crawls/${this.lastCrawlId}/scale`, { + method: "POST", + body: JSON.stringify({ browserWindows: +value }), + signal, + }); if (data.scaled) { this.notify.toast({ diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 5b274f2ccb..6ed7f408c7 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -66,7 +66,7 @@ export class WorkflowsNew extends LiteElement { crawlTimeout: null, maxCrawlSize: null, jobType: "custom", - scale: 1, + browserWindows: 2, autoAddCollections: [], crawlerChannel: "default", proxyId: null, diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index f348bb5559..8fcd8bf2c8 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -57,6 +57,7 @@ export type WorkflowParams = { name: string; schedule: string; scale: number; + browserWindows: number; profileid: string | null; config: SeedConfig; tags: string[]; @@ -193,6 +194,7 @@ export type Crawl = ArchivedItemBase & schedule: string; manual: boolean; scale: number; + browserWindows: number; shouldPause: boolean | null; resources?: { name: string; diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index edd25de1e7..291f384572 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -107,7 +107,7 @@ export type FormState = { | (typeof NewWorkflowOnlyScopeType)[keyof typeof NewWorkflowOnlyScopeType]; exclusions: WorkflowParams["config"]["exclude"]; pageLimit: WorkflowParams["config"]["limit"]; - scale: WorkflowParams["scale"]; + browserWindows: WorkflowParams["browserWindows"]; blockAds: WorkflowParams["config"]["blockAds"]; lang: WorkflowParams["config"]["lang"]; scheduleType: "date" | "cron" | "none"; @@ -164,7 +164,7 @@ export const getDefaultFormState = (): FormState => ({ scopeType: ScopeType.Page, exclusions: [], pageLimit: null, - scale: 1, + browserWindows: 2, blockAds: true, lang: getDefaultLang(), scheduleType: "none", @@ -306,7 +306,7 @@ export function getInitialFormState(params: { postLoadDelaySeconds: seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds, maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth, - scale: params.initialWorkflow.scale, + browserWindows: params.initialWorkflow.browserWindows, blockAds: params.initialWorkflow.config.blockAds, lang: params.initialWorkflow.config.lang ?? 
defaultFormState.lang, scheduleType: defaultFormState.scheduleType, From a2756db4ec308ead56d9ab7cef18d10e01d6c3f6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:00:53 -0400 Subject: [PATCH 27/57] Fix types --- backend/btrixcloud/crawlconfigs.py | 8 ++++++-- backend/btrixcloud/crawls.py | 10 +++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 2a3147e72d..c735e08687 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -427,9 +427,13 @@ async def update_crawl_config( ) if update.browserWindows: - update.scale = pod_count_from_browser_windows(update.browserWindows) + update.scale = pod_count_from_browser_windows( + cast(int, update.browserWindows) + ) else: - update.browserWindows = browser_windows_from_pod_count(update.scale) + update.browserWindows = browser_windows_from_pod_count( + cast(int, update.scale) + ) if update.config and update.config.exclude: exclude = update.config.exclude diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index c572e15155..eb5bda7dc9 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -10,7 +10,7 @@ from datetime import datetime from uuid import UUID -from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator +from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator, cast from fastapi import Depends, HTTPException from fastapi.responses import StreamingResponse @@ -1546,9 +1546,13 @@ async def scale_crawl( ) if scale.browserWindows: - scale.scale = pod_count_from_browser_windows(scale.browserWindows) + scale.scale = pod_count_from_browser_windows( + cast(int, scale.browserWindows) + ) else: - scale.browserWindows = browser_windows_from_pod_count(scale.scale) + scale.browserWindows = browser_windows_from_pod_count( + cast(int, scale.scale) + ) await ops.update_crawl_scale(crawl_id, org, scale, user) From b0be90bdfa3a24775e4b95cd11faaacb66a6e361 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:05:20 -0400 Subject: [PATCH 28/57] Calculate browserWindows as needed on org import --- backend/btrixcloud/orgs.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index da9d275f21..2a7d758c7b 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -88,6 +88,7 @@ get_duplicate_key_error_field, validate_language_code, JSONSerializer, + browser_windows_from_pod_count, ) if TYPE_CHECKING: @@ -1270,6 +1271,11 @@ async def import_org( workflow_scale = workflow.get("scale", 1) workflow["scale"] = max(workflow_scale, MAX_BROWSER_WINDOWS) + if workflow.get("browserWindows") is None: + workflow["browserWindows"] = browser_windows_from_pod_count( + workflow["scale"] + ) + # Ensure crawlerChannel is set if not workflow.get("crawlerChannel"): workflow["crawlerChannel"] = "default" @@ -1299,6 +1305,13 @@ async def import_org( # Ensure crawlerChannel is set if not item.get("crawlerChannel"): item["crawlerChannel"] = "default" + + # Set browserWindows + if item.get("browserWindows") is None: + item["browserWindows"] = browser_windows_from_pod_count( + item.get("scale", 1) + ) + item_obj = Crawl.from_dict(item) if item["type"] == "upload": item_obj = UploadedCrawl.from_dict(item) # type: ignore From b2fd748bbbc815532b6741c93818e1f16fd515a4 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:06:30 -0400 Subject: [PATCH 
29/57] Fix backend typing --- backend/btrixcloud/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index eb5bda7dc9..69b413bdaa 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1557,7 +1557,7 @@ async def scale_crawl( await ops.update_crawl_scale(crawl_id, org, scale, user) result = await ops.crawl_manager.scale_crawl( - crawl_id, scale.scale, scale.browserWindows + crawl_id, scale.scale, cast(int, scale.browserWindows) ) if not result or not result.get("success"): raise HTTPException( From fd241febf57fa6a6a693a9c39ab1cf5e8b0ed22a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:08:25 -0400 Subject: [PATCH 30/57] Fix frontend compilation errors --- frontend/src/pages/org/workflow-detail.ts | 2 +- frontend/src/types/crawler.ts | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 1f02da4e55..3e46d29f3f 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -2081,7 +2081,7 @@ export class WorkflowDetail extends BtrixElement { try { const data = await this.api.fetch<{ scaled: boolean; - browserWindows: int; + browserWindows: number; }>(`/orgs/${this.orgId}/crawls/${this.lastCrawlId}/scale`, { method: "POST", body: JSON.stringify({ browserWindows: +value }), diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 8fcd8bf2c8..a903be965d 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -56,7 +56,6 @@ export type WorkflowParams = { jobType?: JobType; name: string; schedule: string; - scale: number; browserWindows: number; profileid: string | null; config: SeedConfig; From 89f7e3906d201d6997d57e479ca02a836f6b92f7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:11:45 -0400 Subject: [PATCH 31/57] More backend fixups --- backend/btrixcloud/crawlconfigs.py | 17 +++++++++-------- backend/btrixcloud/crawls.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index c735e08687..c33c2f98af 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -230,9 +230,13 @@ async def add_crawl_config( ) if config_in.browserWindows: - config_in.scale = pod_count_from_browser_windows(config_in.browserWindows) + config_in.scale = pod_count_from_browser_windows( + cast(int, config_in.browserWindows) + ) else: - config_in.browserWindows = browser_windows_from_pod_count(config_in.scale) + config_in.browserWindows = browser_windows_from_pod_count( + cast(int, config_in.scale) + ) # ensure crawlChannel is valid if not self.get_channel_crawler_image(config_in.crawlerChannel): @@ -421,16 +425,13 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) - if update.scale is None and update.browserWindows is None: - raise HTTPException( - status_code=400, detail="browser_windows_or_scale_required" - ) - + # Ensure browserWindows and scale are kept in sync and that + # browserWindows is given priority if update.browserWindows: update.scale = pod_count_from_browser_windows( cast(int, update.browserWindows) ) - else: + elif update.scale: update.browserWindows = browser_windows_from_pod_count( cast(int, update.scale) ) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 
69b413bdaa..6b32d98aec 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1557,7 +1557,7 @@ async def scale_crawl( await ops.update_crawl_scale(crawl_id, org, scale, user) result = await ops.crawl_manager.scale_crawl( - crawl_id, scale.scale, cast(int, scale.browserWindows) + crawl_id, cast(int, scale.scale), cast(int, scale.browserWindows) ) if not result or not result.get("success"): raise HTTPException( From 2d1b152d22ec7e8a1a37a615b8c4cdc5d1a81023 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:18:41 -0400 Subject: [PATCH 32/57] Update CrawlSpec model to have both scale and browser windows --- backend/btrixcloud/operator/crawls.py | 3 ++- backend/btrixcloud/operator/models.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d270b1a430..73b73b59dc 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -172,7 +172,8 @@ async def sync_crawls(self, data: MCSyncData): storage=StorageRef(spec["storageName"]), crawler_channel=spec.get("crawlerChannel", "default"), proxy_id=spec.get("proxyId"), - browser_windows=spec.get("scale", 1), + scale=spec.get("scale", 1), + browser_windows=spec.get("browserWindows", 1), started=data.parent["metadata"]["creationTimestamp"], stopping=spec.get("stopping", False), paused_at=str_to_date(spec.get("pausedAt")), diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index a1f78dccfe..c19de57500 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -74,6 +74,7 @@ class CrawlSpec(BaseModel): cid: UUID oid: UUID org: Organization + scale: int = 1 browser_windows: int = 1 storage: StorageRef started: str From b6911add5dae86e57c86b94ba34dcb3ac9aefce7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 17:27:46 -0400 Subject: [PATCH 33/57] Add browser windows to crawl_job --- backend/btrixcloud/crawlmanager.py | 1 + backend/btrixcloud/k8sapi.py | 4 ++++ chart/app-templates/crawl_job.yaml | 1 + 3 files changed, 6 insertions(+) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 47c30ec2d3..e2fb1b6c2b 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -234,6 +234,7 @@ async def create_crawl_job( str(storage), crawlconfig.crawlerChannel, crawlconfig.scale, + crawlconfig.browserWindows, crawlconfig.crawlTimeout, crawlconfig.maxCrawlSize, manual=True, diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index bfaaabb656..41495f60de 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -85,6 +85,7 @@ def new_crawl_job_yaml( storage: str, crawler_channel: Optional[str] = "", scale: Optional[int] = 1, + browser_windows: Optional[int] = 1, crawl_timeout: Optional[int] = 0, max_crawl_size: Optional[int] = 0, manual: bool = True, @@ -109,6 +110,7 @@ def new_crawl_job_yaml( "storage_name": storage, "crawler_channel": crawler_channel, "scale": scale, + "browser_windows": browser_windows, "timeout": crawl_timeout, "max_crawl_size": max_crawl_size or 0, "manual": "1" if manual else "0", @@ -130,6 +132,7 @@ async def new_crawl_job( storage: str, crawler_channel: Optional[str] = "", scale: Optional[int] = 1, + browser_windows: Optional[int] = 1, crawl_timeout: Optional[int] = 0, max_crawl_size: Optional[int] = 0, manual: bool = True, @@ -148,6 +151,7 @@ async def 
new_crawl_job( storage=storage, crawler_channel=crawler_channel, scale=scale, + browser_windows=browser_windows, crawl_timeout=crawl_timeout, max_crawl_size=max_crawl_size, manual=manual, diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 9f9d966d26..4b749fab92 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -19,6 +19,7 @@ spec: cid: "{{ cid }}" oid: "{{ oid }}" scale: {{ scale }} + browserWindows: {{ browser_windows }} profile_filename: "{{ profile_filename }}" storage_filename: "{{ storage_filename }}" From 85c5254f73cb61e0315945da2452d30d589a4374 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 19:50:35 -0400 Subject: [PATCH 34/57] Keep default of scale=1 if browser windows/scale not specified --- backend/btrixcloud/crawlconfigs.py | 6 +----- backend/btrixcloud/models.py | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index c33c2f98af..66723adf05 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -224,11 +224,7 @@ async def add_crawl_config( ) -> CrawlConfigAddedResponse: """Add new crawl config""" - if config_in.scale is None and config_in.browserWindows is None: - raise HTTPException( - status_code=400, detail="browser_windows_or_scale_required" - ) - + # Overrides scale if set if config_in.browserWindows: config_in.scale = pod_count_from_browser_windows( cast(int, config_in.browserWindows) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 92e87f031f..fefc27215c 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -374,7 +374,9 @@ class CrawlConfigIn(BaseModel): crawlTimeout: int = 0 maxCrawlSize: int = 0 - scale: Optional[Scale] = None + scale: Scale = 1 + + # Overrides scale if set browserWindows: Optional[BrowserWindowCount] = None crawlFilenameTemplate: Optional[str] = None From 2d0646bb5b22c4ffff800ef58e173fb8fd1888c9 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 May 2025 20:26:57 -0400 Subject: [PATCH 35/57] Fix broken org import test --- backend/btrixcloud/orgs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 2a7d758c7b..6909a6b0fc 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -79,6 +79,8 @@ OrgSlugsResponse, OrgImportResponse, OrgPublicProfileUpdate, + MAX_BROWSER_WINDOWS, + MAX_CRAWL_SCALE, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import ( @@ -106,8 +108,6 @@ DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization") -MAX_BROWSER_WINDOWS = int(os.environ.get("MAX_BROWSER_WINDOWS", 8)) - # number of items to delete at a time DEL_ITEMS = 1000 @@ -1267,14 +1267,17 @@ async def import_org( if old_userid and old_userid in user_id_map: workflow[userid_field] = user_id_map[old_userid] - # Ensure scale isn't above max_scale + # Ensure and browser windows don't exceed limits workflow_scale = workflow.get("scale", 1) - workflow["scale"] = max(workflow_scale, MAX_BROWSER_WINDOWS) + workflow["scale"] = max(workflow_scale, MAX_CRAWL_SCALE) if workflow.get("browserWindows") is None: - workflow["browserWindows"] = browser_windows_from_pod_count( + workflow_browser_windows = browser_windows_from_pod_count( workflow["scale"] ) + workflow["browserWindows"] = max( + workflow_browser_windows, MAX_BROWSER_WINDOWS + ) # Ensure 
crawlerChannel is set if not workflow.get("crawlerChannel"): From ae6a4ddb55fe76906bf110d77268665f2a42a4f4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 25 May 2025 14:54:37 -0700 Subject: [PATCH 36/57] cleanup, use desired pod count with page check to avoid starting up more pods than pages --- backend/btrixcloud/operator/crawls.py | 33 ++++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 73b73b59dc..def72fcb87 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -368,7 +368,7 @@ async def sync_crawls(self, data: MCSyncData): print(f"status.scale: {status.scale}", flush=True) print(f"crawl.browser_windows: {crawl.browser_windows}", flush=True) - crawler_pod_count = pod_count_from_browser_windows(crawl.browser_windows) + crawler_pod_count = self.desired_pod_count(crawl, status) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) remainder = crawl.browser_windows % browsers_per_pod @@ -566,11 +566,20 @@ def _qa_configmap_update_needed(self, name, configmap): return False + def desired_pod_count(self, crawl: CrawlSpec, status: CrawlStatus): + """get pod count from browser windows, also ensure not bigger + than pages found""" + desired_scale = pod_count_from_browser_windows(crawl.browser_windows) + + if status.pagesFound < desired_scale: + desired_scale = max(1, status.pagesFound) + + return desired_scale + # pylint: disable=too-many-arguments async def _resolve_scale( self, - crawl_id: str, - desired_browser_windows: int, + crawl: CrawlSpec, redis: Redis, # pylint: disable=unused-argument status: CrawlStatus, @@ -584,20 +593,14 @@ async def _resolve_scale( scale and clean up previous scale state. 
""" + crawl_id = crawl.id + # actual scale (minus redis pod) actual_scale = len(pods) if pods.get(f"redis-{crawl_id}"): actual_scale -= 1 - desired_scale = pod_count_from_browser_windows(desired_browser_windows) - - print(f"actual scale (pods): {actual_scale}", flush=True) - print(f"desired browser windows: {desired_browser_windows}", flush=True) - print(f"desired scale (pods): {desired_scale}", flush=True) - - # ensure at least enough pages for the scale - # if status.pagesFound < desired_scale: - # desired_scale = max(1, status.pagesFound) + desired_scale = self.desired_pod_count(crawl, status) # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: @@ -1532,12 +1535,10 @@ async def update_crawl_state( ) # resolve scale - desired_pod_count = pod_count_from_browser_windows(crawl.browser_windows) + desired_pod_count = self.desired_pod_count(crawl, status) if desired_pod_count != status.scale: - status.scale = await self._resolve_scale( - crawl.id, crawl.browser_windows, redis, status, pods - ) + status.scale = await self._resolve_scale(crawl, redis, status, pods) # check if done / failed status_count: dict[str, int] = {} From 252fc6db04b1a6a0d74c66d0ef1b77dda190f7d7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 May 2025 14:34:27 -0700 Subject: [PATCH 37/57] compute resources for remainder browsers --- backend/btrixcloud/operator/baseoperator.py | 62 ++++++++++----------- backend/btrixcloud/operator/crawls.py | 19 ++++--- 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index b63ff9f4ae..c4f7e76cc3 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -57,40 +57,13 @@ def compute_crawler_resources(self) -> None: except: # default to 1 for now for best results (to revisit in the future) qa_num_workers = 1 - crawler_cpu: float = 0 - crawler_memory: int = 0 - qa_cpu: float = 0 - qa_memory: int = 0 - print("crawler resources") - if not p.get("crawler_cpu"): - base = parse_quantity(p["crawler_cpu_base"]) - extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) - - # cpu is a floating value of cpu cores - crawler_cpu = float(base + (num_workers - 1) * extra) - qa_cpu = float(base + (qa_num_workers - 1) * extra) - print(f"cpu = {base} + {num_workers - 1} * {extra} = {crawler_cpu}") - print(f"qa_cpu = {base} + {qa_num_workers - 1} * {extra} = {qa_cpu}") - else: - crawler_cpu = float(parse_quantity(p["crawler_cpu"])) - qa_cpu = crawler_cpu - print(f"cpu = {crawler_cpu}") + crawler_memory, crawler_cpu = self.compute_for_num_browsers(num_workers) + qa_memory, qa_cpu = self.compute_for_num_browsers(qa_num_workers) - if not p.get("crawler_memory"): - base = parse_quantity(p["crawler_memory_base"]) - extra = parse_quantity(p["crawler_extra_memory_per_browser"]) - - # memory is always an int - crawler_memory = int(base + (num_workers - 1) * extra) - qa_memory = int(base + (qa_num_workers - 1) * extra) - - print(f"memory = {base} + {num_workers - 1} * {extra} = {crawler_memory}") - print(f"qa_memory = {base} + {qa_num_workers - 1} * {extra} = {qa_memory}") - else: - crawler_memory = int(parse_quantity(p["crawler_memory"])) - qa_memory = crawler_memory - print(f"memory = {crawler_memory}") + print("crawler resources") + print(f"cpu = {crawler_cpu} qa: {qa_cpu}") + print(f"memory = {crawler_memory} qa: {qa_memory}") max_crawler_memory_size = 0 max_crawler_memory = 
os.environ.get("MAX_CRAWLER_MEMORY") @@ -108,6 +81,31 @@ def compute_crawler_resources(self) -> None: p["qa_memory"] = qa_memory p["qa_workers"] = qa_num_workers + def compute_for_num_browsers(self, num_browsers) -> tuple[int, float]: + """compute memory, cpu for given num of browsers""" + p = self.shared_params + + if not p.get("crawler_memory"): + base = parse_quantity(p["crawler_memory_base"]) + extra = parse_quantity(p["crawler_extra_memory_per_browser"]) + + # memory is always an int + crawler_memory = int(base + (num_browsers - 1) * extra) + else: + crawler_memory = int(parse_quantity(p["crawler_memory"])) + + if not p.get("crawler_cpu"): + base = parse_quantity(p["crawler_cpu_base"]) + extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) + + # cpu is a floating value of cpu cores + crawler_cpu = float(base + (num_browsers - 1) * extra) + + else: + crawler_cpu = float(parse_quantity(p["crawler_cpu"])) + + return crawler_memory, crawler_cpu + def compute_profile_resources(self) -> None: """compute memory /cpu resources for a single profile browser""" p = self.shared_params diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index def72fcb87..0e7f57dae3 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -516,22 +516,27 @@ def _load_crawler( worker_field = "crawler_workers" pri_class = f"crawl-pri-{i}" + if pod_remainder: + memory, cpu = self.k8s.compute_for_num_browsers(pod_remainder) + workers = pod_remainder + print("pod remainder cpu, memory", cpu, memory) + else: + cpu = params.get(cpu_field) + memory = params.get(mem_field) + workers = params.get(worker_field) or 1 + pod_info = status.podStatus[name] params["name"] = name params["priorityClassName"] = pri_class - params["cpu"] = pod_info.newCpu or params.get(cpu_field) - params["memory"] = pod_info.newMemory or params.get(mem_field) + params["cpu"] = pod_info.newCpu or cpu + params["memory"] = pod_info.newMemory or memory + params["workers"] = workers if self.k8s.enable_auto_resize: params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING else: params["memory_limit"] = self.k8s.max_crawler_memory_size params["storage"] = pod_info.newStorage or params.get("crawler_storage") - if pod_remainder: - params["workers"] = pod_remainder - else: - params["workers"] = params.get(worker_field) or 1 - params["init_crawler"] = not is_paused if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) From 67cb22a0f12ccab7124dc1fc041dcbe12a81cdac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 May 2025 19:20:24 -0700 Subject: [PATCH 38/57] additional fixes for N browses: - ensure correct adjustment on frontend for unequal browser amounts - ensure pods correctly restarted if number of browser for each pod changes, track lastWorkers per pod - ensure if both max browsers and max scale is specified, max browsers if chosen, compute max scale from max browsers, otherwise compute max browsers from max scale --- backend/btrixcloud/main.py | 2 + backend/btrixcloud/models.py | 15 ++++- backend/btrixcloud/operator/baseoperator.py | 16 +++-- backend/btrixcloud/operator/crawls.py | 70 +++++++++++---------- backend/btrixcloud/operator/models.py | 3 +- frontend/src/components/screencast.ts | 36 +++++++++-- frontend/src/pages/org/workflow-detail.ts | 2 + frontend/src/pages/org/workflows-new.ts | 2 +- frontend/src/utils/app.ts | 2 + 9 files changed, 100 insertions(+), 48 deletions(-) diff --git 
a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index af19f0d69c..6b3ecefb8b 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -112,6 +112,7 @@ class SettingsResponse(BaseModel): defaultPageLoadTimeSeconds: int maxPagesPerCrawl: int + numBrowsersPerInstance: int maxBrowserWindows: int billingEnabled: bool @@ -148,6 +149,7 @@ def main() -> None: os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120) ), maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)), + numBrowsersPerInstance=int(os.environ.get("NUM_BROWSERS", 1)), maxBrowserWindows=int(os.environ.get("MAX_BROWSER_WINDOWS", 8)), billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")), signUpUrl=os.environ.get("SIGN_UP_URL", ""), diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index fefc27215c..a812978142 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -8,6 +8,7 @@ import base64 import hashlib import mimetypes +import math import os from typing import Optional, List, Dict, Union, Literal, Any, get_args @@ -30,11 +31,19 @@ from .db import BaseMongoModel -# crawl scale for constraint -MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) +# num browsers per crawler instance +NUM_BROWSERS = int(os.environ.get("NUM_BROWSERS", 2)) # browser window for constraint (preferred over scale if provided) -MAX_BROWSER_WINDOWS = int(os.environ.get("MAX_BROWSER_WINDOWS", 8)) +MAX_BROWSER_WINDOWS = os.environ.get("MAX_BROWSER_WINDOWS") or 0 + +# crawl scale for constraint +if MAX_BROWSER_WINDOWS: + MAX_BROWSER_WINDOWS = int(MAX_BROWSER_WINDOWS) + MAX_CRAWL_SCALE = math.ceil(MAX_BROWSER_WINDOWS / NUM_BROWSERS) +else: + MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) + MAX_BROWSER_WINDOWS = MAX_CRAWL_SCALE * NUM_BROWSERS # Presign duration must be less than 604800 seconds (one week), # so set this one minute short of a week diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index c4f7e76cc3..8351c55d12 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -58,7 +58,9 @@ def compute_crawler_resources(self) -> None: # default to 1 for now for best results (to revisit in the future) qa_num_workers = 1 - crawler_memory, crawler_cpu = self.compute_for_num_browsers(num_workers) + crawler_memory, crawler_cpu = self.compute_for_num_browsers( + num_workers, p.get("crawler_memory"), p.get("crawler_cpu") + ) qa_memory, qa_cpu = self.compute_for_num_browsers(qa_num_workers) print("crawler resources") @@ -81,20 +83,22 @@ def compute_crawler_resources(self) -> None: p["qa_memory"] = qa_memory p["qa_workers"] = qa_num_workers - def compute_for_num_browsers(self, num_browsers) -> tuple[int, float]: + def compute_for_num_browsers( + self, num_browsers, crawler_memory_fixed="", crawler_cpu_fixed="" + ) -> tuple[int, float]: """compute memory, cpu for given num of browsers""" p = self.shared_params - if not p.get("crawler_memory"): + if not crawler_memory_fixed: base = parse_quantity(p["crawler_memory_base"]) extra = parse_quantity(p["crawler_extra_memory_per_browser"]) # memory is always an int crawler_memory = int(base + (num_browsers - 1) * extra) else: - crawler_memory = int(parse_quantity(p["crawler_memory"])) + crawler_memory = int(parse_quantity(crawler_memory_fixed)) - if not p.get("crawler_cpu"): + if not crawler_cpu_fixed: base = parse_quantity(p["crawler_cpu_base"]) extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) @@ -102,7 
+106,7 @@ def compute_for_num_browsers(self, num_browsers) -> tuple[int, float]: crawler_cpu = float(base + (num_browsers - 1) * extra) else: - crawler_cpu = float(parse_quantity(p["crawler_cpu"])) + crawler_cpu = float(parse_quantity(crawler_cpu_fixed)) return crawler_memory, crawler_cpu diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 0e7f57dae3..e772c42ecd 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -368,21 +368,20 @@ async def sync_crawls(self, data: MCSyncData): print(f"status.scale: {status.scale}", flush=True) print(f"crawl.browser_windows: {crawl.browser_windows}", flush=True) - crawler_pod_count = self.desired_pod_count(crawl, status) - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + crawler_pod_count = pod_count_from_browser_windows(crawl.browser_windows) - remainder = crawl.browser_windows % browsers_per_pod - remainder_changed = (status.lastBrowserWindows % browsers_per_pod) != remainder - print(f"remainder: {remainder}, changed: {remainder_changed}") - status.lastBrowserWindows = crawl.browser_windows + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) for i in range(0, crawler_pod_count): + if status.pagesFound < i * browsers_per_pod: + break + children.extend( self._load_crawler( params, i, - remainder if i == crawler_pod_count - 1 else 0, - remainder_changed, + crawler_pod_count, + crawl.browser_windows, status, data.children, is_paused, @@ -496,8 +495,8 @@ def _load_crawler( self, params, i: int, - pod_remainder: int, - remainder_changed: bool, + total_pods: int, + total_browser_windows: int, status: CrawlStatus, children, is_paused: bool, @@ -516,16 +515,30 @@ def _load_crawler( worker_field = "crawler_workers" pri_class = f"crawl-pri-{i}" - if pod_remainder: - memory, cpu = self.k8s.compute_for_num_browsers(pod_remainder) - workers = pod_remainder - print("pod remainder cpu, memory", cpu, memory) + browsers_per_pod = params.get(worker_field) or 1 + + # if last pod, compute remaining browsers, or full amount if 0 + if i == total_pods - 1: + workers = (total_browser_windows % browsers_per_pod) or browsers_per_pod + else: + workers = browsers_per_pod + + # scale resources if < full browsers_per_pod + if workers < browsers_per_pod: + memory, cpu = self.k8s.compute_for_num_browsers(workers) else: cpu = params.get(cpu_field) memory = params.get(mem_field) - workers = params.get(worker_field) or 1 pod_info = status.podStatus[name] + + # compute if number of browsers for this pod has changed + workers_changed = pod_info.lastWorkers != workers + if workers_changed: + print(f"Workers changed for {i}: {pod_info.lastWorkers} -> {workers}") + + pod_info.lastWorkers = workers + params["name"] = name params["priorityClassName"] = pri_class params["cpu"] = pod_info.newCpu or cpu @@ -540,7 +553,7 @@ def _load_crawler( params["init_crawler"] = not is_paused if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) - if not restart_reason and remainder_changed: + if not restart_reason and workers_changed: restart_reason = "pod_resized" if restart_reason: @@ -571,16 +584,6 @@ def _qa_configmap_update_needed(self, name, configmap): return False - def desired_pod_count(self, crawl: CrawlSpec, status: CrawlStatus): - """get pod count from browser windows, also ensure not bigger - than pages found""" - desired_scale = pod_count_from_browser_windows(crawl.browser_windows) - - if status.pagesFound < desired_scale: - desired_scale = 
max(1, status.pagesFound) - - return desired_scale - # pylint: disable=too-many-arguments async def _resolve_scale( self, @@ -598,6 +601,14 @@ async def _resolve_scale( scale and clean up previous scale state. """ + desired_scale = pod_count_from_browser_windows(crawl.browser_windows) + + if status.pagesFound < desired_scale: + desired_scale = max(1, status.pagesFound) + + if desired_scale == status.scale: + return status.scale + crawl_id = crawl.id # actual scale (minus redis pod) @@ -605,8 +616,6 @@ async def _resolve_scale( if pods.get(f"redis-{crawl_id}"): actual_scale -= 1 - desired_scale = self.desired_pod_count(crawl, status) - # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: return desired_scale @@ -1540,10 +1549,7 @@ async def update_crawl_state( ) # resolve scale - desired_pod_count = self.desired_pod_count(crawl, status) - - if desired_pod_count != status.scale: - status.scale = await self._resolve_scale(crawl, redis, status, pods) + await self._resolve_scale(crawl, redis, status, pods) # check if done / failed status_count: dict[str, int] = {} diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index c19de57500..02f4d2a244 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -144,6 +144,8 @@ class PodInfo(BaseModel): evicted: Optional[bool] = False + lastWorkers: Optional[int] = 0 + def dict(self, *a, **kw): res = super().dict(*a, **kw) percent = { @@ -208,7 +210,6 @@ class CrawlStatus(BaseModel): sizeHuman: str = "" # number of pods scale: int = 1 - lastBrowserWindows: int = 1 filesAdded: int = 0 filesAddedSize: int = 0 finished: Optional[str] = None diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index 47710cc020..affa762622 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -132,6 +132,15 @@ export class Screencast extends BtrixElement { @property({ type: Number }) browserWindows = 1; + @property({ type: Number }) + numBrowsersPerInstance = 1; + + @state() + private scale = 1; + + @state() + private lastIndexOffset = 0; + // List of browser screens @state() private dataMap: { [index: string | number]: ScreencastMessage | null } = {}; @@ -146,9 +155,18 @@ export class Screencast extends BtrixElement { private readonly timerIds: number[] = []; protected firstUpdated() { + this.updateScale(); + // Connect to websocket server this.connectAll(); } + protected updateScale() { + const remainder = this.browserWindows % this.numBrowsersPerInstance; + this.scale = Math.ceil(this.browserWindows / this.numBrowsersPerInstance); + this.lastIndexOffset = remainder + ? 
(this.scale - 1) * (this.numBrowsersPerInstance - remainder) + : 0; + } async updated( changedProperties: PropertyValues & Map, @@ -161,6 +179,9 @@ export class Screencast extends BtrixElement { this.disconnectAll(); this.connectAll(); } + if (changedProperties.has("browserWindows")) { + this.updateScale(); + } const prevWindows = changedProperties.get("browserWindows"); if (prevWindows !== undefined) { if (this.browserWindows > prevWindows) { @@ -251,7 +272,7 @@ export class Screencast extends BtrixElement { } private scaleDown() { - for (let idx = this.wsMap.size - 1; idx > this.browserWindows - 1; idx--) { + for (let idx = this.wsMap.size - 1; idx > this.scale - 1; idx--) { const ws = this.wsMap.get(idx); if (ws) { @@ -269,7 +290,7 @@ export class Screencast extends BtrixElement { return; } - for (let idx = 0; idx < this.browserWindows; idx++) { + for (let idx = 0; idx < this.scale; idx++) { if (!this.wsMap.get(idx)) { const ws = this.connectWs(idx); @@ -298,17 +319,21 @@ export class Screencast extends BtrixElement { private handleMessage( message: InitMessage | ScreencastMessage | CloseMessage, + isLast: boolean, ) { if (message.msg === "init") { const dataMap: Record = {}; - for (let i = 0; i < message.browsers * this.browserWindows; i++) { + for (let i = 0; i < this.browserWindows; i++) { dataMap[i] = null; } this.dataMap = dataMap; this.screenWidth = message.width; this.screenHeight = message.height; } else { - const { id } = message; + let { id } = message; + if (isLast) { + id += this.lastIndexOffset; + } const dataMap = { ...this.dataMap }; if (message.msg === "screencast") { @@ -344,6 +369,7 @@ export class Screencast extends BtrixElement { ws.addEventListener("message", ({ data }: MessageEvent) => { this.handleMessage( JSON.parse(data) as InitMessage | ScreencastMessage | CloseMessage, + index === this.scale - 1, ); }); @@ -360,7 +386,7 @@ export class Screencast extends BtrixElement { }): void { const { index, retries = 0, delaySec = 10 } = opts; - if (index >= this.browserWindows) { + if (index >= this.scale) { return; } diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 3e46d29f3f..8b1c807254 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -1788,6 +1788,8 @@ export class WorkflowDetail extends BtrixElement {
diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 6ed7f408c7..b63112019f 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -66,7 +66,7 @@ export class WorkflowsNew extends LiteElement { crawlTimeout: null, maxCrawlSize: null, jobType: "custom", - browserWindows: 2, + browserWindows: this.appState.settings?.numBrowsersPerInstance || 1, autoAddCollections: [], crawlerChannel: "default", proxyId: null, diff --git a/frontend/src/utils/app.ts b/frontend/src/utils/app.ts index a4a7088106..e6c8670aeb 100644 --- a/frontend/src/utils/app.ts +++ b/frontend/src/utils/app.ts @@ -8,6 +8,7 @@ export type AppSettings = { defaultBehaviorTimeSeconds: number; defaultPageLoadTimeSeconds: number; maxPagesPerCrawl: number; + numBrowsersPerInstance: number; maxBrowserWindows: number; billingEnabled: boolean; signUpUrl: string; @@ -36,6 +37,7 @@ export async function getAppSettings(): Promise { defaultBehaviorTimeSeconds: 0, defaultPageLoadTimeSeconds: 0, maxPagesPerCrawl: 0, + numBrowsersPerInstance: 1, maxBrowserWindows: 4, billingEnabled: false, signUpUrl: "", From 9f466b952fe8381cba6ffac236578595efc15018 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 May 2025 20:02:51 -0700 Subject: [PATCH 39/57] ensure backwards compatible with max_crawl_scale if no max_browser_windows --- chart/templates/configmap.yaml | 1 + chart/templates/priorities.yaml | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 75d0eef236..4e681840aa 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -49,6 +49,7 @@ data: FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}" + MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}" MAX_BROWSER_WINDOWS: "{{ .Values.max_browser_windows | default 8 }}" LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}" diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 416b5a0319..97d3af72fb 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -1,5 +1,7 @@ +{{ $max_browser_windows := not (empty .Values.max_browser_windows) | ternary (int .Values.max_browser_windows) (mul (int .Values.max_crawl_scale) (int .Values.crawler_browser_instances) ) }} -{{- range untilStep 0 (int .Values.max_browser_windows) 1 }} + +{{- range untilStep 0 $max_browser_windows 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass @@ -11,7 +13,7 @@ description: "Priority for crawl instance #{{ . 
}}" {{- end }} -{{- range untilStep 0 (int .Values.max_browser_windows) 1 }} +{{- range untilStep 0 $max_browser_windows 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass From ae50e6e4fb859f56665672813e34146cce884365 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 May 2025 20:04:40 -0700 Subject: [PATCH 40/57] fix tests --- backend/test/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/test/test_api.py b/backend/test/test_api.py index a3fba6c00e..53cae1248f 100644 --- a/backend/test/test_api.py +++ b/backend/test/test_api.py @@ -44,6 +44,7 @@ def test_api_settings(): "defaultBehaviorTimeSeconds": 300, "maxPagesPerCrawl": 4, "maxBrowserWindows": 8, + "numBrowsersPerInstance": 2, "defaultPageLoadTimeSeconds": 120, "billingEnabled": True, "signUpUrl": "", From 844f253a17467406c9ca5e023b7fe3decd183ada Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 May 2025 20:49:45 -0700 Subject: [PATCH 41/57] switch browser windows to text box --- .../crawl-workflows/workflow-editor.ts | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 16cdd4872f..e40d797f66 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -32,8 +32,6 @@ import { state, } from "lit/decorators.js"; import { ifDefined } from "lit/directives/if-defined.js"; -import { map } from "lit/directives/map.js"; -import { range } from "lit/directives/range.js"; import { when } from "lit/directives/when.js"; import compact from "lodash/fp/compact"; import flow from "lodash/fp/flow"; @@ -1578,23 +1576,18 @@ https://archiveweb.page/images/${"logo.svg"}`} ] : nothing} ${inputCol(html` - - this.updateFormState({ - browserWindows: +(e.target as SlCheckbox).value, - })} - > - ${map( - range(this.appState.settings?.maxBrowserWindows || 1), - (i: number) => - html` ${i + 1}`, + value=${this.formState.browserWindows || ""} + placeholder=${defaultLabel( + this.appState.settings?.numBrowsersPerInstance, )} - + min="1" + max="${this.appState.settings?.maxBrowserWindows || 1}" + type="number" + inputmode="numeric" + > `)} ${this.renderHelpTextCol( html`${msg( From 79779772b97ffe4fc94706d9a50c385845b47e1b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 10:05:25 -0400 Subject: [PATCH 42/57] Calculate scale at time of need instead of storing in db This will keep scale and browserWindows from getting out of sync when the number of workers per pod on an instance changes. 
--- backend/btrixcloud/crawlconfigs.py | 18 +++--------------- backend/btrixcloud/crawlmanager.py | 6 ++++-- backend/btrixcloud/crawls.py | 12 +++++------- backend/btrixcloud/models.py | 6 ++---- 4 files changed, 14 insertions(+), 28 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 66723adf05..f6917e74f7 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -52,7 +52,6 @@ validate_regexes, validate_language_code, is_url, - pod_count_from_browser_windows, browser_windows_from_pod_count, ) @@ -225,11 +224,7 @@ async def add_crawl_config( """Add new crawl config""" # Overrides scale if set - if config_in.browserWindows: - config_in.scale = pod_count_from_browser_windows( - cast(int, config_in.browserWindows) - ) - else: + if config_in.browserWindows is None: config_in.browserWindows = browser_windows_from_pod_count( cast(int, config_in.scale) ) @@ -284,7 +279,6 @@ async def add_crawl_config( jobType=config_in.jobType, crawlTimeout=config_in.crawlTimeout, maxCrawlSize=config_in.maxCrawlSize, - scale=config_in.scale, browserWindows=config_in.browserWindows, autoAddCollections=config_in.autoAddCollections, profileid=profileid, @@ -421,16 +415,11 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) - # Ensure browserWindows and scale are kept in sync and that - # browserWindows is given priority - if update.browserWindows: - update.scale = pod_count_from_browser_windows( - cast(int, update.browserWindows) - ) - elif update.scale: + if update.scale: update.browserWindows = browser_windows_from_pod_count( cast(int, update.scale) ) + update.scale = None if update.config and update.config.exclude: exclude = update.config.exclude @@ -465,7 +454,6 @@ async def update_crawl_config( changed = changed or ( self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate") ) - changed = changed or self.check_attr_changed(orig_crawl_config, update, "scale") changed = changed or self.check_attr_changed( orig_crawl_config, update, "browserWindows" ) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index e2fb1b6c2b..be4832f45e 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -8,7 +8,7 @@ from fastapi import HTTPException -from .utils import dt_now, date_to_str +from .utils import dt_now, date_to_str, pod_count_from_browser_windows from .k8sapi import K8sAPI from .models import StorageRef, CrawlConfig, BgJobType @@ -227,13 +227,15 @@ async def create_crawl_job( await self.has_storage_secret(storage_secret) + scale = pod_count_from_browser_windows(crawlconfig.browserWindows) + return await self.new_crawl_job( cid, userid, str(crawlconfig.oid), str(storage), crawlconfig.crawlerChannel, - crawlconfig.scale, + scale, crawlconfig.browserWindows, crawlconfig.crawlTimeout, crawlconfig.maxCrawlSize, diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 6b32d98aec..5e597c1397 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -394,22 +394,20 @@ async def add_new_crawl( pass async def update_crawl_scale( - self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User + self, crawl_id: str, org: Organization, browser_windows: int, user: User ) -> bool: """Update crawl scale in the db""" crawl = await self.get_crawl(crawl_id, org) - update = UpdateCrawlConfig( - scale=crawl_scale.scale, browserWindows=crawl_scale.browserWindows - ) + 
update = UpdateCrawlConfig(browserWindows=browser_windows) + await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, { "$set": { - "scale": crawl_scale.scale, - "browserWindows": crawl_scale.browserWindows, + "browserWindows": browser_windows, } }, return_document=pymongo.ReturnDocument.AFTER, @@ -1554,7 +1552,7 @@ async def scale_crawl( cast(int, scale.scale) ) - await ops.update_crawl_scale(crawl_id, org, scale, user) + await ops.update_crawl_scale(crawl_id, org, scale.browserWindows, user) result = await ops.crawl_manager.scale_crawl( crawl_id, cast(int, scale.scale), cast(int, scale.browserWindows) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a812978142..6497f516be 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -407,7 +407,7 @@ class ConfigRevision(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 - scale: Scale = 1 + scale: Optional[Scale] = 1 browserWindows: Optional[BrowserWindowCount] = 2 modified: datetime @@ -429,7 +429,6 @@ class CrawlConfigCore(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 - scale: Scale = 1 browserWindows: BrowserWindowCount = 2 oid: UUID @@ -894,7 +893,6 @@ class CrawlOut(BaseMongoModel): pausedAt: Optional[datetime] = None manual: bool = False cid_rev: Optional[int] = None - scale: Scale = 1 browserWindows: BrowserWindowCount = 2 storageQuotaReached: Optional[bool] = False @@ -980,7 +978,7 @@ class MatchCrawlQueueResponse(BaseModel): # ============================================================================ class CrawlScale(BaseModel): - """scale the crawl to N parallel containers""" + """scale the crawl to N parallel containers or windows""" scale: Optional[Scale] = None browserWindows: Optional[BrowserWindowCount] = None From c6275aa34377c462db9e996abdc9f954bbc72499 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 10:18:42 -0400 Subject: [PATCH 43/57] Update org import for change --- backend/btrixcloud/orgs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 6909a6b0fc..2bd7938a6d 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -1267,13 +1267,11 @@ async def import_org( if old_userid and old_userid in user_id_map: workflow[userid_field] = user_id_map[old_userid] - # Ensure and browser windows don't exceed limits - workflow_scale = workflow.get("scale", 1) - workflow["scale"] = max(workflow_scale, MAX_CRAWL_SCALE) - + # Convert scale to browser windows and respect limits + workflow_scale = max(workflow.get("scale", 1), MAX_CRAWL_SCALE) if workflow.get("browserWindows") is None: workflow_browser_windows = browser_windows_from_pod_count( - workflow["scale"] + workflow_scale ) workflow["browserWindows"] = max( workflow_browser_windows, MAX_BROWSER_WINDOWS @@ -1310,10 +1308,12 @@ async def import_org( item["crawlerChannel"] = "default" # Set browserWindows - if item.get("browserWindows") is None: - item["browserWindows"] = browser_windows_from_pod_count( + browser_windows = item.get("browserWindows") + if browser_windows is None: + browser_windows = browser_windows_from_pod_count( item.get("scale", 1) ) + item["browserWindows"] = max(browser_windows, MAX_BROWSER_WINDOWS) item_obj = Crawl.from_dict(item) if item["type"] == "upload": From 
eaef86f76e5aec2333db009041e94b9468104a7a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 10:28:09 -0400 Subject: [PATCH 44/57] More fixups for removing scale from db --- backend/btrixcloud/crawlmanager.py | 3 ++- backend/btrixcloud/crawls.py | 4 ++-- .../migrations/migration_0005_operator_scheduled_jobs.py | 2 +- backend/btrixcloud/operator/cronjobs.py | 4 +++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index be4832f45e..1051c241fb 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -261,7 +261,8 @@ async def update_running_crawl_config( # pylint: disable=use-dict-literal patch = dict( crawlerChannel=crawlconfig.crawlerChannel, - scale=crawlconfig.scale, + scale=pod_count_from_browser_windows(crawlconfig.browserWindows), + browserWindows=crawlconfig.browserWindows, timeout=crawlconfig.crawlTimeout, maxCrawlSize=crawlconfig.maxCrawlSize, proxyId=crawlconfig.proxyId or DEFAULT_PROXY_ID, diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5e597c1397..e18b57c17a 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -370,7 +370,7 @@ async def add_new_crawl( oid=crawlconfig.oid, cid=crawlconfig.id, cid_rev=crawlconfig.rev, - scale=crawlconfig.scale, + browserWindows=crawlconfig.browserWindows, jobType=crawlconfig.jobType, config=crawlconfig.config, profileid=crawlconfig.profileid, @@ -537,7 +537,7 @@ async def add_or_remove_exclusion( cid = crawl.cid - browser_windows = crawl.scale or 1 + browser_windows = crawl.browserWindows or 2 async with self.get_redis(crawl_id) as redis: query = { diff --git a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py index 6e2d005ab2..1f70e0cf6d 100644 --- a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py +++ b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py @@ -43,7 +43,7 @@ async def migrate_up(self): config = CrawlConfig.from_dict(config_dict) print( f"Updating Crawl Config {config.id}: schedule: {config.schedule}, " - + f"timeout: {config.crawlTimeout}, scale: {config.scale}" + + f"timeout: {config.crawlTimeout}" ) try: await crawl_manager.update_scheduled_job(config) diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 9a411431e5..5a3ba7d5ac 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -9,6 +9,7 @@ from .baseoperator import BaseOperator from ..models import CrawlConfig +from ..utils import pod_count_from_browser_windows # pylint: disable=too-many-locals @@ -129,7 +130,8 @@ async def make_new_crawljob( oid=str(oid), storage=str(org.storage), crawler_channel=crawlconfig.crawlerChannel or "default", - scale=crawlconfig.scale, + scale=pod_count_from_browser_windows(crawlconfig.browserWindows), + browser_windows=crawlconfig.browserWindows, crawl_timeout=crawlconfig.crawlTimeout, max_crawl_size=crawlconfig.maxCrawlSize, manual=False, From bb29cc5e62296af7fa123c32c5bd87d43da29cab Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 10:37:21 -0400 Subject: [PATCH 45/57] Add migration to convert scale to browserWindows in db Comment out unsetting of scale for now for easier testing on dev --- backend/btrixcloud/db.py | 2 +- ...migration_0047_scale_to_browser_windows.py | 68 +++++++++++++++++++ 2 files changed, 69 
insertions(+), 1 deletion(-) create mode 100644 backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index e27b499dd4..cbdd252d2e 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -32,7 +32,7 @@ ) = PageOps = BackgroundJobOps = object -CURR_DB_VERSION = "0046" +CURR_DB_VERSION = "0047" # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py new file mode 100644 index 0000000000..03e547bab4 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py @@ -0,0 +1,68 @@ +""" +Migration 0047 - Convert scale to browserWindows +""" + +from btrixcloud.migrations import BaseMigration +from btrixcloud.utils import browser_windows_from_pod_count + + +MIGRATION_VERSION = "0047" + + +# pylint: disable=duplicate-code +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Calculate and store browserWindows from existing scale on workflows and crawls + """ + configs_mdb = self.mdb["crawl_configs"] + crawls_mdb = self.mdb["crawls"] + + async for config_raw in configs_mdb.find({"browserWindows": None}): + config_id = config_raw["_id"] + scale = config_raw.get("scale", 1) + + try: + await configs_mdb.find_one_and_update( + {"_id": config_id}, + { + "$set": { + "browserWindows": browser_windows_from_pod_count(scale) + }, + # "$unset": {"scale": 1} + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set browser windows from scale for workflow {config_id}: {err}", + flush=True, + ) + + async for crawl_raw in crawls_mdb.find({"browserWindows": None}): + crawl_id = crawl_raw["_id"] + scale = crawl_raw.get("scale", 1) + + try: + await crawls_mdb.find_one_and_update( + {"_id": crawl_id}, + { + "$set": { + "browserWindows": browser_windows_from_pod_count(scale) + }, + # "$unset": {"scale": 1} + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set browser windows from scale for crawl {crawl_id}: {err}", + flush=True, + ) From c9014dbf9d6c3f56f0b2a80fc197724b7f49c387 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 27 May 2025 10:23:11 -0700 Subject: [PATCH 46/57] frontend: custom range for browser windows, by 1 until browser instances, then in multiples through max --- .../crawl-workflows/workflow-editor.ts | 25 +++++++++++------- frontend/src/pages/org/workflow-detail.ts | 5 ++-- frontend/src/utils/workflow.ts | 26 ++++++++++++++++++- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index e40d797f66..0a1271c99f 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -32,6 +32,7 @@ import { state, } from "lit/decorators.js"; import { ifDefined } from "lit/directives/if-defined.js"; +import { map } from "lit/directives/map.js"; import { when } from "lit/directives/when.js"; import compact from "lodash/fp/compact"; import flow from "lodash/fp/flow"; @@ -113,6 +114,7 @@ import { 
getInitialFormState,
   getServerDefaults,
   makeUserGuideEvent,
+  rangeBrowserWindows,
   SECTIONS,
   workflowTabToGuideHash,
   type FormState,
@@ -1576,18 +1578,23 @@ https://archiveweb.page/images/${"logo.svg"}`}
               ]
             : nothing}
           ${inputCol(html`
-            <sl-input …
+            <sl-select …
+              @sl-change=${(e: Event) =>
+                this.updateFormState({
+                  browserWindows: +(e.target as SlCheckbox).value,
+                })}
+            >
+              ${map(
+                rangeBrowserWindows(this.appState.settings),
+                (i: number) =>
+                  html`<sl-option value=${i}>${i}</sl-option>`,
+              )}
-              min="1"
-              max="${this.appState.settings?.maxBrowserWindows || 1}"
-              type="number"
-              inputmode="numeric"
-            >
+            </sl-select>
           `)}
           ${this.renderHelpTextCol(
             html`${msg(
diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts
index 8b1c807254..b62d99d7da 100644
--- a/frontend/src/pages/org/workflow-detail.ts
+++ b/frontend/src/pages/org/workflow-detail.ts
@@ -41,6 +41,7 @@ import { humanizeExecutionSeconds } from "@/utils/executionTimeFormatter";
 import { isArchivingDisabled } from "@/utils/orgs";
 import { pluralOf } from "@/utils/pluralize";
 import { tw } from "@/utils/tailwind";
+import { rangeBrowserWindows } from "@/utils/workflow";

 const POLL_INTERVAL_SECONDS = 10;
 const CRAWLS_PAGINATION_NAME = "crawlsPage";
@@ -1789,7 +1790,7 @@ export class WorkflowDetail extends BtrixElement {
           authToken=${authToken}
           .crawlId=${this.lastCrawlId ?? undefined}
           numBrowsersPerInstance=${this.appState.settings
-            ?.numBrowsersPerInstance}
+            ?.numBrowsersPerInstance || 1}
           browserWindows=${workflow.browserWindows}
         >
@@ -2003,7 +2004,7 @@ export class WorkflowDetail extends BtrixElement { const scaleOptions = []; if (this.appState.settings) { - for (let value = 1; value <= this.maxBrowserWindows; value++) { + for (const value of rangeBrowserWindows(this.appState.settings)) { scaleOptions.push({ value, label: value, diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index 291f384572..2266fcd743 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -1,7 +1,7 @@ import { msg, str } from "@lit/localize"; import { z } from "zod"; -import { getAppSettings } from "./app"; +import { getAppSettings, type AppSettings } from "./app"; import type { Tags } from "@/components/ui/tag-input"; import type { UserGuideEventMap } from "@/index"; @@ -376,3 +376,27 @@ export async function getServerDefaults(): Promise { return defaults; } + +export function* rangeBrowserWindows( + settings: AppSettings | null, +): Iterable { + console.log("range"); + if (!settings) { + yield 1; + return; + } + + const { numBrowsersPerInstance, maxBrowserWindows } = settings; + + for (let i = 1; i < numBrowsersPerInstance; i++) { + yield i; + } + + for ( + let i = numBrowsersPerInstance; + i <= maxBrowserWindows; + i += numBrowsersPerInstance + ) { + yield i; + } +} From 5df33dee541e707e019c69adc4a8f0360d632f45 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 13:48:22 -0400 Subject: [PATCH 47/57] Don't unset scale in migration --- .../migrations/migration_0047_scale_to_browser_windows.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py index 03e547bab4..29a3f5c214 100644 --- a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py +++ b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py @@ -36,7 +36,6 @@ async def migrate_up(self): "$set": { "browserWindows": browser_windows_from_pod_count(scale) }, - # "$unset": {"scale": 1} }, ) # pylint: disable=broad-exception-caught @@ -57,7 +56,6 @@ async def migrate_up(self): "$set": { "browserWindows": browser_windows_from_pod_count(scale) }, - # "$unset": {"scale": 1} }, ) # pylint: disable=broad-exception-caught From 49221794005dda2d8b4159530829c767083eac90 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 16:17:46 -0400 Subject: [PATCH 48/57] Store scale in crawl object --- backend/btrixcloud/crawls.py | 1 + backend/btrixcloud/models.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index e18b57c17a..a0c7653688 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -370,6 +370,7 @@ async def add_new_crawl( oid=crawlconfig.oid, cid=crawlconfig.id, cid_rev=crawlconfig.rev, + scale=pod_count_from_browser_windows(crawlconfig.browserWindows), browserWindows=crawlconfig.browserWindows, jobType=crawlconfig.jobType, config=crawlconfig.config, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 6497f516be..5f06d29250 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -429,6 +429,8 @@ class CrawlConfigCore(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 + + scale: Optional[Scale] = None browserWindows: BrowserWindowCount = 2 oid: UUID @@ -893,6 +895,7 @@ class CrawlOut(BaseMongoModel): pausedAt: Optional[datetime] = None manual: bool = False cid_rev: 
Optional[int] = None + scale: Optional[Scale] = None browserWindows: BrowserWindowCount = 2 storageQuotaReached: Optional[bool] = False From 4166e322f1134bd586b85bc044ef518e3eecb2cf Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 16:22:23 -0400 Subject: [PATCH 49/57] Update scale in crawl model when crawl is live rescaled --- backend/btrixcloud/crawls.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index a0c7653688..8529879ebc 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -395,19 +395,24 @@ async def add_new_crawl( pass async def update_crawl_scale( - self, crawl_id: str, org: Organization, browser_windows: int, user: User + self, + crawl_id: str, + org: Organization, + scale: int, + browser_windows: int, + user: User, ) -> bool: """Update crawl scale in the db""" crawl = await self.get_crawl(crawl_id, org) update = UpdateCrawlConfig(browserWindows=browser_windows) - await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, { "$set": { + "scale": scale, "browserWindows": browser_windows, } }, @@ -1553,10 +1558,13 @@ async def scale_crawl( cast(int, scale.scale) ) - await ops.update_crawl_scale(crawl_id, org, scale.browserWindows, user) + num_scale = cast(int, scale.scale) + browser_windows = cast(int, scale.browserWindows) + + await ops.update_crawl_scale(crawl_id, org, num_scale, browser_windows, user) result = await ops.crawl_manager.scale_crawl( - crawl_id, cast(int, scale.scale), cast(int, scale.browserWindows) + crawl_id, num_scale, browser_windows ) if not result or not result.get("success"): raise HTTPException( From 43551e15ae845be8373001071d7be7af40aaf671 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 16:38:07 -0400 Subject: [PATCH 50/57] Add some tests --- backend/test/test_crawlconfigs.py | 96 +++++++++++++++++++++++++++++++ backend/test/test_run_crawl.py | 2 + 2 files changed, 98 insertions(+) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index eaafb10c1e..a4a1838456 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -53,6 +53,64 @@ def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_dat cid = data["id"] +def test_verify_default_browser_windows( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 2 + + +def test_custom_browser_windows( + crawler_auth_headers, default_org_id, sample_crawl_data +): + sample_crawl_data["browserWindows"] = 4 + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + workflow_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 4 + + +def test_custom_scale(crawler_auth_headers, default_org_id, sample_crawl_data): + sample_crawl_data["scale"] = 3 + r = requests.post( + 
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + workflow_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 6 + + def test_update_name_only(crawler_auth_headers, default_org_id): # update name only r = requests.patch( @@ -326,6 +384,44 @@ def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_craw assert data["maxCrawlSize"] == 4096 +def test_update_browser_windows(crawler_auth_headers, default_org_id): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"browserWindows": 1}, + ) + assert r.status_code == 200 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 1 + + +def test_update_scale(crawler_auth_headers, default_org_id): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"scale": 1}, + ) + assert r.status_code == 200 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 2 + + def test_verify_delete_tags(crawler_auth_headers, default_org_id): # Verify that deleting tags and name works as well r = requests.patch( diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 33eb785132..82b2ad0071 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -239,6 +239,8 @@ def test_crawl_info(admin_auth_headers, default_org_id): assert data["fileCount"] == 1 assert data["userName"] assert data["version"] == 2 + assert data["scale"] == 1 + assert data["browserWindows"] == 2 def test_crawls_include_seed_info(admin_auth_headers, default_org_id): From a9ee78481de6e7e10006cab6b463cd1046bfe7cd Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 17:16:31 -0400 Subject: [PATCH 51/57] Update expected totals in tests --- backend/test/test_crawlconfigs.py | 4 ++-- backend/test/test_filter_sort_results.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index a4a1838456..b4a70a12bb 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -450,9 +450,9 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] == 3 + assert data["total"] == 5 items = data["items"] - assert len(items) == 3 + assert len(items) == 5 sorted_data = sorted(items, key=lambda revision: revision["rev"]) assert sorted_data[0]["config"]["scopeType"] == "prefix" diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py index fc104197c0..77c7e185c3 100644 --- a/backend/test/test_filter_sort_results.py +++ b/backend/test/test_filter_sort_results.py @@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_ 
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 5 - assert r.json()["total"] == 5 + assert len(r.json()["items"]) == 7 + assert r.json()["total"] == 7 def test_get_config_by_modified_by( @@ -23,8 +23,8 @@ def test_get_config_by_modified_by( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 5 - assert r.json()["total"] == 5 + assert len(r.json()["items"]) == 7 + assert r.json()["total"] == 7 def test_get_configs_by_first_seed( @@ -362,9 +362,9 @@ def test_sort_crawl_configs( headers=crawler_auth_headers, ) data = r.json() - assert data["total"] == 11 + assert data["total"] == 13 items = data["items"] - assert len(items) == 11 + assert len(items) == 13 last_created = None for config in items: From 9fbb99f44fa4f03a6dd2c76991f529b195270465 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 27 May 2025 17:19:56 -0400 Subject: [PATCH 52/57] Remove outdated pylint comment --- backend/btrixcloud/operator/crawls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index e772c42ecd..5993d82911 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -589,7 +589,6 @@ async def _resolve_scale( self, crawl: CrawlSpec, redis: Redis, - # pylint: disable=unused-argument status: CrawlStatus, pods: dict[str, dict], ): From d9396254ea01e3c3120798a7dcc9f6eb891ecdff Mon Sep 17 00:00:00 2001 From: sua yoo Date: Wed, 28 May 2025 19:07:19 -0700 Subject: [PATCH 53/57] set max width --- frontend/src/components/screencast.ts | 4 ++-- frontend/src/utils/workflow.ts | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index affa762622..52296ea62a 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -203,10 +203,10 @@ export class Screencast extends BtrixElement { return html`
${Array.from({ length: screenCount }).map((_, i) =>
          this.renderScreen(`${i}`),
        )}
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index 2266fcd743..c31700ed8a 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -380,7 +380,6 @@ export async function getServerDefaults(): Promise {
 export function* rangeBrowserWindows(
   settings: AppSettings | null,
 ): Iterable<number> {
-  console.log("range");
   if (!settings) {
     yield 1;
     return;
   }

From aab9f5a02bd922a0bbe5a7f580f9419ed9a984bb Mon Sep 17 00:00:00 2001
From: sua yoo
Date: Wed, 28 May 2025 19:17:53 -0700
Subject: [PATCH 54/57] fix spinner

---
 frontend/src/components/screencast.ts | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts
index 52296ea62a..c6d31d978a 100644
--- a/frontend/src/components/screencast.ts
+++ b/frontend/src/components/screencast.ts
@@ -244,6 +244,7 @@ export class Screencast extends BtrixElement {
   private readonly renderScreen = (id: string) => {
     const pageData = this.dataMap[id];
+
     return html`
<figure …
      @click=${pageData
        ? () =>
            (this.focusedScreenData = pageData)
        : () => {}}
    >
      <figcaption …>
        ${pageData?.url || html`&nbsp;`}
      </figcaption>
-     <div …>
+     <div …>
        ${pageData
          ? html`<img … />`
-         : html`…`}
+         : html`<div …>
+             <sl-spinner></sl-spinner>
+           </div>`}
      </div>
    </figure>
`; }; From cee8dc90512c31f4aa27159f01a643af405d3445 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 May 2025 11:33:10 -0700 Subject: [PATCH 55/57] cleanup: - rename pod_count -> scale for consistency - remove debug logging - simplify update_scale to remove cast --- backend/btrixcloud/crawlconfigs.py | 8 ++-- backend/btrixcloud/crawlmanager.py | 6 +-- backend/btrixcloud/crawls.py | 42 ++++++++----------- ...migration_0047_scale_to_browser_windows.py | 10 ++--- backend/btrixcloud/operator/crawls.py | 14 +++---- backend/btrixcloud/operator/cronjobs.py | 4 +- backend/btrixcloud/orgs.py | 10 ++--- backend/btrixcloud/utils.py | 6 +-- 8 files changed, 40 insertions(+), 60 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index f6917e74f7..09f0b911d5 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -52,7 +52,7 @@ validate_regexes, validate_language_code, is_url, - browser_windows_from_pod_count, + browser_windows_from_scale, ) if TYPE_CHECKING: @@ -225,7 +225,7 @@ async def add_crawl_config( # Overrides scale if set if config_in.browserWindows is None: - config_in.browserWindows = browser_windows_from_pod_count( + config_in.browserWindows = browser_windows_from_scale( cast(int, config_in.scale) ) @@ -416,9 +416,7 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) if update.scale: - update.browserWindows = browser_windows_from_pod_count( - cast(int, update.scale) - ) + update.browserWindows = browser_windows_from_scale(cast(int, update.scale)) update.scale = None if update.config and update.config.exclude: diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 1051c241fb..94dfcce1b7 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -8,7 +8,7 @@ from fastapi import HTTPException -from .utils import dt_now, date_to_str, pod_count_from_browser_windows +from .utils import dt_now, date_to_str, scale_from_browser_windows from .k8sapi import K8sAPI from .models import StorageRef, CrawlConfig, BgJobType @@ -227,7 +227,7 @@ async def create_crawl_job( await self.has_storage_secret(storage_secret) - scale = pod_count_from_browser_windows(crawlconfig.browserWindows) + scale = scale_from_browser_windows(crawlconfig.browserWindows) return await self.new_crawl_job( cid, @@ -261,7 +261,7 @@ async def update_running_crawl_config( # pylint: disable=use-dict-literal patch = dict( crawlerChannel=crawlconfig.crawlerChannel, - scale=pod_count_from_browser_windows(crawlconfig.browserWindows), + scale=scale_from_browser_windows(crawlconfig.browserWindows), browserWindows=crawlconfig.browserWindows, timeout=crawlconfig.crawlTimeout, maxCrawlSize=crawlconfig.maxCrawlSize, diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 8529879ebc..2891957a0c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -10,7 +10,7 @@ from datetime import datetime from uuid import UUID -from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator, cast +from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator from fastapi import Depends, HTTPException from fastapi.responses import StreamingResponse @@ -25,8 +25,8 @@ parse_jsonl_log_messages, stream_dict_list_as_csv, validate_regexes, - pod_count_from_browser_windows, - browser_windows_from_pod_count, + scale_from_browser_windows, + browser_windows_from_scale, ) from .basecrawls import 
BaseCrawlOps from .crawlmanager import CrawlManager @@ -370,7 +370,7 @@ async def add_new_crawl( oid=crawlconfig.oid, cid=crawlconfig.id, cid_rev=crawlconfig.rev, - scale=pod_count_from_browser_windows(crawlconfig.browserWindows), + scale=scale_from_browser_windows(crawlconfig.browserWindows), browserWindows=crawlconfig.browserWindows, jobType=crawlconfig.jobType, config=crawlconfig.config, @@ -552,8 +552,8 @@ async def add_or_remove_exclusion( } query_str = json.dumps(query) - pod_count = pod_count_from_browser_windows(browser_windows) - for i in range(0, pod_count): + scale = scale_from_browser_windows(browser_windows) + for i in range(0, scale): await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str) new_config = await self.crawl_configs.add_or_remove_exclusion( @@ -1539,39 +1539,31 @@ async def update_crawl_api( response_model=CrawlScaleResponse, ) async def scale_crawl( - scale: CrawlScale, + crawl_scale: CrawlScale, crawl_id, user: User = Depends(user_dep), org: Organization = Depends(org_crawl_dep), ): - if scale.scale is None and scale.browserWindows is None: + if crawl_scale.browserWindows: + browser_windows = crawl_scale.browserWindows + scale = scale_from_browser_windows(browser_windows) + elif crawl_scale.scale: + scale = crawl_scale.scale + browser_windows = browser_windows_from_scale(scale) + else: raise HTTPException( status_code=400, detail="browser_windows_or_scale_required" ) - if scale.browserWindows: - scale.scale = pod_count_from_browser_windows( - cast(int, scale.browserWindows) - ) - else: - scale.browserWindows = browser_windows_from_pod_count( - cast(int, scale.scale) - ) + await ops.update_crawl_scale(crawl_id, org, scale, browser_windows, user) - num_scale = cast(int, scale.scale) - browser_windows = cast(int, scale.browserWindows) - - await ops.update_crawl_scale(crawl_id, org, num_scale, browser_windows, user) - - result = await ops.crawl_manager.scale_crawl( - crawl_id, num_scale, browser_windows - ) + result = await ops.crawl_manager.scale_crawl(crawl_id, scale, browser_windows) if not result or not result.get("success"): raise HTTPException( status_code=400, detail=result.get("error") or "unknown" ) - return {"scaled": True, "browserWindows": scale.browserWindows} + return {"scaled": True, "browserWindows": browser_windows} @app.get( "/orgs/{oid}/crawls/{crawl_id}/access", diff --git a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py index 29a3f5c214..59897fb0ba 100644 --- a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py +++ b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py @@ -3,7 +3,7 @@ """ from btrixcloud.migrations import BaseMigration -from btrixcloud.utils import browser_windows_from_pod_count +from btrixcloud.utils import browser_windows_from_scale MIGRATION_VERSION = "0047" @@ -33,9 +33,7 @@ async def migrate_up(self): await configs_mdb.find_one_and_update( {"_id": config_id}, { - "$set": { - "browserWindows": browser_windows_from_pod_count(scale) - }, + "$set": {"browserWindows": browser_windows_from_scale(scale)}, }, ) # pylint: disable=broad-exception-caught @@ -53,9 +51,7 @@ async def migrate_up(self): await crawls_mdb.find_one_and_update( {"_id": crawl_id}, { - "$set": { - "browserWindows": browser_windows_from_pod_count(scale) - }, + "$set": {"browserWindows": browser_windows_from_scale(scale)}, }, ) # pylint: disable=broad-exception-caught diff --git a/backend/btrixcloud/operator/crawls.py 
b/backend/btrixcloud/operator/crawls.py index 5993d82911..c9b1e1e61b 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -36,7 +36,7 @@ str_to_date, date_to_str, dt_now, - pod_count_from_browser_windows, + scale_from_browser_windows, ) from .baseoperator import BaseOperator, Redis @@ -365,14 +365,12 @@ async def sync_crawls(self, data: MCSyncData): is_paused = bool(crawl.paused_at) and status.state == "paused" - print(f"status.scale: {status.scale}", flush=True) - print(f"crawl.browser_windows: {crawl.browser_windows}", flush=True) - - crawler_pod_count = pod_count_from_browser_windows(crawl.browser_windows) + # crawl_scale is the number of pods to create + crawler_scale = scale_from_browser_windows(crawl.browser_windows) browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - for i in range(0, crawler_pod_count): + for i in range(0, crawler_scale): if status.pagesFound < i * browsers_per_pod: break @@ -380,7 +378,7 @@ async def sync_crawls(self, data: MCSyncData): self._load_crawler( params, i, - crawler_pod_count, + crawler_scale, crawl.browser_windows, status, data.children, @@ -600,7 +598,7 @@ async def _resolve_scale( scale and clean up previous scale state. """ - desired_scale = pod_count_from_browser_windows(crawl.browser_windows) + desired_scale = scale_from_browser_windows(crawl.browser_windows) if status.pagesFound < desired_scale: desired_scale = max(1, status.pagesFound) diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 5a3ba7d5ac..cb515c4b18 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -9,7 +9,7 @@ from .baseoperator import BaseOperator from ..models import CrawlConfig -from ..utils import pod_count_from_browser_windows +from ..utils import scale_from_browser_windows # pylint: disable=too-many-locals @@ -130,7 +130,7 @@ async def make_new_crawljob( oid=str(oid), storage=str(org.storage), crawler_channel=crawlconfig.crawlerChannel or "default", - scale=pod_count_from_browser_windows(crawlconfig.browserWindows), + scale=scale_from_browser_windows(crawlconfig.browserWindows), browser_windows=crawlconfig.browserWindows, crawl_timeout=crawlconfig.crawlTimeout, max_crawl_size=crawlconfig.maxCrawlSize, diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 2bd7938a6d..83cce13afa 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -90,7 +90,7 @@ get_duplicate_key_error_field, validate_language_code, JSONSerializer, - browser_windows_from_pod_count, + browser_windows_from_scale, ) if TYPE_CHECKING: @@ -1270,9 +1270,7 @@ async def import_org( # Convert scale to browser windows and respect limits workflow_scale = max(workflow.get("scale", 1), MAX_CRAWL_SCALE) if workflow.get("browserWindows") is None: - workflow_browser_windows = browser_windows_from_pod_count( - workflow_scale - ) + workflow_browser_windows = browser_windows_from_scale(workflow_scale) workflow["browserWindows"] = max( workflow_browser_windows, MAX_BROWSER_WINDOWS ) @@ -1310,9 +1308,7 @@ async def import_org( # Set browserWindows browser_windows = item.get("browserWindows") if browser_windows is None: - browser_windows = browser_windows_from_pod_count( - item.get("scale", 1) - ) + browser_windows = browser_windows_from_scale(item.get("scale", 1)) item["browserWindows"] = max(browser_windows, MAX_BROWSER_WINDOWS) item_obj = Crawl.from_dict(item) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 
1e506bdff7..4dda18792e 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -203,13 +203,13 @@ def validate_language_code(lang: str): raise HTTPException(status_code=400, detail="invalid_lang") -def pod_count_from_browser_windows(browser_windows: int) -> int: +def scale_from_browser_windows(browser_windows: int) -> int: """Return number of pods for given number of browser windows""" browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) return math.ceil(browser_windows / browsers_per_pod) -def browser_windows_from_pod_count(pod_count: int) -> int: +def browser_windows_from_scale(scale: int) -> int: """Return number of browser windows from specified scale""" browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - return pod_count * browsers_per_pod + return scale * browsers_per_pod From 435a33e258ff160539a0a1561974fd9ae7e1a9b6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 2 Jun 2025 20:31:57 -0700 Subject: [PATCH 56/57] Update backend/btrixcloud/operator/crawls.py Co-authored-by: Tessa Walsh --- backend/btrixcloud/operator/crawls.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index c9b1e1e61b..aed9cf6679 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -558,12 +558,6 @@ def _load_crawler( print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False - worker_count = params["workers"] - print( - f"crawler pod {i}, {worker_count} workers", - flush=True, - ) - return self.load_from_yaml("crawler.yaml", params) def _qa_configmap_update_needed(self, name, configmap): From 5e2171574988d28a95e9795caaa00c37e3a5362d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 3 Jun 2025 15:03:29 -0400 Subject: [PATCH 57/57] Add deprecated flag to Scale --- backend/btrixcloud/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 5f06d29250..27ecb24b17 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -64,7 +64,7 @@ EmptyStr = Annotated[str, Field(min_length=0, max_length=0)] -Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE)] +Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE, deprecated=True)] BrowserWindowCount = Annotated[int, Field(strict=True, ge=1, le=MAX_BROWSER_WINDOWS)] ReviewStatus = Optional[Annotated[int, Field(strict=True, ge=1, le=5)]]
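
A note for anyone reviewing the conversion math above: the two helpers that patch 55 renames in backend/btrixcloud/utils.py are easy to sanity-check outside the cluster. The sketch below mirrors them and pins NUM_BROWSERS=2, which is what the expectations in test_custom_scale (scale 3 -> browserWindows 6) and test_update_scale (scale 1 -> browserWindows 2) imply for the test environment; the in-module default is 1, so the pinned value is an assumption, not part of the patches.

    import math
    import os

    os.environ["NUM_BROWSERS"] = "2"  # assumed; matches the test expectations


    def scale_from_browser_windows(browser_windows: int) -> int:
        """Pods needed to host the requested number of browser windows."""
        browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
        return math.ceil(browser_windows / browsers_per_pod)


    def browser_windows_from_scale(scale: int) -> int:
        """Browser windows implied by a legacy scale (pod count) value."""
        browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
        return scale * browsers_per_pod


    assert browser_windows_from_scale(3) == 6  # test_custom_scale
    assert browser_windows_from_scale(1) == 2  # test_update_scale
    assert scale_from_browser_windows(6) == 3
    assert scale_from_browser_windows(5) == 3  # a partial pod rounds up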
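
The frontend options list follows the same pod math: rangeBrowserWindows (added in patch 46, tidied in patch 53) offers every window count below one pod's worth of browsers, then whole-pod multiples up to the configured maximum. A rough Python rendition of that iteration order, with numBrowsersPerInstance=2 and maxBrowserWindows=8 as example settings only:

    from typing import Iterator


    def range_browser_windows(
        browsers_per_instance: int, max_browser_windows: int
    ) -> Iterator[int]:
        """Window counts offered in the workflow editor's select menu."""
        # Below a full pod, every count is selectable.
        for i in range(1, browsers_per_instance):
            yield i
        # From one full pod upward, only whole-pod multiples.
        yield from range(
            browsers_per_instance, max_browser_windows + 1, browsers_per_instance
        )


    # e.g. 2 browsers per pod, capped at 8 windows -> 1, 2, 4, 6, 8
    assert list(range_browser_windows(2, 8)) == [1, 2, 4, 6, 8]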
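
Likewise, the reworked scale endpoint in patch 55 resolves a request that may carry either field, with browserWindows taking precedence over the now-deprecated scale. A condensed, framework-free sketch of that resolution logic (hypothetical function name, and browsers_per_pod=2 assumed again):

    import math
    from typing import Optional


    def resolve_scale_request(
        browser_windows: Optional[int],
        scale: Optional[int],
        browsers_per_pod: int = 2,
    ) -> tuple[int, int]:
        """Return (pods, windows), preferring browserWindows over scale."""
        if browser_windows:
            return math.ceil(browser_windows / browsers_per_pod), browser_windows
        if scale:
            return scale, scale * browsers_per_pod
        raise ValueError("browser_windows_or_scale_required")


    assert resolve_scale_request(5, None) == (3, 5)  # windows win; 5 -> 3 pods
    assert resolve_scale_request(None, 2) == (2, 4)  # legacy scale fallback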