diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 02606a75b4..09f0b911d5 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -52,6 +52,7 @@ validate_regexes, validate_language_code, is_url, + browser_windows_from_scale, ) if TYPE_CHECKING: @@ -222,6 +223,12 @@ async def add_crawl_config( ) -> CrawlConfigAddedResponse: """Add new crawl config""" + # Overrides scale if set + if config_in.browserWindows is None: + config_in.browserWindows = browser_windows_from_scale( + cast(int, config_in.scale) + ) + # ensure crawlChannel is valid if not self.get_channel_crawler_image(config_in.crawlerChannel): raise HTTPException(status_code=404, detail="crawler_not_found") @@ -272,7 +279,7 @@ async def add_crawl_config( jobType=config_in.jobType, crawlTimeout=config_in.crawlTimeout, maxCrawlSize=config_in.maxCrawlSize, - scale=config_in.scale, + browserWindows=config_in.browserWindows, autoAddCollections=config_in.autoAddCollections, profileid=profileid, crawlerChannel=config_in.crawlerChannel, @@ -408,6 +415,10 @@ async def update_crawl_config( orig_crawl_config = await self.get_crawl_config(cid, org.id) + if update.scale: + update.browserWindows = browser_windows_from_scale(cast(int, update.scale)) + update.scale = None + if update.config and update.config.exclude: exclude = update.config.exclude if isinstance(exclude, str): @@ -441,7 +452,9 @@ async def update_crawl_config( changed = changed or ( self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate") ) - changed = changed or self.check_attr_changed(orig_crawl_config, update, "scale") + changed = changed or self.check_attr_changed( + orig_crawl_config, update, "browserWindows" + ) schedule_changed = self.check_attr_changed( orig_crawl_config, update, "schedule" diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 88999b28f7..94dfcce1b7 100644 --- a/backend/btrixcloud/crawlmanager.py +++ 
b/backend/btrixcloud/crawlmanager.py @@ -8,7 +8,7 @@ from fastapi import HTTPException -from .utils import dt_now, date_to_str +from .utils import dt_now, date_to_str, scale_from_browser_windows from .k8sapi import K8sAPI from .models import StorageRef, CrawlConfig, BgJobType @@ -227,13 +227,16 @@ async def create_crawl_job( await self.has_storage_secret(storage_secret) + scale = scale_from_browser_windows(crawlconfig.browserWindows) + return await self.new_crawl_job( cid, userid, str(crawlconfig.oid), str(storage), crawlconfig.crawlerChannel, - crawlconfig.scale, + scale, + crawlconfig.browserWindows, crawlconfig.crawlTimeout, crawlconfig.maxCrawlSize, manual=True, @@ -258,7 +261,8 @@ async def update_running_crawl_config( # pylint: disable=use-dict-literal patch = dict( crawlerChannel=crawlconfig.crawlerChannel, - scale=crawlconfig.scale, + scale=scale_from_browser_windows(crawlconfig.browserWindows), + browserWindows=crawlconfig.browserWindows, timeout=crawlconfig.crawlTimeout, maxCrawlSize=crawlconfig.maxCrawlSize, proxyId=crawlconfig.proxyId or DEFAULT_PROXY_ID, @@ -373,9 +377,13 @@ async def rollover_restart_crawl(self, crawl_id: str) -> dict: update = date_to_str(dt_now()) return await self._patch_job(crawl_id, {"restartTime": update}) - async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict: + async def scale_crawl( + self, crawl_id: str, scale: int = 1, browser_windows: int = 1 + ) -> dict: """Set the crawl scale (job parallelism) on the specified job""" - return await self._patch_job(crawl_id, {"scale": scale}) + return await self._patch_job( + crawl_id, {"scale": scale, "browserWindows": browser_windows} + ) async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict: """Request a crawl cancelation or stop by calling an API diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 2dabd59c6b..2891957a0c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -25,6 +25,8 @@ 
parse_jsonl_log_messages, stream_dict_list_as_csv, validate_regexes, + scale_from_browser_windows, + browser_windows_from_scale, ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager @@ -368,7 +370,8 @@ async def add_new_crawl( oid=crawlconfig.oid, cid=crawlconfig.id, cid_rev=crawlconfig.rev, - scale=crawlconfig.scale, + scale=scale_from_browser_windows(crawlconfig.browserWindows), + browserWindows=crawlconfig.browserWindows, jobType=crawlconfig.jobType, config=crawlconfig.config, profileid=crawlconfig.profileid, @@ -392,16 +395,27 @@ async def add_new_crawl( pass async def update_crawl_scale( - self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User + self, + crawl_id: str, + org: Organization, + scale: int, + browser_windows: int, + user: User, ) -> bool: """Update crawl scale in the db""" crawl = await self.get_crawl(crawl_id, org) - update = UpdateCrawlConfig(scale=crawl_scale.scale) + + update = UpdateCrawlConfig(browserWindows=browser_windows) await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, - {"$set": {"scale": crawl_scale.scale}}, + { + "$set": { + "scale": scale, + "browserWindows": browser_windows, + } + }, return_document=pymongo.ReturnDocument.AFTER, ) @@ -529,7 +543,7 @@ async def add_or_remove_exclusion( cid = crawl.cid - scale = crawl.scale or 1 + browser_windows = crawl.browserWindows or 2 async with self.get_redis(crawl_id) as redis: query = { @@ -538,6 +552,7 @@ async def add_or_remove_exclusion( } query_str = json.dumps(query) + scale = scale_from_browser_windows(browser_windows) for i in range(0, scale): await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str) @@ -1524,20 +1539,31 @@ async def update_crawl_api( response_model=CrawlScaleResponse, ) async def scale_crawl( - scale: CrawlScale, + crawl_scale: CrawlScale, crawl_id, user: User = Depends(user_dep), org: Organization = 
Depends(org_crawl_dep), ): - await ops.update_crawl_scale(crawl_id, org, scale, user) + if crawl_scale.browserWindows: + browser_windows = crawl_scale.browserWindows + scale = scale_from_browser_windows(browser_windows) + elif crawl_scale.scale: + scale = crawl_scale.scale + browser_windows = browser_windows_from_scale(scale) + else: + raise HTTPException( + status_code=400, detail="browser_windows_or_scale_required" + ) + + await ops.update_crawl_scale(crawl_id, org, scale, browser_windows, user) - result = await ops.crawl_manager.scale_crawl(crawl_id, scale.scale) + result = await ops.crawl_manager.scale_crawl(crawl_id, scale, browser_windows) if not result or not result.get("success"): raise HTTPException( status_code=400, detail=result.get("error") or "unknown" ) - return {"scaled": scale.scale} + return {"scaled": True, "browserWindows": browser_windows} @app.get( "/orgs/{oid}/crawls/{crawl_id}/access", diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index e27b499dd4..cbdd252d2e 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -32,7 +32,7 @@ ) = PageOps = BackgroundJobOps = object -CURR_DB_VERSION = "0046" +CURR_DB_VERSION = "0047" # ============================================================================ diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index bfaaabb656..41495f60de 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -85,6 +85,7 @@ def new_crawl_job_yaml( storage: str, crawler_channel: Optional[str] = "", scale: Optional[int] = 1, + browser_windows: Optional[int] = 1, crawl_timeout: Optional[int] = 0, max_crawl_size: Optional[int] = 0, manual: bool = True, @@ -109,6 +110,7 @@ def new_crawl_job_yaml( "storage_name": storage, "crawler_channel": crawler_channel, "scale": scale, + "browser_windows": browser_windows, "timeout": crawl_timeout, "max_crawl_size": max_crawl_size or 0, "manual": "1" if manual else "0", @@ -130,6 +132,7 @@ async def 
new_crawl_job( storage: str, crawler_channel: Optional[str] = "", scale: Optional[int] = 1, + browser_windows: Optional[int] = 1, crawl_timeout: Optional[int] = 0, max_crawl_size: Optional[int] = 0, manual: bool = True, @@ -148,6 +151,7 @@ async def new_crawl_job( storage=storage, crawler_channel=crawler_channel, scale=scale, + browser_windows=browser_windows, crawl_timeout=crawl_timeout, max_crawl_size=max_crawl_size, manual=manual, diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 0958c6a31f..6b3ecefb8b 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -112,8 +112,8 @@ class SettingsResponse(BaseModel): defaultPageLoadTimeSeconds: int maxPagesPerCrawl: int - numBrowsers: int - maxScale: int + numBrowsersPerInstance: int + maxBrowserWindows: int billingEnabled: bool @@ -149,8 +149,8 @@ def main() -> None: os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120) ), maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)), - numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)), - maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)), + numBrowsersPerInstance=int(os.environ.get("NUM_BROWSERS", 1)), + maxBrowserWindows=int(os.environ.get("MAX_BROWSER_WINDOWS", 8)), billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")), signUpUrl=os.environ.get("SIGN_UP_URL", ""), salesEmail=os.environ.get("SALES_EMAIL", ""), diff --git a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py index 6e2d005ab2..1f70e0cf6d 100644 --- a/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py +++ b/backend/btrixcloud/migrations/migration_0005_operator_scheduled_jobs.py @@ -43,7 +43,7 @@ async def migrate_up(self): config = CrawlConfig.from_dict(config_dict) print( f"Updating Crawl Config {config.id}: schedule: {config.schedule}, " - + f"timeout: {config.crawlTimeout}, scale: {config.scale}" + + f"timeout: {config.crawlTimeout}" ) 
try: await crawl_manager.update_scheduled_job(config) diff --git a/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py new file mode 100644 index 0000000000..59897fb0ba --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0047_scale_to_browser_windows.py @@ -0,0 +1,62 @@ +""" +Migration 0047 - Convert scale to browserWindows +""" + +from btrixcloud.migrations import BaseMigration +from btrixcloud.utils import browser_windows_from_scale + + +MIGRATION_VERSION = "0047" + + +# pylint: disable=duplicate-code +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Calculate and store browserWindows from existing scale on workflows and crawls + """ + configs_mdb = self.mdb["crawl_configs"] + crawls_mdb = self.mdb["crawls"] + + async for config_raw in configs_mdb.find({"browserWindows": None}): + config_id = config_raw["_id"] + scale = config_raw.get("scale", 1) + + try: + await configs_mdb.find_one_and_update( + {"_id": config_id}, + { + "$set": {"browserWindows": browser_windows_from_scale(scale)}, + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set browser windows from scale for workflow {config_id}: {err}", + flush=True, + ) + + async for crawl_raw in crawls_mdb.find({"browserWindows": None}): + crawl_id = crawl_raw["_id"] + scale = crawl_raw.get("scale", 1) + + try: + await crawls_mdb.find_one_and_update( + {"_id": crawl_id}, + { + "$set": {"browserWindows": browser_windows_from_scale(scale)}, + }, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to set browser windows from scale for crawl {crawl_id}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py 
b/backend/btrixcloud/models.py index 2e9d8da309..27ecb24b17 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -8,6 +8,7 @@ import base64 import hashlib import mimetypes +import math import os from typing import Optional, List, Dict, Union, Literal, Any, get_args @@ -30,8 +31,19 @@ from .db import BaseMongoModel +# num browsers per crawler instance +NUM_BROWSERS = int(os.environ.get("NUM_BROWSERS", 2)) + +# browser window for constraint (preferred over scale if provided) +MAX_BROWSER_WINDOWS = os.environ.get("MAX_BROWSER_WINDOWS") or 0 + # crawl scale for constraint -MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) +if MAX_BROWSER_WINDOWS: + MAX_BROWSER_WINDOWS = int(MAX_BROWSER_WINDOWS) + MAX_CRAWL_SCALE = math.ceil(MAX_BROWSER_WINDOWS / NUM_BROWSERS) +else: + MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) + MAX_BROWSER_WINDOWS = MAX_CRAWL_SCALE * NUM_BROWSERS # Presign duration must be less than 604800 seconds (one week), # so set this one minute short of a week @@ -52,7 +64,8 @@ EmptyStr = Annotated[str, Field(min_length=0, max_length=0)] -Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE)] +Scale = Annotated[int, Field(strict=True, ge=1, le=MAX_CRAWL_SCALE, deprecated=True)] +BrowserWindowCount = Annotated[int, Field(strict=True, ge=1, le=MAX_BROWSER_WINDOWS)] ReviewStatus = Optional[Annotated[int, Field(strict=True, ge=1, le=5)]] any_http_url_adapter = TypeAdapter(AnyHttpUrlNonStr) @@ -369,8 +382,12 @@ class CrawlConfigIn(BaseModel): crawlTimeout: int = 0 maxCrawlSize: int = 0 + scale: Scale = 1 + # Overrides scale if set + browserWindows: Optional[BrowserWindowCount] = None + crawlFilenameTemplate: Optional[str] = None @@ -390,7 +407,8 @@ class ConfigRevision(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 - scale: Scale = 1 + scale: Optional[Scale] = 1 + browserWindows: Optional[BrowserWindowCount] = 2 modified: datetime modifiedBy: Optional[UUID] = None @@ 
-411,7 +429,9 @@ class CrawlConfigCore(BaseMongoModel): crawlTimeout: Optional[int] = 0 maxCrawlSize: Optional[int] = 0 - scale: Scale = 1 + + scale: Optional[Scale] = None + browserWindows: BrowserWindowCount = 2 oid: UUID @@ -522,7 +542,8 @@ class UpdateCrawlConfig(BaseModel): proxyId: Optional[str] = None crawlTimeout: Optional[int] = None maxCrawlSize: Optional[int] = None - scale: Scale = 1 + scale: Optional[Scale] = None + browserWindows: Optional[BrowserWindowCount] = None crawlFilenameTemplate: Optional[str] = None config: Optional[RawCrawlConfig] = None @@ -874,7 +895,8 @@ class CrawlOut(BaseMongoModel): pausedAt: Optional[datetime] = None manual: bool = False cid_rev: Optional[int] = None - scale: Scale = 1 + scale: Optional[Scale] = None + browserWindows: BrowserWindowCount = 2 storageQuotaReached: Optional[bool] = False execMinutesQuotaReached: Optional[bool] = False @@ -959,9 +981,10 @@ class MatchCrawlQueueResponse(BaseModel): # ============================================================================ class CrawlScale(BaseModel): - """scale the crawl to N parallel containers""" + """scale the crawl to N parallel containers or windows""" - scale: Scale = 1 + scale: Optional[Scale] = None + browserWindows: Optional[BrowserWindowCount] = None # ============================================================================ @@ -1053,7 +1076,8 @@ class CrawlCompleteIn(BaseModel): class CrawlScaleResponse(BaseModel): """Response model for modifying crawl scale""" - scaled: int + scaled: bool + browserWindows: int # ============================================================================ diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index b63ff9f4ae..8351c55d12 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -57,40 +57,15 @@ def compute_crawler_resources(self) -> None: except: # default to 1 for now for best results (to revisit in the 
future) qa_num_workers = 1 - crawler_cpu: float = 0 - crawler_memory: int = 0 - qa_cpu: float = 0 - qa_memory: int = 0 - print("crawler resources") - if not p.get("crawler_cpu"): - base = parse_quantity(p["crawler_cpu_base"]) - extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) - - # cpu is a floating value of cpu cores - crawler_cpu = float(base + (num_workers - 1) * extra) - qa_cpu = float(base + (qa_num_workers - 1) * extra) - - print(f"cpu = {base} + {num_workers - 1} * {extra} = {crawler_cpu}") - print(f"qa_cpu = {base} + {qa_num_workers - 1} * {extra} = {qa_cpu}") - else: - crawler_cpu = float(parse_quantity(p["crawler_cpu"])) - qa_cpu = crawler_cpu - print(f"cpu = {crawler_cpu}") - if not p.get("crawler_memory"): - base = parse_quantity(p["crawler_memory_base"]) - extra = parse_quantity(p["crawler_extra_memory_per_browser"]) - - # memory is always an int - crawler_memory = int(base + (num_workers - 1) * extra) - qa_memory = int(base + (qa_num_workers - 1) * extra) + crawler_memory, crawler_cpu = self.compute_for_num_browsers( + num_workers, p.get("crawler_memory"), p.get("crawler_cpu") + ) + qa_memory, qa_cpu = self.compute_for_num_browsers(qa_num_workers) - print(f"memory = {base} + {num_workers - 1} * {extra} = {crawler_memory}") - print(f"qa_memory = {base} + {qa_num_workers - 1} * {extra} = {qa_memory}") - else: - crawler_memory = int(parse_quantity(p["crawler_memory"])) - qa_memory = crawler_memory - print(f"memory = {crawler_memory}") + print("crawler resources") + print(f"cpu = {crawler_cpu} qa: {qa_cpu}") + print(f"memory = {crawler_memory} qa: {qa_memory}") max_crawler_memory_size = 0 max_crawler_memory = os.environ.get("MAX_CRAWLER_MEMORY") @@ -108,6 +83,33 @@ def compute_crawler_resources(self) -> None: p["qa_memory"] = qa_memory p["qa_workers"] = qa_num_workers + def compute_for_num_browsers( + self, num_browsers, crawler_memory_fixed="", crawler_cpu_fixed="" + ) -> tuple[int, float]: + """compute memory, cpu for given num of browsers""" 
+ p = self.shared_params + + if not crawler_memory_fixed: + base = parse_quantity(p["crawler_memory_base"]) + extra = parse_quantity(p["crawler_extra_memory_per_browser"]) + + # memory is always an int + crawler_memory = int(base + (num_browsers - 1) * extra) + else: + crawler_memory = int(parse_quantity(crawler_memory_fixed)) + + if not crawler_cpu_fixed: + base = parse_quantity(p["crawler_cpu_base"]) + extra = parse_quantity(p["crawler_extra_cpu_per_browser"]) + + # cpu is a floating value of cpu cores + crawler_cpu = float(base + (num_browsers - 1) * extra) + + else: + crawler_cpu = float(parse_quantity(crawler_cpu_fixed)) + + return crawler_memory, crawler_cpu + def compute_profile_resources(self) -> None: """compute memory /cpu resources for a single profile browser""" p = self.shared_params diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 8a7b5e0995..aed9cf6679 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -32,7 +32,12 @@ StorageRef, ) -from btrixcloud.utils import str_to_date, date_to_str, dt_now +from btrixcloud.utils import ( + str_to_date, + date_to_str, + dt_now, + scale_from_browser_windows, +) from .baseoperator import BaseOperator, Redis from .models import ( @@ -168,6 +173,7 @@ async def sync_crawls(self, data: MCSyncData): crawler_channel=spec.get("crawlerChannel", "default"), proxy_id=spec.get("proxyId"), scale=spec.get("scale", 1), + browser_windows=spec.get("browserWindows", 1), started=data.parent["metadata"]["creationTimestamp"], stopping=spec.get("stopping", False), paused_at=str_to_date(spec.get("pausedAt")), @@ -178,7 +184,7 @@ async def sync_crawls(self, data: MCSyncData): ) if crawl.qa_source_crawl_id: - crawl.scale = int(params.get("qa_scale", 1)) + crawl.browser_windows = int(params.get("qa_scale", 1)) # if finalizing, crawl is being deleted if data.finalizing: @@ -359,9 +365,25 @@ async def sync_crawls(self, data: MCSyncData): is_paused = 
bool(crawl.paused_at) and status.state == "paused" - for i in range(0, status.scale): + # crawl_scale is the number of pods to create + crawler_scale = scale_from_browser_windows(crawl.browser_windows) + + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + + for i in range(0, crawler_scale): + if status.pagesFound < i * browsers_per_pod: + break + children.extend( - self._load_crawler(params, i, status, data.children, is_paused) + self._load_crawler( + params, + i, + crawler_scale, + crawl.browser_windows, + status, + data.children, + is_paused, + ) ) return { @@ -467,7 +489,16 @@ async def _load_qa_configmap(self, params, children): return self.load_from_yaml("qa_configmap.yaml", params) # pylint: disable=too-many-arguments - def _load_crawler(self, params, i, status: CrawlStatus, children, paused: bool): + def _load_crawler( + self, + params, + i: int, + total_pods: int, + total_browser_windows: int, + status: CrawlStatus, + children, + is_paused: bool, + ): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] @@ -482,20 +513,47 @@ def _load_crawler(self, params, i, status: CrawlStatus, children, paused: bool): worker_field = "crawler_workers" pri_class = f"crawl-pri-{i}" + browsers_per_pod = params.get(worker_field) or 1 + + # if last pod, compute remaining browsers, or full amount if 0 + if i == total_pods - 1: + workers = (total_browser_windows % browsers_per_pod) or browsers_per_pod + else: + workers = browsers_per_pod + + # scale resources if < full browsers_per_pod + if workers < browsers_per_pod: + memory, cpu = self.k8s.compute_for_num_browsers(workers) + else: + cpu = params.get(cpu_field) + memory = params.get(mem_field) + pod_info = status.podStatus[name] + + # compute if number of browsers for this pod has changed + workers_changed = pod_info.lastWorkers != workers + if workers_changed: + print(f"Workers changed for {i}: {pod_info.lastWorkers} -> {workers}") + + pod_info.lastWorkers = workers + params["name"] = name 
params["priorityClassName"] = pri_class - params["cpu"] = pod_info.newCpu or params.get(cpu_field) - params["memory"] = pod_info.newMemory or params.get(mem_field) + params["cpu"] = pod_info.newCpu or cpu + params["memory"] = pod_info.newMemory or memory + params["workers"] = workers if self.k8s.enable_auto_resize: params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING else: params["memory_limit"] = self.k8s.max_crawler_memory_size params["storage"] = pod_info.newStorage or params.get("crawler_storage") - params["workers"] = params.get(worker_field) or 1 - params["init_crawler"] = not paused - if has_pod and not paused: + + params["init_crawler"] = not is_paused + if has_pod and not is_paused: restart_reason = pod_info.should_restart_pod(params.get("force_restart")) + if not restart_reason and workers_changed: + restart_reason = "pod_resized" + if restart_reason: print(f"Restarting {name}, reason: {restart_reason}") params["init_crawler"] = False @@ -521,8 +579,7 @@ def _qa_configmap_update_needed(self, name, configmap): # pylint: disable=too-many-arguments async def _resolve_scale( self, - crawl_id: str, - desired_scale: int, + crawl: CrawlSpec, redis: Redis, status: CrawlStatus, pods: dict[str, dict], @@ -535,15 +592,21 @@ async def _resolve_scale( scale and clean up previous scale state. 
""" + desired_scale = scale_from_browser_windows(crawl.browser_windows) + + if status.pagesFound < desired_scale: + desired_scale = max(1, status.pagesFound) + + if desired_scale == status.scale: + return status.scale + + crawl_id = crawl.id + # actual scale (minus redis pod) actual_scale = len(pods) if pods.get(f"redis-{crawl_id}"): actual_scale -= 1 - # ensure at least enough pages for the scale - if status.pagesFound < desired_scale: - desired_scale = max(1, status.pagesFound) - # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: return desired_scale @@ -1477,13 +1540,11 @@ async def update_crawl_state( ) # resolve scale - if crawl.scale != status.scale: - status.scale = await self._resolve_scale( - crawl.id, crawl.scale, redis, status, pods - ) + await self._resolve_scale(crawl, redis, status, pods) # check if done / failed status_count: dict[str, int] = {} + for i in range(status.scale): res = results.get(f"crawl-{crawl.id}-{i}") if res: diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 9a411431e5..cb515c4b18 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -9,6 +9,7 @@ from .baseoperator import BaseOperator from ..models import CrawlConfig +from ..utils import scale_from_browser_windows # pylint: disable=too-many-locals @@ -129,7 +130,8 @@ async def make_new_crawljob( oid=str(oid), storage=str(org.storage), crawler_channel=crawlconfig.crawlerChannel or "default", - scale=crawlconfig.scale, + scale=scale_from_browser_windows(crawlconfig.browserWindows), + browser_windows=crawlconfig.browserWindows, crawl_timeout=crawlconfig.crawlTimeout, max_crawl_size=crawlconfig.maxCrawlSize, manual=False, diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 555f32deda..02f4d2a244 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -75,6 +75,7 
@@ class CrawlSpec(BaseModel): oid: UUID org: Organization scale: int = 1 + browser_windows: int = 1 storage: StorageRef started: str crawler_channel: str @@ -143,6 +144,8 @@ class PodInfo(BaseModel): evicted: Optional[bool] = False + lastWorkers: Optional[int] = 0 + def dict(self, *a, **kw): res = super().dict(*a, **kw) percent = { @@ -205,6 +208,7 @@ class CrawlStatus(BaseModel): size: int = 0 # human readable size string sizeHuman: str = "" + # number of pods scale: int = 1 filesAdded: int = 0 filesAddedSize: int = 0 diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 40130a313d..83cce13afa 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -79,6 +79,8 @@ OrgSlugsResponse, OrgImportResponse, OrgPublicProfileUpdate, + MAX_BROWSER_WINDOWS, + MAX_CRAWL_SCALE, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import ( @@ -88,6 +90,7 @@ get_duplicate_key_error_field, validate_language_code, JSONSerializer, + browser_windows_from_scale, ) if TYPE_CHECKING: @@ -105,8 +108,6 @@ DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization") -MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3)) # number of items to delete at a time DEL_ITEMS = 1000 @@ -1266,9 +1267,13 @@ async def import_org( if old_userid and old_userid in user_id_map: workflow[userid_field] = user_id_map[old_userid] - # Ensure scale isn't above max_scale - workflow_scale = workflow.get("scale", 1) - workflow["scale"] = max(workflow_scale, MAX_CRAWL_SCALE) + # Convert scale to browser windows and respect limits + workflow_scale = min(workflow.get("scale", 1), MAX_CRAWL_SCALE) + if workflow.get("browserWindows") is None: + workflow_browser_windows = browser_windows_from_scale(workflow_scale) + workflow["browserWindows"] = min( + workflow_browser_windows, MAX_BROWSER_WINDOWS + ) # Ensure crawlerChannel is set if not workflow.get("crawlerChannel"): @@ -1299,6 +1304,13 @@ async def import_org( # Ensure crawlerChannel is set if
not item.get("crawlerChannel"): item["crawlerChannel"] = "default" + + # Set browserWindows + browser_windows = item.get("browserWindows") + if browser_windows is None: + browser_windows = browser_windows_from_scale(item.get("scale", 1)) + item["browserWindows"] = min(browser_windows, MAX_BROWSER_WINDOWS) + item_obj = Crawl.from_dict(item) if item["type"] == "upload": item_obj = UploadedCrawl.from_dict(item) # type: ignore diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 0c145c3164..4dda18792e 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -8,6 +8,7 @@ import os import sys import re +import math from datetime import datetime, timezone from typing import Optional, Dict, Union, List, Any @@ -200,3 +201,15 @@ def validate_language_code(lang: str): """Validate ISO-639-1 language code, raise HTTPException if invalid""" if not is_language(lang, "pt1"): raise HTTPException(status_code=400, detail="invalid_lang") + + +def scale_from_browser_windows(browser_windows: int) -> int: + """Return number of pods for given number of browser windows""" + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + return math.ceil(browser_windows / browsers_per_pod) + + +def browser_windows_from_scale(scale: int) -> int: + """Return number of browser windows from specified scale""" + browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + return scale * browsers_per_pod diff --git a/backend/test/test_api.py b/backend/test/test_api.py index 88cb0e3806..53cae1248f 100644 --- a/backend/test/test_api.py +++ b/backend/test/test_api.py @@ -43,8 +43,8 @@ def test_api_settings(): "jwtTokenLifetime": 1440, "defaultBehaviorTimeSeconds": 300, "maxPagesPerCrawl": 4, - "numBrowsers": 2, - "maxScale": 3, + "maxBrowserWindows": 8, + "numBrowsersPerInstance": 2, "defaultPageLoadTimeSeconds": 120, "billingEnabled": True, "signUpUrl": "", diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index
eaafb10c1e..b4a70a12bb 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -53,6 +53,64 @@ def test_add_crawl_config(crawler_auth_headers, default_org_id, sample_crawl_dat cid = data["id"] +def test_verify_default_browser_windows( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 2 + + +def test_custom_browser_windows( + crawler_auth_headers, default_org_id, sample_crawl_data +): + sample_crawl_data["browserWindows"] = 4 + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + workflow_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 4 + + +def test_custom_scale(crawler_auth_headers, default_org_id, sample_crawl_data): + sample_crawl_data["scale"] = 3 + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + workflow_id = r.json()["id"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{workflow_id}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 6 + + def test_update_name_only(crawler_auth_headers, default_org_id): # update name only r = requests.patch( @@ -326,6 +384,44 @@ def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_craw assert data["maxCrawlSize"] == 4096 +def 
test_update_browser_windows(crawler_auth_headers, default_org_id): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"browserWindows": 1}, + ) + assert r.status_code == 200 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 1 + + +def test_update_scale(crawler_auth_headers, default_org_id): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"scale": 1}, + ) + assert r.status_code == 200 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 2 + + def test_verify_delete_tags(crawler_auth_headers, default_org_id): # Verify that deleting tags and name works as well r = requests.patch( @@ -354,9 +450,9 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id): assert r.status_code == 200 data = r.json() - assert data["total"] == 3 + assert data["total"] == 5 items = data["items"] - assert len(items) == 3 + assert len(items) == 5 sorted_data = sorted(items, key=lambda revision: revision["rev"]) assert sorted_data[0]["config"]["scopeType"] == "prefix" diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py index fc104197c0..77c7e185c3 100644 --- a/backend/test/test_filter_sort_results.py +++ b/backend/test/test_filter_sort_results.py @@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 5 - assert r.json()["total"] == 5 + 
assert len(r.json()["items"]) == 7 + assert r.json()["total"] == 7 def test_get_config_by_modified_by( @@ -23,8 +23,8 @@ def test_get_config_by_modified_by( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 5 - assert r.json()["total"] == 5 + assert len(r.json()["items"]) == 7 + assert r.json()["total"] == 7 def test_get_configs_by_first_seed( @@ -362,9 +362,9 @@ def test_sort_crawl_configs( headers=crawler_auth_headers, ) data = r.json() - assert data["total"] == 11 + assert data["total"] == 13 items = data["items"] - assert len(items) == 11 + assert len(items) == 13 last_created = None for config in items: diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 33eb785132..82b2ad0071 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -239,6 +239,8 @@ def test_crawl_info(admin_auth_headers, default_org_id): assert data["fileCount"] == 1 assert data["userName"] assert data["version"] == 2 + assert data["scale"] == 1 + assert data["browserWindows"] == 2 def test_crawls_include_seed_info(admin_auth_headers, default_org_id): diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 9f9d966d26..4b749fab92 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -19,6 +19,7 @@ spec: cid: "{{ cid }}" oid: "{{ oid }}" scale: {{ scale }} + browserWindows: {{ browser_windows }} profile_filename: "{{ profile_filename }}" storage_filename: "{{ storage_filename }}" diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 9f01a474a7..4e681840aa 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -50,6 +50,7 @@ data: FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}" MAX_CRAWL_SCALE: "{{ .Values.max_crawl_scale | default 3 }}" + MAX_BROWSER_WINDOWS: "{{ .Values.max_browser_windows 
| default 8 }}" LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}" diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 9e20d433ce..97d3af72fb 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -1,5 +1,7 @@ +{{ $max_browser_windows := not (empty .Values.max_browser_windows) | ternary (int .Values.max_browser_windows) (mul (int .Values.max_crawl_scale) (int .Values.crawler_browser_instances) ) }} -{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} + +{{- range untilStep 0 $max_browser_windows 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass @@ -11,7 +13,7 @@ description: "Priority for crawl instance #{{ . }}" {{- end }} -{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} +{{- range untilStep 0 $max_browser_windows 1 }} --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass diff --git a/chart/values.yaml b/chart/values.yaml index b2f320ce6f..a7ee00a4a6 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -50,8 +50,8 @@ default_crawl_filename_template: "@ts-@hostsuffix.wacz" crawler_extra_args: "" -# max allowed crawl scale per crawl -max_crawl_scale: 3 +# max allowed browser windows per crawl +max_browser_windows: 8 # Cluster Settings diff --git a/frontend/src/__mocks__/api/settings.js b/frontend/src/__mocks__/api/settings.js index a47b266cb7..c1543c6736 100644 --- a/frontend/src/__mocks__/api/settings.js +++ b/frontend/src/__mocks__/api/settings.js @@ -4,7 +4,7 @@ export default { defaultBehaviorTimeSeconds: 0, defaultPageLoadTimeSeconds: 0, maxPagesPerCrawl: 0, - maxScale: 0, + maxBrowserWindows: 0, billingEnabled: true, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/components/screencast.ts b/frontend/src/components/screencast.ts index bf63fec50f..c6d31d978a 100644 --- a/frontend/src/components/screencast.ts +++ b/frontend/src/components/screencast.ts @@ -130,7 +130,16 @@ export class Screencast extends BtrixElement { crawlId?: 
string; @property({ type: Number }) - scale = 1; + browserWindows = 1; + + @property({ type: Number }) + numBrowsersPerInstance = 1; + + @state() + private scale = 1; + + @state() + private lastIndexOffset = 0; // List of browser screens @state() @@ -141,17 +150,23 @@ export class Screencast extends BtrixElement { // Websocket connections private readonly wsMap = new Map(); - // Number of available browsers. - // Multiply by scale to get available browser window count - private browsersCount = 1; private screenWidth = 640; private screenHeight = 480; private readonly timerIds: number[] = []; protected firstUpdated() { + this.updateScale(); + // Connect to websocket server this.connectAll(); } + protected updateScale() { + const remainder = this.browserWindows % this.numBrowsersPerInstance; + this.scale = Math.ceil(this.browserWindows / this.numBrowsersPerInstance); + this.lastIndexOffset = remainder + ? (this.scale - 1) * (this.numBrowsersPerInstance - remainder) + : 0; + } async updated( changedProperties: PropertyValues & Map, @@ -164,9 +179,12 @@ export class Screencast extends BtrixElement { this.disconnectAll(); this.connectAll(); } - const prevScale = changedProperties.get("scale"); - if (prevScale !== undefined) { - if (this.scale > prevScale) { + if (changedProperties.has("browserWindows")) { + this.updateScale(); + } + const prevWindows = changedProperties.get("browserWindows"); + if (prevWindows !== undefined) { + if (this.browserWindows > prevWindows) { this.scaleUp(); } else { this.scaleDown(); @@ -181,14 +199,14 @@ export class Screencast extends BtrixElement { } render() { - const screenCount = this.scale * this.browsersCount; + const screenCount = this.browserWindows; return html`
${Array.from({ length: screenCount }).map((_, i) => this.renderScreen(`${i}`), @@ -226,6 +244,7 @@ export class Screencast extends BtrixElement { private readonly renderScreen = (id: string) => { const pageData = this.dataMap[id]; + return html`
(this.focusedScreenData = pageData) : () => {}} >
${pageData?.url || html` `}
-
+
${pageData ? html`` - : html``} + : html`
+ +
`}
`; }; @@ -301,18 +322,21 @@ export class Screencast extends BtrixElement { private handleMessage( message: InitMessage | ScreencastMessage | CloseMessage, + isLast: boolean, ) { if (message.msg === "init") { const dataMap: Record = {}; - for (let i = 0; i < message.browsers * this.scale; i++) { + for (let i = 0; i < this.browserWindows; i++) { dataMap[i] = null; } this.dataMap = dataMap; - this.browsersCount = message.browsers; this.screenWidth = message.width; this.screenHeight = message.height; } else { - const { id } = message; + let { id } = message; + if (isLast) { + id += this.lastIndexOffset; + } const dataMap = { ...this.dataMap }; if (message.msg === "screencast") { @@ -348,6 +372,7 @@ export class Screencast extends BtrixElement { ws.addEventListener("message", ({ data }: MessageEvent) => { this.handleMessage( JSON.parse(data) as InitMessage | ScreencastMessage | CloseMessage, + index === this.scale - 1, ); }); diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 45009e21a1..599c54b4eb 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -251,9 +251,7 @@ export class ConfigDetails extends BtrixElement { )} ${this.renderSetting( msg("Browser Windows"), - crawlConfig?.scale && this.appState.settings - ? `${crawlConfig.scale * this.appState.settings.numBrowsers}` - : "", + crawlConfig?.browserWindows ? 
`${crawlConfig.browserWindows}` : "", )} ${this.renderSetting( msg("Crawler Channel (Exact Crawler Version)"), diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index f0ea0d1a03..0a1271c99f 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -33,7 +33,6 @@ import { } from "lit/decorators.js"; import { ifDefined } from "lit/directives/if-defined.js"; import { map } from "lit/directives/map.js"; -import { range } from "lit/directives/range.js"; import { when } from "lit/directives/when.js"; import compact from "lodash/fp/compact"; import flow from "lodash/fp/flow"; @@ -115,6 +114,7 @@ import { getInitialFormState, getServerDefaults, makeUserGuideEvent, + rangeBrowserWindows, SECTIONS, workflowTabToGuideHash, type FormState, @@ -1581,20 +1581,18 @@ https://archiveweb.page/images/${"logo.svg"}`} this.updateFormState({ - scale: +(e.target as SlCheckbox).value, + browserWindows: +(e.target as SlCheckbox).value, })} > - ${when(this.appState.settings?.numBrowsers, (numBrowsers) => - map( - range(this.orgDefaults.maxScale), - (i: number) => - html` ${(i + 1) * numBrowsers}`, - ), + ${map( + rangeBrowserWindows(this.appState.settings), + (i: number) => + html` ${i}`, )} `)} @@ -2550,7 +2548,7 @@ https://archiveweb.page/images/${"logo.svg"}`} jobType: "custom", name: this.formState.jobName || "", description: this.formState.description, - scale: this.formState.scale, + browserWindows: this.formState.browserWindows, profileid: this.formState.browserProfile?.id || "", schedule: this.formState.scheduleType === "cron" ? 
this.utcSchedule : "", crawlTimeout: this.formState.crawlTimeoutMinutes * 60, diff --git a/frontend/src/index.test.ts b/frontend/src/index.test.ts index 3fdd863c88..e09bc96a8d 100644 --- a/frontend/src/index.test.ts +++ b/frontend/src/index.test.ts @@ -40,8 +40,7 @@ const mockAppSettings: AppSettings = { defaultBehaviorTimeSeconds: 300, defaultPageLoadTimeSeconds: 120, maxPagesPerCrawl: 50000, - numBrowsers: 2, - maxScale: 3, + maxBrowserWindows: 4, billingEnabled: false, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/index.ts b/frontend/src/index.ts index a5b0246218..e6ba75afed 100644 --- a/frontend/src/index.ts +++ b/frontend/src/index.ts @@ -885,7 +885,8 @@ export class App extends BtrixElement { class="w-full" .viewStateData=${this.viewState.data} .params=${this.viewState.params} - .maxScale=${this.appState.settings?.maxScale || DEFAULT_MAX_SCALE} + .maxBrowserWindows=${this.appState.settings?.maxBrowserWindows || + DEFAULT_MAX_SCALE} orgPath=${orgPath.split(slug)[1]} orgTab=${orgTab} >`; diff --git a/frontend/src/pages/org/index.ts b/frontend/src/pages/org/index.ts index e6dcdf2fb2..ef7f449baf 100644 --- a/frontend/src/pages/org/index.ts +++ b/frontend/src/pages/org/index.ts @@ -113,7 +113,7 @@ export class Org extends BtrixElement { orgTab?: OrgTab | string; @property({ type: Number }) - maxScale: number = DEFAULT_MAX_SCALE; + maxBrowserWindows: number = DEFAULT_MAX_SCALE; @state() private openDialogName?: ResourceName; @@ -536,7 +536,7 @@ export class Org extends BtrixElement { openDialogName=${this.viewStateData?.dialog} ?isEditing=${isEditing} ?isCrawler=${this.appState.isCrawler} - .maxScale=${this.maxScale} + .maxBrowserWindows=${this.maxBrowserWindows} > `; } diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index f90d6f0330..b62d99d7da 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -41,6 +41,7 @@ import { humanizeExecutionSeconds } from 
"@/utils/executionTimeFormatter"; import { isArchivingDisabled } from "@/utils/orgs"; import { pluralOf } from "@/utils/pluralize"; import { tw } from "@/utils/tailwind"; +import { rangeBrowserWindows } from "@/utils/workflow"; const POLL_INTERVAL_SECONDS = 10; const CRAWLS_PAGINATION_NAME = "crawlsPage"; @@ -78,7 +79,7 @@ export class WorkflowDetail extends BtrixElement { | "deleteCrawl"; @property({ type: Number }) - maxScale = DEFAULT_MAX_SCALE; + maxBrowserWindows = DEFAULT_MAX_SCALE; @state() private lastCrawlId: Workflow["lastCrawlId"] = null; @@ -1591,8 +1592,7 @@ export class WorkflowDetail extends BtrixElement { if (!this.isCrawler) return; const enableEditBrowserWindows = !this.workflow.lastCrawlStopping; - const windowCount = - this.workflow.scale * (this.appState.settings?.numBrowsers || 1); + const windowCount = this.workflow.browserWindows || 1; return html`
@@ -1789,7 +1789,9 @@ export class WorkflowDetail extends BtrixElement {
@@ -2002,10 +2004,10 @@ export class WorkflowDetail extends BtrixElement { const scaleOptions = []; if (this.appState.settings) { - for (let value = 1; value <= this.maxScale; value++) { + for (const value of rangeBrowserWindows(this.appState.settings)) { scaleOptions.push({ value, - label: value * this.appState.settings.numBrowsers, + label: value, }); } } @@ -2017,7 +2019,7 @@ export class WorkflowDetail extends BtrixElement { "Change the number of browser windows crawling in parallel. This change will take effect immediately on the currently running crawl and update crawl workflow settings.", )}

- + ${scaleOptions.map( ({ value, label }) => html` ( - `/orgs/${this.orgId}/crawls/${this.lastCrawlId}/scale`, - { - method: "POST", - body: JSON.stringify({ scale: +value }), - signal, - }, - ); + const data = await this.api.fetch<{ + scaled: boolean; + browserWindows: number; + }>(`/orgs/${this.orgId}/crawls/${this.lastCrawlId}/scale`, { + method: "POST", + body: JSON.stringify({ browserWindows: +value }), + signal, + }); if (data.scaled) { this.notify.toast({ diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 5b274f2ccb..b63112019f 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -66,7 +66,7 @@ export class WorkflowsNew extends LiteElement { crawlTimeout: null, maxCrawlSize: null, jobType: "custom", - scale: 1, + browserWindows: this.appState.settings?.numBrowsersPerInstance || 1, autoAddCollections: [], crawlerChannel: "default", proxyId: null, diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index f348bb5559..a903be965d 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -56,7 +56,7 @@ export type WorkflowParams = { jobType?: JobType; name: string; schedule: string; - scale: number; + browserWindows: number; profileid: string | null; config: SeedConfig; tags: string[]; @@ -193,6 +193,7 @@ export type Crawl = ArchivedItemBase & schedule: string; manual: boolean; scale: number; + browserWindows: number; shouldPause: boolean | null; resources?: { name: string; diff --git a/frontend/src/utils/app.ts b/frontend/src/utils/app.ts index d1bd85347a..e6c8670aeb 100644 --- a/frontend/src/utils/app.ts +++ b/frontend/src/utils/app.ts @@ -8,8 +8,8 @@ export type AppSettings = { defaultBehaviorTimeSeconds: number; defaultPageLoadTimeSeconds: number; maxPagesPerCrawl: number; - numBrowsers: number; - maxScale: number; + numBrowsersPerInstance: number; + maxBrowserWindows: number; billingEnabled: boolean; signUpUrl: 
string; salesEmail: string; @@ -37,8 +37,8 @@ export async function getAppSettings(): Promise { defaultBehaviorTimeSeconds: 0, defaultPageLoadTimeSeconds: 0, maxPagesPerCrawl: 0, - numBrowsers: 1, - maxScale: 0, + numBrowsersPerInstance: 1, + maxBrowserWindows: 4, billingEnabled: false, signUpUrl: "", salesEmail: "", diff --git a/frontend/src/utils/crawler.ts b/frontend/src/utils/crawler.ts index 2b2003aa0d..7d90fa3a05 100644 --- a/frontend/src/utils/crawler.ts +++ b/frontend/src/utils/crawler.ts @@ -19,7 +19,7 @@ export const activeCrawlStates = RUNNING_AND_WAITING_STATES; export const finishedCrawlStates = SUCCESSFUL_STATES; export const inactiveCrawlStates = SUCCESSFUL_AND_FAILED_STATES; -export const DEFAULT_MAX_SCALE = 3; +export const DEFAULT_MAX_SCALE = 8; export const DEPTH_SUPPORTED_SCOPES = [ "prefix", diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index 5946f5b090..c31700ed8a 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -1,7 +1,7 @@ import { msg, str } from "@lit/localize"; import { z } from "zod"; -import { getAppSettings } from "./app"; +import { getAppSettings, type AppSettings } from "./app"; import type { Tags } from "@/components/ui/tag-input"; import type { UserGuideEventMap } from "@/index"; @@ -107,7 +107,7 @@ export type FormState = { | (typeof NewWorkflowOnlyScopeType)[keyof typeof NewWorkflowOnlyScopeType]; exclusions: WorkflowParams["config"]["exclude"]; pageLimit: WorkflowParams["config"]["limit"]; - scale: WorkflowParams["scale"]; + browserWindows: WorkflowParams["browserWindows"]; blockAds: WorkflowParams["config"]["blockAds"]; lang: WorkflowParams["config"]["lang"]; scheduleType: "date" | "cron" | "none"; @@ -140,11 +140,11 @@ export type WorkflowDefaults = { behaviorTimeoutSeconds?: number; pageLoadTimeoutSeconds?: number; maxPagesPerCrawl?: number; - maxScale: number; + maxBrowserWindows: number; }; export const appDefaults: WorkflowDefaults = { - maxScale: 
DEFAULT_MAX_SCALE, + maxBrowserWindows: DEFAULT_MAX_SCALE, }; export const getDefaultFormState = (): FormState => ({ @@ -164,7 +164,7 @@ export const getDefaultFormState = (): FormState => ({ scopeType: ScopeType.Page, exclusions: [], pageLimit: null, - scale: 1, + browserWindows: 2, blockAds: true, lang: getDefaultLang(), scheduleType: "none", @@ -306,7 +306,7 @@ export function getInitialFormState(params: { postLoadDelaySeconds: seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds, maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth, - scale: params.initialWorkflow.scale, + browserWindows: params.initialWorkflow.browserWindows, blockAds: params.initialWorkflow.config.blockAds, lang: params.initialWorkflow.config.lang ?? defaultFormState.lang, scheduleType: defaultFormState.scheduleType, @@ -365,8 +365,8 @@ export async function getServerDefaults(): Promise { if (data.maxPagesPerCrawl > 0) { defaults.maxPagesPerCrawl = data.maxPagesPerCrawl; } - if (data.maxScale) { - defaults.maxScale = data.maxScale; + if (data.maxBrowserWindows) { + defaults.maxBrowserWindows = data.maxBrowserWindows; } return defaults; @@ -376,3 +376,26 @@ export async function getServerDefaults(): Promise { return defaults; } + +export function* rangeBrowserWindows( + settings: AppSettings | null, +): Iterable { + if (!settings) { + yield 1; + return; + } + + const { numBrowsersPerInstance, maxBrowserWindows } = settings; + + for (let i = 1; i < numBrowsersPerInstance; i++) { + yield i; + } + + for ( + let i = numBrowsersPerInstance; + i <= maxBrowserWindows; + i += numBrowsersPerInstance + ) { + yield i; + } +}