From 5002aba82f76f6cb69890197342d0f2b27cc3a6a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 10 Jun 2025 19:30:38 -0700 Subject: [PATCH 01/10] additional scale / browser window cleanup to properly support QA: - use qa_num_browser_windows to set exact number of QA browsers, fallback to qa_scale - set num_browser_windows and num_browsers_per_pod using crawler / qa values depending if QA crawl - scale_from_browser_windows() accepts optional browsers_per_pod if dealing with possible QA override - store 'desiredScale' in CrawlStatus to avoid recomputing for later scale resolving - ensure status.scale is always the actual scale observed --- backend/btrixcloud/operator/crawls.py | 65 +++++++++++++-------------- backend/btrixcloud/operator/models.py | 8 +++- backend/btrixcloud/utils.py | 10 +++-- chart/templates/configmap.yaml | 2 +- chart/values.yaml | 4 +- 5 files changed, 45 insertions(+), 44 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index c52314d0a5..4a87c181da 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -101,8 +101,6 @@ class CrawlOperator(BaseOperator): paused_expires_delta: timedelta - num_browsers_per_pod: int - def __init__(self, *args): super().__init__(*args) @@ -127,8 +125,6 @@ def __init__(self, *args): self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes) - self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - def init_routes(self, app): """init routes for this operator""" @@ -188,9 +184,6 @@ async def sync_crawls(self, data: MCSyncData): is_single_page=spec.get("isSinglePage") == "1", ) - if crawl.qa_source_crawl_id: - crawl.browser_windows = int(params.get("qa_scale", 1)) - # if finalizing, crawl is being deleted if data.finalizing: if not status.finished: @@ -259,8 +252,13 @@ async def sync_crawls(self, data: MCSyncData): "starting", status, crawl, allowed_from=["waiting_org_limit"] ) - if len(pods): 
+ status.scale = len(pods) + if status.scale: for pod_name, pod in pods.items(): + # don't count redis pod + if pod_name == f"redis-{crawl_id}": + status.scale -= 1 + self.sync_resources(status, pod_name, pod, data.children) status = await self.sync_crawl_state(redis_url, crawl, status, pods, data) @@ -282,9 +280,6 @@ async def sync_crawls(self, data: MCSyncData): pods, crawl, status, EXEC_TIME_UPDATE_SECS ) - else: - status.scale = 1 - # stopping paused crawls if crawl.paused_at: stop_reason: Optional[StopReason] = None @@ -367,22 +362,28 @@ async def sync_crawls(self, data: MCSyncData): if crawl.qa_source_crawl_id: params["qa_source_crawl_id"] = crawl.qa_source_crawl_id children.extend(await self._load_qa_configmap(params, data.children)) + num_browsers_per_pod = int(params["qa_browser_instances"]) + num_browser_windows = int(params.get("qa_num_browser_windows", 1)) + else: + num_browsers_per_pod = int(params["crawler_browser_instances"]) + num_browser_windows = crawl.browser_windows - is_paused = bool(crawl.paused_at) and status.state == "paused" + # desired scale is the number of pods to create + status.desiredScale = scale_from_browser_windows( + num_browser_windows, num_browsers_per_pod + ) - # crawl_scale is the number of pods to create - crawler_scale = scale_from_browser_windows(crawl.browser_windows) + is_paused = bool(crawl.paused_at) and status.state == "paused" - for i in range(0, crawler_scale): - if status.pagesFound < i * self.num_browsers_per_pod: + for i in range(0, status.desiredScale): + if status.pagesFound < i * num_browsers_per_pod: break children.extend( self._load_crawler( params, i, - crawler_scale, - crawl.browser_windows, + num_browser_windows, status, data.children, is_paused, @@ -498,7 +499,6 @@ def _load_crawler( self, params, i: int, - total_pods: int, total_browser_windows: int, status: CrawlStatus, children, @@ -506,6 +506,7 @@ def _load_crawler( ): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] + total_pods = 
status.desiredScale if params.get("qa_source_crawl_id"): cpu_field = "qa_cpu" @@ -581,37 +582,31 @@ def _qa_configmap_update_needed(self, name, configmap): return False - # pylint: disable=too-many-arguments - async def _resolve_scale( + async def _resolve_scale_down( self, crawl: CrawlSpec, redis: Redis, status: CrawlStatus, pods: dict[str, dict], ): - """Resolve scale - If desired_scale >= actual scale, just set (also limit by number of pages - found). + """Resolve scale down + Limit desired scale to number of pages + If desired_scale >= actual scale, just return If desired scale < actual scale, attempt to shut down each crawl instance via redis setting. If contiguous instances shutdown (successful exit), lower scale and clean up previous scale state. """ - - desired_scale = scale_from_browser_windows(crawl.browser_windows) + desired_scale = status.desiredScale + actual_scale = status.scale if status.pagesFound < desired_scale: desired_scale = max(1, status.pagesFound) - if desired_scale == status.scale: - return status.scale + if desired_scale == actual_scale: + return actual_scale crawl_id = crawl.id - # actual scale (minus redis pod) - actual_scale = len(pods) - if pods.get(f"redis-{crawl_id}"): - actual_scale -= 1 - # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: return desired_scale @@ -1544,8 +1539,8 @@ async def update_crawl_state( f"Crawl gracefully stopping: {status.stopReason}, id: {crawl.id}" ) - # resolve scale - await self._resolve_scale(crawl, redis, status, pods) + # resolve scale down, if needed + await self._resolve_scale_down(crawl, redis, status, pods) # check if done / failed status_count: dict[str, int] = {} diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index d98db940d6..944d66a9c5 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -209,8 +209,12 @@ class CrawlStatus(BaseModel): size: int = 0 # human 
readable size string sizeHuman: str = "" - # number of pods - scale: int = 1 + + # actual observed scale (number of pods active) + scale: int = 0 + # desired scale as computed by crawl state (number of pods that should be active) + desiredScale: int = 0 + filesAdded: int = 0 filesAddedSize: int = 0 finished: Optional[str] = None diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 4dda18792e..9b8935ff48 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -24,6 +24,8 @@ default_origin = os.environ.get("APP_ORIGIN", "") +browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + class JSONSerializer(json.JSONEncoder): """Serializer class for json.dumps with UUID and datetime support""" @@ -203,13 +205,13 @@ def validate_language_code(lang: str): raise HTTPException(status_code=400, detail="invalid_lang") -def scale_from_browser_windows(browser_windows: int) -> int: +def scale_from_browser_windows( + browser_windows: int, custom_browsers_per_pod=None +) -> int: """Return number of pods for given number of browser windows""" - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - return math.ceil(browser_windows / browsers_per_pod) + return math.ceil(browser_windows / (custom_browsers_per_pod or browsers_per_pod)) def browser_windows_from_scale(scale: int) -> int: """Return number of browser windows from specified scale""" - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) return scale * browsers_per_pod diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 4e681840aa..a7d2da0211 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -174,7 +174,7 @@ data: profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}" - qa_scale: "{{ .Values.qa_scale | default 1 }}" + qa_num_browser_windows: "{{ .Values.qa_num_browser_windows | default (.Values.qa_scale | default 1) }}" crawler_node_type: "{{ .Values.crawler_node_type 
}}" redis_node_type: "{{ .Values.redis_node_type }}" diff --git a/chart/values.yaml b/chart/values.yaml index 5f19518684..d51260ec4f 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -281,8 +281,8 @@ crawler_browser_instances: 2 # defaults to 'crawler_browser_instances' if not set qa_browser_instances: 1 -# fixed scale (number of QA pods) to run -qa_scale: 1 +# number of browser windows to run for QA (with 'qa_browser_instances' per pod) +qa_num_browser_windows: 2 # this value is added to crawler_cpu_base, for each additional browser # crawler_cpu = crawler_cpu_base + crawler_pu_per_extra_browser * (crawler_browser_instances - 1) From 4839ce4c700e7a3a5996790414fab32a6a05b5c5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 00:29:57 -0700 Subject: [PATCH 02/10] default scale to 1 --- backend/btrixcloud/operator/crawls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 4a87c181da..b61eafca6d 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -279,6 +279,8 @@ async def sync_crawls(self, data: MCSyncData): await self.increment_pod_exec_time( pods, crawl, status, EXEC_TIME_UPDATE_SECS ) + else: + status.scale = 1 # stopping paused crawls if crawl.paused_at: From e845810d6de502042027c98063578ea0edb93efc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 08:14:36 -0700 Subject: [PATCH 03/10] undo some changes --- backend/btrixcloud/operator/baseoperator.py | 1 + backend/btrixcloud/operator/crawls.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index 8351c55d12..75b33dad03 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -57,6 +57,7 @@ def compute_crawler_resources(self) -> None: except: # default to 1 for 
now for best results (to revisit in the future) qa_num_workers = 1 + p["qa_browser_instances"] = 1 crawler_memory, crawler_cpu = self.compute_for_num_browsers( num_workers, p.get("crawler_memory"), p.get("crawler_cpu") diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index b61eafca6d..d115435b8e 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -279,8 +279,6 @@ async def sync_crawls(self, data: MCSyncData): await self.increment_pod_exec_time( pods, crawl, status, EXEC_TIME_UPDATE_SECS ) - else: - status.scale = 1 # stopping paused crawls if crawl.paused_at: @@ -590,7 +588,7 @@ async def _resolve_scale_down( redis: Redis, status: CrawlStatus, pods: dict[str, dict], - ): + ) -> None: """Resolve scale down Limit desired scale to number of pages If desired_scale >= actual scale, just return @@ -605,13 +603,18 @@ async def _resolve_scale_down( desired_scale = max(1, status.pagesFound) if desired_scale == actual_scale: - return actual_scale + return crawl_id = crawl.id + # actual scale (minus redis pod) + actual_scale = len(pods) + if pods.get(f"redis-{crawl_id}"): + actual_scale -= 1 + # if desired_scale same or scaled up, return desired_scale if desired_scale >= actual_scale: - return desired_scale + return new_scale = actual_scale for i in range(actual_scale - 1, desired_scale - 1, -1): @@ -638,8 +641,6 @@ async def _resolve_scale_down( await redis.hdel(f"{crawl_id}:stopone", name) await redis.hdel(f"{crawl_id}:status", name) - return new_scale - def sync_resources(self, status, name, pod, children): """set crawljob status from current resources""" resources = status.podStatus[name].allocated @@ -1042,6 +1043,7 @@ def sync_pod_status( crawler_running = False redis_running = False pod_done_count = 0 + scale_count = 0 try: for name, pod in pods.items(): @@ -1070,9 +1072,12 @@ def sync_pod_status( crawler_running = crawler_running or running if phase == "Succeeded": pod_done_count 
+= 1 + scale_count += 1 elif role == "redis": redis_running = redis_running or running + status.scale = scale_count + # pylint: disable=broad-except except Exception as exc: print(exc) From 36b2dc0ca8d5e4547458c0201735386a06062010 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 08:42:02 -0700 Subject: [PATCH 04/10] don't recompute? --- backend/btrixcloud/operator/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d115435b8e..7f8bf64f92 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1076,7 +1076,7 @@ def sync_pod_status( elif role == "redis": redis_running = redis_running or running - status.scale = scale_count + #status.scale = scale_count # pylint: disable=broad-except except Exception as exc: From 0e0e25e05b01273c6edcdb545c5c2ea9ec29f69a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 10:02:14 -0700 Subject: [PATCH 05/10] test with redis- prefix --- backend/btrixcloud/operator/crawls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 7f8bf64f92..9d1600897a 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -256,7 +256,7 @@ async def sync_crawls(self, data: MCSyncData): if status.scale: for pod_name, pod in pods.items(): # don't count redis pod - if pod_name == f"redis-{crawl_id}": + if pod_name.startswith("redis-") status.scale -= 1 self.sync_resources(status, pod_name, pod, data.children) @@ -608,9 +608,9 @@ async def _resolve_scale_down( crawl_id = crawl.id # actual scale (minus redis pod) - actual_scale = len(pods) - if pods.get(f"redis-{crawl_id}"): - actual_scale -= 1 + # actual_scale = len(pods) + #if pods.get(f"redis-{crawl_id}"): + # actual_scale -= 1 # if desired_scale same or scaled up, return 
desired_scale if desired_scale >= actual_scale: From 1231f70c563734b2b6828570e63eb185f3a05310 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 10:16:11 -0700 Subject: [PATCH 06/10] fix typo --- backend/btrixcloud/operator/crawls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 9d1600897a..8986a2f35f 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -256,7 +256,7 @@ async def sync_crawls(self, data: MCSyncData): if status.scale: for pod_name, pod in pods.items(): # don't count redis pod - if pod_name.startswith("redis-") + if pod_name.startswith("redis-"): status.scale -= 1 self.sync_resources(status, pod_name, pod, data.children) @@ -609,7 +609,7 @@ async def _resolve_scale_down( # actual scale (minus redis pod) # actual_scale = len(pods) - #if pods.get(f"redis-{crawl_id}"): + # if pods.get(f"redis-{crawl_id}"): # actual_scale -= 1 # if desired_scale same or scaled up, return desired_scale @@ -1076,7 +1076,7 @@ def sync_pod_status( elif role == "redis": redis_running = redis_running or running - #status.scale = scale_count + # status.scale = scale_count # pylint: disable=broad-except except Exception as exc: From ab6610194a28e77a622f9bcb1e076b71aa4f27df Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 10:50:36 -0700 Subject: [PATCH 07/10] cleanup --- backend/btrixcloud/operator/crawls.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 8986a2f35f..2547f662c9 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -607,11 +607,6 @@ async def _resolve_scale_down( crawl_id = crawl.id - # actual scale (minus redis pod) - # actual_scale = len(pods) - # if pods.get(f"redis-{crawl_id}"): - # actual_scale -= 1 - # if desired_scale same or scaled up, 
return desired_scale if desired_scale >= actual_scale: return @@ -1043,7 +1038,6 @@ def sync_pod_status( crawler_running = False redis_running = False pod_done_count = 0 - scale_count = 0 try: for name, pod in pods.items(): @@ -1072,12 +1066,9 @@ def sync_pod_status( crawler_running = crawler_running or running if phase == "Succeeded": pod_done_count += 1 - scale_count += 1 elif role == "redis": redis_running = redis_running or running - # status.scale = scale_count - # pylint: disable=broad-except except Exception as exc: print(exc) From 6ede4a221a6e15501affa002d8ce41f7029c9b81 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 15:08:57 -0700 Subject: [PATCH 08/10] test: increase sleep() --- backend/test/test_qa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py index d6034bc3e3..0fc6e9c289 100644 --- a/backend/test/test_qa.py +++ b/backend/test/test_qa.py @@ -42,7 +42,7 @@ def qa_run_pages_ready(qa_crawl_id, crawler_auth_headers, default_org_id, qa_run if count + 1 == MAX_ATTEMPTS: assert False - time.sleep(5) + time.sleep(10) count += 1 # Wait until pages are ready From 4e47d67b5b455d4d063994583745a398e2a7dc00 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 15:34:42 -0700 Subject: [PATCH 09/10] check desiredScale by page size --- backend/btrixcloud/operator/crawls.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 2547f662c9..f80c490079 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -373,6 +373,9 @@ async def sync_crawls(self, data: MCSyncData): num_browser_windows, num_browsers_per_pod ) + if status.pagesFound < desired_scale: + desired_scale = max(1, status.pagesFound) + is_paused = bool(crawl.paused_at) and status.state == "paused" for i in range(0, status.desiredScale): @@ -599,18 +602,12 @@ 
async def _resolve_scale_down( desired_scale = status.desiredScale actual_scale = status.scale - if status.pagesFound < desired_scale: - desired_scale = max(1, status.pagesFound) - - if desired_scale == actual_scale: + # if not scaling down, just return + if desired_scale >= actual_scale: return crawl_id = crawl.id - # if desired_scale same or scaled up, return desired_scale - if desired_scale >= actual_scale: - return - new_scale = actual_scale for i in range(actual_scale - 1, desired_scale - 1, -1): name = f"crawl-{crawl_id}-{i}" From 6fc1019f0018739aef4b7b5cc03ecea84ea4694b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jun 2025 15:50:29 -0700 Subject: [PATCH 10/10] typo fix --- backend/btrixcloud/operator/crawls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index f80c490079..b9fdd7f501 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -373,8 +373,8 @@ async def sync_crawls(self, data: MCSyncData): num_browser_windows, num_browsers_per_pod ) - if status.pagesFound < desired_scale: - desired_scale = max(1, status.pagesFound) + if status.pagesFound < status.desiredScale: + status.desiredScale = max(1, status.pagesFound) is_paused = bool(crawl.paused_at) and status.state == "paused"