Skip to content

additional scale / browser window cleanup to properly support QA: #2663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/btrixcloud/operator/baseoperator.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def compute_crawler_resources(self) -> None:
except:
# default to 1 for now for best results (to revisit in the future)
qa_num_workers = 1
p["qa_browser_instances"] = 1

crawler_memory, crawler_cpu = self.compute_for_num_browsers(
num_workers, p.get("crawler_memory"), p.get("crawler_cpu")
Expand Down
80 changes: 35 additions & 45 deletions backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,6 @@ class CrawlOperator(BaseOperator):

paused_expires_delta: timedelta

num_browsers_per_pod: int

def __init__(self, *args):
super().__init__(*args)

Expand All @@ -127,8 +125,6 @@ def __init__(self, *args):

self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)

self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))

def init_routes(self, app):
"""init routes for this operator"""

Expand Down Expand Up @@ -188,9 +184,6 @@ async def sync_crawls(self, data: MCSyncData):
is_single_page=spec.get("isSinglePage") == "1",
)

if crawl.qa_source_crawl_id:
crawl.browser_windows = int(params.get("qa_scale", 1))

# if finalizing, crawl is being deleted
if data.finalizing:
if not status.finished:
Expand Down Expand Up @@ -259,8 +252,13 @@ async def sync_crawls(self, data: MCSyncData):
"starting", status, crawl, allowed_from=["waiting_org_limit"]
)

if len(pods):
status.scale = len(pods)
if status.scale:
for pod_name, pod in pods.items():
# don't count redis pod
if pod_name.startswith("redis-"):
status.scale -= 1

self.sync_resources(status, pod_name, pod, data.children)

status = await self.sync_crawl_state(redis_url, crawl, status, pods, data)
Expand All @@ -282,9 +280,6 @@ async def sync_crawls(self, data: MCSyncData):
pods, crawl, status, EXEC_TIME_UPDATE_SECS
)

else:
status.scale = 1

# stopping paused crawls
if crawl.paused_at:
stop_reason: Optional[StopReason] = None
Expand Down Expand Up @@ -367,22 +362,31 @@ async def sync_crawls(self, data: MCSyncData):
if crawl.qa_source_crawl_id:
params["qa_source_crawl_id"] = crawl.qa_source_crawl_id
children.extend(await self._load_qa_configmap(params, data.children))
num_browsers_per_pod = int(params["qa_browser_instances"])
num_browser_windows = int(params.get("qa_num_browser_windows", 1))
else:
num_browsers_per_pod = int(params["crawler_browser_instances"])
num_browser_windows = crawl.browser_windows

is_paused = bool(crawl.paused_at) and status.state == "paused"
# desired scale is the number of pods to create
status.desiredScale = scale_from_browser_windows(
num_browser_windows, num_browsers_per_pod
)

if status.pagesFound < status.desiredScale:
status.desiredScale = max(1, status.pagesFound)

# crawl_scale is the number of pods to create
crawler_scale = scale_from_browser_windows(crawl.browser_windows)
is_paused = bool(crawl.paused_at) and status.state == "paused"

for i in range(0, crawler_scale):
if status.pagesFound < i * self.num_browsers_per_pod:
for i in range(0, status.desiredScale):
if status.pagesFound < i * num_browsers_per_pod:
break

children.extend(
self._load_crawler(
params,
i,
crawler_scale,
crawl.browser_windows,
num_browser_windows,
status,
data.children,
is_paused,
Expand Down Expand Up @@ -498,14 +502,14 @@ def _load_crawler(
self,
params,
i: int,
total_pods: int,
total_browser_windows: int,
status: CrawlStatus,
children,
is_paused: bool,
):
name = f"crawl-{params['id']}-{i}"
has_pod = name in children[POD]
total_pods = status.desiredScale

if params.get("qa_source_crawl_id"):
cpu_field = "qa_cpu"
Expand Down Expand Up @@ -581,41 +585,29 @@ def _qa_configmap_update_needed(self, name, configmap):

return False

# pylint: disable=too-many-arguments
async def _resolve_scale(
async def _resolve_scale_down(
self,
crawl: CrawlSpec,
redis: Redis,
status: CrawlStatus,
pods: dict[str, dict],
):
"""Resolve scale
If desired_scale >= actual scale, just set (also limit by number of pages
found).
) -> None:
"""Resolve scale down
Limit desired scale to number of pages
If desired_scale >= actual scale, just return
If desired scale < actual scale, attempt to shut down each crawl instance
via redis setting. If contiguous instances shutdown (successful exit), lower
scale and clean up previous scale state.
"""
desired_scale = status.desiredScale
actual_scale = status.scale

desired_scale = scale_from_browser_windows(crawl.browser_windows)

if status.pagesFound < desired_scale:
desired_scale = max(1, status.pagesFound)

if desired_scale == status.scale:
return status.scale
# if not scaling down, just return
if desired_scale >= actual_scale:
return

crawl_id = crawl.id

# actual scale (minus redis pod)
actual_scale = len(pods)
if pods.get(f"redis-{crawl_id}"):
actual_scale -= 1

# if desired_scale same or scaled up, return desired_scale
if desired_scale >= actual_scale:
return desired_scale

new_scale = actual_scale
for i in range(actual_scale - 1, desired_scale - 1, -1):
name = f"crawl-{crawl_id}-{i}"
Expand All @@ -641,8 +633,6 @@ async def _resolve_scale(
await redis.hdel(f"{crawl_id}:stopone", name)
await redis.hdel(f"{crawl_id}:status", name)

return new_scale

def sync_resources(self, status, name, pod, children):
"""set crawljob status from current resources"""
resources = status.podStatus[name].allocated
Expand Down Expand Up @@ -1544,8 +1534,8 @@ async def update_crawl_state(
f"Crawl gracefully stopping: {status.stopReason}, id: {crawl.id}"
)

# resolve scale
await self._resolve_scale(crawl, redis, status, pods)
# resolve scale down, if needed
await self._resolve_scale_down(crawl, redis, status, pods)

# check if done / failed
status_count: dict[str, int] = {}
Expand Down
8 changes: 6 additions & 2 deletions backend/btrixcloud/operator/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,12 @@ class CrawlStatus(BaseModel):
size: int = 0
# human readable size string
sizeHuman: str = ""
# number of pods
scale: int = 1

# actual observed scale (number of pods active)
scale: int = 0
# desired scale as computed by crawl state (number of pods that should be active)
desiredScale: int = 0

filesAdded: int = 0
filesAddedSize: int = 0
finished: Optional[str] = None
Expand Down
10 changes: 6 additions & 4 deletions backend/btrixcloud/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

default_origin = os.environ.get("APP_ORIGIN", "")

browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))


class JSONSerializer(json.JSONEncoder):
"""Serializer class for json.dumps with UUID and datetime support"""
Expand Down Expand Up @@ -203,13 +205,13 @@ def validate_language_code(lang: str):
raise HTTPException(status_code=400, detail="invalid_lang")


def scale_from_browser_windows(
    browser_windows: int, custom_browsers_per_pod: Optional[int] = None
) -> int:
    """Return number of pods needed for the given number of browser windows.

    :param browser_windows: total number of browser windows requested
    :param custom_browsers_per_pod: browsers packed into each pod; when None
        (or 0), falls back to the module-level ``browsers_per_pod`` read from
        the NUM_BROWSERS environment variable
    :returns: pod count, rounded up so every window has a slot
    """
    # ceil so a partial pod's worth of windows still gets its own pod
    return math.ceil(browser_windows / (custom_browsers_per_pod or browsers_per_pod))


def browser_windows_from_scale(scale: int) -> int:
    """Return number of browser windows from specified scale"""
    # each pod runs NUM_BROWSERS browsers (default 1 when unset)
    per_pod = int(os.environ.get("NUM_BROWSERS", 1))
    return per_pod * scale
2 changes: 1 addition & 1 deletion backend/test/test_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def qa_run_pages_ready(qa_crawl_id, crawler_auth_headers, default_org_id, qa_run
if count + 1 == MAX_ATTEMPTS:
assert False

time.sleep(5)
time.sleep(10)
count += 1

# Wait until pages are ready
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ data:

profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}"

qa_scale: "{{ .Values.qa_scale | default 1 }}"
qa_num_browser_windows: "{{ .Values.qa_num_browser_windows | default (.Values.qa_scale | default 1) }}"

crawler_node_type: "{{ .Values.crawler_node_type }}"
redis_node_type: "{{ .Values.redis_node_type }}"
Expand Down
4 changes: 2 additions & 2 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,8 @@ crawler_browser_instances: 2
# defaults to 'crawler_browser_instances' if not set
qa_browser_instances: 1

# fixed scale (number of QA pods) to run
qa_scale: 1
# number of browser windows to run for QA (with 'qa_browser_instances' per pod)
qa_num_browser_windows: 2

# this value is added to crawler_cpu_base, for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_cpu_per_extra_browser * (crawler_browser_instances - 1)
Expand Down
Loading