
Allow users to run crawls with 1 or 2 browser windows #2627


Merged · 57 commits · Jun 3, 2025

Commits
8b9d4bd
Modify backend scale to be number of browser windows
tw4l May 20, 2025
fa339c0
Fix SettingsResponse model
tw4l May 21, 2025
54d91f6
Update frontend for simplified maxBrowserWindows
tw4l May 21, 2025
fc1eafb
Import math
tw4l May 21, 2025
d18f500
Operator fixups
tw4l May 21, 2025
3958bfd
Handle case where scale < workers per pod
tw4l May 21, 2025
a7de12b
Add pylint comment
tw4l May 21, 2025
ece1bc7
Update API settings test
tw4l May 21, 2025
31f5ade
Fix pylint comment
tw4l May 21, 2025
5cd019c
Use crawl.scale, add lots of debug print logging
tw4l May 21, 2025
2561701
Fixups
tw4l May 21, 2025
0396c1c
Consolidate print logging lines
tw4l May 21, 2025
6fd850a
Remove some debug print logging
tw4l May 21, 2025
bf8f1d0
Temp: Debug print log sync_crawls return
tw4l May 21, 2025
551b18b
rebase fix
ikreymer May 22, 2025
27d0fc1
work
ikreymer May 22, 2025
9e1a86e
switch back to last pod
ikreymer May 25, 2025
b7f855d
fix remainder check
ikreymer May 25, 2025
92557ec
rename priorities to use max_browser_windows
ikreymer May 25, 2025
1249ead
Fix screencast window count
tw4l May 26, 2025
13c85be
Rename scale fields to distinguish pods from browser windows
tw4l May 26, 2025
c5da074
Fix linting
tw4l May 26, 2025
92ed072
Undo change to worker index calculation for screenshots
tw4l May 26, 2025
9e67798
Remove unused variable
tw4l May 26, 2025
5f2f859
Add separate browserWindows on backend alongside scale
tw4l May 26, 2025
459e466
Update frontend to use browserWindows not scale
tw4l May 26, 2025
a2756db
Fix types
tw4l May 26, 2025
b0be90b
Calculate browserWindows as needed on org import
tw4l May 26, 2025
b2fd748
Fix backend typing
tw4l May 26, 2025
fd241fe
Fix frontend compilation errors
tw4l May 26, 2025
89f7e39
More backend fixups
tw4l May 26, 2025
2d1b152
Update CrawlSpec model to have both scale and browser windows
tw4l May 26, 2025
b6911ad
Add browser windows to crawl_job
tw4l May 26, 2025
85c5254
Keep default of scale=1 if browser windows/scale not specified
tw4l May 26, 2025
2d0646b
Fix broken org import test
tw4l May 27, 2025
ae6a4dd
cleanup, use desired pod count with page check to avoid starting up m…
ikreymer May 25, 2025
252fc6d
compute resources for remainder browsers
ikreymer May 26, 2025
67cb22a
additional fixes for N browsers:
ikreymer May 27, 2025
9f466b9
ensure backwards compatible with max_crawl_scale if no max_browser_wi…
ikreymer May 27, 2025
ae50e6e
fix tests
ikreymer May 27, 2025
844f253
switch browser windows to text box
ikreymer May 27, 2025
7977977
Calculate scale at time of need instead of storing in db
tw4l May 27, 2025
c6275aa
Update org import for change
tw4l May 27, 2025
eaef86f
More fixups for removing scale from db
tw4l May 27, 2025
bb29cc5
Add migration to convert scale to browserWindows in db
tw4l May 27, 2025
c9014db
frontend: custom range for browser windows, by 1 until browser instan…
ikreymer May 27, 2025
5df33de
Don't unset scale in migration
tw4l May 27, 2025
4922179
Store scale in crawl object
tw4l May 27, 2025
4166e32
Update scale in crawl model when crawl is live rescaled
tw4l May 27, 2025
43551e1
Add some tests
tw4l May 27, 2025
a9ee784
Update expected totals in tests
tw4l May 27, 2025
9fbb99f
Remove outdated pylint comment
tw4l May 27, 2025
d939625
set max width
SuaYoo May 29, 2025
aab9f5a
fix spinner
SuaYoo May 29, 2025
cee8dc9
cleanup:
ikreymer May 29, 2025
435a33e
Update backend/btrixcloud/operator/crawls.py
ikreymer Jun 3, 2025
5e21715
Add deprecated flag to Scale
tw4l Jun 3, 2025
Files changed
17 changes: 15 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -52,6 +52,7 @@
validate_regexes,
validate_language_code,
is_url,
browser_windows_from_scale,
)

if TYPE_CHECKING:
@@ -222,6 +223,12 @@ async def add_crawl_config(
) -> CrawlConfigAddedResponse:
"""Add new crawl config"""

# Overrides scale if set
if config_in.browserWindows is None:
config_in.browserWindows = browser_windows_from_scale(
cast(int, config_in.scale)
)

# ensure crawlChannel is valid
if not self.get_channel_crawler_image(config_in.crawlerChannel):
raise HTTPException(status_code=404, detail="crawler_not_found")
@@ -272,7 +279,7 @@ async def add_crawl_config(
jobType=config_in.jobType,
crawlTimeout=config_in.crawlTimeout,
maxCrawlSize=config_in.maxCrawlSize,
scale=config_in.scale,
browserWindows=config_in.browserWindows,
autoAddCollections=config_in.autoAddCollections,
profileid=profileid,
crawlerChannel=config_in.crawlerChannel,
@@ -408,6 +415,10 @@ async def update_crawl_config(

orig_crawl_config = await self.get_crawl_config(cid, org.id)

if update.scale:
update.browserWindows = browser_windows_from_scale(cast(int, update.scale))
update.scale = None

if update.config and update.config.exclude:
exclude = update.config.exclude
if isinstance(exclude, str):
@@ -441,7 +452,9 @@ async def update_crawl_config(
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)
changed = changed or self.check_attr_changed(orig_crawl_config, update, "scale")
changed = changed or self.check_attr_changed(
orig_crawl_config, update, "browserWindows"
)

schedule_changed = self.check_attr_changed(
orig_crawl_config, update, "schedule"
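The helpers browser_windows_from_scale and scale_from_browser_windows used above live in btrixcloud/utils.py and are not shown in this diff. Based on how they are used here — "scale" counts crawler pods, and each pod runs NUM_BROWSERS browser instances — a minimal sketch of the pair might look as follows; the ceiling division and the env-var default are assumptions inferred from this PR, not the verbatim implementation:

import math
import os

# Assumption: each crawler pod runs this many browser instances
# (main.py below reads the same NUM_BROWSERS env var).
NUM_BROWSERS = int(os.environ.get("NUM_BROWSERS", 1))

def browser_windows_from_scale(scale: int) -> int:
    """Convert a legacy pod-based scale into an equivalent browser-window count."""
    return scale * NUM_BROWSERS

def scale_from_browser_windows(browser_windows: int) -> int:
    """Convert a browser-window count into the number of crawler pods required."""
    return math.ceil(browser_windows / NUM_BROWSERS)

Under this sketch, with two browsers per pod, a workflow stored with scale=3 maps to browserWindows=6, while a request for 5 windows still schedules 3 pods and the operator sizes the last pod for the remainder (commit 252fc6d, "compute resources for remainder browsers").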
18 changes: 13 additions & 5 deletions backend/btrixcloud/crawlmanager.py
@@ -8,7 +8,7 @@

from fastapi import HTTPException

from .utils import dt_now, date_to_str
from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI

from .models import StorageRef, CrawlConfig, BgJobType
@@ -227,13 +227,16 @@ async def create_crawl_job(

await self.has_storage_secret(storage_secret)

scale = scale_from_browser_windows(crawlconfig.browserWindows)

return await self.new_crawl_job(
cid,
userid,
str(crawlconfig.oid),
str(storage),
crawlconfig.crawlerChannel,
crawlconfig.scale,
scale,
crawlconfig.browserWindows,
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
@@ -258,7 +261,8 @@ async def update_running_crawl_config(
# pylint: disable=use-dict-literal
patch = dict(
crawlerChannel=crawlconfig.crawlerChannel,
scale=crawlconfig.scale,
scale=scale_from_browser_windows(crawlconfig.browserWindows),
browserWindows=crawlconfig.browserWindows,
timeout=crawlconfig.crawlTimeout,
maxCrawlSize=crawlconfig.maxCrawlSize,
proxyId=crawlconfig.proxyId or DEFAULT_PROXY_ID,
@@ -373,9 +377,13 @@ async def rollover_restart_crawl(self, crawl_id: str) -> dict:
update = date_to_str(dt_now())
return await self._patch_job(crawl_id, {"restartTime": update})

async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict:
async def scale_crawl(
self, crawl_id: str, scale: int = 1, browser_windows: int = 1
) -> dict:
"""Set the crawl scale (job parallelism) on the specified job"""
return await self._patch_job(crawl_id, {"scale": scale})
return await self._patch_job(
crawl_id, {"scale": scale, "browserWindows": browser_windows}
)

async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict:
"""Request a crawl cancelation or stop by calling an API
44 changes: 35 additions & 9 deletions backend/btrixcloud/crawls.py
@@ -25,6 +25,8 @@
parse_jsonl_log_messages,
stream_dict_list_as_csv,
validate_regexes,
scale_from_browser_windows,
browser_windows_from_scale,
)
from .basecrawls import BaseCrawlOps
from .crawlmanager import CrawlManager
@@ -368,7 +370,8 @@ async def add_new_crawl(
oid=crawlconfig.oid,
cid=crawlconfig.id,
cid_rev=crawlconfig.rev,
scale=crawlconfig.scale,
scale=scale_from_browser_windows(crawlconfig.browserWindows),
browserWindows=crawlconfig.browserWindows,
jobType=crawlconfig.jobType,
config=crawlconfig.config,
profileid=crawlconfig.profileid,
@@ -392,16 +395,27 @@ async def add_new_crawl(
pass

async def update_crawl_scale(
self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User
self,
crawl_id: str,
org: Organization,
scale: int,
browser_windows: int,
user: User,
) -> bool:
"""Update crawl scale in the db"""
crawl = await self.get_crawl(crawl_id, org)
update = UpdateCrawlConfig(scale=crawl_scale.scale)

update = UpdateCrawlConfig(browserWindows=browser_windows)
await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update)

result = await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl", "oid": org.id},
{"$set": {"scale": crawl_scale.scale}},
{
"$set": {
"scale": scale,
"browserWindows": browser_windows,
}
},
return_document=pymongo.ReturnDocument.AFTER,
)

@@ -529,7 +543,7 @@ async def add_or_remove_exclusion(

cid = crawl.cid

scale = crawl.scale or 1
browser_windows = crawl.browserWindows or 2

async with self.get_redis(crawl_id) as redis:
query = {
@@ -538,6 +552,7 @@
}
query_str = json.dumps(query)

scale = scale_from_browser_windows(browser_windows)
for i in range(0, scale):
await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str)

@@ -1524,20 +1539,31 @@ async def update_crawl_api(
response_model=CrawlScaleResponse,
)
async def scale_crawl(
scale: CrawlScale,
crawl_scale: CrawlScale,
crawl_id,
user: User = Depends(user_dep),
org: Organization = Depends(org_crawl_dep),
):
await ops.update_crawl_scale(crawl_id, org, scale, user)
if crawl_scale.browserWindows:
browser_windows = crawl_scale.browserWindows
scale = scale_from_browser_windows(browser_windows)
elif crawl_scale.scale:
scale = crawl_scale.scale
browser_windows = browser_windows_from_scale(scale)
else:
raise HTTPException(
status_code=400, detail="browser_windows_or_scale_required"
)

await ops.update_crawl_scale(crawl_id, org, scale, browser_windows, user)

result = await ops.crawl_manager.scale_crawl(crawl_id, scale.scale)
result = await ops.crawl_manager.scale_crawl(crawl_id, scale, browser_windows)
if not result or not result.get("success"):
raise HTTPException(
status_code=400, detail=result.get("error") or "unknown"
)

return {"scaled": scale.scale}
return {"scaled": True, "browserWindows": browser_windows}

@app.get(
"/orgs/{oid}/crawls/{crawl_id}/access",
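The rewritten scale_crawl endpoint accepts either field and prefers browserWindows when both are sent. A hypothetical client call, assuming the route is POST /api/orgs/{oid}/crawls/{crawl_id}/scale (the route decorator is truncated above) and standard bearer-token auth:

import requests

api_url = "https://btrix.example.com"  # hypothetical deployment URL
oid = "..."        # organization id
crawl_id = "..."   # id of a running crawl
token = "..."      # access token with crawler permissions

resp = requests.post(
    f"{api_url}/api/orgs/{oid}/crawls/{crawl_id}/scale",
    headers={"Authorization": f"Bearer {token}"},
    # New-style payload; sending {"scale": 2} instead still works and is
    # converted via browser_windows_from_scale for backwards compatibility.
    json={"browserWindows": 4},
)
resp.raise_for_status()
print(resp.json())  # expected: {"scaled": True, "browserWindows": 4}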
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -32,7 +32,7 @@
) = PageOps = BackgroundJobOps = object


CURR_DB_VERSION = "0046"
CURR_DB_VERSION = "0047"


# ============================================================================
4 changes: 4 additions & 0 deletions backend/btrixcloud/k8sapi.py
@@ -85,6 +85,7 @@ def new_crawl_job_yaml(
storage: str,
crawler_channel: Optional[str] = "",
scale: Optional[int] = 1,
browser_windows: Optional[int] = 1,
crawl_timeout: Optional[int] = 0,
max_crawl_size: Optional[int] = 0,
manual: bool = True,
@@ -109,6 +110,7 @@
"storage_name": storage,
"crawler_channel": crawler_channel,
"scale": scale,
"browser_windows": browser_windows,
"timeout": crawl_timeout,
"max_crawl_size": max_crawl_size or 0,
"manual": "1" if manual else "0",
@@ -130,6 +132,7 @@ async def new_crawl_job(
storage: str,
crawler_channel: Optional[str] = "",
scale: Optional[int] = 1,
browser_windows: Optional[int] = 1,
crawl_timeout: Optional[int] = 0,
max_crawl_size: Optional[int] = 0,
manual: bool = True,
@@ -148,6 +151,7 @@
storage=storage,
crawler_channel=crawler_channel,
scale=scale,
browser_windows=browser_windows,
crawl_timeout=crawl_timeout,
max_crawl_size=max_crawl_size,
manual=manual,
8 changes: 4 additions & 4 deletions backend/btrixcloud/main.py
@@ -112,8 +112,8 @@ class SettingsResponse(BaseModel):
defaultPageLoadTimeSeconds: int

maxPagesPerCrawl: int
numBrowsers: int
maxScale: int
numBrowsersPerInstance: int
maxBrowserWindows: int

billingEnabled: bool

@@ -149,8 +149,8 @@ def main() -> None:
os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
),
maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)),
maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)),
numBrowsersPerInstance=int(os.environ.get("NUM_BROWSERS", 1)),
maxBrowserWindows=int(os.environ.get("MAX_BROWSER_WINDOWS", 8)),
billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")),
signUpUrl=os.environ.get("SIGN_UP_URL", ""),
salesEmail=os.environ.get("SALES_EMAIL", ""),
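These two renamed settings drive the new browser-windows control in the frontend: per commit c9014db, the picker steps by 1 up to one pod's worth of browsers, then by whole pods up to maxBrowserWindows. A rough sketch of that stepping logic (inferred from the commit message; the actual frontend code is TypeScript and not shown here):

def window_count_options(browsers_per_pod: int, max_windows: int) -> list[int]:
    """Selectable window counts: by 1 within the first pod, then pod-sized steps."""
    options = list(range(1, browsers_per_pod + 1))
    value = browsers_per_pod * 2
    while value <= max_windows:
        options.append(value)
        value += browsers_per_pod
    return options

print(window_count_options(2, 8))  # [1, 2, 4, 6, 8]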
@@ -43,7 +43,7 @@ async def migrate_up(self):
config = CrawlConfig.from_dict(config_dict)
print(
f"Updating Crawl Config {config.id}: schedule: {config.schedule}, "
+ f"timeout: {config.crawlTimeout}, scale: {config.scale}"
+ f"timeout: {config.crawlTimeout}"
)
try:
await crawl_manager.update_scheduled_job(config)
@@ -0,0 +1,62 @@
"""
Migration 0047 - Convert scale to browserWindows
"""

from btrixcloud.migrations import BaseMigration
from btrixcloud.utils import browser_windows_from_scale


MIGRATION_VERSION = "0047"


# pylint: disable=duplicate-code
class Migration(BaseMigration):
"""Migration class."""

# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)

async def migrate_up(self):
"""Perform migration up.

Calculate and store browserWindows from existing scale on workflows and crawls
"""
configs_mdb = self.mdb["crawl_configs"]
crawls_mdb = self.mdb["crawls"]

async for config_raw in configs_mdb.find({"browserWindows": None}):
config_id = config_raw["_id"]
scale = config_raw.get("scale", 1)

try:
await configs_mdb.find_one_and_update(
{"_id": config_id},
{
"$set": {"browserWindows": browser_windows_from_scale(scale)},
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to set browser windows from scale for workflow {config_id}: {err}",
flush=True,
)

async for crawl_raw in crawls_mdb.find({"browserWindows": None}):
crawl_id = crawl_raw["_id"]
scale = crawl_raw.get("scale", 1)

try:
await crawls_mdb.find_one_and_update(
{"_id": crawl_id},
{
"$set": {"browserWindows": browser_windows_from_scale(scale)},
},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to set browser windows from scale for crawl {crawl_id}: {err}",
flush=True,
)
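Because the migration only matches documents where browserWindows is unset, it is safe to re-run, and it deliberately leaves the legacy scale field in place (commit 5df33de, "Don't unset scale in migration") so older readers keep working. A quick sanity check of the conversion it applies, reusing the browser_windows_from_scale sketch from earlier with an assumed two browsers per pod:

# Assumes the sketch above with NUM_BROWSERS=2
for scale in (1, 2, 3):
    print(scale, "->", browser_windows_from_scale(scale))
# 1 -> 2
# 2 -> 4
# 3 -> 6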