Skip to content

Add support for autoclick #2313

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion backend/btrixcloud/operator/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,31 @@ def _load_redis(self, params, status: CrawlStatus, children):

return self.load_from_yaml("redis.yaml", params)

def _filter_autoclick_behavior(
    self, behaviors: Optional[str], crawler_image: str
) -> Optional[str]:
    """Remove autoclick behavior if crawler version doesn't support it

    The minimum supported image is read from the MIN_AUTOCLICK_CRAWLER_IMAGE
    environment variable. If it is unset or empty, or if ``behaviors`` or
    ``crawler_image`` is empty, ``behaviors`` is returned unchanged.

    When both image names share the same repository and carry purely numeric
    dotted tags (e.g. ``repo:1.10.0``), the tags are compared numerically so
    that ``1.10.0`` is correctly treated as newer than ``1.4.0``. In every
    other case (e.g. a ``latest`` tag or different repositories) the full
    image names are compared as plain strings.

    :param behaviors: comma-separated behavior names, or None
    :param crawler_image: image name of the crawler that will run the crawl
    :return: ``behaviors`` with any ``autoclick`` entry removed if the crawler
        image is older than the minimum, otherwise ``behaviors`` as given
    """
    min_autoclick_crawler_image = os.environ.get("MIN_AUTOCLICK_CRAWLER_IMAGE")

    if not (min_autoclick_crawler_image and behaviors and crawler_image):
        return behaviors

    behaviors_list = behaviors.split(",")

    # Match whole entries, not substrings, so the check agrees with the filter below
    if not any(behavior.strip() == "autoclick" for behavior in behaviors_list):
        return behaviors

    def split_image(image: str):
        """Split 'repo:1.4.0' into ('repo', (1, 4, 0)); version is None if not numeric"""
        repo, _, tag = image.rpartition(":")
        parts = tag.split(".")
        if all(part.isascii() and part.isdigit() for part in parts):
            return repo, tuple(int(part) for part in parts)
        return repo, None

    repo, version = split_image(crawler_image)
    min_repo, min_version = split_image(min_autoclick_crawler_image)

    if repo == min_repo and version is not None and min_version is not None:
        # Numeric comparison: a string comparison would sort "1.10.0" before "1.4.0"
        too_old = version < min_version
    else:
        # Tags are not comparable as versions, fall back to string ordering
        too_old = crawler_image < min_autoclick_crawler_image

    if not too_old:
        return behaviors

    print(
        "Crawler version < min_autoclick_crawler_image, removing autoclick behavior",
        flush=True,
    )
    return ",".join(
        behavior for behavior in behaviors_list if behavior.strip() != "autoclick"
    )

async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):
name = f"crawl-config-{crawl.id}"

Expand All @@ -357,7 +382,13 @@ async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params):

crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

params["config"] = json.dumps(crawlconfig.get_raw_config())
raw_config = crawlconfig.get_raw_config()

raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)

params["config"] = json.dumps(raw_config)

return self.load_from_yaml("crawl_configmap.yaml", params)

Expand Down
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ data:

MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"

MIN_AUTOCLICK_CRAWLER_IMAGE: "{{ .Values.min_autoclick_crawler_image }}"

NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"

MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
Expand Down
3 changes: 3 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ crawler_namespace: "crawlers"
# if set, will restrict QA to crawler images whose name is >= this value
# min_qa_crawler_image: ""

# if set, will restrict autoclick behavior to crawler images whose name is >= this value
min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.4.0"

# optional: enable to use a persist volume claim for all crawls
# can be enabled to use a multi-write shared filesystem
# crawler_pv_claim: "nfs-shared-crawls"
Expand Down
33 changes: 29 additions & 4 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
),
false,
)}
${inputCol(
html`<sl-checkbox
name="autoclickBehavior"
?checked=${this.formState.autoclickBehavior}
>
${msg("Autoclick behavior")}
</sl-checkbox>`,
)}
${this.renderHelpTextCol(
msg(
`When enabled the browser will automatically click on links that don't navigate to other pages.`,
),
false,
)}
${inputCol(html`
<sl-input
name="pageExtraDelaySeconds"
Expand Down Expand Up @@ -2151,10 +2165,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
lang: this.formState.lang || "",
blockAds: this.formState.blockAds,
exclude: trimArray(this.formState.exclusions),
behaviors: (this.formState.autoscrollBehavior
? DEFAULT_BEHAVIORS
: DEFAULT_BEHAVIORS.slice(1)
).join(","),
behaviors: this.setBehaviors(),
},
crawlerChannel: this.formState.crawlerChannel || "default",
proxyId: this.formState.proxyId,
Expand All @@ -2163,6 +2174,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
return config;
}

/**
 * Build the comma-separated `behaviors` value for the crawl config from the
 * current form state.
 *
 * Starts from DEFAULT_BEHAVIORS, dropping its first entry when autoscroll is
 * disabled (NOTE(review): this assumes the first entry is "autoscroll" —
 * confirm against the DEFAULT_BEHAVIORS definition), then appends
 * "autoclick" when that option is enabled.
 *
 * The names are collected in an array and joined once, so the result never
 * starts with a stray comma if the base list happens to be empty.
 */
private setBehaviors(): string {
  const behaviors: string[] = [
    ...(this.formState.autoscrollBehavior
      ? DEFAULT_BEHAVIORS
      : DEFAULT_BEHAVIORS.slice(1)),
  ];

  if (this.formState.autoclickBehavior) {
    behaviors.push("autoclick");
  }

  return behaviors.join(",");
}

private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
Expand Down
5 changes: 5 additions & 0 deletions frontend/src/utils/workflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ export type FormState = {
autoAddCollections: string[];
description: WorkflowParams["description"];
autoscrollBehavior: boolean;
autoclickBehavior: boolean;
userAgent: string | null;
crawlerChannel: string;
proxyId: string | null;
Expand Down Expand Up @@ -138,6 +139,7 @@ export const getDefaultFormState = (): FormState => ({
autoAddCollections: [],
description: null,
autoscrollBehavior: true,
autoclickBehavior: false,
userAgent: null,
crawlerChannel: "default",
proxyId: null,
Expand Down Expand Up @@ -286,6 +288,9 @@ export function getInitialFormState(params: {
autoscrollBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes("autoscroll")
: defaultFormState.autoscrollBehavior,
autoclickBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes("autoclick")
: defaultFormState.autoclickBehavior,
userAgent:
params.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
crawlerChannel:
Expand Down
Loading