diff --git a/src/crawler.ts b/src/crawler.ts
index e7276e95..448f7f9b 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -62,7 +62,7 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed } from "./util/seeds.js";
+import { ScopedSeed, parseSeeds } from "./util/seeds.js";
 import {
   WARCWriter,
   createWARCInfo,
@@ -134,7 +134,7 @@ export class Crawler {
 
   maxPageTime: number;
 
-  seeds: ScopedSeed[];
+  seeds: ScopedSeed[] = [];
   numOriginalSeeds = 0;
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -255,9 +255,6 @@ export class Crawler {
     this.saveStateFiles = [];
     this.lastSaveTime = 0;
 
-    this.seeds = this.params.scopedSeeds as ScopedSeed[];
-    this.numOriginalSeeds = this.seeds.length;
-
     // sum of page load + behavior timeouts + 2 x pageop timeouts (for cloudflare, link extraction) + extra page delay
     // if exceeded, will interrupt and move on to next page (likely behaviors or some other operation is stuck)
     this.maxPageTime =
@@ -514,6 +511,9 @@ export class Crawler {
 
     this.proxyServer = await initProxy(this.params, RUN_DETACHED);
 
+    this.seeds = await parseSeeds(this.params);
+    this.numOriginalSeeds = this.seeds.length;
+
     logger.info("Seeds", this.seeds);
 
     logger.info("Link Selectors", this.params.selectLinks);
diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts
index 75abfc4e..819bcf39 100644
--- a/src/replaycrawler.ts
+++ b/src/replaycrawler.ts
@@ -96,8 +96,6 @@ export class ReplayCrawler extends Crawler {
     // skip text from first two frames, as they are RWP boilerplate
     this.skipTextDocs = SKIP_FRAMES;
 
-    this.params.scopedSeeds = [];
-
     this.params.screenshot = ["view"];
     this.params.text = ["to-warc"];
 
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index 73da878b..b17199f0 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -20,7 +20,6 @@ import {
   BxFunctionBindings,
   DEFAULT_CRAWL_ID_TEMPLATE,
 } from "./constants.js";
-import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
 import {
@@ -37,8 +36,6 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
   logExcludeContext: LogContext[];
   text: string[];
 
-  scopedSeeds: ScopedSeed[];
-
   customBehaviors: string[];
 
   selectLinks: ExtractSelector[];
@@ -770,22 +767,6 @@ class ArgParser {
       }
     }
 
-    if (argv.seedFile) {
-      const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
-      const urlSeedFileList = urlSeedFile.split("\n");
-
-      if (typeof argv.seeds === "string") {
-        argv.seeds = [argv.seeds];
-      }
-
-      for (const seed of urlSeedFileList) {
-        if (seed) {
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-          (argv.seeds as any).push(seed);
-        }
-      }
-    }
-
     let selectLinks: ExtractSelector[];
 
     if (argv.selectLinks) {
@@ -817,50 +798,10 @@ class ArgParser {
      //logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
     }
 
-    const scopedSeeds: ScopedSeed[] = [];
-
-    if (!isQA) {
-      const scopeOpts = {
-        scopeType: argv.scopeType,
-        sitemap: argv.sitemap,
-        include: argv.include,
-        exclude: argv.exclude,
-        depth: argv.depth,
-        extraHops: argv.extraHops,
-      };
-
-      for (const seed of argv.seeds) {
-        const newSeed = typeof seed === "string" ? { url: seed } : seed;
-
-        try {
-          scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
-          // eslint-disable-next-line @typescript-eslint/no-explicit-any
-        } catch (e: any) {
-          logger.error("Failed to create seed", {
-            error: e.toString(),
-            ...scopeOpts,
-            ...newSeed,
-          });
-          if (argv.failOnFailedSeed) {
-            logger.fatal(
-              "Invalid seed specified, aborting crawl",
-              { url: newSeed.url },
-              "general",
-              1,
-            );
-          }
-        }
-      }
-
-      if (!scopedSeeds.length) {
-        logger.fatal("No valid seeds specified, aborting crawl");
-      }
-    } else if (!argv.qaSource) {
+    if (isQA && !argv.qaSource) {
       logger.fatal("--qaSource required for QA mode");
     }
 
-    argv.scopedSeeds = scopedSeeds;
-
     // Resolve statsFilename
     if (argv.statsFilename) {
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts
index fa8ad0bc..42dbe40b 100644
--- a/src/util/file_reader.ts
+++ b/src/util/file_reader.ts
@@ -24,6 +24,39 @@ export type FileSource = {
 
 export type FileSources = FileSource[];
 
+async function getTempFile(
+  filename: string,
+  dirPrefix: string,
+): Promise<string> {
+  // create a unique temp dir, e.g. <os.tmpdir()>/seeds-1a2b3c4d
+  const tmpDir = path.join(
+    os.tmpdir(),
+    `${dirPrefix}-${crypto.randomBytes(4).toString("hex")}`,
+  );
+  await fsp.mkdir(tmpDir, { recursive: true });
+  return path.join(tmpDir, filename);
+}
+
+async function writeUrlContentsToFile(url: string, filepath: string) {
+  const res = await fetch(url, { dispatcher: getProxyDispatcher() });
+  const fileContents = await res.text();
+  await fsp.writeFile(filepath, fileContents);
+}
+
+export async function collectOnlineSeedFile(url: string): Promise<string> {
+  const filename = path.basename(new URL(url).pathname);
+  const filepath = await getTempFile(filename, "seeds");
+
+  try {
+    await writeUrlContentsToFile(url, filepath);
+    logger.info("Seed file downloaded", { url, path: filepath });
+  } catch (e) {
+    logger.fatal("Error downloading seed file from URL", { url, error: e });
+  }
+
+  return filepath;
+}
+
 export async function collectCustomBehaviors(
   sources: string[],
 ): Promise<FileSources> {
@@ -88,17 +121,10 @@ async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
 
 async function collectOnlineBehavior(url: string): Promise<FileSources> {
   const filename = path.basename(new URL(url).pathname);
-  const tmpDir = path.join(
-    os.tmpdir(),
-    `behaviors-${crypto.randomBytes(4).toString("hex")}`,
-  );
-  await fsp.mkdir(tmpDir, { recursive: true });
-  const behaviorFilepath = path.join(tmpDir, filename);
+  const behaviorFilepath = await getTempFile(filename, "behaviors");
 
   try {
-    const res = await fetch(url, { dispatcher: getProxyDispatcher() });
-    const fileContents = await res.text();
-    await fsp.writeFile(behaviorFilepath, fileContents);
+    await writeUrlContentsToFile(url, behaviorFilepath);
     logger.info(
       "Custom behavior file downloaded",
       { url, path: behaviorFilepath },
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index d0e24445..40396ccf 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -1,5 +1,8 @@
-import { logger } from "./logger.js";
+import fs from "fs";
+
 import { MAX_DEPTH } from "./constants.js";
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
 
 type ScopeType =
   | "prefix"
@@ -300,6 +303,71 @@ export class ScopedSeed {
   }
 }
 
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export async function parseSeeds(params: any): Promise<ScopedSeed[]> {
+  let seeds = params.seeds;
+  const scopedSeeds: ScopedSeed[] = [];
+
+  if (params.seedFile) {
+    let seedFilePath = params.seedFile;
+    if (params.seedFile.startsWith("http")) {
+      
seedFilePath = await collectOnlineSeedFile(params.seedFile as string); + } + + const urlSeedFile = fs.readFileSync(seedFilePath, "utf8"); + const urlSeedFileList = urlSeedFile.split("\n"); + + if (typeof seeds === "string") { + seeds = [seeds]; + } + + for (const seed of urlSeedFileList) { + if (seed) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (seeds as any).push(seed); + } + } + } + + const scopeOpts = { + scopeType: params.scopeType, + sitemap: params.sitemap, + include: params.include, + exclude: params.exclude, + depth: params.depth, + extraHops: params.extraHops, + }; + + for (const seed of seeds) { + const newSeed = typeof seed === "string" ? { url: seed } : seed; + + try { + scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed })); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (e: any) { + logger.error("Failed to create seed", { + error: e.toString(), + ...scopeOpts, + ...newSeed, + }); + if (params.failOnFailedSeed) { + logger.fatal( + "Invalid seed specified, aborting crawl", + { url: newSeed.url }, + "general", + 1, + ); + } + } + } + + if (!params.qaSource && !scopedSeeds.length) { + logger.fatal("No valid seeds specified, aborting crawl"); + } + + return scopedSeeds; +} + export function rxEscape(string: string) { return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&"); } diff --git a/src/util/worker.ts b/src/util/worker.ts index f4dc8ddf..dee9ceba 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -351,7 +351,7 @@ export class PageWorker { let loggedWaiting = false; while (await this.crawler.isCrawlRunning()) { - await crawlState.processMessage(this.crawler.params.scopedSeeds); + await crawlState.processMessage(this.crawler.seeds); const data = await crawlState.nextFromQueue(); diff --git a/tests/scopes.test.js b/tests/scopes.test.js index ddb64e04..9717fb11 100644 --- a/tests/scopes.test.js +++ b/tests/scopes.test.js @@ -1,8 +1,9 @@ import { parseArgs } from "../dist/util/argParser.js"; +import { parseSeeds } from "../dist/util/seeds.js"; import fs from "fs"; -function getSeeds(config) { +async function getSeeds(config) { const orig = fs.readFileSync; fs.readFileSync = (name, ...args) => { @@ -12,12 +13,12 @@ function getSeeds(config) { return orig(name, ...args); }; - const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]); - return res.scopedSeeds; + const params = parseArgs(["node", "crawler", "--config", "stdinconfig"]); + return await parseSeeds(params); } test("default scope", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -30,7 +31,7 @@ seeds: }); test("default scope + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -45,7 +46,7 @@ exclude: https://example.com/pathexclude }); test("default scope + exclude is numeric", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -60,7 +61,7 @@ exclude: "2022" }); test("prefix scope global + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - https://example.com/ @@ -76,7 +77,7 @@ exclude: https://example.com/pathexclude }); test("prefix scope per seed + exclude", async () => { - const seeds = getSeeds(` + const seeds = await getSeeds(` seeds: - url: https://example.com/ scopeType: prefix @@ -92,7 +93,7 @@ exclude: https://example.com/pathexclude }); test("host scope and domain scope", async () => { - const seeds = getSeeds(` + 
const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
 
@@ -127,7 +128,7 @@
 });
 
 test("domain scope drop www.", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://www.example.com/
     scopeType: domain
@@ -139,7 +140,7 @@
 });
 
 test("custom scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     include: https?://example.com/(path|other)
@@ -153,7 +154,7 @@
 });
 
 test("inherit scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -177,7 +178,7 @@ exclude: https://example.com/pathexclude
 });
 
 test("override scope", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -220,7 +221,7 @@ include: https://example.com/onlythispath
 });
 
 test("override scope with exclude", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/1
 
@@ -275,7 +276,7 @@
 });
 
 test("with exclude non-string types", async () => {
-  const seeds = getSeeds(`
+  const seeds = await getSeeds(`
 seeds:
   - url: https://example.com/
     exclude: "2023"
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 9901ff36..c76afa6e 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -38,3 +38,39 @@
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+test("check that URLs in seed-list hosted at URL are crawled", async () => {
+  try {
+    await exec(
+      'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  let crawled_pages = fs.readFileSync(
+    "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
+    "utf8",
+  );
+  let seed_file = fs
+    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
+    .split("\n")
+    .sort();
+
+  let seed_file_list = [];
+  for (var j = 0; j < seed_file.length; j++) {
+    // split("\n") yields empty strings, never undefined, so filter on truthiness
+    if (seed_file[j]) {
+      seed_file_list.push(seed_file[j]);
+    }
+  }
+
+  let foundSeedUrl = true;
+
+  for (var i = 0; i < seed_file_list.length; i++) {
+    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
+      foundSeedUrl = false;
+    }
+  }
+  expect(foundSeedUrl).toBe(true);
+});