// Patch summary (unified diff touching five TypeScript files). Only this
// header was added; every hunk below is unchanged.
//
// src/crawler.ts
//   - Removes the HEAD-request pre-check: isHTML(), resolveAgent(), the
//     HTTP/HTTPS Agent imports and the HTTPS_AGENT / HTTP_AGENT constants.
//   - crawlPage() now destructures `cdp` from opts and calls
//     directFetchCapture({ url, headers: this.headers, cdp }) whenever a
//     capture callback is set (previously only when the HEAD check said the
//     URL was not HTML).
//   - When the direct fetch returns a mime, data.mime is set and
//     data.isHTMLPage is derived from it via isHTMLContentType(); data.ts
//     uses the returned ts and falls back to new Date().
//   - isHTMLContentType is imported from ./util/reqresp.js instead of being
//     a Crawler method.
//   - The awaitPageLoad evaluate string is prefixed with
//     `self.__bx_behaviors &&`, so the call is skipped when that object is
//     falsy.
//
// src/util/recorder.ts
//   - Adds DirectFetchRequest { url, headers, cdp } and
//     NetworkLoadAsyncFetchOptions; ResponseStreamAsyncFetchOptions now
//     extends the latter and adds requestId.
//   - directFetchCapture() takes a DirectFetchRequest, builds a Cookie header
//     from CDP Network.getCookies (getCookieString), records request headers
//     and a timestamp on reqresp, and returns { fetched, mime, ts }.
//   - The fetch filter now rejects non-200 responses and responses whose
//     content type is HTML; with no content-type header `mime` stays "" and
//     isHTMLContentType("") returns true, so such responses are rejected too.
//     The previous filter rejected responses carrying set-cookie instead.
//   - AsyncFetcher gains manualRedirect (fetch `redirect: "manual"` when
//     true, "follow" otherwise); directFetchCapture passes
//     manualRedirect: true.
//   - recorder.addPageRecord(reqresp) moves out of the `networkId !== "0"`
//     guard in the finally block, and fillFetchResponse(resp) is called
//     before the empty payload is set when expectedSize === 0.
//
// src/util/reqresp.ts
//   - Adds isRedirectStatus() (3xx except 304) and uses it in
//     isSelfRedirect(); adds the exported isHTMLContentType() helper.
//
// src/util/state.ts
//   - PageState.isHTMLPage changes from an optional boolean to a field
//     initialised to true.
//
// src/util/worker.ts
//   - WorkerOpts.directFetchCapture is no longer optional (still nullable)
//     and takes a DirectFetchRequest; PageWorker forwards the request object
//     to recorder.directFetchCapture().
//
// NOTE(review): directFetchCapture() assigns headers["Cookie"] on the object
//   it receives, and crawlPage() passes this.headers itself, so the write
//   lands on the crawler's shared headers object. Consider copying first
//   (e.g. `{ ...headers }`) so one page's cookies cannot carry over to later
//   requests -- TODO confirm whether this.headers is reused elsewhere.
// NOTE(review): getCookieString() joins pairs with ";" -- the Cookie header
//   grammar separates pairs with "; " (semicolon + space); verify servers in
//   scope accept the compact form.
// NOTE(review): on the direct-fetch path addPageRecord(reqresp) looks like it
//   is reached twice: once in the fetcher's finally block (now unconditional)
//   and once in directFetchCapture() after fetcher.load(). Confirm
//   addPageRecord tolerates a repeated reqresp.
// NOTE(review): isHTMLContentType() compares contentType.split(";")[0] as-is;
//   presumably HTML_TYPES holds lower-case mimes, so a value such as
//   "Text/HTML" or one with a space before ";" would not match -- TODO confirm.
// NOTE(review): `Promise`, `Record` and `Map` appear without type arguments
//   in this text; presumably the angle-bracket generics were lost when the
//   patch was extracted -- check against the original before applying.
// NOTE(review): line breaks and indentation inside the hunks appear to have
//   been collapsed; the patch likely needs to be regenerated from the
//   original commit to apply cleanly.
diff --git a/src/crawler.ts b/src/crawler.ts index 9399227f..fe085bd5 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js"; import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, - HTML_TYPES, DEFAULT_SELECTORS, } from "./util/constants.js"; import { AdBlockRules, BlockRules } from "./util/blockrules.js"; import { OriginOverride } from "./util/originoverride.js"; -// to ignore HTTPS error for HEAD check -import { Agent as HTTPAgent } from "http"; -import { Agent as HTTPSAgent } from "https"; import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; - -const HTTPS_AGENT = new HTTPSAgent({ - rejectUnauthorized: false, -}); - -const HTTP_AGENT = new HTTPAgent(); +import { isHTMLContentType } from "./util/reqresp.js"; const behaviors = fs.readFileSync( new URL( @@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior(); async crawlPage(opts: WorkerState): Promise { await this.writeStats(); - const { page, data, workerid, callbacks, directFetchCapture } = opts; + const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts; data.callbacks = callbacks; const { url } = data; @@ -790,35 +781,27 @@ self.__bx_behaviors.selectMainBehavior(); data.logDetails = logDetails; data.workerid = workerid; - data.isHTMLPage = await timedRun( - this.isHTML(url, logDetails), - FETCH_TIMEOUT_SECS, - "HEAD request to determine if URL is HTML page timed out", - logDetails, - "fetch", - true, - ); - - if (!data.isHTMLPage && directFetchCapture) { + if (directFetchCapture) { try { - const { fetched, mime } = await timedRun( - directFetchCapture(url), + const { fetched, mime, ts } = await timedRun( + directFetchCapture({ url, headers: this.headers, cdp }), FETCH_TIMEOUT_SECS, "Direct 
fetch capture attempt timed out", logDetails, "fetch", true, ); + if (mime) { + data.mime = mime; + data.isHTMLPage = isHTMLContentType(mime); + } if (fetched) { data.loadState = LoadState.FULL_PAGE_LOADED; - if (mime) { - data.mime = mime; - } data.status = 200; - data.ts = new Date(); + data.ts = ts || new Date(); logger.info( "Direct fetch successful", - { url, ...logDetails }, + { url, mime, ...logDetails }, "fetch", ); return; @@ -1752,7 +1735,7 @@ self.__bx_behaviors.selectMainBehavior(); const contentType = resp.headers()["content-type"]; - isHTMLPage = this.isHTMLContentType(contentType); + isHTMLPage = isHTMLContentType(contentType); if (contentType) { data.mime = contentType.split(";")[0]; @@ -1878,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior(); "behavior", ); try { - await frame.evaluate("self.__bx_behaviors.awaitPageLoad();"); + await frame.evaluate( + "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();", + ); } catch (e) { logger.warn("Waiting for custom page load failed", e, "behavior"); } @@ -2191,49 +2176,6 @@ self.__bx_behaviors.selectMainBehavior(); } } - resolveAgent(urlParsed: URL) { - return urlParsed.protocol === "https:" ? 
HTTPS_AGENT : HTTP_AGENT; - } - - async isHTML(url: string, logDetails: LogDetails) { - try { - const resp = await fetch(url, { - method: "HEAD", - headers: this.headers, - agent: this.resolveAgent, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any); - if (resp.status !== 200) { - logger.debug("HEAD response code != 200, loading in browser", { - status: resp.status, - ...logDetails, - }); - return true; - } - - return this.isHTMLContentType(resp.headers.get("Content-Type")); - } catch (e) { - // can't confirm not html, so try in browser - logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails }); - return true; - } - } - - isHTMLContentType(contentType: string | null) { - // just load if no content-type - if (!contentType) { - return true; - } - - const mime = contentType.split(";")[0]; - - if (HTML_TYPES.includes(mime)) { - return true; - } - - return false; - } - async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) { if (!sitemap) { return; diff --git a/src/util/recorder.ts b/src/util/recorder.ts index d4a92bc8..182571ea 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -6,7 +6,7 @@ import PQueue from "p-queue"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; -import { RequestResponseInfo } from "./reqresp.js"; +import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; @@ -75,11 +75,23 @@ export type AsyncFetchOptions = { filter?: (resp: Response) => boolean; ignoreDupe?: boolean; maxFetchSize?: number; + manualRedirect?: boolean; }; // ================================================================= -export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & { +export type DirectFetchRequest = { + url: string; + headers: Record; + cdp: CDPSession; +}; + +// 
================================================================= +export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & { cdp: CDPSession; +}; + +// ================================================================= +export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & { requestId: string; }; @@ -1062,12 +1074,23 @@ export class Recorder { this.writer.writeRecordPair(responseRecord, requestRecord); } - async directFetchCapture( - url: string, - ): Promise<{ fetched: boolean; mime: string }> { + async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{ + fetched: boolean; + mime: string; + ts: Date; + }> { const reqresp = new RequestResponseInfo("0"); + const ts = new Date(); + + const cookie = await this.getCookieString(cdp, url); + if (cookie) { + headers["Cookie"] = cookie; + } + reqresp.url = url; reqresp.method = "GET"; + reqresp.requestHeaders = headers; + reqresp.ts = ts; logger.debug( "Directly fetching page URL without browser", @@ -1075,8 +1098,21 @@ export class Recorder { "recorder", ); - const filter = (resp: Response) => - resp.status === 200 && !resp.headers.get("set-cookie"); + let mime: string = ""; + + const filter = (resp: Response) => { + // only direct load 200 responses + if (resp.status !== 200) { + return false; + } + + const ct = resp.headers.get("content-type"); + if (ct) { + mime = ct.split(";")[0]; + } + + return !isHTMLContentType(mime); + }; // ignore dupes: if previous URL was not a page, still load as page. 
if previous was page, // should not get here, as dupe pages tracked via seen list @@ -1087,16 +1123,28 @@ export class Recorder { networkId: "0", filter, ignoreDupe: true, + manualRedirect: true, }); const res = await fetcher.load(); - const mime = - (reqresp.responseHeaders && - reqresp.responseHeaders["content-type"] && - reqresp.responseHeaders["content-type"].split(";")[0]) || - ""; + this.addPageRecord(reqresp); + + if (url === this.pageUrl && !this.pageInfo.ts) { + logger.debug("Setting page timestamp", { ts, url }); + this.pageInfo.ts = ts; + } + + return { fetched: res === "fetched", mime, ts }; + } + + async getCookieString(cdp: CDPSession, url: string) { + const cookieList: string[] = []; + const { cookies } = await cdp.send("Network.getCookies", { urls: [url] }); + for (const { name, value } of cookies) { + cookieList.push(`${name}=${value}`); + } - return { fetched: res === "fetched", mime }; + return cookieList.join(";"); } } @@ -1115,6 +1163,8 @@ class AsyncFetcher { tempdir: string; filename: string; + manualRedirect = false; + constructor({ tempdir, reqresp, @@ -1124,6 +1174,7 @@ class AsyncFetcher { filter = undefined, ignoreDupe = false, maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE, + manualRedirect = false, }: AsyncFetchOptions) { this.reqresp = reqresp; this.reqresp.expectedSize = expectedSize; @@ -1142,6 +1193,8 @@ class AsyncFetcher { ); this.maxFetchSize = maxFetchSize; + + this.manualRedirect = manualRedirect; } async load() { @@ -1277,9 +1330,9 @@ class AsyncFetcher { reqresp.status = 0; reqresp.errorText = e.message; } finally { + recorder.addPageRecord(reqresp); // exclude direct fetch request with fake id if (networkId !== "0") { - recorder.addPageRecord(reqresp); recorder.removeReqResp(networkId); } } @@ -1307,6 +1360,7 @@ class AsyncFetcher { headers, body: reqresp.postData || undefined, signal, + redirect: this.manualRedirect ? 
"manual" : "follow", }); if (this.filter && !this.filter(resp) && abort) { @@ -1323,6 +1377,7 @@ class AsyncFetcher { } if (reqresp.expectedSize === 0) { + reqresp.fillFetchResponse(resp); reqresp.payload = new Uint8Array(); return; } else if (!resp.body) { @@ -1422,7 +1477,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher { class NetworkLoadStreamAsyncFetcher extends AsyncFetcher { cdp: CDPSession; - constructor(opts: ResponseStreamAsyncFetchOptions) { + constructor(opts: NetworkLoadAsyncFetchOptions) { super(opts); this.cdp = opts.cdp; } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index ac58a029..1fc9fbe5 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; +import { HTML_TYPES } from "./constants.js"; const CONTENT_LENGTH = "content-length"; const CONTENT_TYPE = "content-type"; @@ -148,10 +149,15 @@ export class RequestResponseInfo { } } + isRedirectStatus() { + return this.status >= 300 && this.status < 400 && this.status !== 304; + } + isSelfRedirect() { - if (this.status < 300 || this.status >= 400 || this.status === 304) { + if (!this.isRedirectStatus()) { return false; } + try { const headers = new Headers(this.getResponseHeadersDict()); const location = headers.get("location") || ""; @@ -362,3 +368,18 @@ export class RequestResponseInfo { return value.replace(/\n/g, ", "); } } + +export function isHTMLContentType(contentType: string | null) { + // just load if no content-type + if (!contentType) { + return true; + } + + const mime = contentType.split(";")[0]; + + if (HTML_TYPES.includes(mime)) { + return true; + } + + return false; +} diff --git a/src/util/state.ts b/src/util/state.ts index d455d877..3354f169 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -66,7 +66,7 @@ export class PageState { callbacks: PageCallbacks = {}; - isHTMLPage?: boolean; + 
isHTMLPage = true; text?: string; screenshotView?: Buffer; favicon?: string; diff --git a/src/util/worker.ts b/src/util/worker.ts index 4d1ed64d..b54bc2a7 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -2,7 +2,7 @@ import os from "os"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun } from "./timing.js"; -import { Recorder } from "./recorder.js"; +import { DirectFetchRequest, Recorder } from "./recorder.js"; import { rxEscape } from "./seeds.js"; import { CDPSession, Page } from "puppeteer-core"; import { PageState, WorkerId } from "./state.js"; @@ -20,8 +20,10 @@ export type WorkerOpts = { workerid: WorkerId; // eslint-disable-next-line @typescript-eslint/ban-types callbacks: Record; - directFetchCapture?: - | ((url: string) => Promise<{ fetched: boolean; mime: string }>) + directFetchCapture: + | (( + request: DirectFetchRequest, + ) => Promise<{ fetched: boolean; mime: string; ts: Date }>) | null; frameIdToExecId: Map; }; @@ -171,7 +173,7 @@ export class PageWorker { this.cdp = cdp; this.callbacks = {}; const directFetchCapture = this.recorder - ? (x: string) => this.recorder!.directFetchCapture(x) + ? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req) : null; this.opts = { page,