From 23b55eb9946ef45f0ce16cb5ba9383a99da10d86 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 23 May 2024 10:51:31 -0700
Subject: [PATCH 1/5] direct fetch optimization:

- drop initial HEAD check (and obsolete 'agent' params to fetch, no longer used)
- load cookies for each page for direct fetch
- attempt direct fetch GET request on every page with cookies + correct user-agent
- abort direct fetch if response is HTML, and then load in browser, otherwise proceed with direct fetch
- ensure direct fetch timestamp is set correctly, populated in pageinfo
---
 src/crawler.ts       | 92 +++++++++++---------------------------------
 src/util/recorder.ts | 34 +++++++++++-----
 src/util/reqresp.ts  | 16 ++++++++
 src/util/state.ts    |  2 +-
 src/util/worker.ts   | 10 +++--
 5 files changed, 70 insertions(+), 84 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 9399227fc..3fe38f6a8 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js";
 import {
   ADD_LINK_FUNC,
   BEHAVIOR_LOG_FUNC,
-  HTML_TYPES,
   DEFAULT_SELECTORS,
 } from "./util/constants.js";
 import { AdBlockRules, BlockRules } from "./util/blockrules.js";
 import { OriginOverride } from "./util/originoverride.js";
 
-// to ignore HTTPS error for HEAD check
-import { Agent as HTTPAgent } from "http";
-import { Agent as HTTPSAgent } from "https";
 import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
 
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
 import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
-
-const HTTPS_AGENT = new HTTPSAgent({
-  rejectUnauthorized: false,
-});
-
-const HTTP_AGENT = new HTTPAgent();
+import { isHTMLContentType } from "./util/reqresp.js";
 
 const behaviors = fs.readFileSync(
   new URL(
@@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior();
   async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();
 
-    const { page, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
     data.callbacks = callbacks;
 
     const { url } = data;
@@ -790,32 +781,26 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    data.isHTMLPage = await timedRun(
-      this.isHTML(url, logDetails),
-      FETCH_TIMEOUT_SECS,
-      "HEAD request to determine if URL is HTML page timed out",
-      logDetails,
-      "fetch",
-      true,
-    );
+    if (directFetchCapture) {
+      const cookie = await this.getCookieString(cdp, url);
 
-    if (!data.isHTMLPage && directFetchCapture) {
       try {
-        const { fetched, mime } = await timedRun(
-          directFetchCapture(url),
+        const { fetched, mime, ts } = await timedRun(
+          directFetchCapture(url, { Cookie: cookie, ...this.headers }),
           FETCH_TIMEOUT_SECS,
           "Direct fetch capture attempt timed out",
           logDetails,
           "fetch",
          true,
        );
+        if (mime) {
+          data.mime = mime;
+          data.isHTMLPage = isHTMLContentType(mime);
+        }
        if (fetched) {
          data.loadState = LoadState.FULL_PAGE_LOADED;
-          if (mime) {
-            data.mime = mime;
-          }
          data.status = 200;
-          data.ts = new Date();
+          data.ts = ts || new Date();
          logger.info(
            "Direct fetch successful",
            { url, ...logDetails },
@@ -842,6 +827,16 @@ self.__bx_behaviors.selectMainBehavior();
     await this.doPostLoadActions(opts);
   }
 
+  async getCookieString(cdp: CDPSession, url: string) {
+    const cookieList: string[] = [];
+    const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
+    for (const { name, value } of cookies) {
+      cookieList.push(`${name}=${value}`);
+    }
+
+    return cookieList.join(";");
+  }
+
   async doPostLoadActions(opts: WorkerState, saveOutput = false) {
     const { page, cdp, data, workerid } = opts;
     const { url } = data;
@@ -1752,7 +1747,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     const contentType = resp.headers()["content-type"];
 
-    isHTMLPage = this.isHTMLContentType(contentType);
+    isHTMLPage = isHTMLContentType(contentType);
 
     if (contentType) {
       data.mime = contentType.split(";")[0];
@@ -2191,49 +2186,6 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  resolveAgent(urlParsed: URL) {
-    return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
-  }
-
-  async isHTML(url: string, logDetails: LogDetails) {
-    try {
-      const resp = await fetch(url, {
-        method: "HEAD",
-        headers: this.headers,
-        agent: this.resolveAgent,
-        // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      } as any);
-      if (resp.status !== 200) {
-        logger.debug("HEAD response code != 200, loading in browser", {
-          status: resp.status,
-          ...logDetails,
-        });
-        return true;
-      }
-
-      return this.isHTMLContentType(resp.headers.get("Content-Type"));
-    } catch (e) {
-      // can't confirm not html, so try in browser
-      logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
-      return true;
-    }
-  }
-
-  isHTMLContentType(contentType: string | null) {
-    // just load if no content-type
-    if (!contentType) {
-      return true;
-    }
-
-    const mime = contentType.split(";")[0];
-
-    if (HTML_TYPES.includes(mime)) {
-      return true;
-    }
-
-    return false;
-  }
-
   async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
     if (!sitemap) {
       return;
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index d4a92bc8e..7cba42f3a 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -6,7 +6,7 @@ import PQueue from "p-queue";
 
 import { logger, formatErr } from "./logger.js";
 import { sleep, timedRun, timestampNow } from "./timing.js";
-import { RequestResponseInfo } from "./reqresp.js";
+import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
 
 // @ts-expect-error TODO fill in why error is expected
 import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
@@ -1064,10 +1064,15 @@ export class Recorder {
 
   async directFetchCapture(
     url: string,
-  ): Promise<{ fetched: boolean; mime: string }> {
+    headers: Record<string, string>,
+  ): Promise<{ fetched: boolean; mime: string; ts: Date }> {
     const reqresp = new RequestResponseInfo("0");
+    const ts = new Date();
+
     reqresp.url = url;
     reqresp.method = "GET";
+    reqresp.requestHeaders = headers;
+    reqresp.ts = ts;
 
     logger.debug(
       "Directly fetching page URL without browser",
@@ -1075,8 +1080,16 @@
       "recorder",
     );
 
-    const filter = (resp: Response) =>
-      resp.status === 200 && !resp.headers.get("set-cookie");
+    let mime: string = "";
+
+    const filter = (resp: Response) => {
+      const ct = resp.headers.get("content-type");
+      if (ct) {
+        mime = ct.split(";")[0];
+      }
+
+      return !isHTMLContentType(mime);
+    };
 
     // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
     // should not get here, as dupe pages tracked via seen list
@@ -1090,13 +1103,14 @@
     });
 
     const res = await fetcher.load();
 
-    const mime =
-      (reqresp.responseHeaders &&
-        reqresp.responseHeaders["content-type"] &&
-        reqresp.responseHeaders["content-type"].split(";")[0]) ||
-      "";
+    this.addPageRecord(reqresp);
+
+    if (url === this.pageUrl && !this.pageInfo.ts) {
+      logger.debug("Setting page timestamp", { ts, url });
+      this.pageInfo.ts = ts;
+    }
 
-    return { fetched: res === "fetched", mime };
+    return { fetched: res === "fetched", mime, ts };
   }
 }
diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts
index ac58a0293..bbee4b433 100644
--- a/src/util/reqresp.ts
+++ b/src/util/reqresp.ts
@@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
 import { Protocol } from "puppeteer-core";
 
 import { postToGetUrl } from "warcio";
+import { HTML_TYPES } from "./constants.js";
 
 const CONTENT_LENGTH = "content-length";
 const CONTENT_TYPE = "content-type";
@@ -362,3 +363,18 @@ export class RequestResponseInfo {
     return value.replace(/\n/g, ", ");
   }
 }
+
+export function isHTMLContentType(contentType: string | null) {
+  // just load if no content-type
+  if (!contentType) {
+    return true;
+  }
+
+  const mime = contentType.split(";")[0];
+
+  if (HTML_TYPES.includes(mime)) {
+    return true;
+  }
+
+  return false;
+}
diff --git a/src/util/state.ts b/src/util/state.ts
index d455d877a..3354f1695 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -66,7 +66,7 @@ export class PageState {
 
   callbacks: PageCallbacks = {};
 
-  isHTMLPage?: boolean;
+  isHTMLPage = true;
   text?: string;
   screenshotView?: Buffer;
   favicon?: string;
diff --git a/src/util/worker.ts b/src/util/worker.ts
index 4d1ed64dc..00cf3c7c1 100644
--- a/src/util/worker.ts
+++ b/src/util/worker.ts
@@ -20,8 +20,11 @@ export type WorkerOpts = {
   workerid: WorkerId;
   // eslint-disable-next-line @typescript-eslint/ban-types
   callbacks: Record<string, Function>;
-  directFetchCapture?:
-    | ((url: string) => Promise<{ fetched: boolean; mime: string }>)
+  directFetchCapture:
+    | ((
+        url: string,
+        headers: Record<string, string>,
+      ) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
     | null;
   frameIdToExecId: Map<string, number>;
 };
@@ -171,7 +174,8 @@ export class PageWorker {
       this.cdp = cdp;
       this.callbacks = {};
       const directFetchCapture = this.recorder
-        ? (x: string) => this.recorder!.directFetchCapture(x)
+        ? (x: string, h: Record<string, string>) =>
+            this.recorder!.directFetchCapture(x, h)
         : null;
       this.opts = {
         page,

From 129005031c7301290ab6180d783bdab097c88781 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 23 May 2024 15:32:50 -0700
Subject: [PATCH 2/5] additional cleanup:

- handle direct fetch redirects via additional fetching
- use manual redirect mode for AsyncFetcher
- fallback to browser for error responses, just in case
---
 src/crawler.ts       | 18 ++++--------------
 src/util/recorder.ts | 53 ++++++++++++++++++++++++++++++++++++++------
 src/util/reqresp.ts  |  7 +++++-
 src/util/worker.ts   |  8 +++-----
 4 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 3fe38f6a8..1d275893f 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -782,11 +782,9 @@ self.__bx_behaviors.selectMainBehavior();
     data.workerid = workerid;
 
     if (directFetchCapture) {
-      const cookie = await this.getCookieString(cdp, url);
-
       try {
         const { fetched, mime, ts } = await timedRun(
-          directFetchCapture(url, { Cookie: cookie, ...this.headers }),
+          directFetchCapture({ url, headers: this.headers, cdp }),
           FETCH_TIMEOUT_SECS,
           "Direct fetch capture attempt timed out",
           logDetails,
           "fetch",
@@ -827,16 +825,6 @@ self.__bx_behaviors.selectMainBehavior();
     await this.doPostLoadActions(opts);
   }
 
-  async getCookieString(cdp: CDPSession, url: string) {
-    const cookieList: string[] = [];
-    const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
-    for (const { name, value } of cookies) {
-      cookieList.push(`${name}=${value}`);
-    }
-
-    return cookieList.join(";");
-  }
-
   async doPostLoadActions(opts: WorkerState, saveOutput = false) {
     const { page, cdp, data, workerid } = opts;
     const { url } = data;
@@ -1873,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior();
       "behavior",
     );
     try {
-      await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
+      await frame.evaluate(
+        "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
+      );
     } catch (e) {
       logger.warn("Waiting for custom page load failed", e, "behavior");
     }
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index 7cba42f3a..0dbbec1ea 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -78,8 +78,19 @@ export type AsyncFetchOptions = {
 };
 
 // =================================================================
-export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
+export type DirectFetchRequest = {
+  url: string;
+  headers: Record<string, string>;
+  cdp: CDPSession;
+};
+
+// =================================================================
+export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
   cdp: CDPSession;
+};
+
+// =================================================================
+export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
   requestId: string;
 };
 
@@ -1062,13 +1073,19 @@ export class Recorder {
     this.writer.writeRecordPair(responseRecord, requestRecord);
   }
 
-  async directFetchCapture(
-    url: string,
-    headers: Record<string, string>,
-  ): Promise<{ fetched: boolean; mime: string; ts: Date }> {
+  async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
+    fetched: boolean;
+    mime: string;
+    ts: Date;
+  }> {
     const reqresp = new RequestResponseInfo("0");
     const ts = new Date();
 
+    const cookie = await this.getCookieString(cdp, url);
+    if (cookie) {
+      headers["Cookie"] = cookie;
+    }
+
     reqresp.url = url;
     reqresp.method = "GET";
     reqresp.requestHeaders = headers;
@@ -1083,12 +1100,17 @@ export class Recorder {
     let mime: string = "";
 
     const filter = (resp: Response) => {
+      // just in case, attempt to load invalid responses via browser
+      if (resp.status >= 400) {
+        return false;
+      }
+
       const ct = resp.headers.get("content-type");
       if (ct) {
         mime = ct.split(";")[0];
       }
 
-      return !isHTMLContentType(mime);
+      return reqresp.isRedirectStatus(resp.status) || !isHTMLContentType(mime);
     };
 
     // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
@@ -1111,8 +1133,23 @@ export class Recorder {
       this.pageInfo.ts = ts;
     }
 
+    if (reqresp.isRedirectStatus(reqresp.status) && reqresp.responseHeaders) {
+      const url = reqresp.responseHeaders["location"];
+      await this.directFetchCapture({ url, headers, cdp });
+    }
+
     return { fetched: res === "fetched", mime, ts };
   }
+
+  async getCookieString(cdp: CDPSession, url: string) {
+    const cookieList: string[] = [];
+    const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
+    for (const { name, value } of cookies) {
+      cookieList.push(`${name}=${value}`);
+    }
+
+    return cookieList.join(";");
+  }
 }
 
 // =================================================================
@@ -1321,6 +1358,7 @@
       headers,
       body: reqresp.postData || undefined,
       signal,
+      redirect: "manual",
     });
 
     if (this.filter && !this.filter(resp) && abort) {
@@ -1337,6 +1375,7 @@
     }
 
     if (reqresp.expectedSize === 0) {
+      reqresp.fillFetchResponse(resp);
       reqresp.payload = new Uint8Array();
       return;
     } else if (!resp.body) {
@@ -1436,7 +1475,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
 class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
   cdp: CDPSession;
 
-  constructor(opts: ResponseStreamAsyncFetchOptions) {
+  constructor(opts: NetworkLoadAsyncFetchOptions) {
     super(opts);
     this.cdp = opts.cdp;
   }
diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts
index bbee4b433..2b3a4e152 100644
--- a/src/util/reqresp.ts
+++ b/src/util/reqresp.ts
@@ -149,10 +149,15 @@ export class RequestResponseInfo {
     }
   }
 
+  isRedirectStatus(status: number) {
+    return status >= 300 && status < 400 && status !== 304;
+  }
+
   isSelfRedirect() {
-    if (this.status < 300 || this.status >= 400 || this.status === 304) {
+    if (!this.isRedirectStatus(this.status)) {
       return false;
     }
+
     try {
       const headers = new Headers(this.getResponseHeadersDict());
       const location = headers.get("location") || "";
diff --git a/src/util/worker.ts b/src/util/worker.ts
index 00cf3c7c1..b54bc2a71 100644
--- a/src/util/worker.ts
+++ b/src/util/worker.ts
@@ -2,7 +2,7 @@ import os from "os";
 
 import { logger, formatErr } from "./logger.js";
 import { sleep, timedRun } from "./timing.js";
-import { Recorder } from "./recorder.js";
+import { DirectFetchRequest, Recorder } from "./recorder.js";
 import { rxEscape } from "./seeds.js";
 import { CDPSession, Page } from "puppeteer-core";
 import { PageState, WorkerId } from "./state.js";
@@ -22,8 +22,7 @@ export type WorkerOpts = {
   callbacks: Record<string, Function>;
   directFetchCapture:
     | ((
-        url: string,
-        headers: Record<string, string>,
+        request: DirectFetchRequest,
       ) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
     | null;
   frameIdToExecId: Map<string, number>;
@@ -174,8 +173,7 @@ export class PageWorker {
       this.cdp = cdp;
       this.callbacks = {};
       const directFetchCapture = this.recorder
-        ? (x: string, h: Record<string, string>) =>
-            this.recorder!.directFetchCapture(x, h)
+        ? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
         : null;
       this.opts = {
         page,

From 49573969ca1c3567d5dc371f4801dc5e34181dbc Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 23 May 2024 15:40:13 -0700
Subject: [PATCH 3/5] make manualRedirect opt configurable

---
 src/util/recorder.ts | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index 0dbbec1ea..458a0f5e7 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -75,6 +75,7 @@ export type AsyncFetchOptions = {
   filter?: (resp: Response) => boolean;
   ignoreDupe?: boolean;
   maxFetchSize?: number;
+  manualRedirect?: boolean;
 };
 
 // =================================================================
@@ -1121,12 +1122,11 @@ export class Recorder {
       networkId: "0",
       filter,
       ignoreDupe: true,
+      manualRedirect: true,
     });
 
     const res = await fetcher.load();
 
-    this.addPageRecord(reqresp);
-
     if (url === this.pageUrl && !this.pageInfo.ts) {
       logger.debug("Setting page timestamp", { ts, url });
       this.pageInfo.ts = ts;
     }
@@ -1166,6 +1166,8 @@ class AsyncFetcher {
   tempdir: string;
   filename: string;
 
+  manualRedirect = false;
+
   constructor({
     tempdir,
     reqresp,
@@ -1175,6 +1177,7 @@
     filter = undefined,
     ignoreDupe = false,
     maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
+    manualRedirect = false,
   }: AsyncFetchOptions) {
     this.reqresp = reqresp;
     this.reqresp.expectedSize = expectedSize;
@@ -1193,6 +1196,8 @@
     );
 
     this.maxFetchSize = maxFetchSize;
+
+    this.manualRedirect = manualRedirect;
   }
 
   async load() {
@@ -1328,9 +1333,9 @@ class AsyncFetcher {
       reqresp.status = 0;
       reqresp.errorText = e.message;
     } finally {
+      recorder.addPageRecord(reqresp);
       // exclude direct fetch request with fake id
       if (networkId !== "0") {
-        recorder.addPageRecord(reqresp);
         recorder.removeReqResp(networkId);
       }
     }
@@ -1358,7 +1363,7 @@ class AsyncFetcher {
       headers,
       body: reqresp.postData || undefined,
       signal,
-      redirect: "manual",
+      redirect: this.manualRedirect ? "manual" : "follow",
     });
 
     if (this.filter && !this.filter(resp) && abort) {

From 7d0adc40739553e9c7cf8099c5fd4d1d332a58bd Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 23 May 2024 17:10:16 -0700
Subject: [PATCH 4/5] direct fetch: ensure redirect succeeds before committing
 direct fetch redirect records

---
 src/crawler.ts       |  2 +-
 src/util/recorder.ts | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 1d275893f..fe085bd58 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -801,7 +801,7 @@ self.__bx_behaviors.selectMainBehavior();
           data.ts = ts || new Date();
           logger.info(
             "Direct fetch successful",
-            { url, ...logDetails },
+            { url, mime, ...logDetails },
             "fetch",
           );
           return;
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index 458a0f5e7..b668ad5ee 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -1110,8 +1110,8 @@ export class Recorder {
       if (ct) {
         mime = ct.split(";")[0];
       }
 
-      return reqresp.isRedirectStatus(resp.status) || !isHTMLContentType(mime);
+      return !isHTMLContentType(mime) || reqresp.isRedirectStatus(resp.status);
     };
 
     // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
     // should not get here, as dupe pages tracked via seen list
@@ -1127,16 +1127,25 @@ export class Recorder {
     });
 
     const res = await fetcher.load();
 
+    // recursively attempt to handle redirect
+    if (reqresp.isRedirectStatus(reqresp.status) && reqresp.responseHeaders) {
+      const newUrl = new URL(reqresp.responseHeaders["location"], url).href;
+      const res = await this.directFetchCapture({ url: newUrl, headers, cdp });
+      if (res.fetched) {
+        mime = res.mime;
+      }
+    }
+
+    // get here if got a non-redirect response successfully
+    this.addPageRecord(reqresp);
+
+    await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url);
+
     if (url === this.pageUrl && !this.pageInfo.ts) {
       logger.debug("Setting page timestamp", { ts, url });
       this.pageInfo.ts = ts;
     }
 
-    if (reqresp.isRedirectStatus(reqresp.status) && reqresp.responseHeaders) {
-      const url = reqresp.responseHeaders["location"];
-      await this.directFetchCapture({ url, headers, cdp });
-    }
-
     return { fetched: res === "fetched", mime, ts };
   }
@@ -1212,6 +1221,7 @@ class AsyncFetcher {
     if (
       reqresp.method === "GET" &&
       url &&
+      networkId !== "0" &&
       !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))
     ) {
       if (!this.ignoreDupe) {
@@ -1333,9 +1343,9 @@ class AsyncFetcher {
       reqresp.status = 0;
       reqresp.errorText = e.message;
     } finally {
-      recorder.addPageRecord(reqresp);
       // exclude direct fetch request with fake id
       if (networkId !== "0") {
+        recorder.addPageRecord(reqresp);
         recorder.removeReqResp(networkId);
       }
     }

From e8e1cdf8af1ab264c1deecaebe67e2af61cd2d96 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 23 May 2024 17:15:27 -0700
Subject: [PATCH 5/5] only handle non-redirects in direct fetch, as redirects
 records get serialized even if need to redo via browser

---
 src/util/recorder.ts | 21 ++++-----------------
 src/util/reqresp.ts  |  6 +++---
 2 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index b668ad5ee..182571ea6 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -1101,18 +1101,18 @@ export class Recorder {
     let mime: string = "";
 
     const filter = (resp: Response) => {
-      // just in case, attempt to load invalid responses via browser
-      if (resp.status >= 400) {
+      // only direct load 200 responses
+      if (resp.status !== 200) {
         return false;
       }
 
       const ct = resp.headers.get("content-type");
       if (ct) {
         mime = ct.split(";")[0];
       }
 
-      return !isHTMLContentType(mime) || reqresp.isRedirectStatus(resp.status);
+      return !isHTMLContentType(mime);
     };
 
     // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
     // should not get here, as dupe pages tracked via seen list
@@ -1127,21 +1127,9 @@ export class Recorder {
     });
 
     const res = await fetcher.load();
 
-    // recursively attempt to handle redirect
-    if (reqresp.isRedirectStatus(reqresp.status) && reqresp.responseHeaders) {
-      const newUrl = new URL(reqresp.responseHeaders["location"], url).href;
-      const res = await this.directFetchCapture({ url: newUrl, headers, cdp });
-      if (res.fetched) {
-        mime = res.mime;
-      }
-    }
-
-    // get here if got a non-redirect response successfully
     this.addPageRecord(reqresp);
 
-    await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url);
-
     if (url === this.pageUrl && !this.pageInfo.ts) {
       logger.debug("Setting page timestamp", { ts, url });
       this.pageInfo.ts = ts;
     }
@@ -1221,7 +1209,6 @@ class AsyncFetcher {
     if (
       reqresp.method === "GET" &&
       url &&
-      networkId !== "0" &&
       !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))
     ) {
       if (!this.ignoreDupe) {
@@ -1343,9 +1330,9 @@ class AsyncFetcher {
       reqresp.status = 0;
       reqresp.errorText = e.message;
     } finally {
+      recorder.addPageRecord(reqresp);
       // exclude direct fetch request with fake id
       if (networkId !== "0") {
-        recorder.addPageRecord(reqresp);
         recorder.removeReqResp(networkId);
       }
     }
diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts
index 2b3a4e152..1fc9fbe55 100644
--- a/src/util/reqresp.ts
+++ b/src/util/reqresp.ts
@@ -149,10 +149,10 @@ export class RequestResponseInfo {
     }
   }
 
-  isRedirectStatus(status: number) {
-    return status >= 300 && status < 400 && status !== 304;
+  isRedirectStatus() {
+    return this.status >= 300 && this.status < 400 && this.status !== 304;
   }
 
   isSelfRedirect() {
-    if (!this.isRedirectStatus(this.status)) {
+    if (!this.isRedirectStatus()) {
       return false;
     }