Load non-HTML resources directly whenever possible #583

Merged: 5 commits, May 24, 2024
88 changes: 15 additions & 73 deletions src/crawler.ts
@@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js";
import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
HTML_TYPES,
DEFAULT_SELECTORS,
} from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";

// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";

const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false,
});

const HTTP_AGENT = new HTTPAgent();
import { isHTMLContentType } from "./util/reqresp.js";

const behaviors = fs.readFileSync(
new URL(
@@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior();
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();

const { page, data, workerid, callbacks, directFetchCapture } = opts;
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
data.callbacks = callbacks;

const { url } = data;
@@ -790,35 +781,27 @@ self.__bx_behaviors.selectMainBehavior();
data.logDetails = logDetails;
data.workerid = workerid;

data.isHTMLPage = await timedRun(
this.isHTML(url, logDetails),
FETCH_TIMEOUT_SECS,
"HEAD request to determine if URL is HTML page timed out",
logDetails,
"fetch",
true,
);

if (!data.isHTMLPage && directFetchCapture) {
if (directFetchCapture) {
try {
const { fetched, mime } = await timedRun(
directFetchCapture(url),
const { fetched, mime, ts } = await timedRun(
directFetchCapture({ url, headers: this.headers, cdp }),
FETCH_TIMEOUT_SECS,
"Direct fetch capture attempt timed out",
logDetails,
"fetch",
true,
);
if (mime) {
data.mime = mime;
data.isHTMLPage = isHTMLContentType(mime);
}
if (fetched) {
data.loadState = LoadState.FULL_PAGE_LOADED;
if (mime) {
data.mime = mime;
}
data.status = 200;
data.ts = new Date();
data.ts = ts || new Date();
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
{ url, mime, ...logDetails },
"fetch",
);
return;
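
For readability, the rewritten block in `crawlPage` reads as follows once the old HEAD pre-check is gone (condensed from the hunk above; the enclosing try/catch, logging, and the fall-through to a normal browser load are omitted):

```ts
// The crawler now always attempts a direct fetch first. The recorder aborts the
// fetch for HTML responses, so `fetched` is only true for non-HTML resources,
// which are captured directly and skip the browser load entirely.
if (directFetchCapture) {
  const { fetched, mime, ts } = await timedRun(
    directFetchCapture({ url, headers: this.headers, cdp }),
    FETCH_TIMEOUT_SECS,
    "Direct fetch capture attempt timed out",
    logDetails,
    "fetch",
    true,
  );

  if (mime) {
    data.mime = mime;
    data.isHTMLPage = isHTMLContentType(mime);
  }

  if (fetched) {
    data.loadState = LoadState.FULL_PAGE_LOADED;
    data.status = 200;
    data.ts = ts || new Date();
    return; // non-HTML resource captured directly, no browser load needed
  }
}
// ...otherwise continue with the regular browser-based page load
```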
Expand Down Expand Up @@ -1752,7 +1735,7 @@ self.__bx_behaviors.selectMainBehavior();

const contentType = resp.headers()["content-type"];

isHTMLPage = this.isHTMLContentType(contentType);
isHTMLPage = isHTMLContentType(contentType);

if (contentType) {
data.mime = contentType.split(";")[0];
@@ -1878,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior();
"behavior",
);
try {
await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
await frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
);
} catch (e) {
logger.warn("Waiting for custom page load failed", e, "behavior");
}
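
The added `self.__bx_behaviors &&` guard turns this call into a no-op in frames where the behaviors script has not been injected, instead of rejecting and tripping the "Waiting for custom page load failed" warning.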
@@ -2191,49 +2176,6 @@ self.__bx_behaviors.selectMainBehavior();
}
}

resolveAgent(urlParsed: URL) {
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
}

async isHTML(url: string, logDetails: LogDetails) {
try {
const resp = await fetch(url, {
method: "HEAD",
headers: this.headers,
agent: this.resolveAgent,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any);
if (resp.status !== 200) {
logger.debug("HEAD response code != 200, loading in browser", {
status: resp.status,
...logDetails,
});
return true;
}

return this.isHTMLContentType(resp.headers.get("Content-Type"));
} catch (e) {
// can't confirm not html, so try in browser
logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
return true;
}
}

isHTMLContentType(contentType: string | null) {
// just load if no content-type
if (!contentType) {
return true;
}

const mime = contentType.split(";")[0];

if (HTML_TYPES.includes(mime)) {
return true;
}

return false;
}

async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
if (!sitemap) {
return;
85 changes: 70 additions & 15 deletions src/util/recorder.ts
@@ -6,7 +6,7 @@ import PQueue from "p-queue";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun, timestampNow } from "./timing.js";
import { RequestResponseInfo } from "./reqresp.js";
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";

// @ts-expect-error TODO fill in why error is expected
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
@@ -75,11 +75,23 @@ export type AsyncFetchOptions = {
filter?: (resp: Response) => boolean;
ignoreDupe?: boolean;
maxFetchSize?: number;
manualRedirect?: boolean;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
export type DirectFetchRequest = {
url: string;
headers: Record<string, string>;
cdp: CDPSession;
};

// =================================================================
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
cdp: CDPSession;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
requestId: string;
};

@@ -1062,21 +1074,45 @@ export class Recorder {
this.writer.writeRecordPair(responseRecord, requestRecord);
}

async directFetchCapture(
url: string,
): Promise<{ fetched: boolean; mime: string }> {
async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
fetched: boolean;
mime: string;
ts: Date;
}> {
const reqresp = new RequestResponseInfo("0");
const ts = new Date();

const cookie = await this.getCookieString(cdp, url);
if (cookie) {
headers["Cookie"] = cookie;
}

reqresp.url = url;
reqresp.method = "GET";
reqresp.requestHeaders = headers;
reqresp.ts = ts;

logger.debug(
"Directly fetching page URL without browser",
{ url, ...this.logDetails },
"recorder",
);

const filter = (resp: Response) =>
resp.status === 200 && !resp.headers.get("set-cookie");
let mime: string = "";

const filter = (resp: Response) => {
// only direct load 200 responses
if (resp.status !== 200) {
return false;
}

const ct = resp.headers.get("content-type");
if (ct) {
mime = ct.split(";")[0];
}

return !isHTMLContentType(mime);
};

// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
// should not get here, as dupe pages tracked via seen list
@@ -1087,16 +1123,28 @@
networkId: "0",
filter,
ignoreDupe: true,
manualRedirect: true,
});
const res = await fetcher.load();

const mime =
(reqresp.responseHeaders &&
reqresp.responseHeaders["content-type"] &&
reqresp.responseHeaders["content-type"].split(";")[0]) ||
"";
this.addPageRecord(reqresp);

if (url === this.pageUrl && !this.pageInfo.ts) {
logger.debug("Setting page timestamp", { ts, url });
this.pageInfo.ts = ts;
}

return { fetched: res === "fetched", mime, ts };
}

async getCookieString(cdp: CDPSession, url: string) {
const cookieList: string[] = [];
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
for (const { name, value } of cookies) {
cookieList.push(`${name}=${value}`);
}

return { fetched: res === "fetched", mime };
return cookieList.join(";");
}
}
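
To make the cookie handling concrete, a small hypothetical example (the cookie names and values are invented for illustration):

```ts
// Suppose the CDP call Network.getCookies for https://example.com/report.pdf
// returns: [{ name: "session", value: "abc123" }, { name: "lang", value: "en" }].
// getCookieString() joins these into "session=abc123;lang=en", and
// directFetchCapture forwards that value on the direct request, so the fetch
// carries the same cookies the browser session holds:
const cookie = await this.getCookieString(cdp, url);
if (cookie) {
  headers["Cookie"] = cookie;
}
```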

@@ -1115,6 +1163,8 @@ class AsyncFetcher {
tempdir: string;
filename: string;

manualRedirect = false;

constructor({
tempdir,
reqresp,
@@ -1124,6 +1174,7 @@
filter = undefined,
ignoreDupe = false,
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
manualRedirect = false,
}: AsyncFetchOptions) {
this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize;
@@ -1142,6 +1193,8 @@
);

this.maxFetchSize = maxFetchSize;

this.manualRedirect = manualRedirect;
}

async load() {
@@ -1277,9 +1330,9 @@
reqresp.status = 0;
reqresp.errorText = e.message;
} finally {
recorder.addPageRecord(reqresp);
// exclude direct fetch request with fake id
if (networkId !== "0") {
recorder.addPageRecord(reqresp);
recorder.removeReqResp(networkId);
}
}
@@ -1307,6 +1360,7 @@
headers,
body: reqresp.postData || undefined,
signal,
redirect: this.manualRedirect ? "manual" : "follow",
});

if (this.filter && !this.filter(resp) && abort) {
@@ -1323,6 +1377,7 @@
}

if (reqresp.expectedSize === 0) {
reqresp.fillFetchResponse(resp);
reqresp.payload = new Uint8Array();
return;
} else if (!resp.body) {
@@ -1422,7 +1477,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
cdp: CDPSession;

constructor(opts: ResponseStreamAsyncFetchOptions) {
constructor(opts: NetworkLoadAsyncFetchOptions) {
super(opts);
this.cdp = opts.cdp;
}
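
Taken together, the new `manualRedirect` option threads from `directFetchCapture` down to the underlying `fetch` call. A condensed sketch of that path, assembled from the hunks above (the start of the constructor call in `directFetchCapture` is outside the displayed diff, so treat this as a sketch rather than the exact code):

```ts
// In directFetchCapture: the fetcher for a page URL opts out of automatic redirects.
const fetcher = new AsyncFetcher({
  // ...other options as passed in directFetchCapture above
  networkId: "0",
  filter,
  ignoreDupe: true,
  manualRedirect: true,
});

// In AsyncFetcher: the stored flag selects the redirect mode for fetch()
// (the URL argument is outside the displayed hunk).
const resp = await fetch(url, {
  headers,
  body: reqresp.postData || undefined,
  signal,
  redirect: this.manualRedirect ? "manual" : "follow",
});
// With "manual", a 3xx response surfaces with its original status; the 200-only
// filter then rejects it, so redirected page URLs fall back to a normal browser
// load instead of the redirect being followed outside the browser.
```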
23 changes: 22 additions & 1 deletion src/util/reqresp.ts
@@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";

import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
import { HTML_TYPES } from "./constants.js";

const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";
@@ -148,10 +149,15 @@ export class RequestResponseInfo {
}
}

isRedirectStatus() {
return this.status >= 300 && this.status < 400 && this.status !== 304;
}

isSelfRedirect() {
if (this.status < 300 || this.status >= 400 || this.status === 304) {
if (!this.isRedirectStatus()) {
return false;
}

try {
const headers = new Headers(this.getResponseHeadersDict());
const location = headers.get("location") || "";
@@ -362,3 +368,18 @@ export class RequestResponseInfo {
return value.replace(/\n/g, ", ");
}
}

export function isHTMLContentType(contentType: string | null) {
// just load if no content-type
if (!contentType) {
return true;
}

const mime = contentType.split(";")[0];

if (HTML_TYPES.includes(mime)) {
return true;
}

return false;
}
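
For reference, a few illustrative calls for the relocated helper; this assumes `text/html` is among the entries of `HTML_TYPES` in `./constants.js`, which is not shown in this diff:

```ts
// Assumption: HTML_TYPES includes "text/html".
isHTMLContentType("text/html; charset=utf-8"); // true, parameters after ";" are ignored
isHTMLContentType(null);                       // true, no content-type means load in browser
isHTMLContentType("application/pdf");          // false, eligible for direct fetch

// The new isRedirectStatus() on RequestResponseInfo is true for 301, 302, 303,
// 307 and 308, and false for 304 (Not Modified), which is excluded explicitly.
```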
2 changes: 1 addition & 1 deletion src/util/state.ts
@@ -66,7 +66,7 @@ export class PageState {

callbacks: PageCallbacks = {};

isHTMLPage?: boolean;
isHTMLPage = true;
text?: string;
screenshotView?: Buffer;
favicon?: string;
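
With the direct fetch now attempted for every queued URL, `isHTMLPage` defaults to `true` and is only switched to `false` once a fetched mime type (or the browser response) shows the resource is not HTML; previously it stayed unset until the HEAD pre-check ran.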