From 0a881b9fc1772756e976b10be31c17307e38cf53 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 29 May 2024 21:33:08 -0700 Subject: [PATCH 01/12] proxy: fix proxy settings when PROXY_HOST and PROXY_PORT env vars are set (to be compatible with 0.12.x) accidentally always ignored proxy settings before fixes #587 --- src/crawler.ts | 5 ++++- src/create-login-profile.ts | 1 - src/util/browser.ts | 8 +++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index fe085bd5..8c935ab1 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1289,7 +1289,10 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: false, + proxy: + process.env.PROXY_HOST && process.env.PROXY_PORT + ? `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}` + : "", userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 047ed017..976919fc 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -179,7 +179,6 @@ async function main() { headless: params.headless, signals: false, chromeOptions: { - proxy: false, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/src/util/browser.ts b/src/util/browser.ts index 65500cb4..6c1b5012 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; type BtrixChromeOpts = { - proxy?: boolean; + proxy?: string; userAgent?: string | null; extraArgs?: string[]; }; @@ -217,7 +217,7 @@ export class Browser { } chromeArgs({ - proxy = true, + proxy = "", userAgent = null, extraArgs = [], }: BtrixChromeOpts) { @@ -238,9 +238,7 @@ export class Browser { if (proxy) { args.push("--ignore-certificate-errors"); - args.push( - 
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`, - ); + args.push(`--proxy-server=${proxy}`); } return args; From 5354c633ed497b79bd1dc0634b47f6d7e0206bcd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 29 May 2024 22:26:09 -0700 Subject: [PATCH 02/12] remove obsolete PROXY_* vars --- Dockerfile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index adefbd21..d796af28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE} # needed to add args to main build stage ARG BROWSER_VERSION -ENV PROXY_HOST=localhost \ - PROXY_PORT=8080 \ - PROXY_CA_URL=http://wsgiprox/download/pem \ - PROXY_CA_FILE=/tmp/proxy-ca.pem \ - DISPLAY=:99 \ +ENV DISPLAY=:99 \ GEOMETRY=1360x1020x16 \ BROWSER_VERSION=${BROWSER_VERSION} \ BROWSER_BIN=google-chrome \ From eb47cb7a5030164253c810f3ac6c71a98caa0845 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 29 May 2024 22:46:58 -0700 Subject: [PATCH 03/12] add support for PROXY_SERVER env var for setting custom proxy, as well as --proxyServer cli flag --- src/crawler.ts | 5 +---- src/create-login-profile.ts | 8 +++++--- src/util/argParser.ts | 6 ++++++ src/util/browser.ts | 15 +++++++++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 8c935ab1..11404e16 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1289,10 +1289,7 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: - process.env.PROXY_HOST && process.env.PROXY_PORT - ? 
`http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}` - : "", + proxy: this.params.proxyServer, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 976919fc..2f19f0ea 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } { default: getDefaultWindowSize(), }, - proxy: { - type: "boolean", - default: false, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", }, cookieDays: { @@ -179,6 +180,7 @@ async function main() { headless: params.headless, signals: false, chromeOptions: { + proxy: params.proxyServer, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 9a8d1c68..7e50fb24 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -545,6 +545,12 @@ class ArgParser { default: "disabled", }, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", + }, + qaSource: { describe: "Required for QA mode. 
Source (WACZ or multi WACZ) for QA", type: "string", diff --git a/src/util/browser.ts b/src/util/browser.ts index 6c1b5012..8b8ac980 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -236,6 +236,8 @@ export class Browser { ...extraArgs, ]; + proxy = proxy || this.getProxy(); + if (proxy) { args.push("--ignore-certificate-errors"); args.push(`--proxy-server=${proxy}`); @@ -265,6 +267,19 @@ export class Browser { return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; } + getProxy() { + if (process.env.PROXY_SERVER) { + return process.env.PROXY_SERVER; + } + + // for backwards compatibility with 0.x proxy settings + if (process.env.PROXY_HOST && process.env.PROXY_PORT) { + return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; + } + + return ""; + } + getBrowserExe() { const files = [ process.env.BROWSER_BIN, From ec07cc99f10f26a20a5a1b45977bea2fc3398f63 Mon Sep 17 00:00:00 2001 From: Vinzenz Sinapius Date: Fri, 31 May 2024 09:31:41 +0200 Subject: [PATCH 04/12] WIP 1 --- package.json | 3 ++ src/util/browser.ts | 3 +- src/util/recorder.ts | 22 ++++++++++- src/util/reqresp.ts | 2 + yarn.lock | 91 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 117 insertions(+), 4 deletions(-) diff --git a/package.json b/package.json index f05071eb..1afd24ad 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ }, "dependencies": { "@novnc/novnc": "^1.4.0", + "@types/node-fetch": "^2.6.11", "@types/sax": "^1.2.7", "@webrecorder/wabac": "^2.16.12", "browsertrix-behaviors": "^0.6.0", @@ -27,12 +28,14 @@ "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", "minio": "^7.1.3", + "node-fetch": "^3.3.2", "p-queue": "^7.3.4", "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", "puppeteer-core": "^22.6.1", "sax": "^1.3.0", "sharp": "^0.32.6", + "socks-proxy-agent": "^8.0.3", "tsc": "^2.0.4", "uuid": "8.3.2", "warcio": "^2.2.1", diff --git a/src/util/browser.ts b/src/util/browser.ts index 
8b8ac980..22579e95 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -115,7 +115,7 @@ export class Browser { ? undefined : (target) => this.targetFilter(target), }; - + logger.info("Launching browser", launchOpts); await this._init(launchOpts, ondisconnect, recording); } @@ -237,6 +237,7 @@ export class Browser { ]; proxy = proxy || this.getProxy(); + logger.info(`Proxy settings: ${proxy}`); if (proxy) { args.push("--ignore-certificate-errors"); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 182571ea..b193bcea 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,6 +8,13 @@ import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; +import { SocksProxyAgent } from "socks-proxy-agent"; + +import fetch, { Response } from "node-fetch"; + +import { default as stream } from "node:stream"; +import type { ReadableStream } from "node:stream/web"; + // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; import { @@ -1165,6 +1172,8 @@ class AsyncFetcher { manualRedirect = false; + socksAgent: SocksProxyAgent | null = null; + constructor({ tempdir, reqresp, @@ -1195,6 +1204,10 @@ class AsyncFetcher { this.maxFetchSize = maxFetchSize; this.manualRedirect = manualRedirect; + + if (process.env.PROXY_SERVER) { + this.socksAgent = new SocksProxyAgent(process.env.PROXY_SERVER); + } } async load() { @@ -1361,6 +1374,7 @@ class AsyncFetcher { body: reqresp.postData || undefined, signal, redirect: this.manualRedirect ? 
"manual" : "follow", + agent: this.socksAgent || undefined, }); if (this.filter && !this.filter(resp) && abort) { @@ -1386,10 +1400,14 @@ class AsyncFetcher { reqresp.fillFetchResponse(resp); - return this.takeReader(resp.body.getReader()); + const reader = stream.Readable.fromWeb( + resp.body as unknown as ReadableStream, + ); + + return this.takeReader(reader); } - async *takeReader(reader: ReadableStreamDefaultReader) { + async *takeReader(reader: stream.Readable) { let size = 0; try { while (true) { diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 1fc9fbe5..515e6517 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -5,6 +5,8 @@ import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; +import { Response } from "node-fetch"; + const CONTENT_LENGTH = "content-length"; const CONTENT_TYPE = "content-type"; const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"]; diff --git a/yarn.lock b/yarn.lock index 07506669..d7f034b4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1136,6 +1136,14 @@ resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== +"@types/node-fetch@^2.6.11": + version "2.6.11" + resolved "https://registry.yarnpkg.com/@types/node-fetch/-/node-fetch-2.6.11.tgz#9b39b78665dae0e82a08f02f4967d62c66f95d24" + integrity sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g== + dependencies: + "@types/node" "*" + form-data "^4.0.0" + "@types/node@*": version "15.3.0" resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26" @@ -1363,6 +1371,13 @@ agent-base@^7.0.2, agent-base@^7.1.0: dependencies: debug "^4.3.4" +agent-base@^7.1.1: + version "7.1.1" + resolved 
"https://registry.yarnpkg.com/agent-base/-/agent-base-7.1.1.tgz#bdbded7dfb096b751a2a087eeeb9664725b2e317" + integrity sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA== + dependencies: + debug "^4.3.4" + ajv@^6.12.4: version "6.12.6" resolved "https://registry.yarnpkg.com/ajv/-/ajv-6.12.6.tgz#baf5a62e802b07d977034586f8c3baf5adf26df4" @@ -1476,6 +1491,11 @@ async@^3.2.4: resolved "https://registry.yarnpkg.com/async/-/async-3.2.4.tgz#2d22e00f8cddeb5fde5dd33522b56d1cf569a81c" integrity sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ== +asynckit@^0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" + integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q== + auto-js-ipfs@^2.1.1: version "2.3.0" resolved "https://registry.yarnpkg.com/auto-js-ipfs/-/auto-js-ipfs-2.3.0.tgz#2c2684074cdaa2eb579345c4f86420d7635956c8" @@ -1849,6 +1869,13 @@ color@^4.2.3: color-convert "^2.0.1" color-string "^1.9.0" +combined-stream@^1.0.8: + version "1.0.8" + resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f" + integrity sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg== + dependencies: + delayed-stream "~1.0.0" + concat-map@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" @@ -1905,6 +1932,11 @@ crypto-random-string@^4.0.0: dependencies: type-fest "^1.0.1" +data-uri-to-buffer@^4.0.0: + version "4.0.1" + resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz#d8feb2b2881e6a4f58c2e08acfd0e2834e26222e" + integrity sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A== + data-uri-to-buffer@^5.0.1: 
version "5.0.1" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c" @@ -1972,6 +2004,11 @@ degenerator@^5.0.0: escodegen "^2.1.0" esprima "^4.0.1" +delayed-stream@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619" + integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ== + denque@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/denque/-/denque-2.1.0.tgz#e93e1a6569fb5e66f16a3c2a2964617d349d6ab1" @@ -2386,6 +2423,14 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" +fetch-blob@^3.1.2, fetch-blob@^3.1.4: + version "3.2.0" + resolved "https://registry.yarnpkg.com/fetch-blob/-/fetch-blob-3.2.0.tgz#f09b8d4bbd45adc6f0c20b7e787e793e309dcce9" + integrity sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ== + dependencies: + node-domexception "^1.0.0" + web-streams-polyfill "^3.0.3" + file-entry-cache@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" @@ -2439,6 +2484,22 @@ foreach@^2.0.5: resolved "https://registry.yarnpkg.com/foreach/-/foreach-2.0.5.tgz#0bee005018aeb260d0a3af3ae658dd0136ec1b99" integrity sha1-C+4AUBiusmDQo6865ljdATbsG5k= +form-data@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452" + integrity sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + +formdata-polyfill@^4.0.10: + version "4.0.10" + resolved "https://registry.yarnpkg.com/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz#24807c31c9d402e002ab3d8c720144ceb8848423" + integrity 
sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g== + dependencies: + fetch-blob "^3.1.2" + fs-constants@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad" @@ -3606,7 +3667,7 @@ mime-db@1.52.0: resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.52.0.tgz#bbabcdc02859f4987301c856e3387ce5ec43bf70" integrity sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg== -mime-types@^2.1.35: +mime-types@^2.1.12, mime-types@^2.1.35: version "2.1.35" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a" integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw== @@ -3704,6 +3765,20 @@ node-addon-api@^6.1.0: resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-6.1.0.tgz#ac8470034e58e67d0c6f1204a18ae6995d9c0d76" integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA== +node-domexception@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5" + integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ== + +node-fetch@^3.3.2: + version "3.3.2" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-3.3.2.tgz#d1e889bacdf733b4ff3b2b243eb7a12866a0b78b" + integrity sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA== + dependencies: + data-uri-to-buffer "^4.0.0" + fetch-blob "^3.1.4" + formdata-polyfill "^4.0.10" + node-int64@^0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/node-int64/-/node-int64-0.4.0.tgz#87a9065cdb355d3182d8f94ce11188b825c68a3b" @@ -4429,6 +4504,15 @@ socks-proxy-agent@^8.0.2: debug "^4.3.4" socks "^2.7.1" 
+socks-proxy-agent@^8.0.3: + version "8.0.3" + resolved "https://registry.yarnpkg.com/socks-proxy-agent/-/socks-proxy-agent-8.0.3.tgz#6b2da3d77364fde6292e810b496cb70440b9b89d" + integrity sha512-VNegTZKhuGq5vSD6XNKlbqWhyt/40CgoEw8XxD6dhnm8Jq9IEa3nIa4HwnM8XOqU0CdB0BwWVXusqiFXfHB3+A== + dependencies: + agent-base "^7.1.1" + debug "^4.3.4" + socks "^2.7.1" + socks@^2.7.1: version "2.7.1" resolved "https://registry.yarnpkg.com/socks/-/socks-2.7.1.tgz#d8e651247178fde79c0663043e07240196857d55" @@ -4958,6 +5042,11 @@ web-encoding@^1.1.5: optionalDependencies: "@zxing/text-encoding" "0.9.0" +web-streams-polyfill@^3.0.3: + version "3.3.3" + resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz#2073b91a2fdb1fbfbd401e7de0ac9f8214cecb4b" + integrity sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw== + which-boxed-primitive@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" From ae146b182ec9b05ec77e087c0baf7a92b0818b48 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Jun 2024 14:52:09 -0700 Subject: [PATCH 05/12] switch to undici proxy --- package.json | 5 +- src/util/browser.ts | 28 +++++----- src/util/recorder.ts | 52 ++++++++++++------ src/util/reqresp.ts | 2 +- yarn.lock | 126 +++++++++++++------------------------------ 5 files changed, 90 insertions(+), 123 deletions(-) diff --git a/package.json b/package.json index 1afd24ad..a5ca8728 100644 --- a/package.json +++ b/package.json @@ -17,26 +17,25 @@ }, "dependencies": { "@novnc/novnc": "^1.4.0", - "@types/node-fetch": "^2.6.11", "@types/sax": "^1.2.7", "@webrecorder/wabac": "^2.16.12", "browsertrix-behaviors": "^0.6.0", "crc": "^4.3.2", + "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", "ioredis": "^5.3.2", "js-levenshtein": "^1.1.6", "js-yaml": "^4.1.0", "minio": "^7.1.3", - "node-fetch": "^3.3.2", 
"p-queue": "^7.3.4", "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", "puppeteer-core": "^22.6.1", "sax": "^1.3.0", "sharp": "^0.32.6", - "socks-proxy-agent": "^8.0.3", "tsc": "^2.0.4", + "undici": "^6.18.2", "uuid": "8.3.2", "warcio": "^2.2.1", "ws": "^7.4.4", diff --git a/src/util/browser.ts b/src/util/browser.ts index 22579e95..c6cdeba7 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -236,7 +236,7 @@ export class Browser { ...extraArgs, ]; - proxy = proxy || this.getProxy(); + proxy = proxy || getProxy(); logger.info(`Proxy settings: ${proxy}`); if (proxy) { @@ -268,19 +268,6 @@ export class Browser { return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; } - getProxy() { - if (process.env.PROXY_SERVER) { - return process.env.PROXY_SERVER; - } - - // for backwards compatibility with 0.x proxy settings - if (process.env.PROXY_HOST && process.env.PROXY_PORT) { - return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; - } - - return ""; - } - getBrowserExe() { const files = [ process.env.BROWSER_BIN, @@ -614,3 +601,16 @@ export const defaultArgs = [ "--export-tagged-pdf", "--apps-keep-chrome-alive-in-tests", ]; + +export function getProxy() { + if (process.env.PROXY_SERVER) { + return process.env.PROXY_SERVER; + } + + // for backwards compatibility with 0.x proxy settings + if (process.env.PROXY_HOST && process.env.PROXY_PORT) { + return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; + } + + return ""; +} diff --git a/src/util/recorder.ts b/src/util/recorder.ts index b193bcea..411aeb39 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,12 +8,12 @@ import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; -import { SocksProxyAgent } from "socks-proxy-agent"; +import { fetch, Response, Dispatcher, ProxyAgent } from "undici"; -import 
fetch, { Response } from "node-fetch"; +import { socksDispatcher } from "fetch-socks"; +import type { SocksProxyType } from "socks/typings/common/constants.js"; -import { default as stream } from "node:stream"; -import type { ReadableStream } from "node:stream/web"; +import { getProxy } from "./browser.js"; // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; @@ -138,6 +138,8 @@ export class Recorder { frameIdToExecId: Map | null; + dispatcher?: Dispatcher; + constructor({ workerid, writer, @@ -160,6 +162,8 @@ export class Recorder { this.fetcherQ = new PQueue({ concurrency: 1 }); this.frameIdToExecId = null; + + this.dispatcher = createDispatcher(); } async onCreatePage({ @@ -1172,8 +1176,6 @@ class AsyncFetcher { manualRedirect = false; - socksAgent: SocksProxyAgent | null = null; - constructor({ tempdir, reqresp, @@ -1204,10 +1206,6 @@ class AsyncFetcher { this.maxFetchSize = maxFetchSize; this.manualRedirect = manualRedirect; - - if (process.env.PROXY_SERVER) { - this.socksAgent = new SocksProxyAgent(process.env.PROXY_SERVER); - } } async load() { @@ -1374,7 +1372,7 @@ class AsyncFetcher { body: reqresp.postData || undefined, signal, redirect: this.manualRedirect ? 
"manual" : "follow", - agent: this.socksAgent || undefined, + dispatcher: this.recorder.dispatcher, }); if (this.filter && !this.filter(resp) && abort) { @@ -1400,14 +1398,10 @@ class AsyncFetcher { reqresp.fillFetchResponse(resp); - const reader = stream.Readable.fromWeb( - resp.body as unknown as ReadableStream, - ); - - return this.takeReader(reader); + return this.takeReader(resp.body.getReader()); } - async *takeReader(reader: stream.Readable) { + async *takeReader(reader: ReadableStreamDefaultReader) { let size = 0; try { while (true) { @@ -1655,3 +1649,27 @@ function createRequest( requestBody, ); } + +function createDispatcher(): Dispatcher | undefined { + const proxyUrl = getProxy(); + if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { + return new ProxyAgent({ uri: proxyUrl }); + } else if ( + proxyUrl.startsWith("socks://") || + proxyUrl.startsWith("socks5://") || + proxyUrl.startsWith("socks4://") + ) { + const url = new URL(proxyUrl); + const type: SocksProxyType = url.protocol === "socks4:" ? 
4 : 5; + const params = { + type, + host: url.host, + port: parseInt(url.port), + userId: url.username || undefined, + password: url.password || undefined, + }; + return socksDispatcher(params); + } else { + return undefined; + } +} diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 515e6517..1f9fbc0f 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -5,7 +5,7 @@ import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; -import { Response } from "node-fetch"; +import { Response } from "undici"; const CONTENT_LENGTH = "content-length"; const CONTENT_TYPE = "content-type"; diff --git a/yarn.lock b/yarn.lock index d7f034b4..097c0363 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1136,14 +1136,6 @@ resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== -"@types/node-fetch@^2.6.11": - version "2.6.11" - resolved "https://registry.yarnpkg.com/@types/node-fetch/-/node-fetch-2.6.11.tgz#9b39b78665dae0e82a08f02f4967d62c66f95d24" - integrity sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g== - dependencies: - "@types/node" "*" - form-data "^4.0.0" - "@types/node@*": version "15.3.0" resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26" @@ -1371,13 +1363,6 @@ agent-base@^7.0.2, agent-base@^7.1.0: dependencies: debug "^4.3.4" -agent-base@^7.1.1: - version "7.1.1" - resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-7.1.1.tgz#bdbded7dfb096b751a2a087eeeb9664725b2e317" - integrity sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA== - dependencies: - debug "^4.3.4" - ajv@^6.12.4: version "6.12.6" resolved 
"https://registry.yarnpkg.com/ajv/-/ajv-6.12.6.tgz#baf5a62e802b07d977034586f8c3baf5adf26df4" @@ -1491,11 +1476,6 @@ async@^3.2.4: resolved "https://registry.yarnpkg.com/async/-/async-3.2.4.tgz#2d22e00f8cddeb5fde5dd33522b56d1cf569a81c" integrity sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ== -asynckit@^0.4.0: - version "0.4.0" - resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" - integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q== - auto-js-ipfs@^2.1.1: version "2.3.0" resolved "https://registry.yarnpkg.com/auto-js-ipfs/-/auto-js-ipfs-2.3.0.tgz#2c2684074cdaa2eb579345c4f86420d7635956c8" @@ -1869,13 +1849,6 @@ color@^4.2.3: color-convert "^2.0.1" color-string "^1.9.0" -combined-stream@^1.0.8: - version "1.0.8" - resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f" - integrity sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg== - dependencies: - delayed-stream "~1.0.0" - concat-map@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" @@ -1932,11 +1905,6 @@ crypto-random-string@^4.0.0: dependencies: type-fest "^1.0.1" -data-uri-to-buffer@^4.0.0: - version "4.0.1" - resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz#d8feb2b2881e6a4f58c2e08acfd0e2834e26222e" - integrity sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A== - data-uri-to-buffer@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c" @@ -2004,11 +1972,6 @@ degenerator@^5.0.0: escodegen "^2.1.0" esprima "^4.0.1" -delayed-stream@~1.0.0: - version "1.0.0" - resolved 
"https://registry.yarnpkg.com/delayed-stream/-/delayed-stream-1.0.0.tgz#df3ae199acadfb7d440aaae0b29e2272b24ec619" - integrity sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ== - denque@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/denque/-/denque-2.1.0.tgz#e93e1a6569fb5e66f16a3c2a2964617d349d6ab1" @@ -2423,13 +2386,13 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" -fetch-blob@^3.1.2, fetch-blob@^3.1.4: - version "3.2.0" - resolved "https://registry.yarnpkg.com/fetch-blob/-/fetch-blob-3.2.0.tgz#f09b8d4bbd45adc6f0c20b7e787e793e309dcce9" - integrity sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ== +fetch-socks@^1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b" + integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw== dependencies: - node-domexception "^1.0.0" - web-streams-polyfill "^3.0.3" + socks "^2.8.1" + undici "^6.10.1" file-entry-cache@^6.0.1: version "6.0.1" @@ -2484,22 +2447,6 @@ foreach@^2.0.5: resolved "https://registry.yarnpkg.com/foreach/-/foreach-2.0.5.tgz#0bee005018aeb260d0a3af3ae658dd0136ec1b99" integrity sha1-C+4AUBiusmDQo6865ljdATbsG5k= -form-data@^4.0.0: - version "4.0.0" - resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452" - integrity sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww== - dependencies: - asynckit "^0.4.0" - combined-stream "^1.0.8" - mime-types "^2.1.12" - -formdata-polyfill@^4.0.10: - version "4.0.10" - resolved "https://registry.yarnpkg.com/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz#24807c31c9d402e002ab3d8c720144ceb8848423" - integrity sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g== - dependencies: - fetch-blob "^3.1.2" - 
fs-constants@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad" @@ -2839,6 +2786,14 @@ ioredis@^5.3.2: redis-parser "^3.0.0" standard-as-callback "^2.1.0" +ip-address@^9.0.5: + version "9.0.5" + resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a" + integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g== + dependencies: + jsbn "1.1.0" + sprintf-js "^1.1.3" + ip@^1.1.8: version "1.1.8" resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" @@ -3488,6 +3443,11 @@ js-yaml@^4.1.0: dependencies: argparse "^2.0.1" +jsbn@1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040" + integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A== + jsesc@^2.5.1: version "2.5.2" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" @@ -3667,7 +3627,7 @@ mime-db@1.52.0: resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.52.0.tgz#bbabcdc02859f4987301c856e3387ce5ec43bf70" integrity sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg== -mime-types@^2.1.12, mime-types@^2.1.35: +mime-types@^2.1.35: version "2.1.35" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.35.tgz#381a871b62a734450660ae3deee44813f70d959a" integrity sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw== @@ -3765,20 +3725,6 @@ node-addon-api@^6.1.0: resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-6.1.0.tgz#ac8470034e58e67d0c6f1204a18ae6995d9c0d76" integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA== -node-domexception@^1.0.0: 
- version "1.0.0" - resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5" - integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ== - -node-fetch@^3.3.2: - version "3.3.2" - resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-3.3.2.tgz#d1e889bacdf733b4ff3b2b243eb7a12866a0b78b" - integrity sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA== - dependencies: - data-uri-to-buffer "^4.0.0" - fetch-blob "^3.1.4" - formdata-polyfill "^4.0.10" - node-int64@^0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/node-int64/-/node-int64-0.4.0.tgz#87a9065cdb355d3182d8f94ce11188b825c68a3b" @@ -4504,15 +4450,6 @@ socks-proxy-agent@^8.0.2: debug "^4.3.4" socks "^2.7.1" -socks-proxy-agent@^8.0.3: - version "8.0.3" - resolved "https://registry.yarnpkg.com/socks-proxy-agent/-/socks-proxy-agent-8.0.3.tgz#6b2da3d77364fde6292e810b496cb70440b9b89d" - integrity sha512-VNegTZKhuGq5vSD6XNKlbqWhyt/40CgoEw8XxD6dhnm8Jq9IEa3nIa4HwnM8XOqU0CdB0BwWVXusqiFXfHB3+A== - dependencies: - agent-base "^7.1.1" - debug "^4.3.4" - socks "^2.7.1" - socks@^2.7.1: version "2.7.1" resolved "https://registry.yarnpkg.com/socks/-/socks-2.7.1.tgz#d8e651247178fde79c0663043e07240196857d55" @@ -4521,6 +4458,14 @@ socks@^2.7.1: ip "^2.0.0" smart-buffer "^4.2.0" +socks@^2.8.1: + version "2.8.3" + resolved "https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5" + integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw== + dependencies: + ip-address "^9.0.5" + smart-buffer "^4.2.0" + source-map-support@0.5.13: version "0.5.13" resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932" @@ -4539,6 +4484,11 @@ split-on-first@^1.0.0: resolved 
"https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f" integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw== +sprintf-js@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a" + integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA== + sprintf-js@~1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c" @@ -4926,6 +4876,11 @@ undici-types@~5.25.1: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3" integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA== +undici@^6.10.1, undici@^6.18.2: + version "6.18.2" + resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97" + integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg== + unique-string@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/unique-string/-/unique-string-3.0.0.tgz#84a1c377aff5fd7a8bc6b55d8244b2bd90d75b9a" @@ -5042,11 +4997,6 @@ web-encoding@^1.1.5: optionalDependencies: "@zxing/text-encoding" "0.9.0" -web-streams-polyfill@^3.0.3: - version "3.3.3" - resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz#2073b91a2fdb1fbfbd401e7de0ac9f8214cecb4b" - integrity sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw== - which-boxed-primitive@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" From 1202b52853c9137fbebb1fe77bb04057c93e492b Mon Sep 17 00:00:00 2001 From: Ilya 
Kreymer Date: Wed, 5 Jun 2024 15:23:55 -0700 Subject: [PATCH 06/12] tweak logging, hostname use --- src/util/browser.ts | 5 +++-- src/util/recorder.ts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/util/browser.ts b/src/util/browser.ts index c6cdeba7..2fad9708 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -115,7 +115,6 @@ export class Browser { ? undefined : (target) => this.targetFilter(target), }; - logger.info("Launching browser", launchOpts); await this._init(launchOpts, ondisconnect, recording); } @@ -237,7 +236,9 @@ export class Browser { ]; proxy = proxy || getProxy(); - logger.info(`Proxy settings: ${proxy}`); + if (proxy) { + logger.info("Using proxy", { proxy }, "browser"); + } if (proxy) { args.push("--ignore-certificate-errors"); diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 411aeb39..11fbf453 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1663,7 +1663,7 @@ function createDispatcher(): Dispatcher | undefined { const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5; const params = { type, - host: url.host, + host: url.hostname, port: parseInt(url.port), userId: url.username || undefined, password: url.password || undefined, From b28a417cffcde6932da5b064c7eecd7874508d19 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Jun 2024 17:39:55 -0700 Subject: [PATCH 07/12] use global dispatcher tests: add proxy tests for socks5 and http! 
--- src/crawler.ts | 3 +++ src/util/blockrules.ts | 2 ++ src/util/browser.ts | 14 +--------- src/util/recorder.ts | 40 ++++----------------------- src/util/reqresp.ts | 1 - src/util/sitemapper.ts | 5 +++- tests/proxy.test.js | 61 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 76 insertions(+), 50 deletions(-) create mode 100644 tests/proxy.test.js diff --git a/src/crawler.ts b/src/crawler.ts index 11404e16..6a6c2698 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -56,6 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; import { isHTMLContentType } from "./util/reqresp.js"; +import { initDispatcher } from "./util/proxy.js"; const behaviors = fs.readFileSync( new URL( @@ -436,6 +437,8 @@ export class Crawler { async bootstrap() { const subprocesses: ChildProcess[] = []; + await initDispatcher(); + subprocesses.push(this.launchRedis()); await fsp.mkdir(this.logDir, { recursive: true }); diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts index a0fa1ebe..3f258b33 100644 --- a/src/util/blockrules.ts +++ b/src/util/blockrules.ts @@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js"; import { HTTPRequest, Page } from "puppeteer-core"; import { Browser } from "./browser.js"; +import { fetch } from "undici"; + const RULE_TYPES = ["block", "allowOnly"]; const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; diff --git a/src/util/browser.ts b/src/util/browser.ts index 2fad9708..e72f9a7d 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -20,6 +20,7 @@ import puppeteer, { } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; +import { getProxy } from "./proxy.js"; type BtrixChromeOpts = { proxy?: string; @@ -602,16 +603,3 @@ export const defaultArgs = [ "--export-tagged-pdf", 
"--apps-keep-chrome-alive-in-tests", ]; - -export function getProxy() { - if (process.env.PROXY_SERVER) { - return process.env.PROXY_SERVER; - } - - // for backwards compatibility with 0.x proxy settings - if (process.env.PROXY_HOST && process.env.PROXY_PORT) { - return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; - } - - return ""; -} diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 11fbf453..09e40d56 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,12 +8,7 @@ import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; -import { fetch, Response, Dispatcher, ProxyAgent } from "undici"; - -import { socksDispatcher } from "fetch-socks"; -import type { SocksProxyType } from "socks/typings/common/constants.js"; - -import { getProxy } from "./browser.js"; +import { fetch, Response } from "undici"; // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; @@ -138,8 +133,6 @@ export class Recorder { frameIdToExecId: Map | null; - dispatcher?: Dispatcher; - constructor({ workerid, writer, @@ -162,8 +155,6 @@ export class Recorder { this.fetcherQ = new PQueue({ concurrency: 1 }); this.frameIdToExecId = null; - - this.dispatcher = createDispatcher(); } async onCreatePage({ @@ -762,6 +753,10 @@ export class Recorder { writePageInfoRecord() { const text = JSON.stringify(this.pageInfo, null, 2); + if (!Object.keys(this.pageInfo.urls).length) { + logger.debug("No entries, skipping pageinfo record"); + } + const url = this.pageUrl; this.writer.writeNewResourceRecord( @@ -1372,7 +1367,6 @@ class AsyncFetcher { body: reqresp.postData || undefined, signal, redirect: this.manualRedirect ? 
"manual" : "follow", - dispatcher: this.recorder.dispatcher, }); if (this.filter && !this.filter(resp) && abort) { @@ -1649,27 +1643,3 @@ function createRequest( requestBody, ); } - -function createDispatcher(): Dispatcher | undefined { - const proxyUrl = getProxy(); - if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { - return new ProxyAgent({ uri: proxyUrl }); - } else if ( - proxyUrl.startsWith("socks://") || - proxyUrl.startsWith("socks5://") || - proxyUrl.startsWith("socks4://") - ) { - const url = new URL(proxyUrl); - const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5; - const params = { - type, - host: url.hostname, - port: parseInt(url.port), - userId: url.username || undefined, - password: url.password || undefined, - }; - return socksDispatcher(params); - } else { - return undefined; - } -} diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 1f9fbc0f..2052c143 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -4,7 +4,6 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; - import { Response } from "undici"; const CONTENT_LENGTH = "content-length"; diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index 5d8507b5..e34a9bf1 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js"; import { DETECT_SITEMAP } from "./constants.js"; import { sleep } from "./timing.js"; +import { fetch, Response } from "undici"; + const SITEMAP_CONCURRENCY = 5; const TEXT_CONTENT_TYPE = ["text/plain"]; @@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter { resp.headers.get("content-encoding") !== "gzip" ) { const ds = new DecompressionStream("gzip"); - stream = body.pipeThrough(ds); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + stream = body.pipeThrough(ds as any); } 
else { stream = body; } diff --git a/tests/proxy.test.js b/tests/proxy.test.js new file mode 100644 index 00000000..20c08934 --- /dev/null +++ b/tests/proxy.test.js @@ -0,0 +1,61 @@ +import child_process from "child_process"; + +let port = 33080; + +const PROXY_IMAGE = "ghcr.io/tarampampam/3proxy:1.9.1" + +function runSocksProxy(scheme, user="", pass="") { + const isSocks = scheme === "socks5"; + const id = child_process.execSync(`docker run -d --rm -e PROXY_USER=${user} -e PROXY_PASSWORD=${pass} -p ${port++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"}); + return {id, port}; +} + +describe("socks5 + https proxy tests", () => { + for (const mode of ["socks5", "http"]) { + const scheme = mode; + + test(`${scheme} proxy, no auth`, async () => { + const {id, port} = runSocksProxy(mode); + const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"}); + + child_process.execSync(`docker kill -s SIGINT ${id}`); + + expect(!!result).toBe(true); + }); + + test(`${scheme} proxy, with auth`, async () => { + const {id, port} = runSocksProxy(mode, "user", "passw0rd"); + const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"}); + + child_process.execSync(`docker kill -s SIGINT ${id}`); + + expect(!!result).toBe(true); + }); + + test(`${scheme} proxy, error, not running`, async () => { + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); 
+ + test(`${scheme} proxy, error, wrong auth`, async () => { + const {id, port} = runSocksProxy(mode, "user", "passw1rd"); + + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed --timeout 10`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + + child_process.execSync(`docker kill -s SIGINT ${id}`); + }); + } +}); From b6942786b1de7e8be728394dc0196fb9c23c3109 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Jun 2024 18:43:41 -0700 Subject: [PATCH 08/12] add missing file --- src/util/proxy.ts | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/util/proxy.ts diff --git a/src/util/proxy.ts b/src/util/proxy.ts new file mode 100644 index 00000000..7bb28286 --- /dev/null +++ b/src/util/proxy.ts @@ -0,0 +1,48 @@ +import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; + +import { socksDispatcher } from "fetch-socks"; +import type { SocksProxyType } from "socks/typings/common/constants.js"; + +export function getProxy() { + if (process.env.PROXY_SERVER) { + return process.env.PROXY_SERVER; + } + + // for backwards compatibility with 0.x proxy settings + if (process.env.PROXY_HOST && process.env.PROXY_PORT) { + return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; + } + + return ""; +} + +export function initDispatcher() { + const dispatcher = createDispatcher(); + if (dispatcher) { + setGlobalDispatcher(dispatcher); + } +} + +export function createDispatcher(): Dispatcher | undefined { + const proxyUrl = getProxy(); + if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { + return new ProxyAgent({ uri: proxyUrl }); + } else if ( + proxyUrl.startsWith("socks://") || + proxyUrl.startsWith("socks5://") || + proxyUrl.startsWith("socks4://") + ) { + 
const url = new URL(proxyUrl); + const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5; + const params = { + type, + host: url.hostname, + port: parseInt(url.port), + userId: url.username || undefined, + password: url.password || undefined, + }; + return socksDispatcher(params); + } else { + return undefined; + } +} From 61628049cb1d4dc3482b6bcf8efa7bac8a3ff554 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Jun 2024 22:02:55 -0700 Subject: [PATCH 09/12] proxy auth: - support for SOCKS5 (as supported in Brave though not Chromium) but not HTTP (not supported in any browser w/o interactive prompt) - tests: update tests to check socks5/http and html/pdf in a loop --- src/util/proxy.ts | 8 ++- tests/pdf-crawl.test.js | 2 +- tests/proxy.test.js | 121 ++++++++++++++++++++++++++++------------ 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/src/util/proxy.ts b/src/util/proxy.ts index 7bb28286..57c3fa18 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -26,12 +26,18 @@ export function initDispatcher() { export function createDispatcher(): Dispatcher | undefined { const proxyUrl = getProxy(); if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { - return new ProxyAgent({ uri: proxyUrl }); + // HTTP PROXY does not support auth, as it's not supported in the browser + // so must drop username/password for consistency + const url = new URL(proxyUrl); + url.username = ""; + url.password = ""; + return new ProxyAgent({ uri: url.href }); } else if ( proxyUrl.startsWith("socks://") || proxyUrl.startsWith("socks5://") || proxyUrl.startsWith("socks4://") ) { + // support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium) const url = new URL(proxyUrl); const type: SocksProxyType = url.protocol === "socks4:" ? 
4 : 5; const params = { diff --git a/tests/pdf-crawl.test.js b/tests/pdf-crawl.test.js index 00c314d7..3bc6c077 100644 --- a/tests/pdf-crawl.test.js +++ b/tests/pdf-crawl.test.js @@ -3,7 +3,7 @@ import fs from "fs"; import path from "path"; import { WARCParser } from "warcio"; -const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"; +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; test("ensure pdf is crawled", async () => { child_process.execSync( diff --git a/tests/proxy.test.js b/tests/proxy.test.js index 20c08934..b2bdad96 100644 --- a/tests/proxy.test.js +++ b/tests/proxy.test.js @@ -1,61 +1,108 @@ import child_process from "child_process"; -let port = 33080; +let globalPort = 33080; -const PROXY_IMAGE = "ghcr.io/tarampampam/3proxy:1.9.1" +const PROXY_IMAGE = "tarampampam/3proxy:1.9.1"; + +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; +const HTML = "https://webrecorder.net/"; + +const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug"; + +function killContainer(id) { + child_process.execSync(`docker kill -s SIGINT ${id}`); +} function runSocksProxy(scheme, user="", pass="") { const isSocks = scheme === "socks5"; - const id = child_process.execSync(`docker run -d --rm -e PROXY_USER=${user} -e PROXY_PASSWORD=${pass} -p ${port++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"}); + const port = globalPort; + const id = child_process.execSync(`docker run -e PROXY_LOGIN=${user} -e PROXY_PASSWORD=${pass} -d --rm -p ${globalPort++}:${isSocks ? 
"1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"}); return {id, port}; } describe("socks5 + https proxy tests", () => { - for (const mode of ["socks5", "http"]) { - const scheme = mode; - - test(`${scheme} proxy, no auth`, async () => { - const {id, port} = runSocksProxy(mode); - const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"}); - - child_process.execSync(`docker kill -s SIGINT ${id}`); - - expect(!!result).toBe(true); - }); - - test(`${scheme} proxy, with auth`, async () => { - const {id, port} = runSocksProxy(mode, "user", "passw0rd"); - const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"}); - - child_process.execSync(`docker kill -s SIGINT ${id}`); - - expect(!!result).toBe(true); - }); - - test(`${scheme} proxy, error, not running`, async () => { + for (const scheme of ["socks5", "http"]) { + for (const type of ["HTML page", "PDF"]) { + + const url = type === "PDF" ? 
PDF : HTML; + + test(`${scheme} proxy, ${type}, no auth`, () => { + const {id, port} = runSocksProxy(scheme); + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } finally { + killContainer(id); + } + expect(status).toBe(0); + }); + + test(`${scheme} proxy, ${type}, with auth`, () => { + const {id, port} = runSocksProxy(scheme, "user", "passw0rd"); + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } finally { + killContainer(id); + } + // auth supported only for SOCKS5 + expect(status).toBe(scheme === "socks5" ? 0 : 1); + }); + + test(`${scheme} proxy, ${type}, wrong auth`, () => { + const {id, port} = runSocksProxy(scheme, "user", "passw1rd"); + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } finally { + killContainer(id); + } + expect(status).toBe(1); + }); + } + + test(`${scheme} proxy, proxy missing error`, () => { let status = 0; try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed`, {encoding: "utf-8"}); + child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; } expect(status).toBe(1); }); + } +}); - 
test(`${scheme} proxy, error, wrong auth`, async () => { - const {id, port} = runSocksProxy(mode, "user", "passw1rd"); - let status = 0; +test("http proxy, PDF, separate env vars", () => { + const {id, port} = runSocksProxy("http"); - try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed --timeout 10`, {encoding: "utf-8"}); - } catch (e) { - status = e.status; - } - expect(status).toBe(1); + try { + child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${port} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } finally { + killContainer(id); + } +}); - child_process.execSync(`docker kill -s SIGINT ${id}`); - }); +test("http proxy, error, not running, separate env vars", () => { + let status = 0; + + try { + child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; } + expect(status).toBe(1); }); + + From 4ee0ce620f3a3c7b8589bb65822f3a9aebfe70cc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Jun 2024 22:26:18 -0700 Subject: [PATCH 10/12] ensure --proxyServer cli flag also supported by both direct fetch and browser, simplify proxy init --- src/crawler.ts | 8 +++++--- src/util/browser.ts | 2 -- src/util/proxy.ts | 20 +++++++++++++------- tests/proxy.test.js | 13 ++++++++++++- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 6a6c2698..abddbbbc 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -56,7 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; import { 
isHTMLContentType } from "./util/reqresp.js"; -import { initDispatcher } from "./util/proxy.js"; +import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( new URL( @@ -171,6 +171,8 @@ export class Crawler { maxHeapUsed = 0; maxHeapTotal = 0; + proxyServer?: string; + driver!: (opts: { page: Page; data: PageState; @@ -437,7 +439,7 @@ export class Crawler { async bootstrap() { const subprocesses: ChildProcess[] = []; - await initDispatcher(); + this.proxyServer = initProxy(this.params.proxyServer); subprocesses.push(this.launchRedis()); @@ -1292,7 +1294,7 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: this.params.proxyServer, + proxy: this.proxyServer, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/util/browser.ts b/src/util/browser.ts index e72f9a7d..73a74222 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -20,7 +20,6 @@ import puppeteer, { } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; -import { getProxy } from "./proxy.js"; type BtrixChromeOpts = { proxy?: string; @@ -236,7 +235,6 @@ export class Browser { ...extraArgs, ]; - proxy = proxy || getProxy(); if (proxy) { logger.info("Using proxy", { proxy }, "browser"); } diff --git a/src/util/proxy.ts b/src/util/proxy.ts index 57c3fa18..c7fe2a85 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -3,7 +3,7 @@ import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; import { socksDispatcher } from "fetch-socks"; import type { SocksProxyType } from "socks/typings/common/constants.js"; -export function getProxy() { +export function getEnvProxyUrl() { if (process.env.PROXY_SERVER) { return process.env.PROXY_SERVER; } @@ -16,15 +16,21 @@ export function getProxy() { return ""; } -export function initDispatcher() { - const dispatcher 
= createDispatcher(); - if (dispatcher) { - setGlobalDispatcher(dispatcher); +export function initProxy(proxy?: string): string { + if (!proxy) { + proxy = getEnvProxyUrl(); } + if (proxy) { + const dispatcher = createDispatcher(proxy); + if (dispatcher) { + setGlobalDispatcher(dispatcher); + return proxy; + } + } + return ""; } -export function createDispatcher(): Dispatcher | undefined { - const proxyUrl = getProxy(); +export function createDispatcher(proxyUrl: string): Dispatcher | undefined { if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { // HTTP PROXY does not support auth, as it's not supported in the browser // so must drop username/password for consistency diff --git a/tests/proxy.test.js b/tests/proxy.test.js index b2bdad96..49326bf8 100644 --- a/tests/proxy.test.js +++ b/tests/proxy.test.js @@ -94,7 +94,7 @@ test("http proxy, PDF, separate env vars", () => { } }); -test("http proxy, error, not running, separate env vars", () => { +test("http proxy set, but not running, separate env vars", () => { let status = 0; try { @@ -105,4 +105,15 @@ test("http proxy, error, not running, separate env vars", () => { expect(status).toBe(1); }); +test("http proxy set, but not running, cli arg", () => { + let status = 0; + + try { + child_process.execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); + From 9191b6cf197b58045487d5f61ed9b2f6fb086689 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Jun 2024 09:27:38 -0700 Subject: [PATCH 11/12] tests fix: use docker network, persistent proxies accross tests --- tests/proxy.test.js | 74 +++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/tests/proxy.test.js b/tests/proxy.test.js index 49326bf8..1c162620 100644 --- a/tests/proxy.test.js +++ 
b/tests/proxy.test.js @@ -1,70 +1,84 @@ -import child_process from "child_process"; +import { execSync, exec } from "child_process"; -let globalPort = 33080; +const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); const PROXY_IMAGE = "tarampampam/3proxy:1.9.1"; +const SOCKS_PORT = "1080"; +const HTTP_PORT = "3128"; +const WRONG_PORT = "33130"; const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; const HTML = "https://webrecorder.net/"; const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug"; -function killContainer(id) { - child_process.execSync(`docker kill -s SIGINT ${id}`); -} +let proxyAuthId; +let proxyNoAuthId; -function runSocksProxy(scheme, user="", pass="") { - const isSocks = scheme === "socks5"; - const port = globalPort; - const id = child_process.execSync(`docker run -e PROXY_LOGIN=${user} -e PROXY_PASSWORD=${pass} -d --rm -p ${globalPort++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"}); - return {id, port}; -} +beforeAll(() => { + execSync("docker network create proxy-test-net"); + + proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); + + proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); +}); + +afterAll(async () => { + execSync(`docker kill -s SIGINT ${proxyAuthId}`); + execSync(`docker kill -s SIGINT ${proxyNoAuthId}`); + await sleep(3000); + execSync("docker network rm proxy-test-net"); +}); describe("socks5 + https proxy tests", () => { for (const scheme of ["socks5", "http"]) { + const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT; + for (const type of ["HTML page", "PDF"]) { const url = type === "PDF" ? 
PDF : HTML; test(`${scheme} proxy, ${type}, no auth`, () => { - const {id, port} = runSocksProxy(scheme); let status = 0; try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; - } finally { - killContainer(id); } expect(status).toBe(0); }); test(`${scheme} proxy, ${type}, with auth`, () => { - const {id, port} = runSocksProxy(scheme, "user", "passw0rd"); let status = 0; try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; - } finally { - killContainer(id); } // auth supported only for SOCKS5 expect(status).toBe(scheme === "socks5" ? 
0 : 1); }); test(`${scheme} proxy, ${type}, wrong auth`, () => { - const {id, port} = runSocksProxy(scheme, "user", "passw1rd"); let status = 0; try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + + test(`${scheme} proxy, ${type}, wrong protocol`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; - } finally { - killContainer(id); } expect(status).toBe(1); }); @@ -74,7 +88,7 @@ describe("socks5 + https proxy tests", () => { let status = 0; try { - child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; } @@ -85,20 +99,14 @@ describe("socks5 + https proxy tests", () => { test("http proxy, PDF, separate env vars", () => { - const {id, port} = runSocksProxy("http"); - - try { - child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${port} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); - } finally { - killContainer(id); - } + execSync(`docker 
run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); }); test("http proxy set, but not running, separate env vars", () => { let status = 0; try { - child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; } @@ -109,7 +117,7 @@ test("http proxy set, but not running, cli arg", () => { let status = 0; try { - child_process.execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); } catch (e) { status = e.status; } From 3764df33b63c2ecf90c90b4cdc7ebcfaa3baad08 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Jun 2024 09:29:17 -0700 Subject: [PATCH 12/12] remove logging test --- src/util/recorder.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 09e40d56..1a0fd7b1 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -753,10 +753,6 @@ export class Recorder { writePageInfoRecord() { const text = JSON.stringify(this.pageInfo, null, 2); - if (!Object.keys(this.pageInfo.urls).length) { - logger.debug("No entries, skipping pageinfo record"); - } - const url = this.pageUrl; this.writer.writeNewResourceRecord(