Skip to content

Commit 667d758

Browse files
committed
Feat: Added Rotating Proxy Feature
1 parent 704e5fa commit 667d758

File tree

9 files changed

+1391
-99
lines changed

9 files changed

+1391
-99
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
# [v7.0.0](https://github.com/coder-hxl/x-crawl/compare/v6.0.1...v7.0.0) (2023-04-23)
1+
# [v7.0.0](https://github.com/coder-hxl/x-crawl/compare/v6.0.1...v7.0.0) (2023-04-24)
22

33
### 🚨 重大改变
44

55
- 进阶写法的配置指纹改用数组写法,里面存放 DetailTargetFingerprintCommon 类型的对象,方便定制。内部会将里面的对象随机分配给目标。
66
- crawlPage 的进阶写法和详细目标写法的指纹配置的最大宽高改为可选项。
7+
- 创建爬虫实例、进阶写法以及详细目标写法的 proxy 更改为对象写法, 拥有 urls、switchByHttpStatus 以及 switchByErrorCount 这三个属性,urls 可以设置多个代理 URL ,内部默认先采用第一个,switchByHttpStatus 设置遇到哪些不符合的响应状态码需要切换代理,switchByErrorCount 设置像超时等错误时到达多少次需要切换代理。该代理轮换功能需要配合错误重试才能使用。
78

89
### 🚀 特征
910

docs/cn.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ x-crawl 是一个灵活的 Node.js 多功能爬虫库。用法灵活,并且内
1414
- **👀 设备指纹** - 零配置或自定义配置,避免指纹识别从不同位置识别并跟踪我们。
1515
- **⏱️ 间隔爬取** - 无间隔、固定间隔以及随机间隔,产生或避免高并发爬取。
1616
- **🔄 失败重试** - 避免因短暂的问题而造成爬取失败,无限制重试次数。
17+
- **➡️ 轮换代理** - 配合失败重试,根据自定义错误次数以及 HTTP 状态码自动轮换代理。
1718
- **🚀 优先队列** - 根据单个爬取目标的优先级可以优先于其他目标提前爬取。
1819
- **☁️ 爬取 SPA** - 爬取 SPA(单页应用程序)生成预渲染内容(即“SSR”(服务器端渲染))。
1920
- **⚒️ 控制页面** - 可以表单提交、键盘输入、事件操作、生成页面的屏幕截图等。

src/api.ts

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -84,20 +84,30 @@ interface PageSingleCrawlResult {
8484

8585
// Create config
8686
// Loader
87+
export type ProxyDetails = { url: string; state: boolean }[]
88+
89+
type LoaderCommonConfig = {
90+
proxyUrl?: string
91+
proxyDetails: ProxyDetails
92+
}
93+
8794
type LoaderHasConfig = {
8895
timeout: number
8996
maxRetry: number
9097
priority: number
9198
}
9299

93-
export type LoaderCrawlPageDetail = CrawlPageDetailTargetConfig &
94-
LoaderHasConfig
100+
export type LoaderCrawlPageDetail = LoaderCommonConfig &
101+
LoaderHasConfig &
102+
CrawlPageDetailTargetConfig
95103

96-
export type LoaderCrawlDataDetail = CrawlDataDetailTargetConfig &
97-
LoaderHasConfig
104+
export type LoaderCrawlDataDetail = LoaderCommonConfig &
105+
LoaderHasConfig &
106+
CrawlDataDetailTargetConfig
98107

99-
export type LoaderCrawlFileDetail = CrawlFileDetailTargetConfig &
100-
LoaderHasConfig
108+
export type LoaderCrawlFileDetail = LoaderCommonConfig &
109+
LoaderHasConfig &
110+
CrawlFileDetailTargetConfig
101111

102112
// AdvancedDetailTargets
103113
interface CrawlPageAdvancedDetailTargetsConfig extends CrawlPageAdvancedConfig {
@@ -382,7 +392,16 @@ function loaderCommonConfigToCrawlConfig(
382392
}
383393
}
384394

385-
// 1.3.porxy
395+
// 1.3.maxRetry
396+
if (isUndefined(maxRetry)) {
397+
if (!isUndefined(advancedDetailTargetsConfig.maxRetry)) {
398+
detail.maxRetry = advancedDetailTargetsConfig.maxRetry
399+
} else {
400+
detail.maxRetry = xCrawlConfig.maxRetry
401+
}
402+
}
403+
404+
// 1.4.proxy
386405
if (isUndefined(proxy)) {
387406
if (!isUndefined(advancedDetailTargetsConfig.proxy)) {
388407
detail.proxy = advancedDetailTargetsConfig.proxy
@@ -391,26 +410,24 @@ function loaderCommonConfigToCrawlConfig(
391410
}
392411
}
393412

394-
// 1.4.maxRetry
395-
if (isUndefined(maxRetry)) {
396-
if (!isUndefined(advancedDetailTargetsConfig.maxRetry)) {
397-
detail.maxRetry = advancedDetailTargetsConfig.maxRetry
398-
} else {
399-
detail.maxRetry = xCrawlConfig.maxRetry
400-
}
413+
// 1.5.proxyUrl & proxyDetail
414+
if (!isUndefined(detail.proxy?.urls)) {
415+
const urls = detail.proxy!.urls
416+
detail.proxyUrl = urls[0]
417+
detail.proxyDetails = urls.map((url) => ({ url, state: true }))
401418
}
402419

403-
// 1.5.priority
420+
// 1.6.priority
404421
if (isUndefined(priority)) {
405422
detail.priority = 0
406423
}
407424

408-
// 1.6.header
425+
// 1.7.header
409426
if (isUndefined(headers) && advancedDetailTargetsConfig.headers) {
410427
detail.headers = { ...advancedDetailTargetsConfig.headers }
411428
}
412429

413-
// 1.7.fingerprint(公共部分)
430+
// 1.8.fingerprint(公共部分)
414431
if (fingerprint) {
415432
// detaileTarget
416433

@@ -675,9 +692,9 @@ async function pageSingleCrawlHandle(
675692

676693
let response: HTTPResponse | null = null
677694
try {
678-
if (detailTarget.proxy) {
695+
if (detailTarget.proxyUrl) {
679696
await browser.createIncognitoBrowserContext({
680-
proxyServer: detailTarget.proxy
697+
proxyServer: detailTarget.proxyUrl
681698
})
682699
} else {
683700
await browser.createIncognitoBrowserContext({
@@ -973,8 +990,6 @@ export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) {
973990
}
974991

975992
const crawlDataSingleRes: AnyObject = detaileInfo
976-
delete crawlDataSingleRes.detailTarget
977-
delete crawlDataSingleRes.detailTargetRes
978993

979994
if (onCrawlItemComplete) {
980995
onCrawlItemComplete(crawlDataSingleRes as CrawlDataSingleRes<T>)

src/batchCrawlHandle.ts

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { isNumber, isUndefined, log, logNumber, random, sleep } from './utils'
22

33
import type { ExtraCommonConfig } from './api'
4-
import type { DetailInfo, CrawlDetail } from './controller'
4+
import { DetailInfo, CrawlDetail, getCrawlStatus } from './controller'
55

66
async function useSleepByBatch(
77
isHaventervalTime: boolean,
@@ -45,8 +45,8 @@ export async function asyncBatchCrawl<
4545
const isNumberIntervalTime = isNumber(intervalTime)
4646

4747
const crawlPendingQueue: Promise<any>[] = []
48-
for (const detaileInfo of detailInfos) {
49-
const { id } = detaileInfo
48+
for (const detailInfo of detailInfos) {
49+
const { id } = detailInfo
5050

5151
await useSleepByBatch(
5252
isHaventervalTime,
@@ -55,24 +55,34 @@ export async function asyncBatchCrawl<
5555
id
5656
)
5757

58-
const crawlSinglePending = singleCrawlHandle(detaileInfo, extraConfig)
58+
const crawlSinglePending = singleCrawlHandle(detailInfo, extraConfig)
5959
.catch((error) => {
60-
detaileInfo.crawlErrorQueue.push(error)
60+
detailInfo.crawlErrorQueue.push(error)
6161
return false
6262
})
6363
.then((detailTargetRes) => {
64+
const notAllowRetry = detailInfo.retryCount === detailInfo.maxRetry
65+
6466
if (typeof detailTargetRes === 'boolean') {
65-
if (detaileInfo.retryCount === detaileInfo.maxRetry) {
66-
singleResultHandle(detaileInfo, extraConfig)
67+
if (notAllowRetry) {
68+
singleResultHandle(detailInfo, extraConfig)
6769
}
6870

6971
return
7072
}
7173

72-
detaileInfo.isSuccess = true
73-
detaileInfo.detailTargetRes = detailTargetRes
74+
detailInfo.isSuccess = true
75+
detailInfo.detailTargetRes = detailTargetRes
7476

75-
singleResultHandle(detaileInfo, extraConfig)
77+
// 根据 状态码/是否无法重试 决定处理结果
78+
const { detailTarget } = detailInfo
79+
80+
const status = getCrawlStatus(detailTargetRes)
81+
const switchByHttpStatus = detailTarget.proxy?.switchByHttpStatus ?? []
82+
if ((status && !switchByHttpStatus.includes(status)) || notAllowRetry) {
83+
singleResultHandle(detailInfo, extraConfig)
84+
delete detailInfo._notHandle
85+
}
7686
})
7787

7888
crawlPendingQueue.push(crawlSinglePending)
@@ -120,8 +130,20 @@ export async function syncBatchCrawl<
120130
detailInfo.crawlErrorQueue.push(error)
121131
}
122132

123-
if (detailInfo.isSuccess || detailInfo.retryCount === detailInfo.maxRetry) {
133+
// 根据 是否成功和状态码/是否无法重试 决定处理结果
134+
const { detailTarget, detailTargetRes } = detailInfo
135+
136+
const status = getCrawlStatus(detailTargetRes)
137+
const switchByHttpStatus = detailTarget.proxy?.switchByHttpStatus ?? []
138+
const notAllowRetry = detailInfo.retryCount === detailInfo.maxRetry
139+
if (
140+
(detailInfo.isSuccess &&
141+
status &&
142+
!switchByHttpStatus.includes(status)) ||
143+
notAllowRetry
144+
) {
124145
singleResultHandle(detailInfo, extraConfig)
146+
delete detailInfo._notHandle
125147
}
126148
}
127149
}

src/controller.ts

Lines changed: 97 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,36 @@ import {
55
ExtraCommonConfig,
66
LoaderCrawlDataDetail,
77
LoaderCrawlFileDetail,
8-
LoaderCrawlPageDetail
8+
LoaderCrawlPageDetail,
9+
ProxyDetails
910
} from './api'
1011

11-
import { log, logError, logNumber, logSuccess, logWarn } from './utils'
12+
import {
13+
isObject,
14+
isUndefined,
15+
log,
16+
logError,
17+
logNumber,
18+
logSuccess,
19+
logWarn
20+
} from './utils'
21+
import { HTTPResponse } from 'puppeteer'
22+
import { Request } from './request'
1223

1324
export type CrawlDetail =
1425
| LoaderCrawlPageDetail
1526
| LoaderCrawlDataDetail
1627
| LoaderCrawlFileDetail
1728

1829
export interface DetailInfo<T extends CrawlDetail, R> {
30+
_notHandle: any
31+
1932
id: number
2033
isSuccess: boolean
2134
maxRetry: number
2235
retryCount: number
2336
crawlErrorQueue: Error[]
37+
proxyDetailes: ProxyDetails
2438
data: any | null
2539

2640
detailTarget: T
@@ -32,6 +46,25 @@ type TargetSingleRes = Omit<
3246
'detailTarget' | 'detailTargetRes'
3347
>
3448

49+
export function getCrawlStatus(detailTargetRes: any) {
50+
let status: number | null = null
51+
52+
if (
53+
isObject(detailTargetRes) &&
54+
Object.hasOwn(detailTargetRes, 'response') &&
55+
(detailTargetRes as any).response
56+
) {
57+
// crawlPage
58+
const response: HTTPResponse = (detailTargetRes as any).response
59+
status = response.status()
60+
} else if (isObject(detailTargetRes)) {
61+
// crawlData / crawlFile
62+
status = (detailTargetRes as any as Request).statusCode ?? null
63+
}
64+
65+
return status
66+
}
67+
3568
export async function controller<
3669
T extends CrawlDetail,
3770
E extends ExtraCommonConfig,
@@ -63,11 +96,14 @@ export async function controller<
6396
// 通过映射生成新的配置数组
6497
const detailInfos: DetailInfo<T, R>[] = detailTargetConfigs.map(
6598
(detailTarget, index) => ({
99+
_notHandle: true,
100+
66101
id: index + 1,
67102
isSuccess: false,
68103
maxRetry: detailTarget.maxRetry,
69104
retryCount: 0,
70105
crawlErrorQueue: [],
106+
proxyDetailes: detailTarget.proxyDetails,
71107
data: null,
72108

73109
detailTarget,
@@ -94,12 +130,65 @@ export async function controller<
94130
singleResultHandle
95131
)
96132

97-
crawlQueue = crawlQueue.filter(
98-
(config) =>
99-
config.maxRetry &&
100-
!config.isSuccess &&
101-
config.retryCount < config.maxRetry
102-
)
133+
crawlQueue = crawlQueue.filter((detailInfo) => {
134+
const {
135+
isSuccess,
136+
maxRetry,
137+
retryCount,
138+
proxyDetailes,
139+
crawlErrorQueue,
140+
detailTarget,
141+
detailTargetRes
142+
} = detailInfo
143+
144+
let isRetry = false
145+
const haveRetryChance = maxRetry && retryCount < maxRetry
146+
147+
// 没有被处理/没成功/状态码不符合
148+
if (Object.hasOwn(detailInfo, '_notHandle') && haveRetryChance) {
149+
// 1.不成功
150+
if (!isSuccess) {
151+
isRetry = true
152+
}
153+
154+
// 2.代理多, 轮换代理
155+
if (proxyDetailes.length >= 2) {
156+
// 获取状态码
157+
const status = getCrawlStatus(detailTargetRes)
158+
159+
// 错误次数 / 检测状态码
160+
const switchByErrorCount = detailTarget.proxy?.switchByErrorCount ?? 0
161+
const switchByHttpStatus =
162+
detailTarget.proxy?.switchByHttpStatus ?? []
163+
if (
164+
(status && switchByHttpStatus.includes(status)) ||
165+
switchByErrorCount >= crawlErrorQueue.length
166+
) {
167+
isRetry = true
168+
proxyDetailes.find(
169+
(detail) => detail.url === detailTarget.proxyUrl
170+
)!.state = false
171+
172+
// 寻找新代理 URL
173+
const newProxyUrl = proxyDetailes.find(
174+
(detaile) => detaile.state
175+
)?.url
176+
177+
// 无则不切换
178+
if (!isUndefined(newProxyUrl)) {
179+
detailTarget.proxyUrl = newProxyUrl
180+
}
181+
}
182+
}
183+
}
184+
185+
// 重置需要重试的 isSuccess
186+
if (isRetry) {
187+
detailInfo.isSuccess = false
188+
}
189+
190+
return isRetry
191+
})
103192

104193
if (crawlQueue.length) {
105194
const retriedIds = crawlQueue.map((item) => {

src/request.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ function handleRequestConfig(
6363
const isHttp = protocol === 'http:'
6464

6565
const config: RequestOptions & MapTypeEmptyObject<URL> = {
66-
agent: rawConfig.proxy
67-
? HttpsProxyAgent(rawConfig.proxy)
66+
agent: rawConfig.proxyUrl
67+
? HttpsProxyAgent(rawConfig.proxyUrl)
6868
: isHttp
6969
? new http.Agent()
7070
: new https.Agent(),

0 commit comments

Comments
 (0)