Skip to content

Commit 61a65c8

Browse files
committed
Feat: Device fingerprint change
1 parent 23edefb commit 61a65c8

File tree

8 files changed

+269
-192
lines changed

8 files changed

+269
-192
lines changed

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# [v7.0.0](https://github.com/coder-hxl/x-crawl/compare/v6.0.1...v7.0.0) (2023-04-23)
2+
3+
### 🚨 重大改变
4+
5+
- 进阶写法的配置指纹改用数组写法,里面存放 DetailTargetFingerprintCommon 类型的对象,方便定制。内部会将里面的对象随机分配给目标。
6+
- CrawlPageDetailTargetConfig 和 CrawlPageAdvancedConfig 类型的指纹配置的最大宽高改为可选项。
7+
8+
### 🚀 特征
9+
10+
- DetailTargetFingerprintCommon 里的 userAgent 选项改为对象写法,并允许定制里面的主版本、次版本以及修订号的最大值和最小值。每个爬取目标都会获取一个新的 userAgent。
11+
- 指纹配置的 mobile 选项添加 'random' 属性值,允许由内部随机决定。
12+
113
# [v6.0.1](https://github.com/coder-hxl/x-crawl/compare/v6.0.0...v6.0.1) (2023-04-21)
214

315
### 🚀 Features
@@ -6,7 +18,7 @@
618

719
---
820

9-
### 特征
21+
### 🚀 特征
1022

1123
- 完善文档。
1224

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ x-crawl is a flexible Node.js multipurpose crawler library. The usage is flexibl
99
## Features
1010

1111
- **🔥 Asynchronous Synchronous** - Just change the mode property to toggle asynchronous or synchronous crawling mode.
12-
- **⚙️Multiple purposes** - It can crawl pages, crawl interfaces, crawl files and poll crawls to meet the needs of various scenarios.
12+
- **⚙️ Multiple purposes** - It can crawl pages, crawl interfaces, crawl files and poll crawls to meet the needs of various scenarios.
1313
- **🖋️ Flexible writing style** - The same crawling API can be adapted to multiple configurations, and each configuration method is very unique.
14-
- **👀Device Fingerprinting** - Zero configuration or custom configuration, avoid fingerprinting to identify and track us from different locations.
14+
- **👀 Device Fingerprinting** - Zero configuration or custom configuration, avoid fingerprinting to identify and track us from different locations.
1515
- **⏱️ Interval Crawling** - No interval, fixed interval and random interval to generate or avoid high concurrent crawling.
1616
- **🔄 Failed Retry** - Avoid crawling failure due to transient problems, unlimited retries.
1717
- **🚀 Priority Queue** - According to the priority of a single crawling target, it can be crawled ahead of other targets.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"private": true,
33
"name": "x-crawl",
4-
"version": "6.0.1",
4+
"version": "7.0.0",
55
"author": "coderHXL",
66
"description": "x-crawl is a flexible Node.js multifunctional crawler library.",
77
"license": "MIT",

src/api.ts

Lines changed: 122 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,11 @@ import {
3131
CrawlFileAdvancedConfig,
3232
CrawlDataAdvancedConfig,
3333
IntervalTime,
34-
DetailTargetFingerprintCommon,
35-
Platform,
36-
Mobile
34+
DetailTargetFingerprintCommon
3735
} from './types/api'
3836
import { LoaderXCrawlConfig } from './types'
3937
import { AnyObject } from './types/common'
40-
import { randomFingerprint } from './default'
38+
import { fingerprints } from './default'
4139

4240
/* Types */
4341

@@ -104,6 +102,9 @@ interface PageSingleCrawlResult {
104102
interface CrawlPageConfigOriginal {
105103
detailTargets: CrawlPageDetailTargetConfig[]
106104
intervalTime: IntervalTime | undefined
105+
106+
selectFingerprintIndexs: number[]
107+
107108
onCrawlItemComplete:
108109
| ((crawlPageSingleRes: CrawlPageSingleRes) => void)
109110
| undefined
@@ -112,6 +113,9 @@ interface CrawlPageConfigOriginal {
112113
interface CrawlDataConfigOriginal {
113114
detailTargets: CrawlDataDetailTargetConfig[]
114115
intervalTime: IntervalTime | undefined
116+
117+
selectFingerprintIndexs: number[]
118+
115119
onCrawlItemComplete:
116120
| ((crawlDataSingleRes: CrawlDataSingleRes<any>) => void)
117121
| undefined
@@ -120,6 +124,9 @@ interface CrawlDataConfigOriginal {
120124
interface CrawlFileConfigOriginal {
121125
detailTargets: CrawlFileDetailTargetConfig[]
122126
intervalTime: IntervalTime | undefined
127+
128+
selectFingerprintIndexs: number[]
129+
123130
onBeforeSaveItemFile:
124131
| ((info: {
125132
id: number
@@ -225,7 +232,7 @@ function loaderCommonFingerprintToDetailTarget(
225232
| CrawlFileDetailTargetConfig,
226233
fingerprint: DetailTargetFingerprintCommon
227234
) {
228-
const { userAgent, ua, platform, platformVersion, mobile, acceptLanguage } =
235+
const { ua, platform, platformVersion, mobile, acceptLanguage, userAgent } =
229236
fingerprint
230237

231238
let headers = detail.headers
@@ -234,16 +241,17 @@ function loaderCommonFingerprintToDetailTarget(
234241
detail.headers = headers = {}
235242
}
236243

237-
// 1.user-agent
238-
if (userAgent) {
239-
headers['user-agent'] = userAgent
240-
}
241-
242-
// 2.sec-ch-ua
244+
// 1.sec-ch-ua
243245
if (ua) {
244246
headers['sec-ch-ua'] = ua
245247
}
246248

249+
// 2.sec-ch-ua-mobile
250+
if (mobile) {
251+
headers['sec-ch-ua-mobile'] =
252+
mobile === 'random' ? (random(2) ? '?1' : '?0') : mobile
253+
}
254+
247255
// 3.sec-ch-platform
248256
if (platform) {
249257
headers['sec-ch-platform'] = platform
@@ -254,38 +262,85 @@ function loaderCommonFingerprintToDetailTarget(
254262
headers['sec-ch-ua-platform-version'] = platformVersion
255263
}
256264

257-
// 5.sec-ch-mobile
258-
if (mobile) {
259-
headers['sec-ch-mobile'] = mobile
260-
}
261-
262-
// 6.accept-language
265+
// 5.accept-language
263266
if (acceptLanguage) {
264267
headers['accept-language'] = acceptLanguage
265268
}
269+
270+
// 6.user-agent
271+
if (userAgent) {
272+
let value = userAgent.value
273+
274+
userAgent.versions?.forEach((version) => {
275+
const {
276+
name,
277+
maxMajorVersion,
278+
minMajorVersion,
279+
maxMinorVersion,
280+
minMinorVersion,
281+
maxPatchVersion,
282+
minPatchVersion
283+
} = version
284+
285+
const nameSplit = value.split(`${name}/`)
286+
const versionSplit: any[] = nameSplit[1].split(' ')[0].split('.')
287+
const originalVersion = versionSplit.join('.')
288+
289+
if (!isUndefined(maxMajorVersion)) {
290+
versionSplit[0] =
291+
maxMajorVersion === minMajorVersion
292+
? maxMajorVersion
293+
: random(maxMajorVersion, minMajorVersion)
294+
}
295+
296+
if (!isUndefined(maxMinorVersion)) {
297+
versionSplit[1] =
298+
maxMinorVersion === minMinorVersion
299+
? maxMinorVersion
300+
: random(maxMinorVersion, minMinorVersion)
301+
}
302+
303+
if (!isUndefined(maxPatchVersion)) {
304+
versionSplit[2] =
305+
maxPatchVersion === minPatchVersion
306+
? maxPatchVersion
307+
: random(maxPatchVersion, minPatchVersion)
308+
}
309+
310+
const searchValue = `${name}/${originalVersion}`
311+
const replaceValue = `${name}/${versionSplit.join('.')}`
312+
value = value.replace(searchValue, replaceValue)
313+
})
314+
315+
headers['user-agent'] = value
316+
}
266317
}
267318

268319
function loaderPageFingerprintToDetailTarget(
269320
detail: CrawlPageDetailTargetConfig,
270321
fingerprint: {
271-
maxWidth: number
322+
maxWidth?: number
272323
minWidth?: number
273-
maxHeight: number
324+
maxHeight?: number
274325
minHidth?: number
275326
}
276327
) {
277328
const { maxWidth, minWidth, maxHeight, minHidth } = fingerprint
278329

330+
const viewport: any = detail.viewport ?? {}
279331
// 1.width / height
280-
const width = maxWidth === minWidth ? maxWidth : random(maxWidth, minWidth)
281-
const height =
282-
maxHeight === minHidth ? maxHeight : random(maxHeight, minHidth)
283-
const viewport = detail.viewport
284-
if (!viewport) {
285-
detail.viewport = { width, height }
286-
} else {
287-
viewport.width = width
288-
viewport.height = height
332+
if (maxWidth) {
333+
viewport.width =
334+
maxWidth === minWidth ? maxWidth : random(maxWidth, minWidth)
335+
}
336+
337+
if (maxHeight) {
338+
viewport.height =
339+
maxHeight === minHidth ? maxHeight : random(maxHeight, minHidth)
340+
}
341+
342+
if (Object.hasOwn(viewport, 'width') && Object.hasOwn(viewport, 'height')) {
343+
detail.viewport = viewport
289344
}
290345
}
291346

@@ -353,73 +408,30 @@ function loaderCommonConfigToCrawlConfig(
353408
// detaileTarget
354409

355410
loaderCommonFingerprintToDetailTarget(detail, fingerprint)
356-
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
411+
} else if (
412+
isUndefined(fingerprint) &&
413+
isArray(advancedConfig.fingerprints) &&
414+
advancedConfig.fingerprints.length
415+
) {
357416
// advancedConfig
358417

359-
const {
360-
userAgents,
361-
uas,
362-
platforms,
363-
platformVersions,
364-
mobiles,
365-
acceptLanguages
366-
} = advancedConfig.fingerprint
367-
368-
// 1.user-agent
369-
const userAgent = userAgents
370-
? userAgents[random(userAgents.length)]
371-
: undefined
372-
373-
// 2.sec-ch-ua
374-
const ua = uas ? uas[random(uas.length)] : undefined
375-
376-
// 3.sec-ch-platform
377-
const platform = platforms
378-
? platforms[random(platforms.length)]
379-
: undefined
380-
381-
// 4.sec-ch-platform-version
382-
const platformVersion = platformVersions
383-
? platformVersions[random(platformVersions.length)]
384-
: undefined
385-
386-
// 5.sec-ch-mobile
387-
const mobile = mobiles ? mobiles[random(mobiles.length)] : undefined
388-
389-
// 6.accept-language
390-
const acceptLanguage = acceptLanguages
391-
? acceptLanguages[random(acceptLanguages.length)]
392-
: undefined
393-
394-
loaderCommonFingerprintToDetailTarget(detail, {
395-
userAgent,
396-
ua,
397-
platform,
398-
platformVersion,
399-
mobile,
400-
acceptLanguage
401-
})
402-
} else if (xCrawlConfig.enableRandomFingerprint) {
403-
// xCrawlConfig
418+
const fingerprints = advancedConfig.fingerprints
419+
const selectFingerprintIndex = random(fingerprints.length)
420+
const fingerprint = fingerprints[selectFingerprintIndex]
404421

405-
const { platforms, mobiles } = randomFingerprint
422+
// 记录每个目标选中的指纹索引
423+
crawlConfig.selectFingerprintIndexs.push(selectFingerprintIndex)
406424

407-
// 1.user-agent
408-
const userAgent = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.${random(
409-
10
410-
)}.${random(10000)}.${random(1000)} Safari/537.36`
411-
412-
// 2.sec-ch-platform
413-
const platform = platforms[random(platforms.length)] as Platform
414-
415-
// 3.sec-ch-mobile
416-
const mobile = mobiles[random(mobiles.length)] as Mobile
425+
loaderCommonFingerprintToDetailTarget(detail, fingerprint)
426+
} else if (
427+
isUndefined(fingerprint) &&
428+
!isArray(advancedConfig.fingerprints) &&
429+
xCrawlConfig.enableRandomFingerprint
430+
) {
431+
// xCrawlConfig
432+
const fingerprint = fingerprints[random(fingerprints.length)]
417433

418-
loaderCommonFingerprintToDetailTarget(detail, {
419-
userAgent,
420-
platform,
421-
mobile
422-
})
434+
loaderCommonFingerprintToDetailTarget(detail, fingerprint)
423435
}
424436
})
425437

@@ -453,6 +465,9 @@ function createCrawlPageConfig(
453465
const crawlPageConfig: CrawlPageConfigOriginal = {
454466
detailTargets: [],
455467
intervalTime: undefined,
468+
469+
selectFingerprintIndexs: [],
470+
456471
onCrawlItemComplete: undefined
457472
}
458473

@@ -482,7 +497,7 @@ function createCrawlPageConfig(
482497
loaderCommonConfigToCrawlConfig(xCrawlConfig, advancedConfig, crawlPageConfig)
483498

484499
// 装载单独配置
485-
crawlPageConfig.detailTargets.forEach((detail) => {
500+
crawlPageConfig.detailTargets.forEach((detail, index) => {
486501
// detail > advanced > xCrawl
487502
const { cookies, viewport, fingerprint } = detail
488503

@@ -499,8 +514,16 @@ function createCrawlPageConfig(
499514
// 3.fingerprint
500515
if (fingerprint) {
501516
loaderPageFingerprintToDetailTarget(detail, fingerprint)
502-
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
503-
loaderPageFingerprintToDetailTarget(detail, advancedConfig.fingerprint)
517+
} else if (
518+
isUndefined(fingerprint) &&
519+
advancedConfig.fingerprints?.length
520+
) {
521+
// 从对应的选中记录中取出指纹索引
522+
const selectFingerprintIndex =
523+
crawlPageConfig.selectFingerprintIndexs[index]
524+
const fingerprint = advancedConfig.fingerprints[selectFingerprintIndex]
525+
526+
loaderPageFingerprintToDetailTarget(detail, fingerprint)
504527
}
505528
})
506529

@@ -514,6 +537,9 @@ function createCrawlDataConfig<T>(
514537
const crawlDataConfig: CrawlDataConfigOriginal = {
515538
detailTargets: [],
516539
intervalTime: undefined,
540+
541+
selectFingerprintIndexs: [],
542+
517543
onCrawlItemComplete: undefined
518544
}
519545

@@ -551,6 +577,9 @@ function createCrawlFileConfig(
551577
const crawlFileConfig: CrawlFileConfigOriginal = {
552578
detailTargets: [],
553579
intervalTime: undefined,
580+
581+
selectFingerprintIndexs: [],
582+
554583
onBeforeSaveItemFile: undefined,
555584
onCrawlItemComplete: undefined
556585
}

0 commit comments

Comments
 (0)