Skip to content

Commit e1ea111

Browse files
committed
Add: Complete priority request function
1 parent 2be3cda commit e1ea111

File tree

5 files changed

+100
-11
lines changed

5 files changed

+100
-11
lines changed

src/api.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import puppeteer, { Browser, HTTPResponse, Page, Protocol } from 'puppeteer'
55

66
import { ControllerConfig, controller } from './controller'
77
import { request } from './request'
8+
import { quickSort } from './sort'
89
import {
910
isArray,
1011
isObject,
@@ -41,7 +42,6 @@ import {
4142
LoaderFileRequestConfig
4243
} from './types/api'
4344
import { LoaderXCrawlBaseConfig } from './types'
44-
import { quickSort } from './sort'
4545

4646
async function crawlRequestSingle(
4747
controllerConfig: ControllerConfig<
@@ -109,7 +109,7 @@ function loaderCommonConfig(
109109
) {
110110
// 1.requestConfigs
111111
loaderConfig.requestConfigs = requestObjecs.map((requestConfig) => {
112-
let { url, timeout, proxy, maxRetry } = requestConfig
112+
let { url, timeout, proxy, maxRetry, priority } = requestConfig
113113

114114
// 1.1.baseUrl
115115
if (!isUndefined(baseConfig.baseUrl)) {
@@ -141,7 +141,12 @@ function loaderCommonConfig(
141141
}
142142
}
143143

144-
return { url, timeout, proxy, maxRetry }
144+
// 1.5.priority
145+
if (isUndefined(priority)) {
146+
priority = 0
147+
}
148+
149+
return { url, timeout, proxy, maxRetry, priority }
145150
})
146151

147152
// 2.intervalTime

src/batchCrawlHandle.ts

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import { isNumber, isUndefined, log, logNumber, random, sleep } from './utils'
22

3-
import type { IntervalTime } from './types/api'
3+
import type {
4+
IntervalTime,
5+
LoaderDataRequestConfig,
6+
LoaderFileRequestConfig,
7+
LoaderPageRequestConfig
8+
} from './types/api'
49
import type { ControllerConfig } from './controller'
510

611
async function useSleepByBatch(
@@ -26,7 +31,14 @@ async function useSleepByBatch(
2631
}
2732
}
2833

29-
export async function asyncBatchCrawl<T, V, C>(
34+
export async function asyncBatchCrawl<
35+
T extends
36+
| LoaderPageRequestConfig
37+
| LoaderDataRequestConfig
38+
| LoaderFileRequestConfig,
39+
V,
40+
C
41+
>(
3042
controllerConfigs: ControllerConfig<T, V>[],
3143
intervalTime: IntervalTime | undefined,
3244
crawlSingleFnExtraConfig: C,
@@ -73,7 +85,14 @@ export async function asyncBatchCrawl<T, V, C>(
7385
await Promise.all(crawlQueue)
7486
}
7587

76-
export async function syncBatchCrawl<T, V, C>(
88+
export async function syncBatchCrawl<
89+
T extends
90+
| LoaderPageRequestConfig
91+
| LoaderDataRequestConfig
92+
| LoaderFileRequestConfig,
93+
V,
94+
C
95+
>(
7796
controllerConfigs: ControllerConfig<T, V>[],
7897
intervalTime: IntervalTime | undefined,
7998
crawlSingleFnExtraConfig: C,

src/controller.ts

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
11
import { asyncBatchCrawl, syncBatchCrawl } from './batchCrawlHandle'
2-
import { IntervalTime } from './types/api'
2+
import { priorityQueueMergeSort } from './sort'
3+
import {
4+
IntervalTime,
5+
LoaderDataRequestConfig,
6+
LoaderFileRequestConfig,
7+
LoaderPageRequestConfig
8+
} from './types/api'
39
import { log, logError, logNumber, logSuccess, logWarn } from './utils'
410

5-
export interface ControllerConfig<T, V> {
11+
export interface ControllerConfig<
12+
T extends
13+
| LoaderPageRequestConfig
14+
| LoaderDataRequestConfig
15+
| LoaderFileRequestConfig,
16+
V
17+
> {
618
id: number
719
isSuccess: boolean
820
crawlCount: number
@@ -12,7 +24,14 @@ export interface ControllerConfig<T, V> {
1224
crawlSingleRes: V | null
1325
}
1426

15-
export async function controller<T extends { maxRetry: number }, V, C>(
27+
export async function controller<
28+
T extends
29+
| LoaderPageRequestConfig
30+
| LoaderDataRequestConfig
31+
| LoaderFileRequestConfig,
32+
V,
33+
C
34+
>(
1635
name: 'page' | 'data' | 'file',
1736
mode: 'async' | 'sync',
1837
requestConfigs: T[],
@@ -23,8 +42,21 @@ export async function controller<T extends { maxRetry: number }, V, C>(
2342
crawlSingleFnExtraConfig: C
2443
) => Promise<V>
2544
): Promise<ControllerConfig<T, V>[]> {
45+
// 是否使用优先爬取
46+
const isPriorityCrawl = !requestConfigs.every(
47+
(item) => item.priority === requestConfigs[0].priority
48+
)
49+
const targetRequestConfigs = isPriorityCrawl
50+
? priorityQueueMergeSort(
51+
requestConfigs.map((item) => ({
52+
...item,
53+
valueOf: () => item.priority
54+
}))
55+
)
56+
: requestConfigs
57+
2658
// 通过映射生成新的配置数组
27-
const controllerConfigs: ControllerConfig<T, V>[] = requestConfigs.map(
59+
const controllerConfigs: ControllerConfig<T, V>[] = targetRequestConfigs.map(
2860
(requestConfig, index) => ({
2961
id: index + 1,
3062
isSuccess: false,

src/sort.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,33 @@ export function quickSort<T extends any[]>(arr: T): T {
4545
return arr
4646
}
4747

48-
// console.log(quickSort([7, 3, 6, 4, 9, 2, 1, 5]))
48+
/**
 * Sorts a list in descending order with a stable merge sort.
 *
 * Elements are compared with the relational `>=` operator, so plain numbers
 * work directly and objects are ordered by whatever their `valueOf()` returns.
 * On ties the element that appeared first in the input stays first.
 *
 * The input array is never mutated. Arrays with zero or one element are
 * returned as-is (same reference); longer inputs yield a new array.
 *
 * @param arr - The elements to order, highest value first.
 * @returns The elements of `arr` ordered from highest to lowest.
 */
export function priorityQueueMergeSort<T extends any[]>(arr: T): T {
  // `<= 1` (not `=== 1`): an empty array would otherwise split into two empty
  // halves forever and overflow the call stack.
  if (arr.length <= 1) return arr

  const mid = Math.floor(arr.length / 2)
  const left = priorityQueueMergeSort(arr.slice(0, mid))
  const right = priorityQueueMergeSort(arr.slice(mid))

  const merged: any[] = []
  let i = 0
  let j = 0

  // Take from the left half on ties so equal-priority items keep input order.
  while (i < left.length && j < right.length) {
    if (left[i] >= right[j]) {
      merged.push(left[i])
      i++
    } else {
      merged.push(right[j])
      j++
    }
  }

  // Drain whichever half still has elements. Plain loops avoid both the
  // mutating `splice` and the argument-count limit of `push(...spread)`.
  while (i < left.length) {
    merged.push(left[i])
    i++
  }

  while (j < right.length) {
    merged.push(right[j])
    j++
  }

  return merged as T
}

src/types/api.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { AnyObject, MapTypeObject } from './common'
77
type LoaderHasConfig = {
88
timeout: number
99
maxRetry: number
10+
priority: number
1011
}
1112

1213
export type LoaderPageRequestConfig = PageRequestConfig & LoaderHasConfig
@@ -71,6 +72,7 @@ export interface PageRequestConfig {
7172
proxy?: string
7273
cookies?: PageRequestConfigCookies
7374
maxRetry?: number
75+
priority?: number
7476
}
7577

7678
export interface DataRequestConfig {
@@ -82,6 +84,7 @@ export interface DataRequestConfig {
8284
timeout?: number
8385
proxy?: string
8486
maxRetry?: number
87+
priority?: number
8588
}
8689

8790
export interface FileRequestConfig {
@@ -90,6 +93,7 @@ export interface FileRequestConfig {
9093
timeout?: number
9194
proxy?: string
9295
maxRetry?: number
96+
priority?: number
9397
}
9498

9599
// CrawlConfig

0 commit comments

Comments
 (0)