Skip to content

Commit 1804653

Browse files
committed
feat: added crawlHTML API
1 parent 448c9bb commit 1804653

File tree

5 files changed

+234
-11
lines changed

5 files changed

+234
-11
lines changed

src/api.ts

Lines changed: 179 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ import {
3333
CrawlFileAdvancedConfig,
3434
CrawlDataAdvancedConfig,
3535
IntervalTime,
36-
DetailTargetFingerprintCommon
36+
DetailTargetFingerprintCommon,
37+
CrawlHTMLSingleResult,
38+
CrawlHTMLDetailTargetConfig,
39+
CrawlHTMLAdvancedConfig
3740
} from './types/api'
3841
import { LoaderXCrawlConfig } from './types'
3942
import { fingerprints } from './default'
@@ -42,7 +45,7 @@ import { fingerprints } from './default'
4245

4346
// Extra config
4447
export interface ExtraCommonConfig {
45-
type: 'page' | 'data' | 'file'
48+
type: 'page' | 'html' | 'data' | 'file'
4649

4750
intervalTime: IntervalTime | undefined
4851
}
@@ -54,6 +57,12 @@ interface ExtraPageConfig extends ExtraCommonConfig {
5457
| undefined
5558
}
5659

60+
interface ExtraHTMLConfig extends ExtraCommonConfig {
61+
onCrawlItemComplete:
62+
| ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
63+
| undefined
64+
}
65+
5766
interface ExtraDataConfig<T> extends ExtraCommonConfig {
5867
onCrawlItemComplete:
5968
| ((crawlDataSingleResult: CrawlDataSingleResult<T>) => void)
@@ -101,6 +110,10 @@ export type LoaderCrawlPageDetail = LoaderCommonConfig &
101110
LoaderHasConfig &
102111
CrawlPageDetailTargetConfig
103112

113+
export type LoaderCrawlHTMLDetail = LoaderCommonConfig &
114+
LoaderHasConfig &
115+
CrawlHTMLDetailTargetConfig
116+
104117
export type LoaderCrawlDataDetail = LoaderCommonConfig &
105118
LoaderHasConfig &
106119
CrawlDataDetailTargetConfig
@@ -114,6 +127,10 @@ interface CrawlPageAdvancedDetailTargetsConfig extends CrawlPageAdvancedConfig {
114127
detailTargets: CrawlPageDetailTargetConfig[]
115128
}
116129

130+
interface CrawlHTMLAdvancedDetailTargetsConfig extends CrawlHTMLAdvancedConfig {
131+
detailTargets: CrawlHTMLDetailTargetConfig[]
132+
}
133+
117134
interface CrawlDataAdvancedDetailTargetsConfig<T>
118135
extends CrawlDataAdvancedConfig<T> {
119136
detailTargets: CrawlDataDetailTargetConfig[]
@@ -135,6 +152,17 @@ interface CrawlPageConfig {
135152
| undefined
136153
}
137154

155+
interface CrawlHTMLConfig {
156+
detailTargets: LoaderCrawlHTMLDetail[]
157+
intervalTime: IntervalTime | undefined
158+
159+
selectFingerprintIndexs: number[]
160+
161+
onCrawlItemComplete:
162+
| ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
163+
| undefined
164+
}
165+
138166
interface CrawlDataConfig {
139167
detailTargets: LoaderCrawlDataDetail[]
140168
intervalTime: IntervalTime | undefined
@@ -172,6 +200,12 @@ type UniteCrawlPageConfig =
172200
| (string | CrawlPageDetailTargetConfig)[]
173201
| CrawlPageAdvancedConfig
174202

203+
type UniteCrawlHTMLConfig =
204+
| string
205+
| CrawlHTMLDetailTargetConfig
206+
| (string | CrawlHTMLDetailTargetConfig)[]
207+
| CrawlHTMLAdvancedConfig
208+
175209
type UniteCrawlDataConfig<T> =
176210
| string
177211
| CrawlDataDetailTargetConfig
@@ -361,9 +395,14 @@ function loaderCommonConfigToCrawlConfig(
361395
xCrawlConfig: LoaderXCrawlConfig,
362396
advancedDetailTargetsConfig:
363397
| CrawlPageAdvancedDetailTargetsConfig
398+
| CrawlHTMLAdvancedDetailTargetsConfig
364399
| CrawlDataAdvancedDetailTargetsConfig<any>
365400
| CrawlFileAdvancedDetailTargetsConfig,
366-
crawlConfig: CrawlPageConfig | CrawlDataConfig | CrawlFileConfig
401+
crawlConfig:
402+
| CrawlPageConfig
403+
| CrawlHTMLConfig
404+
| CrawlDataConfig
405+
| CrawlFileConfig
367406
) {
368407
// 1.detailTargets
369408
crawlConfig.detailTargets = advancedDetailTargetsConfig.detailTargets.map(
@@ -567,6 +606,55 @@ function createCrawlPageConfig(
567606
return crawlPageConfig
568607
}
569608

609+
function createCrawlHTMLConfig(
610+
xCrawlConfig: LoaderXCrawlConfig,
611+
originalConfig: UniteCrawlHTMLConfig
612+
): CrawlHTMLConfig {
613+
const crawlHTMLConfig: CrawlHTMLConfig = {
614+
detailTargets: [],
615+
intervalTime: undefined,
616+
617+
selectFingerprintIndexs: [],
618+
619+
onCrawlItemComplete: undefined
620+
}
621+
622+
let advancedDetailTargetsConfig: CrawlHTMLAdvancedDetailTargetsConfig = {
623+
targets: [],
624+
detailTargets: []
625+
}
626+
627+
if (isObject(originalConfig) && Object.hasOwn(originalConfig, 'targets')) {
628+
// CrawlHTMLAdvancedConfig
629+
const { targets } = originalConfig as CrawlHTMLAdvancedConfig
630+
631+
advancedDetailTargetsConfig = {
632+
...advancedDetailTargetsConfig,
633+
...(originalConfig as CrawlHTMLAdvancedConfig)
634+
}
635+
636+
advancedDetailTargetsConfig.detailTargets =
637+
transformTargetToDetailTargets(targets)
638+
} else {
639+
// string | CrawlHTMLDetailTargetConfig | (string | CrawlHTMLDetailTargetConfig)[]
640+
641+
advancedDetailTargetsConfig.detailTargets = transformTargetToDetailTargets(
642+
originalConfig as
643+
| string
644+
| CrawlHTMLDetailTargetConfig
645+
| (string | CrawlHTMLDetailTargetConfig)[]
646+
)
647+
}
648+
649+
loaderCommonConfigToCrawlConfig(
650+
xCrawlConfig,
651+
advancedDetailTargetsConfig,
652+
crawlHTMLConfig
653+
)
654+
655+
return crawlHTMLConfig
656+
}
657+
570658
function createCrawlDataConfig<T>(
571659
xCrawlConfig: LoaderXCrawlConfig,
572660
originalConfig: UniteCrawlDataConfig<T>
@@ -772,9 +860,12 @@ async function pageSingleCrawlHandle(
772860
}
773861
}
774862

775-
async function dataAndFileSingleCrawlHandle(
776-
device: Device<LoaderCrawlDataDetail | LoaderCrawlFileDetail, Request>,
777-
extraConfig: ExtraDataConfig<any> | ExtraFileConfig
863+
async function useRequestFnSingleCrawlHandle(
864+
device: Device<
865+
LoaderCrawlHTMLDetail | LoaderCrawlDataDetail | LoaderCrawlFileDetail,
866+
Request
867+
>,
868+
extraConfig: ExtraHTMLConfig | ExtraDataConfig<any> | ExtraFileConfig
778869
) {
779870
const { detailTargetConfig, crawlErrorQueue, maxRetry, retryCount } = device
780871
const notAllowRetry = maxRetry === retryCount
@@ -800,7 +891,9 @@ async function dataAndFileSingleCrawlHandle(
800891
if (isSuccess || notAllowRetry) {
801892
device.isHandle = true
802893

803-
if (extraConfig.type === 'data') {
894+
if (extraConfig.type === 'html') {
895+
HTMLSingleResultHandle(device, extraConfig as ExtraHTMLConfig)
896+
} else if (extraConfig.type === 'data') {
804897
dataSingleResultHandle(device, extraConfig as ExtraDataConfig<any>)
805898
} else if (extraConfig.type === 'file') {
806899
fileSingleResultHandle(device, extraConfig as ExtraFileConfig)
@@ -835,6 +928,27 @@ function pageSingleResultHandle(
835928
}
836929
}
837930

931+
function HTMLSingleResultHandle(
932+
device: Device<LoaderCrawlHTMLDetail, Request>,
933+
extraConfig: ExtraHTMLConfig
934+
) {
935+
const { isSuccess, detailTargetResult, result } = device
936+
const { onCrawlItemComplete } = extraConfig
937+
938+
handleResultEssentialOtherValue(device)
939+
940+
if (isSuccess && detailTargetResult) {
941+
const { data, headers, statusCode } = detailTargetResult
942+
const html = data.toString()
943+
944+
result.data = { statusCode, headers, html }
945+
}
946+
947+
if (onCrawlItemComplete) {
948+
onCrawlItemComplete(result as CrawlHTMLSingleResult)
949+
}
950+
}
951+
838952
function dataSingleResultHandle(
839953
device: Device<LoaderCrawlDataDetail, Request>,
840954
extraConfig: ExtraDataConfig<any>
@@ -1029,6 +1143,62 @@ export function createCrawlPage(xCrawlConfig: LoaderXCrawlConfig) {
10291143
return crawlPage
10301144
}
10311145

1146+
export function createCrawlHTML(xCrawlConfig: LoaderXCrawlConfig) {
1147+
function crawlHTML(
1148+
config: string,
1149+
callback?: (result: CrawlHTMLSingleResult) => void
1150+
): Promise<CrawlHTMLSingleResult>
1151+
1152+
function crawlHTML(
1153+
config: CrawlHTMLDetailTargetConfig,
1154+
callback?: (result: CrawlHTMLSingleResult) => void
1155+
): Promise<CrawlHTMLSingleResult>
1156+
1157+
function crawlHTML(
1158+
config: (string | CrawlHTMLDetailTargetConfig)[],
1159+
callback?: (result: CrawlHTMLSingleResult[]) => void
1160+
): Promise<CrawlHTMLSingleResult[]>
1161+
1162+
function crawlHTML(
1163+
config: CrawlHTMLAdvancedConfig,
1164+
callback?: (result: CrawlHTMLSingleResult[]) => void
1165+
): Promise<CrawlHTMLSingleResult[]>
1166+
1167+
async function crawlHTML(
1168+
config: UniteCrawlHTMLConfig,
1169+
callback?: (result: any) => void
1170+
): Promise<CrawlHTMLSingleResult | CrawlHTMLSingleResult[]> {
1171+
const { detailTargets, intervalTime, onCrawlItemComplete } =
1172+
createCrawlHTMLConfig(xCrawlConfig, config)
1173+
1174+
const extraConfig: ExtraHTMLConfig = {
1175+
type: 'html',
1176+
intervalTime,
1177+
onCrawlItemComplete
1178+
}
1179+
1180+
const crawlResultArr = (await controller(
1181+
xCrawlConfig.mode,
1182+
detailTargets,
1183+
extraConfig,
1184+
useRequestFnSingleCrawlHandle
1185+
)) as CrawlHTMLSingleResult[]
1186+
1187+
const crawlResult =
1188+
isArray(config) || (isObject(config) && Object.hasOwn(config, 'targets'))
1189+
? crawlResultArr
1190+
: crawlResultArr[0]
1191+
1192+
if (callback) {
1193+
callback(crawlResult)
1194+
}
1195+
1196+
return crawlResult
1197+
}
1198+
1199+
return crawlHTML
1200+
}
1201+
10321202
export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) {
10331203
function crawlData<T = any>(
10341204
config: string,
@@ -1067,7 +1237,7 @@ export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) {
10671237
xCrawlConfig.mode,
10681238
detailTargets,
10691239
extraConfig,
1070-
dataAndFileSingleCrawlHandle
1240+
useRequestFnSingleCrawlHandle
10711241
)) as CrawlDataSingleResult<T>[]
10721242

10731243
const crawlResult =
@@ -1127,7 +1297,7 @@ export function createCrawlFile(xCrawlConfig: LoaderXCrawlConfig) {
11271297
xCrawlConfig.mode,
11281298
detailTargets,
11291299
extraConfig,
1130-
dataAndFileSingleCrawlHandle
1300+
useRequestFnSingleCrawlHandle
11311301
)) as CrawlFileSingleResult[]
11321302

11331303
const { saveFilePendingQueue, saveFileErrorArr } = extraConfig

src/controller.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
ExtraCommonConfig,
66
LoaderCrawlDataDetail,
77
LoaderCrawlFileDetail,
8+
LoaderCrawlHTMLDetail,
89
LoaderCrawlPageDetail,
910
ProxyDetails
1011
} from './api'
@@ -25,6 +26,7 @@ import { CrawlCommonResult } from './types/api'
2526

2627
export type CrawlDetail =
2728
| LoaderCrawlPageDetail
29+
| LoaderCrawlHTMLDetail
2830
| LoaderCrawlDataDetail
2931
| LoaderCrawlFileDetail
3032

src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import {
22
createCrawlData,
33
createCrawlFile,
4+
createCrawlHTML,
45
createCrawlPage,
56
startPolling
67
} from './api'
@@ -33,6 +34,7 @@ function loaderBaseConfig(
3334
function createnInstance(baseConfig: LoaderXCrawlConfig): XCrawlInstance {
3435
const instance: XCrawlInstance = {
3536
crawlPage: createCrawlPage(baseConfig),
37+
crawlHTML: createCrawlHTML(baseConfig),
3638
crawlData: createCrawlData(baseConfig),
3739
crawlFile: createCrawlFile(baseConfig),
3840
startPolling

src/types/api.ts

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig {
9595
| null
9696
}
9797

98+
export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig {
99+
url: string
100+
headers?: AnyObject | null
101+
priority?: number
102+
fingerprint?: DetailTargetFingerprintCommon | null
103+
}
104+
98105
export interface CrawlDataDetailTargetConfig extends CrawlCommonConfig {
99106
url: string
100107
method?: Method
@@ -108,7 +115,6 @@ export interface CrawlDataDetailTargetConfig extends CrawlCommonConfig {
108115
export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
109116
url: string
110117
headers?: AnyObject | null
111-
priority?: number
112118
storeDir?: string | null
113119
fileName?: string | null
114120
extension?: string | null
@@ -133,6 +139,16 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
133139
onCrawlItemComplete?: (crawlPageSingleResult: CrawlPageSingleResult) => void
134140
}
135141

142+
export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig {
143+
targets: (string | CrawlHTMLDetailTargetConfig)[]
144+
intervalTime?: IntervalTime
145+
fingerprints?: DetailTargetFingerprintCommon[]
146+
147+
headers?: AnyObject
148+
149+
onCrawlItemComplete?: (crawlHTMLSingleResult: CrawlHTMLSingleResult) => void
150+
}
151+
136152
export interface CrawlDataAdvancedConfig<T> extends CrawlCommonConfig {
137153
targets: (string | CrawlDataDetailTargetConfig)[]
138154
intervalTime?: IntervalTime
@@ -188,6 +204,14 @@ export interface CrawlPageSingleResult extends CrawlCommonResult {
188204
}
189205
}
190206

207+
export interface CrawlHTMLSingleResult extends CrawlCommonResult {
208+
data: {
209+
statusCode: number | undefined
210+
headers: IncomingHttpHeaders
211+
html: string
212+
} | null
213+
}
214+
191215
export interface CrawlDataSingleResult<D> extends CrawlCommonResult {
192216
data: {
193217
statusCode: number | undefined

0 commit comments

Comments
 (0)