@@ -33,7 +33,10 @@ import {
33
33
CrawlFileAdvancedConfig ,
34
34
CrawlDataAdvancedConfig ,
35
35
IntervalTime ,
36
- DetailTargetFingerprintCommon
36
+ DetailTargetFingerprintCommon ,
37
+ CrawlHTMLSingleResult ,
38
+ CrawlHTMLDetailTargetConfig ,
39
+ CrawlHTMLAdvancedConfig
37
40
} from './types/api'
38
41
import { LoaderXCrawlConfig } from './types'
39
42
import { fingerprints } from './default'
@@ -42,7 +45,7 @@ import { fingerprints } from './default'
42
45
43
46
// Extra config
44
47
/**
 * Fields shared by every per-crawl "extra" config object.
 */
export interface ExtraCommonConfig {
  /**
   * Discriminant identifying which crawl API built this config.
   * The literals must match the comparisons made by the single-crawl
   * handlers exactly (e.g. `extraConfig.type === 'data'`), so no literal
   * may carry stray whitespace.
   */
  type: 'page' | 'html' | 'data' | 'file'

  /** Interval-time setting forwarded with the crawl; `undefined` when unset. */
  intervalTime: IntervalTime | undefined
}
@@ -54,6 +57,12 @@ interface ExtraPageConfig extends ExtraCommonConfig {
54
57
| undefined
55
58
}
56
59
60
/**
 * Extra config used for HTML crawls: the common fields plus an optional
 * per-item completion callback.
 */
interface ExtraHTMLConfig extends ExtraCommonConfig {
  /** Called with the single result of one HTML target; `undefined` when not provided. */
  onCrawlItemComplete:
    | undefined
    | ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
}
65
+
57
66
interface ExtraDataConfig < T > extends ExtraCommonConfig {
58
67
onCrawlItemComplete :
59
68
| ( ( crawlDataSingleResult : CrawlDataSingleResult < T > ) => void )
@@ -101,6 +110,10 @@ export type LoaderCrawlPageDetail = LoaderCommonConfig &
101
110
LoaderHasConfig &
102
111
CrawlPageDetailTargetConfig
103
112
113
/**
 * A fully loaded HTML detail target: the loader's common and "has" config
 * merged with the HTML detail target config.
 */
export type LoaderCrawlHTMLDetail = LoaderCommonConfig & LoaderHasConfig & CrawlHTMLDetailTargetConfig
116
+
104
117
export type LoaderCrawlDataDetail = LoaderCommonConfig &
105
118
LoaderHasConfig &
106
119
CrawlDataDetailTargetConfig
@@ -114,6 +127,10 @@ interface CrawlPageAdvancedDetailTargetsConfig extends CrawlPageAdvancedConfig {
114
127
detailTargets : CrawlPageDetailTargetConfig [ ]
115
128
}
116
129
130
/**
 * Advanced HTML config extended with the list of detail targets derived
 * from its `targets`.
 */
interface CrawlHTMLAdvancedDetailTargetsConfig extends CrawlHTMLAdvancedConfig {
  /** One detail target config per crawl target. */
  detailTargets: Array<CrawlHTMLDetailTargetConfig>
}
133
+
117
134
interface CrawlDataAdvancedDetailTargetsConfig < T >
118
135
extends CrawlDataAdvancedConfig < T > {
119
136
detailTargets : CrawlDataDetailTargetConfig [ ]
@@ -135,6 +152,17 @@ interface CrawlPageConfig {
135
152
| undefined
136
153
}
137
154
155
/**
 * Final, normalized config consumed by the HTML crawl entry point.
 */
interface CrawlHTMLConfig {
  /** Loaded detail targets, one per URL to request. */
  detailTargets: Array<LoaderCrawlHTMLDetail>

  /** Interval-time setting for the crawl; `undefined` when unset. */
  intervalTime: IntervalTime | undefined

  // NOTE(review): presumably indexes into the fingerprint list - confirm against the loader.
  selectFingerprintIndexs: Array<number>

  /** Optional callback invoked with each single HTML result. */
  onCrawlItemComplete:
    | undefined
    | ((crawlHTMLSingleResult: CrawlHTMLSingleResult) => void)
}
165
+
138
166
interface CrawlDataConfig {
139
167
detailTargets : LoaderCrawlDataDetail [ ]
140
168
intervalTime : IntervalTime | undefined
@@ -172,6 +200,12 @@ type UniteCrawlPageConfig =
172
200
| ( string | CrawlPageDetailTargetConfig ) [ ]
173
201
| CrawlPageAdvancedConfig
174
202
203
/**
 * Every input shape accepted by the HTML crawl API: a URL string, a single
 * detail target, a mixed array of both, or an advanced config object.
 */
type UniteCrawlHTMLConfig =
  | string
  | CrawlHTMLDetailTargetConfig
  | Array<string | CrawlHTMLDetailTargetConfig>
  | CrawlHTMLAdvancedConfig
208
+
175
209
type UniteCrawlDataConfig < T > =
176
210
| string
177
211
| CrawlDataDetailTargetConfig
@@ -361,9 +395,14 @@ function loaderCommonConfigToCrawlConfig(
361
395
xCrawlConfig : LoaderXCrawlConfig ,
362
396
advancedDetailTargetsConfig :
363
397
| CrawlPageAdvancedDetailTargetsConfig
398
+ | CrawlHTMLAdvancedDetailTargetsConfig
364
399
| CrawlDataAdvancedDetailTargetsConfig < any >
365
400
| CrawlFileAdvancedDetailTargetsConfig ,
366
- crawlConfig : CrawlPageConfig | CrawlDataConfig | CrawlFileConfig
401
+ crawlConfig :
402
+ | CrawlPageConfig
403
+ | CrawlHTMLConfig
404
+ | CrawlDataConfig
405
+ | CrawlFileConfig
367
406
) {
368
407
// 1.detailTargets
369
408
crawlConfig . detailTargets = advancedDetailTargetsConfig . detailTargets . map (
@@ -567,6 +606,55 @@ function createCrawlPageConfig(
567
606
return crawlPageConfig
568
607
}
569
608
609
/**
 * Normalizes any input accepted by the HTML crawl API into a CrawlHTMLConfig.
 *
 * @param xCrawlConfig - Loaded instance-level config, forwarded to
 *   `loaderCommonConfigToCrawlConfig`.
 * @param originalConfig - The caller's input: a URL string, a detail target,
 *   an array of either, or an advanced config (recognized by an own
 *   `targets` property).
 * @returns The config object after `loaderCommonConfigToCrawlConfig` has
 *   populated it from the normalized detail targets.
 */
function createCrawlHTMLConfig(
  xCrawlConfig: LoaderXCrawlConfig,
  originalConfig: UniteCrawlHTMLConfig
): CrawlHTMLConfig {
  // Empty result shell; filled in by loaderCommonConfigToCrawlConfig below.
  const crawlHTMLConfig: CrawlHTMLConfig = {
    detailTargets: [],
    intervalTime: undefined,

    selectFingerprintIndexs: [],

    onCrawlItemComplete: undefined
  }

  let advancedDetailTargetsConfig: CrawlHTMLAdvancedDetailTargetsConfig = {
    targets: [],
    detailTargets: []
  }

  if (isObject(originalConfig) && Object.hasOwn(originalConfig, 'targets')) {
    // CrawlHTMLAdvancedConfig
    const { targets } = originalConfig as CrawlHTMLAdvancedConfig

    // Carry over every advanced option supplied by the caller.
    advancedDetailTargetsConfig = {
      ...advancedDetailTargetsConfig,
      ...(originalConfig as CrawlHTMLAdvancedConfig)
    }

    advancedDetailTargetsConfig.detailTargets =
      transformTargetToDetailTargets(targets)
  } else {
    // string | CrawlHTMLDetailTargetConfig | (string | CrawlHTMLDetailTargetConfig)[]
    // The assertion names the HTML target types (not the data-crawl ones) so
    // it agrees with UniteCrawlHTMLConfig.
    advancedDetailTargetsConfig.detailTargets = transformTargetToDetailTargets(
      originalConfig as
        | string
        | CrawlHTMLDetailTargetConfig
        | (string | CrawlHTMLDetailTargetConfig)[]
    )
  }

  loaderCommonConfigToCrawlConfig(
    xCrawlConfig,
    advancedDetailTargetsConfig,
    crawlHTMLConfig
  )

  return crawlHTMLConfig
}
657
+
570
658
function createCrawlDataConfig < T > (
571
659
xCrawlConfig : LoaderXCrawlConfig ,
572
660
originalConfig : UniteCrawlDataConfig < T >
@@ -772,9 +860,12 @@ async function pageSingleCrawlHandle(
772
860
}
773
861
}
774
862
775
- async function dataAndFileSingleCrawlHandle (
776
- device : Device < LoaderCrawlDataDetail | LoaderCrawlFileDetail , Request > ,
777
- extraConfig : ExtraDataConfig < any > | ExtraFileConfig
863
+ async function useRequestFnSingleCrawlHandle (
864
+ device : Device <
865
+ LoaderCrawlHTMLDetail | LoaderCrawlDataDetail | LoaderCrawlFileDetail ,
866
+ Request
867
+ > ,
868
+ extraConfig : ExtraHTMLConfig | ExtraDataConfig < any > | ExtraFileConfig
778
869
) {
779
870
const { detailTargetConfig, crawlErrorQueue, maxRetry, retryCount } = device
780
871
const notAllowRetry = maxRetry === retryCount
@@ -800,7 +891,9 @@ async function dataAndFileSingleCrawlHandle(
800
891
if ( isSuccess || notAllowRetry ) {
801
892
device . isHandle = true
802
893
803
- if ( extraConfig . type === 'data' ) {
894
+ if ( extraConfig . type === 'html' ) {
895
+ HTMLSingleResultHandle ( device , extraConfig as ExtraHTMLConfig )
896
+ } else if ( extraConfig . type === 'data' ) {
804
897
dataSingleResultHandle ( device , extraConfig as ExtraDataConfig < any > )
805
898
} else if ( extraConfig . type === 'file' ) {
806
899
fileSingleResultHandle ( device , extraConfig as ExtraFileConfig )
@@ -835,6 +928,27 @@ function pageSingleResultHandle(
835
928
}
836
929
}
837
930
931
/**
 * Finalizes the result of one HTML crawl target.
 *
 * Lets `handleResultEssentialOtherValue` process the device first, then, for
 * a successful request with a response, stores the status code, headers and
 * the stringified body on `result.data`. The per-item callback, when
 * provided, is invoked whether or not the request succeeded.
 *
 * @param device - Crawl device holding the request outcome and result object.
 * @param extraConfig - HTML extra config carrying the optional callback.
 */
function HTMLSingleResultHandle(
  device: Device<LoaderCrawlHTMLDetail, Request>,
  extraConfig: ExtraHTMLConfig
) {
  // Read the device before the shared handler runs, as the original did.
  const {
    isSuccess: succeeded,
    detailTargetResult: response,
    result: singleResult
  } = device
  const notifyItemComplete = extraConfig.onCrawlItemComplete

  handleResultEssentialOtherValue(device)

  if (succeeded && response) {
    // The response body is converted to its string form to produce the HTML.
    singleResult.data = {
      statusCode: response.statusCode,
      headers: response.headers,
      html: response.data.toString()
    }
  }

  if (notifyItemComplete) {
    notifyItemComplete(singleResult as CrawlHTMLSingleResult)
  }
}
951
+
838
952
function dataSingleResultHandle (
839
953
device : Device < LoaderCrawlDataDetail , Request > ,
840
954
extraConfig : ExtraDataConfig < any >
@@ -1029,6 +1143,62 @@ export function createCrawlPage(xCrawlConfig: LoaderXCrawlConfig) {
1029
1143
return crawlPage
1030
1144
}
1031
1145
1146
/**
 * Builds the `crawlHTML` API bound to the given instance config.
 *
 * @param xCrawlConfig - Loaded instance-level config (its `mode` is passed to
 *   the controller and the whole object to `createCrawlHTMLConfig`).
 * @returns An overloaded `crawlHTML` function: string / single-target input
 *   resolves to one result, array / advanced input resolves to an array.
 */
export function createCrawlHTML(xCrawlConfig: LoaderXCrawlConfig) {
  function crawlHTML(
    config: string,
    callback?: (result: CrawlHTMLSingleResult) => void
  ): Promise<CrawlHTMLSingleResult>

  function crawlHTML(
    config: CrawlHTMLDetailTargetConfig,
    callback?: (result: CrawlHTMLSingleResult) => void
  ): Promise<CrawlHTMLSingleResult>

  function crawlHTML(
    config: (string | CrawlHTMLDetailTargetConfig)[],
    callback?: (result: CrawlHTMLSingleResult[]) => void
  ): Promise<CrawlHTMLSingleResult[]>

  function crawlHTML(
    config: CrawlHTMLAdvancedConfig,
    callback?: (result: CrawlHTMLSingleResult[]) => void
  ): Promise<CrawlHTMLSingleResult[]>

  async function crawlHTML(
    config: UniteCrawlHTMLConfig,
    callback?: (result: any) => void
  ): Promise<CrawlHTMLSingleResult | CrawlHTMLSingleResult[]> {
    // Normalize whatever the caller passed into the internal config shape.
    const htmlConfig = createCrawlHTMLConfig(xCrawlConfig, config)

    const extraConfig: ExtraHTMLConfig = {
      type: 'html',
      intervalTime: htmlConfig.intervalTime,
      onCrawlItemComplete: htmlConfig.onCrawlItemComplete
    }

    const allResults = (await controller(
      xCrawlConfig.mode,
      htmlConfig.detailTargets,
      extraConfig,
      useRequestFnSingleCrawlHandle
    )) as CrawlHTMLSingleResult[]

    // Array and advanced ("targets") inputs get the full list back;
    // a lone string or detail target gets only its single result.
    const expectsList =
      isArray(config) || (isObject(config) && Object.hasOwn(config, 'targets'))

    let crawlResult: CrawlHTMLSingleResult | CrawlHTMLSingleResult[]
    if (expectsList) {
      crawlResult = allResults
    } else {
      crawlResult = allResults[0]
    }

    if (callback) {
      callback(crawlResult)
    }

    return crawlResult
  }

  return crawlHTML
}
1201
+
1032
1202
export function createCrawlData ( xCrawlConfig : LoaderXCrawlConfig ) {
1033
1203
function crawlData < T = any > (
1034
1204
config : string ,
@@ -1067,7 +1237,7 @@ export function createCrawlData(xCrawlConfig: LoaderXCrawlConfig) {
1067
1237
xCrawlConfig . mode ,
1068
1238
detailTargets ,
1069
1239
extraConfig ,
1070
- dataAndFileSingleCrawlHandle
1240
+ useRequestFnSingleCrawlHandle
1071
1241
) ) as CrawlDataSingleResult < T > [ ]
1072
1242
1073
1243
const crawlResult =
@@ -1127,7 +1297,7 @@ export function createCrawlFile(xCrawlConfig: LoaderXCrawlConfig) {
1127
1297
xCrawlConfig . mode ,
1128
1298
detailTargets ,
1129
1299
extraConfig ,
1130
- dataAndFileSingleCrawlHandle
1300
+ useRequestFnSingleCrawlHandle
1131
1301
) ) as CrawlFileSingleResult [ ]
1132
1302
1133
1303
const { saveFilePendingQueue, saveFileErrorArr } = extraConfig
0 commit comments