@@ -26,53 +26,53 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p
- [Install](#Install)
- [Example](#Example)
- [Core concepts](#Core-concepts)
-  * [Create application](#Create-application)
-    + [An example of a crawler application](#An-example-of-a-crawler-application)
-    + [Choose crawling mode](#Choose-crawling-mode)
-    + [Multiple crawler application instances](#Multiple-crawler-application-instances)
-  * [Crawl page](#Crawl-page)
-    + [jsdom instance](#jsdom-instance)
-    + [browser instance](#browser-instance)
-    + [page instance](#page-instance)
-  * [Crawl interface](#Crawl-interface)
-  * [Crawl files](#Crawl-files)
-  * [Start polling](#Start-polling)
-  * [Crawl interval](#Crawl-interval)
-  * [Multiple ways of writing requestConfig options](#Multiple-ways-of-writing-requestConfig-options)
-  * [Multiple ways to get results](#Multiple-ways-to-get-results)
+  - [Create application](#Create-application)
+    - [An example of a crawler application](#An-example-of-a-crawler-application)
+    - [Choose crawling mode](#Choose-crawling-mode)
+    - [Multiple crawler application instances](#Multiple-crawler-application-instances)
+  - [Crawl page](#Crawl-page)
+    - [jsdom instance](#jsdom-instance)
+    - [browser instance](#browser-instance)
+    - [page instance](#page-instance)
+  - [Crawl interface](#Crawl-interface)
+  - [Crawl files](#Crawl-files)
+  - [Start polling](#Start-polling)
+  - [Crawl interval](#Crawl-interval)
+  - [Multiple ways of writing requestConfig options](#Multiple-ways-of-writing-requestConfig-options)
+  - [Multiple ways to get results](#Multiple-ways-to-get-results)
- [API](#API)
-  * [xCrawl](#xCrawl)
-    + [Type](#Type)
-    + [Example](#Example-1)
-  * [crawlPage](#crawlPage)
-    + [Type](#Type-1)
-    + [Example](#Example-2)
-  * [crawlData](#crawlData)
-    + [Type](#Type-2)
-    + [Example](#Example-3)
-  * [crawlFile](#crawlFile)
-    + [Type](#Type-3)
-    + [Example](#Example-4)
-  * [crawlPolling](#crawlPolling)
-    + [Type](#Type-4)
-    + [Example](#Example-5)
+  - [xCrawl](#xCrawl)
+    - [Type](#Type)
+    - [Example](#Example-1)
+  - [crawlPage](#crawlPage)
+    - [Type](#Type-1)
+    - [Example](#Example-2)
+  - [crawlData](#crawlData)
+    - [Type](#Type-2)
+    - [Example](#Example-3)
+  - [crawlFile](#crawlFile)
+    - [Type](#Type-3)
+    - [Example](#Example-4)
+  - [crawlPolling](#crawlPolling)
+    - [Type](#Type-4)
+    - [Example](#Example-5)
- [Types](#Types)
-  * [AnyObject](#AnyObject)
-  * [Method](#Method)
-  * [RequestConfigObjectV1](#RequestConfigObjectV1)
-  * [RequestConfigObjectV2](#RequestConfigObjectV2)
-  * [RequestConfig](#RequestConfig)
-  * [IntervalTime](#IntervalTime)
-  * [XCrawlBaseConfig](#XCrawlBaseConfig)
-  * [CrawlPageConfig](#CrawlPageConfig)
-  * [CrawlBaseConfigV1](#CrawlBaseConfigV1)
-  * [CrawlDataConfig](#CrawlDataConfig)
-  * [CrawlFileConfig](#CrawlFileConfig)
-  * [StartPollingConfig](#StartPollingConfig)
-  * [CrawlResCommonV1](#CrawlResCommonV1)
-  * [CrawlResCommonArrV1](#CrawlResCommonArrV1)
-  * [CrawlPage](#CrawlPage-1)
-  * [FileInfo](#FileInfo)
+  - [AnyObject](#AnyObject)
+  - [Method](#Method)
+  - [RequestConfigObjectV1](#RequestConfigObjectV1)
+  - [RequestConfigObjectV2](#RequestConfigObjectV2)
+  - [RequestConfig](#RequestConfig)
+  - [IntervalTime](#IntervalTime)
+  - [XCrawlBaseConfig](#XCrawlBaseConfig)
+  - [CrawlPageConfig](#CrawlPageConfig)
+  - [CrawlBaseConfigV1](#CrawlBaseConfigV1)
+  - [CrawlDataConfig](#CrawlDataConfig)
+  - [CrawlFileConfig](#CrawlFileConfig)
+  - [StartPollingConfig](#StartPollingConfig)
+  - [CrawlResCommonV1](#CrawlResCommonV1)
+  - [CrawlResCommonArrV1](#CrawlResCommonArrV1)
+  - [CrawlPage](#CrawlPage-1)
+  - [FileInfo](#FileInfo)
- [More](#More)

## Install
@@ -107,9 +107,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
  const { jsdom } = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes')

  // Get the cover image elements for Plus listings
-   const imgEls = jsdom.window.document
-     .querySelector('.a1stauiv')
-     ?.querySelectorAll('picture img')
+   const imgEls = jsdom.window.document.querySelector('.a1stauiv')?.querySelectorAll('picture img')

  // set request configuration
  const requestConfig: string[] = []
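
The hunk stops at the empty `requestConfig` array. In the full example this array is filled from the image elements and handed to `crawlFile`; a minimal sketch of that continuation, assuming the `fileConfig.storeDir` option listed under CrawlFileConfig in the Types section:

```js
// Collect the src of each cover image element into the request configuration
imgEls.forEach((item) => requestConfig.push(item.src))

// Download the images; fileConfig.storeDir is assumed from the CrawlFileConfig type
myXCrawl.crawlFile({ requestConfig, fileConfig: { storeDir: './upload' } })
```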
@@ -190,13 +188,13 @@ Crawl a page via [crawlPage()](#crawlPage) .
```js
import xCrawl from 'x-crawl'

- const myXCrawl = xCrawl({
+ const myXCrawl = xCrawl({
  timeout: 10000
})

- myXCrawl.crawlPage('https://xxx.com').then(res => {
+ myXCrawl.crawlPage('https://xxx.com').then((res) => {
  const { jsdom, browser, page } = res
-
+
  // Close the browser
  browser.close()
})
@@ -235,10 +233,10 @@ myXCrawl.crawlPage('https://www.xxx.com').then(async (res) => {

  // Get the latest page content
  const content = await page.content()
-
+
  // Use the jsdom library to parse it yourself
  const jsdom = new JSDOM(content)
-
+
  console.log(jsdom.window.document.querySelector('title').textContent)
})
```
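
The `JSDOM` constructor used in this hunk comes from the jsdom package, whose import sits outside the hunk's range. A self-contained sketch of the same parsing step:

```js
import { JSDOM } from 'jsdom'
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ timeout: 10000 })

myXCrawl.crawlPage('https://xxx.com').then(async ({ page, browser }) => {
  // page.content() returns the rendered HTML of the puppeteer page;
  // JSDOM parses that HTML string into a DOM you can query
  const content = await page.content()
  const { window } = new JSDOM(content)
  console.log(window.document.querySelector('title')?.textContent)
  browser.close()
})
```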
@@ -250,16 +248,14 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ timeout: 10000 })

- myXCrawl
-   .crawlPage('https://xxx.com')
-   .then(async (res) => {
-     const { page } = res
+ myXCrawl.crawlPage('https://xxx.com').then(async (res) => {
+   const { page } = res

-     // Get a screenshot of the rendered page
-     await page.screenshot({ path: './upload/page.png' })
+   // Get a screenshot of the rendered page
+   await page.screenshot({ path: './upload/page.png' })

-     console.log('Screen capture is complete')
-   })
+   console.log('Screen capture is complete')
+ })
```

### Crawl interface
@@ -269,7 +265,7 @@ Crawl interface data through [crawlData()](#crawlData) .
```js
import xCrawl from 'x-crawl'

- const myXCrawl = xCrawl({
+ const myXCrawl = xCrawl({
  timeout: 10000,
  intervalTime: { max: 3000, min: 1000 }
})
@@ -280,8 +276,8 @@ const requestConfig = [
  { url: 'https://xxx.com/xxxx' }
]

- myXCrawl.crawlData({ requestConfig }).then(res => {
-   // deal with
+ myXCrawl.crawlData({ requestConfig }).then((res) => {
+   // deal with
})
```

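The `// deal with` placeholder marks where each response would be handled. A hedged sketch, assuming each array entry carries a `data` field as the `CrawlResCommonV1` type named in the table of contents suggests:

```js
myXCrawl.crawlData({ requestConfig }).then((res) => {
  // Assumption: res holds one entry per request, exposing the parsed body on .data
  res.forEach((item) => {
    console.log(item.data)
  })
})
```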
@@ -292,12 +288,12 @@ Crawl file data via [crawlFile()](#crawlFile) .
```js
import xCrawl from 'x-crawl'

- const myXCrawl = xCrawl({
+ const myXCrawl = xCrawl({
  timeout: 10000,
  intervalTime: { max: 3000, min: 1000 }
})

- const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
+ const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx']

myXCrawl
  .crawlFile({
@@ -318,12 +314,12 @@ Start a polling crawl with [startPolling()](#startPolling) .
```js
import xCrawl from 'x-crawl'

- const myXCrawl = xCrawl({
+ const myXCrawl = xCrawl({
  timeout: 10000,
  intervalTime: { max: 3000, min: 1000 }
})

- myXCrawl. startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
+ myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
  // will be executed every two and a half hours
  // crawlPage/crawlData/crawlFile
  const { jsdom, browser, page } = await myXCrawl.crawlPage('https://xxx.com')
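
Besides `count`, the callback receives `stopPolling`, which ends the polling loop when called. A minimal sketch of a cutoff, with the ten-run limit purely illustrative:

```js
myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
  const { browser } = await myXCrawl.crawlPage('https://xxx.com')
  browser.close()

  // Hypothetical cutoff: stop the polling loop after ten runs
  if (count >= 10) stopPolling()
})
```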
@@ -351,7 +347,7 @@ const myXCrawl = xCrawl({

// Set individually (high priority)
myXCrawl.crawlFile({
-   requestConfig: [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ],
+   requestConfig: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
  intervalTime: { max: 2000, min: 1000 }
})
```
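
Per the `IntervalTime` type shown further down, `intervalTime` also accepts a plain number for a fixed delay, while the `{ max, min }` form draws the delay from that range. A short sketch contrasting the two:

```js
import xCrawl from 'x-crawl'

// Fixed delay: wait 1000 ms between requests
const fixedCrawl = xCrawl({ intervalTime: 1000 })

// Ranged delay: wait between 1000 and 2000 ms between requests
const rangedCrawl = xCrawl({ intervalTime: { max: 2000, min: 1000 } })
```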
@@ -376,7 +372,7 @@ The writing method of requestConfig is very flexible, there are 5 types in total
```js
import xCrawl from 'x-crawl'

- const myXCrawl = xCrawl({
+ const myXCrawl = xCrawl({
  timeout: 10000,
  intervalTime: { max: 3000, min: 1000 }
})
@@ -385,7 +381,7 @@ const myXCrawl = xCrawl({
const requestConfig1 = 'https://xxx.com/xxxx'

// requestConfig writing method 2:
- const requestConfig2 = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
+ const requestConfig2 = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx']

// requestConfig writing method 3:
const requestConfig3 = {
@@ -408,7 +404,7 @@ const requestConfig5 = [
  'https://xxx.com/xxxx'
]

- myXCrawl.crawlData({ requestConfig: requestConfig5 }).then(res => {
+ myXCrawl.crawlData({ requestConfig: requestConfig5 }).then((res) => {
  console.log(res)
})
```
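
Writing methods 3 and 5 lean on the `RequestConfigObjectV2` shape from the Types section, which allows per-request overrides. A sketch of a mixed array, with the header value purely illustrative:

```js
// Mixed array (writing method 5): plain URLs and config objects can be combined
const requestConfig = [
  'https://xxx.com/xxxx',
  {
    url: 'https://xxx.com/xxxx',
    method: 'POST', // per-request method from the Method union type
    headers: { 'x-example': 'illustrative-value' } // hypothetical header
  }
]
```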
@@ -432,7 +428,7 @@ const myXCrawl = xCrawl({
  intervalTime: { max: 3000, min: 1000 }
})

- const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
+ const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx']

// Method 1: Promise
myXCrawl
@@ -534,7 +530,7 @@ const myXCrawl = xCrawl({ timeout: 10000 })
myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {
  const { jsdom, browser, page } = res
  console.log(jsdom.window.document.querySelector('title')?.textContent)
-
+
  // Close the browser
  browser.close()
})
@@ -574,7 +570,7 @@ const requestConfig = [
]

// crawlData API
- myXCrawl.crawlData({ requestConfig }).then(res => {
+ myXCrawl.crawlData({ requestConfig }).then((res) => {
  console.log(res)
})
```
@@ -607,7 +603,7 @@ const myXCrawl = xCrawl({
  intervalTime: { max: 2000, min: 1000 }
})

- const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
+ const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx']

myXCrawl
  .crawlFile({
@@ -666,12 +662,32 @@ interface AnyObject extends Object {
### Method

```ts
- type Method = 'get' | 'GET' | 'delete' | 'DELETE' | 'head' | 'HEAD' | 'options' | 'OPTONS' | 'post' | 'POST' | 'put' | 'PUT' | 'patch' | 'PATCH' | 'purge' | 'PURGE' | 'link' | 'LINK' | 'unlink' | 'UNLINK'
+ type Method =
+   | 'get'
+   | 'GET'
+   | 'delete'
+   | 'DELETE'
+   | 'head'
+   | 'HEAD'
+   | 'options'
+   | 'OPTONS'
+   | 'post'
+   | 'POST'
+   | 'put'
+   | 'PUT'
+   | 'patch'
+   | 'PATCH'
+   | 'purge'
+   | 'PURGE'
+   | 'link'
+   | 'LINK'
+   | 'unlink'
+   | 'UNLINK'
```

### RequestConfigObjectV1

- ```ts
+ ```ts
interface RequestConfigObjectV1 {
  url: string
  headers?: AnyObject
@@ -682,7 +698,7 @@ interface RequestConfigObjectV1 {

### RequestConfigObjectV2

- ```ts
+ ```ts
interface RequestConfigObjectV2 {
  url: string
  method?: Method
@@ -703,10 +719,12 @@ type RequestConfig = string | RequestConfigObjectV2
### IntervalTime

```ts
- type IntervalTime = number | {
-   max: number
-   min?: number
- }
+ type IntervalTime =
+   | number
+   | {
+       max: number
+       min?: number
+     }
```

### XCrawlBaseConfig
@@ -739,8 +757,7 @@ interface CrawlBaseConfigV1 {
### CrawlDataConfig

```ts
- interface CrawlDataConfig extends CrawlBaseConfigV1 {
- }
+ interface CrawlDataConfig extends CrawlBaseConfigV1 {}
```

### CrawlFileConfig