@@ -37,6 +37,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p
- [page instance](#page-instance)
- [Crawl interface](#Crawl-interface)
- [Crawl files](#Crawl-files)
+   - [life cycle](#life-cycle)
+     - [beforeSave](#beforeSave)
- [Start polling](#Start-polling)
- [Config priority](#Config-Priority)
- [Interval time](#Interval-time)
@@ -135,7 +137,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {

    // Gets the URLs of the page's carousel image elements
    const boxHandle = await page.$(elSelectorMap[id - 1])
-   const urls = await boxHandle!.$$eval('picture img', (imgEls) => {
+   const urls = await boxHandle.$$eval('picture img', (imgEls) => {
      return imgEls.map((item) => item.src)
    })
    imgUrls.push(...urls)
@@ -224,7 +226,7 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlPage('https://xxx.com').then((res) => {
+ myXCrawl.crawlPage('https://www.example.com').then((res) => {
    const { browser, page } = res.data

    // Close the browser
@@ -253,7 +255,7 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlPage('https://xxx.com').then(async (res) => {
+ myXCrawl.crawlPage('https://www.example.com').then(async (res) => {
    const { browser, page } = res.data

    // Get a screenshot of the rendered page
@@ -275,9 +277,9 @@ import xCrawl from 'x-crawl'
const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })

const requestConfigs = [
-   'https://xxx.com/xxxx',
-   'https://xxx.com/xxxx',
-   { url: 'https://xxx.com/xxxx', method: 'POST', data: { name: 'coderhxl' } }
+   'https://www.example.com/api-1',
+   'https://www.example.com/api-2',
+   { url: 'https://www.example.com/api-3', method: 'POST', data: { name: 'coderhxl' } }
]

myXCrawl.crawlData({ requestConfigs }).then((res) => {
@@ -296,7 +298,7 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })

myXCrawl
  .crawlFile({
-   requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+   requestConfigs: ['https://www.example.com/file-1', 'https://www.example.com/file-2'],
    fileConfig: {
      storeDir: './upload' // storage folder
    }
@@ -306,6 +308,42 @@ myXCrawl
  })
```

+ #### life cycle
+
+ The crawlFile API has a life cycle function:
+
+ - beforeSave: executed before saving the file
+
+ ##### beforeSave
+
+ In the beforeSave function you receive the file as a Buffer, which you can process; return a Promise that resolves to a Buffer.
+
+ **Resize picture**
+
+ Use the sharp library to resize the images to be crawled:
+
+ ```js
+ import xCrawl from 'x-crawl'
+ import sharp from 'sharp'
+
+ const testXCrawl = xCrawl()
+
+ testXCrawl
+   .crawlFile({
+     requestConfigs: ['https://www.example.com/file-1.jpg', 'https://www.example.com/file-2.jpg'],
+     fileConfig: {
+       beforeSave(info) {
+         return sharp(info.data).resize(200).toBuffer()
+       }
+     }
+   })
+   .then((res) => {
+     res.forEach((item) => {
+       console.log(item.data?.data.isSuccess)
+     })
+   })
+ ```
+
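Note: sharp is a separate library and is not bundled with x-crawl; install it in your project first (for example, `npm i sharp`).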
### Start polling

Start a polling crawl with [startPolling()](#startPolling).
@@ -321,7 +359,7 @@ const myXCrawl = xCrawl({
myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
  // will be executed every two and a half hours
  // crawlPage/crawlData/crawlFile
- const res = await myXCrawl.crawlPage('https://xxx.com')
+ const res = await myXCrawl.crawlPage('https://www.example.com')
  res.data.page.close()
})
```
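The callback also receives stopPolling, which ends the polling. A minimal sketch (assuming count is the number of runs executed so far):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

// poll every 10 minutes, then stop after the third run
myXCrawl.startPolling({ m: 10 }, async (count, stopPolling) => {
  if (count === 3) {
    stopPolling()
  }
})
```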
@@ -356,7 +394,7 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlData({
-   requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+   requestConfigs: ['https://www.example.com/api-1', 'https://www.example.com/api-2'],
    intervalTime: { max: 2000, min: 1000 }
  })
  .then((res) => {})
@@ -378,7 +416,7 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {})
+ myXCrawl.crawlData({ url: 'https://www.example.com/api', maxRetry: 1 }).then((res) => {})
```

The maxRetry attribute determines how many times a failed request is retried.
@@ -394,9 +432,9 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlData([
-   { url: 'https://xxx.com/xxxx', priority: 1 },
-   { url: 'https://xxx.com/xxxx', priority: 10 },
-   { url: 'https://xxx.com/xxxx', priority: 8 }
+   { url: 'https://www.example.com/api-1', priority: 1 },
+   { url: 'https://www.example.com/api-2', priority: 10 },
+   { url: 'https://www.example.com/api-3', priority: 8 }
  ])
  .then((res) => {})
```
@@ -439,7 +477,7 @@ import xCrawl from 'x-crawl'

// xCrawl API
const myXCrawl = xCrawl({
- baseUrl: 'https://xxx.com',
+ baseUrl: 'https://www.example.com',
  timeout: 10000,
  intervalTime: { max: 2000, min: 1000 }
})
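With baseUrl set on the instance, request URLs can then be written as paths. A minimal sketch, assuming baseUrl is simply prefixed to each request URL:

```js
// hypothetical illustration: resolves to https://www.example.com/api
myXCrawl.crawlData('/api').then((res) => {})
```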
@@ -472,7 +510,7 @@ import xCrawl from 'x-crawl'
const myXCrawl = xCrawl()

// crawlPage API
- myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {
+ myXCrawl.crawlPage('https://www.example.com').then((res) => {
    const { browser, page } = res.data

    // Close the browser
@@ -498,7 +536,7 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {})
+ myXCrawl.crawlPage('https://www.example.com').then((res) => {})
```

The res you get will be an object.
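For example, a minimal sketch of working with that object (browser and page are the puppeteer instances shown earlier; page.title() is just an illustration):

```js
myXCrawl.crawlPage('https://www.example.com').then(async (res) => {
  const { browser, page } = res.data

  // read the rendered page's title via puppeteer
  console.log(await page.title())

  // Close the browser
  await browser.close()
})
```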
@@ -516,7 +554,7 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlPage({
-   url: 'https://xxx.com/xxxx',
+   url: 'https://www.example.com',
    proxy: 'xxx',
    maxRetry: 1
  })
@@ -537,7 +575,10 @@ import xCrawl from 'x-crawl'
const myXCrawl = xCrawl()

myXCrawl
- .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }])
+ .crawlPage([
+   'https://www.example.com/page-1',
+   { url: 'https://www.example.com/page-2', maxRetry: 2 }
+ ])
  .then((res) => {})
```
@@ -549,20 +590,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP

If you want to crawl multiple pages without writing the request configuration (proxy, cookies, retry, etc.) repeatedly, and you need an interval time, you can try this way of writing:

- ```
+ ```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlPage({
-   requestConfigs: [
-     'https://xxx.com/xxxx',
-     { url: 'https://xxx.com/xxxx', maxRetry: 6 }
-   ],
-   intervalTime: { max: 3000, min: 1000 },
-   cookies: 'xxx',
-   maxRetry: 1
- }).then((res) => {})
+ myXCrawl
+   .crawlPage({
+     requestConfigs: [
+       'https://www.example.com/page-1',
+       { url: 'https://www.example.com/page-2', maxRetry: 6 }
+     ],
+     intervalTime: { max: 3000, min: 1000 },
+     cookies: 'xxx',
+     maxRetry: 1
+   })
+   .then((res) => {})
```

The res you get will be an array of objects.
@@ -598,7 +641,7 @@ const myXCrawl = xCrawl({

myXCrawl
  .crawlData({
-   requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+   requestConfigs: ['https://www.example.com/api-1', 'https://www.example.com/api-2'],
    intervalTime: { max: 3000, min: 1000 },
    cookies: 'xxx',
    maxRetry: 1
@@ -626,7 +669,7 @@ import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {})
+ myXCrawl.crawlData('https://www.example.com/api').then((res) => {})
```

The res you get will be an object.
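For example, a minimal sketch of inspecting that object (the exact shape depends on the x-crawl version; here we assume the response body is exposed on res.data):

```js
myXCrawl.crawlData('https://www.example.com/api').then((res) => {
  // assumption: res.data carries the parsed response body
  console.log(res.data)
})
```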
@@ -644,7 +687,7 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlData({
-   url: 'https://xxx.com/xxxx',
+   url: 'https://www.example.com/api',
    proxy: 'xxx',
    maxRetry: 1
  })
@@ -665,7 +708,10 @@ import xCrawl from 'x-crawl'
const myXCrawl = xCrawl()

myXCrawl
- .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }])
+ .crawlData([
+   'https://www.example.com/api-1',
+   { url: 'https://www.example.com/api-2', maxRetry: 2 }
+ ])
  .then((res) => {})
```
@@ -677,20 +723,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP

If you want to crawl multiple pieces of data without writing the request configuration (proxy, cookies, retry, etc.) repeatedly, and you need an interval time, you can try this way of writing:

- ```
+ ```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlData({
-   requestConfigs: [
-     'https://xxx.com/xxxx',
-     { url: 'https://xxx.com/xxxx', maxRetry: 6 }
-   ],
-   intervalTime: { max: 3000, min: 1000 },
-   cookies: 'xxx',
-   maxRetry: 1
- }).then((res) => {})
+ myXCrawl
+   .crawlData({
+     requestConfigs: [
+       'https://www.example.com/api-1',
+       { url: 'https://www.example.com/api-2', maxRetry: 6 }
+     ],
+     intervalTime: { max: 3000, min: 1000 },
+     cookies: 'xxx',
+     maxRetry: 1
+   })
+   .then((res) => {})
```

The res you get will be an array of objects.
@@ -727,7 +775,7 @@ const myXCrawl = xCrawl({
// crawlFile API
myXCrawl
  .crawlFile({
-   requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+   requestConfigs: ['https://www.example.com/file-1', 'https://www.example.com/file-2'],
    storeDir: './upload',
    intervalTime: { max: 3000, min: 1000 },
    maxRetry: 1
@@ -757,7 +805,7 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlFile({
-   url: 'https://xxx.com/xxxx',
+   url: 'https://www.example.com/file',
    proxy: 'xxx',
    maxRetry: 1,
    storeDir: './upload',
@@ -781,8 +829,8 @@ const myXCrawl = xCrawl()

myXCrawl
  .crawlFile([
-   { url: 'https://xxx.com/xxxx', storeDir: './upload' },
-   { url: 'https://xxx.com/xxxx', storeDir: './upload', maxRetry: 2 }
+   { url: 'https://www.example.com/file-1', storeDir: './upload' },
+   { url: 'https://www.example.com/file-2', storeDir: './upload', maxRetry: 2 }
  ])
  .then((res) => {})
```
@@ -795,20 +843,22 @@ For more configuration options of CrawlFileConfigObject, please refer to [CrawlF

If you want to crawl multiple files without writing the request configuration (storeDir, proxy, retry, etc.) repeatedly, and you need an interval time, you can try this way of writing:

- ```
+ ```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

- myXCrawl.crawlFile({
-   requestConfigs: [
-     'https://xxx.com/xxxx',
-     { url: 'https://xxx.com/xxxx', storeDir: './upload/xxx' }
-   ],
-   storeDir: './upload',
-   intervalTime: { max: 3000, min: 1000 },
-   maxRetry: 1
- }).then((res) => {})
+ myXCrawl
+   .crawlFile({
+     requestConfigs: [
+       'https://www.example.com/file-1',
+       { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
+     ],
+     storeDir: './upload',
+     intervalTime: { max: 3000, min: 1000 },
+     maxRetry: 1
+   })
+   .then((res) => {})
```

The res you get will be an array of objects.
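For example, a minimal sketch that reuses the isSuccess pattern from the beforeSave example above to check each stored file:

```js
myXCrawl
  .crawlFile({
    requestConfigs: ['https://www.example.com/file-1', 'https://www.example.com/file-2'],
    storeDir: './upload'
  })
  .then((res) => {
    // each item reports whether its file was stored successfully
    res.forEach((item) => {
      console.log(item.data?.data.isSuccess)
    })
  })
```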
@@ -999,7 +1049,7 @@ export interface CrawlFileConfigObject {
      fileName: string
      filePath: string
      data: Buffer
-   }) => Buffer | void
+   }) => Promise<Buffer>
  }
}
```
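A minimal sketch of a beforeSave that satisfies the updated Promise&lt;Buffer&gt; signature (a pass-through that applies no transformation):

```js
const fileConfig = {
  beforeSave(info) {
    // info.data is the file Buffer; return a Promise resolving to a Buffer
    return Promise.resolve(info.data)
  }
}
```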
@@ -1167,3 +1217,5 @@ export interface AnyObject extends Object {
## More

If you have **problems, needs, good suggestions** please raise **Issues** in https://github.com/coder-hxl/x-crawl/issues.
+
+ [#life-cycle]: