@@ -51,7 +51,7 @@ The crawlPage API has built-in [puppeteer](https://github.com/puppeteer/puppetee
- [Config Priority](#Config-Priority)
- [Interval Time](#Interval-Time)
- [Fail Retry](#Fail-Retry)
- - [Rotate Proxy](#Rotate Proxy)
+ - [Rotate Proxy](#Rotate-Proxy)
- [Custom Device Fingerprint](#Custom-Device-Fingerprint)
- [Priority Queue](#Priority-Queue)
- [About Results](#About-Results)
@@ -115,6 +115,8 @@ The crawlPage API has built-in [puppeteer](https://github.com/puppeteer/puppetee
- [API Other](#API-Other)
- [AnyObject](#AnyObject)
- [More](#More)
+ - [Community](#Community)
+ - [Issues](#Issues)
## Install
@@ -126,14 +128,14 @@ npm install x-crawl
## Example
- Take the automatic acquisition of photos of experiences and homes in hawaii every day as an example:
+ Take the daily automatic acquisition of some photos of experiences and homes around the world as an example:
```js
// 1. Import module ES/CJS
import xCrawl from 'x-crawl'

// 2. Create a crawler instance
- const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } })
+ const myXCrawl = xCrawl({maxRetry: 3, intervalTime: { max: 3000, min: 2000 }})

// 3. Set the crawling task
/*
@@ -142,27 +144,31 @@ const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } })
*/
myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
  // Call crawlPage API to crawl Page
-   const res = await myXCrawl.crawlPage([
-     'https://zh.airbnb.com/s/hawaii/experiences',
-     'https://zh.airbnb.com/s/hawaii/homes'
-   ])
+   const res = await myXCrawl.crawlPage({
+     targets: [
+       'https://www.airbnb.cn/s/experiences',
+       'https://www.airbnb.cn/s/plus_homes'
+     ],
+     viewport: { width: 1920, height: 1080 }
+   })

  // Store the image URL to targets
  const targets = []
-   const elSelectorMap = ['.c14whb16', '.l196t2l1']
+   const elSelectorMap = ['._fig15y', '._aov0j6']

  for (const item of res) {
    const { id } = item
    const { page } = item.data
-     const boxSelector = elSelectorMap[id - 1]

-     // Wait for the image element to appear
-     await page.waitForSelector(`${boxSelector} img`)
+     // Wait for the page to load
+     await new Promise((r) => setTimeout(r, 300))

-     // Gets the URL of the page's wheel image element
-     const boxHandle = await page.$(boxSelector)
-     const urls = await boxHandle.$$eval('picture img', (imgEls) => {
-       return imgEls.map((item) => item.src)
-     })
+     // Gets the URL of the page image
+     const urls = await page!.$$eval(
+       `${elSelectorMap[id - 1]} img`,
+       (imgEls) => {
+         return imgEls.map((item) => item.src)
+       }
+     )

    targets.push(...urls)

    // Close page
@@ -532,7 +538,7 @@ The intervalTime option defaults to undefined. If there is a setting value, it
It can avoid crawling failures caused by temporary problems, and it will wait for the end of this round of crawling targets before crawling again.

- The number of failed retries can be set by creating crawler application instance, advanced usage, and detailed target.
+ It can be set in three places: the crawler application instance, advanced usage, and the detailed target.
```js
import xCrawl from 'x-crawl'
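// A minimal sketch of the three levels named above, continued here for
// illustration; the exact option shapes are inferred from the surrounding
// docs and are assumptions, not verified x-crawl API.

// 1. Crawler application instance: applies to every crawling target
const myXCrawl = xCrawl({ maxRetry: 3 })

// 2. Advanced usage: applies to all targets of this call
myXCrawl.crawlPage({
  targets: ['https://www.example.com/page-1', 'https://www.example.com/page-2'],
  maxRetry: 6
})

// 3. Detailed target: applies to this single target only
myXCrawl.crawlPage({ url: 'https://www.example.com/page-3', maxRetry: 9 })
```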
@@ -550,7 +556,7 @@ The maxRetry attribute determines how many times to retry.
With failed retries, custom error counts, and HTTP status codes, the proxy is automatically rotated when crawling targets.

- You can set the number of failed retries in the three places of creating a crawler application instance, advanced usage, and detailed goals.
+ It can be set in three places: the crawler application instance, advanced usage, and the detailed target.
Take crawlPage as an example:
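Below is a minimal sketch; the proxy option shape (a urls list plus switchByErrorCount and switchByHttpStatus thresholds) follows the description above but is an assumption rather than confirmed API:

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

myXCrawl.crawlPage({
  targets: ['https://www.example.com/page-1'],
  maxRetry: 10,
  proxy: {
    // Candidate proxies to rotate through
    urls: ['https://www.example.com/proxy-1', 'https://www.example.com/proxy-2'],
    // Switch proxy after this many errors on the current one
    switchByErrorCount: 3,
    // Switch proxy when the response returns one of these status codes
    switchByHttpStatus: [401, 403]
  }
})
```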
@@ -615,9 +621,9 @@ myXCrawl.crawlPage({
    'https://www.example.com/page-1',
    'https://www.example.com/page-2',
    'https://www.example.com/page-3',
-     // Unfingerprint for this target
+     // Cancel the fingerprint for this target
    { url: 'https://www.example.com/page-4', fingerprint: null },
-     // Set the fingerprint individually for this target
+     // Set a separate fingerprint for this target
    {
      url: 'https://www.example.com/page-5',
      fingerprint: {
@@ -635,8 +641,9 @@ myXCrawl.crawlPage({
      }
    }
  ],
-   // Set the fingerprint uniformly for this target
+   // Set fingerprints uniformly for this target
  fingerprints: [
+     // Device fingerprint 1
    {
      maxWidth: 1024,
      maxHeight: 800,
@@ -648,7 +655,7 @@ myXCrawl.crawlPage({
        versions: [
          {
            name: 'Chrome',
-             // browser version
+             // Browser version
            maxMajorVersion: 112,
            minMajorVersion: 100,
            maxMinorVersion: 20,
@@ -663,6 +670,44 @@ myXCrawl.crawlPage({
          }
        ]
      }
+     },
+     // Device fingerprint 2
+     {
+       platform: 'Windows',
+       mobile: 'random',
+       userAgent: {
+         value:
+           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59',
+         versions: [
+           {
+             name: 'Chrome',
+             maxMajorVersion: 91,
+             minMajorVersion: 88,
+             maxMinorVersion: 10,
+             maxPatchVersion: 5615
+           },
+           { name: 'Safari', maxMinorVersion: 36, maxPatchVersion: 2333 },
+           { name: 'Edg', maxMinorVersion: 10, maxPatchVersion: 864 }
+         ]
+       }
+     },
+     // Device fingerprint 3
+     {
+       platform: 'Windows',
+       mobile: 'random',
+       userAgent: {
+         value:
+           'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
+         versions: [
+           {
+             name: 'Firefox',
+             maxMajorVersion: 47,
+             minMajorVersion: 43,
+             maxMinorVersion: 10,
+             maxPatchVersion: 5000
+           }
+         ]
+       }
    }
  ]
})
@@ -1706,4 +1751,10 @@ export interface AnyObject extends Object {
## More
- If you have **problems, needs, good suggestions** please raise **Issues** in https://github.com/coder-hxl/x-crawl/issues.
+ ### Community
+
+ **GitHub Discussions:** Discuss through [GitHub Discussions](https://github.com/coder-hxl/x-crawl/discussions).
+
+ ### Issues
+
+ If you have questions, needs, or good suggestions, you can raise them at [GitHub Issues](https://github.com/coder-hxl/x-crawl/issues).