Skip to content

Commit 0a847f1

Browse files
committed
Update: Docs
1 parent b16cb70 commit 0a847f1

File tree

6 files changed

+320
-269
lines changed

6 files changed

+320
-269
lines changed

.prettierrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
"printWidth": 80,
55
"singleQuote": true,
66
"trailingComma": "none",
7-
"semi": false
7+
"semi": false,
8+
"overrides": [{ "files": "*.md", "options": { "printWidth": 100 } }]
89
}

README.md

Lines changed: 102 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -26,53 +26,53 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p
2626
- [Install](#Install)
2727
- [Example](#Example)
2828
- [Core concepts](#Core-concepts)
29-
* [Create application](#Create-application)
30-
+ [An example of a crawler application](#An-example-of-a-crawler-application)
31-
+ [Choose crawling mode](#Choose-crawling-mode)
32-
+ [Multiple crawler application instances](#Multiple-crawler-application-instances)
33-
* [Crawl page](#Crawl-page)
34-
+ [jsdom instance](#jsdom-instance)
35-
+ [browser instance](#browser-instance)
36-
+ [page instance](#page-instance)
37-
* [Crawl interface](#Crawl-interface)
38-
* [Crawl files](#Crawl-files)
39-
* [Start polling](#Start-polling)
40-
* [Crawl interval](#Crawl-interval)
41-
* [Multiple ways of writing requestConfig options](#Multiple-ways-of-writing-requestConfig-options)
42-
* [Multiple ways to get results](#Multiple-ways-to-get-results)
29+
- [Create application](#Create-application)
30+
- [An example of a crawler application](#An-example-of-a-crawler-application)
31+
- [Choose crawling mode](#Choose-crawling-mode)
32+
- [Multiple crawler application instances](#Multiple-crawler-application-instances)
33+
- [Crawl page](#Crawl-page)
34+
- [jsdom instance](#jsdom-instance)
35+
- [browser instance](#browser-instance)
36+
- [page instance](#page-instance)
37+
- [Crawl interface](#Crawl-interface)
38+
- [Crawl files](#Crawl-files)
39+
- [Start polling](#Start-polling)
40+
- [Crawl interval](#Crawl-interval)
41+
- [Multiple ways of writing requestConfig options](#Multiple-ways-of-writing-requestConfig-options)
42+
- [Multiple ways to get results](#Multiple-ways-to-get-results)
4343
- [API](#API)
44-
* [xCrawl](#xCrawl)
45-
+ [Type](#Type)
46-
+ [Example](#Example-1)
47-
* [crawlPage](#crawlPage)
48-
+ [Type](#Type-1)
49-
+ [Example](#Example-2)
50-
* [crawlData](#crawlData)
51-
+ [Type](#Type-2)
52-
+ [Example](#Example-3)
53-
* [crawlFile](#crawlFile)
54-
+ [Type](#Type-3)
55-
+ [Example](#Example-4)
56-
* [crawlPolling](#crawlPolling)
57-
+ [Type](#Type-4)
58-
+ [Example](#Example-5)
44+
- [xCrawl](#xCrawl)
45+
- [Type](#Type)
46+
- [Example](#Example-1)
47+
- [crawlPage](#crawlPage)
48+
- [Type](#Type-1)
49+
- [Example](#Example-2)
50+
- [crawlData](#crawlData)
51+
- [Type](#Type-2)
52+
- [Example](#Example-3)
53+
- [crawlFile](#crawlFile)
54+
- [Type](#Type-3)
55+
- [Example](#Example-4)
56+
- [crawlPolling](#crawlPolling)
57+
- [Type](#Type-4)
58+
- [Example](#Example-5)
5959
- [Types](#Types)
60-
* [AnyObject](#AnyObject)
61-
* [Method](#Method)
62-
* [RequestConfigObjectV1](#RequestConfigObjectV1)
63-
* [RequestConfigObjectV2](#RequestConfigObjectV2)
64-
* [RequestConfig](#RequestConfig)
65-
* [IntervalTime](#IntervalTime)
66-
* [XCrawlBaseConfig](#XCrawlBaseConfig)
67-
* [CrawlPageConfig](#CrawlPageConfig)
68-
* [CrawlBaseConfigV1](#CrawlBaseConfigV1)
69-
* [CrawlDataConfig](#CrawlDataConfig)
70-
* [CrawlFileConfig](#CrawlFileConfig)
71-
* [StartPollingConfig](#StartPollingConfig)
72-
* [CrawlResCommonV1](#CrawlResCommonV1)
73-
* [CrawlResCommonArrV1](#CrawlResCommonArrV1)
74-
* [CrawlPage](#CrawlPage-1)
75-
* [FileInfo](#FileInfo)
60+
- [AnyObject](#AnyObject)
61+
- [Method](#Method)
62+
- [RequestConfigObjectV1](#RequestConfigObjectV1)
63+
- [RequestConfigObjectV2](#RequestConfigObjectV2)
64+
- [RequestConfig](#RequestConfig)
65+
- [IntervalTime](#IntervalTime)
66+
- [XCrawlBaseConfig](#XCrawlBaseConfig)
67+
- [CrawlPageConfig](#CrawlPageConfig)
68+
- [CrawlBaseConfigV1](#CrawlBaseConfigV1)
69+
- [CrawlDataConfig](#CrawlDataConfig)
70+
- [CrawlFileConfig](#CrawlFileConfig)
71+
- [StartPollingConfig](#StartPollingConfig)
72+
- [CrawlResCommonV1](#CrawlResCommonV1)
73+
- [CrawlResCommonArrV1](#CrawlResCommonArrV1)
74+
- [CrawlPage](#CrawlPage-1)
75+
- [FileInfo](#FileInfo)
7676
- [More](#More)
7777

7878
## Install
@@ -107,9 +107,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
107107
const { jsdom } = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes')
108108

109109
// Get the cover image elements for Plus listings
110-
const imgEls = jsdom.window.document
111-
.querySelector('.a1stauiv')
112-
?.querySelectorAll('picture img')
110+
const imgEls = jsdom.window.document.querySelector('.a1stauiv')?.querySelectorAll('picture img')
113111

114112
// set request configuration
115113
const requestConfig: string[] = []
@@ -190,13 +188,13 @@ Crawl a page via [crawlPage()](#crawlPage) .
190188
```js
191189
import xCrawl from 'x-crawl'
192190

193-
const myXCrawl = xCrawl({
191+
const myXCrawl = xCrawl({
194192
timeout: 10000
195193
})
196194

197-
myXCrawl.crawlPage('https://xxx.com').then(res => {
195+
myXCrawl.crawlPage('https://xxx.com').then((res) => {
198196
const { jsdom, browser, page } = res
199-
197+
200198
// Close the browser
201199
browser.close()
202200
})
@@ -235,10 +233,10 @@ myXCrawl.crawlPage('https://www.xxx.com').then(async (res) => {
235233

236234
// Get the latest page content
237235
const content = await page.content()
238-
236+
239237
// Use the jsdom library to parse it yourself
240238
const jsdom = new JSDOM(content)
241-
239+
242240
console.log(jsdom.window.document.querySelector('title').textContent)
243241
})
244242
```
@@ -250,16 +248,14 @@ import xCrawl from 'x-crawl'
250248

251249
const myXCrawl = xCrawl({ timeout: 10000 })
252250

253-
myXCrawl
254-
.crawlPage('https://xxx.com')
255-
.then(async (res) => {
256-
const { page } = res
251+
myXCrawl.crawlPage('https://xxx.com').then(async (res) => {
252+
const { page } = res
257253

258-
// Get a screenshot of the rendered page
259-
await page.screenshot({ path: './upload/page.png' })
254+
// Get a screenshot of the rendered page
255+
await page.screenshot({ path: './upload/page.png' })
260256

261-
console.log('Screen capture is complete')
262-
})
257+
console.log('Screen capture is complete')
258+
})
263259
```
264260
265261
### Crawl interface
@@ -269,7 +265,7 @@ Crawl interface data through [crawlData()](#crawlData) .
269265
```js
270266
import xCrawl from 'x-crawl'
271267

272-
const myXCrawl = xCrawl({
268+
const myXCrawl = xCrawl({
273269
timeout: 10000,
274270
intervalTime: { max: 3000, min: 1000 }
275271
})
@@ -280,8 +276,8 @@ const requestConfig = [
280276
{ url: 'https://xxx.com/xxxx' }
281277
]
282278

283-
myXCrawl.crawlData({ requestConfig }).then(res => {
284-
// deal with
279+
myXCrawl.crawlData({ requestConfig }).then((res) => {
280+
// deal with
285281
})
286282
```
287283
@@ -292,12 +288,12 @@ Crawl file data via [crawlFile()](#crawlFile) .
292288
```js
293289
import xCrawl from 'x-crawl'
294290

295-
const myXCrawl = xCrawl({
291+
const myXCrawl = xCrawl({
296292
timeout: 10000,
297293
intervalTime: { max: 3000, min: 1000 }
298294
})
299295

300-
const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
296+
const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx']
301297

302298
myXCrawl
303299
.crawlFile({
@@ -318,12 +314,12 @@ Start a polling crawl with [startPolling()](#startPolling) .
318314
```js
319315
import xCrawl from 'x-crawl'
320316

321-
const myXCrawl = xCrawl({
317+
const myXCrawl = xCrawl({
322318
timeout: 10000,
323319
intervalTime: { max: 3000, min: 1000 }
324320
})
325321

326-
myXCrawl. startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
322+
myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
327323
// will be executed every two and a half hours
328324
// crawlPage/crawlData/crawlFile
329325
const { jsdom, browser, page } = await myXCrawl.crawlPage('https://xxx.com')
@@ -351,7 +347,7 @@ const myXCrawl = xCrawl({
351347

352348
// Set individually (high priority)
353349
myXCrawl.crawlFile({
354-
requestConfig: [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ],
350+
requestConfig: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
355351
intervalTime: { max: 2000, min: 1000 }
356352
})
357353
```
@@ -376,7 +372,7 @@ The writing method of requestConfig is very flexible, there are 5 types in total
376372
```js
377373
import xCrawl from 'x-crawl'
378374

379-
const myXCrawl = xCrawl({
375+
const myXCrawl = xCrawl({
380376
timeout: 10000,
381377
intervalTime: { max: 3000, min: 1000 }
382378
})
@@ -385,7 +381,7 @@ const myXCrawl = xCrawl({
385381
const requestConfig1 = 'https://xxx.com/xxxx'
386382

387383
// requestConfig writing method 2:
388-
const requestConfig2 = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
384+
const requestConfig2 = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx']
389385

390386
// requestConfig writing method 3:
391387
const requestConfig3 = {
@@ -408,7 +404,7 @@ const requestConfig5 = [
408404
'https://xxx.com/xxxx'
409405
]
410406

411-
myXCrawl.crawlData({ requestConfig: requestConfig5 }).then(res => {
407+
myXCrawl.crawlData({ requestConfig: requestConfig5 }).then((res) => {
412408
console.log(res)
413409
})
414410
```
@@ -432,7 +428,7 @@ const myXCrawl = xCrawl({
432428
intervalTime: { max: 3000, min: 1000 }
433429
})
434430

435-
const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
431+
const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx', 'https://xxx.com/xxxx']
436432

437433
// Method 1: Promise
438434
myXCrawl
@@ -534,7 +530,7 @@ const myXCrawl = xCrawl({ timeout: 10000 })
534530
myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {
535531
const { jsdom, browser, page } = res
536532
console.log(jsdom.window.document.querySelector('title')?.textContent)
537-
533+
538534
// Close the browser
539535
browser.close()
540536
})
@@ -574,7 +570,7 @@ const requestConfig = [
574570
]
575571

576572
// crawlData API
577-
myXCrawl.crawlData({ requestConfig }).then(res => {
573+
myXCrawl.crawlData({ requestConfig }).then((res) => {
578574
console.log(res)
579575
})
580576
```
@@ -607,7 +603,7 @@ const myXCrawl = xCrawl({
607603
intervalTime: { max: 2000, min: 1000 }
608604
})
609605

610-
const requestConfig = [ 'https://xxx.com/xxxx', 'https://xxx.com/xxxx' ]
606+
const requestConfig = ['https://xxx.com/xxxx', 'https://xxx.com/xxxx']
611607

612608
myXCrawl
613609
.crawlFile({
@@ -666,12 +662,32 @@ interface AnyObject extends Object {
666662
### Method
667663
668664
```ts
669-
type Method = 'get' | 'GET' | 'delete' | 'DELETE' | 'head' | 'HEAD' | 'options' | 'OPTIONS' | 'post' | 'POST' | 'put' | 'PUT' | 'patch' | 'PATCH' | 'purge' | 'PURGE' | 'link' | 'LINK' | 'unlink' | 'UNLINK'
665+
type Method =
666+
| 'get'
667+
| 'GET'
668+
| 'delete'
669+
| 'DELETE'
670+
| 'head'
671+
| 'HEAD'
672+
| 'options'
673+
| 'OPTIONS'
674+
| 'post'
675+
| 'POST'
676+
| 'put'
677+
| 'PUT'
678+
| 'patch'
679+
| 'PATCH'
680+
| 'purge'
681+
| 'PURGE'
682+
| 'link'
683+
| 'LINK'
684+
| 'unlink'
685+
| 'UNLINK'
670686
```
671687
672688
### RequestConfigObjectV1
673689
674-
```ts
690+
```ts
675691
interface RequestConfigObjectV1 {
676692
url: string
677693
headers?: AnyObject
@@ -682,7 +698,7 @@ interface RequestConfigObjectV1 {
682698
683699
### RequestConfigObjectV2
684700
685-
```ts
701+
```ts
686702
interface RequestConfigObjectV2 {
687703
url: string
688704
method?: Method
@@ -703,10 +719,12 @@ type RequestConfig = string | RequestConfigObjectV2
703719
### IntervalTime
704720
705721
```ts
706-
type IntervalTime = number | {
707-
max: number
708-
min?: number
709-
}
722+
type IntervalTime =
723+
| number
724+
| {
725+
max: number
726+
min?: number
727+
}
710728
```
711729
712730
### XCrawlBaseConfig
@@ -739,8 +757,7 @@ interface CrawlBaseConfigV1 {
739757
### CrawlDataConfig
740758
741759
```ts
742-
interface CrawlDataConfig extends CrawlBaseConfigV1 {
743-
}
760+
interface CrawlDataConfig extends CrawlBaseConfigV1 {}
744761
```
745762
746763
### CrawlFileConfig

0 commit comments

Comments
 (0)