Skip to content

Commit 9aa386f

Browse files
committed
Refactor: The crawlPage API can add batch requests - remove JSDOM
1 parent 67e25fa commit 9aa386f

File tree

17 files changed

+375
-665
lines changed

17 files changed

+375
-665
lines changed

README.md

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -106,14 +106,12 @@ const myXCrawl = xCrawl({
106106
*/
107107
myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
108108
// Call crawlPage API to crawl Page
109-
const { jsdom, page } = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes')
110-
111-
// Get the cover image elements for Plus listings
112-
const imgEls = jsdom.window.document.querySelector('.a1stauiv')?.querySelectorAll('picture img')
109+
const { page } = await myXCrawl.crawlPage('https://zh.airbnb.com/s/*/plus_homes')
113110

114111
// set request configuration
115-
const requestConfig: string[] = []
116-
imgEls?.forEach((item) => requestConfig.push(item.src))
112+
const requestConfig = await page.$$eval('picture img', (img) => {
113+
return img.map((item) => item.src)
114+
})
117115

118116
// Call the crawlFile API to crawl pictures
119117
myXCrawl.crawlFile({ requestConfig, fileConfig: { storeDir: './upload' } })

assets/cn/crawler.png

32.1 KB
Loading

assets/en/crawler-result.png

91.3 KB
Loading

assets/en/crawler.png

63.9 KB
Loading

docs/cn.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,12 @@ const myXCrawl = xCrawl({
103103
// 调用 startPolling API 开始轮询功能,每隔一天会调用回调函数
104104
myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
105105
// 调用 crawlPage API 爬取 Page
106-
const { jsdom, page } = await myXCrawl.crawlPage('https://www.bilibili.com/guochuang/')
106+
const { page } = await myXCrawl.crawlPage('https://www.bilibili.com/guochuang/')
107107

108-
// 获取轮播图片元素
109-
const imgEls = jsdom.window.document.querySelectorAll('.chief-recom-item img')
110-
111-
// 设置请求配置
112-
const requestConfig = []
113-
imgEls.forEach((item) => requestConfig.push(`https:${item.src}`))
108+
// 获取轮播图片元素的 URL ,设置请求配置
109+
const requestConfig = await page.$$eval('.chief-recom-item img', (imgEls) =>
110+
imgEls.map((item) => item.src)
111+
)
114112

115113
// 调用 crawlFile API 爬取图片
116114
myXCrawl.crawlFile({ requestConfig, fileConfig: { storeDir: './upload' } })

package.json

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
"dependencies": {
1919
"chalk": "4.1.2",
2020
"https-proxy-agent": "^5.0.1",
21-
"jsdom": "^21.1.0",
2221
"puppeteer": "^19.7.2",
2322
"x-crawl": "link:"
2423
},
@@ -29,7 +28,6 @@
2928
"@rollup/plugin-babel": "^6.0.3",
3029
"@rollup/plugin-run": "^3.0.1",
3130
"@rollup/plugin-terser": "^0.4.0",
32-
"@types/jsdom": "^21.0.0",
3331
"@types/node": "^18.11.18",
3432
"@typescript-eslint/eslint-plugin": "^5.48.2",
3533
"@typescript-eslint/parser": "^5.48.2",

0 commit comments

Comments
 (0)