-
Notifications
You must be signed in to change notification settings - Fork 1
使用AI更新index.js代码,添加报错提示和错误处理,显示处理的结果 #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,93 +8,93 @@ const turndownService = new TurndownService(); | |
| const rules = require('./rules'); | ||
|
|
||
| const configs = { | ||
| cursor: 0, | ||
| target: 'user', | ||
| userId: '', | ||
| postId: '' | ||
| } | ||
| cursor: 0, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 单引号没问题了,缩进现在看起来还是不太正常。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 可以使用 vscode 进行调整缩进。 |
||
| target: 'user', | ||
| userId: '', | ||
| postId: '' | ||
| }; | ||
|
|
||
| // 创建目录 | ||
| const docsDir = path.join(__dirname, 'docs'); | ||
| const imagesDir = path.join(__dirname, 'docs/images'); | ||
|
|
||
| if (!fs.existsSync(docsDir)) { | ||
| fs.mkdirSync(docsDir); | ||
| } | ||
|
|
||
| if (!fs.existsSync(imagesDir)) { | ||
| fs.mkdirSync(imagesDir); | ||
| } | ||
|
|
||
| const handleGrabArticles = (url, id) => { | ||
| request(url, async (error, response, body) => { | ||
| if (!error && response.statusCode === 200) { | ||
| // 解析DOM元素 | ||
| const $ = cheerio.load(body); | ||
| // 获取文章内容 | ||
| const imageElements = $('.markdown-body').find('img'); | ||
|
|
||
| const tasks = imageElements.map((index, img) => { | ||
| const imageUrl = $(img).attr('src'); | ||
| if (!imageUrl) return null | ||
|
|
||
| return new Promise((resolve, reject) => { | ||
| request.head(imageUrl, (err, res, body) => { | ||
| if (err) return null | ||
| // 获取文件扩展名 | ||
| const contentType = res?.headers['content-type']; | ||
| let extname = contentType ? `.${contentType.split('/')[1]}` : ''; | ||
| // 获取文件名 | ||
| let filename = path.basename(imageUrl); | ||
|
|
||
| if (filename.indexOf('.awebp') !== -1) { | ||
| extname = '' | ||
| filename = filename.replace('.awebp', '.webp') | ||
| filename = filename.replace('.awebp?', '.webp') | ||
| filename = filename.replace('.webp?', '.webp') | ||
| } | ||
| // 创建写入流 | ||
| const stream = fs.createWriteStream(path.join(__dirname, 'docs/images', filename + extname)); | ||
| // 管道流 | ||
| request(imageUrl) | ||
| .pipe(stream) | ||
| .on('close', () => { | ||
| $(img).attr('src', `./images/${filename + extname}`); | ||
| resolve(); | ||
| }); | ||
| }); | ||
| }); | ||
| }); | ||
|
|
||
| const linkElements = $('.markdown-body').find('a'); | ||
| linkElements?.map((index, link) => { | ||
| const url = $(link).attr('href')?.replace('https://link.juejin.cn?target=', ''); | ||
| $(link).attr('href', decodeURIComponent(url)); | ||
| }) | ||
|
|
||
| turndownService.addRule('code', rules.code); | ||
| turndownService.addRule('style', rules.style); | ||
|
|
||
| const filename = $('title').text().replace(' - 掘金', '')?.trim(); | ||
|
|
||
| await Promise.all(tasks); | ||
| const content = $('.markdown-body').html(); | ||
| try { | ||
| if (!content) return | ||
| const description = $('meta[name="description"]').attr("content"); | ||
| const keywords = $('meta[name="keywords"]').attr("content"); | ||
| const datePublished = $('meta[itemprop="datePublished"]').attr("content"); | ||
| // 转换为markdown | ||
| const markdown = turndownService.turndown(content); | ||
|
|
||
| const tags = keywords?.split(',') ?? []; | ||
|
|
||
| let tagStr = ``; | ||
| tags.forEach(tag => { | ||
| tagStr += `\n - ${tag}` | ||
| }); | ||
|
|
||
| const contentMarkdown = `--- | ||
| if (!fs.existsSync(docsDir)) fs.mkdirSync(docsDir); | ||
| if (!fs.existsSync(imagesDir)) fs.mkdirSync(imagesDir); | ||
|
|
||
| // 爬取文章 | ||
| const handleGrabArticles = async (url, id) => { | ||
| try { | ||
| console.log(`开始抓取文章: ${url}`); | ||
| const body = await new Promise((resolve, reject) => { | ||
| request(url, (error, response, body) => { | ||
| if (error) return reject(error); | ||
| if (response.statusCode !== 200) return reject(new Error(`状态码: ${response.statusCode}`)); | ||
| resolve(body); | ||
| }); | ||
| }); | ||
|
|
||
| // 解析DOM元素 | ||
| const $ = cheerio.load(body); | ||
| const content = $('.markdown-body').html(); | ||
| if (!content) throw new Error('无法找到文章内容'); | ||
|
|
||
| const imageElements = $('.markdown-body').find('img'); | ||
| console.log(`找到 ${imageElements.length} 个图片元素`); | ||
|
|
||
| const tasks = imageElements.map((index, img) => { | ||
| const imageUrl = $(img).attr('src'); | ||
| if (!imageUrl) return null; | ||
|
|
||
| return new Promise((resolve, reject) => { | ||
| request.head(imageUrl, (err, res) => { | ||
| if (err) return reject(err); | ||
|
|
||
| const contentType = res.headers['content-type']; | ||
| let extname = contentType ? `.${contentType.split('/')[1]}` : ''; | ||
| let filename = path.basename(imageUrl).replace(/[^a-zA-Z0-9.-]/g, '_').split('_')[0]; | ||
|
|
||
| if (filename.indexOf('.awebp') !== -1) { | ||
| extname = '.webp'; | ||
| filename = filename.replace('.awebp', ''); | ||
| } | ||
|
|
||
| if (filename.length > 200) filename = filename.substring(0, 200); | ||
|
|
||
| const filePath = path.join(imagesDir, `${filename}${extname}`); | ||
| const stream = fs.createWriteStream(filePath); | ||
|
|
||
| request(imageUrl) | ||
| .pipe(stream) | ||
| .on('finish', () => { | ||
| $(img).attr('src', `./images/${filename}${extname}`); | ||
| resolve(); | ||
| }) | ||
| .on('error', reject); | ||
| }); | ||
| }); | ||
| }).get(); | ||
|
|
||
| await Promise.all(tasks.filter((task) => task !== null)); | ||
| console.log('所有图片下载完成'); | ||
|
|
||
| const filename = $('title').text().replace(' - 掘金', '')?.trim(); | ||
| console.log(`文章标题: ${filename}`); | ||
|
|
||
| turndownService.addRule('code', rules.code); | ||
| turndownService.addRule('style', rules.style); | ||
| const markdown = turndownService.turndown(content); | ||
|
|
||
| const description = $('meta[name="description"]').attr('content'); | ||
| const keywords = $('meta[name="keywords"]').attr('content'); | ||
| const datePublished = $('meta[itemprop="datePublished"]').attr('content'); | ||
| const tags = keywords?.split(',') ?? []; | ||
|
|
||
| let tagStr = ''; | ||
| tags.forEach((tag) => { | ||
| tagStr += `\n - ${tag}`; | ||
| }); | ||
|
|
||
| const contentMarkdown = `--- | ||
| title: "${filename}" | ||
| date: ${datePublished} | ||
| tags: ${tagStr} | ||
|
|
@@ -115,84 +115,77 @@ head: | |
|
|
||
| ${markdown} | ||
| `; | ||
| // 写入文件 | ||
| fs.writeFileSync(`docs/${id}.md`, contentMarkdown); | ||
| console.log(`文件已生成:${filename} -> ${id}`); | ||
| } catch (error) { | ||
| console.log(error); | ||
| console.log(`错误文章为${url}`); | ||
| } | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| const filePath = path.join(docsDir, `${filename}.md`); | ||
| fs.writeFileSync(filePath, contentMarkdown); | ||
| console.log(`文件已生成:${filename} -> ${filePath}`); | ||
| } catch (error) { | ||
| console.error(`处理文章时出错: ${error}`); | ||
| } | ||
| }; | ||
|
|
||
| const getRequestOptions = () => ({ | ||
| url: 'https://api.juejin.cn/content_api/v1/article/query_list', | ||
| body: JSON.stringify({ | ||
| cursor: String(configs.cursor), | ||
| sort_type: 2, | ||
| user_id: configs.userId | ||
| }), | ||
| headers: { | ||
| 'content-type': 'application/json' | ||
| } | ||
| url: 'https://api.juejin.cn/content_api/v1/article/query_list', | ||
| body: JSON.stringify({ | ||
| cursor: String(configs.cursor), | ||
| sort_type: 2, | ||
| user_id: configs.userId | ||
| }), | ||
| headers: { | ||
| 'content-type': 'application/json' | ||
| } | ||
| }); | ||
|
|
||
| const postList = [] | ||
| const postList = []; | ||
|
|
||
| const handleGrabUserArticles = (requestOptions) => { | ||
| request.post(requestOptions, (error, response, body) => { | ||
| if (!error && response.statusCode === 200) { | ||
| const { data = [], has_more, cursor } = JSON.parse(body); | ||
|
|
||
| if (data?.length) { | ||
| postList.push(...data?.map(article => article.article_id)); | ||
| } | ||
|
|
||
| if (has_more) { | ||
| configs.cursor = cursor; | ||
| handleGrabUserArticles(getRequestOptions()); | ||
| } else { | ||
| postList.forEach(id => handleGrabArticles(`https://juejin.cn/post/${id}`, id)); | ||
| } | ||
| } | ||
| }) | ||
| } | ||
| request.post(requestOptions, (error, response, body) => { | ||
| if (error || response.statusCode !== 200) return console.error(`请求用户文章时出错: ${error || response.statusCode}`); | ||
|
|
||
| const { data = [], has_more, cursor } = JSON.parse(body); | ||
| if (data?.length) postList.push(...data.map((article) => article.article_id)); | ||
|
|
||
| if (has_more) { | ||
| configs.cursor = cursor; | ||
| handleGrabUserArticles(getRequestOptions()); | ||
| } else { | ||
| postList.forEach((id) => handleGrabArticles(`https://juejin.cn/post/${id}`, id)); | ||
| } | ||
| }); | ||
| }; | ||
|
|
||
| const main = async () => { | ||
| const { model: target } = await inquirer.prompt({ | ||
| type: 'list', | ||
| name: 'model', | ||
| message: '请选择爬取目标方式', | ||
| choices: [ | ||
| { name: '通过用户 ID 爬取', value: 'user' }, | ||
| { name: '文通过文章 ID 爬取章', value: 'post' }, | ||
| ], | ||
| default: configs.target | ||
| }) | ||
|
|
||
| configs.target = target; | ||
|
|
||
| if (configs.target === 'user') { | ||
| const { prompt: userId } = await inquirer.prompt({ | ||
| type: 'input', | ||
| name: 'prompt', | ||
| message: '请输入用户 ID', | ||
| }); | ||
| configs.userId = userId?.trim(); | ||
|
|
||
| handleGrabUserArticles(getRequestOptions()) | ||
|
|
||
| } else { | ||
| const { prompt: postId } = await inquirer.prompt({ | ||
| type: 'input', | ||
| name: 'prompt', | ||
| message: '请输入文章 ID', | ||
| }); | ||
| configs.postId = postId?.trim();; | ||
|
|
||
| handleGrabArticles(`https://juejin.cn/post/${configs.postId}`) | ||
| } | ||
| } | ||
|
|
||
| main(); | ||
| const { model: target } = await inquirer.prompt({ | ||
| type: 'list', | ||
| name: 'model', | ||
| message: '请选择爬取目标方式', | ||
| choices: [ | ||
| { name: '通过用户 ID 爬取', value: 'user' }, | ||
| { name: '通过文章 ID 爬取', value: 'post' } | ||
| ], | ||
| default: configs.target | ||
| }); | ||
|
|
||
| configs.target = target; | ||
|
|
||
| if (configs.target === 'user') { | ||
| const { prompt: userId } = await inquirer.prompt({ | ||
| type: 'input', | ||
| name: 'prompt', | ||
| message: '请输入用户 ID' | ||
| }); | ||
| configs.userId = userId?.trim(); | ||
| handleGrabUserArticles(getRequestOptions()); | ||
| } else { | ||
| const { prompt: postId } = await inquirer.prompt({ | ||
| type: 'input', | ||
| name: 'prompt', | ||
| message: '请输入文章 ID' | ||
| }); | ||
| configs.postId = postId?.trim(); | ||
| await handleGrabArticles(`https://juejin.cn/post/${configs.postId}`, configs.postId); | ||
| console.log('程序执行完毕'); | ||
| } | ||
| }; | ||
|
|
||
| main().catch((error) => console.error('程序执行出错:', error)); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
.idea 文件夹不应该提交。可以配置 .gitignore 文件:
在文件底部增加一条过滤规则(例如 `.idea/`)。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry,抱歉,没注意这个东西,我也是第一次做这种pull request,原谅我新手了:(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
可以修改后,重新再 push