Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .idea/.gitignore
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.idea 文件夹不应该提交。可以配置 .gitignore 文件
在文件底部,增加一个过滤

.idea

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry,抱歉,没注意这个东西,我也是第一次做这种pull request,原谅我新手了:(

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

可以修改后,重新再 push

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/jsLibraryMappings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/material_theme_project_new.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions .idea/node-reptile.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

303 changes: 148 additions & 155 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,93 +8,93 @@ const turndownService = new TurndownService();
const rules = require('./rules');

// Mutable run-time settings shared by the crawler functions in this file.
const configs = {
// Pagination cursor: sent (stringified) in the article-list request body and
// overwritten with the cursor returned by each response page.
cursor: 0,
// Crawl mode selected at the first prompt: 'user' or 'post'.
target: 'user',
// User ID entered at the prompt; sent as `user_id` in the article-list request.
userId: '',
// Article ID entered at the prompt when a single post is crawled.
postId: ''
}
cursor: 0,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

单引号没问题了,缩进现在看起来还是不太正常

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

可以使用 vscode 进行调整缩进

target: 'user',
userId: '',
postId: ''
};

// Output directories: generated Markdown files are written to docs/ and
// downloaded images to docs/images/.
const docsDir = path.join(__dirname, 'docs');
const imagesDir = path.join(docsDir, 'images');

// `recursive: true` creates docs/ and docs/images/ in a single call and does
// nothing when they already exist, so no separate existence check (and no
// check-then-create race) is needed.
fs.mkdirSync(imagesDir, { recursive: true });

const handleGrabArticles = (url, id) => {
request(url, async (error, response, body) => {
if (!error && response.statusCode === 200) {
// 解析DOM元素
const $ = cheerio.load(body);
// 获取文章内容
const imageElements = $('.markdown-body').find('img');

const tasks = imageElements.map((index, img) => {
const imageUrl = $(img).attr('src');
if (!imageUrl) return null

return new Promise((resolve, reject) => {
request.head(imageUrl, (err, res, body) => {
if (err) return null
// 获取文件扩展名
const contentType = res?.headers['content-type'];
let extname = contentType ? `.${contentType.split('/')[1]}` : '';
// 获取文件名
let filename = path.basename(imageUrl);

if (filename.indexOf('.awebp') !== -1) {
extname = ''
filename = filename.replace('.awebp', '.webp')
filename = filename.replace('.awebp?', '.webp')
filename = filename.replace('.webp?', '.webp')
}
// 创建写入流
const stream = fs.createWriteStream(path.join(__dirname, 'docs/images', filename + extname));
// 管道流
request(imageUrl)
.pipe(stream)
.on('close', () => {
$(img).attr('src', `./images/${filename + extname}`);
resolve();
});
});
});
});

const linkElements = $('.markdown-body').find('a');
linkElements?.map((index, link) => {
const url = $(link).attr('href')?.replace('https://link.juejin.cn?target=', '');
$(link).attr('href', decodeURIComponent(url));
})

turndownService.addRule('code', rules.code);
turndownService.addRule('style', rules.style);

const filename = $('title').text().replace(' - 掘金', '')?.trim();

await Promise.all(tasks);
const content = $('.markdown-body').html();
try {
if (!content) return
const description = $('meta[name="description"]').attr("content");
const keywords = $('meta[name="keywords"]').attr("content");
const datePublished = $('meta[itemprop="datePublished"]').attr("content");
// 转换为markdown
const markdown = turndownService.turndown(content);

const tags = keywords?.split(',') ?? [];

let tagStr = ``;
tags.forEach(tag => {
tagStr += `\n - ${tag}`
});

const contentMarkdown = `---
if (!fs.existsSync(docsDir)) fs.mkdirSync(docsDir);
if (!fs.existsSync(imagesDir)) fs.mkdirSync(imagesDir);

// 爬取文章
const handleGrabArticles = async (url, id) => {
try {
console.log(`开始抓取文章: ${url}`);
const body = await new Promise((resolve, reject) => {
request(url, (error, response, body) => {
if (error) return reject(error);
if (response.statusCode !== 200) return reject(new Error(`状态码: ${response.statusCode}`));
resolve(body);
});
});

// 解析DOM元素
const $ = cheerio.load(body);
const content = $('.markdown-body').html();
if (!content) throw new Error('无法找到文章内容');

const imageElements = $('.markdown-body').find('img');
console.log(`找到 ${imageElements.length} 个图片元素`);

const tasks = imageElements.map((index, img) => {
const imageUrl = $(img).attr('src');
if (!imageUrl) return null;

return new Promise((resolve, reject) => {
request.head(imageUrl, (err, res) => {
if (err) return reject(err);

const contentType = res.headers['content-type'];
let extname = contentType ? `.${contentType.split('/')[1]}` : '';
let filename = path.basename(imageUrl).replace(/[^a-zA-Z0-9.-]/g, '_').split('_')[0];

if (filename.indexOf('.awebp') !== -1) {
extname = '.webp';
filename = filename.replace('.awebp', '');
}

if (filename.length > 200) filename = filename.substring(0, 200);

const filePath = path.join(imagesDir, `${filename}${extname}`);
const stream = fs.createWriteStream(filePath);

request(imageUrl)
.pipe(stream)
.on('finish', () => {
$(img).attr('src', `./images/${filename}${extname}`);
resolve();
})
.on('error', reject);
});
});
}).get();

await Promise.all(tasks.filter((task) => task !== null));
console.log('所有图片下载完成');

const filename = $('title').text().replace(' - 掘金', '')?.trim();
console.log(`文章标题: ${filename}`);

turndownService.addRule('code', rules.code);
turndownService.addRule('style', rules.style);
const markdown = turndownService.turndown(content);

const description = $('meta[name="description"]').attr('content');
const keywords = $('meta[name="keywords"]').attr('content');
const datePublished = $('meta[itemprop="datePublished"]').attr('content');
const tags = keywords?.split(',') ?? [];

let tagStr = '';
tags.forEach((tag) => {
tagStr += `\n - ${tag}`;
});

const contentMarkdown = `---
title: "${filename}"
date: ${datePublished}
tags: ${tagStr}
Expand All @@ -115,84 +115,77 @@ head:

${markdown}
`;
// 写入文件
fs.writeFileSync(`docs/${id}.md`, contentMarkdown);
console.log(`文件已生成:${filename} -> ${id}`);
} catch (error) {
console.log(error);
console.log(`错误文章为${url}`);
}
}
});
}

const filePath = path.join(docsDir, `${filename}.md`);
fs.writeFileSync(filePath, contentMarkdown);
console.log(`文件已生成:${filename} -> ${filePath}`);
} catch (error) {
console.error(`处理文章时出错: ${error}`);
}
};

/**
 * Builds the options for one POST to the Juejin article-list endpoint.
 *
 * `configs.cursor` and `configs.userId` are read at call time, so every call
 * reflects the current pagination position.
 *
 * @returns {{url: string, body: string, headers: Object}} Request options
 *   whose `body` is a JSON string.
 */
const getRequestOptions = () => ({
  url: 'https://api.juejin.cn/content_api/v1/article/query_list',
  body: JSON.stringify({
    // The cursor is always sent as a string, whatever type it is stored as.
    cursor: String(configs.cursor),
    sort_type: 2,
    user_id: configs.userId
  }),
  headers: {
    'content-type': 'application/json'
  }
});

const postList = []
const postList = [];

/**
 * Collects the article IDs of the configured user by paging through the
 * article-list API, then starts a grab for every collected article.
 *
 * Each response page appends its `article_id`s to `postList`. While the
 * response reports `has_more`, the returned cursor is stored in
 * `configs.cursor` and the next page is requested. After the last page,
 * `handleGrabArticles` is called once per collected ID.
 *
 * Failures (transport error, non-200 status, unparsable body) are logged and
 * end the paging; nothing is thrown to the caller.
 *
 * @param {Object} requestOptions Options passed to `request.post`.
 * @returns {void}
 */
const handleGrabUserArticles = (requestOptions) => {
  request.post(requestOptions, (error, response, body) => {
    if (error || response.statusCode !== 200) {
      console.error(`请求用户文章时出错: ${error || response.statusCode}`);
      return;
    }

    // The body is parsed inside an asynchronous callback, where a thrown
    // SyntaxError would surface as an uncaught exception; log it instead.
    let payload;
    try {
      payload = JSON.parse(body);
    } catch (parseError) {
      console.error(`请求用户文章时出错: ${parseError}`);
      return;
    }

    const { data, has_more: hasMore, cursor } = payload ?? {};
    if (Array.isArray(data) && data.length > 0) {
      postList.push(...data.map((article) => article.article_id));
    }

    if (hasMore) {
      configs.cursor = cursor;
      handleGrabUserArticles(getRequestOptions());
      return;
    }

    postList.forEach((id) => handleGrabArticles(`https://juejin.cn/post/${id}`, id));
  });
};

/**
 * Asks for one free-text ID and returns it trimmed.
 *
 * @param {string} message Prompt text shown to the user.
 * @returns {Promise<string>} The trimmed, non-empty answer.
 * @throws {Error} When the answer is missing or blank, so that no request is
 *   built from an empty ID.
 */
const promptForId = async (message) => {
  const { prompt: answer } = await inquirer.prompt({
    type: 'input',
    name: 'prompt',
    message
  });
  const id = answer?.trim();
  if (!id) throw new Error(`${message}: 输入不能为空`);
  return id;
};

/**
 * Interactive entry point: asks which crawl mode to use, then asks for the
 * matching ID and starts the crawl.
 *
 * - 'user': stores the ID in `configs.userId` and starts the paged article
 *   listing via `handleGrabUserArticles` (callback-based, so not awaited).
 * - 'post': stores the ID in `configs.postId`, awaits `handleGrabArticles`
 *   for that single article and then logs completion.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const { model: target } = await inquirer.prompt({
    type: 'list',
    name: 'model',
    message: '请选择爬取目标方式',
    choices: [
      { name: '通过用户 ID 爬取', value: 'user' },
      { name: '通过文章 ID 爬取', value: 'post' }
    ],
    default: configs.target
  });

  configs.target = target;

  if (configs.target === 'user') {
    configs.userId = await promptForId('请输入用户 ID');
    handleGrabUserArticles(getRequestOptions());
    return;
  }

  configs.postId = await promptForId('请输入文章 ID');
  // The article ID is passed as the second argument as well as in the URL.
  await handleGrabArticles(`https://juejin.cn/post/${configs.postId}`, configs.postId);
  console.log('程序执行完毕');
};

// Single invocation; any rejection from main is reported instead of being
// left as an unhandled promise rejection.
main().catch((error) => console.error('程序执行出错:', error));