Commit 15639c4 (1 parent: c895399)

Add: The crawlFile API adds a beforeSave callback function, allowing users to customize the storage path and name of each requested file

File tree

6 files changed: +182 additions, −28 deletions

src/api.ts

Lines changed: 73 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import {
1313
log,
1414
logError,
1515
logSuccess,
16-
logWarn
16+
logWarn,
17+
mkdirDirSync
1718
} from './utils'
1819

1920
import {
@@ -146,7 +147,7 @@ function loaderCommonConfig(
146147
priority = 0
147148
}
148149

149-
return { url, timeout, proxy, maxRetry, priority }
150+
return { ...requestConfig, url, timeout, proxy, maxRetry, priority }
150151
})
151152

152153
// 2.intervalTime
@@ -258,6 +259,23 @@ function loaderFileConfig(
258259
// 装载公共配置到 loaderConfig
259260
loaderCommonConfig(baseConfig, requestObjecs, loaderConfig)
260261

262+
// 装载单独的配置
263+
loaderConfig.requestConfigs.forEach((requestConfig) => {
264+
if (
265+
isUndefined(requestConfig.storeDir) &&
266+
!isUndefined(rawConfig.fileConfig?.storeDir)
267+
) {
268+
requestConfig.storeDir = rawConfig.fileConfig!.storeDir
269+
}
270+
271+
if (
272+
isUndefined(requestConfig.extension) &&
273+
!isUndefined(rawConfig.fileConfig?.extension)
274+
) {
275+
requestConfig.extension = rawConfig.fileConfig!.extension
276+
}
277+
})
278+
261279
return loaderConfig
262280
}
263281

@@ -494,10 +512,6 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) {
494512
config
495513
)
496514

497-
if (!fs.existsSync(fileConfig.storeDir)) {
498-
fs.mkdirSync(fileConfig.storeDir)
499-
}
500-
501515
const controllerRes = await controller(
502516
'file',
503517
baseConfig.mode,
@@ -517,7 +531,8 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) {
517531
maxRetry,
518532
crawlCount,
519533
errorQueue,
520-
crawlSingleRes
534+
crawlSingleRes,
535+
requestConfig
521536
} = item
522537

523538
const crawlRes: CrawlFileSingleRes = {
@@ -532,14 +547,47 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) {
532547

533548
if (isSuccess && crawlSingleRes) {
534549
const mimeType = crawlSingleRes.headers['content-type'] ?? ''
535-
const fileExtension = fileConfig.extension ?? mimeType.split('/').pop()
536-
const fileName = new Date().getTime().toString()
537-
const filePath = path.resolve(
538-
fileConfig.storeDir,
539-
`${fileName}.${fileExtension}`
540-
)
550+
let fileName = ''
551+
let fileExtension = ''
541552

542-
const saveFileItem = writeFile(filePath, crawlSingleRes.data)
553+
if (!isUndefined(requestConfig.fileName)) {
554+
fileName = requestConfig.fileName
555+
} else {
556+
fileName = new Date().getTime().toString()
557+
}
558+
559+
if (!isUndefined(requestConfig.extension)) {
560+
fileExtension = requestConfig.extension
561+
} else {
562+
fileExtension = '.' + mimeType.split('/').pop()
563+
}
564+
565+
if (
566+
!isUndefined(requestConfig.storeDir) &&
567+
!fs.existsSync(requestConfig.storeDir)
568+
) {
569+
mkdirDirSync(requestConfig.storeDir)
570+
}
571+
572+
const storePath = requestConfig.storeDir ?? __dirname
573+
const filePath = path.resolve(storePath, fileName + fileExtension)
574+
575+
// 在保存前的回调
576+
let data = crawlSingleRes.data
577+
if (fileConfig?.beforeSave) {
578+
const newData = fileConfig.beforeSave({
579+
id,
580+
fileName,
581+
filePath,
582+
data
583+
})
584+
585+
if (newData) {
586+
data = newData
587+
}
588+
}
589+
590+
const saveFileItem = writeFile(filePath, data)
543591
.catch((err) => {
544592
const message = `File save error at id ${id}: ${err.message}`
545593
const valueOf = () => id
@@ -551,9 +599,18 @@ export function createCrawlFile(baseConfig: LoaderXCrawlBaseConfig) {
551599
.then((isError) => {
552600
const size = crawlSingleRes.data.length
553601
const isSuccess = !isError
554-
const fileInfo = { isSuccess, fileName, mimeType, size, filePath }
555602

556-
crawlRes.data = { ...crawlSingleRes, data: fileInfo }
603+
crawlRes.data = {
604+
...crawlSingleRes,
605+
data: {
606+
isSuccess,
607+
fileName,
608+
fileExtension,
609+
mimeType,
610+
size,
611+
filePath
612+
}
613+
}
557614

558615
if (callback) {
559616
callback(crawlRes)

src/types/api.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ export interface FileRequestConfig {
9494
proxy?: string
9595
maxRetry?: number
9696
priority?: number
97+
storeDir?: string
98+
fileName?: string
99+
extension?: string
97100
}
98101

99102
// CrawlConfig
@@ -134,9 +137,15 @@ export interface CrawlFileConfig<R extends CrawlFileRequestConfig> {
134137
timeout?: number
135138
intervalTime?: IntervalTime
136139
maxRetry?: number
137-
fileConfig: {
138-
storeDir: string
140+
fileConfig?: {
141+
storeDir?: string
139142
extension?: string
143+
beforeSave?: (info: {
144+
id: number
145+
fileName: string
146+
filePath: string
147+
data: Buffer
148+
}) => Buffer | void
140149
}
141150
}
142151

@@ -179,6 +188,7 @@ export interface CrawlFileSingleRes extends CrawlCommonRes {
179188
data: {
180189
isSuccess: boolean
181190
fileName: string
191+
fileExtension: string
182192
mimeType: string
183193
size: number
184194
filePath: string

src/utils.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import fs from 'node:fs'
2+
import path from 'node:path'
13
import chalk from 'chalk'
24

35
export function sleep(timeout: number) {
@@ -14,6 +16,20 @@ export function random(max: number, min = 0) {
1416
return res
1517
}
1618

19+
export function mkdirDirSync(dir: string) {
20+
const dirSplit = path.resolve(dir).split(path.sep)
21+
22+
dirSplit.reduce((prev, item, index) => {
23+
const currentDir = index !== 0 ? path.join(prev, item) : item
24+
25+
if (!fs.existsSync(currentDir)) {
26+
fs.mkdirSync(currentDir)
27+
}
28+
29+
return currentDir
30+
}, '')
31+
}
32+
1733
export const log = console.log
1834
export const logNumber = chalk.hex('#a57fff')
1935
export const logSuccess = chalk.green

test/environment/crawlFile.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,40 @@ async function loaderAPIConfig() {
103103
return res.reduce((prev, item) => prev && item.isSuccess, true)
104104
}
105105

106+
/* 3.Store Config */
107+
async function storeConfig() {
108+
const testXCrawl = xCrawl({
109+
baseUrl:
110+
'https://raw.githubusercontent.com/coder-hxl/airbnb-upload/master/area',
111+
proxy: 'http://localhost:14892'
112+
})
113+
114+
const record: string[] = []
115+
const res = await testXCrawl.crawlFile({
116+
requestConfig: [
117+
{ url: '/4401.jpg', fileName: '4401' },
118+
{ url: '/4403.jpg', fileName: '4403' }
119+
],
120+
fileConfig: {
121+
storeDir: path.resolve(__dirname, './upload'),
122+
extension: '.jpg',
123+
beforeSave(info) {
124+
record.push(info.fileName)
125+
}
126+
}
127+
})
128+
129+
let isSuccess = true
130+
res.forEach((item) => {
131+
if (isSuccess) {
132+
const hasName = record.includes(item.data?.data.fileName ?? '')
133+
isSuccess = item.isSuccess && hasName
134+
}
135+
})
136+
137+
return isSuccess
138+
}
139+
106140
/* 1.Written */
107141
test('crawlFile - writtenString', async () => {
108142
console.log(
@@ -147,3 +181,11 @@ test('crawlFile - loaderAPIConfig', async () => {
147181
)
148182
await expect(loaderAPIConfig()).resolves.toBe(true)
149183
})
184+
185+
/* 3.Store Config */
186+
test('crawlFile - storeConfig', async () => {
187+
console.log(
188+
chalk.bgGreen('================ crawlFile - storeConfig ================')
189+
)
190+
await expect(storeConfig()).resolves.toBe(true)
191+
})

Comments (0)