Skip to content

Commit d578bdc

Browse files
authored
Merge pull request #91 from extractus/dev
v6.2.3
2 parents 386e483 + 531bec1 commit d578bdc

16 files changed

+1237
-131
lines changed

.eslintrc.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
"max-lines": [
6363
"error",
6464
{
65-
"max": 360,
65+
"max": 460,
6666
"skipBlankLines": true,
6767
"skipComments": false
6868
}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ Object with all or several of the following properties:
134134
- `xmlParserOptions`: Object, used by xml parser, view [fast-xml-parser's docs](https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md)
135135
- `getExtraFeedFields`: Function, to get more fields from feed data
136136
- `getExtraEntryFields`: Function, to get more fields from feed entry data
137+
- `baseUrl`: URL string, used to convert relative links within the feed content into absolute URLs
137138

138139
For example:
139140

dist/cjs/feed-extractor.js

Lines changed: 142 additions & 55 deletions
Large diffs are not rendered by default.

dist/cjs/index.d.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ export interface ReaderOptions {
4747
* https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md
4848
*/
4949
xmlParserOptions?: any;
50+
/**
51+
* base URL used to complete links that do not include one (i.e. relative links)
52+
* default: ''
53+
*/
54+
baseUrl?: string;
5055
/**
5156
* merge extra feed fields in result
5257
*/

dist/cjs/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"name": "@extractus/feed-extractor",
3-
"version": "6.2.2",
3+
"version": "6.2.3",
44
"main": "./feed-extractor.js"
55
}

dist/feed-extractor.esm.js

Lines changed: 75 additions & 49 deletions
Large diffs are not rendered by default.

index.d.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ export interface ReaderOptions {
4747
* https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md
4848
*/
4949
xmlParserOptions?: any;
50+
/**
51+
* base URL used to complete links that do not include one (i.e. relative links)
52+
* default: ''
53+
*/
54+
baseUrl?: string;
5055
/**
5156
* merge extra feed fields in result
5257
*/

package.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"version": "6.2.2",
2+
"version": "6.2.3",
33
"name": "@extractus/feed-extractor",
44
"description": "To read and normalize RSS/ATOM/JSON feed data",
55
"homepage": "https://extractor-demos.pages.dev",
@@ -36,13 +36,13 @@
3636
},
3737
"dependencies": {
3838
"bellajs": "^11.1.2",
39-
"cross-fetch": "^3.1.5",
40-
"fast-xml-parser": "^4.2.2",
41-
"html-entities": "^2.3.3"
39+
"cross-fetch": "^3.1.6",
40+
"fast-xml-parser": "^4.2.4",
41+
"html-entities": "^2.3.6"
4242
},
4343
"devDependencies": {
44-
"esbuild": "^0.17.18",
45-
"eslint": "^8.40.0",
44+
"esbuild": "^0.18.2",
45+
"eslint": "^8.42.0",
4646
"jest": "^29.5.0",
4747
"nock": "^13.3.1"
4848
},

src/main.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ const getopt = (options = {}) => {
1414
descriptionMaxLen = 210,
1515
useISODateFormat = true,
1616
xmlParserOptions = {},
17+
baseUrl = '',
1718
getExtraFeedFields = () => ({}),
1819
getExtraEntryFields = () => ({}),
1920
} = options
@@ -23,6 +24,7 @@ const getopt = (options = {}) => {
2324
descriptionMaxLen,
2425
useISODateFormat,
2526
xmlParserOptions,
27+
baseUrl,
2628
getExtraFeedFields,
2729
getExtraEntryFields,
2830
}
@@ -51,6 +53,7 @@ export const extract = async (url, options = {}, fetchOptions = {}) => {
5153
if (!isValidUrl(url)) {
5254
throw new Error('Input param must be a valid URL')
5355
}
56+
5457
const data = await retrieve(url, fetchOptions)
5558
if (!data.text && !data.json) {
5659
throw new Error(`Failed to load content from "${url}"`)

src/main.test.js

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import nock from 'nock'
77

88
import { hasProperty, isString } from 'bellajs'
99

10-
import { extract, read } from './main.js'
10+
import { extract, extractFromXml, extractFromJson, read } from './main.js'
1111
import { isValid as isValidUrl } from './utils/linker.js'
1212

1313
const feedAttrs = 'title link description generator language published entries'.split(' ')
@@ -336,6 +336,65 @@ describe('test extract() without normalization', () => {
336336
})
337337
})
338338

339+
describe('test extract with `baseUrl` option', () => {
340+
test('extract rss feed with xml', () => {
341+
const baseUrl = 'https://huggingface.co'
342+
const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
343+
const result = extractFromXml(xml, { baseUrl })
344+
345+
feedAttrs.forEach((k) => {
346+
expect(hasProperty(result, k)).toBe(true)
347+
})
348+
349+
entryAttrs.forEach((k) => {
350+
expect(hasProperty(result.entries[0], k)).toBe(true)
351+
})
352+
353+
expect(validateProps(result.entries[0])).toBe(true)
354+
expect(result.link).toBe(baseUrl + '/blog')
355+
expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
356+
})
357+
358+
test('extract rss feed with json', () => {
359+
const baseUrl = 'https://www.jsonfeed.org'
360+
const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
361+
const result = extractFromJson(JSON.parse(json), { baseUrl })
362+
363+
feedAttrs.forEach((k) => {
364+
expect(hasProperty(result, k)).toBe(true)
365+
})
366+
367+
entryAttrs.forEach((k) => {
368+
expect(hasProperty(result.entries[0], k)).toBe(true)
369+
})
370+
371+
expect(result.link).toBe(baseUrl + '/')
372+
expect(result.entries[0].link).toBe(baseUrl + '/2020/08/07/json-feed-version.html')
373+
})
374+
375+
test('extract rss feed with url', async () => {
376+
const url = 'https://huggingface.co/blog/rss'
377+
const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
378+
const { baseUrl, path } = parseUrl(url)
379+
nock(baseUrl).get(path).reply(200, xml, {
380+
'Content-Type': 'application/xml',
381+
})
382+
const result = await extract(url, { baseUrl })
383+
384+
feedAttrs.forEach((k) => {
385+
expect(hasProperty(result, k)).toBe(true)
386+
})
387+
388+
entryAttrs.forEach((k) => {
389+
expect(hasProperty(result.entries[0], k)).toBe(true)
390+
})
391+
392+
expect(validateProps(result.entries[0])).toBe(true)
393+
expect(result.link).toBe(baseUrl + '/blog')
394+
expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
395+
})
396+
})
397+
339398
describe('check old method read()', () => {
340399
test('ensure that depricated method read() still works', async () => {
341400
const url = 'https://realworld-standard-feed.tld/rss'

0 commit comments

Comments
 (0)