Skip to content

Commit 11e373c

Browse files
authored
Merge pull request #110 from extractus/7.0.5
v7.0.5
2 parents e228331 + 640381d commit 11e373c

File tree

10 files changed

+324
-15
lines changed

10 files changed

+324
-15
lines changed

.eslintrc.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,15 @@
6262
"max-lines": [
6363
"error",
6464
{
65-
"max": 460,
65+
"max": 520,
6666
"skipBlankLines": true,
6767
"skipComments": false
6868
}
6969
],
7070
"max-lines-per-function": [
7171
"error",
7272
{
73-
"max": 150,
73+
"max": 240,
7474
"skipBlankLines": true
7575
}
7676
],

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ URL of a valid feed source
114114
Feed content must be accessible and conform to one of the following standards:
115115

116116
- [RSS Feed](https://www.rssboard.org/rss-specification)
117+
- [RDF Feed](https://web.resource.org/rss/1.0/spec)
117118
- [ATOM Feed](https://datatracker.ietf.org/doc/html/rfc5023)
118119
- [JSON Feed](https://www.jsonfeed.org/version/1.1/)
119120

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"version": "7.0.4",
2+
"version": "7.0.5",
33
"name": "@extractus/feed-extractor",
44
"description": "To read and normalize RSS/ATOM/JSON feed data",
55
"homepage": "https://extractor-demos.pages.dev",

src/main.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import { isValid as isValidUrl } from './utils/linker.js'
44

55
import retrieve from './utils/retrieve.js'
6-
import { validate, xml2obj, isRSS, isAtom } from './utils/xmlparser.js'
6+
import { validate, xml2obj, isRSS, isAtom, isRdf } from './utils/xmlparser.js'
77
import parseJsonFeed from './utils/parseJsonFeed.js'
88
import parseRssFeed from './utils/parseRssFeed.js'
99
import parseAtomFeed from './utils/parseAtomFeed.js'
10+
import parseRdfFeed from './utils/parseRdfFeed.js'
1011

1112
const getopt = (options = {}) => {
1213
const {
@@ -42,11 +43,14 @@ export const extractFromXml = (xml, options = {}) => {
4243
const opts = getopt(options)
4344

4445
const data = xml2obj(xml, opts.xmlParserOptions)
46+
4547
return isRSS(data)
4648
? parseRssFeed(data, opts)
4749
: isAtom(data)
4850
? parseAtomFeed(data, opts)
49-
: null
51+
: isRdf(data)
52+
? parseRdfFeed(data, opts)
53+
: null
5054
}
5155

5256
export const extract = async (url, options = {}, fetchOptions = {}) => {

src/main.test.js

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,30 @@ describe('test extract() standard feed', () => {
138138
expect(validateProps(result.entries[0])).toBe(true)
139139
})
140140

141+
test('extract rdf feed from Slashdot with extraFields', async () => {
142+
const url = 'https://some-news-page.tld/atom'
143+
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
144+
const { baseUrl, path } = parseUrl(url)
145+
nock(baseUrl).get(path).reply(200, xml, {
146+
'Content-Type': 'application/xml',
147+
})
148+
const result = await extract(url, {
149+
getExtraFeedFields: data => {
150+
return {
151+
subject: data['dc:subject'],
152+
}
153+
},
154+
getExtraEntryFields: data => {
155+
return {
156+
author: data['dc:creator'],
157+
}
158+
},
159+
})
160+
expect(hasProperty(result, 'subject')).toBe(true)
161+
expect(hasProperty(result.entries[0], 'author')).toBe(true)
162+
expect(validateProps(result.entries[0])).toBe(true)
163+
})
164+
141165
test('extract atom feed which contains multi links', async () => {
142166
const url = 'https://some-news-page.tld/atom/multilinks'
143167
const xml = readFileSync('test-data/atom-multilinks.xml', 'utf8')
@@ -291,6 +315,22 @@ describe('test extract() without normalization', () => {
291315
expect(hasProperty(result.item, 'guid')).toBe(true)
292316
})
293317

318+
test('extract rdf feed from Slashdot without normalization', async () => {
319+
const url = 'https://some-news-page.tld/atom'
320+
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
321+
const { baseUrl, path } = parseUrl(url)
322+
nock(baseUrl).get(path).reply(200, xml, {
323+
'Content-Type': 'application/xml',
324+
})
325+
const result = await extract(url, {
326+
normalization: false,
327+
})
328+
expect(hasProperty(result.channel, 'syn:updateBase')).toBe(true)
329+
expect(hasProperty(result.channel, 'dc:rights')).toBe(true)
330+
expect(hasProperty(result, 'item')).toBe(true)
331+
expect(hasProperty(result.item[0], 'slash:department')).toBe(true)
332+
})
333+
294334
test('extract atom feed from Google', async () => {
295335
const url = 'https://some-news-page.tld/atom'
296336
const xml = readFileSync('test-data/atom-feed-standard-realworld.xml', 'utf8')
@@ -358,7 +398,7 @@ describe('test extract() without normalization', () => {
358398
})
359399

360400
describe('test extract with `baseUrl` option', () => {
361-
test('extract rss feed with xml', () => {
401+
test('extract rss feed from file', () => {
362402
const baseUrl = 'https://huggingface.co'
363403
const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
364404
const result = extractFromXml(xml, { baseUrl })
@@ -376,7 +416,26 @@ describe('test extract with `baseUrl` option', () => {
376416
expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
377417
})
378418

379-
test('extract rss feed with json', () => {
419+
test('extract rdf feed from file', () => {
420+
const baseUrl = 'https://slashdot.org'
421+
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
422+
const result = extractFromXml(xml, { baseUrl })
423+
424+
feedAttrs.forEach((k) => {
425+
expect(hasProperty(result, k)).toBe(true)
426+
})
427+
428+
entryAttrs.forEach((k) => {
429+
expect(hasProperty(result.entries[0], k)).toBe(true)
430+
})
431+
432+
expect(validateProps(result.entries[0])).toBe(true)
433+
expect(result.link).toBe(baseUrl + '/')
434+
const firstItemLink = result.entries[0].link
435+
expect(firstItemLink.startsWith('https://tech.slashdot.org/story/23/08/23/2238246/spacex-')).toBe(true)
436+
})
437+
438+
test('extract json feed from file', () => {
380439
const baseUrl = 'https://www.jsonfeed.org'
381440
const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
382441
const result = extractFromJson(JSON.parse(json), { baseUrl })

src/utils/parseAtomFeed.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,10 @@ const parseAtom = (data, options = {}) => {
9898
getExtraFeedFields,
9999
} = options
100100

101+
const feedData = data.feed
102+
101103
if (!normalization) {
102-
return flatten(data.feed, baseUrl)
104+
return flatten(feedData, baseUrl)
103105
}
104106

105107
const {
@@ -111,9 +113,9 @@ const parseAtom = (data, options = {}) => {
111113
language = '',
112114
updated = '',
113115
entry: item = [],
114-
} = data.feed
116+
} = feedData
115117

116-
const extraFields = getExtraFeedFields(data.feed)
118+
const extraFields = getExtraFeedFields(feedData)
117119

118120
const items = isArray(item) ? item : [item]
119121

src/utils/parseRdfFeed.js

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// parseRdfFeed.js

// specs: https://web.resource.org/rss/1.0/spec
4+
5+
import { isArray } from 'bellajs'
6+
7+
import {
8+
getText,
9+
toISODateString,
10+
buildDescription,
11+
getPureUrl,
12+
getEntryId
13+
} from './normalizer.js'
14+
15+
const transform = (item, options) => {
16+
const {
17+
useISODateFormat,
18+
descriptionMaxLen,
19+
baseUrl,
20+
getExtraEntryFields,
21+
} = options
22+
23+
const {
24+
guid = '',
25+
title = '',
26+
link = '',
27+
'dc:date': pubDate = '',
28+
description = '',
29+
'content:encoded': content = '',
30+
} = item
31+
32+
const published = useISODateFormat ? toISODateString(pubDate) : pubDate
33+
const htmlContent = getText(description || content)
34+
const entry = {
35+
id: getEntryId(guid, link, pubDate),
36+
title: getText(title),
37+
link: getPureUrl(link, guid, baseUrl),
38+
published,
39+
description: buildDescription(description || htmlContent, descriptionMaxLen),
40+
}
41+
42+
const extraFields = getExtraEntryFields(item)
43+
44+
return {
45+
...entry,
46+
...extraFields,
47+
}
48+
}
49+
50+
// Lightly clean a raw `rdf:RDF` object for the `normalization: false` path:
// keep every original property, but sanitize the feed/entry titles and
// resolve links against `baseUrl`.
const flatten = (feed, baseUrl) => {
  const {
    title = '',
    link = '',
    item,
  } = feed

  const rawItems = isArray(item) ? item : [item]
  // Renamed locals (rawEntry/entryTitle/entryLink) to stop shadowing the
  // outer `item`/`title`/`link` bindings, which made the original hard to read.
  const entries = rawItems.map((rawEntry) => {
    const {
      id,
      title: entryTitle = '',
      link: entryLink = '',
    } = rawEntry

    return {
      ...rawEntry,
      title: getText(entryTitle),
      link: getPureUrl(entryLink, id, baseUrl),
    }
  })

  return {
    ...feed,
    title: getText(title),
    // FIX: pass baseUrl in its own (third) slot. The original called
    // `getPureUrl(link, baseUrl)`, which put baseUrl where the fallback id
    // belongs and dropped the real base URL — every other feed-level call
    // in this module uses `getPureUrl(link, '', baseUrl)`.
    link: getPureUrl(link, '', baseUrl),
    // Preserve the input's single-item vs array shape.
    item: isArray(item) ? entries : entries[0],
  }
}
82+
83+
// Convert a parsed `rdf:RDF` document (RSS 1.0) into the normalized feed
// shape shared with the RSS/Atom/JSON parsers.
const parseRdf = (data, options = {}) => {
  const {
    normalization,
    baseUrl,
    getExtraFeedFields,
  } = options

  const feedData = data['rdf:RDF']

  // Raw mode: skip normalization and hand back the lightly-flattened input.
  if (!normalization) {
    return flatten(feedData, baseUrl)
  }

  // In RDF feeds the metadata lives under <channel>, while the items sit
  // alongside it at the document root.
  const {
    title = '',
    link = '',
    description = '',
    generator = '',
    'dc:language': language = '',
    'dc:date': lastBuildDate = '',
  } = feedData.channel

  const extraFields = getExtraFeedFields(feedData)

  const { item } = feedData
  const itemList = isArray(item) ? item : [item]

  let published = lastBuildDate
  if (options.useISODateFormat) {
    published = toISODateString(lastBuildDate)
  }

  return {
    title: getText(title),
    link: getPureUrl(link, '', baseUrl),
    description,
    language,
    generator,
    published,
    ...extraFields,
    entries: itemList.map((entryData) => transform(entryData, options)),
  }
}
126+
127+
// Public entry point for RDF parsing, mirroring the other parsers' exports.
export default (data, options = {}) => parseRdf(data, options)

src/utils/parseRssFeed.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,10 @@ const parseRss = (data, options = {}) => {
103103
getExtraFeedFields,
104104
} = options
105105

106+
const feedData = data.rss.channel
107+
106108
if (!normalization) {
107-
return flatten(data.rss.channel, baseUrl)
109+
return flatten(feedData, baseUrl)
108110
}
109111

110112
const {
@@ -115,9 +117,9 @@ const parseRss = (data, options = {}) => {
115117
language = '',
116118
lastBuildDate = '',
117119
item = [],
118-
} = data.rss.channel
120+
} = feedData
119121

120-
const extraFields = getExtraFeedFields(data.rss.channel)
122+
const extraFields = getExtraFeedFields(feedData)
121123

122124
const items = isArray(item) ? item : [item]
123125

src/utils/xmlparser.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,19 @@ export const isAtom = (data = {}) => {
1212
return hasProperty(data, 'feed') && hasProperty(data.feed, 'entry')
1313
}
1414

15+
// An RDF (RSS 1.0) document parses to an object whose `rdf:RDF` root
// contains a `channel` element.
export const isRdf = (data = {}) => {
  if (!hasProperty(data, 'rdf:RDF')) {
    return false
  }
  return hasProperty(data['rdf:RDF'], 'channel')
}
18+
1519
// True only when `xml` is a non-empty string that fast-xml-parser's
// validator accepts (its validate() returns `true` or an error object).
export const validate = (xml) => {
  if (!isString(xml) || xml.length === 0) {
    return false
  }
  return XMLValidator.validate(xml) === true
}
1822

1923
export const xml2obj = (xml = '', extraOptions = {}) => {
2024
const options = {
21-
...extraOptions,
22-
ignoreAttributes: false,
2325
attributeNamePrefix: '@_',
26+
ignoreAttributes: false,
27+
...extraOptions,
2428
}
2529
const parser = new XMLParser(options)
2630
const jsonObj = parser.parse(xml)

0 commit comments

Comments
 (0)