Merge pull request #91 from extractus/dev

ndaidong · web-flow · commit d578bdc2db12 · 2023-06-14T13:05:49.000+07:00
v6.2.3
diff --git a/.eslintrc.json b/.eslintrc.json
@@ -62,7 +62,7 @@
     "max-lines": [
       "error",
       {
-        "max": 360,
+        "max": 460,
         "skipBlankLines": true,
         "skipComments": false
       }
diff --git a/README.md b/README.md
@@ -134,6 +134,7 @@ Object with all or several of the following properties:
   - `xmlParserOptions`: Object, used by xml parser, view [fast-xml-parser's docs](https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md)
   - `getExtraFeedFields`: Function, to get more fields from feed data
   - `getExtraEntryFields`: Function, to get more fields from feed entry data
+  - `baseUrl`: URL string, to absolutify the links within feed content
 
 For example:
 
diff --git a/dist/cjs/feed-extractor.js b/dist/cjs/feed-extractor.js
diff --git a/dist/cjs/index.d.ts b/dist/cjs/index.d.ts
@@ -47,6 +47,11 @@ export interface ReaderOptions {
    * https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md
    */
   xmlParserOptions?: any;
+  /**
+   * base URL used to absolutify relative links found in the feed
+   * default: ''
+   */
+  baseUrl?: string;
   /**
    * merge extra feed fields in result
    */
diff --git a/dist/cjs/package.json b/dist/cjs/package.json
@@ -1,5 +1,5 @@
 {
   "name": "@extractus/feed-extractor",
-  "version": "6.2.2",
+  "version": "6.2.3",
   "main": "./feed-extractor.js"
 }
diff --git a/dist/feed-extractor.esm.js b/dist/feed-extractor.esm.js
diff --git a/index.d.ts b/index.d.ts
@@ -47,6 +47,11 @@ export interface ReaderOptions {
    * https://github.com/NaturalIntelligence/fast-xml-parser/blob/master/docs/v4/2.XMLparseOptions.md
    */
   xmlParserOptions?: any;
+  /**
+   * base URL used to absolutify relative links found in the feed
+   * default: ''
+   */
+  baseUrl?: string;
   /**
    * merge extra feed fields in result
    */
diff --git a/package.json b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "6.2.2",
+  "version": "6.2.3",
   "name": "@extractus/feed-extractor",
   "description": "To read and normalize RSS/ATOM/JSON feed data",
   "homepage": "https://extractor-demos.pages.dev",
@@ -36,13 +36,13 @@
   },
   "dependencies": {
     "bellajs": "^11.1.2",
-    "cross-fetch": "^3.1.5",
-    "fast-xml-parser": "^4.2.2",
-    "html-entities": "^2.3.3"
+    "cross-fetch": "^3.1.6",
+    "fast-xml-parser": "^4.2.4",
+    "html-entities": "^2.3.6"
   },
   "devDependencies": {
-    "esbuild": "^0.17.18",
-    "eslint": "^8.40.0",
+    "esbuild": "^0.18.2",
+    "eslint": "^8.42.0",
     "jest": "^29.5.0",
     "nock": "^13.3.1"
   },
diff --git a/src/main.js b/src/main.js
@@ -14,6 +14,7 @@ const getopt = (options = {}) => {
     descriptionMaxLen = 210,
     useISODateFormat = true,
     xmlParserOptions = {},
+    baseUrl = '',
     getExtraFeedFields = () => ({}),
     getExtraEntryFields = () => ({}),
   } = options
@@ -23,6 +24,7 @@ const getopt = (options = {}) => {
     descriptionMaxLen,
     useISODateFormat,
     xmlParserOptions,
+    baseUrl,
     getExtraFeedFields,
     getExtraEntryFields,
   }
@@ -51,6 +53,7 @@ export const extract = async (url, options = {}, fetchOptions = {}) => {
   if (!isValidUrl(url)) {
     throw new Error('Input param must be a valid URL')
   }
+
   const data = await retrieve(url, fetchOptions)
   if (!data.text && !data.json) {
     throw new Error(`Failed to load content from "${url}"`)
diff --git a/src/main.test.js b/src/main.test.js
@@ -7,7 +7,7 @@ import nock from 'nock'
 
 import { hasProperty, isString } from 'bellajs'
 
-import { extract, read } from './main.js'
+import { extract, extractFromXml, extractFromJson, read } from './main.js'
 import { isValid as isValidUrl } from './utils/linker.js'
 
 const feedAttrs = 'title link description generator language published entries'.split(' ')
@@ -336,6 +336,65 @@ describe('test extract() without normalization', () => {
   })
 })
 
+describe('test extract with `baseUrl` option', () => {
+  test('extract rss feed with xml', () => {
+    const baseUrl = 'https://huggingface.co'
+    const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
+    const result = extractFromXml(xml, { baseUrl })
+
+    feedAttrs.forEach((k) => {
+      expect(hasProperty(result, k)).toBe(true)
+    })
+
+    entryAttrs.forEach((k) => {
+      expect(hasProperty(result.entries[0], k)).toBe(true)
+    })
+
+    expect(validateProps(result.entries[0])).toBe(true)
+    expect(result.link).toBe(baseUrl + '/blog')
+    expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
+  })
+
+  test('extract rss feed with json', () => {
+    const baseUrl = 'https://www.jsonfeed.org'
+    const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
+    const result = extractFromJson(JSON.parse(json), { baseUrl })
+
+    feedAttrs.forEach((k) => {
+      expect(hasProperty(result, k)).toBe(true)
+    })
+
+    entryAttrs.forEach((k) => {
+      expect(hasProperty(result.entries[0], k)).toBe(true)
+    })
+
+    expect(result.link).toBe(baseUrl + '/')
+    expect(result.entries[0].link).toBe(baseUrl + '/2020/08/07/json-feed-version.html')
+  })
+
+  test('extract rss feed with url', async () => {
+    const url = 'https://huggingface.co/blog/rss'
+    const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
+    const { baseUrl, path } = parseUrl(url)
+    nock(baseUrl).get(path).reply(200, xml, {
+      'Content-Type': 'application/xml',
+    })
+    const result = await extract(url, { baseUrl })
+
+    feedAttrs.forEach((k) => {
+      expect(hasProperty(result, k)).toBe(true)
+    })
+
+    entryAttrs.forEach((k) => {
+      expect(hasProperty(result.entries[0], k)).toBe(true)
+    })
+
+    expect(validateProps(result.entries[0])).toBe(true)
+    expect(result.link).toBe(baseUrl + '/blog')
+    expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
+  })
+})
+
 describe('check old method read()', () => {
   test('ensure that depricated method read() still works', async () => {
     const url = 'https://realworld-standard-feed.tld/rss'
diff --git a/src/utils/normalizer.js b/src/utils/normalizer.js
@@ -11,7 +11,7 @@ import {
 
 import { decode } from 'html-entities'
 
-import { isValid as isValidUrl, purify as purifyUrl } from './linker.js'
+import { absolutify, isValid as isValidUrl, purify as purifyUrl } from './linker.js'
 
 export const toISODateString = (dstr) => {
   try {
@@ -57,9 +57,15 @@ export const getLink = (val = [], id = '') => {
             : isArray(val) ? getEntryLink(val) : ''
 }
 
-export const getPureUrl = (url, id = '') => {
+export const getPureUrl = (url, id = '', baseUrl) => {
   const link = getLink(url, id)
-  return link ? purifyUrl(link) : ''
+  const pu = purifyUrl(link)
+
+  return link
+    ? pu
+      ? pu
+      : absolutify(baseUrl, link)
+    : ''
 }
 
 const hash = (str) => Math.abs(str.split('').reduce((s, c) => Math.imul(31, s) + c.charCodeAt(0) | 0, 0)).toString(36)
diff --git a/src/utils/parseAtomFeed.js b/src/utils/parseAtomFeed.js
@@ -17,6 +17,7 @@ const transform = (item, options) => {
   const {
     useISODateFormat,
     descriptionMaxLen,
+    baseUrl,
     getExtraEntryFields,
   } = options
 
@@ -37,7 +38,7 @@ const transform = (item, options) => {
   const entry = {
     id: getEntryId(id, link, pubDate),
     title: getText(title),
-    link: getPureUrl(link, id),
+    link: getPureUrl(link, id, baseUrl),
     published: useISODateFormat ? toISODateString(pubDate) : pubDate,
     description: buildDescription(htmlContent || summary, descriptionMaxLen),
   }
@@ -50,7 +51,7 @@ const transform = (item, options) => {
   }
 }
 
-const flatten = (feed) => {
+const flatten = (feed, baseUrl) => {
   const {
     id,
     title = '',
@@ -70,7 +71,7 @@ const flatten = (feed) => {
     const item = {
       ...entry,
       title: getText(title),
-      link: getPureUrl(link, id),
+      link: getPureUrl(link, id, baseUrl),
     }
     if (hasProperty(item, 'summary')) {
       item.summary = getText(summary)
@@ -84,7 +85,7 @@ const flatten = (feed) => {
   const output = {
     ...feed,
     title: getText(title),
-    link: getPureUrl(link, id),
+    link: getPureUrl(link, id, baseUrl),
     entry: isArray(entry) ? items : items[0],
   }
   return output
@@ -93,11 +94,12 @@ const flatten = (feed) => {
 const parseAtom = (data, options = {}) => {
   const {
     normalization,
+    baseUrl,
     getExtraFeedFields,
   } = options
 
   if (!normalization) {
-    return flatten(data.feed)
+    return flatten(data.feed, baseUrl)
   }
 
   const {
@@ -119,7 +121,7 @@ const parseAtom = (data, options = {}) => {
 
   return {
     title: getText(title),
-    link: getPureUrl(link, id),
+    link: getPureUrl(link, id, baseUrl),
     description: subtitle,
     language,
     generator,
diff --git a/src/utils/parseJsonFeed.js b/src/utils/parseJsonFeed.js
@@ -10,12 +10,13 @@ import {
   getEntryId
 } from './normalizer.js'
 
-import { purify as purifyUrl } from './linker.js'
+import { absolutify, purify as purifyUrl } from './linker.js'
 
 const transform = (item, options) => {
   const {
     useISODateFormat,
     descriptionMaxLen,
+    baseUrl,
     getExtraEntryFields,
   } = options
 
@@ -35,7 +36,7 @@ const transform = (item, options) => {
   const entry = {
     id: getEntryId(id, link, pubDate),
     title,
-    link: purifyUrl(link),
+    link: purifyUrl(link) || absolutify(baseUrl, link),
     published,
     description: buildDescription(textContent || htmlContent || summary, descriptionMaxLen),
   }
@@ -49,6 +50,7 @@ const transform = (item, options) => {
 const parseJson = (data, options) => {
   const {
     normalization,
+    baseUrl,
     getExtraFeedFields,
   } = options
 
@@ -70,7 +72,7 @@ const parseJson = (data, options) => {
 
   return {
     title,
-    link: purifyUrl(homepageUrl),
+    link: purifyUrl(homepageUrl) || absolutify(baseUrl, homepageUrl),
     description,
     language,
     published: '',
diff --git a/src/utils/parseRssFeed.js b/src/utils/parseRssFeed.js
@@ -17,6 +17,7 @@ const transform = (item, options) => {
   const {
     useISODateFormat,
     descriptionMaxLen,
+    baseUrl,
     getExtraEntryFields,
   } = options
 
@@ -33,7 +34,7 @@ const transform = (item, options) => {
   const entry = {
     id: getEntryId(guid, link, pubDate),
     title: getText(title),
-    link: getPureUrl(link, guid),
+    link: getPureUrl(link, guid, baseUrl),
     published,
     description: buildDescription(description, descriptionMaxLen),
   }
@@ -46,7 +47,7 @@ const transform = (item, options) => {
   }
 }
 
-const flatten = (feed) => {
+const flatten = (feed, baseUrl) => {
   const {
     title = '',
     link = '',
@@ -64,7 +65,7 @@ const flatten = (feed) => {
     const item = {
       ...entry,
       title: getText(title),
-      link: getPureUrl(link, id),
+      link: getPureUrl(link, id, baseUrl),
     }
 
     const txtTags = 'guid description source'.split(' ')
@@ -88,7 +89,7 @@ const flatten = (feed) => {
   const output = {
     ...feed,
     title: getText(title),
-    link: getPureUrl(link),
+    link: getPureUrl(link, '', baseUrl), // channel link has no id fallback; baseUrl is the 3rd parameter
     item: isArray(item) ? entries : entries[0],
   }
   return output
@@ -97,11 +98,12 @@ const flatten = (feed) => {
 const parseRss = (data, options = {}) => {
   const {
     normalization,
+    baseUrl,
     getExtraFeedFields,
   } = options
 
   if (!normalization) {
-    return flatten(data.rss.channel)
+    return flatten(data.rss.channel, baseUrl)
   }
 
   const {
@@ -122,7 +124,7 @@ const parseRss = (data, options = {}) => {
 
   return {
     title: getText(title),
-    link: getPureUrl(link),
+    link: getPureUrl(link, '', baseUrl),
     description,
     language,
     generator,
diff --git a/test-data/json-feed-miss-base-url.json b/test-data/json-feed-miss-base-url.json
diff --git a/test-data/rss-feed-miss-base-url.xml b/test-data/rss-feed-miss-base-url.xml

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@`
`62`	`62`	`"max-lines": [`
`63`	`63`	`"error",`
`64`	`64`	`{`
`65`		`- "max": 360,`
	`65`	`+ "max": 460,`
`66`	`66`	`"skipBlankLines": true,`
`67`	`67`	`"skipComments": false`
`68`	`68`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@extractus/feed-extractor",`
`3`		`- "version": "6.2.2",`
	`3`	`+ "version": "6.2.3",`
`4`	`4`	`"main": "./feed-extractor.js"`
`5`	`5`	`}`