Skip to content

Commit 9c71c6b

Browse files
committed
Load JSON-LD from HTML documents.
* Adds options parameter to documentLoader * Uses xmldom, if loaded. * Adds util.ParseContentTypeHeader * Adds documentLoader implementations for xhr and node (still requires tests).
1 parent cbbef5b commit 9c71c6b

File tree

6 files changed

+197
-91
lines changed

6 files changed

+197
-91
lines changed

lib/ContextResolver.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ module.exports = class ContextResolver {
149149
let remoteDoc;
150150

151151
try {
152-
remoteDoc = await documentLoader(url);
152+
remoteDoc = await documentLoader(url, {});
153153
context = remoteDoc.document || null;
154154
// parse string context as JSON
155155
if(_isString(context)) {

lib/documentLoaders/node.js

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
*/
44
'use strict';
55

6-
const {parseLinkHeader, buildHeaders} = require('../util');
6+
const {
7+
parseLinkHeader,
8+
buildHeaders,
9+
parseContentTypeHeader
10+
} = require('../util');
711
const {LINK_HEADER_REL} = require('../constants');
812
const JsonLdError = require('../JsonLdError');
913
const RequestQueue = require('../RequestQueue');
@@ -37,11 +41,11 @@ module.exports = ({
3741
const http = require('http');
3842

3943
const queue = new RequestQueue();
40-
return queue.wrapLoader(function(url) {
41-
return loadDocument(url, []);
44+
return queue.wrapLoader(function(url, options) {
45+
return loadDocument(url, options, []);
4246
});
4347

44-
async function loadDocument(url, redirects) {
48+
async function loadDocument(url, options, redirects) {
4549
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
4650
throw new JsonLdError(
4751
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -60,6 +64,12 @@ module.exports = ({
6064
return doc;
6165
}
6266

67+
// Add any optional requestProfile to the Accept header. A template literal is
// required here: with ordinary quotes the "${...}" placeholder would be sent
// to the server verbatim instead of the requested profile. `options` is
// guarded because the loader may still be invoked with only a URL.
if(options && options.requestProfile) {
  headers.Accept =
    `${headers.Accept}, application/ld+json;profile=${options.requestProfile}`;
}
72+
6373
let result;
6474
try {
6575
result = await _request(request, {
@@ -76,8 +86,17 @@ module.exports = ({
7686
}
7787

7888
const {res, body} = result;
89+
const {contentType, params} = parseContentTypeHeader(res.headers['content-type']);
90+
91+
doc = {
92+
contextUrl: null,
93+
documentUrl: url,
94+
document: body || null,
95+
contentType: contentType,
96+
profile: params.profile
97+
};
7998

80-
doc = {contextUrl: null, documentUrl: url, document: body || null};
99+
// separate profile from content-type
81100

82101
// handle error
83102
const statusText = http.STATUS_CODES[res.statusCode];
@@ -93,7 +112,7 @@ module.exports = ({
93112

94113
// handle Link Header
95114
if(res.headers.link &&
96-
res.headers['content-type'] !== 'application/ld+json') {
115+
contentType !== 'application/ld+json') {
97116
// only 1 related link header permitted
98117
const linkHeader = parseLinkHeader(res.headers.link)[LINK_HEADER_REL];
99118
if(Array.isArray(linkHeader)) {
@@ -131,7 +150,7 @@ module.exports = ({
131150
});
132151
}
133152
redirects.push(url);
134-
return loadDocument(res.headers.location, redirects);
153+
return loadDocument(res.headers.location, options, redirects);
135154
}
136155

137156
// cache for each redirected URL

lib/documentLoaders/xhr.js

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
*/
44
'use strict';
55

6-
const {parseLinkHeader, buildHeaders} = require('../util');
6+
const {
7+
parseLinkHeader,
8+
buildHeaders,
9+
parseContentTypeHeader
10+
} = require('../util');
711
const {LINK_HEADER_REL} = require('../constants');
812
const JsonLdError = require('../JsonLdError');
913
const RequestQueue = require('../RequestQueue');
@@ -30,7 +34,7 @@ module.exports = ({
3034
const queue = new RequestQueue();
3135
return queue.wrapLoader(loader);
3236

33-
async function loader(url) {
37+
async function loader(url, options) {
3438
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
3539
throw new JsonLdError(
3640
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -44,6 +48,12 @@ module.exports = ({
4448
'jsonld.InvalidUrl', {code: 'loading document failed', url});
4549
}
4650

51+
// Add any optional requestProfile to the Accept header. A template literal is
// required here: with ordinary quotes the "${...}" placeholder would be sent
// to the server verbatim instead of the requested profile. `options` is
// guarded because the loader may still be invoked with only a URL.
if(options && options.requestProfile) {
  headers.Accept =
    `${headers.Accept}, application/ld+json;profile=${options.requestProfile}`;
}
56+
4757
let req;
4858
try {
4959
req = await _get(xhr, url, headers);
@@ -64,10 +74,18 @@ module.exports = ({
6474
});
6575
}
6676

67-
const doc = {contextUrl: null, documentUrl: url, document: req.response};
77+
const {contentType, params} =
78+
parseContentTypeHeader(req.getResponseHeader('Content-Type'));
79+
80+
const doc = {
81+
contextUrl: null,
82+
documentUrl: url,
83+
document: req.response,
84+
contentType: contentType,
85+
profile: params.profile
86+
};
6887

6988
// handle Link Header (avoid unsafe header warning by existence testing)
70-
const contentType = req.getResponseHeader('Content-Type');
7189
let linkHeader;
7290
if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
7391
linkHeader = req.getResponseHeader('Link');

lib/jsonld.js

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ const LRU = require('lru-cache');
4242
const NQuads = require('./NQuads');
4343
const Rdfa = require('./Rdfa');
4444

45+
const {prependBase: _prependBase} = require ('./url');
4546
const {expand: _expand} = require('./expand');
4647
const {flatten: _flatten} = require('./flatten');
4748
const {fromRDF: _fromRDF} = require('./fromRdf');
@@ -854,6 +855,9 @@ jsonld.documentLoader = async url => {
854855
* @param url the URL to fetch.
855856
* @param [options] the options to use:
856857
* [documentLoader] the document loader to use.
858+
* [extractAllScripts] if true, returns the parsed content of every matching script element as an array; otherwise only the first match.
859+
* [profile] used when selecting from HTML script elements.
860+
* [requestProfile] one or more profile IRIs to use in the request.
857861
*
858862
* @return a Promise that resolves to the retrieved remote document.
859863
*/
@@ -865,7 +869,10 @@ jsonld.get = async function(url, options) {
865869
load = jsonld.documentLoader;
866870
}
867871

868-
const remoteDoc = await load(url);
872+
// FIXME: unescape frag?
873+
const [reference, frag] = url.split('#', 2);
874+
875+
const remoteDoc = await load(reference, options);
869876

870877
try {
871878
if(!remoteDoc.document) {
@@ -874,16 +881,68 @@ jsonld.get = async function(url, options) {
874881
'jsonld.NullRemoteDocument');
875882
}
876883
if(_isString(remoteDoc.document)) {
877-
remoteDoc.document = JSON.parse(remoteDoc.document);
884+
if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) {
885+
const domParser = new jsonld.domParser();
886+
const dom = domParser.parseFromString(remoteDoc.document);
887+
888+
// Use any document base
889+
const baseElem = dom.getElementsByTagName('base');
890+
if(baseElem.length > 0) {
891+
const href = baseElem[0].getAttribute('href');
892+
options.base = _prependBase(options.base || reference, href);
893+
}
894+
895+
const scripts = dom.getElementsByTagName('script');
896+
remoteDoc.document = [];
897+
898+
for(let i = 0; i < scripts.length; i++) {
899+
const script = scripts[i];
900+
// only application/ld+json
901+
if(!script.getAttribute('type').startsWith('application/ld+json')) {
902+
continue;
903+
}
904+
// If url has a fragment identifier, only matching scripts
905+
if(frag && script.getAttribute('id') !== frag) {
906+
continue;
907+
}
908+
try {
909+
remoteDoc.document.push(JSON.parse(script.textContent));
910+
} catch(e) {
911+
throw new JsonLdError(
912+
'Illegal script content.',
913+
'jsonld.InvalidScriptElement', {
914+
code: 'invalid script element',
915+
remoteDoc
916+
});
917+
}
918+
}
919+
// A fragment identifier was requested but no script element carried that id.
// The message uses a template literal so the missing id is actually reported
// (with plain quotes the text "${frag}" would appear literally).
if(frag && remoteDoc.document.length === 0) {
  throw new JsonLdError(
    `No script tag found with id=${frag}.`,
    'jsonld.InvalidScriptElement', {
      code: 'invalid script element',
      remoteDoc
    });
}
927+
if(!options.extractAllScripts) {
928+
remoteDoc.document = remoteDoc.document[0];
929+
}
930+
} else {
931+
remoteDoc.document = JSON.parse(remoteDoc.document);
932+
}
878933
}
879934
} catch(e) {
880-
throw new JsonLdError(
881-
'Could not retrieve a JSON-LD document from the URL.',
882-
'jsonld.LoadDocumentError', {
883-
code: 'loading document failed',
884-
cause: e,
885-
remoteDoc
886-
});
935+
if(e.name === 'jsonld.InvalidScriptElement') {
936+
throw(e)
937+
} else {
938+
throw new JsonLdError(
939+
'Could not retrieve a JSON-LD document from the URL.',
940+
'jsonld.LoadDocumentError', {
941+
code: 'loading document failed',
942+
cause: e,
943+
remoteDoc
944+
});
945+
}
887946
}
888947

889948
return remoteDoc;
@@ -934,6 +993,20 @@ jsonld.documentLoaders = {};
934993
jsonld.documentLoaders.node = require('./documentLoaders/node');
935994
jsonld.documentLoaders.xhr = require('./documentLoaders/xhr');
936995

996+
// Optional DOM parser: xmldom's DOMParser is used when the module can be
// loaded; otherwise a stand-in class is installed whose parseFromString
// always fails with a LoadDocumentError.
jsonld.domParser = (() => {
  try {
    return require('xmldom').DOMParser;
  } catch(e) {
    return class NoDOMParser {
      parseFromString() {
        const message =
          'Could not parse HTML document. ' +
          'HTML parsing not implemented.';
        throw new JsonLdError(
          message, 'jsonld.LoadDocumentError',
          {code: 'loading document failed'});
      }
    };
  }
})();
1009+
9371010
/**
9381011
* Assigns the default document loader for external document URLs to a built-in
9391012
* default. Supported types currently include: 'xhr' and 'node'.

lib/util.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/;
1515
const REGEX_LINK_HEADER_PARAMS =
1616
/(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g;
1717

18+
// FIXME: conditionally support text/html
1819
const DEFAULTS = {
1920
headers: {
2021
accept: 'application/ld+json, application/json'
@@ -142,6 +143,34 @@ api.parseLinkHeader = header => {
142143
return rval;
143144
};
144145

146+
/**
 * Parses a Content-Type header into its media type and its parameters.
 *
 * Content-Type: application/ld+json
 *
 * Parses as: ["application/ld+json", {}]
 *
 * Content-Type: application/ld+json;profile=http://www.w3.org/ns/json-ld#context
 *
 * Parses as: ["application/ld+json",
 *   {profile: "http://www.w3.org/ns/json-ld#context"}]
 *
 * Parameter names are lower-cased and a pair of double quotes surrounding a
 * parameter value is removed. The returned array also exposes its two entries
 * as the properties `contentType` and `params`, so it can be consumed either
 * by position or by name (`const {contentType, params} = ...`).
 *
 * @param header the content-type header to parse; may be null, undefined or
 *          empty when the response carried no such header.
 *
 * @return an array [contentType, params] that additionally has `contentType`
 *           and `params` properties.
 */
api.parseContentTypeHeader = header => {
  // a response without a Content-Type header yields an empty media type
  // rather than a TypeError
  const [type, ...rest] = String(header || '').split(';');
  const contentType = type.trim();
  const params = {};

  // assign parameters
  for(const paramString of rest) {
    // split on the first "=" only: values (e.g. IRIs) may contain "="
    const eq = paramString.indexOf('=');
    const hasValue = eq !== -1;
    const name =
      (hasValue ? paramString.slice(0, eq) : paramString).trim().toLowerCase();
    if(name === '') {
      // empty segment, e.g. produced by a trailing ";"
      continue;
    }
    let value = hasValue ? paramString.slice(eq + 1).trim() : '';
    // unwrap a quoted-string value
    if(value.length >= 2 && value.startsWith('"') && value.endsWith('"')) {
      value = value.slice(1, -1);
    }
    params[name] = value;
  }

  // positional form as documented, plus named access for destructuring callers
  const rval = [contentType, params];
  rval.contentType = contentType;
  rval.params = params;
  return rval;
};
173+
145174
/**
146175
* Throws an exception if the given value is not a valid @type value.
147176
*

0 commit comments

Comments
 (0)