Skip to content

Commit d281254

Browse files
committed
Load JSON-LD from HTML documents.
* Adds options parameter to documentLoader * Uses xmldom, if loaded. * Adds util.ParseContentTypeHeader * Adds documentLoader implementations for xhr and node (still requires tests).
1 parent 7cf2f71 commit d281254

File tree

6 files changed

+202
-95
lines changed

6 files changed

+202
-95
lines changed

lib/ContextResolver.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ module.exports = class ContextResolver {
149149
let remoteDoc;
150150

151151
try {
152-
remoteDoc = await documentLoader(url);
152+
remoteDoc = await documentLoader(url, {});
153153
context = remoteDoc.document || null;
154154
// parse string context as JSON
155155
if(_isString(context)) {

lib/documentLoaders/node.js

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
*/
44
'use strict';
55

6-
const {parseLinkHeader, buildHeaders} = require('../util');
6+
const {
7+
parseLinkHeader,
8+
buildHeaders,
9+
parseContentTypeHeader
10+
} = require('../util');
711
const {LINK_HEADER_CONTEXT} = require('../constants');
812
const JsonLdError = require('../JsonLdError');
913
const RequestQueue = require('../RequestQueue');
@@ -38,11 +42,11 @@ module.exports = ({
3842
const http = require('http');
3943

4044
const queue = new RequestQueue();
41-
return queue.wrapLoader(function(url) {
42-
return loadDocument(url, []);
45+
return queue.wrapLoader(function(url, options) {
46+
return loadDocument(url, options, []);
4347
});
4448

45-
async function loadDocument(url, redirects) {
49+
async function loadDocument(url, options, redirects) {
4650
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
4751
throw new JsonLdError(
4852
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -61,6 +65,12 @@ module.exports = ({
6165
return doc;
6266
}
6367

68+
// add any optional requestProfile; `options` is guarded because the loader
// may still be invoked with only a URL
if(options && options.requestProfile) {
  // must be a template literal, otherwise the text "${...}" is sent verbatim
  const profileAccept =
    `application/ld+json;profile=${options.requestProfile}`;
  // append to an existing Accept value; avoid emitting "undefined, ..."
  headers.Accept = headers.Accept ?
    `${headers.Accept}, ${profileAccept}` : profileAccept;
}
73+
6474
let result;
6575
let alternate = null;
6676
try {
@@ -78,8 +88,17 @@ module.exports = ({
7888
}
7989

8090
const {res, body} = result;
91+
const {contentType, params} = parseContentTypeHeader(res.headers['content-type']);
92+
93+
doc = {
94+
contextUrl: null,
95+
documentUrl: url,
96+
document: body || null,
97+
contentType: contentType,
98+
profile: params.profile
99+
};
81100

82-
doc = {contextUrl: null, documentUrl: url, document: body || null};
101+
// separate profile from content-type
83102

84103
// handle error
85104
const statusText = http.STATUS_CODES[res.statusCode];
@@ -95,7 +114,7 @@ module.exports = ({
95114

96115
// handle Link Header
97116
if(res.headers.link &&
98-
res.headers['content-type'] !== 'application/ld+json') {
117+
contentType !== 'application/ld+json') {
99118
// only 1 related link header permitted
100119
const linkHeaders = parseLinkHeader(res.headers.link);
101120
const linkedContext = linkHeaders[LINK_HEADER_CONTEXT];
@@ -144,7 +163,7 @@ module.exports = ({
144163
});
145164
}
146165
redirects.push(url);
147-
return loadDocument(res.headers.location, redirects);
166+
return loadDocument(res.headers.location, options, redirects);
148167
}
149168

150169
// cache for each redirected URL

lib/documentLoaders/xhr.js

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
*/
44
'use strict';
55

6-
const {parseLinkHeader, buildHeaders} = require('../util');
6+
const {
7+
parseLinkHeader,
8+
buildHeaders,
9+
parseContentTypeHeader
10+
} = require('../util');
711
const {LINK_HEADER_CONTEXT} = require('../constants');
812
const JsonLdError = require('../JsonLdError');
913
const RequestQueue = require('../RequestQueue');
@@ -31,7 +35,7 @@ module.exports = ({
3135
const queue = new RequestQueue();
3236
return queue.wrapLoader(loader);
3337

34-
async function loader(url) {
38+
async function loader(url, options) {
3539
if(url.indexOf('http:') !== 0 && url.indexOf('https:') !== 0) {
3640
throw new JsonLdError(
3741
'URL could not be dereferenced; only "http" and "https" URLs are ' +
@@ -45,6 +49,12 @@ module.exports = ({
4549
'jsonld.InvalidUrl', {code: 'loading document failed', url});
4650
}
4751

52+
// add any optional requestProfile; `options` is guarded because the loader
// may still be invoked with only a URL
if(options && options.requestProfile) {
  // must be a template literal, otherwise the text "${...}" is sent verbatim
  const profileAccept =
    `application/ld+json;profile=${options.requestProfile}`;
  // append to an existing Accept value; avoid emitting "undefined, ..."
  headers.Accept = headers.Accept ?
    `${headers.Accept}, ${profileAccept}` : profileAccept;
}
57+
4858
let req;
4959
try {
5060
req = await _get(xhr, url, headers);
@@ -65,11 +75,19 @@ module.exports = ({
6575
});
6676
}
6777

68-
let doc = {contextUrl: null, documentUrl: url, document: req.response};
78+
const {contentType, params} =
79+
parseContentTypeHeader(req.getResponseHeader('Content-Type'));
80+
81+
let doc = {
82+
contextUrl: null,
83+
documentUrl: url,
84+
document: req.response,
85+
contentType: contentType,
86+
profile: params.profile
87+
};
6988
let alternate = null;
7089

7190
// handle Link Header (avoid unsafe header warning by existence testing)
72-
const contentType = req.getResponseHeader('Content-Type');
7391
let linkHeader;
7492
if(REGEX_LINK_HEADER.test(req.getAllResponseHeaders())) {
7593
linkHeader = req.getResponseHeader('Link');

lib/jsonld.js

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ const LRU = require('lru-cache');
4242
const NQuads = require('./NQuads');
4343
const Rdfa = require('./Rdfa');
4444

45+
const {prependBase: _prependBase} = require ('./url');
4546
const {expand: _expand} = require('./expand');
4647
const {flatten: _flatten} = require('./flatten');
4748
const {fromRDF: _fromRDF} = require('./fromRdf');
@@ -862,6 +863,9 @@ jsonld.documentLoader = async url => {
862863
* @param url the URL to fetch.
863864
* @param [options] the options to use:
864865
* [documentLoader] the document loader to use.
866+
* [extractAllScripts] concatenates all matching script elements.
867+
* [profile] used when selecting from HTML script elements.
868+
* [requestProfile] one or more profile IRIs to use in the request.
865869
*
866870
* @return a Promise that resolves to the retrieved remote document.
867871
*/
@@ -873,7 +877,10 @@ jsonld.get = async function(url, options) {
873877
load = jsonld.documentLoader;
874878
}
875879

876-
const remoteDoc = await load(url);
880+
// FIXME: unescape frag?
881+
const [reference, frag] = url.split('#', 2);
882+
883+
const remoteDoc = await load(reference, options);
877884

878885
try {
879886
if(!remoteDoc.document) {
@@ -882,16 +889,68 @@ jsonld.get = async function(url, options) {
882889
'jsonld.NullRemoteDocument');
883890
}
884891
if(_isString(remoteDoc.document)) {
885-
remoteDoc.document = JSON.parse(remoteDoc.document);
892+
if(remoteDoc.contentType && remoteDoc.contentType.includes('text/html')) {
893+
const domParser = new jsonld.domParser();
894+
const dom = domParser.parseFromString(remoteDoc.document);
895+
896+
// Use any document base
897+
const baseElem = dom.getElementsByTagName('base');
898+
if(baseElem.length > 0) {
899+
const href = baseElem[0].getAttribute('href');
900+
options.base = _prependBase(options.base || reference, href);
901+
}
902+
903+
const scripts = dom.getElementsByTagName('script');
904+
remoteDoc.document = [];
905+
906+
for(let i = 0; i < scripts.length; i++) {
907+
const script = scripts[i];
908+
// only application/ld+json
909+
if(!script.getAttribute('type').startsWith('application/ld+json')) {
910+
continue;
911+
}
912+
// If url has a fragment identifier, only matching scripts
913+
if(frag && script.getAttribute('id') !== frag) {
914+
continue;
915+
}
916+
try {
917+
remoteDoc.document.push(JSON.parse(script.textContent));
918+
} catch(e) {
919+
throw new JsonLdError(
920+
'Illegal script content.',
921+
'jsonld.InvalidScriptElement', {
922+
code: 'invalid script element',
923+
remoteDoc
924+
});
925+
}
926+
}
927+
// a fragment identifier was requested but no script element matched it
if(frag && remoteDoc.document.length === 0) {
  throw new JsonLdError(
    // template literal so the requested id actually appears in the message
    `No script tag found with id=${frag}.`,
    'jsonld.InvalidScriptElement', {
      code: 'invalid script element',
      remoteDoc
    });
}
935+
if(!options.extractAllScripts) {
936+
remoteDoc.document = remoteDoc.document[0];
937+
}
938+
} else {
939+
remoteDoc.document = JSON.parse(remoteDoc.document);
940+
}
886941
}
887942
} catch(e) {
888-
throw new JsonLdError(
889-
'Could not retrieve a JSON-LD document from the URL.',
890-
'jsonld.LoadDocumentError', {
891-
code: 'loading document failed',
892-
cause: e,
893-
remoteDoc
894-
});
943+
if(e.name === 'jsonld.InvalidScriptElement') {
944+
throw(e)
945+
} else {
946+
throw new JsonLdError(
947+
'Could not retrieve a JSON-LD document from the URL.',
948+
'jsonld.LoadDocumentError', {
949+
code: 'loading document failed',
950+
cause: e,
951+
remoteDoc
952+
});
953+
}
895954
}
896955

897956
return remoteDoc;
@@ -942,6 +1001,20 @@ jsonld.documentLoaders = {};
9421001
jsonld.documentLoaders.node = require('./documentLoaders/node');
9431002
jsonld.documentLoaders.xhr = require('./documentLoaders/xhr');
9441003

1004+
// Optional DOM parser: use xmldom's DOMParser when the module can be loaded;
// otherwise install a stand-in whose parseFromString() always throws.
jsonld.domParser = (() => {
  try {
    return require('xmldom').DOMParser;
  } catch(e) {
    return class NoDOMParser {
      parseFromString() {
        const message =
          'Could not parse HTML document. ' +
          'HTML parsing not implemented.';
        throw new JsonLdError(
          message, 'jsonld.LoadDocumentError',
          {code: 'loading document failed'});
      }
    };
  }
})();
1017+
9451018
/**
9461019
* Assigns the default document loader for external document URLs to a built-in
9471020
* default. Supported types currently include: 'xhr' and 'node'.

lib/util.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ const REGEX_LINK_HEADER = /\s*<([^>]*?)>\s*(?:;\s*(.*))?/;
1515
const REGEX_LINK_HEADER_PARAMS =
1616
/(.*?)=(?:(?:"([^"]*?)")|([^"]*?))\s*(?:(?:;\s*)|$)/g;
1717

18+
// FIXME: conditionally support text/html
1819
const DEFAULTS = {
1920
headers: {
2021
accept: 'application/ld+json, application/json'
@@ -142,6 +143,34 @@ api.parseLinkHeader = header => {
142143
return rval;
143144
};
144145

146+
/**
 * Parses a Content-Type header into its media type and its parameters.
 *
 *   Content-Type: application/ld+json
 *
 * Parses as: ["application/ld+json", {}]
 *
 *   Content-Type:
 *     application/ld+json;profile=http://www.w3.org/ns/json-ld#context
 *
 * Parses as:
 *   ["application/ld+json", {profile: "http://www.w3.org/ns/json-ld#context"}]
 *
 * Parameter names are lower-cased and a pair of surrounding double quotes is
 * stripped from a value. If a parameter occurs more than once, the last
 * occurrence wins. Parameters without a "=" are ignored. A missing or
 * non-string header parses as ["", {}].
 *
 * The returned array additionally carries `contentType` and `params`
 * properties, so it can be destructured either as an array or as an object
 * (the document loaders use the object form).
 *
 * @param header the content-type header to parse (may be null or undefined).
 *
 * @return the array [contentType, params], which also has `contentType` and
 *   `params` properties.
 */
api.parseContentTypeHeader = header => {
  // an absent header is null (XHR) or undefined (node); treat it as empty
  const [type, ...rest] =
    (typeof header === 'string' ? header : '').split(';');
  const params = {};

  // assign parameters
  for(const paramString of rest) {
    // split on the first "=" only; values (e.g. IRIs) may contain "="
    const idx = paramString.indexOf('=');
    if(idx === -1) {
      continue;
    }
    const name = paramString.slice(0, idx).trim().toLowerCase();
    if(name === '') {
      continue;
    }
    let value = paramString.slice(idx + 1).trim();
    // unwrap a quoted-string value
    if(value.length >= 2 && value.startsWith('"') && value.endsWith('"')) {
      value = value.slice(1, -1);
    }
    params[name] = value;
  }

  const contentType = type.trim();
  const rval = [contentType, params];
  // named accessors for callers that use `const {contentType, params} = ...`
  rval.contentType = contentType;
  rval.params = params;
  return rval;
};
173+
145174
/**
146175
* Throws an exception if the given value is not a valid @type value.
147176
*

0 commit comments

Comments
 (0)