Skip to content

Commit 82f2dac

Browse files
N0taN3rd authored and ikreymer committed
autoFetchWorker.js improvements: (#397)
- ensured that autoFetchWorker uses full srcset URLs
- resolves the URL against the img.src or document.baseURI if not rewritten
- otherwise ensures the rewritten URL is not relative or schemeless

wombat.js:
- AutoFetchWorker updated extractFromLocalDoc to send URL resolution information to the worker
- defer extractFromLocalDoc and preserveSrcset postMessages to ensure page viewer can see the images first
1 parent a9e4b5c commit 82f2dac

File tree

2 files changed

+121
-57
lines changed

2 files changed

+121
-57
lines changed

pywb/static/autoFetchWorker.js

Lines changed: 72 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,6 @@ function AutoFetcher(init) {
5050
if (!(this instanceof AutoFetcher)) {
5151
return new AutoFetcher(init);
5252
}
53-
this.proxyMode = init.proxyMode;
5453
this.prefix = init.prefix;
5554
this.mod = init.mod;
5655
this.prefixMod = init.prefix + init.mod;
@@ -88,22 +87,21 @@ AutoFetcher.prototype.fixupURL = function (url) {
8887
};
8988

9089
AutoFetcher.prototype.safeFetch = function (url) {
91-
var fixedURL = this.fixupURL(url);
9290
// check to see if we have seen this url before in order
9391
// to lessen the load against the server content is fetched from
9492
if (this.seen[url] != null) return;
9593
this.seen[url] = true;
9694
if (this.queuing) {
9795
// we are currently waiting for a batch of fetches to complete
98-
return this.queue.push(fixedURL);
96+
return this.queue.push(url);
9997
}
10098
// fetch this url
10199
this.fetches.push(fetch(url));
102100
};
103101

104102
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
105103
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
106-
this.safeFetch(n2);
104+
this.safeFetch(this.fixupURL(n2));
107105
return n1 + n2 + n3;
108106
};
109107

@@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) {
154152
}
155153
};
156154

157-
AutoFetcher.prototype.extractSrcset = function (srcsets) {
155+
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
156+
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
157+
// otherwise returns null if this did not happen
158+
if (url.indexOf(this.relative) === 0) {
159+
return url.replace(this.relative, this.prefix);
160+
}
161+
if (url.indexOf(this.schemeless) === 0) {
162+
return url.replace(this.schemeless, this.prefix);
163+
}
164+
return null;
165+
};
166+
167+
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
168+
// given a url and base url returns a resolved full URL or
169+
// null if resolution was unsuccessful
170+
try {
171+
var _url = new URL(url, base);
172+
return _url.href;
173+
} catch (e) {
174+
return null;
175+
}
176+
};
177+
178+
179+
AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
180+
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
181+
if (url.indexOf(this.prefix) !== 0) {
182+
// first check for / (relative) or // (schemeless) rewritten urls
183+
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
184+
if (maybeFixed != null) {
185+
return maybeFixed;
186+
}
187+
// resolve URL against tag src
188+
maybeFixed = this.maybeResolveURL(url, tagSrc);
189+
if (maybeFixed != null) {
190+
return this.prefix + 'im_/' + maybeFixed;
191+
}
192+
// finally last attempt resolve the originating documents base URI
193+
maybeFixed = this.maybeResolveURL(url, context.docBaseURI);
194+
if (maybeFixed != null) {
195+
return this.prefix + 'im_/' + maybeFixed;
196+
}
197+
// not much to do now.....
198+
return this.prefixMod + '/' + url;
199+
}
200+
return url;
201+
};
202+
203+
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
158204
if (srcsets == null || srcsets.values == null) return;
159205
var srcsetValues = srcsets.values;
160-
// was srcsets from rewrite_srcset and if so no need to split
161-
var presplit = srcsets.presplit;
206+
if (!srcsets.presplit) {
207+
// was from extract from local doc so we need to duplicate work
208+
return this.srcsetNotPreSplit(srcsetValues, context);
209+
}
210+
// was rewrite_srcset so just ensure we just
162211
for (var i = 0; i < srcsetValues.length; i++) {
163-
var srcset = srcsetValues[i];
164-
if (presplit) {
165-
// was rewrite_srcset so just ensure we just
212+
// grab the URL not width/height key
213+
this.safeFetch(srcsetValues[i].split(' ')[0]);
214+
}
215+
};
216+
217+
AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
218+
// was from extract from local doc so we need to duplicate work
219+
var j;
220+
for (var i = 0; i < values.length; i++) {
221+
var srcsetValues = values[i].srcset.split(srcsetSplit);
222+
var tagSrc = values[i].tagSrc;
223+
for (j = 0; j < srcsetValues.length; j++) {
166224
// grab the URL not width/height key
167-
this.safeFetch(srcset.split(' ')[0]);
168-
} else {
169-
// was from extract from local doc so we need to duplicate work
170-
var values = srcset.split(srcsetSplit);
171-
for (var j = 0; j < values.length; j++) {
172-
if (Boolean(values[j])) {
173-
var value = values[j].trim();
174-
if (value.length > 0) {
175-
this.safeFetch(value.split(' ')[0]);
176-
}
177-
}
225+
if (Boolean(srcsetValues[j])) {
226+
var value = srcsetValues[j].trim().split(' ')[0];
227+
this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
178228
}
179229
}
180230
}
@@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
184234
// we got a message and now we autofetch!
185235
// these calls turn into no ops if they have no work
186236
this.extractMedia(data.media);
187-
this.extractSrcset(data.srcset);
237+
this.extractSrcset(data.srcset, data.context);
188238
this.fetchAll();
189239
};
190240

pywb/static/wombat.js

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
11511151
} else if (lowername == "style") {
11521152
value = rewrite_style(value);
11531153
} else if (lowername == "srcset") {
1154-
value = rewrite_srcset(value);
1154+
value = rewrite_srcset(value, this.tagName === 'IMG');
11551155
}
11561156
}
11571157
orig_setAttribute.call(this, name, value);
@@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) {
14031403
this.worker.terminate();
14041404
};
14051405

1406-
AutoFetchWorker.prototype.postMessage = function (msg) {
1406+
AutoFetchWorker.prototype.postMessage = function (msg, deferred) {
1407+
if (deferred) {
1408+
var self = this;
1409+
return Promise.resolve().then(function () {
1410+
self.worker.postMessage(msg);
1411+
});
1412+
}
14071413
this.worker.postMessage(msg);
14081414
};
14091415

14101416
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
1411-
// send values from rewrite_srcset to the worker
1417+
// send values from rewrite_srcset to the worker deferred
1418+
// to ensure the page viewer sees the images first
14121419
this.postMessage({
14131420
'type': 'values',
14141421
'srcset': {'values': srcset, 'presplit': true},
1415-
});
1422+
}, true);
14161423
};
14171424

14181425
AutoFetchWorker.prototype.preserveMedia = function (media) {
@@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) {
14211428
};
14221429

14231430
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
1424-
// get the values to be preserved from the documents stylesheets
1425-
// and all elements with a srcset
1426-
var media = [];
1427-
var srcset = [];
1428-
var sheets = $wbwindow.document.styleSheets;
1429-
var i = 0;
1430-
for (; i < sheets.length; ++i) {
1431-
var rules = sheets[i].cssRules;
1432-
for (var j = 0; j < rules.length; ++j) {
1433-
var rule = rules[j];
1434-
if (rule.type === CSSRule.MEDIA_RULE) {
1435-
media.push(rule.cssText);
1436-
}
1431+
// get the values to be preserved from the documents stylesheets
1432+
// and all elements with a srcset
1433+
var media = [];
1434+
var srcset = [];
1435+
var sheets = $wbwindow.document.styleSheets;
1436+
var i = 0;
1437+
for (; i < sheets.length; ++i) {
1438+
var rules = sheets[i].cssRules;
1439+
for (var j = 0; j < rules.length; ++j) {
1440+
var rule = rules[j];
1441+
if (rule.type === CSSRule.MEDIA_RULE) {
1442+
media.push(rule.cssText);
14371443
}
14381444
}
1439-
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
1440-
for (i = 0; i < srcsetElems.length; i++) {
1441-
var srcsetElem = srcsetElems[i];
1442-
if (wb_getAttribute) {
1443-
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
1444-
} else {
1445-
srcset.push(srcsetElem.getAttribute('srcset'));
1446-
}
1445+
}
1446+
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
1447+
for (i = 0; i < srcsetElems.length; i++) {
1448+
var ssv = {tagSrc: srcsetElems[i].src};
1449+
if (wb_getAttribute) {
1450+
ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
1451+
} else {
1452+
ssv.srcset = srcsetElems[i].getAttribute('srcset');
14471453
}
1448-
this.postMessage({
1449-
'type': 'values',
1450-
'media': media,
1451-
'srcset': {'values': srcset, 'presplit': false},
1452-
});
1453-
};
1454+
srcset.push(ssv);
1455+
}
1456+
// send the extracted values to the worker deferred
1457+
// to ensure the page viewer sees the images first
1458+
this.postMessage({
1459+
'type': 'values',
1460+
'media': media,
1461+
'srcset': {'values': srcset, 'presplit': false},
1462+
'context': {
1463+
'docBaseURI': $wbwindow.document.baseURI
1464+
}
1465+
}, true);
1466+
};
14541467

14551468
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
14561469

@@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
16011614
} else if (name == "style") {
16021615
new_value = rewrite_style(value);
16031616
} else if (name == "srcset") {
1604-
new_value = rewrite_srcset(value);
1617+
new_value = rewrite_srcset(value, elem.tagName === 'IMG');
16051618
} else {
16061619
// Only rewrite if absolute url
16071620
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
@@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
16431656
}
16441657

16451658
//============================================
1646-
function rewrite_srcset(value)
1659+
function rewrite_srcset(value, isImage)
16471660
{
16481661
if (!value) {
16491662
return "";
@@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
16551668
for (var i = 0; i < values.length; i++) {
16561669
values[i] = rewrite_url(values[i].trim());
16571670
}
1658-
if (wbUseAFWorker) {
1671+
1672+
if (wbUseAFWorker && isImage) {
16591673
// send post split values to preservation worker
16601674
WBAutoFetchWorker.preserveSrcset(values);
16611675
}
@@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
20042018
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
20052019
val = rewrite_inline_style(orig);
20062020
} else if (attr == "srcset") {
2007-
val = rewrite_srcset(orig);
2021+
val = rewrite_srcset(orig, this.tagName === 'IMG');
20082022
} else if (this.tagName === 'LINK' && attr === 'href') {
20092023
var relV = this.rel;
20102024
if (relV === 'import' || relV === 'preload') {

0 commit comments

Comments
 (0)