Skip to content

Commit 83a33e0

Browse files
committed
Resolve relative canonical paths if rewriting is disabled
For Via, we want rel=canonical links to resolve to the same absolute URL as it did on the original page. For absolute URLs, no rewriting is necessary. If the original rel=canonical URL was relative however, it needs to be resolved relative to the original URL. See hypothesis/via#65 for context.
1 parent 7a0680f commit 83a33e0

File tree

2 files changed

+31
-20
lines changed

2 files changed

+31
-20
lines changed

pywb/rewrite/html_rewriter.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import re
66

77
from HTMLParser import HTMLParser, HTMLParseError
8-
from urlparse import urlsplit, urlunsplit
8+
from urlparse import urljoin, urlsplit, urlunsplit
99

1010
from url_rewriter import UrlRewriter
1111
from regex_rewriters import JSRewriter, CSSRewriter
@@ -276,9 +276,18 @@ def _rewrite_tag_attrs(self, tag, tag_attrs):
276276
# special case: if rewrite_canon not set,
277277
# don't rewrite rel=canonical
278278
elif tag == 'link' and attr_name == 'href':
279-
if (self.opts.get('rewrite_rel_canon', True) or
280-
not self.has_attr(tag_attrs, ('rel', 'canonical'))):
281-
rw_mod = handler.get(attr_name)
279+
rw_mod = handler.get(attr_name)
280+
281+
if self.has_attr(tag_attrs, ('rel', 'canonical')):
282+
if self.opts.get('rewrite_rel_canon', True):
283+
attr_value = self._rewrite_url(attr_value, rw_mod)
284+
else:
285+
# resolve relative rel=canonical URLs so that they
286+
# refer to the same absolute URL as on the original
287+
# page (see https://github.com/hypothesis/via/issues/65
288+
# for context)
289+
attr_value = urljoin(self.orig_url, attr_value)
290+
else:
282291
attr_value = self._rewrite_url(attr_value, rw_mod)
283292

284293
# special case: meta tag

pywb/rewrite/test/test_html_rewriter.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,12 @@
158158
<link rel="canonical" href="/web/20131226101010oe_/http://example.com/">
159159
160160
# rel=canonical: no_rewrite
161-
>>> parse('<link rel=canonical href="http://example.com/">', urlrewriter=no_base_canon_rewriter)
162-
<link rel="canonical" href="http://example.com/">
161+
>>> parse('<link rel=canonical href="http://example.com/canon/path">', urlrewriter=no_base_canon_rewriter)
162+
<link rel="canonical" href="http://example.com/canon/path">
163+
164+
# rel=canonical: no_rewrite
165+
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
166+
<link rel="canonical" href="http://example.com/relative/path">
163167
164168
# doctype
165169
>>> parse('<!doctype html PUBLIC "public">')
@@ -210,26 +214,24 @@
210214
import pprint
211215
import urllib
212216

213-
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
214-
'/web/',
215-
rewrite_opts=dict(punycode_links=False))
217+
ORIGINAL_URL = 'http://example.com/some/path/index.html'
218+
219+
def new_rewriter(prefix='/web/', rewrite_opts=dict()):
220+
PROXY_PATH = '20131226101010/{0}'.format(ORIGINAL_URL)
221+
return UrlRewriter(PROXY_PATH, prefix, rewrite_opts=rewrite_opts)
216222

217-
full_path_urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
218-
'http://localhost:80/web/',
219-
rewrite_opts=dict(punycode_links=False))
223+
urlrewriter = new_rewriter(rewrite_opts=dict(punycode_links=False))
220224

221-
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
222-
'/web/',
223-
rewrite_opts=dict(punycode_links=True))
225+
full_path_urlrewriter = new_rewriter(prefix='http://localhost:80/web/',
226+
rewrite_opts=dict(punycode_links=False))
224227

228+
urlrewriter_pencode = new_rewriter(rewrite_opts=dict(punycode_links=True))
225229

226-
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
227-
'/web/',
228-
rewrite_opts=dict(rewrite_rel_canon=False,
229-
rewrite_base=False))
230+
no_base_canon_rewriter = new_rewriter(rewrite_opts=dict(rewrite_rel_canon=False,
231+
rewrite_base=False))
230232

231233
def parse(data, head_insert=None, urlrewriter=urlrewriter):
232-
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
234+
parser = HTMLRewriter(urlrewriter, head_insert = head_insert, url = ORIGINAL_URL)
233235

234236
if isinstance(data, unicode):
235237
data = data.encode('utf-8')

0 commit comments

Comments
 (0)