Skip to content

Commit 0c96591

Browse files
committed
proxy: replace HttpsUrlRewriter with SchemeOnlyUrlRewriter, which rewrites http->https or https->http so that urls match
the scheme of the current page. url-rewrite-only mode: add the uo_ modifier and use it to rewrite only urls (no banner, no client-side rewriting). Addresses #142
1 parent 979fcae commit 0c96591

File tree

6 files changed

+57
-30
lines changed

6 files changed

+57
-30
lines changed

pywb/framework/proxy.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import socket
1010
import ssl
1111

12-
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
12+
from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter
1313
from pywb.utils.wbexception import BadRequestException
1414

1515
from pywb.utils.bufferedreaders import BufferedReader
@@ -204,7 +204,7 @@ def __call__(self, env):
204204
host_prefix=host_prefix,
205205
rel_prefix=rel_prefix,
206206
wburl_class=route.handler.get_wburl_type(),
207-
urlrewriter_class=HttpsUrlRewriter,
207+
urlrewriter_class=SchemeOnlyUrlRewriter,
208208
use_abs_prefix=False,
209209
is_proxy=True)
210210

@@ -219,7 +219,7 @@ def __call__(self, env):
219219
wbrequest.wb_url.mod = 'bn_'
220220
else:
221221
# unaltered, no rewrite or banner
222-
wbrequest.wb_url.mod = 'id_'
222+
wbrequest.wb_url.mod = 'uo_'
223223

224224
response = route.handler(wbrequest)
225225

pywb/rewrite/rewrite_content.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
from pywb.utils.bufferedreaders import DecompressingBufferedReader
1717
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
1818

19+
from regex_rewriters import JSNoneRewriter, JSLinkOnlyRewriter
20+
1921

2022
#=================================================================
2123
class RewriteContent:
@@ -159,9 +161,8 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
159161
charset = self._extract_html_charset(first_buff,
160162
status_headers)
161163

162-
if head_insert_func:
164+
if head_insert_func and not wb_url.is_url_rewrite_only:
163165
head_insert_orig = head_insert_func(rule, cdx)
164-
head_insert_str = None
165166

166167
if charset:
167168
try:
@@ -191,9 +192,15 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
191192

192193
return (status_headers, gen, False)
193194

195+
js_rewriter_class = rule.rewriters['js']
196+
css_rewriter_class = rule.rewriters['css']
197+
198+
if wb_url.is_url_rewrite_only:
199+
js_rewriter_class = JSNoneRewriter
200+
194201
rewriter = rewriter_class(urlrewriter,
195-
js_rewriter_class=rule.rewriters['js'],
196-
css_rewriter_class=rule.rewriters['css'],
202+
js_rewriter_class=js_rewriter_class,
203+
css_rewriter_class=css_rewriter_class,
197204
head_insert=head_insert_str,
198205
url=wb_url.url,
199206
defmod=self.defmod,
@@ -203,6 +210,11 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
203210
if wb_url.is_banner_only:
204211
return (status_headers, self.stream_to_gen(stream), False)
205212

213+
# url-only rewriter, but not rewriting urls in JS, so return
214+
if wb_url.is_url_rewrite_only and text_type == 'js':
215+
#return (status_headers, self.stream_to_gen(stream), False)
216+
rewriter_class = JSLinkOnlyRewriter
217+
206218
# apply one of (js, css, xml) rewriters
207219
rewriter = rewriter_class(urlrewriter)
208220

pywb/rewrite/test/test_url_rewriter.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,22 +125,33 @@
125125
>>> do_deprefix('http://example.com/file.html?foo=bar&url=' + urllib.quote_plus('http://localhost:8080/pywb/extra/path/http://example.com/filename.html') + '&foo2=bar2', '/pywb/', 'http://localhost:8080/pywb/')
126126
'http://example.com/file.html?foo=bar&url=http://example.com/filename.html&foo2=bar2'
127127
128-
# HttpsUrlRewriter tests
129-
>>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None)
130-
>>> httpsrewriter.rewrite('https://example.com/abc')
128+
# SchemeOnlyUrlRewriter tests
129+
>>> SchemeOnlyUrlRewriter('http://example.com/').rewrite('https://example.com/abc')
131130
'http://example.com/abc'
132131
133-
>>> httpsrewriter.rewrite('http://example.com/abc')
132+
>>> SchemeOnlyUrlRewriter('http://example.com/abc').rewrite('http://example.com/abc')
134133
'http://example.com/abc'
135134
135+
>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('http://example.com/abc')
136+
'https://example.com/abc'
137+
138+
>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('https://example.com/abc')
139+
'https://example.com/abc'
140+
141+
>>> SchemeOnlyUrlRewriter('http://example.com/abc').rewrite('//example.com/abc')
142+
'//example.com/abc'
143+
144+
>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('//example.com/abc')
145+
'//example.com/abc'
146+
136147
# rebase is identity
137-
>>> httpsrewriter.rebase_rewriter('https://example.com/') == httpsrewriter
148+
>>> x = SchemeOnlyUrlRewriter('http://example.com'); x.rebase_rewriter('https://example.com/') == x
138149
True
139150
140151
"""
141152

142153

143-
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
154+
from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter
144155
import urllib
145156

146157
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):

pywb/rewrite/url_rewriter.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class UrlRewriter(object):
1919

2020
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
2121

22-
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
22+
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
2323
root_path=None, cookie_scope=None, rewrite_opts={}):
2424
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
2525
self.prefix = prefix
@@ -152,17 +152,26 @@ def urljoin(orig_url, url):
152152

153153

154154
#=================================================================
155-
class HttpsUrlRewriter(UrlRewriter):
155+
class SchemeOnlyUrlRewriter(UrlRewriter):
156156
"""
157-
A url rewriter which urls that start with https:// to http://
157+
A url rewriter which ensures that any urls have the same
158+
scheme (http or https) as the base url.
158159
Other urls/input is unchanged.
159160
"""
160161

161-
HTTP = 'http://'
162-
HTTPS = 'https://'
162+
def __init__(self, *args, **kwargs):
163+
super(SchemeOnlyUrlRewriter, self).__init__(*args, **kwargs)
164+
self.url_scheme = self.wburl.url.split(':')[0]
165+
if self.url_scheme == 'https':
166+
self.opposite_scheme = 'http'
167+
else:
168+
self.opposite_scheme = 'https'
163169

164170
def rewrite(self, url, mod=None):
165-
return self.remove_https(url)
171+
if url.startswith(self.opposite_scheme + '://'):
172+
url = self.url_scheme + url[len(self.opposite_scheme):]
173+
174+
return url
166175

167176
def get_new_url(self, **kwargs):
168177
return kwargs.get('url', self.wburl.url)
@@ -175,12 +184,3 @@ def get_cookie_rewriter(self, scope=None):
175184

176185
def deprefix_url(self):
177186
return self.wburl.url
178-
179-
@staticmethod
180-
def remove_https(url):
181-
rw = HttpsUrlRewriter
182-
if url.startswith(rw.HTTPS):
183-
result = rw.HTTP + url[len(rw.HTTPS):]
184-
return result
185-
else:
186-
return url

pywb/rewrite/wburl.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,10 @@ def is_embed(self):
325325
def is_banner_only(self):
326326
return (self.mod == 'bn_')
327327

328+
@property
329+
def is_url_rewrite_only(self):
330+
return (self.mod == 'uo_')
331+
328332
@property
329333
def is_identity(self):
330334
return (self.mod == 'id_')

pywb/webapp/live_rewrite_handler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
from pywb.rewrite.rewrite_live import LiveRewriter
55
from pywb.rewrite.wburl import WbUrl
6-
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
76

87
from handlers import StaticHandler, SearchPageWbUrlHandler
98
from views import HeadInsertView
@@ -235,7 +234,8 @@ def _get_video_info(self, wbrequest, info_url=None, video_url=None):
235234
headers = self._live_request_headers(wbrequest)
236235
headers['Content-Type'] = content_type
237236

238-
info_url = HttpsUrlRewriter.remove_https(info_url)
237+
if info_url.startswith('https://'):
238+
info_url = info_url.replace('https', 'http', 1)
239239

240240
response = self.live_fetcher.add_metadata(info_url, headers, metadata)
241241

0 commit comments

Comments
 (0)