Skip to content

Commit c935aa5

Browse files
committed
Merge branch 'develop' for 0.7.5
2 parents de403c4 + 43805c6 commit c935aa5

28 files changed

+158
-75
lines changed

.gitattributes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
*.arc -text
2+
*.warc -text
3+
*.cdx -text
4+
*.gz -text

CHANGES.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
pywb 0.7.5 changelist
2+
~~~~~~~~~~~~~~~~~~~~~
3+
4+
* Cross-platform fixes to support Windows -- all tests pass on Linux, OS X and Windows now. Improved cross-platform support includes:
5+
- read all files as binary to avoid line ending issues
6+
- properly convert url <-> file
7+
- avoid platform dependent apis
8+
9+
* Change any unhandled exceptions to result in a 500 error, instead of 400.
10+
11+
* More comprehensive client side ``src`` attribute rewriting (via wombat.js), additional server-side HTML tag rewriting.
12+
13+
114
pywb 0.7.2 changelist
215
~~~~~~~~~~~~~~~~~~~~~
316

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PyWb 0.7.2
1+
PyWb 0.7.5
22
==========
33

44
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
@@ -13,7 +13,7 @@ pywb is a python implementation of web archival replay tools, sometimes also kno
1313
pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC <http://en.wikipedia.org/wiki/ARC_(file_format)>`_ and `WARC <http://en.wikipedia.org/wiki/Web_ARChive>`_.
1414
The replay system is designed to accurately replay complex dynamic sites, including video and audio content.
1515

16-
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server.
16+
pywb can be used as a traditional web application or an HTTP or HTTPS proxy server, and has been tested on Linux, OS X and Windows platforms.
1717

1818
pywb is also fully compliant with the `Memento <http://mementoweb.org/>`_ protocol (`RFC-7089 <http://tools.ietf.org/html/rfc7089>`_).
1919

pywb/cdx/cdxsource.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(self, filename):
3030
def load_cdx(self, query):
3131
def do_open():
3232
try:
33-
source = open(self.filename)
33+
source = open(self.filename, 'rb')
3434
gen = iter_range(source, query.key, query.end_key)
3535
for line in gen:
3636
yield line

pywb/cdx/test/test_redis_source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
def load_cdx_into_redis(source, filename, key=None):
2828
# load a cdx into mock redis
29-
with open(test_cdx_dir + filename) as fh:
29+
with open(test_cdx_dir + filename, 'rb') as fh:
3030
for line in fh:
3131
zadd_cdx(source, line, key)
3232

pywb/cdx/zipnum.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def load_loc(self):
8484
self.loc_mtime = new_mtime
8585

8686
logging.debug('Loading loc from: ' + self.loc_filename)
87-
with open(self.loc_filename) as fh:
87+
with open(self.loc_filename, 'rb') as fh:
8888
for line in fh:
8989
parts = line.rstrip().split('\t')
9090
self.loc_map[parts[0]] = parts[1:]
@@ -112,7 +112,7 @@ def lookup_loc(self, part):
112112
def load_cdx(self, query):
113113
self.load_loc()
114114

115-
reader = open(self.summary)
115+
reader = open(self.summary, 'rb')
116116

117117
idx_iter = iter_range(reader,
118118
query.key,

pywb/framework/certauth.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313

1414

1515
#=================================================================
16-
# Duration of 100 years
17-
CERT_DURATION = 100 * 365 * 24 * 60 * 60
16+
# Duration of 10 years
17+
CERT_DURATION = 10 * 365 * 24 * 60 * 60
1818

1919
CERTS_DIR = './ca/certs/'
2020

pywb/framework/proxy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ def handle_cert_install(self, env):
334334
return None
335335

336336
buff = ''
337-
with open(self.ca.ca_file) as fh:
337+
with open(self.ca.ca_file, 'rb') as fh:
338338
buff = fh.read()
339339

340340
content_type = 'application/x-x509-ca-cert'

pywb/framework/test/test_certauth.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
from pywb.framework.certauth import main, CertificateAuthority
77

8-
TEST_CA_DIR = './pywb/framework/test/pywb_test_ca_certs'
9-
TEST_CA_ROOT = './pywb/framework/test/pywb_test_ca.pem'
8+
TEST_CA_DIR = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca_certs')
9+
TEST_CA_ROOT = os.path.join('.', 'pywb', 'framework', 'test', 'pywb_test_ca.pem')
1010

1111
def setup_module():
1212
openssl_support = pytest.importorskip("OpenSSL")

pywb/framework/test/test_wsgi_wrapper.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def response(env, start_response):
1414

1515
class TestErrApp:
1616
def __call__(self, env):
17-
raise Exception('Test Error')
17+
raise Exception('Test Unexpected Error')
1818

1919
class TestCustomErrApp:
2020
def __call__(self, env):
@@ -41,8 +41,8 @@ def test_err_app():
4141
testapp = webtest.TestApp(the_app)
4242
resp = testapp.get('/abc', expect_errors=True)
4343

44-
assert resp.status_int == 400
45-
assert '400 Bad Request Error: Test Error' in resp.body
44+
assert resp.status_int == 500
45+
assert '500 Internal Server Error Error: Test Unexpected Error' in resp.body
4646

4747
def test_custom_err_app():
4848
the_app = init_app(initer(TestCustomErrApp), load_yaml=False)

pywb/framework/wsgi_wrappers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def handle_exception(self, env, exc, print_trace):
118118
if hasattr(exc, 'status'):
119119
status = exc.status()
120120
else:
121-
status = '400 Bad Request'
121+
status = '500 Internal Server Error'
122122

123123
if hasattr(exc, 'url'):
124124
err_url = exc.url

pywb/rewrite/html_rewriter.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,17 @@ def _init_rewrite_tags(defmod):
3030
'base': {'href': defmod},
3131
'blockquote': {'cite': defmod},
3232
'body': {'background': 'im_'},
33+
'button': {'formaction': defmod},
34+
'command': {'icon': 'im_'},
3335
'del': {'cite': defmod},
3436
'embed': {'src': 'oe_'},
3537
'head': {'': defmod}, # for head rewriting
3638
'iframe': {'src': 'if_'},
3739
'img': {'src': 'im_',
3840
'srcset': 'im_'},
3941
'ins': {'cite': defmod},
40-
'input': {'src': 'im_'},
42+
'input': {'src': 'im_',
43+
'formaction': defmod},
4144
'form': {'action': defmod},
4245
'frame': {'src': 'fr_'},
4346
'link': {'href': 'oe_'},
@@ -49,7 +52,8 @@ def _init_rewrite_tags(defmod):
4952
'ref': {'href': 'oe_'},
5053
'script': {'src': 'js_'},
5154
'source': {'src': 'oe_'},
52-
'video': {'src': 'oe_'},
55+
'video': {'src': 'oe_',
56+
'poster': 'im_'},
5357

5458
'div': {'data-src': defmod,
5559
'data-uri': defmod},

pywb/rewrite/rewrite_live.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
import datetime
77
import mimetypes
88
import logging
9+
import os
910

1011
from urlparse import urlsplit
1112

12-
from pywb.utils.loaders import is_http, LimitReader, BlockLoader
13+
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
1314
from pywb.utils.loaders import extract_client_cookie
1415
from pywb.utils.timeutils import datetime_to_timestamp
1516
from pywb.utils.statusandheaders import StatusAndHeaders
@@ -180,11 +181,18 @@ def fetch_request(self, url, urlrewriter,
180181
if url.startswith('//'):
181182
url = 'http:' + url
182183

184+
if is_http(url):
185+
is_remote = True
186+
else:
187+
is_remote = False
188+
if not url.startswith('file:'):
189+
url = to_file_url(url)
190+
183191
# explicit urlkey may be passed in (say for testing)
184192
if not urlkey:
185193
urlkey = canonicalize(url)
186194

187-
if is_http(url):
195+
if is_remote:
188196
(status_headers, stream) = self.fetch_http(url, urlkey, env,
189197
req_headers,
190198
follow_redirects,

pywb/rewrite/test/test_url_rewriter.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,17 @@
103103
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
104104
105105
# HttpsUrlRewriter tests
106-
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
106+
>>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None)
107+
>>> httpsrewriter.rewrite('https://example.com/abc')
107108
'http://example.com/abc'
108109
109-
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('http://example.com/abc')
110+
>>> httpsrewriter.rewrite('http://example.com/abc')
110111
'http://example.com/abc'
111112
113+
# rebase is identity
114+
>>> httpsrewriter.rebase_rewriter('https://example.com/') == httpsrewriter
115+
True
116+
112117
"""
113118

114119

pywb/static/wombat.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ _WBWombat = (function() {
105105
"http:/" + prefix, "https:/" + prefix];
106106
}
107107

108+
var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"];
109+
108110
//============================================
109111
function rewrite_url_(url) {
110112
// If undefined, just return it
@@ -692,12 +694,9 @@ _WBWombat = (function() {
692694
}
693695

694696
override_attr(created, "src");
695-
} else if (created.tagName == "IMG" || created.tagName == "VIDEO" || created.tagName == "AUDIO") {
697+
} else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
696698
override_attr(created, "src");
697699
}
698-
// } else if (created.tagName == "A") {
699-
// override_attr(created, "href");
700-
// }
701700

702701
return created;
703702
}

pywb/utils/bufferedreaders.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@ def __init__(self, stream, block_size=1024,
4646
self.buff_size = 0
4747

4848
def set_decomp(self, decomp_type):
49-
if self.num_read > 0:
50-
raise Exception('Attempting to change decompression mid-stream')
51-
5249
self._init_decomp(decomp_type)
5350

5451
def _init_decomp(self, decomp_type):

pywb/utils/loaders.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import hmac
88
import urllib
99
import urllib2
10+
import urlparse
1011
import time
1112
import pkg_resources
1213
from io import open
@@ -17,6 +18,15 @@ def is_http(filename):
1718
return filename.startswith(('http://', 'https://'))
1819

1920

21+
#=================================================================
22+
def to_file_url(filename):
23+
""" Convert a filename to a file:// url
24+
"""
25+
url = os.path.abspath(filename)
26+
url = urlparse.urljoin('file:', urllib.pathname2url(url))
27+
return url
28+
29+
2030
#=================================================================
2131
def load_yaml_config(config_file):
2232
import yaml
@@ -39,12 +49,12 @@ def extract_post_query(method, mime, length, stream):
3949
not mime.lower().startswith('application/x-www-form-urlencoded'))):
4050
return None
4151

42-
if not length or length == '0':
43-
return None
44-
4552
try:
4653
length = int(length)
47-
except ValueError:
54+
except (ValueError, TypeError):
55+
return None
56+
57+
if length <= 0:
4858
return None
4959

5060
#todo: encoding issues?
@@ -129,9 +139,10 @@ def load_file_or_resource(self, url, offset=0, length=-1):
129139
# if starting with . or /, can only be a file path..
130140
file_only = url.startswith(('/', '.'))
131141

142+
# convert to filename
132143
if url.startswith('file://'):
133-
url = url[len('file://'):]
134144
file_only = True
145+
url = urllib.url2pathname(url[len('file://'):])
135146

136147
try:
137148
# first, try as file

pywb/utils/test/test_binsearch.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,12 @@
6666
test_cdx_dir = get_test_dir() + 'cdx/'
6767

6868
def print_binsearch_results(key, iter_func):
69-
with open(test_cdx_dir + 'iana.cdx') as cdx:
69+
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
7070
for line in iter_func(cdx, key):
7171
print line
7272

7373
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
74-
with open(test_cdx_dir + 'iana.cdx') as cdx:
74+
with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
7575
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
7676
print line
7777

pywb/utils/test/test_loaders.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
100
2626
2727
# no length specified, read full amount requested
28-
>>> len(BlockLoader().load('file://' + test_cdx_dir + 'example.cdx', 0, -1).read(400))
28+
>>> len(BlockLoader().load(to_file_url(test_cdx_dir + 'example.cdx'), 0, -1).read(400))
2929
400
3030
3131
# HMAC Cookie Maker
@@ -56,14 +56,41 @@
5656
>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
5757
5858
>>> extract_client_cookie({}, 'y')
59+
60+
61+
# extract_post_query tests
62+
63+
# correct POST data
64+
>>> post_data = 'foo=bar&dir=%2Fbaz'
65+
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
66+
'foo=bar&dir=/baz'
67+
68+
# unsupported method
69+
>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
70+
71+
# unsupported type
72+
>>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
73+
74+
# invalid length
75+
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
76+
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
77+
78+
# length too short
79+
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
80+
'foo=bar&dir=%2'
81+
82+
# length too long
83+
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
84+
'foo=bar&dir=/baz'
5985
"""
6086

6187

6288
#=================================================================
6389
import re
90+
import os
6491
from io import BytesIO
65-
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
66-
from pywb.utils.loaders import LimitReader, extract_client_cookie
92+
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
93+
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
6794

6895
from pywb import get_test_dir
6996

@@ -82,7 +109,6 @@ def seek_read_full(seekable_reader, offset):
82109
return seekable_reader.readline()
83110

84111

85-
86112
if __name__ == "__main__":
87113
import doctest
88114
doctest.testmod()

pywb/utils/wbexception.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ def __init__(self, msg=None, url=None):
66
Exception.__init__(self, msg)
77
self.url = url
88

9-
def status(self):
10-
return '500 Internal Server Error'
9+
# Default Error Code
10+
# def status(self):
11+
# return '500 Internal Server Error'
1112

1213

1314
#=================================================================

0 commit comments

Comments
 (0)