Merge branch 'develop' for 0.6.4

ikreymer · ikreymer · commit 71a8abe9c399 · 2014-11-06T00:34:32.000-08:00
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,3 +1,19 @@
+pywb 0.6.4 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* Ignore bad multiline headers in warc.
+
+* Rewrite fix: Don't parse html entities in HTML rewriter.
+
+* Ensure cdx iterator is closed when reading.
+
+* Rewrite fix: remove pywb prefix from any query params.
+
+* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls.
+
+* WARC metadata and resource records are included in cdx from cdx-indexer by default.
+
+
 pywb 0.6.3 changelist
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/README.rst b/README.rst
@@ -1,10 +1,10 @@
-PyWb 0.6.3
+PyWb 0.6.4
 ==========
 
 .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
       :target: https://travis-ci.org/ikreymer/pywb
-.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master
-      :target: https://coveralls.io/r/ikreymer/pywb?branch=master
+.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
+      :target: https://coveralls.io/r/ikreymer/pywb?branch=develop
 .. image:: https://img.shields.io/gratipay/ikreymer.svg
       :target: https://www.gratipay.com/ikreymer/
       
diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py
@@ -28,8 +28,17 @@ def __init__(self, filename):
         self.filename = filename
 
     def load_cdx(self, query):
-        source = open(self.filename)
-        return iter_range(source, query.key, query.end_key)
+        def do_open():
+            try:
+                source = open(self.filename)
+                gen = iter_range(source, query.key, query.end_key)
+                for line in gen:
+                    yield line
+            finally:
+                source.close()
+
+        return do_open()
+        #return iter_range(do_open(), query.key, query.end_key)
 
     def __str__(self):
         return 'CDX File - ' + self.filename
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
@@ -78,6 +78,8 @@ def __init__(self, env,
                                                  rel_prefix,
                                                  env.get('SCRIPT_NAME', '/'),
                                                  cookie_scope)
+
+            self.urlrewriter.deprefix_url()
         else:
         # no wb_url, just store blank wb_url
             self.wb_url = None
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
@@ -136,9 +136,9 @@ def handle_exception(self, env, exc, print_trace):
             err_details = None
 
         if error_view:
-            if err_url:
+            if err_url and isinstance(err_url, str):
                 err_url = err_url.decode('utf-8', 'ignore')
-            if err_msg:
+            if err_msg and isinstance(err_msg, str):
                 err_msg = err_msg.decode('utf-8', 'ignore')
 
             return error_view.render_response(exc_type=type(exc).__name__,
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
@@ -263,10 +263,20 @@ def _internal_close(self):  # pragma: no cover
 
 #=================================================================
 class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
+    PARSETAG = re.compile('[<]')
+
     def __init__(self, *args, **kwargs):
         HTMLParser.__init__(self)
         super(HTMLRewriter, self).__init__(*args, **kwargs)
 
+    def reset(self):
+        HTMLParser.reset(self)
+        self.interesting = self.PARSETAG
+
+    def clear_cdata_mode(self):
+        HTMLParser.clear_cdata_mode(self)
+        self.interesting = self.PARSETAG
+
     def feed(self, string):
         try:
             HTMLParser.feed(self, string)
@@ -311,11 +321,12 @@ def handle_endtag(self, tag):
     def handle_data(self, data):
         self.parse_data(data)
 
-    def handle_entityref(self, data):
-        self.out.write('&' + data + ';')
-
-    def handle_charref(self, data):
-        self.out.write('&#' + data + ';')
+    # overriding regex so that these are no longer called
+    #def handle_entityref(self, data):
+    #    self.out.write('&' + data + ';')
+    #
+    #def handle_charref(self, data):
+    #    self.out.write('&#' + data + ';')
 
     def handle_comment(self, data):
         self.out.write('<!--')
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
@@ -111,7 +111,8 @@ class JSLinkOnlyRewriter(RegexRewriter):
     JS Rewriter which rewrites absolute http://, https:// and // urls
     at the beginning of a string
     """
-    JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
+    #JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])'
+    JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])'
 
     def __init__(self, rewriter, rules=[]):
         rules = rules + [
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
@@ -28,8 +28,11 @@
 <base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
 
 # HTML Entities
->>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
-<a href="">&rsaquo; &nbsp; &#62;</div>
+>>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
+<a href="">&rsaquo; &nbsp; &#62; &#63</div>
+
+>>> parse('<div>X&Y</div> </div>X&Y;</div>')
+<div>X&Y</div> </div>X&Y;</div>
 
 # Don't rewrite anchors
 >>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py
@@ -61,6 +61,9 @@
 >>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
 '&quot;/web/20131010/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
 
+>>> _test_js('"http:\/\/sub-site.example.com\/path-dashes\/path_other\/foo_bar.txt"')
+'"/web/20131010/http:\\/\\/sub-site.example.com\\/path-dashes\\/path_other\\/foo_bar.txt"'
+
 
 #=================================================================
 # XML Rewriting
diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py
@@ -105,10 +105,10 @@ def test_example_1():
     assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
 
 def test_example_2_redirect():
-    status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter)
+    status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter)
 
     # redirect, no content
-    assert status_headers.get_statuscode() == '301'
+    assert status_headers.get_statuscode() == '302'
     assert len(buff) == 0
 
 
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
@@ -74,6 +74,18 @@
 >>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_new_url(timestamp='20131024')
 '/123/20131024id_/http://example.com/file/path/blah.html'
 
+# deprefix tests
+>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/20141226/http://example.com/', '/pywb/', 'http://localhost:8080/pywb/')
+'http://example.com/file/path/blah.html?param=http://example.com/'
+
+>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/if_/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
+'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
+
+>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
+'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
+
+>>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b&param2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/')
+'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
 
 # HttpsUrlRewriter tests
 >>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
@@ -86,13 +98,22 @@
 
 
 from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
-
+import urllib
 
 def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
     rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
     return rewriter.rewrite(rel_url, mod)
 
 
+def do_deprefix(url, rel_prefix, full_prefix):
+    encoded = urllib.quote_plus(full_prefix)
+    url = url.replace(full_prefix, encoded)
+
+    rewriter = UrlRewriter(url, rel_prefix, full_prefix)
+    url = rewriter.deprefix_url()
+    return urllib.unquote_plus(url)
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
@@ -1,4 +1,3 @@
-import copy
 import urlparse
 
 from wburl import WbUrl
@@ -88,6 +87,9 @@ def get_cookie_rewriter(self, scope=None):
         cls = get_cookie_rewriter(scope)
         return cls(self)
 
+    def deprefix_url(self):
+        return self.wburl.deprefix_url(self.full_prefix)
+
     def __repr__(self):
         return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
 
@@ -150,3 +152,6 @@ def rebase_rewriter(self, new_url):
 
     def get_cookie_rewriter(self, scope=None):
         return None
+
+    def deprefix_url(self):
+        return self.wburl.url
diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py
@@ -39,7 +39,7 @@
 """
 
 import re
-
+import urllib
 
 #=================================================================
 class BaseWbUrl(object):
@@ -149,6 +149,14 @@ def set_replay_timestamp(self, timestamp):
         self.timestamp = timestamp
         self.type = self.REPLAY
 
+
+    def deprefix_url(self, prefix):
+        prefix = urllib.quote_plus(prefix)
+        rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
+        new_url = re.sub(rex_query, '=', self.url)
+        self.url = new_url
+        return self.url
+
     # Str Representation
     # ====================
     def to_str(self, **overrides):
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
@@ -11,18 +11,18 @@ rules:
     # facebook rules
     #=================================================================
     - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
-        
+
       fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
-              
+
     - url_prefix: 'com,facebook)/ajax/ufi/'
-      
+
       fuzzy_lookup:
           - ft_ent_identifier
           - lsd
 
     - url_prefix: 'com,facebook)/ajax/chat/hovercard/sidebar.php'
 
-      fuzzy_lookup: 
+      fuzzy_lookup:
           - ids[0]
 
     - url_prefix: 'com,facebook)/login.php'
@@ -82,20 +82,21 @@ rules:
     #=================================================================
 
     - url_prefix: 'com,google,plus)/_/stream/getactivities'
-    
-      fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
- 
+
+    #      fuzzy_lookup: '(egk[^"]+)?.*(f.sid=[^&]+)'
+      fuzzy_lookup: 'f.req=.*\]\]\]\,\"([^"]+).*(f.sid=[^&]+)'
+
     - url_prefix: 'com,google,plus)/_/stream/squarestream'
-    
+
       fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
- 
+
     - url_prefix: 'com,google,plus)/_/communities/rt/landing'
-    
+
       fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
-    
+
 
     - url_prefix: 'com,google,plus)/_/'
-     
+
       fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
 
 
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
@@ -708,11 +708,11 @@ WB_wombat_init = (function() {
     }
 
     //============================================
-    function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) {
+    function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
         wb_replay_prefix = replay_prefix;
         
         if (wb_replay_prefix) {
-            wb_replay_date_prefix = replay_prefix + capture_date + "/";
+            wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
             
             if (capture_date.length > 0) {
                 wb_capture_date_part = "/" + capture_date + "/";
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
@@ -7,7 +7,8 @@
                  "{{ cdx['timestamp'] if include_ts else ''}}",
                  "{{ urlsplit.scheme }}",
                  "{{ urlsplit.netloc }}",
-                 "{{ cdx.timestamp | format_ts('%s') }}");
+                 "{{ cdx.timestamp | format_ts('%s') }}",
+                 "{{ wbrequest.wb_url.mod }}");
 </script>
 {% endif %}
 <script>
diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
@@ -169,7 +169,8 @@ def parse(self, stream, full_statusline=None):
 
             # append continuation lines, if any
             while next_line and next_line.startswith((' ', '\t')):
-                value += next_line
+                if value is not None:
+                    value += next_line
                 next_line, total_read = _strip_count(stream.readline(),
                                                      total_read)
 
diff --git a/pywb/utils/test/test_statusandheaders.py b/pywb/utils/test/test_statusandheaders.py
@@ -32,6 +32,10 @@
 # empty
 >>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
 StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
+
+
+>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
+StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
 """
 
 
@@ -54,6 +58,14 @@
 
 """
 
+status_headers_3 = "\
+HTTP/1.0 204 Empty\r\n\
+Content-Type: Value\r\n\
+%Invalid%\r\n\
+\tMultiline\r\n\
+Content-Length: 0\r\n\
+\r\n"
+
 
 if __name__ == "__main__":
     import doctest
diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py
diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py
diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py
diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py
diff --git a/setup.py b/setup.py
diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py