Skip to content

Commit d9c5345

Browse files
committed
rewrite: add support for rewriting the Cookie request header, to support sites
which require a cookie to be set. The req_cookie_rewrite directive can be set in rules.yaml per url prefix as a list of match/replace regexes
1 parent df94e17 commit d9c5345

File tree

5 files changed

+99
-13
lines changed

5 files changed

+99
-13
lines changed

CHANGES.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ pywb 0.7.2 changelist
33

44
* Experiment with disabling DASH for YT
55

6+
* New ``req_cookie_rewrite`` rewrite directive to rewrite the outgoing ``Cookie`` header; it can be used to pin a specific cookie value for a url prefix.
7+
68

79
pywb 0.7.1 changelist
810
~~~~~~~~~~~~~~~~~~~~~
@@ -25,7 +27,7 @@ pywb 0.7.1 changelist
2527
- setAttribute override
2628
- Date override sets date to replay timestamp
2729
- Image() object override
28-
- ability to disable dynamic attribute rewriting by setting `_no_rewrite` on an element.
30+
- ability to disable dynamic attribute rewriting by setting ``_no_rewrite`` on an element.
2931

3032
* Type detection: resolve conflict between text/html that is served under js_ mod, resolve if html or js.
3133

pywb/rewrite/rewrite_live.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,11 @@ def fetch_local_file(self, uri):
5050

5151
return (status_headers, stream)
5252

53-
def translate_headers(self, url, env):
53+
def translate_headers(self, url, urlkey, env):
5454
headers = {}
5555

5656
splits = urlsplit(url)
57+
has_cookies = False
5758

5859
for name, value in env.iteritems():
5960
if name == 'HTTP_HOST':
@@ -73,6 +74,11 @@ def translate_headers(self, url, env):
7374
elif name == 'HTTP_REFERER':
7475
continue
7576

77+
elif name == 'HTTP_COOKIE':
78+
name = 'Cookie'
79+
value = self._req_cookie_rewrite(urlkey, value)
80+
has_cookies = True
81+
7682
elif name.startswith('HTTP_'):
7783
name = name[5:].title().replace('_', '-')
7884

@@ -87,9 +93,28 @@ def translate_headers(self, url, env):
8793
if value:
8894
headers[name] = value
8995

96+
if not has_cookies:
97+
value = self._req_cookie_rewrite(urlkey, '')
98+
if value:
99+
headers['Cookie'] = value
100+
90101
return headers
91102

103+
def _req_cookie_rewrite(self, urlkey, value):
104+
rule = self.rewriter.ruleset.get_first_match(urlkey)
105+
if not rule or not rule.req_cookie_rewrite:
106+
return value
107+
108+
for cr in rule.req_cookie_rewrite:
109+
try:
110+
value = cr['rx'].sub(cr['replace'], value)
111+
except KeyError:
112+
pass
113+
114+
return value
115+
92116
def fetch_http(self, url,
117+
urlkey=None,
93118
env=None,
94119
req_headers=None,
95120
follow_redirects=False,
@@ -109,7 +134,7 @@ def fetch_http(self, url,
109134
method = env['REQUEST_METHOD'].upper()
110135
input_ = env['wsgi.input']
111136

112-
req_headers.update(self.translate_headers(url, env))
137+
req_headers.update(self.translate_headers(url, urlkey, env))
113138

114139
if method in ('POST', 'PUT'):
115140
len_ = env.get('CONTENT_LENGTH')
@@ -155,17 +180,18 @@ def fetch_request(self, url, urlrewriter,
155180
if url.startswith('//'):
156181
url = 'http:' + url
157182

183+
# explicit urlkey may be passed in (say for testing)
184+
if not urlkey:
185+
urlkey = canonicalize(url)
186+
158187
if is_http(url):
159-
(status_headers, stream) = self.fetch_http(url, env, req_headers,
188+
(status_headers, stream) = self.fetch_http(url, urlkey, env,
189+
req_headers,
160190
follow_redirects,
161191
ignore_proxies)
162192
else:
163193
(status_headers, stream) = self.fetch_local_file(url)
164194

165-
# explicit urlkey may be passed in (say for testing)
166-
if not urlkey:
167-
urlkey = canonicalize(url)
168-
169195
if timestamp is None:
170196
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
171197

pywb/rewrite/rewriterules.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from html_rewriter import HTMLRewriter
99

1010
import itertools
11+
import re
1112

1213

1314
#=================================================================
@@ -47,6 +48,12 @@ def __init__(self, url_prefix, config={}):
4748
# cookie rewrite scope
4849
self.cookie_scope = config.get('cookie_scope', 'default')
4950

51+
req_cookie_rewrite = config.get('req_cookie_rewrite', [])
52+
for rc in req_cookie_rewrite:
53+
rc['rx'] = re.compile(rc.get('match', ''))
54+
55+
self.req_cookie_rewrite = req_cookie_rewrite
56+
5057
def _add_custom_regexs(self, field, config):
5158
regexs = config.get(field + '_regexs')
5259
if not regexs:

pywb/rewrite/test/test_rewrite_live.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,43 @@ def test_csrf_token_headers():
2222
rewriter = LiveRewriter()
2323
env = {'HTTP_X_CSRFTOKEN': 'wrong', 'HTTP_COOKIE': 'csrftoken=foobar'}
2424

25-
req_headers = rewriter.translate_headers('http://example.com/', env)
25+
req_headers = rewriter.translate_headers('http://example.com/', 'com,example)/', env)
2626

2727
assert req_headers == {'X-CSRFToken': 'foobar', 'Cookie': 'csrftoken=foobar'}
2828

29+
def test_req_cookie_rewrite_1():
30+
rewriter = LiveRewriter()
31+
env = {'HTTP_COOKIE': 'A=B'}
32+
33+
urlkey = 'example,example,test)/'
34+
url = 'test.example.example/'
35+
36+
req_headers = rewriter.translate_headers(url, urlkey, env)
37+
38+
assert req_headers == {'Cookie': 'A=B; FOO=&bar=1'}
39+
40+
def test_req_cookie_rewrite_2():
41+
rewriter = LiveRewriter()
42+
env = {'HTTP_COOKIE': 'FOO=goo'}
43+
44+
urlkey = 'example,example,test)/'
45+
url = 'test.example.example/'
46+
47+
req_headers = rewriter.translate_headers(url, urlkey, env)
48+
49+
assert req_headers == {'Cookie': 'FOO=&bar=1'}
50+
51+
def test_req_cookie_rewrite_3():
52+
rewriter = LiveRewriter()
53+
env = {}
54+
55+
urlkey = 'example,example,test)/'
56+
url = 'test.example.example/'
57+
58+
req_headers = rewriter.translate_headers(url, urlkey, env)
59+
60+
assert req_headers == {'Cookie': '; FOO=&bar=1'}
61+
2962
def test_local_1():
3063
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
3164
urlrewriter,

pywb/rules.yaml

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ rules:
162162
args:
163163
- id
164164
- itag
165-
- mime
165+
#- mime
166166

167167
filter:
168168
- '~urlkey:{0}'
@@ -187,15 +187,24 @@ rules:
187187
js_rewrite_location: location
188188

189189

190-
# watch config changes
191-
- url_prefix: 'com,youtube)/watch'
190+
# watch and embed config changes
191+
- url_prefix: 'com,youtube)/'
192192

193193
rewrite:
194-
195194
js_regexs:
196195
- match: 'ytplayer.load\(\);'
197196
replace: 'ytplayer.config.args.dash = "0"; ytplayer.config.args.dashmpd = ""; {0}'
198197

198+
- match: 'yt\.setConfig.*PLAYER_CONFIG.*args": {'
199+
replace: '{0} "dash": "0", dashmpd: "", '
200+
201+
req_cookie_rewrite:
202+
- match: '^(((?!PREF).)*)$'
203+
replace: '\1; PREF=f2=40000000'
204+
205+
- match: '(.*PREF=)([^ ;]*)(.*)'
206+
replace: '\1&f2=40000000\3'
207+
199208
# testing rules -- not for valid domain
200209
#=================================================================
201210
# this rule block is a non-existent prefix merely for testing
@@ -217,6 +226,15 @@ rules:
217226
rewrite:
218227
js_rewrite_location: urls
219228

229+
req_cookie_rewrite:
230+
- match: '^(((?!FOO).)*)$'
231+
replace: '\1; FOO=bar=1'
232+
233+
- match: '(.*FOO=)([^ ;]*)(.*)'
234+
replace: '\1&bar=1\3'
235+
236+
- match: ''
237+
invalid_: ''
220238

221239
# all domain rules -- fallback to this dataset
222240
#=================================================================

0 commit comments

Comments
 (0)