Skip to content

Commit 39e824c

Browse files
committed
live rewrite proxy: decouple having an http/https proxy from recording;
move youtubedl wrapper calls and metadata add calls to the live rewrite proxy class for easier extension. Closes #141; also improves #136.
1 parent c7224ec commit 39e824c

File tree

3 files changed

+77
-63
lines changed

3 files changed

+77
-63
lines changed

pywb/rewrite/rewrite_live.py

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ def __init__(self, is_framed_replay=False, proxies=None):
3838
else:
3939
logging.debug('Live Rewrite Direct (no proxy)')
4040

41+
def is_recording(self):
42+
return self.proxies is not None
43+
4144
def fetch_local_file(self, uri):
4245
#fh = open(uri)
4346
fh = LocalFileLoader().load(uri)
@@ -123,14 +126,14 @@ def fetch_http(self, url,
123126
env=None,
124127
req_headers=None,
125128
follow_redirects=False,
126-
ignore_proxies=False,
129+
skip_recording=False,
127130
verify=True):
128131

129132
method = 'GET'
130133
data = None
131134

132135
proxies = None
133-
if not ignore_proxies:
136+
if not skip_recording:
134137
proxies = self.proxies
135138

136139
if not req_headers:
@@ -174,7 +177,7 @@ def fetch_request(self, url, urlrewriter,
174177
req_headers={},
175178
timestamp=None,
176179
follow_redirects=False,
177-
ignore_proxies=False,
180+
skip_recording=False,
178181
verify=True,
179182
remote_only=True):
180183

@@ -203,7 +206,7 @@ def fetch_request(self, url, urlrewriter,
203206
(status_headers, stream) = self.fetch_http(url, urlkey, env,
204207
req_headers,
205208
follow_redirects,
206-
ignore_proxies,
209+
skip_recording,
207210
verify)
208211
else:
209212
(status_headers, stream) = self.fetch_local_file(url)
@@ -232,6 +235,26 @@ def fetch_request(self, url, urlrewriter,
232235

233236
return result
234237

238+
def fetch_async(self, url, headers):
239+
resp = self.live_request(method='GET',
240+
url=url,
241+
headers=headers,
242+
proxies=self.proxies,
243+
verify=False,
244+
stream=True)
245+
246+
# don't actually read whole response,
247+
# proxy response for writing it
248+
resp.close()
249+
250+
def add_metadata(self, url, headers, data):
251+
return self.live_request(method='PUTMETA',
252+
url=url,
253+
data=data,
254+
headers=headers,
255+
proxies=self.proxies,
256+
verify=False)
257+
235258
def get_rewritten(self, *args, **kwargs):
236259
result = self.fetch_request(*args, **kwargs)
237260

@@ -240,3 +263,35 @@ def get_rewritten(self, *args, **kwargs):
240263
buff = ''.join(gen)
241264

242265
return (status_headers, buff)
266+
267+
def get_video_info(self, url):
268+
return youtubedl.extract_info(url)
269+
270+
271+
#=================================================================
272+
class YoutubeDLWrapper(object): #pragma: no cover
273+
""" YoutubeDL wrapper, inits youtubee-dl if it is available
274+
"""
275+
def __init__(self):
276+
try:
277+
from youtube_dl import YoutubeDL as YoutubeDL
278+
except ImportError:
279+
self.ydl = None
280+
return
281+
282+
self.ydl = YoutubeDL(dict(simulate=True,
283+
youtube_include_dash_manifest=False))
284+
self.ydl.add_default_info_extractors()
285+
286+
def extract_info(self, url):
287+
print('YDL', self.ydl)
288+
if not self.ydl:
289+
return None
290+
291+
info = self.ydl.extract_info(url)
292+
return info
293+
294+
295+
#=================================================================
296+
youtubedl = YoutubeDLWrapper()
297+

pywb/webapp/live_rewrite_handler.py

Lines changed: 17 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from pywb.utils.wbexception import WbException
1212

1313
import json
14-
import requests
1514
import hashlib
1615

1716

@@ -28,19 +27,17 @@ class RewriteHandler(SearchPageWbUrlHandler):
2827

2928
YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'
3029

31-
youtubedl = None
32-
3330
def __init__(self, config):
3431
super(RewriteHandler, self).__init__(config)
3532

3633
proxyhostport = config.get('proxyhostport')
3734

3835
live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)
3936

40-
self.rewriter = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
41-
proxies=proxyhostport)
37+
self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
38+
proxies=proxyhostport)
4239

43-
self.proxies = self.rewriter.proxies
40+
self.recording = self.live_fetcher.is_recording()
4441

4542
self.head_insert_view = HeadInsertView.init_from_config(config)
4643

@@ -73,7 +70,7 @@ def handle_request(self, wbrequest):
7370
def _live_request_headers(self, wbrequest):
7471
return {}
7572

76-
def _ignore_proxies(self, wbrequest):
73+
def _skip_recording(self, wbrequest):
7774
return False
7875

7976
def render_content(self, wbrequest):
@@ -87,7 +84,7 @@ def render_content(self, wbrequest):
8784
if ref_wburl_str:
8885
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
8986

90-
ignore_proxies = self._ignore_proxies(wbrequest)
87+
skip_recording = self._skip_recording(wbrequest)
9188

9289
use_206 = False
9390
url = None
@@ -96,7 +93,7 @@ def render_content(self, wbrequest):
9693
readd_range = False
9794
cache_key = None
9895

99-
if self.proxies and not ignore_proxies:
96+
if self.recording and not skip_recording:
10097
rangeres = wbrequest.extract_range()
10198

10299
if rangeres:
@@ -110,17 +107,17 @@ def render_content(self, wbrequest):
110107
readd_range = True
111108
else:
112109
# disables proxy
113-
ignore_proxies = True
110+
skip_recording = True
114111

115112
# sets cache_key only if not already cached
116113
cache_key = self._get_cache_key('r:', url)
117114

118-
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
115+
result = self.live_fetcher.fetch_request(wbrequest.wb_url.url,
119116
wbrequest.urlrewriter,
120117
head_insert_func=head_insert_func,
121118
req_headers=req_headers,
122119
env=wbrequest.env,
123-
ignore_proxies=ignore_proxies,
120+
skip_recording=skip_recording,
124121
verify=self.verify)
125122

126123
wbresponse = self._make_response(wbrequest, *result)
@@ -135,8 +132,8 @@ def render_content(self, wbrequest):
135132
except (ValueError, TypeError):
136133
pass
137134

138-
if cache_key:
139-
self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)
135+
if self.recording and cache_key:
136+
self._add_rec_ping(cache_key, url, wbrequest, wbresponse)
140137

141138
if rangeres:
142139
referrer = wbrequest.env.get('REL_REFERER')
@@ -183,7 +180,7 @@ def create_cache_key(prefix, url):
183180
key = prefix + key
184181
return key
185182

186-
def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
183+
def _add_rec_ping(self, key, url, wbrequest, wbresponse):
187184
def do_ping():
188185
headers = self._live_request_headers(wbrequest)
189186
headers['Connection'] = 'close'
@@ -192,15 +189,8 @@ def do_ping():
192189
# mark as pinged
193190
self._cache[key] = '1'
194191

195-
resp = requests.get(url=url,
196-
headers=headers,
197-
proxies=self.proxies,
198-
verify=False,
199-
stream=True)
192+
self.live_fetcher.fetch_async(url, headers)
200193

201-
# don't actually read whole response,
202-
# proxy response for writing it
203-
resp.close()
204194
except:
205195
del self._cache[key]
206196
raise
@@ -219,20 +209,17 @@ def wrap_buff_gen(gen):
219209
return wbresponse
220210

221211
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
222-
if not self.youtubedl:
223-
self.youtubedl = YoutubeDLWrapper()
224-
225212
if not video_url:
226213
video_url = wbrequest.wb_url.url
227214

228215
if not info_url:
229216
info_url = wbrequest.wb_url.url
230217

231218
cache_key = None
232-
if self.proxies:
219+
if self.recording:
233220
cache_key = self._get_cache_key('v:', video_url)
234221

235-
info = self.youtubedl.extract_info(video_url)
222+
info = self.live_fetcher.get_video_info(video_url)
236223
if info is None: #pragma: no cover
237224
msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
238225
'enable improved video proxy')
@@ -244,42 +231,14 @@ def _get_video_info(self, wbrequest, info_url=None, video_url=None):
244231
content_type = self.YT_DL_TYPE
245232
metadata = json.dumps(info)
246233

247-
if (self.proxies and cache_key):
234+
if (self.recording and cache_key):
248235
headers = self._live_request_headers(wbrequest)
249236
headers['Content-Type'] = content_type
250237

251238
info_url = HttpsUrlRewriter.remove_https(info_url)
252239

253-
response = requests.request(method='PUTMETA',
254-
url=info_url,
255-
data=metadata,
256-
headers=headers,
257-
proxies=self.proxies,
258-
verify=False)
240+
response = self.live_fetcher.add_metadata(info_url, headers, metadata)
259241

260242
self._cache[cache_key] = '1'
261243

262244
return WbResponse.text_response(metadata, content_type=content_type)
263-
264-
265-
#=================================================================
266-
class YoutubeDLWrapper(object): #pragma: no cover
267-
""" YoutubeDL wrapper, inits youtubee-dl if it is available
268-
"""
269-
def __init__(self):
270-
try:
271-
from youtube_dl import YoutubeDL as YoutubeDL
272-
except ImportError:
273-
self.ydl = None
274-
return
275-
276-
self.ydl = YoutubeDL(dict(simulate=True,
277-
youtube_include_dash_manifest=False))
278-
self.ydl.add_default_info_extractors()
279-
280-
def extract_info(self, url):
281-
if not self.ydl:
282-
return None
283-
284-
info = self.ydl.extract_info(url)
285-
return info

tests/test_live_proxy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def make_httpd(app):
7373

7474
config = dict(collections=dict(rewrite='$liveweb'),
7575
framed_replay=True,
76-
proxyhostport=server.proxy_dict)
76+
proxyhostport=server.proxy_str)
7777

7878
global cache
7979
cache = {}

0 commit comments

Comments
 (0)