Skip to content
This repository was archived by the owner on Feb 28, 2019. It is now read-only.

Commit 1576283

Browse files
committed
Great performance improvement (about 8x faster when rewriting text-like content)
1 parent ba5a018 commit 1576283

File tree

3 files changed

+61
-49
lines changed

3 files changed

+61
-49
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
### Project Special
22
/custom_func.py
33
/config.py
4+
/tests/sample
45
/ip_whitelist.txt
56
### Vim template
67
# swap

EasyWebsiteMirror.py

Lines changed: 37 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -3,15 +3,16 @@
33
import os
44

55
os.chdir(os.path.dirname(__file__))
6-
import requests
76
import traceback
87
from datetime import datetime, timedelta
98
import re
109
import base64
1110
import zlib
1211
from time import time
1312
from html import escape as html_escape
13+
import threading
1414
from urllib.parse import urljoin, urlsplit, urlunsplit
15+
import requests
1516
from flask import Flask, request, make_response, Response, redirect
1617
from ColorfulPyPrint import * # TODO: Migrate logging tools to the stdlib
1718

@@ -41,7 +42,7 @@
4142
errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
4243
local_cache_enable = False
4344

44-
__VERSION__ = '0.15.1-dev'
45+
__VERSION__ = '0.16.1-dev'
4546
__author__ = 'Aploium <i@z.codes>'
4647
static_file_extensions_list = set(static_file_extensions_list)
4748
external_domains_set = set(external_domains or [])
@@ -52,6 +53,10 @@
5253
myurl_prefix_escaped = myurl_prefix.replace('/', r'\/')
5354
cdn_domains_number = len(CDN_domains)
5455

56+
# ## thread local var ##
57+
thread_local = threading.local()
58+
thread_local.start_time = None
59+
5560
# ########## Handle dependencies #############
5661
if not enable_static_resource_CDN:
5762
mime_based_static_resource_CDN = False
@@ -110,14 +115,7 @@
110115
)
111116
regex_extract_base64_from_embedded_url = re.compile(
112117
r'_ewm0(?P<gzip>z?)_\.(?P<b64>[a-zA-Z0-9-_]+=*)\._ewm1_\.[a-zA-Z\d]+\b')
113-
# Basic url rewriter for external sites, see function response_text_rewrite()
114-
regex_basic_ext_url_rewriter = {}
115-
regex_basic_ext_url_esc_rewriter = {}
116-
for _domain in external_domains:
117-
regex_basic_ext_url_rewriter[_domain] = re.compile(r'(https?:)?//' + re.escape(_domain), flags=re.IGNORECASE)
118-
# TODO: Combine it together with regex_basic_ext_url_rewriter
119-
regex_basic_ext_url_esc_rewriter[_domain] = re.compile(r'(https?:)?\\/\\/' + re.escape(_domain),
120-
flags=re.IGNORECASE)
118+
121119
# Response Cookies Rewriter, see response_cookie_rewrite()
122120
regex_cookie_rewriter = re.compile(r'\bdomain=(\.?([\w-]+\.)+\w+)\b', flags=re.IGNORECASE)
123121
# Request Domains Rewriter, see rewrite_client_requests_text()
@@ -419,6 +417,7 @@ def put_response_to_local_cache(url, our_resp, req, remote_resp):
419417
def try_get_cached_response(url, client_header):
420418
"""
421419
420+
:param url: real url with query string
422421
:type client_header: dict
423422
"""
424423
# Only use cache when client use GET
@@ -484,10 +483,9 @@ def regex_url_reassemble(match_obj):
484483
# only url(something) and @import are allowed to be unquoted
485484
or ('url' not in prefix and 'import' not in prefix) and (not quote_left or quote_right == ')')
486485
# for "key":"value" type replace, we must have at least one '/' in url path (for the value to be regard as url)
487-
or (':' in prefix and '/' not in path)):
486+
or (':' in prefix and '/' not in path)
487+
):
488488
return whole_match_string
489-
else:
490-
url_rewrite_cache_miss_count += 1
491489

492490
remote_path = request.path
493491
if request.path[:11] == '/extdomains':
@@ -564,6 +562,7 @@ def regex_url_reassemble(match_obj):
564562
# write the adv rewrite cache only if we disable CDN or we known whether this url is CDN-able
565563
if not mime_based_static_resource_CDN or _we_knew_this_url:
566564
url_rewrite_cache[match_obj.group()] = reassembled # write cache
565+
url_rewrite_cache_miss_count += 1
567566

568567
return reassembled
569568

@@ -661,6 +660,7 @@ def copy_response(requests_response_obj, content=b''):
661660
return resp
662661

663662

663+
# noinspection PyProtectedMember
664664
def response_cookies_deep_copy(req_obj):
665665
"""
666666
It's a BAD hack to get RAW cookies headers, but so far, we don't have better way.
@@ -711,8 +711,8 @@ def response_content_rewrite(remote_resp_obj):
711711
if custom_text_rewriter_enable and content_mime == 'text/html':
712712
resp_text2 = custom_response_html_rewriter(resp_text)
713713
resp_text = resp_text2
714-
except Exception as e: # just print err and fallback to normal rewrite
715-
errprint('Custom Rewrite Function "custom_response_html_rewriter(text)" in custom_func.py ERROR', e)
714+
except Exception as _e: # just print err and fallback to normal rewrite
715+
errprint('Custom Rewrite Function "custom_response_html_rewriter(text)" in custom_func.py ERROR', _e)
716716
traceback.print_exc()
717717

718718
# then do the normal rewrites
@@ -751,28 +751,18 @@ def response_text_rewrite(resp_text):
751751
resp_text = resp_text.replace(r'https:\/\/' + domain, # TODO: Combine it with non-escaped version
752752
myurl_prefix_escaped + r'\/extdomains\/' + 'https-' + domain)
753753
# Implicit schemes replace, will be replaced to the same as `my_host_scheme`, unless forced
754-
resp_text = regex_basic_ext_url_rewriter[domain].sub(
755-
'{0}{1}/extdomains/{2}{3}'.format(
756-
my_host_scheme,
757-
my_host_name,
758-
('https-' if ('NONE' != force_https_domains)
759-
and (
760-
'ALL' == force_https_domains or domain in force_https_domains
761-
) else ''),
762-
domain),
763-
resp_text
764-
)
765754

766-
resp_text = regex_basic_ext_url_esc_rewriter[domain].sub( # TODO: Combine it with non-escaped version
767-
'{0}\\/extdomains\\/{1}{2}'.format(
768-
myurl_prefix_escaped,
769-
('https-' if ('NONE' != force_https_domains)
770-
and (
771-
'ALL' == force_https_domains or domain in force_https_domains
772-
) else ''),
773-
domain),
774-
resp_text
775-
)
755+
buff = '{0}/extdomains/{1}{2}'.format(
756+
myurl_prefix,
757+
('https-' if ('NONE' != force_https_domains)
758+
and (
759+
'ALL' == force_https_domains or domain in force_https_domains
760+
) else ''),
761+
domain)
762+
resp_text = resp_text.replace('http://' + domain, buff, )
763+
resp_text = resp_text.replace('http:\\/\\/' + domain, buff.replace('/', r'\/'))
764+
resp_text = resp_text.replace('//' + domain, buff)
765+
resp_text = resp_text.replace('\\/\\/' + domain, buff.replace('/', r'\/'), )
776766

777767
# rewrite "foo.domain.tld" and 'foo.domain.tld'
778768
resp_text = resp_text.replace('"%s"' % domain, '\"' + my_host_name + '/extdomains/' + domain + '\"')
@@ -887,7 +877,7 @@ def send_request(url, method='GET', headers=None, param_get=None, data=None):
887877
return r, req_time
888878

889879

890-
def request_remote_site_and_parse(actual_request_url, start_time=None):
880+
def request_remote_site_and_parse(actual_request_url):
891881
if verbose_level >= 3: dbgprint('actual_request_url:', actual_request_url)
892882

893883
if mime_based_static_resource_CDN:
@@ -909,8 +899,8 @@ def request_remote_site_and_parse(actual_request_url, start_time=None):
909899
resp = try_get_cached_response(actual_request_url, client_header)
910900
if resp is not None:
911901
dbgprint('CacheHit,Return')
912-
if start_time is not None:
913-
resp.headers.set('X-CP-Time', "%.4f" % (time() - start_time))
902+
if thread_local.start_time is not None:
903+
resp.headers.set('X-CP-Time', "%.4f" % (time() - thread_local.start_time))
914904
return resp # If cache hit, just skip next steps
915905

916906
try: # send request to remote server
@@ -950,8 +940,8 @@ def request_remote_site_and_parse(actual_request_url, start_time=None):
950940

951941
if local_cache_enable: # storge entire our server's response (headers included)
952942
put_response_to_local_cache(actual_request_url, resp, request, r)
953-
if start_time is not None:
954-
resp.headers.add('X-CP-Time', "%.4f" % (time() - start_time - req_time))
943+
if thread_local.start_time is not None:
944+
resp.headers.add('X-CP-Time', "%.4f" % (time() - thread_local.start_time - req_time))
955945
return resp
956946

957947

@@ -975,9 +965,8 @@ def filter_client_request():
975965
if verbose_level >= 3: dbgprint('add to ip_whitelist because cookies:', request.remote_addr)
976966
else:
977967
return redirect(
978-
"/ip_ban_verify_page?origin="
979-
+ base64.urlsafe_b64encode(str(request.url).encode(encoding='utf-8')).decode()
980-
, code=302)
968+
"/ip_ban_verify_page?origin=" + base64.urlsafe_b64encode(str(request.url).encode(encoding='utf-8')).decode(),
969+
code=302)
981970

982971
return None
983972

@@ -1003,7 +992,6 @@ def rewrite_client_request():
1003992
try:
1004993
real_url = extract_real_url_from_embedded_url(request.url)
1005994
if real_url is not None:
1006-
global request
1007995
request.url = real_url
1008996
request.path = urlsplit(real_url).path
1009997
except:
@@ -1120,7 +1108,7 @@ def ip_ban_verify_page():
11201108
@app.route('/extdomains/<path:hostname>', methods=['GET', 'POST'])
11211109
@app.route('/extdomains/<path:hostname>/<path:extpath>', methods=['GET', 'POST'])
11221110
def get_external_site(hostname, extpath='/'):
1123-
start_time = time() # to display compute time
1111+
thread_local.start_time = time() # to display compute time
11241112
# pre-filter client's request
11251113
filter_or_rewrite_result = filter_client_request() or is_client_request_need_redirect()
11261114

@@ -1145,13 +1133,13 @@ def get_external_site(hostname, extpath='/'):
11451133
if verbose_level >= 3: dbgprint('after extract, url:', request.url, ' path:', request.path)
11461134
actual_request_url = urljoin(urljoin(scheme + hostname, extpath), '?' + urlsplit(request.url).query)
11471135

1148-
return request_remote_site_and_parse(actual_request_url, start_time)
1136+
return request_remote_site_and_parse(actual_request_url)
11491137

11501138

11511139
@app.route('/', methods=['GET', 'POST'])
11521140
@app.route('/<path:input_path>', methods=['GET', 'POST'])
11531141
def get_main_site(input_path='/'):
1154-
start_time = time() # to display compute time
1142+
thread_local.start_time = time() # to display compute time
11551143
# pre-filter client's request
11561144
filter_or_rewrite_result = filter_client_request() or is_client_request_need_redirect()
11571145
if filter_or_rewrite_result is not None:
@@ -1165,7 +1153,7 @@ def get_main_site(input_path='/'):
11651153

11661154
actual_request_url = urljoin(target_scheme + target_domain, extract_url_path_and_query(request.url))
11671155

1168-
return request_remote_site_and_parse(actual_request_url, start_time)
1156+
return request_remote_site_and_parse(actual_request_url)
11691157

11701158

11711159
# ################# End Flask #################

tests/regex_rewriter_test.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# coding=utf-8
22
import re
3+
from time import time
34
from config import *
45
from EasyWebsiteMirror import regex_adv_url_rewriter, regex_url_reassemble, \
56
static_file_extensions_list, external_domains_set, allowed_domains_set, myurl_prefix, cdn_domains_number, \
@@ -185,3 +186,25 @@ class DbgRequest:
185186
infoprint('All', len(test_cases), 'tests passed')
186187
else:
187188
errprint('Failed in ', fail_count, 'tests')
189+
190+
infoprint('Begin Performance Test')
191+
192+
with open(os.path.join(os.path.dirname(__file__),'sample', 'google_home.html'),'r') as fp:
193+
buff = fp.read()
194+
try:
195+
regex_adv_url_rewriter.cache_clear()
196+
except:
197+
pass
198+
start_time = time()
199+
regex_adv_url_rewriter.sub(regex_url_reassemble,buff)
200+
infoprint('google_home.html',time()-start_time)
201+
202+
with open(os.path.join(os.path.dirname(__file__),'sample', 'google_script.js'),'r') as fp:
203+
buff = fp.read()
204+
try:
205+
regex_adv_url_rewriter.cache_clear()
206+
except:
207+
pass
208+
start_time = time()
209+
regex_adv_url_rewriter.sub(regex_url_reassemble,buff)
210+
infoprint('google_script.js',time()-start_time)

0 commit comments

Comments
 (0)