More docs on html-py-ever

konstin · konstin · commit 8a0efce6f8bf · 2018-09-07T23:11:15.000+02:00
diff --git a/html-py-ever/README.md b/html-py-ever/README.md
@@ -1,7 +1,83 @@
 # html-py-ever
 
-Using [html5ever](https://github.com/servo/html5ever) through [kuchiki](https://github.com/kuchiki-rs/kuchiki) to speed up html parsing and css-selecting.
+Demoing how to use [html5ever](https://github.com/servo/html5ever) through [kuchiki](https://github.com/kuchiki-rs/kuchiki) to speed up html parsing and css-selecting.
+
+## Usage
+
+`parse_file` and `parse_text` return a parsed `Document`, which then lets you select elements by css selectors using the `select` method. All elements are returned as strings.
 
 ## Benchmarking
 
 Create a python 3.6+ venv and activate it. Install html-py-ever in there (`python setup.py install`). To get a readable benchmark, run `test/run_all.py`. To get a real benchmark, run `pytest test_parsing.py` or `pytest test_selector.py`. Both have a `--benchmark-histogram` option.
+
+## Example benchmark results
+
+Running on Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz with python 3.6 and rustc 1.30.0-nightly (aaa170beb 2018-08-31)
+
+**run_all.py**
+
+```
+monty-python.html 1400
+Parse lxml  0.013675s 0.114107s  8.344x
+Parse py    0.013675s 0.191262s 13.986x
+Select lxml 0.004283s 0.001122s  3.818x
+Select py   0.004047s 0.001122s  3.608x
+empty.html 0
+Parse lxml  0.000050s 0.000250s  5.027x
+Parse py    0.000050s 0.000091s  1.834x
+Select lxml 0.000047s 0.000011s  4.452x
+Select py   0.000034s 0.000011s  3.263x
+small.html 0
+Parse lxml  0.000050s 0.000408s  8.221x
+Parse py    0.000050s 0.000341s  6.860x
+Select lxml 0.000048s 0.000006s  7.700x
+Select py   0.000116s 0.000006s 18.739x
+rust.html 733
+Parse lxml  0.034088s 0.269182s  7.897x
+Parse py    0.034088s 0.423923s 12.436x
+Select lxml 0.006814s 0.004962s  1.373x
+Select py   0.006792s 0.004962s  1.369x
+python.html 1518
+Parse lxml  0.134979s 1.440968s 10.675x
+Parse py    0.134979s 2.271023s 16.825x
+Select lxml 0.036732s 0.006711s  5.474x
+Select py   0.036882s 0.006711s  5.496x
+```
+
+**test_parsing.py**
+
+```
+------------------------------------------------------------------------------------------------------------------- benchmark: 10 tests -------------------------------------------------------------------------------------------------------------------
+Name (time in us)                                           Min                       Max                      Mean                 StdDev                    Median                    IQR            Outliers           OPS            Rounds  Iterations
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+test_bench_parsing_rust[empty.html]                      6.1110 (1.0)            513.7940 (1.0)              7.4792 (1.0)           9.5990 (1.0)              6.3950 (1.0)           0.2948 (1.0)      649;4746  133,704.3206 (1.0)       27203           1
+test_bench_parsing_rust[small.html]                     19.3520 (3.17)           788.8010 (1.54)            22.1472 (2.96)         16.8692 (1.76)            19.8700 (3.11)          0.5373 (1.82)     393;1818   45,152.4211 (0.34)      16177           1
+test_bench_parsing_python[empty.html]                   57.6250 (9.43)        38,060.2320 (74.08)           72.3809 (9.68)        457.3842 (47.65)           59.6890 (9.33)          3.0377 (10.31)      11;948   13,815.7902 (0.10)       6939           1
+test_bench_parsing_python[small.html]                  290.9070 (47.60)        2,750.8890 (5.35)           345.1972 (46.15)       178.1737 (18.56)          301.0480 (47.08)        26.8838 (91.21)     103;362    2,896.8951 (0.02)       2477           1
+test_bench_parsing_rust[monty-python.html]          12,943.2440 (>1000.0)     21,217.3930 (41.30)       13,930.9700 (>1000.0)   1,687.9115 (175.84)      13,393.0260 (>1000.0)     493.4407 (>1000.0)       6;7       71.7825 (0.00)         65           1
+test_bench_parsing_rust[rust.html]                  27,254.8300 (>1000.0)     44,283.6160 (86.19)       29,939.0300 (>1000.0)   3,770.0365 (392.75)      28,366.1800 (>1000.0)   2,199.8490 (>1000.0)       4;4       33.4012 (0.00)         30           1
+test_bench_parsing_rust[python.html]               117,097.9310 (>1000.0)    139,946.1370 (272.38)     124,982.5736 (>1000.0)   7,679.8512 (800.07)     124,375.9720 (>1000.0)  10,055.3265 (>1000.0)       2;0        8.0011 (0.00)          8           1
+test_bench_parsing_python[monty-python.html]       181,122.6270 (>1000.0)    221,371.7280 (430.86)     191,845.8776 (>1000.0)  16,849.9999 (>1000.0)    186,777.4470 (>1000.0)  15,766.5518 (>1000.0)       1;1        5.2125 (0.00)          5           1
+test_bench_parsing_python[rust.html]               384,658.8340 (>1000.0)    423,217.7400 (823.71)     406,878.9022 (>1000.0)  17,625.0831 (>1000.0)    413,173.2850 (>1000.0)  31,943.3840 (>1000.0)       1;0        2.4577 (0.00)          5           1
+test_bench_parsing_python[python.html]           2,195,261.3770 (>1000.0)  2,249,598.2990 (>1000.0)  2,221,196.6530 (>1000.0)  23,091.9237 (>1000.0)  2,212,574.4390 (>1000.0)  38,692.2310 (>1000.0)       2;0        0.4502 (0.00)          5           1
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+```
+
+**test_selector.py**
+
+```
+------------------------------------------------------------------------------------------------------------ benchmark: 10 tests -------------------------------------------------------------------------------------------------------------
+Name (time in us)                                         Min                    Max                   Mean                StdDev                 Median                   IQR            Outliers           OPS            Rounds  Iterations
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+test_bench_selector_rust[empty.html]                   1.3180 (1.0)          63.8790 (1.0)           1.5361 (1.0)          0.9402 (1.0)           1.4420 (1.0)          0.0630 (1.0)     1884;6084  651,005.8803 (1.0)       84775           1
+test_bench_selector_rust[small.html]                   1.5300 (1.16)        112.5220 (1.76)          1.7647 (1.15)         1.0319 (1.10)          1.6590 (1.15)         0.0630 (1.00)    2215;7135  566,666.9515 (0.87)      96507           1
+test_bench_selector_python[empty.html]                20.1260 (15.27)       532.0720 (8.33)         22.9150 (14.92)       12.8876 (13.71)        20.8190 (14.44)        0.5280 (8.38)     818;1965   43,639.4426 (0.07)      18434           1
+test_bench_selector_python[small.html]                26.5540 (20.15)       890.5700 (13.94)        29.7362 (19.36)       14.8236 (15.77)        27.4300 (19.02)        0.7265 (11.53)    762;2109   33,629.0076 (0.05)      17413           1
+test_bench_selector_rust[monty-python.html]          691.8140 (524.90)    2,925.7400 (45.80)       851.7575 (554.50)     222.7539 (236.93)      802.9160 (556.81)      79.2970 (>1000.0)     43;69    1,174.0430 (0.00)        843           1
+test_bench_selector_rust[rust.html]                1,220.5940 (926.10)    6,789.2340 (106.28)    1,509.8102 (982.90)     540.7908 (575.20)    1,352.9600 (938.25)     361.6030 (>1000.0)       8;6      662.3349 (0.00)        240           1
+test_bench_selector_python[monty-python.html]      3,851.9600 (>1000.0)   8,077.7510 (126.45)    4,260.0542 (>1000.0)    675.4977 (718.48)    4,063.3380 (>1000.0)    216.4488 (>1000.0)     20;26      234.7388 (0.00)        245           1
+test_bench_selector_python[rust.html]              6,437.3910 (>1000.0)  11,348.6070 (177.66)    7,033.6536 (>1000.0)  1,050.6394 (>1000.0)   6,739.6810 (>1000.0)    363.3680 (>1000.0)     12;13      142.1736 (0.00)        151           1
+test_bench_selector_rust[python.html]              6,504.3130 (>1000.0)  12,934.9650 (202.49)    7,557.5249 (>1000.0)  1,398.7101 (>1000.0)   6,976.7700 (>1000.0)    965.8090 (>1000.0)     17;16      132.3185 (0.00)        143           1
+test_bench_selector_python[python.html]           36,145.0260 (>1000.0)  46,582.5100 (729.23)   38,058.3009 (>1000.0)  2,960.4055 (>1000.0)  36,630.3450 (>1000.0)  1,389.9710 (>1000.0)       4;5       26.2755 (0.00)         23           1
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+```
diff --git a/html-py-ever/html_py_ever/__init__.py b/html-py-ever/html_py_ever/__init__.py
@@ -1 +1 @@
-from .html_py_ever import *
+from .html_py_ever import *
diff --git a/html-py-ever/requirements-dev.txt b/html-py-ever/requirements-dev.txt
@@ -3,4 +3,5 @@ setuptools-rust
 wheel
 pytest-benchmark[histogram]
 pytest
-
+beautifulsoup4
+lxml
diff --git a/html-py-ever/test/run_all.py b/html-py-ever/test/run_all.py
@@ -19,11 +19,11 @@ def rust(filename: str) -> Tuple[int, float, float]:
     return len(links), end_load - start_load, end_search - start_search
 
 
-def python(filename: str) -> Tuple[int, float, float]:
+def python(filename: str, parser: str) -> Tuple[int, float, float]:
     start_load = perf_counter()
     with open(filename) as fp:
         text = fp.read()
-    soup = BeautifulSoup(text, "html.parser")
+    soup = BeautifulSoup(text, parser)
 
     end_load = perf_counter()
     start_search = perf_counter()
@@ -37,11 +37,21 @@ def python(filename: str) -> Tuple[int, float, float]:
 def main():
     for filename in glob("*.html"):
         count_rs, parse_rs, select_rs = rust(filename)
-        count_py, parse_py, select_py = python(filename)
+        count_lxml, parse_lxml, select_lxml = python(filename, "lxml")
+        count_py, parse_py, select_py = python(filename, "html.parser")
+        assert count_rs == count_lxml
         assert count_rs == count_py
         print(f"{filename} {count_rs}")
-        print(f"Parse  {parse_rs:6f}s {parse_py:6f}s {parse_py/parse_rs:6.3f}x")
-        print(f"Select {select_py:6f}s {select_rs:6f}s {select_py/select_rs:6.3f}x")
+        print(
+            f"Parse lxml  {parse_rs:6f}s {parse_lxml:6f}s {parse_lxml/parse_rs:6.3f}x"
+        )
+        print(f"Parse py    {parse_rs:6f}s {parse_py:6f}s {parse_py/parse_rs:6.3f}x")
+        print(
+            f"Select lxml {select_lxml:6f}s {select_rs:6f}s {select_lxml/select_rs:6.3f}x"
+        )
+        print(
+            f"Select py   {select_py:6f}s {select_rs:6f}s {select_py/select_rs:6.3f}x"
+        )
 
 
 if __name__ == "__main__":

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from .html_py_ever import *`
	`1`	`+from .html_py_ever import *`