3
3
import re
4
4
from importlib import import_module
5
5
from importlib .util import module_from_spec , spec_from_file_location
6
- from typing import List
7
6
8
7
from bs4 import BeautifulSoup , PageElement
9
8
from weasyprint import HTML , urls
@@ -35,6 +34,18 @@ def __init__(self, options: Options):
35
34
self ._scraped_scripts = []
36
35
self ._mixed_script = ''
37
36
37
def to_pattern(s: str) -> re.Pattern:
    """Compile an exclude-page entry into a regex anchored at the start.

    Args:
        s: A regular-expression source string.  If it does not already
            begin with ``^`` one is prepended, so the compiled pattern is
            always written as a prefix match.

    Returns:
        The compiled pattern.

    Raises:
        re.error: if ``s`` is not a valid regular expression.
    """
    if s.startswith('^'):
        return re.compile(s)
    # No stray whitespace inside the literal: every character placed here
    # becomes part of the regex and would have to appear in the URL.
    return re.compile(f'^{s}')
41
+
42
+ self ._exclude_page_patterns = list (map (
43
+ to_pattern ,
44
+ self ._options .exclude_pages
45
+ ))
46
+ self ._options .logger .debug (
47
+ f'Exclude page patterns: { self ._exclude_page_patterns } ' )
48
+
38
49
def on_nav (self , nav ):
39
50
""" on_nav """
40
51
self ._nav = nav
@@ -44,31 +55,11 @@ def on_nav(self, nav):
44
55
def on_post_page (self , output_content : str , page , pdf_path : str ) -> str :
45
56
""" on_post_page """
46
57
47
def get_excluded_pages(e_paths: List[str]) -> List[str]:
    """Expand the configured exclude paths into a flat list of page URLs.

    Every entry of ``e_paths`` is resolved relative to the ``docs``
    directory (itself relative to the current working directory).  An
    entry that names a directory is replaced by all files found beneath
    it, recursively, each with its extension stripped and a trailing
    ``/`` appended.  Any other entry is kept exactly as given.

    Args:
        e_paths: Paths relative to ``docs``.

    Returns:
        The expanded list of page paths.

    Raises:
        OSError: if ``docs`` cannot be entered or a directory cannot be
            listed.
    """

    def get_files_in_dir(path: str) -> List[str]:
        """Recursively collect files under ``path`` as ``<no-ext>/``."""
        files = []
        for name in os.listdir(path):
            sub_path = os.path.join(path, name)
            if os.path.isdir(sub_path):
                files.extend(get_files_in_dir(sub_path))
            else:
                files.append(os.path.splitext(sub_path)[0] + '/')
        return files

    excluded_pages = []
    cwd = os.getcwd()
    os.chdir("docs")
    try:
        for path in e_paths:
            if os.path.isdir(path):
                excluded_pages.extend(get_files_in_dir(path))
            else:
                excluded_pages.append(path)
    finally:
        # The working directory is process-wide state; restore it even
        # when expansion fails so later code is not left running
        # inside ``docs``.
        os.chdir(cwd)
    return excluded_pages
69
-
70
58
def is_excluded(url: str) -> bool:
    """Return True when ``url`` matches any configured exclude pattern.

    The patterns are read from ``self._exclude_page_patterns`` and each
    one is applied with ``match``, i.e. it has to match starting at the
    first character of ``url``.

    Args:
        url: The page URL to test.

    Returns:
        True if at least one pattern matches, otherwise False (including
        when no patterns are configured).
    """
    return any(p.match(url) for p in self._exclude_page_patterns)
72
63
73
64
if is_excluded (page .url ):
74
65
self .logger .info (f'Page skipped: [{ page .title } ]({ page .url } )' )
@@ -234,7 +225,7 @@ def shift_heading(elem, page):
234
225
elem .insert (0 , h1 )
235
226
return elem
236
227
237
- def cleanup_class (classes : [] ):
228
+ def cleanup_class (classes ):
238
229
if classes and len (classes ):
239
230
excludes = ['md-content__inner' ]
240
231
return [c for c in classes if not (c in excludes )]
0 commit comments