1
import html
import io
import os
from pathlib import Path

from playwright.sync_api import sync_playwright
from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
6
+
7
# Root URL of the documentation site whose pages are rendered to PDF.
BASE_URL = "http://docs.nomana-it.fr"
# Directory that receives the per-page PDFs, the TOC and the final document.
OUTPUT_FOLDER = "pdf-output"
# Directory that holds the cover page.
COVER_FOLDER = "pdf-cover"
# File name of the final merged document (joined with the output folder).
PDF_NAME = "final_documentation.pdf"
# Cover PDF that is placed first in the final document.
COVER_FILE = os.path.join(COVER_FOLDER, "liberty_cover.pdf")
12
+
13
+ # Hardcoded Navigation Structure
14
# Navigation tree of the documentation. Each entry has a "title" and either a
# "path" (Markdown source of a page) or "children" (a nested section).
NAV = [
    {"title": "Getting Started", "path": "liberty/getting-started.md"},
    {"title": "Release Notes", "path": "liberty/release-notes.md"},
    {
        "title": "Installation",
        "children": [
            {
                "title": "Architecture",
                "path": "liberty/technical/architecture.md",
            },
            {
                "title": "Docker Installation Guide",
                "path": "liberty/technical/installation.md",
            },
            {
                "title": "Installation Tools Deployment Guide",
                "path": "liberty/technical/tools-deployment.md",
            },
            {
                "title": "Liberty Deployment Guide",
                "path": "liberty/technical/liberty-deployment.md",
            },
            {
                "title": "Create Linux Services",
                "path": "liberty/technical/linux-services.md",
            },
            {
                "title": "Enable SSL with Traefik",
                "path": "liberty/technical/post-ssl.md",
            },
        ],
    },
    {
        "title": "Nomasx-1",
        "children": [
            {
                "title": "Administrator's Guide",
                "children": [
                    {
                        "title": "Global Settings",
                        "path": "liberty/nomasx1/admin/global-settings.md",
                    },
                ],
            },
        ],
    },
]
40
+
41
+ # Flatten the nav to extract URLs and titles
42
def _nav_path_to_url_path(path):
    """Convert a Markdown source path from the nav into a site-relative path.

    Only a leading ``docs/`` directory and a trailing ``.md`` extension are
    removed, so directory or file names that merely contain those substrings
    (``mydocs/``, ``notes.mdx``) are left intact.
    """
    url_path = path.strip("/")
    if url_path.startswith("docs/"):
        url_path = url_path[len("docs/"):]
    if url_path.endswith(".md"):
        url_path = url_path[:-len(".md")]
    return url_path.strip("/")


def flatten_nav(nav, level=1, parent_number=""):
    """Flatten a nested navigation tree into an ordered list of TOC entries.

    Every item receives a hierarchical number ("1", "3.2", "4.1.1", ...) that
    is prepended to its title as ``"<number>. <title>"``.

    Args:
        nav: List of dicts with a ``"title"`` key and a ``"path"`` key, a
            ``"children"`` key (a nested list of the same shape), or both.
        level: Depth of ``nav`` in the tree. It does not influence the
            result; it is only incremented and forwarded to recursive calls
            and is kept for backward compatibility.
        parent_number: Dotted number of the enclosing section including its
            trailing dot (e.g. ``"3."``), or ``""`` for the top level.

    Returns:
        A list of ``(numbered_title, page_path)`` tuples in document order.
        ``page_path`` is ``None`` for a section that has children but no page
        of its own.
    """
    pages = []
    for index, item in enumerate(nav, start=1):
        number = f"{parent_number}{index}" if parent_number else str(index)
        full_title = f"{number}. {item['title']}"

        has_page = "path" in item
        if has_page:
            pages.append((full_title, _nav_path_to_url_path(item["path"])))

        if "children" in item:
            # A section heading only needs its own page-less entry when it
            # was not already listed as a page; otherwise it would show up
            # twice in the table of contents.
            if not has_page:
                pages.append((full_title, None))
            pages.extend(
                flatten_nav(item["children"], level + 1, f"{number}.")
            )
    return pages
57
+
58
+ # Generate Table of Contents
59
def _toc_level(title):
    """Return the 1-based nesting level encoded in a numbered TOC title.

    Only the leading number ("3.1." in "3.1. Architecture") is inspected, so
    dots inside the title text itself (e.g. "Node.js") do not change the level.
    """
    number = title.split(" ", 1)[0]
    return max(1, number.count("."))


def generate_toc(pages_with_numbers):
    """Write the table of contents as an HTML file and return its path.

    Args:
        pages_with_numbers: Iterable of ``(title, page_number)`` pairs, where
            ``title`` starts with a dotted number followed by a space (for
            example ``"3.1. Architecture"``).

    Returns:
        The path of the generated ``toc.html`` inside ``OUTPUT_FOLDER``.
    """
    head = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Table of Contents</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 2rem;
        }
        h1 {
            font-size: 2rem;
            color: #333;
        }
        ul {
            list-style: none;
            padding: 0;
        }
        li {
            margin: 0.5rem 0;
            padding-left: 1rem; /* Add indentation */
        }
        li.level-1 {
            font-weight: bold;
            padding-left: 0; /* No indentation for top-level items */
        }
        li.level-2 {
            padding-left: 1.5rem; /* Indentation for level 2 items */
        }
        li.level-3 {
            padding-left: 3rem; /* Indentation for level 3 items */
        }
        span.page-number {
            float: right;
            color: #555;
        }
    </style>
</head>
<body>
    <h1>Table of Contents</h1>
    <ul>
"""
    tail = """
    </ul>
</body>
</html>
"""
    # Titles are escaped because they are inserted into HTML text content;
    # an unescaped "&" or "<" would corrupt the rendered list.
    items = [
        f'<li class="level-{_toc_level(title)}">'
        f'{html.escape(title, quote=False)} '
        f'<span class="page-number">Page {page_number}</span></li>\n'
        for title, page_number in pages_with_numbers
    ]

    # Make the function usable on its own, before anything else has created
    # the output folder.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    toc_file = os.path.join(OUTPUT_FOLDER, "toc.html")
    # The document declares charset UTF-8, so it must be written as UTF-8
    # regardless of the platform's default encoding.
    with open(toc_file, "w", encoding="utf-8") as file:
        file.write(head + "".join(items) + tail)
    print(f"TOC generated as {toc_file}")
    return toc_file
118
+
119
def handle_cookie_consent(page):
    """Click the cookie-consent "Accept" button if one is currently visible.

    This is a best-effort step: any error raised while looking for or
    clicking the button is reported on stdout and otherwise ignored, so a
    missing or unusual consent banner never aborts the PDF generation.

    Args:
        page: The Playwright page to inspect.
    """
    consent_button_selector = "button:has-text('Accept')"
    try:
        # Several buttons can match the text (e.g. "Accept all" and "Accept
        # selected"). Narrowing to the first match keeps the visibility check
        # from failing on an ambiguous locator, which would be swallowed
        # below and leave the banner in the rendered PDF.
        consent_button = page.locator(consent_button_selector).first
        if consent_button.is_visible():
            consent_button.click()
            print("Cookie consent accepted.")
    except Exception as e:
        # Deliberately broad: consent handling must never stop the export.
        print("No cookie consent dialog found or an error occurred:", e)
128
+
129
def add_page_numbers(input_file, output_file):
    """Stamp "Page X of Y" near the top-right corner of every page of a PDF.

    The numbering overlay is drawn on A4-sized pages and merged onto the
    corresponding input page. The input is read completely into memory
    before anything is written, so ``input_file`` and ``output_file`` may be
    the same path.

    Args:
        input_file: Path of the PDF to number.
        output_file: Path the numbered PDF is written to.
    """
    # Buffer the whole input first: the caller numbers a file in place, and
    # opening the output for writing must not truncate data still to be read.
    with open(input_file, "rb") as source:
        reader = PdfReader(io.BytesIO(source.read()))
    total_pages = len(reader.pages)

    # Build the overlay in memory instead of a fixed-name temporary file in
    # the working directory, which could collide between runs and was left
    # behind whenever an error occurred before its removal.
    overlay_buffer = io.BytesIO()
    overlay_canvas = canvas.Canvas(overlay_buffer, pagesize=A4)
    width, height = A4
    font_name = "Helvetica"
    font_size = 8
    for page_index in range(total_pages):
        # The font is set for every page before drawing its label.
        overlay_canvas.setFont(font_name, font_size)
        overlay_canvas.drawRightString(
            width - 5,
            height - 20,
            f"Page {page_index + 1} of {total_pages}",
        )
        overlay_canvas.showPage()
    overlay_canvas.save()
    overlay_buffer.seek(0)

    overlay_reader = PdfReader(overlay_buffer)
    writer = PdfWriter()
    for page, overlay in zip(reader.pages, overlay_reader.pages):
        page.merge_page(overlay)
        writer.add_page(page)

    with open(output_file, "wb") as final_pdf:
        writer.write(final_pdf)
    print(f"PDF with page numbers saved to {output_file}")
157
+
158
+ # Generate PDF with Playwright
159
def generate_pdf_with_cover_and_toc(base_url, pages_with_titles, output_folder):
    """Render every documentation page to PDF and assemble the final document.

    Steps:
      1. Render each page that has a path to its own PDF with Chromium and
         record the content page number on which it starts.
      2. Generate the table of contents from those numbers and render it.
      3. Merge the page PDFs, stamp page numbers on them, then merge cover,
         table of contents and numbered content into ``PDF_NAME``.

    The page numbers shown in the table of contents count content pages only
    (starting at 1), matching the numbers stamped on the content pages.

    Args:
        base_url: Root URL of the documentation site, without trailing slash.
        pages_with_titles: Iterable of ``(title, page_path)`` pairs as
            produced by ``flatten_nav``; ``page_path`` is ``None`` for section
            headings that have no page of their own.
        output_folder: Directory receiving all generated files; created if it
            does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    page_numbers = []
    current_page = 1
    pdf_paths = []
    toc_file = os.path.join(output_folder, "toc.pdf")

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()

            # Generate individual pages and track page numbers.
            for title, page_path in pages_with_titles:
                if page_path is None:
                    # A section heading points at the page that follows it.
                    page_numbers.append((title, current_page))
                    continue

                url = f"{base_url}/{page_path}"
                output_path = os.path.join(
                    output_folder, f"{page_path.replace('/', '_')}.pdf"
                )
                # The title becomes part of an HTML template, so it must be
                # escaped to keep characters such as "&" or "<" literal.
                header_html = f'''
                    <div style="font-size: 10px; padding-left: 20px; width: 100%; text-align: left;">
                        {html.escape(title, quote=False)}
                    </div>
                '''

                page = context.new_page()
                try:
                    page.goto(url, wait_until="networkidle")
                    handle_cookie_consent(page)
                    page.pdf(
                        path=output_path,
                        format="A4",
                        print_background=True,
                        margin={"top": "1in", "bottom": "1in"},
                        display_header_footer=True,
                        header_template=header_html,
                    )
                finally:
                    # Close each tab once rendered so open pages do not
                    # accumulate over the whole navigation.
                    page.close()

                pdf_paths.append(output_path)
                page_numbers.append((title, current_page))
                current_page += len(PdfReader(output_path).pages)

            # Generate the TOC once and load it through a well-formed file
            # URI (percent-encoded, valid on every platform).
            toc_html_file = generate_toc(page_numbers)
            toc_page = context.new_page()
            try:
                toc_page.goto(Path(toc_html_file).resolve().as_uri())
                toc_page.pdf(
                    path=toc_file,
                    format="A4",
                    print_background=True,
                )
            finally:
                toc_page.close()
        finally:
            # Always shut the browser down, also when rendering fails.
            browser.close()

    # Intermediate files get their own names: merging into a file that is
    # also one of the merge inputs can truncate the input before it has been
    # read completely. They are derived from PDF_NAME so they stay next to
    # the final document without reusing its path.
    stem = os.path.splitext(PDF_NAME)[0]
    content_file = os.path.join(output_folder, f"{stem}_content.pdf")
    numbered_file = os.path.join(output_folder, f"{stem}_content_numbered.pdf")
    merge_pdfs(pdf_paths, content_file)
    add_page_numbers(content_file, numbered_file)

    final_output_path = os.path.join(output_folder, PDF_NAME)
    merge_pdfs([COVER_FILE, toc_file, numbered_file], final_output_path)
224
+
225
+ # Merge PDFs into one
226
def merge_pdfs(input_files, output_file):
    """Concatenate several PDF files, in order, into a single PDF.

    Args:
        input_files: Iterable of paths of the PDFs to append, in output order.
        output_file: Path the merged PDF is written to.
    """
    merger = PdfMerger()
    try:
        for pdf in input_files:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Release the merger's resources even when an input is missing or
        # unreadable, instead of only on the success path.
        merger.close()
    print(f"Merged PDF saved to {output_file}")
233
+
234
if __name__ == "__main__":
    # Script entry point: flatten the hard-coded navigation and build the
    # complete PDF in the configured output folder.
    generate_pdf_with_cover_and_toc(BASE_URL, flatten_nav(NAV), OUTPUT_FOLDER)
0 commit comments