# To run this script you need Python installed, plus the chardet and
# beautifulsoup4 dependencies: pip install chardet beautifulsoup4


import os
import chardet
from bs4 import BeautifulSoup

# Basic configuration

INPUT_DIR = 'docs/index.html'  # Entry point of the TypeDoc-generated docs
OUTPUT_NAV_ADOC = 'apps/docs/src/en/modules/ROOT/nav.adoc'  # Nav file that receives the pages navigation
OUTPUT_PAGES_DIR = 'apps/docs/src/en/modules/ROOT/pages'  # Destination directory for the generated pages
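
# Assumed layout (adjust the paths above if your repository differs): TypeDoc
# writes its HTML into docs/ with one page per module, and the Antora docs
# component lives under apps/docs/.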

def process_code_tag(tag):
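    """Convert a TypeDoc <pre> code block into an AsciiDoc [source] listing."""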
    # Apply the desired replacements to the HTML content. Spaces are
    # temporarily encoded as '__' (the caller restores them once the
    # AsciiDoc block is assembled)
    cleaned_html = (
        tag.decode_contents()
        .replace(' ', '__')
        .replace('<br>', '\n')
        .replace('<br/>', '\n')
        .replace('<button>Copy</button>', '')
    )

    # Convert the cleaned HTML to a Beautiful Soup object
    cleaned_soup = BeautifulSoup(cleaned_html, "html.parser")

    # Extract the text content from the cleaned soup
    code_text = cleaned_soup.get_text()

    return f"[source, javascript]\n----\n{code_text}\n----\n\n"

def process_div_or_p_tag(tag):
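    """Convert a TypeDoc <div> or <p> into AsciiDoc text with inline markup."""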
    # Apply the desired replacements to the HTML content
    cleaned_html = (
        tag.decode_contents()
        .replace('<code>', '`')
        .replace('</code>', '`')
        .replace('<strong>', '*')
        .replace('</strong>', '*')
    )

    # Rewrite each anchor as an AsciiDoc cross-reference, one link at a time
    for link in tag.find_all('a'):
        xref = link.get("href")
        link_text = link.get_text().strip()
        # Insert the reference before the first remaining <a ...> tag only,
        # then cut that <a ...>...</a> span out of the markup
        cleaned_html = cleaned_html.replace('<a ', f"{xref}[{link_text}] <a ", 1)
        start = cleaned_html.index('<a ')
        end = cleaned_html.index('</a>') + 3
        cleaned_html = cleaned_html[:start] + cleaned_html[end + 1:]

    # Convert the cleaned HTML to a Beautiful Soup object
    cleaned_soup = BeautifulSoup(cleaned_html, "html.parser")

    # Extract the text content from the cleaned soup
    content = cleaned_soup.get_text().strip()

    return f"{content}\n\n"

def process_content_section(content_section, adoc_content):
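    """Append the AsciiDoc rendering of each tag in content_section to adoc_content."""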
    for tag in content_section.find_all():
        if tag.name in {"h1", "h2", "h3", "h4"}:
            heading_level = int(tag.name[1])  # Convert the heading level to an integer
            heading_text = tag.get_text().strip()
            adoc_content += f"{'=' * heading_level} {heading_text}\n\n"

        elif tag.name == "pre":
            # Restore the spaces that process_code_tag encoded as '__'
            adoc_content += process_code_tag(tag).replace('__', ' ')

        elif tag.name == "ul":
            for li in tag.find_all("li"):
                li_text = li.get_text().strip()
                adoc_content += f"* {li_text}\n"
            adoc_content += "\n"

        elif tag.name in {"div", "p"}:
            adoc_content += process_div_or_p_tag(tag)

    return adoc_content

def update_nav_adoc(antora_links):
    """Replace the '.Packages' section of nav.adoc with the generated links,
    or append a new section if none exists yet.
    """
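    # The nav file is assumed to contain a block shaped like the following,
    # terminated by a blank line (the entry shown is illustrative only):
    #
    #   .Packages
    #   * xref:some-module.adoc[some-module]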
    adoc_output_filename = OUTPUT_NAV_ADOC
    with open(adoc_output_filename, "r", encoding="utf-8") as nav_adoc_file:
        nav_adoc_content = nav_adoc_file.read()

    # Find the position of the '.Packages' section
    packages_start = nav_adoc_content.find(".Packages")
    if packages_start != -1:
        # Locate the end of the section before adding the offset, so a
        # missing terminator is not mistaken for a valid position
        packages_end = nav_adoc_content.find("\n\n", packages_start)
        if packages_end != -1:
            packages_end += 2
            new_nav_adoc_content = (
                nav_adoc_content[:packages_start] +
                '.Packages\n' +
                "\n".join(antora_links) +
                "\n\n" +
                nav_adoc_content[packages_end:]
            )
            with open(adoc_output_filename, "w", encoding="utf-8") as new_nav_adoc_file:
                new_nav_adoc_file.write(new_nav_adoc_content)
            print(f"Updated the .Packages section of {adoc_output_filename}.")
        else:
            print(".Packages end not found in nav.adoc.")
    else:
        print(".Packages section not found in nav.adoc.")
        # Append a fresh '.Packages' section instead
        with open(adoc_output_filename, "a", encoding="utf-8") as nav_adoc_file:
            nav_adoc_file.write('.Packages\n' + "\n".join(antora_links) + "\n\n")
        print(f"Appended a .Packages section to {adoc_output_filename}.")

def update_pages_directory():
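    """Move the generated .adoc files from the working directory into OUTPUT_PAGES_DIR.

    Note: os.rename assumes source and destination live on the same
    filesystem; shutil.move would be the portable alternative.
    """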
    output_directory = OUTPUT_PAGES_DIR
    for generated_filename in os.listdir():
        if generated_filename.endswith(".adoc"):
            destination_path = os.path.join(output_directory, generated_filename)
            if os.path.exists(destination_path):
                os.remove(destination_path)  # Remove the existing file first
            os.rename(generated_filename, destination_path)
            print(f"Moved {generated_filename} to {output_directory}.")

def process_docs(ul_element):
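    """Create one .adoc page per module linked in the TypeDoc navigation
    and return the matching Antora nav entries.
    """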
    antora_links = []

    # Iterate through the <li> elements inside the <ul>
    for li in ul_element.find_all("li"):
        link = li.find("a")
        if link:
            href = link.get("href")
            link_text = link.get_text().strip()

            # Remove a leading '_' and replace remaining underscores with hyphens
            module_name = os.path.splitext(os.path.basename(href))[0]
            if module_name.startswith("_"):
                module_name = module_name[1:]
            module_name = module_name.replace("_", "-")

            # Construct the full path to the corresponding .html file,
            # relative to the directory that holds the TypeDoc index
            html_path = os.path.join(os.path.dirname(INPUT_DIR), href)

            # Check that the HTML file exists before reading and parsing it
            if os.path.exists(html_path):
                # Read the .html content
                with open(html_path, "rb") as html_file:
                    html_content = html_file.read()

                # Parse the .html content using Beautiful Soup
                html_soup = BeautifulSoup(html_content, "html.parser")

                # Find the content section
                content_section = html_soup.find("section", class_="tsd-panel tsd-typography")

                if content_section:
                    # Convert the section to AsciiDoc under a page title taken
                    # from the link text. Sanitize the filename once so the
                    # file on disk and the xref below always match.
                    adoc_filename = f"{module_name}.adoc".replace("/", "-")
                    adoc_content = f"= {link_text}\n\n"

                    adoc_content = process_content_section(content_section, adoc_content)

                    # Create the .adoc file with UTF-8 encoding
                    with open(adoc_filename, "w", encoding="utf-8") as adoc_file:
                        adoc_file.write(adoc_content)

                    antora_link = f"* xref:{adoc_filename}[{link_text}]"
                    antora_links.append(antora_link)
            else:
                print(f"HTML file '{html_path}' does not exist.")

    return antora_links


if __name__ == '__main__':
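    # All configured paths are relative, so run the script from the repository root
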
    # Read the HTML file in binary mode
    with open(INPUT_DIR, "rb") as file:
        html_content = file.read()

    # Detect the encoding of the HTML content, falling back to UTF-8 when
    # chardet cannot make a guess
    result = chardet.detect(html_content)
    encoding = result["encoding"] or "utf-8"

    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(html_content.decode(encoding), "html.parser")

    # Find the <ul> element that carries the TypeDoc navigation
    ul_element = soup.find("ul", class_="tsd-small-nested-navigation")

    if ul_element:
        # Generate the pages and collect their Antora nav links
        antora_links = process_docs(ul_element)

        update_nav_adoc(antora_links)

        update_pages_directory()

        print("All tasks completed.")
    else:
        print("UL element not found in the HTML.")