|
33 | 33 |
|
def get_tutorial_reducer(html_page: HtmlPage) -> Type[ReducedTutorial]:
    """Get the reducer appropriate for the tutorial's structure.

    Parameters
    ----------
    html_page : HtmlPage
        The tutorial's downloaded HTML page.

    Returns
    -------
    Type[ReducedTutorial]
        The reducer class to use for this page. Currently this is always
        ``ReducedJupyterBookTutorial``.
    """
    # TODO(review): restore content-based detection so Sphinx-built
    # tutorials get ReducedSphinxTutorial again. The previous heuristic was:
    #
    #     doc = html_page.parse()
    #     if "tutorial--" in doc.cssselect("*")[0].text_content():
    #         return ReducedJupyterBookTutorial
    #     else:
    #         return ReducedSphinxTutorial
    #
    # For now every tutorial is assumed to be published with JupyterBook.
    logger.debug("Using jupyterbook tutorial reducer")
    return ReducedJupyterBookTutorial
|
@@ -182,24 +183,28 @@ def process_html(self, html_page: HtmlPage) -> None:
|
182 | 183 | try:
|
183 | 184 | self._h1 = self._get_section_title(doc.cssselect("h1")[0])
|
184 | 185 | except IndexError:
|
| 186 | + logger.warning("Did not find h1") |
185 | 187 | pass
|
186 | 188 |
|
187 | 189 | try:
|
188 | 190 | authors_paragraph = doc.cssselect(".card section p, .card .section p")[0]
|
189 | 191 | self._authors = self._parse_comma_list(authors_paragraph)
|
190 | 192 | except IndexError:
|
| 193 | + logger.warning("Did not find authors") |
191 | 194 | pass
|
192 | 195 |
|
193 | 196 | try:
|
194 | 197 | keywords_paragraph = doc.cssselect("#keywords p")[0]
|
195 | 198 | self._keywords = self._parse_comma_list(keywords_paragraph)
|
196 | 199 | except IndexError:
|
| 200 | + logger.warning("Did not find keywords") |
197 | 201 | pass
|
198 | 202 |
|
199 | 203 | try:
|
200 | 204 | summary_paragraph = doc.cssselect("#summary p")[0]
|
201 | 205 | self._summary = summary_paragraph.text_content().replace("\n", " ")
|
202 | 206 | except IndexError:
|
| 207 | + logger.warning("Did not find summary") |
203 | 208 | pass
|
204 | 209 |
|
205 | 210 | image_elements = doc.cssselect(".card section img, .card .section img")
|
@@ -301,6 +306,89 @@ def process_html(self, html_page: HtmlPage) -> None:
|
301 | 306 | logger.debug("Found %s section in total", len(self._sections))
|
302 | 307 |
|
303 | 308 |
|
class ReducedJupyterBookTutorial(ReducedTutorial):
    """A reduced tutorial notebook that was published with
    JupyterBook.
    """

    def process_html(self, html_page: HtmlPage) -> None:
        """Process the HTML page, extracting the title, authors, keywords,
        summary, images, and content sections.

        Parameters
        ----------
        html_page : HtmlPage
            The tutorial's downloaded HTML page.
        """
        doc = html_page.parse()
        self._extract_metadata(doc)
        self._extract_images(doc)
        self._extract_sections(doc)

    def _extract_metadata(self, doc) -> None:
        """Populate ``_h1``, ``_authors``, ``_keywords``, and ``_summary``
        from their expected locations in a JupyterBook page, warning about
        any that are missing.
        """
        h1 = self._first_match(doc, "h1")
        if h1 is not None:
            self._h1 = self._get_section_title(h1)
            logger.debug("Header:\n%s", self._h1)
        else:
            logger.warning("Did not find h1")

        authors_paragraph = self._first_match(doc, "#authors p")
        if authors_paragraph is not None:
            self._authors = self._parse_comma_list(authors_paragraph)
            logger.debug("Authors:\n%s", self._authors)
        else:
            logger.warning("Did not find authors")

        keywords_paragraph = self._first_match(doc, "#keywords p")
        if keywords_paragraph is not None:
            self._keywords = self._parse_comma_list(keywords_paragraph)
            logger.debug("Keywords:\n%s", self._keywords)
        else:
            logger.warning("Did not find keywords")

        summary_paragraph = self._first_match(doc, "#summary p")
        if summary_paragraph is not None:
            self._summary = summary_paragraph.text_content().replace("\n", " ")
            logger.debug("Summary:\n%s", self._summary)
        else:
            logger.warning("Did not find summary")

    def _extract_images(self, doc) -> None:
        """Collect absolute URLs of all linked (non-embedded) images."""
        image_elements = doc.cssselect("img")
        logger.debug("Found %d image elements", len(image_elements))
        for image_element in image_elements:
            # .get() guards against <img> elements with no src attribute,
            # which would otherwise raise KeyError.
            img_src = image_element.attrib.get("src")
            if not img_src or img_src.startswith("data:"):
                # Skip missing sources and data-URI embedded images.
                continue
            self._images.append(urljoin(self.url, img_src))

    def _extract_sections(self, doc) -> None:
        """Append the page's content sections, including sections rooted at
        stray sibling h1 elements.
        """
        root_section = self._first_match(doc, "section")
        if root_section is None:
            logger.warning("Did not find a root <section> element")
            return

        for s in iter_sphinx_sections(
            base_url=self._url,
            root_section=root_section,
            headers=[],
            header_callback=lambda x: x.rstrip("¶"),
            content_callback=clean_content,
        ):
            if not self._is_ignored_section(s):
                self._sections.append(s)

        if not self._sections:
            # Nothing was extracted, so there is no h1 heading to anchor
            # stray sibling sections to; bail out instead of raising
            # IndexError below.
            return

        # Also look for additional h1 sections on the page.
        # Technically, the page should only have one h1, and all content
        # should be subsections of that. In real life, though, it's easy
        # to accidentally use additional h1 elements for subsections.
        h1_heading = self._sections[-1].headings[-1]
        for sibling in root_section.itersiblings(tag=("div", "section")):
            if sibling.tag == "div" and "section" not in sibling.classes:
                continue
            for s in iter_sphinx_sections(
                root_section=sibling,
                base_url=self._url,
                headers=[h1_heading],
                header_callback=lambda x: x.rstrip("¶"),
                content_callback=clean_content,
            ):
                if not self._is_ignored_section(s):
                    self._sections.append(s)

    @staticmethod
    def _first_match(doc, selector: str):
        """Return the first element matching the CSS *selector*, or None."""
        matches = doc.cssselect(selector)
        return matches[0] if matches else None

    @staticmethod
    def _get_section_title(element: "lxml.html.HtmlElement") -> str:
        """Return the element's text with any trailing pilcrow (¶) removed."""
        return element.text_content().rstrip("¶")
| 391 | + |
304 | 392 | def clean_content(x: str) -> str:
|
305 | 393 | x = x.strip()
|
306 | 394 | x = x.replace(r"\n", " ")
|
|
0 commit comments