Skip to content

Commit 3a5faaa

Browse files
committed
add ReducedJupyterBookTutorial class
1 parent 3af7bcc commit 3a5faaa

File tree

1 file changed

+83
-0
lines changed

1 file changed

+83
-0
lines changed

astropylibrarian/reducers/tutorial.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,89 @@ def process_html(self, html_page: HtmlPage) -> None:
305305
logger.debug("Found %s section in total", len(self._sections))
306306

307307

308+
class ReducedJupyterBookTutorial(ReducedTutorial):
    """A reduced tutorial notebook that was published with
    JupyterBook.
    """

    def process_html(self, html_page: HtmlPage) -> None:
        """Process the HTML page.

        Extracts the h1 title, authors, keywords, summary, image URLs,
        and content sections from a JupyterBook-rendered tutorial page
        and stores them on this instance. Missing metadata elements are
        logged as warnings rather than raising.

        Parameters
        ----------
        html_page : HtmlPage
            The downloaded HTML page to reduce.
        """
        doc = html_page.parse()

        try:
            self._h1 = self._get_section_title(doc.cssselect("h1")[0])
            # Lazy %-style args match this module's logging convention
            # (see "Found %s section in total" elsewhere in the file).
            logger.debug("Header:\n%s", self._h1)
        except IndexError:
            logger.warning("Did not find h1")

        try:
            authors_paragraph = doc.cssselect("#authors p")[0]
            self._authors = self._parse_comma_list(authors_paragraph)
            logger.debug("Authors:\n%s", self._authors)
        except IndexError:
            logger.warning("Did not find authors")

        try:
            keywords_paragraph = doc.cssselect("#keywords p")[0]
            self._keywords = self._parse_comma_list(keywords_paragraph)
            logger.debug("Keywords:\n%s", self._keywords)
        except IndexError:
            logger.warning("Did not find keywords")

        try:
            summary_paragraph = doc.cssselect("#summary p")[0]
            self._summary = summary_paragraph.text_content().replace("\n", " ")
            logger.debug("Summary:\n%s", self._summary)
        except IndexError:
            logger.warning("Did not find summary")

        image_elements = doc.cssselect("img")
        logger.debug("Found %d image elements", len(image_elements))
        for image_element in image_elements:
            # Fix: use .get() so an <img> without a src attribute is
            # skipped instead of raising KeyError.
            img_src = image_element.attrib.get("src")
            if not img_src or img_src.startswith("data:"):
                # Skip embedded (data-URI) images.
                continue
            self._images.append(urljoin(self.url, img_src))

        root_section = doc.cssselect("section")[0]
        for s in iter_sphinx_sections(
            base_url=self._url,
            root_section=root_section,
            headers=[],
            header_callback=lambda x: x.rstrip("¶"),
            content_callback=clean_content,
        ):
            if not self._is_ignored_section(s):
                self._sections.append(s)

        # Also look for additional h1 sections on the page.
        # Technically, the page should only have one h1, and all content
        # should be subsections of that. In real life, though, it's easy
        # to accidentally use additional h1 elements for subsections.
        # Fix: guard against an empty _sections list (e.g. every section
        # was ignored), which previously raised IndexError here.
        h1_headers = (
            [self._sections[-1].headings[-1]] if self._sections else []
        )
        for sibling in root_section.itersiblings(tag=("div", "section")):
            if sibling.tag == "div" and "section" not in sibling.classes:
                continue
            for s in iter_sphinx_sections(
                root_section=sibling,
                base_url=self._url,
                headers=h1_headers,
                header_callback=lambda x: x.rstrip("¶"),
                content_callback=clean_content,
            ):
                if not self._is_ignored_section(s):
                    self._sections.append(s)

    @staticmethod
    def _get_section_title(element: lxml.html.HtmlElement) -> str:
        """Return the element's text content with the trailing Sphinx
        permalink marker ("¶") stripped.
        """
        return element.text_content().rstrip("¶")
389+
390+
308391
def clean_content(x: str) -> str:
309392
x = x.strip()
310393
x = x.replace(r"\n", " ")

0 commit comments

Comments
 (0)