Skip to content

Commit e38676d

Browse files
authored
Merge pull request #71 from astropy-learn/jb_processor
Add processor for tutorials produced by JupyterBook
2 parents 14eb1a8 + 2672fe3 commit e38676d

File tree

1 file changed

+93
-5
lines changed

1 file changed

+93
-5
lines changed

astropylibrarian/reducers/tutorial.py

Lines changed: 93 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,12 @@
3333

3434
def get_tutorial_reducer(html_page: HtmlPage) -> Type[ReducedTutorial]:
    """Get the reducer appropriate for the tutorial's structure.

    Currently every page is handled by the JupyterBook reducer; the
    structure-sniffing logic below is kept (disabled) until content
    detection is needed again.
    """
    logger.debug("Using jupyterbook tutorial reducer")
    return ReducedJupyterBookTutorial
    # TODO: re-enable structure detection when multiple formats coexist:
    # doc = html_page.parse()
    # if "tutorial--" in doc.cssselect("*")[0].text_content():
    #     logger.debug("Using jupyterbook tutorial reducer")
    #     return ReducedJupyterBookTutorial
    # else:
    #     logger.debug("Using sphinx tutorial reducer")
    #     return ReducedSphinxTutorial
@@ -182,24 +183,28 @@ def process_html(self, html_page: HtmlPage) -> None:
182183
try:
183184
self._h1 = self._get_section_title(doc.cssselect("h1")[0])
184185
except IndexError:
186+
logger.warning("Did not find h1")
185187
pass
186188

187189
try:
188190
authors_paragraph = doc.cssselect(".card section p, .card .section p")[0]
189191
self._authors = self._parse_comma_list(authors_paragraph)
190192
except IndexError:
193+
logger.warning("Did not find authors")
191194
pass
192195

193196
try:
194197
keywords_paragraph = doc.cssselect("#keywords p")[0]
195198
self._keywords = self._parse_comma_list(keywords_paragraph)
196199
except IndexError:
200+
logger.warning("Did not find keywords")
197201
pass
198202

199203
try:
200204
summary_paragraph = doc.cssselect("#summary p")[0]
201205
self._summary = summary_paragraph.text_content().replace("\n", " ")
202206
except IndexError:
207+
logger.warning("Did not find summary")
203208
pass
204209

205210
image_elements = doc.cssselect(".card section img, .card .section img")
@@ -301,6 +306,89 @@ def process_html(self, html_page: HtmlPage) -> None:
301306
logger.debug("Found %s section in total", len(self._sections))
302307

303308

309+
class ReducedJupyterBookTutorial(ReducedTutorial):
    """A reduced tutorial notebook that was published with
    JupyterBook.
    """

    def process_html(self, html_page: HtmlPage) -> None:
        """Process the HTML page.

        Extracts the h1 title, authors, keywords, summary, image URLs,
        and content sections from a JupyterBook-rendered tutorial page,
        storing them on the instance. Missing metadata elements are
        logged as warnings rather than raised.
        """
        doc = html_page.parse()

        try:
            self._h1 = self._get_section_title(doc.cssselect("h1")[0])
            logger.debug("Header:\n%s", self._h1)
        except IndexError:
            logger.warning("Did not find h1")

        try:
            authors_paragraph = doc.cssselect("#authors p")[0]
            self._authors = self._parse_comma_list(authors_paragraph)
            logger.debug("Authors:\n%s", self._authors)
        except IndexError:
            logger.warning("Did not find authors")

        try:
            keywords_paragraph = doc.cssselect("#keywords p")[0]
            self._keywords = self._parse_comma_list(keywords_paragraph)
            logger.debug("Keywords:\n%s", self._keywords)
        except IndexError:
            logger.warning("Did not find keywords")

        try:
            summary_paragraph = doc.cssselect("#summary p")[0]
            self._summary = summary_paragraph.text_content().replace("\n", " ")
            logger.debug("Summary:\n%s", self._summary)
        except IndexError:
            logger.warning("Did not find summary")

        image_elements = doc.cssselect("img")
        logger.debug("Found %d image elements", len(image_elements))
        for image_element in image_elements:
            # .get() rather than ["src"]: an <img> without a src attribute
            # would otherwise raise KeyError and abort the whole page.
            img_src = image_element.attrib.get("src", "")
            if not img_src or img_src.startswith("data:"):
                # skip missing sources and embedded (data URI) images
                continue
            self._images.append(urljoin(self.url, img_src))

        root_sections = doc.cssselect("section")
        if not root_sections:
            # A page with no <section> yields no content; warn instead of
            # raising IndexError on the empty selector result.
            logger.warning("Did not find any <section> elements")
            return
        root_section = root_sections[0]
        for s in iter_sphinx_sections(
            base_url=self._url,
            root_section=root_section,
            headers=[],
            header_callback=lambda x: x.rstrip("¶"),
            content_callback=clean_content,
        ):
            if not self._is_ignored_section(s):
                self._sections.append(s)

        # Also look for additional h1 section on the page.
        # Technically, the page should only have one h1, and all content
        # should be subsections of that. In real life, though, it's easy
        # to accidentally use additional h1 elements for subsections.
        if not self._sections:
            # Nothing collected above: there is no h1 heading to prepend
            # to sibling sections, so stop here rather than index [-1].
            return
        h1_heading = self._sections[-1].headings[-1]
        for sibling in root_section.itersiblings(tag=("div", "section")):
            if sibling.tag == "div" and "section" not in sibling.classes:
                continue
            for s in iter_sphinx_sections(
                root_section=sibling,
                base_url=self._url,
                headers=[h1_heading],
                header_callback=lambda x: x.rstrip("¶"),
                content_callback=clean_content,
            ):
                if not self._is_ignored_section(s):
                    self._sections.append(s)

    @staticmethod
    def _get_section_title(element: lxml.html.HtmlElement) -> str:
        # Strip the trailing pilcrow (¶) that JupyterBook appends as a
        # heading permalink marker.
        return element.text_content().rstrip("¶")
391+
304392
def clean_content(x: str) -> str:
305393
x = x.strip()
306394
x = x.replace(r"\n", " ")

0 commit comments

Comments
 (0)