@@ -305,6 +305,89 @@ def process_html(self, html_page: HtmlPage) -> None:
305
305
logger .debug ("Found %s section in total" , len (self ._sections ))
306
306
307
307
308
class ReducedJupyterBookTutorial(ReducedTutorial):
    """A reduced tutorial notebook that was published with
    JupyterBook.
    """

    def process_html(self, html_page: HtmlPage) -> None:
        """Process the HTML page.

        Extracts the h1 title, authors, keywords, summary, image URLs,
        and content sections from the parsed JupyterBook page and stores
        them on the corresponding ``self._*`` attributes.

        Parameters
        ----------
        html_page : HtmlPage
            The downloaded tutorial page to reduce.
        """
        doc = html_page.parse()

        # Each metadata element is optional on the page; a missing one is
        # logged and skipped rather than failing the whole reduction.
        try:
            self._h1 = self._get_section_title(doc.cssselect("h1")[0])
            logger.debug("Header:\n%s", self._h1)
        except IndexError:
            logger.warning("Did not find h1")

        try:
            authors_paragraph = doc.cssselect("#authors p")[0]
            self._authors = self._parse_comma_list(authors_paragraph)
            logger.debug("Authors:\n%s", self._authors)
        except IndexError:
            logger.warning("Did not find authors")

        try:
            keywords_paragraph = doc.cssselect("#keywords p")[0]
            self._keywords = self._parse_comma_list(keywords_paragraph)
            logger.debug("Keywords:\n%s", self._keywords)
        except IndexError:
            logger.warning("Did not find keywords")

        try:
            summary_paragraph = doc.cssselect("#summary p")[0]
            self._summary = summary_paragraph.text_content().replace("\n", " ")
            logger.debug("Summary:\n%s", self._summary)
        except IndexError:
            logger.warning("Did not find summary")

        image_elements = doc.cssselect("img")
        logger.debug("Found %d image elements", len(image_elements))
        for image_element in image_elements:
            img_src = image_element.attrib["src"]
            if img_src.startswith("data:"):
                # Skip images embedded as data URIs; only externally
                # referenced images are collected.
                continue
            self._images.append(urljoin(self.url, img_src))

        root_section = doc.cssselect("section")[0]
        for s in iter_sphinx_sections(
            base_url=self._url,
            root_section=root_section,
            headers=[],
            header_callback=lambda x: x.rstrip("¶"),
            content_callback=clean_content,
        ):
            if not self._is_ignored_section(s):
                self._sections.append(s)

        # Also look for additional h1 sections on the page.
        # Technically, the page should only have one h1, and all content
        # should be subsections of that. In real life, though, it's easy
        # to accidentally use additional h1 elements for subsections.
        h1_heading = self._sections[-1].headings[-1]
        for sibling in root_section.itersiblings(tag=("div", "section")):
            if sibling.tag == "div" and "section" not in sibling.classes:
                continue
            for s in iter_sphinx_sections(
                root_section=sibling,
                base_url=self._url,
                headers=[h1_heading],
                header_callback=lambda x: x.rstrip("¶"),
                content_callback=clean_content,
            ):
                if not self._is_ignored_section(s):
                    self._sections.append(s)

    @staticmethod
    def _get_section_title(element: lxml.html.HtmlElement) -> str:
        """Return the element's text content with any trailing
        pilcrow (¶) permalink marker stripped.
        """
        return element.text_content().rstrip("¶")
389
+
390
+
308
391
def clean_content (x : str ) -> str :
309
392
x = x .strip ()
310
393
x = x .replace (r"\n" , " " )
0 commit comments