From 1ed767a9fce35ce2b188b907319c7645e6b1cf80 Mon Sep 17 00:00:00 2001 From: chrispy Date: Sat, 1 Feb 2025 18:08:32 -0500 Subject: [PATCH 1/3] make conversion non-destructive to soup; improve div/article/section handling Signed-off-by: chrispy --- markdownify/__init__.py | 105 ++++++++++++++++++++++++++++---------- tests/test_basic.py | 2 +- tests/test_conversions.py | 15 +++++- 3 files changed, 91 insertions(+), 31 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 7d14fe7..ac2796f 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,4 +1,4 @@ -from bs4 import BeautifulSoup, NavigableString, Comment, Doctype +from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag from textwrap import fill import re import six @@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el): if html_heading_re.match(el.name) is not None: return True return el.name in ('p', 'blockquote', + 'article', 'div', 'section', 'ol', 'ul', 'li', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th') @@ -89,6 +90,34 @@ def should_remove_whitespace_outside(el): return should_remove_whitespace_inside(el) or (el and el.name == 'pre') +def _is_content_element(el): + """Returns True for content (tags and non-whitespace text), else False.""" + if isinstance(el, Tag): + return True + elif isinstance(el, NavigableString): + return el.strip() != '' + else: + return False + + +def _prev_content_sibling(el): + """Returns the first previous sibling that is a content element, else None.""" + while el is not None: + el = el.previous_sibling + if _is_content_element(el): + return el + return None + + +def _next_content_sibling(el): + """Returns the first next sibling that is a content element, else None.""" + while el is not None: + el = el.next_sibling + if _is_content_element(el): + return el + return None + + class MarkdownConverter(object): class DefaultOptions: autolinks = True @@ -144,29 +173,37 @@ def process_tag(self, node, convert_as_inline): 
if isHeading or isCell: convert_children_as_inline = True - # Remove whitespace-only textnodes just before, after or - # inside block-level elements. + # Collect child elements to process, ignoring whitespace-only text elements + # adjacent to the inner/outer boundaries of block elements. should_remove_inside = should_remove_whitespace_inside(node) - for el in node.children: - # Only extract (remove) whitespace-only text node if any of the - # conditions is true: - # - el is the first element in its parent (block-level) - # - el is the last element in its parent (block-level) - # - el is adjacent to a block-level node - can_extract = (should_remove_inside and (not el.previous_sibling - or not el.next_sibling) - or should_remove_whitespace_outside(el.previous_sibling) - or should_remove_whitespace_outside(el.next_sibling)) - if (isinstance(el, NavigableString) - and six.text_type(el).strip() == '' - and can_extract): - el.extract() - # Convert the children first - for el in node.children: - if isinstance(el, Comment) or isinstance(el, Doctype): - continue + def _can_ignore(el): + if isinstance(el, (Comment, Doctype)): + # Comment and Doctype elements are always ignored. + return True + elif isinstance(el, Tag): + # Tags are always processed. + return False elif isinstance(el, NavigableString): + if six.text_type(el).strip() != '': + # Non-whitespace text nodes are always processed. + return False + elif should_remove_inside and (not el.previous_sibling or not el.next_sibling): + # Inside block elements (excluding
<pre>), ignore adjacent whitespace elements.
+                    return True
+                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
+                    # Outside block elements (including Hello') == 'Hello'
+    assert md('
Hello
') == '\n\nHello\n\n' def test_whitespace(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 1367006..31202cb 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -114,8 +114,19 @@ def test_del(): inline_tests('del', '~~') -def test_div(): - assert md('Hello World') == 'Hello World' +def test_div_section_article(): + for tag in ['div', 'section', 'article']: + assert md(f'
456
') == '\n\n456\n\n' + assert md(f'123<{tag}>456789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n 456 \n789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>

456

789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n

456

\n789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>
4 5 6
789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>\n
4 5 6
\n789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>4\n5\n6789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n4\n5\n6\n789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n

\n4\n5\n6\n

\n789') == '123\n\n4\n5\n6\n\n789' + assert md(f'<{tag}>

title

body', heading_style=ATX) == '\n\n# title\n\nbody\n\n' def test_em(): From a39b39329c118dbd0901c418dbadaf97d0447f97 Mon Sep 17 00:00:00 2001 From: chrispy Date: Tue, 4 Feb 2025 18:03:04 -0500 Subject: [PATCH 2/3] implement review feedback (thanks AlexVonB!) --- markdownify/__init__.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index fb216a1..9e4c99f 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -90,30 +90,37 @@ def should_remove_whitespace_outside(el): return should_remove_whitespace_inside(el) or (el and el.name == 'pre') -def _is_content_element(el): - """Returns True for content (tags and non-whitespace text), else False.""" +def _is_block_content_element(el): + """ + In a block context, returns: + + - True for content elements (tags and non-whitespace text) + - False for non-content elements (whitespace text, comments, doctypes) + """ if isinstance(el, Tag): return True + elif isinstance(el, (Comment, Doctype)): + return False # (subclasses of NavigableString, must test first) elif isinstance(el, NavigableString): return el.strip() != '' else: return False -def _prev_content_sibling(el): +def _prev_block_content_sibling(el): """Returns the first previous sibling that is a content element, else None.""" while el is not None: el = el.previous_sibling - if _is_content_element(el): + if _is_block_content_element(el): return el return None -def _next_content_sibling(el): +def _next_block_content_sibling(el): """Returns the first next sibling that is a content element, else None.""" while el is not None: el = el.next_sibling - if _is_content_element(el): + if _is_block_content_element(el): return el return None @@ -177,12 +184,13 @@ def process_tag(self, node, convert_as_inline): should_remove_inside = should_remove_whitespace_inside(node) def _can_ignore(el): - if isinstance(el, (Comment, Doctype)): - # Comment and Doctype elements 
are always ignored. - return True - elif isinstance(el, Tag): + if isinstance(el, Tag): # Tags are always processed. return False + elif isinstance(el, (Comment, Doctype)): + # Comment and Doctype elements are always ignored. + # (subclasses of NavigableString, must test first) + return True elif isinstance(el, NavigableString): if six.text_type(el).strip() != '': # Non-whitespace text nodes are always processed. @@ -191,7 +199,7 @@ def _can_ignore(el): # Inside block elements (excluding
<pre>), ignore adjacent whitespace elements.
                     return True
                 elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
-                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
+                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                     return True
                 else:
                     return False
@@ -462,9 +470,9 @@ def convert_list(self, el, text, convert_as_inline):
 
         nested = False
         before_paragraph = False
-        if next_sibling := _next_content_sibling(el):
-            if next_sibling.name not in ['ul', 'ol']:
-                before_paragraph = True
+        next_sibling = _next_block_content_sibling(el)
+        if next_sibling and next_sibling.name not in ['ul', 'ol']:
+            before_paragraph = True
         while el:
             if el.name == 'li':
                 nested = True

From 776ca3fc943a3bc22d606eb5f5008fad2b4b9042 Mon Sep 17 00:00:00 2001
From: chrispy 
Date: Tue, 4 Feb 2025 18:06:02 -0500
Subject: [PATCH 3/3] fix a lint error

---
 tests/test_conversions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 31202cb..1739cb9 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -116,7 +116,7 @@ def test_del():
 
 def test_div_section_article():
     for tag in ['div', 'section', 'article']:
-        assert md(f'
456
') == '\n\n456\n\n' + assert md(f'<{tag}>456') == '\n\n456\n\n' assert md(f'123<{tag}>456789') == '123\n\n456\n\n789' assert md(f'123<{tag}>\n 456 \n789') == '123\n\n456\n\n789' assert md(f'123<{tag}>

456

789') == '123\n\n456\n\n789'