From 1ed767a9fce35ce2b188b907319c7645e6b1cf80 Mon Sep 17 00:00:00 2001
From: chrispy <chrispy@synopsys.com>
Date: Sat, 1 Feb 2025 18:08:32 -0500
Subject: [PATCH 1/3] make conversion non-destructive to soup; improve
 div/article/section handling

Signed-off-by: chrispy <chrispy@synopsys.com>
---
 markdownify/__init__.py   | 105 ++++++++++++++++++++++++++++----------
 tests/test_basic.py       |   2 +-
 tests/test_conversions.py |  15 +++++-
 3 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 7d14fe7..ac2796f 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -1,4 +1,4 @@
-from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
+from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
 from textwrap import fill
 import re
 import six
@@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el):
     if html_heading_re.match(el.name) is not None:
         return True
     return el.name in ('p', 'blockquote',
+                       'article', 'div', 'section',
                        'ol', 'ul', 'li',
                        'table', 'thead', 'tbody', 'tfoot',
                        'tr', 'td', 'th')
@@ -89,6 +90,34 @@ def should_remove_whitespace_outside(el):
     return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
 
 
+def _is_content_element(el):
+    """Returns True for content (tags and non-whitespace text), else False."""
+    if isinstance(el, Tag):
+        return True
+    elif isinstance(el, NavigableString):
+        return el.strip() != ''
+    else:
+        return False
+
+
+def _prev_content_sibling(el):
+    """Returns the first previous sibling that is a content element, else None."""
+    while el is not None:
+        el = el.previous_sibling
+        if _is_content_element(el):
+            return el
+    return None
+
+
+def _next_content_sibling(el):
+    """Returns the first next sibling that is a content element, else None."""
+    while el is not None:
+        el = el.next_sibling
+        if _is_content_element(el):
+            return el
+    return None
+
+
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
@@ -144,29 +173,37 @@ def process_tag(self, node, convert_as_inline):
         if isHeading or isCell:
             convert_children_as_inline = True
 
-        # Remove whitespace-only textnodes just before, after or
-        # inside block-level elements.
+        # Collect child elements to process, ignoring whitespace-only text elements
+        # adjacent to the inner/outer boundaries of block elements.
         should_remove_inside = should_remove_whitespace_inside(node)
-        for el in node.children:
-            # Only extract (remove) whitespace-only text node if any of the
-            # conditions is true:
-            # - el is the first element in its parent (block-level)
-            # - el is the last element in its parent (block-level)
-            # - el is adjacent to a block-level node
-            can_extract = (should_remove_inside and (not el.previous_sibling
-                                                     or not el.next_sibling)
-                           or should_remove_whitespace_outside(el.previous_sibling)
-                           or should_remove_whitespace_outside(el.next_sibling))
-            if (isinstance(el, NavigableString)
-                    and six.text_type(el).strip() == ''
-                    and can_extract):
-                el.extract()
 
-        # Convert the children first
-        for el in node.children:
-            if isinstance(el, Comment) or isinstance(el, Doctype):
-                continue
+        def _can_ignore(el):
+            if isinstance(el, (Comment, Doctype)):
+                # Comment and Doctype elements are always ignored.
+                return True
+            elif isinstance(el, Tag):
+                # Tags are always processed.
+                return False
             elif isinstance(el, NavigableString):
+                if six.text_type(el).strip() != '':
+                    # Non-whitespace text nodes are always processed.
+                    return False
+                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
+                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
+                    return True
+                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
+                    # Outside block elements (including <pre), ignore adjacent whitespace elements.
+                    return True
+                else:
+                    return False
+            else:
+                raise ValueError('Unexpected element type: %s' % type(el))
+
+        children_to_convert = [child for child in node.children if not _can_ignore(child)]
+
+        # Convert the children first
+        for el in children_to_convert:
+            if isinstance(el, NavigableString):
                 text += self.process_text(el)
             else:
                 text_strip = text.rstrip('\n')
@@ -338,6 +375,16 @@ def convert_code(self, el, text, convert_as_inline):
 
     convert_del = abstract_inline_conversion(lambda self: '~~')
 
+    def convert_div(self, el, text, convert_as_inline):
+        stripped = text.strip()  # surrounding whitespace is re-added below
+        if convert_as_inline:
+            return ' ' + stripped + ' '  # inline context: pad with single spaces
+        return '\n\n%s\n\n' % stripped if stripped else ''  # empty blocks vanish
+
+    convert_article = convert_div
+
+    convert_section = convert_div
+
     convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
 
     convert_kbd = convert_code
@@ -416,8 +463,9 @@ def convert_list(self, el, text, convert_as_inline):
 
         nested = False
         before_paragraph = False
-        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
-            before_paragraph = True
+        if next_sibling := _next_content_sibling(el):
+            if next_sibling.name not in ['ul', 'ol']:
+                before_paragraph = True
         while el:
             if el.name == 'li':
                 nested = True
@@ -540,6 +588,7 @@ def convert_th(self, el, text, convert_as_inline):
 
     def convert_tr(self, el, text, convert_as_inline):
         cells = el.find_all(['td', 'th'])
+        is_first_row = el.find_previous_sibling() is None
         is_headrow = (
             all([cell.name == 'th' for cell in cells])
             or (el.parent.name == 'thead'
@@ -547,15 +596,15 @@ def convert_tr(self, el, text, convert_as_inline):
                 and len(el.parent.find_all('tr')) == 1)
         )
         is_head_row_missing = (
-            (not el.previous_sibling and not el.parent.name == 'tbody')
-            or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
+            (is_first_row and not el.parent.name == 'tbody')
+            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
         )
         overline = ''
         underline = ''
         if ((is_headrow
              or (is_head_row_missing
                  and self.options['table_infer_header']))
-                and not el.previous_sibling):
+                and is_first_row):
             # first row and:
             # - is headline or
             # - headline is missing and header inference is enabled
@@ -569,10 +618,10 @@ def convert_tr(self, el, text, convert_as_inline):
             underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
         elif ((is_head_row_missing
                and not self.options['table_infer_header'])
-              or (not el.previous_sibling
+              or (is_first_row
                   and (el.parent.name == 'table'
                        or (el.parent.name == 'tbody'
-                           and not el.parent.previous_sibling)))):
+                           and not el.parent.find_previous_sibling())))):
             # headline is missing and header inference is disabled or:
             # first row, not headline, and:
             #  - the parent is table or
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 584adb9..9be524e 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -6,7 +6,7 @@ def test_single_tag():
 
 
 def test_soup():
-    assert md('<div><span>Hello</div></span>') == 'Hello'
+    assert md('<div><span>Hello</div></span>') == '\n\nHello\n\n'
 
 
 def test_whitespace():
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 1367006..31202cb 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -114,8 +114,19 @@ def test_del():
     inline_tests('del', '~~')
 
 
-def test_div():
-    assert md('Hello</div> World') == 'Hello World'
+def test_div_section_article():
+    for tag in ['div', 'section', 'article']:
+        assert md(f'<div>456</div>') == '\n\n456\n\n'
+        assert md(f'123<{tag}>456</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}>\n 456 \n</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}><p>456</p></{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}>\n<p>456</p>\n</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}><pre>4 5 6</pre></{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
+        assert md(f'123<{tag}>\n<pre>4 5 6</pre>\n</{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
+        assert md(f'123<{tag}>4\n5\n6</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'123<{tag}>\n4\n5\n6\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'123<{tag}>\n<p>\n4\n5\n6\n</p>\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'<{tag}><h1>title</h1>body</{tag}>', heading_style=ATX) == '\n\n# title\n\nbody\n\n'
 
 
 def test_em():

From a39b39329c118dbd0901c418dbadaf97d0447f97 Mon Sep 17 00:00:00 2001
From: chrispy <chrispy@synopsys.com>
Date: Tue, 4 Feb 2025 18:03:04 -0500
Subject: [PATCH 2/3] implement review feedback (thanks AlexVonB!)

---
 markdownify/__init__.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index fb216a1..9e4c99f 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -90,30 +90,37 @@ def should_remove_whitespace_outside(el):
     return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
 
 
-def _is_content_element(el):
-    """Returns True for content (tags and non-whitespace text), else False."""
+def _is_block_content_element(el):
+    """
+    In a block context, returns:
+
+    - True for content elements (tags and non-whitespace text)
+    - False for non-content elements (whitespace text, comments, doctypes)
+    """
     if isinstance(el, Tag):
         return True
+    elif isinstance(el, (Comment, Doctype)):
+        return False  # Comment/Doctype subclass NavigableString, so this check must come first
     elif isinstance(el, NavigableString):
         return el.strip() != ''
     else:
         return False
 
 
-def _prev_content_sibling(el):
+def _prev_block_content_sibling(el):
     """Returns the first previous sibling that is a content element, else None."""
     while el is not None:
         el = el.previous_sibling
-        if _is_content_element(el):
+        if _is_block_content_element(el):
             return el
     return None
 
 
-def _next_content_sibling(el):
+def _next_block_content_sibling(el):
     """Returns the first next sibling that is a content element, else None."""
     while el is not None:
         el = el.next_sibling
-        if _is_content_element(el):
+        if _is_block_content_element(el):
             return el
     return None
 
@@ -177,12 +184,13 @@ def process_tag(self, node, convert_as_inline):
         should_remove_inside = should_remove_whitespace_inside(node)
 
         def _can_ignore(el):
-            if isinstance(el, (Comment, Doctype)):
-                # Comment and Doctype elements are always ignored.
-                return True
-            elif isinstance(el, Tag):
+            if isinstance(el, Tag):
                 # Tags are always processed.
                 return False
+            elif isinstance(el, (Comment, Doctype)):
+                # Comment and Doctype elements are always ignored.
+                # (They subclass NavigableString, so test for them before it.)
+                return True
             elif isinstance(el, NavigableString):
                 if six.text_type(el).strip() != '':
                     # Non-whitespace text nodes are always processed.
@@ -191,7 +199,7 @@ def _can_ignore(el):
                     # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                     return True
                 elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
-                    # Outside block elements (including <pre), ignore adjacent whitespace elements.
+                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                     return True
                 else:
                     return False
@@ -462,9 +470,9 @@ def convert_list(self, el, text, convert_as_inline):
 
         nested = False
         before_paragraph = False
-        if next_sibling := _next_content_sibling(el):
-            if next_sibling.name not in ['ul', 'ol']:
-                before_paragraph = True
+        next_sibling = _next_block_content_sibling(el)
+        if next_sibling and next_sibling.name not in ['ul', 'ol']:
+            before_paragraph = True
         while el:
             if el.name == 'li':
                 nested = True

From 776ca3fc943a3bc22d606eb5f5008fad2b4b9042 Mon Sep 17 00:00:00 2001
From: chrispy <chrispy@synopsys.com>
Date: Tue, 4 Feb 2025 18:06:02 -0500
Subject: [PATCH 3/3] fix a lint error

---
 tests/test_conversions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 31202cb..1739cb9 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -116,7 +116,7 @@ def test_del():
 
 def test_div_section_article():
     for tag in ['div', 'section', 'article']:
-        assert md(f'<div>456</div>') == '\n\n456\n\n'
+        assert md(f'<{tag}>456</{tag}>') == '\n\n456\n\n'
         assert md(f'123<{tag}>456</{tag}>789') == '123\n\n456\n\n789'
         assert md(f'123<{tag}>\n 456 \n</{tag}>789') == '123\n\n456\n\n789'
         assert md(f'123<{tag}><p>456</p></{tag}>789') == '123\n\n456\n\n789'