From 60d86663d7f3cb5113e55ecede817f1a306420a2 Mon Sep 17 00:00:00 2001
From: Joseph Myers <josmyers@redhat.com>
Date: Tue, 9 Apr 2024 16:47:15 +0000
Subject: [PATCH 1/4] More carefully separate inline text from block content

There are various cases in which inline text fails to be separated by
(sufficiently many) newlines from adjacent block content.  A paragraph
needs a blank line (two newlines) separating it from prior text, as
does an underlined header; an ATX header needs a single newline
separating it from prior text.  A list needs at least one newline
separating it from prior text, but in general two newlines (for an
ordered list starting other than at 1, which will only be recognized
given a blank line before).

To avoid accumulation of more newlines than necessary, take care when
concatenating the results of converting consecutive tags to remove
redundant newlines (keeping the greater of the number ending the prior
text and the number starting the subsequent text).

This is thus an alternative to #108 that tries to avoid the excess
newline accumulation that was a concern there, as well as fixing more
cases than just paragraphs, and updating tests.

Fixes #92

Fixes #98
---
 markdownify/__init__.py   | 18 +++++++----
 tests/test_advanced.py    |  2 +-
 tests/test_conversions.py | 65 ++++++++++++++++++++-------------------
 tests/test_lists.py       | 20 ++++++------
 4 files changed, 58 insertions(+), 47 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index eaa6ded..d0da098 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -143,7 +143,13 @@ def is_nested_node(el):
             elif isinstance(el, NavigableString):
                 text += self.process_text(el)
             else:
-                text += self.process_tag(el, convert_children_as_inline)
+                text_strip = text.rstrip('\n')
+                newlines_left = len(text) - len(text_strip)
+                next_text = self.process_tag(el, convert_children_as_inline)
+                next_text_strip = next_text.lstrip('\n')
+                newlines_right = len(next_text) - len(next_text_strip)
+                newlines = '\n' * max(newlines_left, newlines_right)
+                text = text_strip + newlines + next_text_strip
 
         if not children_only:
             convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -216,7 +222,7 @@ def indent(self, text, level):
 
     def underline(self, text, pad_char):
         text = (text or '').rstrip()
-        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
 
     def convert_a(self, el, text, convert_as_inline):
         prefix, suffix, text = chomp(text)
@@ -277,8 +283,8 @@ def convert_hn(self, n, el, text, convert_as_inline):
             return self.underline(text, line)
         hashes = '#' * n
         if style == ATX_CLOSED:
-            return '%s %s %s\n\n' % (hashes, text, hashes)
-        return '%s %s\n\n' % (hashes, text)
+            return '\n%s %s %s\n\n' % (hashes, text, hashes)
+        return '\n%s %s\n\n' % (hashes, text)
 
     def convert_hr(self, el, text, convert_as_inline):
         return '\n\n---\n\n'
@@ -313,7 +319,7 @@ def convert_list(self, el, text, convert_as_inline):
         if nested:
             # remove trailing newline if nested
             return '\n' + self.indent(text, 1).rstrip()
-        return text + ('\n' if before_paragraph else '')
+        return '\n\n' + text + ('\n' if before_paragraph else '')
 
     convert_ul = convert_list
     convert_ol = convert_list
@@ -344,7 +350,7 @@ def convert_p(self, el, text, convert_as_inline):
                         width=self.options['wrap_width'],
                         break_long_words=False,
                         break_on_hyphens=False)
-        return '%s\n\n' % text if text else ''
+        return '\n\n%s\n\n' % text if text else ''
 
     def convert_pre(self, el, text, convert_as_inline):
         if not text:
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index 14bf3cd..a3a5fda 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -14,7 +14,7 @@ def test_chomp():
 
 def test_nested():
     text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
-    assert text == 'This is an [example link](http://example.com/).\n\n'
+    assert text == '\n\nThis is an [example link](http://example.com/).\n\n'
 
 
 def test_ignore_comments():
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 9652143..e2c172a 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -112,36 +112,38 @@ def test_em():
 
 
 def test_header_with_space():
-    assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
-    assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
-    assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello   \n\n</h5>') == '##### Hello\n\n'
+    assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
+    assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
+    assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
+    assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
+    assert md('<h5>\n\nHello   \n\n</h5>') == '\n##### Hello\n\n'
 
 
 def test_h1():
-    assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
+    assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'
 
 
 def test_h2():
-    assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
+    assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'
 
 
 def test_hn():
-    assert md('<h3>Hello</h3>') == '### Hello\n\n'
-    assert md('<h4>Hello</h4>') == '#### Hello\n\n'
-    assert md('<h5>Hello</h5>') == '##### Hello\n\n'
-    assert md('<h6>Hello</h6>') == '###### Hello\n\n'
+    assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
+    assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
+    assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
+    assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'
 
 
 def test_hn_chained():
-    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
-    assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
+    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
+    assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
 
 
 def test_hn_nested_tag_heading_style():
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'
 
 
 def test_hn_nested_simple_tag():
@@ -157,12 +159,12 @@ def test_hn_nested_simple_tag():
     ]
 
     for tag, markdown in tag_to_markdown:
-        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'
 
-    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
+    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'
 
     # Nested lists not supported
-    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
+    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'
 
 
 def test_hn_nested_img():
@@ -172,18 +174,18 @@ def test_hn_nested_img():
         ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
     ]
     for image_attributes, markdown, title in image_attributes_to_markdown:
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
 
 
 def test_hn_atx_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'
 
 
 def test_hn_atx_closed_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'
 
 
 def test_head():
@@ -193,7 +195,7 @@ def test_head():
 def test_hr():
     assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
     assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
-    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
+    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n\n---\n\n\nWorld\n\n'
 
 
 def test_i():
@@ -210,12 +212,13 @@ def test_kbd():
 
 
 def test_p():
-    assert md('<p>hello</p>') == 'hello\n\n'
-    assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
-    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
-    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
-    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
-    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
+    assert md('<p>hello</p>') == '\n\nhello\n\n'
+    assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
+    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
 
 
 def test_pre():
diff --git a/tests/test_lists.py b/tests/test_lists.py
index 5a04430..0b23179 100644
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -41,16 +41,17 @@
 
 
 def test_ol():
-    assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
+    assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
+    assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
 
 
 def test_nested_ols():
-    assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
+    assert md(nested_ols) == '\n\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
 
 
 def test_ul():
-    assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
+    assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
     assert md("""<ul>
      <li>
              a
@@ -58,11 +59,12 @@ def test_ul():
      <li> b </li>
      <li>   c
      </li>
- </ul>""") == '* a\n* b\n* c\n'
+ </ul>""") == '\n\n* a\n* b\n* c\n'
 
 
 def test_inline_ul():
-    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
+    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
+    assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'
 
 
 def test_nested_uls():
@@ -70,12 +72,12 @@ def test_nested_uls():
     Nested ULs should alternate bullet characters.
 
     """
-    assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
+    assert md(nested_uls) == '\n\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
 
 
 def test_bullets():
-    assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
+    assert md(nested_uls, bullets='-') == '\n\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
 
 
 def test_li_text():
-    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
+    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'

From c2ffe46e858b3d98ed73f6c7107c88e186e3d43d Mon Sep 17 00:00:00 2001
From: Joseph Myers <josmyers@redhat.com>
Date: Thu, 3 Oct 2024 00:30:50 +0000
Subject: [PATCH 2/4] Fix whitespace issues around wrapping

This fixes various issues relating to how input whitespace is handled
and how wrapping handles whitespace resulting from hard line breaks.

This PR uses a branch based on that for #120 to avoid conflicts with
the fixes and associated test changes there.  My suggestion is thus
first to merge #120 (which fixes two open issues), then to merge the
remaining changes from this PR.

Wrapping paragraphs has the effect of losing all newlines including
those from `<br>` tags, contrary to HTML semantics (wrapping should be
a matter of pretty-printing the output; input whitespace from the HTML
input should be normalized, but `<br>` should remain as a hard line
break).  To fix this, we need to wrap the portions of a paragraph
between hard line breaks separately.  For this to work, ensure that
when wrapping, all input whitespace is normalized at an early stage,
including turning newlines into spaces.  (Only ASCII whitespace is
handled this way; `\s` is not used as it's not clear Unicode
whitespace should get such normalization.)

When not wrapping, there is still too much input whitespace
preservation.  If the input contains a blank line, that ends up as a
paragraph break in the output, or breaks the header formatting when
appearing in a header tag, though in terms of HTML semantics such a
blank line is no different from a space.  In the case of an ATX
header, even a single newline appearing in the output breaks the
Markdown.  Thus, when not wrapping, arrange for input whitespace
containing at least one `\r` or `\n` to be normalized to a single
newline, and in the ATX header case, normalize to a space.

Fixes #130
(probably, not sure exactly what the HTML input there is)

Fixes #88
(a related case, anyway; the actual input in #88 has already been fixed)
---
 markdownify/__init__.py   | 29 +++++++++++++++++++++++------
 tests/test_basic.py       |  1 +
 tests/test_conversions.py | 15 +++++++++++++--
 tests/test_tables.py      |  2 +-
 4 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index efb2d15..a37f870 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -7,7 +7,8 @@
 convert_heading_re = re.compile(r'convert_h(\d+)')
 line_beginning_re = re.compile(r'^', re.MULTILINE)
 whitespace_re = re.compile(r'[\t ]+')
-all_whitespace_re = re.compile(r'[\s]+')
+all_whitespace_re = re.compile(r'[\t \r\n]+')
+newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 html_heading_re = re.compile(r'h[1-6]')
 
 
@@ -168,7 +169,11 @@ def process_text(self, el):
 
         # normalize whitespace if we're not inside a preformatted element
         if not el.find_parent('pre'):
-            text = whitespace_re.sub(' ', text)
+            if self.options['wrap']:
+                text = all_whitespace_re.sub(' ', text)
+            else:
+                text = newline_whitespace_re.sub('\n', text)
+                text = whitespace_re.sub(' ', text)
 
         # escape special characters if we're not inside a preformatted or code element
         if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
@@ -286,6 +291,7 @@ def convert_hn(self, n, el, text, convert_as_inline):
         if style == UNDERLINED and n <= 2:
             line = '=' if n == 1 else '-'
             return self.underline(text, line)
+        text = all_whitespace_re.sub(' ', text)
         hashes = '#' * n
         if style == ATX_CLOSED:
             return '\n%s %s %s\n\n' % (hashes, text, hashes)
@@ -351,10 +357,21 @@ def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
             return text
         if self.options['wrap']:
-            text = fill(text,
-                        width=self.options['wrap_width'],
-                        break_long_words=False,
-                        break_on_hyphens=False)
+            # Preserve newlines (and preceding whitespace) resulting
+            # from <br> tags.  Newlines in the input have already been
+            # replaced by spaces.
+            lines = text.split('\n')
+            new_lines = []
+            for line in lines:
+                line = line.lstrip()
+                line_no_trailing = line.rstrip()
+                trailing = line[len(line_no_trailing):]
+                line = fill(line,
+                            width=self.options['wrap_width'],
+                            break_long_words=False,
+                            break_on_hyphens=False)
+                new_lines.append(line + trailing)
+            text = '\n'.join(new_lines)
         return '\n\n%s\n\n' % text if text else ''
 
     def convert_pre(self, el, text, convert_as_inline):
diff --git a/tests/test_basic.py b/tests/test_basic.py
index bf25ee0..66f8b6c 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -11,3 +11,4 @@ def test_soup():
 
 def test_whitespace():
     assert md(' a  b \t\t c ') == ' a b c '
+    assert md(' a  b \n\n c ') == ' a b\nc '
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index baa294b..9c1edc3 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
+from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
 
 
 def inline_tests(tag, markup):
@@ -113,6 +113,7 @@ def test_em():
 
 def test_header_with_space():
     assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
+    assert md('<h3>Hello\n\n\nWorld</h3>') == '\n### Hello World\n\n'
     assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
     assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
     assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
@@ -174,7 +175,7 @@ def test_hn_nested_img():
         ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
     ]
     for image_attributes, markdown, title in image_attributes_to_markdown:
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
         assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
 
 
@@ -214,10 +215,20 @@ def test_kbd():
 def test_p():
     assert md('<p>hello</p>') == '\n\nhello\n\n'
     assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
     assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
     assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
     assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
     assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012  \n67890\n\n'
     assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
 
 
diff --git a/tests/test_tables.py b/tests/test_tables.py
index 594e5bf..fc6eee6 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -242,7 +242,7 @@ def test_table():
     assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
     assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
     assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith  Jackson | 50 |\n| Eve | Jackson  Smith | 94 |\n\n'
+    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
     assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
     assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
     assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'

From 340aecbe98e5450cca872f03b1c0ecf70ac83de0 Mon Sep 17 00:00:00 2001
From: Joseph Myers <josmyers@redhat.com>
Date: Thu, 3 Oct 2024 20:16:23 +0000
Subject: [PATCH 3/4] More thorough cleanup of input whitespace

This improves the markdownify logic for cleaning up input whitespace
that has no semantic significance in HTML.

This PR uses a branch based on that for #150 (which in turn is based
on that for #120) to avoid conflicts with those fixes.  The suggested
order of merging is just first to merge #120, then the rest of #150,
then the rest of this PR.

Whitespace in HTML input isn't generally significant before or after
block-level elements, or at the start of end of such an element other
than `<pre>`.  There is some limited logic in markdownify for removing
it, (a) for whitespace-only nodes in conjunction with a limited list
of elements (and with questionable logic that ony removes whitespace
adjacent to such an element when also inside such an element) and (b)
only for trailing whitespace, in certain places in relation to lists.

Replace both those places with more thorough logic using a common list
of block-level elements (which could be expanded more).

In general, this reduces the number of unnecessary blank lines in
output from markdownify (sometimes lines with just a newline,
sometimes lines containing a space as well as that newline).  There
are open issues about cases where propagating such input whitespace to
the output actually results in badly formed Markdown output (wrongly
indented output), but #120 (which this builds on) fixes those issues,
sometimes leaving unnecessary lines with just a space on them in the
output, which are dealt with fully by the present PR.

There are a few testcases that are affected because they were relying
on such whitespace for good output from bad HTML input that used `<p>`
or `<blockquote>` inside header tags.  To keep reasonable output in
those cases of bad input now input whitespace adjacent to those two
tags is ignored, make the `<p>` and `<blockquote>` output explicitly
include leading and trailing spaces if `convert_as_inline`; such
explicit spaces seem the best that can be done for such bad input.
Given those fixes, all the remaining changes needed to the
expectations of existing tests seem like improvements (removing
useless spaces or newlines from the output).
---
 markdownify/__init__.py   | 75 ++++++++++++++++++++++++---------------
 tests/test_conversions.py | 16 +++++++--
 2 files changed, 59 insertions(+), 32 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index a37f870..dd2507d 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -67,6 +67,23 @@ def _todict(obj):
     return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
 
 
+def remove_whitespace_inside(el):
+    """Return to remove whitespace immediately inside a block-level element."""
+    if not el or not el.name:
+        return False
+    if html_heading_re.match(el.name) is not None:
+        return True
+    return el.name in ('p', 'blockquote',
+                       'ol', 'ul', 'li',
+                       'table', 'thead', 'tbody', 'tfoot',
+                       'tr', 'td', 'th')
+
+
+def remove_whitespace_outside(el):
+    """Return to remove whitespace immediately outside a block-level element."""
+    return remove_whitespace_inside(el) or (el and el.name == 'pre')
+
+
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
@@ -120,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
         if not children_only and (isHeading or isCell):
             convert_children_as_inline = True
 
-        # Remove whitespace-only textnodes in purely nested nodes
-        def is_nested_node(el):
-            return el and el.name in ['ol', 'ul', 'li',
-                                      'table', 'thead', 'tbody', 'tfoot',
-                                      'tr', 'td', 'th']
-
-        if is_nested_node(node):
-            for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the
-                # conditions is true:
-                # - el is the first element in its parent
-                # - el is the last element in its parent
-                # - el is adjacent to an nested node
-                can_extract = (not el.previous_sibling
-                               or not el.next_sibling
-                               or is_nested_node(el.previous_sibling)
-                               or is_nested_node(el.next_sibling))
-                if (isinstance(el, NavigableString)
-                        and six.text_type(el).strip() == ''
-                        and can_extract):
-                    el.extract()
+        # Remove whitespace-only textnodes just before, after or
+        # inside block-level elements.
+        remove_inside = remove_whitespace_inside(node)
+        for el in node.children:
+            # Only extract (remove) whitespace-only text node if any of the
+            # conditions is true:
+            # - el is the first element in its parent (block-level)
+            # - el is the last element in its parent (block-level)
+            # - el is adjacent to a block-level node
+            can_extract = (remove_inside and (not el.previous_sibling
+                                              or not el.next_sibling)
+                           or remove_whitespace_outside(el.previous_sibling)
+                           or remove_whitespace_outside(el.next_sibling))
+            if (isinstance(el, NavigableString)
+                    and six.text_type(el).strip() == ''
+                    and can_extract):
+                el.extract()
 
         # Convert the children first
         for el in node.children:
@@ -179,12 +192,16 @@ def process_text(self, el):
         if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
             text = self.escape(text)
 
-        # remove trailing whitespaces if any of the following condition is true:
-        # - current text node is the last node in li
-        # - current text node is followed by an embedded list
-        if (el.parent.name == 'li'
-                and (not el.next_sibling
-                     or el.next_sibling.name in ['ul', 'ol'])):
+        # remove leading whitespace at the start or just after a
+        # block-level element; remove traliing whitespace at the end
+        # or just before a block-level element.
+        if (remove_whitespace_outside(el.previous_sibling)
+                or (remove_whitespace_inside(el.parent)
+                    and not el.previous_sibling)):
+            text = text.lstrip()
+        if (remove_whitespace_outside(el.next_sibling)
+                or (remove_whitespace_inside(el.parent)
+                    and not el.next_sibling)):
             text = text.rstrip()
 
         return text
@@ -257,7 +274,7 @@ def convert_a(self, el, text, convert_as_inline):
     def convert_blockquote(self, el, text, convert_as_inline):
 
         if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '
 
         return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
 
@@ -355,7 +372,7 @@ def convert_li(self, el, text, convert_as_inline):
 
     def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '
         if self.options['wrap']:
             # Preserve newlines (and preceding whitespace) resulting
             # from <br> tags.  Newlines in the input have already been
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 9c1edc3..0be1d0c 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -66,7 +66,7 @@ def test_blockquote_with_paragraph():
 
 def test_blockquote_nested():
     text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
-    assert text == '\n> And she was like \n> > Hello\n\n'
+    assert text == '\n> And she was like\n> > Hello\n\n'
 
 
 def test_br():
@@ -136,7 +136,7 @@ def test_hn():
 
 
 def test_hn_chained():
-    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n'
+    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n'
     assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
     assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
     assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
@@ -196,7 +196,7 @@ def test_head():
 def test_hr():
     assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
     assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
-    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n\n---\n\n\nWorld\n\n'
+    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'
 
 
 def test_i():
@@ -303,3 +303,13 @@ def callback(el):
     assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n    foo\nbar\n```\n'
     assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
     assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
+
+
+def test_spaces():
+    assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
+    assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
+    assert md('test <p> again </p>') == 'test\n\nagain\n\n'
+    assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
+    assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
+    assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
+    assert md('test <pre> foo </pre> bar') == 'test\n```\n foo \n```\nbar'

From c13bdd5c1426c5bbfd3d340096bc6c1fb2f508bf Mon Sep 17 00:00:00 2001
From: Joseph Myers <josmyers@redhat.com>
Date: Thu, 3 Oct 2024 21:04:40 +0000
Subject: [PATCH 4/4] Fix logic for indentation inside list items

This fixes problems with the markdownify logic for indentation inside
list items.

This PR uses a branch building on that for #120, #150 and #151, so
those three PRs should be merged first before merging this one.

There is limited logic in markdownify for handling indentation in the
case of nested lists.  There are two major problems with this logic:

* As it's in `convert_list`, causing a list to be indented when inside
  another list, it does not add indentation for any other elements
  such as paragraphs that may be found inside list items (or `<pre>`,
  `<blockquote>`, etc.), so such elements are wrongly not indented and
  terminate the list in the output.

* It uses fixed indentation of one tab.  Following CommonMark, a tab
  in Markdown is considered equivalent to four spaces, which is not
  sufficient indentation in ordered list items with a number of three
  or more digits.

Fix both of these issues by making `convert_li` handle indentation for
the contents of `<li>`, based on the length of the list item marker,
rather than doing it in `convert_list` at all.
---
 markdownify/__init__.py | 13 +++++++++----
 tests/test_lists.py     |  8 +++++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index dd2507d..5cbf95f 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -244,8 +244,8 @@ def escape(self, text):
             text = text.replace('_', r'\_')
         return text
 
-    def indent(self, text, level):
-        return line_beginning_re.sub('\t' * level, text) if text else ''
+    def indent(self, text, columns):
+        return line_beginning_re.sub(' ' * columns, text) if text else ''
 
     def underline(self, text, pad_char):
         text = (text or '').rstrip()
@@ -346,7 +346,7 @@ def convert_list(self, el, text, convert_as_inline):
             el = el.parent
         if nested:
             # remove trailing newline if nested
-            return '\n' + self.indent(text, 1).rstrip()
+            return '\n' + text.rstrip()
         return '\n\n' + text + ('\n' if before_paragraph else '')
 
     convert_ul = convert_list
@@ -368,7 +368,12 @@ def convert_li(self, el, text, convert_as_inline):
                 el = el.parent
             bullets = self.options['bullets']
             bullet = bullets[depth % len(bullets)]
-        return '%s %s\n' % (bullet, (text or '').strip())
+        bullet = bullet + ' '
+        text = (text or '').strip()
+        text = self.indent(text, len(bullet))
+        if text:
+            text = bullet + text[len(bullet):]
+        return '%s\n' % text
 
     def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
diff --git a/tests/test_lists.py b/tests/test_lists.py
index ecc1a65..a660778 100644
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -47,10 +47,11 @@ def test_ol():
     assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
     assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
     assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="1234"><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ol>') == '\n\n1234. first para\n      \n      second para\n1235. third para\n      \n      fourth para\n'
 
 
 def test_nested_ols():
-    assert md(nested_ols) == '\n\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
+    assert md(nested_ols) == '\n\n1. 1\n   1. a\n      1. I\n      2. II\n      3. III\n   2. b\n   3. c\n2. 2\n3. 3\n'
 
 
 def test_ul():
@@ -63,6 +64,7 @@ def test_ul():
      <li>   c
      </li>
  </ul>""") == '\n\n* a\n* b\n* c\n'
+    assert md('<ul><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ul>') == '\n\n* first para\n  \n  second para\n* third para\n  \n  fourth para\n'
 
 
 def test_inline_ul():
@@ -75,11 +77,11 @@ def test_nested_uls():
     Nested ULs should alternate bullet characters.
 
     """
-    assert md(nested_uls) == '\n\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
+    assert md(nested_uls) == '\n\n* 1\n  + a\n    - I\n    - II\n    - III\n  + b\n  + c\n* 2\n* 3\n'
 
 
 def test_bullets():
-    assert md(nested_uls, bullets='-') == '\n\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
+    assert md(nested_uls, bullets='-') == '\n\n- 1\n  - a\n    - I\n    - II\n    - III\n  - b\n  - c\n- 2\n- 3\n'
 
 
 def test_li_text():