matthewwithanm · chrispy-snps · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/README.rst b/README.rst
@@ -150,6 +150,13 @@ wrap, wrap_width
   Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
   A `wrap_width` value of `None` reflows lines to unlimited line length.
 
+strip_document
+  Controls whether leading and/or trailing separation newlines are removed from
+  the final converted document. Supported values are ``LSTRIP`` (leading),
+  ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (no removal). Newlines
+  within the document are unaffected.
+  Defaults to ``LSTRIP``.
+
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
 

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -26,6 +26,11 @@
 ASTERISK = '*'
 UNDERSCORE = '_'
 
+# Document strip styles
+LSTRIP = 'lstrip'
+RSTRIP = 'rstrip'
+STRIP = 'strip'
+
 
 def chomp(text):
     """
@@ -99,6 +104,7 @@ class DefaultOptions:
         keep_inline_images_in = []
         newline_style = SPACES
         strip = None
+        strip_document = LSTRIP
         strong_em_symbol = ASTERISK
         sub_symbol = ''
         sup_symbol = ''
@@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline):
         return text
 
     def convert__document_(self, el, text, convert_as_inline):
-        # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is
+        """Final document-level formatting for BeautifulSoup objects (node.name == "[document]")"""
+        if self.options['strip_document'] == LSTRIP:
+            text = text.lstrip('\n')  # remove leading separation newlines
+        elif self.options['strip_document'] == RSTRIP:
+            text = text.rstrip('\n')  # remove trailing separation newlines
+        elif self.options['strip_document'] == STRIP:
+            text = text.strip('\n')  # remove leading and trailing separation newlines
+        elif self.options['strip_document'] is None:
+            pass  # leave leading and trailing separation newlines as-is
+        else:
+            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+
         return text
 
     def process_text(self, el):
@@ -454,6 +471,7 @@ def _indent_for_li(match):
     def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
             return ' ' + text.strip() + ' '
+        text = text.strip()
         if self.options['wrap']:
             # Preserve newlines (and preceding whitespace) resulting
             # from <br> tags.  Newlines in the input have already been
@@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline):
     convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
 
     def convert_table(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n'
+        return '\n\n' + text.strip() + '\n\n'
 
     def convert_caption(self, el, text, convert_as_inline):
-        return text + '\n\n'
+        return text.strip() + '\n\n'
 
     def convert_figcaption(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n\n'
+        return '\n\n' + text.strip() + '\n\n'
 
     def convert_td(self, el, text, convert_as_inline):
         colspan = 1

diff --git a/tests/test_advanced.py b/tests/test_advanced.py
@@ -1,4 +1,11 @@
-from markdownify import markdownify as md
+from markdownify import MarkdownConverter
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 def test_chomp():

diff --git a/tests/test_args.py b/tests/test_args.py
@@ -2,7 +2,7 @@
 Test whitelisting/blacklisting of specific tags.
 
 """
-from markdownify import markdownify as md
+from markdownify import markdownify as md, LSTRIP, RSTRIP, STRIP
 
 
 def test_strip():
@@ -23,3 +23,11 @@ def test_convert():
 def test_do_not_convert():
     text = md('<a href="https://github.com/matthewwithanm">Some Text</a>', convert=[])
     assert text == 'Some Text'
+
+
+def test_strip_document():
+    assert md("<p>Hello</p>") == "Hello\n\n"  # defaults to LSTRIP
+    assert md("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
+    assert md("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
+    assert md("<p>Hello</p>", strip_document=STRIP) == "Hello"
+    assert md("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -1,4 +1,11 @@
-from markdownify import markdownify as md
+from markdownify import MarkdownConverter
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 def test_single_tag():

diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -1,4 +1,11 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from markdownify import MarkdownConverter, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 def inline_tests(tag, markup):
@@ -79,11 +86,6 @@ def test_br():
     assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
 
 
-def test_caption():
-    assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
-    assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'
-
-
 def test_code():
     inline_tests('code', '`')
     assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
@@ -126,6 +128,11 @@ def test_em():
     inline_tests('em', '*')
 
 
+def test_figcaption():
+    assert (md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>") == "TEXT\n\nCaption\n\nSPAN")
+    assert (md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT") == "SPAN\n\nCaption\n\nTEXT")
+
+
 def test_header_with_space():
     assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
     assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
@@ -236,6 +243,7 @@ def test_kbd():
 
 def test_p():
     assert md('<p>hello</p>') == '\n\nhello\n\n'
+    assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
     assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
     assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
     assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'

diff --git a/tests/test_escaping.py b/tests/test_escaping.py
@@ -1,6 +1,13 @@
 import warnings
 from bs4 import MarkupResemblesLocatorWarning
-from markdownify import markdownify as md
+from markdownify import MarkdownConverter
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 def test_asterisks():

diff --git a/tests/test_lists.py b/tests/test_lists.py
@@ -1,4 +1,11 @@
-from markdownify import markdownify as md
+from markdownify import MarkdownConverter
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 nested_uls = """

diff --git a/tests/test_tables.py b/tests/test_tables.py
@@ -1,4 +1,11 @@
-from markdownify import markdownify as md
+from markdownify import MarkdownConverter
+
+
+def md(html, **options):
+    # disable document-level stripping so separation newlines are included in testing
+    options = {**options, "strip_document": None}
+
+    return MarkdownConverter(**options).convert(html)
 
 
 table = """<table>
@@ -228,7 +235,10 @@
     </tbody>
 </table>"""
 
-table_with_caption = """TEXT<table><caption>Caption</caption>
+table_with_caption = """TEXT<table>
+    <caption>
+        Caption
+    </caption>
     <tbody><tr><td>Firstname</td>
             <td>Lastname</td>
             <td>Age</td>