Skip to content

remove superfluous leading/trailing whitespace #181

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ wrap, wrap_width
Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.

strip_document
Controls whether leading and/or trailing separation newlines are removed from
the final converted document. Supported values are ``LSTRIP`` (leading),
``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
within the document are unaffected.
Defaults to ``STRIP``.

Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.

Expand Down
26 changes: 22 additions & 4 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
ASTERISK = '*'
UNDERSCORE = '_'

# Document strip styles: accepted values for the ``strip_document`` option,
# which controls removal of separation newlines from the final document.
LSTRIP = 'lstrip'  # remove leading newlines only
RSTRIP = 'rstrip'  # remove trailing newlines only
STRIP = 'strip'  # remove both leading and trailing newlines


def chomp(text):
"""
Expand Down Expand Up @@ -99,6 +104,7 @@ class DefaultOptions:
keep_inline_images_in = []
newline_style = SPACES
strip = None
strip_document = STRIP
strong_em_symbol = ASTERISK
sub_symbol = ''
sup_symbol = ''
Expand Down Expand Up @@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline):
return text

def convert__document_(self, el, text, convert_as_inline):
    """Apply final document-level formatting to the converted text.

    Invoked for the BeautifulSoup object itself (node.name == "[document]").
    Depending on ``self.options['strip_document']``, leading and/or trailing
    separation newlines are removed; only ``'\\n'`` characters at the ends
    of ``text`` are touched.

    Returns:
        The (possibly stripped) document text.

    Raises:
        ValueError: if ``strip_document`` is not LSTRIP, RSTRIP, STRIP or None.
    """
    strip_style = self.options['strip_document']

    if strip_style is None:
        return text  # leave leading and trailing separation newlines as-is
    if strip_style == LSTRIP:
        return text.lstrip('\n')  # remove leading separation newlines
    if strip_style == RSTRIP:
        return text.rstrip('\n')  # remove trailing separation newlines
    if strip_style == STRIP:
        return text.strip('\n')  # remove leading and trailing separation newlines

    # Wrap the value in a 1-tuple so that an invalid tuple value is formatted
    # into the message instead of being unpacked as %-arguments (which would
    # raise TypeError rather than the intended ValueError).
    raise ValueError('Invalid value for strip_document: %s' % (strip_style,))

def process_text(self, el):
Expand Down Expand Up @@ -454,6 +471,7 @@ def _indent_for_li(match):
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return ' ' + text.strip() + ' '
text = text.strip()
if self.options['wrap']:
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
Expand Down Expand Up @@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline):
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

def convert_table(self, el, text, convert_as_inline):
    """Render a converted table as its own block.

    Surrounding whitespace of the already-converted table text is removed,
    then the table is framed by a blank line before and after. The former
    unstripped ``return`` that preceded this one made it unreachable and
    has been dropped.
    """
    return '\n\n' + text.strip() + '\n\n'

def convert_caption(self, el, text, convert_as_inline):
    """Render a caption followed by a blank line.

    Leading/trailing whitespace inside the caption is removed before the
    separator is appended. The former unstripped ``return`` that preceded
    this one made it unreachable and has been dropped.
    """
    return text.strip() + '\n\n'

def convert_figcaption(self, el, text, convert_as_inline):
    """Render a figure caption as its own block.

    Leading/trailing whitespace inside the caption is removed, then the
    caption is framed by a blank line before and after. The former
    unstripped ``return`` that preceded this one made it unreachable and
    has been dropped.
    """
    return '\n\n' + text.strip() + '\n\n'

def convert_td(self, el, text, convert_as_inline):
colspan = 1
Expand Down
2 changes: 1 addition & 1 deletion tests/test_advanced.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md
from .utils import md


def test_chomp():
Expand Down
11 changes: 10 additions & 1 deletion tests/test_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
Test whitelisting/blacklisting of specific tags.

"""
from markdownify import markdownify as md
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
from .utils import md


def test_strip():
Expand All @@ -23,3 +24,11 @@ def test_convert():
def test_do_not_convert():
    """With an empty ``convert`` list the anchor is reduced to its text."""
    html = '<a href="https://github.com/matthewwithanm">Some Text</a>'
    assert md(html, convert=[]) == 'Some Text'


def test_strip_document():
    """Check every ``strip_document`` setting against a single paragraph."""
    html = "<p>Hello</p>"

    # test default of STRIP
    assert markdownify(html) == "Hello"

    expected_by_style = {
        LSTRIP: "Hello\n\n",
        RSTRIP: "\n\nHello",
        STRIP: "Hello",
        None: "\n\nHello\n\n",
    }
    for style, expected in expected_by_style.items():
        assert markdownify(html, strip_document=style) == expected
2 changes: 1 addition & 1 deletion tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md
from .utils import md


def test_single_tag():
Expand Down
14 changes: 8 additions & 6 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
from .utils import md


def inline_tests(tag, markup):
Expand Down Expand Up @@ -79,11 +80,6 @@ def test_br():
assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'


def test_caption():
    """A figcaption is separated from its neighbours by blank lines."""
    caption_first = md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>')
    assert caption_first == 'TEXT\n\nCaption\n\nSPAN'

    caption_last = md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT')
    assert caption_last == 'SPAN\n\nCaption\n\nTEXT'


def test_code():
inline_tests('code', '`')
assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
Expand Down Expand Up @@ -126,6 +122,11 @@ def test_em():
inline_tests('em', '*')


def test_figcaption():
    """Newlines inside a figcaption do not leak into the output."""
    caption_first = md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>")
    assert caption_first == "TEXT\n\nCaption\n\nSPAN"

    caption_last = md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT")
    assert caption_last == "SPAN\n\nCaption\n\nTEXT"


def test_header_with_space():
assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
Expand Down Expand Up @@ -236,6 +237,7 @@ def test_kbd():

def test_p():
assert md('<p>hello</p>') == '\n\nhello\n\n'
assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
Expand Down
4 changes: 2 additions & 2 deletions tests/test_custom_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def test_custom_conversion_functions():
def md(html, **options):
return UnitTestConverter(**options).convert(html)

assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'

assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"

Expand Down
2 changes: 1 addition & 1 deletion tests/test_escaping.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import warnings
from bs4 import MarkupResemblesLocatorWarning
from markdownify import markdownify as md
from .utils import md


def test_asterisks():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_lists.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md
from .utils import md


nested_uls = """
Expand Down
7 changes: 5 additions & 2 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md
from .utils import md


table = """<table>
Expand Down Expand Up @@ -228,7 +228,10 @@
</tbody>
</table>"""

table_with_caption = """TEXT<table><caption>Caption</caption>
table_with_caption = """TEXT<table>
<caption>
Caption
</caption>
<tbody><tr><td>Firstname</td>
<td>Lastname</td>
<td>Age</td>
Expand Down
9 changes: 9 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from markdownify import MarkdownConverter


# for unit testing, disable document-level stripping by default so that
# separation newlines are included in testing
def md(html, **options):
    """Convert *html* to Markdown with ``strip_document`` defaulting to None.

    Any ``strip_document`` value passed by the caller takes precedence.
    """
    # ``options`` is a fresh dict built from the keyword arguments, so
    # filling in the default in place does not affect the caller.
    options.setdefault("strip_document", None)
    converter = MarkdownConverter(**options)
    return converter.convert(html)
Loading