From 01fe37dc45c661154e36a394406dfca58d139da0 Mon Sep 17 00:00:00 2001 From: chrispy Date: Sun, 26 Jan 2025 21:27:09 -0500 Subject: [PATCH 1/3] remove superfluous leading/trailing whitespace Signed-off-by: chrispy --- markdownify/__init__.py | 12 ++- tests/test_advanced.py | 2 +- tests/test_conversions.py | 181 +++++++++++++++++++------------------- tests/test_lists.py | 28 +++--- tests/test_tables.py | 57 ++++++------ 5 files changed, 144 insertions(+), 136 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index ef4e7ca..10ff153 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -180,7 +180,10 @@ def process_tag(self, node, convert_as_inline): return text def convert__document_(self, el, text, convert_as_inline): - # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is + """Final document-level formatting for BeautifulSoup object (node.name == "[document]")""" + # remove all leading newlines + text = text.lstrip('\n') + return text def process_text(self, el): @@ -454,6 +457,7 @@ def _indent_for_li(match): def convert_p(self, el, text, convert_as_inline): if convert_as_inline: return ' ' + text.strip() + ' ' + text = text.strip() if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been @@ -500,13 +504,13 @@ def convert_style(self, el, text, convert_as_inline): convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) def convert_table(self, el, text, convert_as_inline): - return '\n\n' + text + '\n' + return '\n\n' + text.strip() + '\n\n' def convert_caption(self, el, text, convert_as_inline): - return text + '\n\n' + return text.strip() + '\n\n' def convert_figcaption(self, el, text, convert_as_inline): - return '\n\n' + text + '\n\n' + return '\n\n' + text.strip() + '\n\n' def convert_td(self, el, text, convert_as_inline): colspan = 1 diff --git a/tests/test_advanced.py b/tests/test_advanced.py index a3a5fda..14bf3cd 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -14,7 +14,7 @@ def test_chomp(): def test_nested(): text = md('

This is an example link.

') - assert text == '\n\nThis is an [example link](http://example.com/).\n\n' + assert text == 'This is an [example link](http://example.com/).\n\n' def test_ignore_comments(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 05c6cd4..4df024e 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -41,7 +41,7 @@ def test_a_no_autolinks(): def test_a_in_code(): assert md('Google') == '`Google`' - assert md('
Google
') == '\n\n```\nGoogle\n```\n\n' + assert md('
Google
') == '```\nGoogle\n```\n\n' def test_b(): @@ -56,22 +56,22 @@ def test_b_spaces(): def test_blockquote(): - assert md('
Hello
') == '\n> Hello\n\n' - assert md('
\nHello\n
') == '\n> Hello\n\n' + assert md('
Hello
') == '> Hello\n\n' + assert md('
\nHello\n
') == '> Hello\n\n' def test_blockquote_with_nested_paragraph(): - assert md('

Hello

') == '\n> Hello\n\n' - assert md('

Hello

Hello again

') == '\n> Hello\n>\n> Hello again\n\n' + assert md('

Hello

') == '> Hello\n\n' + assert md('

Hello

Hello again

') == '> Hello\n>\n> Hello again\n\n' def test_blockquote_with_paragraph(): - assert md('
Hello

handsome

') == '\n> Hello\n\nhandsome\n\n' + assert md('
Hello

handsome

') == '> Hello\n\nhandsome\n\n' def test_blockquote_nested(): text = md('
And she was like
Hello
') - assert text == '\n> And she was like\n> > Hello\n\n' + assert text == '> And she was like\n> > Hello\n\n' def test_br(): @@ -79,11 +79,6 @@ def test_br(): assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' -def test_caption(): - assert md('TEXT
Caption
SPAN
') == 'TEXT\n\nCaption\n\nSPAN' - assert md('
SPAN
Caption
TEXT') == 'SPAN\n\nCaption\n\nTEXT' - - def test_code(): inline_tests('code', '`') assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' @@ -105,13 +100,13 @@ def test_code(): def test_dl(): - assert md('
term
definition
') == '\nterm\n: definition\n' - assert md('

te

rm

definition
') == '\nte rm\n: definition\n' - assert md('
term

definition-p1

definition-p2

') == '\nterm\n: definition-p1\n\n definition-p2\n' - assert md('
term

definition 1

definition 2

') == '\nterm\n: definition 1\n: definition 2\n' - assert md('
term 1
definition 1
term 2
definition 2
') == '\nterm 1\n: definition 1\nterm 2\n: definition 2\n' - assert md('
term

line 1

line 2

') == '\nterm\n: > line 1\n >\n > line 2\n' - assert md('
term
  1. 1

    • 2a
    • 2b
  2. 3

') == '\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n' + assert md('
term
definition
') == 'term\n: definition\n' + assert md('

te

rm

definition
') == 'te rm\n: definition\n' + assert md('
term

definition-p1

definition-p2

') == 'term\n: definition-p1\n\n definition-p2\n' + assert md('
term

definition 1

definition 2

') == 'term\n: definition 1\n: definition 2\n' + assert md('
term 1
definition 1
term 2
definition 2
') == 'term 1\n: definition 1\nterm 2\n: definition 2\n' + assert md('
term

line 1

line 2

') == 'term\n: > line 1\n >\n > line 2\n' + assert md('
term
  1. 1

    • 2a
    • 2b
  2. 3

') == 'term\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n' def test_del(): @@ -126,42 +121,47 @@ def test_em(): inline_tests('em', '*') +def test_figcaption(): + assert (md("TEXT
\nCaption\n
SPAN
") == "TEXT\n\nCaption\n\nSPAN") + assert (md("
SPAN
\nCaption\n
TEXT") == "SPAN\n\nCaption\n\nTEXT") + + def test_header_with_space(): - assert md('

\n\nHello

') == '\n\n### Hello\n\n' - assert md('

Hello\n\n\nWorld

') == '\n\n### Hello World\n\n' - assert md('

\n\nHello

') == '\n\n#### Hello\n\n' - assert md('
\n\nHello
') == '\n\n##### Hello\n\n' - assert md('
\n\nHello\n\n
') == '\n\n##### Hello\n\n' - assert md('
\n\nHello \n\n
') == '\n\n##### Hello\n\n' + assert md('

\n\nHello

') == '### Hello\n\n' + assert md('

Hello\n\n\nWorld

') == '### Hello World\n\n' + assert md('

\n\nHello

') == '#### Hello\n\n' + assert md('
\n\nHello
') == '##### Hello\n\n' + assert md('
\n\nHello\n\n
') == '##### Hello\n\n' + assert md('
\n\nHello \n\n
') == '##### Hello\n\n' def test_h1(): - assert md('

Hello

') == '\n\nHello\n=====\n\n' + assert md('

Hello

') == 'Hello\n=====\n\n' def test_h2(): - assert md('

Hello

') == '\n\nHello\n-----\n\n' + assert md('

Hello

') == 'Hello\n-----\n\n' def test_hn(): - assert md('

Hello

') == '\n\n### Hello\n\n' - assert md('

Hello

') == '\n\n#### Hello\n\n' - assert md('
Hello
') == '\n\n##### Hello\n\n' - assert md('
Hello
') == '\n\n###### Hello\n\n' + assert md('

Hello

') == '### Hello\n\n' + assert md('

Hello

') == '#### Hello\n\n' + assert md('
Hello
') == '##### Hello\n\n' + assert md('
Hello
') == '###### Hello\n\n' assert md('Hello') == md('
Hello
') assert md('Hello') == md('Hello') def test_hn_chained(): - assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n\n# First\n\n## Second\n\n### Third\n\n' + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n## Second\n\n### Third\n\n' assert md('X

First

', heading_style=ATX) == 'X\n\n# First\n\n' assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n\n# First #\n\n' assert md('X

First

') == 'X\n\nFirst\n=====\n\n' def test_hn_nested_tag_heading_style(): - assert md('

A

P

C

', heading_style=ATX_CLOSED) == '\n\n# A P C #\n\n' - assert md('

A

P

C

', heading_style=ATX) == '\n\n# A P C\n\n' + assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' + assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' def test_hn_nested_simple_tag(): @@ -177,9 +177,9 @@ def test_hn_nested_simple_tag(): ] for tag, markdown in tag_to_markdown: - assert md('

A <' + tag + '>' + tag + ' B

') == '\n\n### A ' + markdown + ' B\n\n' + assert md('

A <' + tag + '>' + tag + ' B

') == '### A ' + markdown + ' B\n\n' - assert md('

A
B

', heading_style=ATX) == '\n\n### A B\n\n' + assert md('

A
B

', heading_style=ATX) == '### A B\n\n' # Nested lists not supported # assert md('

A
  • li1
  • l2

', heading_style=ATX) == '\n### A li1 li2 B\n\n' @@ -192,23 +192,23 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

A B

') == '\n\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' - assert md('

A B

', keep_inline_images_in=['h3']) == '\n\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' + assert md('

A B

') == '### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' + assert md('

A B

', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' def test_hn_atx_headings(): - assert md('

Hello

', heading_style=ATX) == '\n\n# Hello\n\n' - assert md('

Hello

', heading_style=ATX) == '\n\n## Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' def test_hn_atx_closed_headings(): - assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n# Hello #\n\n' - assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n## Hello ##\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '# Hello #\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '## Hello ##\n\n' def test_hn_newlines(): - assert md("

H1-1

TEXT

H2-2

TEXT

H1-2

TEXT", heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT' - assert md('

H1-1

\n

TEXT

\n

H2-2

\n

TEXT

\n

H1-2

\n

TEXT

', heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n' + assert md("

H1-1

TEXT

H2-2

TEXT

H1-2

TEXT", heading_style=ATX) == '# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT' + assert md('

H1-1

\n

TEXT

\n

H2-2

\n

TEXT

\n

H1-2

\n

TEXT

', heading_style=ATX) == '# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n' def test_head(): @@ -218,7 +218,7 @@ def test_head(): def test_hr(): assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('Hello
World') == 'Hello\n\n---\n\nWorld' - assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n---\n\nWorld\n\n' + assert md('

Hello

\n
\n

World

') == 'Hello\n\n---\n\nWorld\n\n' def test_i(): @@ -235,48 +235,49 @@ def test_kbd(): def test_p(): - assert md('

hello

') == '\n\nhello\n\n' - assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' - assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' - assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' - assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=None) == '\n\n123456789 123456789\n\n' - assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' - assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' - assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' - assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' + assert md('

hello

') == 'hello\n\n' + assert md("

hello

") == "hello\n\n" + assert md('

123456789 123456789

') == '123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

') == '123456789\n123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=None) == '123456789 123456789\n\n' + assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '123456789\n123456789\n\n' + assert md('

Some long link

', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '12345 \n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '12345 \n67890\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '12345678901 \n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '12345678901 \n12345\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '1234 5678\n9012\\\n67890\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '1234 5678\n9012 \n67890\n\n' assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' def test_pre(): - assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' - assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' - assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' - assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' - assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' - assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbaz
') == '\n\n```\nfoo\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
', sup_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' - assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
test\n    foo\nbar
') == '```\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
') == '```\ntest\n foo\nbar\n```\n\n' + assert md('
*this_should_not_escape*
') == '```\n*this_should_not_escape*\n```\n\n' + assert md('
*this_should_not_escape*
') == '```\n*this_should_not_escape*\n```\n\n' + assert md('
\t\tthis  should\t\tnot  normalize
') == '```\n\t\tthis should\t\tnot normalize\n```\n\n' + assert md('
\t\tthis  should\t\tnot  normalize
') == '```\n\t\tthis should\t\tnot normalize\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbaz
') == '```\nfoo\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
', sup_symbol='^') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
', sub_symbol='^') == '```\nfoo\nbar\nbaz\n```\n\n' + assert md('
foo\nbar\nbaz
', sub_symbol='^') == '```\nfoo\nbar\nbaz\n```\n\n' assert md('foo
bar
baz', sub_symbol='^') == 'foo\n\n```\nbar\n```\n\nbaz' - assert md("

foo

\n
bar
\n

baz

", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz" + assert md("

foo

\n
bar
\n

baz

", sub_symbol="^") == 'foo\n\n```\nbar\n```\n\nbaz' def test_script(): @@ -319,24 +320,24 @@ def test_sup(): def test_lang(): - assert md('
test\n    foo\nbar
', code_language='python') == '\n\n```python\ntest\n foo\nbar\n```\n\n' - assert md('
test\n    foo\nbar
', code_language='javascript') == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
', code_language='python') == '```python\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
', code_language='javascript') == '```javascript\ntest\n foo\nbar\n```\n\n' def test_lang_callback(): def callback(el): return el['class'][0] if el.has_attr('class') else None - assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```python\ntest\n foo\nbar\n```\n\n' - assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' - assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
', code_language_callback=callback) == '```python\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
', code_language_callback=callback) == '```javascript\ntest\n foo\nbar\n```\n\n' + assert md('
test\n    foo\nbar
', code_language_callback=callback) == '```javascript\ntest\n foo\nbar\n```\n\n' def test_spaces(): - assert md('

a b

c d

') == '\n\na b\n\nc d\n\n' - assert md('

a

') == '\n\n*a*\n\n' + assert md('

a b

c d

') == 'a b\n\nc d\n\n' + assert md('

a

') == '*a*\n\n' assert md('test

again

') == 'test\n\nagain\n\n' assert md('test
text
after') == 'test\n> text\n\nafter' - assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' - assert md('