diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 93a889f..9e4c99f 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,4 +1,4 @@ -from bs4 import BeautifulSoup, NavigableString, Comment, Doctype +from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag from textwrap import fill import re import six @@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el): if html_heading_re.match(el.name) is not None: return True return el.name in ('p', 'blockquote', + 'article', 'div', 'section', 'ol', 'ul', 'li', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th') @@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el): return should_remove_whitespace_inside(el) or (el and el.name == 'pre') +def _is_block_content_element(el): + """ + In a block context, returns: + + - True for content elements (tags and non-whitespace text) + - False for non-content elements (whitespace text, comments, doctypes) + """ + if isinstance(el, Tag): + return True + elif isinstance(el, (Comment, Doctype)): + return False # (subclasses of NavigableString, must test first) + elif isinstance(el, NavigableString): + return el.strip() != '' + else: + return False + + +def _prev_block_content_sibling(el): + """Returns the first previous sibling that is a content element, else None.""" + while el is not None: + el = el.previous_sibling + if _is_block_content_element(el): + return el + return None + + +def _next_block_content_sibling(el): + """Returns the first next sibling that is a content element, else None.""" + while el is not None: + el = el.next_sibling + if _is_block_content_element(el): + return el + return None + + class MarkdownConverter(object): class DefaultOptions: autolinks = True @@ -143,29 +179,38 @@ def process_tag(self, node, convert_as_inline): or node.name in ['td', 'th'] # table cells ) - # Remove whitespace-only textnodes just before, after or - # inside block-level elements. + # Collect child elements to process, ignoring whitespace-only text elements + # adjacent to the inner/outer boundaries of block elements. should_remove_inside = should_remove_whitespace_inside(node) - for el in node.children: - # Only extract (remove) whitespace-only text node if any of the - # conditions is true: - # - el is the first element in its parent (block-level) - # - el is the last element in its parent (block-level) - # - el is adjacent to a block-level node - can_extract = (should_remove_inside and (not el.previous_sibling - or not el.next_sibling) - or should_remove_whitespace_outside(el.previous_sibling) - or should_remove_whitespace_outside(el.next_sibling)) - if (isinstance(el, NavigableString) - and six.text_type(el).strip() == '' - and can_extract): - el.extract() - # Convert the children first - for el in node.children: - if isinstance(el, Comment) or isinstance(el, Doctype): - continue + def _can_ignore(el): + if isinstance(el, Tag): + # Tags are always processed. + return False + elif isinstance(el, (Comment, Doctype)): + # Comment and Doctype elements are always ignored. + # (subclasses of NavigableString, must test first) + return True elif isinstance(el, NavigableString): + if six.text_type(el).strip() != '': + # Non-whitespace text nodes are always processed. + return False + elif should_remove_inside and (not el.previous_sibling or not el.next_sibling): + # Inside block elements (excluding
), ignore adjacent whitespace elements. + return True + elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling): + # Outside block elements (including), ignore adjacent whitespace elements. + return True + else: + return False + else: + raise ValueError('Unexpected element type: %s' % type(el)) + + children_to_convert = [child for child in node.children if not _can_ignore(child)] + + # Convert the children first + for el in children_to_convert: + if isinstance(el, NavigableString): text += self.process_text(el) else: text_strip = text.rstrip('\n') @@ -337,6 +382,16 @@ def convert_code(self, el, text, convert_as_inline): convert_del = abstract_inline_conversion(lambda self: '~~') + def convert_div(self, el, text, convert_as_inline): + if convert_as_inline: + return ' ' + text.strip() + ' ' + text = text.strip() + return '\n\n%s\n\n' % text if text else '' + + convert_article = convert_div + + convert_section = convert_div + convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol']) convert_kbd = convert_code @@ -415,7 +470,8 @@ def convert_list(self, el, text, convert_as_inline): nested = False before_paragraph = False - if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']: + next_sibling = _next_block_content_sibling(el) + if next_sibling and next_sibling.name not in ['ul', 'ol']: before_paragraph = True while el: if el.name == 'li': @@ -539,6 +595,7 @@ def convert_th(self, el, text, convert_as_inline): def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) + is_first_row = el.find_previous_sibling() is None is_headrow = ( all([cell.name == 'th' for cell in cells]) or (el.parent.name == 'thead' @@ -546,15 +603,15 @@ def convert_tr(self, el, text, convert_as_inline): and len(el.parent.find_all('tr')) == 1) ) is_head_row_missing = ( - (not el.previous_sibling and not el.parent.name == 'tbody') - or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) + (is_first_row and not el.parent.name == 'tbody') + or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) ) overline = '' underline = '' if ((is_headrow or (is_head_row_missing and self.options['table_infer_header'])) - and not el.previous_sibling): + and is_first_row): # first row and: # - is headline or # - headline is missing and header inference is enabled @@ -568,10 +625,10 @@ def convert_tr(self, el, text, convert_as_inline): underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' elif ((is_head_row_missing and not self.options['table_infer_header']) - or (not el.previous_sibling + or (is_first_row and (el.parent.name == 'table' or (el.parent.name == 'tbody' - and not el.parent.previous_sibling)))): + and not el.parent.find_previous_sibling())))): # headline is missing and header inference is disabled or: # first row, not headline, and: # - the parent is table or diff --git a/tests/test_basic.py b/tests/test_basic.py index 584adb9..9be524e 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -6,7 +6,7 @@ def test_single_tag(): def test_soup(): - assert md('Hello') == 'Hello' + assert md('Hello') == '\n\nHello\n\n' def test_whitespace(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 1367006..1739cb9 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -114,8 +114,19 @@ def test_del(): inline_tests('del', '~~') -def test_div(): - assert md('Hello World') == 'Hello World' +def test_div_section_article(): + for tag in ['div', 'section', 'article']: + assert md(f'<{tag}>456{tag}>') == '\n\n456\n\n' + assert md(f'123<{tag}>456{tag}>789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n 456 \n{tag}>789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>456
{tag}>789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n456
\n{tag}>789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>4 5 6{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>\n4 5 6\n{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>4\n5\n6{tag}>789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n4\n5\n6\n{tag}>789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n\n4\n5\n6\n
\n{tag}>789') == '123\n\n4\n5\n6\n\n789' + assert md(f'<{tag}>title
body{{tag}}>', heading_style=ATX) == '\n\n# title\n\nbody\n\n' def test_em():