Skip to content

Fix unnecessary \n newlines resulting in a lot of blank space in the Markdown output #131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 9 additions & 62 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
all_whitespace_re = re.compile(r'[\s]+')
html_heading_re = re.compile(r'h[1-6]')


# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
Expand All @@ -27,25 +26,13 @@


def chomp(text):
    """
    Split leading/trailing spaces off the text of an inline tag.

    If the text in an inline tag like b, a, or em contains a leading or
    trailing space, strip the string and return a space as prefix or suffix,
    if needed. This function is used to prevent conversions like
        <b> foo</b> => ** foo**

    Only a literal space character produces a prefix/suffix; other
    surrounding whitespace is removed by ``strip()`` without being
    reported back.

    :param text: the already converted text of the inline element.
    :returns: a ``(prefix, suffix, text)`` tuple where ``prefix`` and
        ``suffix`` are each ``' '`` or ``''`` and ``text`` is the stripped
        input.
    """
    prefix = ' ' if text.startswith(' ') else ''
    suffix = ' ' if text.endswith(' ') else ''
    return (prefix, suffix, text.strip())


def abstract_inline_conversion(markup_fn):
"""
This abstracts all simple inline tags like b, em, del, ...
Returns a function that wraps the chomped text in a pair of the string
that is returned by markup_fn. markup_fn is necessary to allow for
references to self.strong_em_symbol etc.
"""
def implementation(self, el, text, convert_as_inline):
markup = markup_fn(self)
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
Expand Down Expand Up @@ -86,14 +73,11 @@ class Options(DefaultOptions):
pass

def __init__(self, **options):
# Create an options dictionary. Use DefaultOptions as a base so that
# it doesn't have to be extended.
self.options = _todict(self.DefaultOptions)
self.options.update(_todict(self.Options))
self.options.update(options)
if self.options['strip'] is not None and self.options['convert'] is not None:
raise ValueError('You may specify either tags to strip or tags to'
' convert, but not both.')
raise ValueError('You may specify either tags to strip or tags to convert, but not both.')

def convert(self, html):
soup = BeautifulSoup(html, 'html.parser')
Expand All @@ -105,28 +89,20 @@ def convert_soup(self, soup):
def process_tag(self, node, convert_as_inline, children_only=False):
text = ''

# markdown headings or cells can't include
# block elements (elements w/newlines)
isHeading = html_heading_re.match(node.name) is not None
isCell = node.name in ['td', 'th']
convert_children_as_inline = convert_as_inline

if not children_only and (isHeading or isCell):
convert_children_as_inline = True

# Remove whitespace-only textnodes in purely nested nodes
def is_nested_node(el):
return el and el.name in ['ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th']

if is_nested_node(node):
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent
# - el is the last element in its parent
# - el is adjacent to a nested node
can_extract = (not el.previous_sibling
or not el.next_sibling
or is_nested_node(el.previous_sibling)
Expand All @@ -136,14 +112,13 @@ def is_nested_node(el):
and can_extract):
el.extract()

# Convert the children first
for el in node.children:
if isinstance(el, Comment) or isinstance(el, Doctype):
continue
elif isinstance(el, NavigableString):
text += self.process_text(el)
text += self.process_text(el).strip()
else:
text += self.process_tag(el, convert_children_as_inline)
text += self.process_tag(el, convert_children_as_inline).strip()

if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
Expand All @@ -154,27 +129,17 @@ def is_nested_node(el):

def process_text(self, el):
text = six.text_type(el) or ''

# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
text = whitespace_re.sub(' ', text)

# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
text = self.escape(text)

# remove trailing whitespace if any of the following conditions is true:
# - current text node is the last node in li
# - current text node is followed by an embedded list
if (el.parent.name == 'li'
and (not el.next_sibling
or el.next_sibling.name in ['ul', 'ol'])):
text = text.rstrip()

return text
return text.strip()

def __getattr__(self, attr):
# Handle headings
m = convert_heading_re.match(attr)
if m:
n = int(m.group(1))
Expand Down Expand Up @@ -216,20 +181,18 @@ def indent(self, text, level):

def underline(self, text, pad_char):
text = (text or '').rstrip()
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
return '%s\n%s\n' % (text, pad_char * len(text)) if text else ''

def convert_a(self, el, text, convert_as_inline):
prefix, suffix, text = chomp(text)
if not text:
return ''
href = el.get('href')
title = el.get('title')
# For the replacement see #29: text nodes underscores are escaped
if (self.options['autolinks']
and text.replace(r'\_', '_') == href
and not title
and not self.options['default_title']):
# Shortcut syntax
return '<%s>' % href
if self.options['default_title'] and not title:
title = href
Expand All @@ -239,16 +202,13 @@ def convert_a(self, el, text, convert_as_inline):
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

def convert_blockquote(self, el, text, convert_as_inline):

if convert_as_inline:
return text

return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
return (line_beginning_re.sub('> ', text.strip()) + '\n') if text else ''

def convert_br(self, el, text, convert_as_inline):
if convert_as_inline:
return ""

if self.options['newline_style'].lower() == BACKSLASH:
return '\\\n'
else:
Expand All @@ -269,16 +229,15 @@ def convert_code(self, el, text, convert_as_inline):
def convert_hn(self, n, el, text, convert_as_inline):
if convert_as_inline:
return text

style = self.options['heading_style'].lower()
text = text.strip()
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
return self.underline(text, line)
hashes = '#' * n
if style == ATX_CLOSED:
return '%s %s %s\n\n' % (hashes, text, hashes)
return '%s %s\n\n' % (hashes, text)
return '%s %s %s\n' % (hashes, text, hashes)
return '%s %s\n' % (hashes, text)

def convert_hr(self, el, text, convert_as_inline):
    """
    Convert an ``hr`` element to a Markdown horizontal rule.

    The result is constant: ``el``, ``text`` and ``convert_as_inline`` are
    accepted only to match the common ``convert_*`` signature and are not
    consulted.

    :returns: ``---`` surrounded by a blank line on each side.
    """
    return '\n\n---\n\n'
Expand All @@ -297,10 +256,6 @@ def convert_img(self, el, text, convert_as_inline):
return '![%s](%s%s)' % (alt, src, title_part)

def convert_list(self, el, text, convert_as_inline):

# Converting a list to inline is undefined.
# Ignoring convert_to_inline for list.

nested = False
before_paragraph = False
if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
Expand All @@ -311,7 +266,6 @@ def convert_list(self, el, text, convert_as_inline):
break
el = el.parent
if nested:
# remove trailing newline if nested
return '\n' + self.indent(text, 1).rstrip()
return text + ('\n' if before_paragraph else '')

Expand Down Expand Up @@ -344,16 +298,14 @@ def convert_p(self, el, text, convert_as_inline):
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
return '%s\n\n' % text if text else ''
return '%s\n' % text if text else ''

def convert_pre(self, el, text, convert_as_inline):
    """
    Convert a ``pre`` element to a fenced code block.

    :param el: the element being converted; passed to the
        ``code_language_callback`` option when one is configured.
    :param text: the already converted content of the element.
    :param convert_as_inline: unused here; kept for the common
        ``convert_*`` signature.
    :returns: ``''`` when ``text`` is empty, otherwise ``text`` wrapped in
        a triple-backtick fence annotated with the code language.
    """
    if not text:
        return ''

    code_language = self.options['code_language']
    language_callback = self.options['code_language_callback']
    if language_callback:
        # A falsy callback result falls back to the configured default.
        code_language = language_callback(el) or code_language

    return '\n```%s\n%s\n```\n' % (code_language, text)

def convert_script(self, el, text, convert_as_inline):
Expand Down Expand Up @@ -403,7 +355,6 @@ def convert_tr(self, el, text, convert_as_inline):
overline = ''
underline = ''
if is_headrow and not el.previous_sibling:
# first row and is headline: print headline underline
full_colspan = 0
for cell in cells:
if "colspan" in cell.attrs:
Expand All @@ -415,10 +366,6 @@ def convert_tr(self, el, text, convert_as_inline):
and (el.parent.name == 'table'
or (el.parent.name == 'tbody'
and not el.parent.previous_sibling))):
# first row, not headline, and:
# - the parent is table or
# - the parent is tbody at the beginning of a table.
# print empty headline above this row
overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
return overline + '|' + text + '\n' + underline
Expand Down