Skip to content

propagate parent tag context downward to improve runtime #191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ If you have a special use case that calls for a special conversion, you can
always inherit from ``MarkdownConverter`` and override the method you want to
change.
The function that handles an HTML tag named ``abc`` is called
``convert_abc(self, el, text, convert_as_inline)`` and returns a string
``convert_abc(self, el, text, parent_tags)`` and returns a string
containing the converted HTML tag.
The ``MarkdownConverter`` object will handle the conversion based on the
function names:
Expand All @@ -193,8 +193,8 @@ function names:
"""
Create a custom MarkdownConverter that adds two newlines after an image
"""
def convert_img(self, el, text, convert_as_inline):
return super().convert_img(el, text, convert_as_inline) + '\n\n'
def convert_img(self, el, text, parent_tags):
return super().convert_img(el, text, parent_tags) + '\n\n'

# Create shorthand method for conversion
def md(html, **options):
Expand All @@ -208,7 +208,7 @@ function names:
"""
Create a custom MarkdownConverter that ignores paragraphs
"""
def convert_p(self, el, text, convert_as_inline):
def convert_p(self, el, text, parent_tags):
return ''

# Create shorthand method for conversion
Expand Down
143 changes: 77 additions & 66 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
the text if it looks like an HTML tag. markup_fn is necessary to allow for
references to self.strong_em_symbol etc.
"""
def implementation(self, el, text, convert_as_inline):
def implementation(self, el, text, parent_tags):
markup_prefix = markup_fn(self)
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
markup_suffix = '</' + markup_prefix[1:]
else:
markup_suffix = markup_prefix
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
if '_noformat' in parent_tags:
return text
prefix, suffix, text = chomp(text)
if not text:
Expand Down Expand Up @@ -170,24 +170,18 @@ def convert(self, html):
return self.convert_soup(soup)

def convert_soup(self, soup):
return self.process_tag(soup, convert_as_inline=False)
return self.process_tag(soup, parent_tags=set())

def process_element(self, node, convert_as_inline):
def process_element(self, node, parent_tags=None):
if isinstance(node, NavigableString):
return self.process_text(node)
return self.process_text(node, parent_tags=parent_tags)
else:
return self.process_tag(node, convert_as_inline)
return self.process_tag(node, parent_tags=parent_tags)

def process_tag(self, node, convert_as_inline):
text = ''

# For Markdown headings and table cells, convert children as inline
# (so that block element children do not produce newlines).
convert_children_as_inline = (
convert_as_inline # propagated from parent
or html_heading_re.match(node.name) is not None # headings
or node.name in ['td', 'th'] # table cells
)
def process_tag(self, node, parent_tags=None):
# For the top-level element, initialize the parent context with an empty set.
if parent_tags is None:
parent_tags = set()

# Collect child elements to process, ignoring whitespace-only text elements
# adjacent to the inner/outer boundaries of block elements.
Expand Down Expand Up @@ -220,8 +214,27 @@ def _can_ignore(el):

children_to_convert = [el for el in node.children if not _can_ignore(el)]

# Create a copy of this tag's parent context, then update it to include this tag
# to propagate down into the children.
parent_tags_for_children = set(parent_tags)
parent_tags_for_children.add(node.name)

# if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
if (
html_heading_re.match(node.name) is not None # headings
or node.name in {'td', 'th'} # table cells
):
parent_tags_for_children.add('_inline')

# if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
if node.name in {'pre', 'code', 'kbd', 'samp'}:
parent_tags_for_children.add('_noformat')

# Convert the children elements into a list of result strings.
child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert]
child_strings = [
self.process_element(el, parent_tags=parent_tags_for_children)
for el in children_to_convert
]

# Remove empty string values.
child_strings = [s for s in child_strings if s]
Expand Down Expand Up @@ -256,11 +269,11 @@ def _can_ignore(el):
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
convert_fn = getattr(self, convert_fn_name, None)
if convert_fn and self.should_convert_tag(node.name):
text = convert_fn(node, text, convert_as_inline)
text = convert_fn(node, text, parent_tags=parent_tags)

return text

def convert__document_(self, el, text, convert_as_inline):
def convert__document_(self, el, text, parent_tags):
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
if self.options['strip_document'] == LSTRIP:
text = text.lstrip('\n') # remove leading separation newlines
Expand All @@ -275,19 +288,23 @@ def convert__document_(self, el, text, convert_as_inline):

return text

def process_text(self, el):
def process_text(self, el, parent_tags=None):
# For the top-level element, initialize the parent context with an empty set.
if parent_tags is None:
parent_tags = set()

text = six.text_type(el) or ''

# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
if 'pre' not in parent_tags:
if self.options['wrap']:
text = all_whitespace_re.sub(' ', text)
else:
text = newline_whitespace_re.sub('\n', text)
text = whitespace_re.sub(' ', text)

# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
if '_noformat' not in parent_tags:
text = self.escape(text)

# remove leading whitespace at the start or just after a
Expand All @@ -310,8 +327,8 @@ def __getattr__(self, attr):
if m:
n = int(m.group(1))

def convert_tag(el, text, convert_as_inline):
return self._convert_hn(n, el, text, convert_as_inline)
def convert_tag(el, text, parent_tags):
return self._convert_hn(n, el, text, parent_tags)

convert_tag.__name__ = 'convert_h%s' % n
setattr(self, convert_tag.__name__, convert_tag)
Expand Down Expand Up @@ -358,8 +375,8 @@ def underline(self, text, pad_char):
text = (text or '').rstrip()
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

def convert_a(self, el, text, convert_as_inline):
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
def convert_a(self, el, text, parent_tags):
if '_noformat' in parent_tags:
return text
prefix, suffix, text = chomp(text)
if not text:
Expand All @@ -380,10 +397,10 @@ def convert_a(self, el, text, convert_as_inline):

convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

def convert_blockquote(self, el, text, convert_as_inline):
def convert_blockquote(self, el, text, parent_tags):
# handle some early-exit scenarios
text = (text or '').strip()
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return "\n"
Expand All @@ -396,25 +413,25 @@ def _indent_for_blockquote(match):

return '\n' + text + '\n\n'

def convert_br(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_br(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ""

if self.options['newline_style'].lower() == BACKSLASH:
return '\\\n'
else:
return ' \n'

def convert_code(self, el, text, convert_as_inline):
if el.parent.name == 'pre':
def convert_code(self, el, text, parent_tags):
if 'pre' in parent_tags:
return text
converter = abstract_inline_conversion(lambda self: '`')
return converter(self, el, text, convert_as_inline)
return converter(self, el, text, parent_tags)

convert_del = abstract_inline_conversion(lambda self: '~~')

def convert_div(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_div(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ' ' + text.strip() + ' '
text = text.strip()
return '\n\n%s\n\n' % text if text else ''
Expand All @@ -427,9 +444,9 @@ def convert_div(self, el, text, convert_as_inline):

convert_kbd = convert_code

def convert_dd(self, el, text, convert_as_inline):
def convert_dd(self, el, text, parent_tags):
text = (text or '').strip()
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return '\n'
Expand All @@ -445,11 +462,11 @@ def _indent_for_dd(match):

return '%s\n' % text

def convert_dt(self, el, text, convert_as_inline):
def convert_dt(self, el, text, parent_tags):
# remove newlines from term text
text = (text or '').strip()
text = all_whitespace_re.sub(' ', text)
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return '\n'
Expand All @@ -459,9 +476,9 @@ def convert_dt(self, el, text, convert_as_inline):

return '\n%s\n' % text

def _convert_hn(self, n, el, text, convert_as_inline):
def _convert_hn(self, n, el, text, parent_tags):
""" Method name prefixed with _ to prevent <hn> to call this """
if convert_as_inline:
if '_inline' in parent_tags:
return text

# prevent MemoryErrors in case of very large n
Expand All @@ -478,46 +495,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
return '\n\n%s %s\n\n' % (hashes, text)

def convert_hr(self, el, text, convert_as_inline):
def convert_hr(self, el, text, parent_tags):
return '\n\n---\n\n'

convert_i = convert_em

def convert_img(self, el, text, convert_as_inline):
def convert_img(self, el, text, parent_tags):
alt = el.attrs.get('alt', None) or ''
src = el.attrs.get('src', None) or ''
title = el.attrs.get('title', None) or ''
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
if (convert_as_inline
if ('_inline' in parent_tags
and el.parent.name not in self.options['keep_inline_images_in']):
return alt

return '![%s](%s%s)' % (alt, src, title_part)

def convert_list(self, el, text, convert_as_inline):
def convert_list(self, el, text, parent_tags):

# Converting a list to inline is undefined.
# Ignoring convert_to_inline for list.
# Ignoring inline conversion parents for list.

nested = False
before_paragraph = False
next_sibling = _next_block_content_sibling(el)
if next_sibling and next_sibling.name not in ['ul', 'ol']:
before_paragraph = True
while el:
if el.name == 'li':
nested = True
break
el = el.parent
if nested:
# remove trailing newline if nested
if 'li' in parent_tags:
# remove trailing newline if we're in a nested list
return '\n' + text.rstrip()
return '\n\n' + text + ('\n' if before_paragraph else '')

convert_ul = convert_list
convert_ol = convert_list

def convert_li(self, el, text, convert_as_inline):
def convert_li(self, el, text, parent_tags):
# handle some early-exit scenarios
text = (text or '').strip()
if not text:
Expand Down Expand Up @@ -554,8 +565,8 @@ def _indent_for_li(match):

return '%s\n' % text

def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_p(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ' ' + text.strip() + ' '
text = text.strip()
if self.options['wrap']:
Expand All @@ -577,7 +588,7 @@ def convert_p(self, el, text, convert_as_inline):
text = '\n'.join(new_lines)
return '\n\n%s\n\n' % text if text else ''

def convert_pre(self, el, text, convert_as_inline):
def convert_pre(self, el, text, parent_tags):
if not text:
return ''
code_language = self.options['code_language']
Expand All @@ -587,10 +598,10 @@ def convert_pre(self, el, text, convert_as_inline):

return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

def convert_script(self, el, text, convert_as_inline):
def convert_script(self, el, text, parent_tags):
return ''

def convert_style(self, el, text, convert_as_inline):
def convert_style(self, el, text, parent_tags):
return ''

convert_s = convert_del
Expand All @@ -603,28 +614,28 @@ def convert_style(self, el, text, convert_as_inline):

convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

def convert_table(self, el, text, convert_as_inline):
def convert_table(self, el, text, parent_tags):
return '\n\n' + text.strip() + '\n\n'

def convert_caption(self, el, text, convert_as_inline):
def convert_caption(self, el, text, parent_tags):
return text.strip() + '\n\n'

def convert_figcaption(self, el, text, convert_as_inline):
def convert_figcaption(self, el, text, parent_tags):
return '\n\n' + text.strip() + '\n\n'

def convert_td(self, el, text, convert_as_inline):
def convert_td(self, el, text, parent_tags):
colspan = 1
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

def convert_th(self, el, text, convert_as_inline):
def convert_th(self, el, text, parent_tags):
colspan = 1
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

def convert_tr(self, el, text, convert_as_inline):
def convert_tr(self, el, text, parent_tags):
cells = el.find_all(['td', 'th'])
is_first_row = el.find_previous_sibling() is None
is_headrow = (
Expand Down
6 changes: 3 additions & 3 deletions tests/test_custom_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
"""
Create a custom MarkdownConverter for unit tests
"""
def convert_img(self, el, text, convert_as_inline):
def convert_img(self, el, text, parent_tags):
"""Add two newlines after an image"""
return super().convert_img(el, text, convert_as_inline) + '\n\n'
return super().convert_img(el, text, parent_tags) + '\n\n'

def convert_custom_tag(self, el, text, convert_as_inline):
def convert_custom_tag(self, el, text, parent_tags):
"""Ensure conversion function is found for tags with special characters in name"""
return "FUNCTION USED: %s" % text

Expand Down