From a168c22044c6c44b32276bfc7bcc5ff61d8d938b Mon Sep 17 00:00:00 2001
From: Archit Singh <131599737+archit15singh@users.noreply.github.com>
Date: Wed, 12 Jun 2024 14:16:32 +0530
Subject: [PATCH] fix unnecessary \n newlines resulting in a lot of whitespace
 in the markdown

https://github.com/matthewwithanm/python-markdownify/issues/130
---
 markdownify/__init__.py | 71 ++++++-----------------------------------
 1 file changed, 9 insertions(+), 62 deletions(-)
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index eaa6ded..24f9108 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -10,7 +10,6 @@
 all_whitespace_re = re.compile(r'[\s]+')
 html_heading_re = re.compile(r'h[1-6]')
 
-
 # Heading styles
 ATX = 'atx'
 ATX_CLOSED = 'atx_closed'
@@ -27,12 +26,6 @@
 
 
 def chomp(text):
-    """
-    If the text in an inline tag like b, a, or em contains a leading or trailing
-    space, strip the string and return a space as suffix of prefix, if needed.
-    This function is used to prevent conversions like
-        <b> foo</b> => ** foo**
-    """
     prefix = ' ' if text and text[0] == ' ' else ''
     suffix = ' ' if text and text[-1] == ' ' else ''
     text = text.strip()
@@ -40,12 +33,6 @@ def chomp(text):
 
 
 def abstract_inline_conversion(markup_fn):
-    """
-    This abstracts all simple inline tags like b, em, del, ...
-    Returns a function that wraps the chomped text in a pair of the string
-    that is returned by markup_fn. markup_fn is necessary to allow for
-    references to self.strong_em_symbol etc.
-    """
     def implementation(self, el, text, convert_as_inline):
         markup = markup_fn(self)
         if el.find_parent(['pre', 'code', 'kbd', 'samp']):
@@ -86,14 +73,11 @@ class Options(DefaultOptions):
         pass
 
     def __init__(self, **options):
-        # Create an options dictionary. Use DefaultOptions as a base so that
-        # it doesn't have to be extended.
         self.options = _todict(self.DefaultOptions)
         self.options.update(_todict(self.Options))
         self.options.update(options)
         if self.options['strip'] is not None and self.options['convert'] is not None:
-            raise ValueError('You may specify either tags to strip or tags to'
-                             ' convert, but not both.')
+            raise ValueError('You may specify either tags to strip or tags to convert, but not both.')
 
     def convert(self, html):
         soup = BeautifulSoup(html, 'html.parser')
@@ -105,8 +89,6 @@ def convert_soup(self, soup):
     def process_tag(self, node, convert_as_inline, children_only=False):
         text = ''
 
-        # markdown headings or cells can't include
-        # block elements (elements w/newlines)
         isHeading = html_heading_re.match(node.name) is not None
         isCell = node.name in ['td', 'th']
         convert_children_as_inline = convert_as_inline
@@ -114,7 +96,6 @@ def process_tag(self, node, convert_as_inline, children_only=False):
         if not children_only and (isHeading or isCell):
             convert_children_as_inline = True
 
-        # Remove whitespace-only textnodes in purely nested nodes
         def is_nested_node(el):
             return el and el.name in ['ol', 'ul', 'li',
                                       'table', 'thead', 'tbody', 'tfoot',
@@ -122,11 +103,6 @@ def is_nested_node(el):
 
         if is_nested_node(node):
             for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the
-                # conditions is true:
-                # - el is the first element in its parent
-                # - el is the last element in its parent
-                # - el is adjacent to an nested node
                 can_extract = (not el.previous_sibling
                                or not el.next_sibling
                                or is_nested_node(el.previous_sibling)
@@ -136,14 +112,13 @@ def is_nested_node(el):
                         and can_extract):
                     el.extract()
 
-        # Convert the children first
         for el in node.children:
             if isinstance(el, Comment) or isinstance(el, Doctype):
                 continue
             elif isinstance(el, NavigableString):
-                text += self.process_text(el)
+                text += self.process_text(el).strip()
             else:
-                text += self.process_tag(el, convert_children_as_inline)
+                text += self.process_tag(el, convert_children_as_inline).strip()
 
         if not children_only:
             convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -154,27 +129,17 @@ def is_nested_node(el):
 
     def process_text(self, el):
         text = six.text_type(el) or ''
-
-        # normalize whitespace if we're not inside a preformatted element
         if not el.find_parent('pre'):
             text = whitespace_re.sub(' ', text)
-
-        # escape special characters if we're not inside a preformatted or code element
         if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
             text = self.escape(text)
-
-        # remove trailing whitespaces if any of the following condition is true:
-        # - current text node is the last node in li
-        # - current text node is followed by an embedded list
         if (el.parent.name == 'li'
                 and (not el.next_sibling
                      or el.next_sibling.name in ['ul', 'ol'])):
             text = text.rstrip()
-
-        return text
+        return text.strip()
 
     def __getattr__(self, attr):
-        # Handle headings
         m = convert_heading_re.match(attr)
         if m:
             n = int(m.group(1))
@@ -216,7 +181,7 @@ def indent(self, text, level):
 
     def underline(self, text, pad_char):
         text = (text or '').rstrip()
-        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+        return '%s\n%s\n' % (text, pad_char * len(text)) if text else ''
 
     def convert_a(self, el, text, convert_as_inline):
         prefix, suffix, text = chomp(text)
@@ -224,12 +189,10 @@ def convert_a(self, el, text, convert_as_inline):
             return ''
         href = el.get('href')
         title = el.get('title')
-        # For the replacement see #29: text nodes underscores are escaped
         if (self.options['autolinks']
                 and text.replace(r'\_', '_') == href
                 and not title
                 and not self.options['default_title']):
-            # Shortcut syntax
             return '<%s>' % href
         if self.options['default_title'] and not title:
             title = href
@@ -239,16 +202,13 @@ def convert_a(self, el, text, convert_as_inline):
     convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
 
     def convert_blockquote(self, el, text, convert_as_inline):
-
         if convert_as_inline:
             return text
-
-        return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
+        return (line_beginning_re.sub('> ', text.strip()) + '\n') if text else ''
 
     def convert_br(self, el, text, convert_as_inline):
         if convert_as_inline:
             return ""
-
         if self.options['newline_style'].lower() == BACKSLASH:
             return '\\\n'
         else:
@@ -269,7 +229,6 @@ def convert_code(self, el, text, convert_as_inline):
     def convert_hn(self, n, el, text, convert_as_inline):
         if convert_as_inline:
             return text
-
         style = self.options['heading_style'].lower()
         text = text.strip()
         if style == UNDERLINED and n <= 2:
@@ -277,8 +236,8 @@ def convert_hn(self, n, el, text, convert_as_inline):
             return self.underline(text, line)
         hashes = '#' * n
         if style == ATX_CLOSED:
-            return '%s %s %s\n\n' % (hashes, text, hashes)
-        return '%s %s\n\n' % (hashes, text)
+            return '%s %s %s\n' % (hashes, text, hashes)
+        return '%s %s\n' % (hashes, text)
 
     def convert_hr(self, el, text, convert_as_inline):
         return '\n\n---\n\n'
@@ -297,10 +256,6 @@ def convert_img(self, el, text, convert_as_inline):
         return '![%s](%s%s)' % (alt, src, title_part)
 
     def convert_list(self, el, text, convert_as_inline):
-
-        # Converting a list to inline is undefined.
-        # Ignoring convert_to_inline for list.
-
         nested = False
         before_paragraph = False
         if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
@@ -311,7 +266,6 @@ def convert_list(self, el, text, convert_as_inline):
                 break
             el = el.parent
         if nested:
-            # remove trailing newline if nested
             return '\n' + self.indent(text, 1).rstrip()
         return text + ('\n' if before_paragraph else '')
 
@@ -344,16 +298,14 @@ def convert_p(self, el, text, convert_as_inline):
                         width=self.options['wrap_width'],
                         break_long_words=False,
                         break_on_hyphens=False)
-        return '%s\n\n' % text if text else ''
+        return '%s\n' % text if text else ''
 
     def convert_pre(self, el, text, convert_as_inline):
         if not text:
             return ''
         code_language = self.options['code_language']
-
         if self.options['code_language_callback']:
             code_language = self.options['code_language_callback'](el) or code_language
-
         return '\n```%s\n%s\n```\n' % (code_language, text)
 
     def convert_script(self, el, text, convert_as_inline):
@@ -403,7 +355,6 @@ def convert_tr(self, el, text, convert_as_inline):
         overline = ''
         underline = ''
         if is_headrow and not el.previous_sibling:
-            # first row and is headline: print headline underline
             full_colspan = 0
             for cell in cells:
                 if "colspan" in cell.attrs:
@@ -415,10 +366,6 @@ def convert_tr(self, el, text, convert_as_inline):
               and (el.parent.name == 'table'
                    or (el.parent.name == 'tbody'
                        and not el.parent.previous_sibling))):
-            # first row, not headline, and:
-            # - the parent is table or
-            # - the parent is tbody at the beginning of a table.
-            # print empty headline above this row
             overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
             overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
         return overline + '|' + text + '\n' + underline