diff --git a/README.rst b/README.rst index af87f51..34ed7e0 100644 --- a/README.rst +++ b/README.rst @@ -139,6 +139,11 @@ keep_inline_images_in that should be allowed to contain inline images, for example ``['td']``. Defaults to an empty list. +table_infer_header + Controls handling of tables with no header row (as indicated by ``<thead>`` + or ``<th>``). When set to ``True``, the first body row is used as the header row. + Defaults to ``False``, which leaves the header row empty. + wrap, wrap_width If ``wrap`` is set to ``True``, all text paragraphs are wrapped at ``wrap_width`` characters. Defaults to ``False`` and ``80``. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index ac53077..2360210 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -102,6 +102,7 @@ class DefaultOptions: strong_em_symbol = ASTERISK sub_symbol = '' sup_symbol = '' + table_infer_header = False wrap = False wrap_width = 80 @@ -518,13 +519,24 @@ def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) is_headrow = ( all([cell.name == 'th' for cell in cells]) - or (not el.previous_sibling and not el.parent.name == 'tbody') + or (el.parent.name == 'thead' + # avoid multiple tr in thead + and len(el.parent.find_all('tr')) == 1) + ) + is_head_row_missing = ( + (not el.previous_sibling and not el.parent.name == 'tbody') or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) ) overline = '' underline = '' - if is_headrow and not el.previous_sibling: - # first row and is headline: print headline underline + if ((is_headrow + or (is_head_row_missing + and self.options['table_infer_header'])) + and not el.previous_sibling): + # first row and: + # - is headline or + # - headline is missing and header inference is enabled + # print headline underline full_colspan = 0 for cell in cells: if 'colspan' in cell.attrs and cell['colspan'].isdigit(): @@ -532,13 +544,16 @@ def convert_tr(self, el, text, 
convert_as_inline): else: full_colspan += 1 underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - elif (not el.previous_sibling - and (el.parent.name == 'table' - or (el.parent.name == 'tbody' - and not el.parent.previous_sibling))): + elif ((is_head_row_missing + and not self.options['table_infer_header']) + or (not el.previous_sibling + and (el.parent.name == 'table' + or (el.parent.name == 'tbody' + and not el.parent.previous_sibling)))): + # headline is missing and header inference is disabled or: # first row, not headline, and: - # - the parent is table or - # - the parent is tbody at the beginning of a table. + # - the parent is table or + # - the parent is tbody at the beginning of a table. # print empty headline above this row overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n' overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' diff --git a/markdownify/main.py b/markdownify/main.py index 4e1c874..432efb5 100644 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -61,6 +61,10 @@ def main(argv=sys.argv[1:]): "should be converted to markdown images instead, this option can " "be set to a list of parent tags that should be allowed to " "contain inline images.") + parser.add_argument('--table-infer-header', dest='table_infer_header', + action='store_true', + help="When a table has no header row (as indicated by '' " + "or ''), use the first body row as the header row.") parser.add_argument('-w', '--wrap', action='store_true', help="Wrap all text paragraphs at --wrap-width characters.") parser.add_argument('--wrap-width', type=int, default=80) diff --git a/tests/test_tables.py b/tests/test_tables.py index dcf9ad7..da4bf53 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -141,6 +141,33 @@ """ +table_head_body_multiple_head = """ + + + + + + + + + + + + + + + + + + + + + + + + +
CreatorEditorServer
OperatorManagerEngineer
BobOliverTom
ThomasLucasEthan
""" + table_missing_text = """ @@ -245,10 +272,28 @@ def test_table(): assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body_multiple_head) == '\n\n| | | |\n| --- | --- | --- |\n| Creator | Editor | Server |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n' assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_undefined_colspan) == '\n\n| Name | Age 
|\n| --- | --- |\n| Jill | Smith |\n\n' + + +def test_table_infer_header(): + assert md(table, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_html_content, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_paragraphs, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_linebreaks, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' + assert md(table_with_header_column, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body_multiple_head, table_infer_header=True) == '\n\n| Creator | Editor | Server |\n| --- | --- | --- |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' + assert md(table_head_body_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_text, table_infer_header=True) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + 
assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'