From 595313a909cc0cdb76ad7f261d41ca529e565993 Mon Sep 17 00:00:00 2001 From: Dan MacTough Date: Wed, 2 Jul 2014 15:22:24 -0400 Subject: [PATCH 1/3] Fix missing space in blocks of text. Generally caused by inline anchor tags, but if we start formatting other inline tags, this issue will affect them, too. --- lib/formatter.js | 2 +- lib/helper.js | 5 +++-- lib/html-to-text.js | 8 +++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/formatter.js b/lib/formatter.js index 657859f..3b61736 100644 --- a/lib/formatter.js +++ b/lib/formatter.js @@ -6,7 +6,7 @@ var helper = require('./helper'); function formatText(elem, options) { var text = _s.strip(elem.raw); text = helper.decodeHTMLEntities(text); - return helper.wordwrap(text, options.wordwrap); + return helper.wordwrap(elem.needsSpace ? ' ' + text : text, options.wordwrap); }; function formatLineBreak(elem, fn, options) { diff --git a/lib/helper.js b/lib/helper.js index b90bd41..f3765dc 100644 --- a/lib/helper.js +++ b/lib/helper.js @@ -50,9 +50,10 @@ exports.decodeHTMLEntities = function decodeHTMLEntities(text) { }; exports.wordwrap = function wordwrap(text, max) { - var result = ''; + // Preserve leading space + var result = _s.startsWith(text, ' ') ? ' ' : ''; var words = _s.words(text); - var length = 0; + var length = result.length; var buffer = []; _.each(words, function(word) { if (length + word.length > max) { diff --git a/lib/html-to-text.js b/lib/html-to-text.js index 28a6281..8aaf202 100644 --- a/lib/html-to-text.js +++ b/lib/html-to-text.js @@ -73,6 +73,7 @@ function containsTable(attr, tables) { function walk(dom, options) { var result = ''; + var whiteSpaceRegex = /\S$/; _.each(dom, function(elem) { switch(elem.type) { case 'tag': @@ -111,7 +112,12 @@ function walk(dom, options) { } break; case 'text': - if (elem.raw !== '\r\n') result += format.text(elem, options); + if (elem.raw !== '\r\n') { + // Text needs a leading space if `result` currently + // doesn't end with whitespace + elem.needsSpace = whiteSpaceRegex.test(result); + result += format.text(elem, options); + } break; default: if (!_.include(SKIP_TYPES, elem.type)) { From f1a744638e050a1a1be6bf903deab1f0b96e7b83 Mon Sep 17 00:00:00 2001 From: Dan MacTough Date: Wed, 2 Jul 2014 15:24:49 -0400 Subject: [PATCH 2/3] Format anchor tags more informatively. --- lib/formatter.js | 20 +++++++++++++++----- lib/html-to-text.js | 3 +++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/lib/formatter.js b/lib/formatter.js index 3b61736..2e5ddc8 100644 --- a/lib/formatter.js +++ b/lib/formatter.js @@ -21,13 +21,23 @@ function formatHeading(elem, fn, options) { return fn(elem.children, options).toUpperCase() + '\n'; } +// If we have both href and anchor text, format it in a useful manner: +// - "anchor text [href]" +// Otherwise if we have only anchor text or an href, we return the part we have: +// - "anchor text" or +// - "href" function formatAnchor(elem, fn, options) { + var href = ''; + // Always get the anchor text + var result = _s.strip(fn(elem.children || [], options)); + // Get the href, if present if (elem.attribs && elem.attribs.href) { - return elem.attribs.href.replace(/^mailto\:/, ''); - } - else { - return helper.wordwrap(helper.decodeHTMLEntities(_s.strip(elem.raw)), options.wordwrap); - } + href = elem.attribs.href.replace(/^mailto\:/, ''); + } + if (result && href) { + result += ' [' + href + ']'; + } + return formatText({ raw: result || href, needsSpace: elem.needsSpace }, options); }; function formatHorizontalLine(elem, fn, options) { diff --git a/lib/html-to-text.js b/lib/html-to-text.js index 8aaf202..301c894 100644 --- a/lib/html-to-text.js +++ b/lib/html-to-text.js @@ -79,6 +79,9 @@ function walk(dom, options) { case 'tag': switch(elem.name.toLowerCase()) { case 'a': + // Inline element needs a leading space if `result` currently + // doesn't end with whitespace + elem.needsSpace = whiteSpaceRegex.test(result); result += format.anchor(elem, walk, options); break; case 'p': From 004330eb1e9f01ad983c5ad71f98d0fe8fb29280 Mon Sep 17 00:00:00 2001 From: Dan MacTough Date: Wed, 2 Jul 2014 15:34:51 -0400 Subject: [PATCH 3/3] Update example output in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0047d2f..d7c3a21 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,7 @@ gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea -takimata sanctus est Lorem ipsum dolor sit amet.www.github.com +takimata sanctus est Lorem ipsum dolor sit amet. Github [www.github.com] At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum @@ -256,7 +256,7 @@ MAILTO FORMATING Some Company Some Street 42 Somewhere -E-Mail:test@example.com +E-Mail: Click here [test@example.com] ``` ## License