Skip to content

Commit 6235a9b

Browse files
author
Malte Legenhausen
committed
Code cleanups
1 parent 29a0d1d commit 6235a9b

File tree

3 files changed

+177
-153
lines changed

3 files changed

+177
-153
lines changed

lib/formatter.js

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
var _ = require('underscore');
2+
var _s = require('underscore.string');
3+
4+
var helper = require('./helper');
5+
6+
/**
 * Formats a text node as plain text.
 *
 * The node's raw text is stripped of surrounding whitespace, its HTML
 * entities are decoded and the result is wrapped to the configured width.
 *
 * @param {Object} elem    Text node; only its `raw` property is read.
 * @param {Object} options Formatting options; `options.wordwrap` is handed
 *                         to the word wrapper as the maximum line length.
 * @returns {String} The decoded, wrapped text.
 */
function formatText(elem, options) {
  var decoded = helper.decodeHTMLEntities(_s.strip(elem.raw));
  return helper.wordwrap(decoded, options.wordwrap);
}
11+
12+
/**
 * Formats a line break element: a newline followed by whatever the walker
 * produces for the element's children.
 *
 * @param {Object}   elem    Element node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; must
 *                           return a string.
 * @param {Object}   options Formatting options, passed through to `fn`.
 * @returns {String} `'\n'` plus the rendered children.
 */
function formatLineBreak(elem, fn, options) {
  // The node may carry no `children` property at all; always give the
  // walker an array so it never has to deal with `undefined`.
  return '\n' + fn(elem.children || [], options);
}
15+
16+
/**
 * Formats a paragraph: the rendered children followed by a blank line.
 *
 * @param {Object}   elem    Element node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; must
 *                           return a string.
 * @param {Object}   options Formatting options, passed through to `fn`.
 * @returns {String} The rendered children plus `'\n\n'`.
 */
function formatParagraph(elem, fn, options) {
  // An empty paragraph may have no `children` property; pass an empty list
  // instead of `undefined` to the walker.
  return fn(elem.children || [], options) + '\n\n';
}
19+
20+
/**
 * Formats a heading: the rendered children in upper case followed by a
 * single newline.
 *
 * @param {Object}   elem    Element node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; must
 *                           return a string.
 * @param {Object}   options Formatting options, passed through to `fn`.
 * @returns {String} The upper-cased heading text plus `'\n'`.
 */
function formatHeading(elem, fn, options) {
  // An empty heading may have no `children` property; pass an empty list
  // instead of `undefined` to the walker.
  var title = fn(elem.children || [], options);
  return title.toUpperCase() + '\n';
}
23+
24+
/**
 * Formats an anchor as its link target.
 *
 * When the anchor has an `href`, that value is returned with a leading
 * `mailto:` removed. When there is no `href` (no attributes at all, or
 * attributes without `href`) the anchor's children are rendered instead of
 * throwing.
 *
 * @param {Object}   elem    Anchor element node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; only
 *                           used when the anchor has no `href`.
 * @param {Object}   options Formatting options, passed through to `fn`.
 * @returns {String} The link target, or the rendered children.
 */
function formatAnchor(elem, fn, options) {
  var href = elem.attribs ? elem.attribs.href : undefined;
  if (typeof href !== 'string') {
    // No link target to print: fall back to the anchor's own content.
    return fn(elem.children || [], options);
  }
  return href.replace(/^mailto\:/, '');
}
27+
28+
/**
 * Formats a horizontal rule as a row of dashes followed by a blank line.
 *
 * @param {Object}   elem    Element node (not read).
 * @param {Function} fn      Walker (not used).
 * @param {Object}   options Formatting options; `options.wordwrap` is the
 *                           number of dashes to print.
 * @returns {String} The dashes plus `'\n\n'`.
 */
function formatHorizontalLine(elem, fn, options) {
  var rule = _s.repeat('-', options.wordwrap);
  return rule + '\n\n';
}
31+
32+
/**
 * Formats a single list item with the given prefix.
 *
 * The children are rendered with a wrap width reduced by the prefix length,
 * and every following line is indented by that many spaces so continuation
 * lines align under the first one.
 *
 * @param {String}   prefix  Text printed in front of the item (bullet or
 *                           number).
 * @param {Object}   elem    List item node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; must
 *                           return a string.
 * @param {Object}   options Formatting options; not modified, a shallow
 *                           copy with a reduced `wordwrap` is passed to `fn`.
 * @returns {String} The prefixed, indented item followed by `'\n'`.
 */
function formatListItem(prefix, elem, fn, options) {
  // Work on a copy so the caller's wrap width is left untouched.
  var itemOptions = _.clone(options);
  // Reduce the wordwrap for sub elements.
  itemOptions.wordwrap -= prefix.length;
  // Process sub elements; an empty item may have no `children` property.
  var text = fn(elem.children || [], itemOptions);
  // Replace all line breaks with line break + prefix spacing.
  var indent = _s.repeat(' ', prefix.length);
  text = text.replace(/\n/g, '\n' + indent);
  // Add first prefix and line break at the end.
  return prefix + text + '\n';
}
43+
44+
/**
 * Formats an unordered list: every child is rendered as a list item with
 * the bullet prefix `' * '`, and the list is closed with an extra newline.
 *
 * @param {Object}   elem    List element node.
 * @param {Function} fn      Walker, handed on to `formatListItem`.
 * @param {Object}   options Formatting options, handed on to
 *                           `formatListItem`.
 * @returns {String} The rendered items followed by `'\n'`.
 */
function formatUnorderedList(elem, fn, options) {
  var entries = [];
  _.each(elem.children, function(item) {
    entries.push(formatListItem(' * ', item, fn, options));
  });
  return entries.join('') + '\n';
}
51+
52+
/**
 * Formats an ordered list: every child is rendered as a list item with a
 * `' N. '` prefix, padded so that the item texts line up regardless of how
 * many digits the number has. The list is closed with an extra newline.
 *
 * @param {Object}   elem    List element node.
 * @param {Function} fn      Walker, handed on to `formatListItem`.
 * @param {Object}   options Formatting options, handed on to
 *                           `formatListItem`.
 * @returns {String} The rendered items followed by `'\n'`.
 */
function formatOrderedList(elem, fn, options) {
  var result = '';
  // A list without content may have no `children` property; treat it as
  // empty instead of failing on `.length`.
  var items = elem.children || [];
  // Number of digits of the highest index.
  var maxLength = items.length.toString().length;
  _.each(items, function(item, i) {
    var index = i + 1;
    // Calculate the needed spacing for nice indentation.
    var spacing = maxLength - index.toString().length;
    var prefix = ' ' + index + '. ' + _s.repeat(' ', spacing);
    result += formatListItem(prefix, item, fn, options);
  });
  return result + '\n';
}
65+
66+
/**
 * Renders a table (array of rows, each an array of cell strings) as
 * space-aligned text.
 *
 * Every cell is stripped of surrounding whitespace, padded on the right to
 * the width of the widest cell in its column and followed by one space.
 * Each row ends with `'\n'` and the table is closed with an extra `'\n'`.
 * Column widths are measured on the stripped text, so surrounding
 * whitespace in a cell does not widen its column. Rows may have different
 * lengths.
 *
 * @param {Array} table Rows of cell strings.
 * @returns {String} The rendered table; `'\n'` for an empty table.
 */
function tableToString(table) {
  // Widest stripped cell seen so far, per column index.
  var widths = [];
  var cleaned = _.map(table, function(row) {
    return _.map(row, function(cell, column) {
      var content = _s.strip(cell);
      widths[column] = Math.max(widths[column] || 0, content.length);
      return content;
    });
  });

  // Build the table
  var text = '';
  _.each(cleaned, function(row) {
    _.each(row, function(content, column) {
      text += _s.rpad(content, widths[column], ' ') + ' ';
    });
    text += '\n';
  });
  return text + '\n';
}
92+
93+
/**
 * Formats a table element as space-aligned text.
 *
 * Rows are taken from `tr` children of the table itself and from `tr`
 * children of its `thead`, `tbody` and `tfoot` sections, in document order.
 * Within a row, `th` cells are rendered through `formatHeading` and `td`
 * cells through the walker. A cell whose text spans several lines produces
 * several output rows; shorter cells are filled with empty strings. A
 * `colspan` on either kind of cell adds empty filler cells so following
 * columns stay aligned. Non-tag children (e.g. text nodes) are ignored.
 *
 * @param {Object}   elem    Table element node.
 * @param {Function} fn      Walker, called as `fn(children, options)`; must
 *                           return a string.
 * @param {Object}   options Formatting options, passed through to `fn`.
 * @returns {String} The rendered table as produced by `tableToString`.
 */
function formatTable(elem, fn, options) {
  var table = [];

  // Renders one cell to its non-empty text lines, or null for other nodes.
  function cellLines(cell) {
    if (cell.name === 'th') {
      return _.compact(formatHeading(cell, fn, options).split('\n'));
    }
    if (cell.name === 'td') {
      return _.compact(fn(cell.children || [], options).split('\n'));
    }
    return null;
  }

  // Renders one <tr> and appends the resulting text rows to `table`.
  function addRow(tr) {
    var columns = [];
    _.each(tr.children, function(cell) {
      if (cell.type !== 'tag') {
        return;
      }
      var lines = cellLines(cell);
      if (lines === null) {
        return;
      }
      columns.push(lines);
      // Fill colspans with empty values
      var span = parseInt(cell.attribs && cell.attribs.colspan, 10);
      _.times(span > 1 ? span - 1 : 0, function() {
        columns.push(['']);
      });
    });
    // A row without cells contributes nothing; skip it rather than
    // zipping an empty list.
    if (columns.length === 0) {
      return;
    }
    // Turn the per-cell line lists into per-line cell lists.
    _.each(helper.arrayZip(columns), function(row) {
      table.push(_.map(row, function(col) {
        return col || '';
      }));
    });
  }

  _.each(elem.children, function(child) {
    if (child.type !== 'tag') {
      return;
    }
    if (child.name === 'tr') {
      addRow(child);
    } else if (child.name === 'thead' || child.name === 'tbody' || child.name === 'tfoot') {
      // Rows wrapped in a table section are rendered like direct rows.
      _.each(child.children, function(row) {
        if (row.type === 'tag' && row.name === 'tr') {
          addRow(row);
        }
      });
    }
  });
  return tableToString(table);
}
128+
129+
// Public API: one formatter per supported kind of node.

// Inline content
exports.text = formatText;
exports.anchor = formatAnchor;
exports.lineBreak = formatLineBreak;

// Block content
exports.heading = formatHeading;
exports.paragraph = formatParagraph;
exports.horizontalLine = formatHorizontalLine;

// Lists
exports.listItem = formatListItem;
exports.orderedList = formatOrderedList;
exports.unorderedList = formatUnorderedList;

// Tables
exports.table = formatTable;

lib/helper.js

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
var _ = require('underscore');
2+
var _s = require('underscore.string');
3+
14
/**
25
* <p>Decodes any HTML entities in a string into their unicode form</p>
36
*
@@ -44,4 +47,29 @@ exports.decodeHTMLEntities = function decodeHTMLEntities(text) {
4447
return text.replace(/&(.+?);/g, function(str, ent) {
4548
return String.fromCharCode(ent[0] !== '#' ? HTMLEntities[ent] : ent[1] === 'x' ? parseInt(ent.substr(2),16) : parseInt(ent.substr(1), 10));
4649
});
50+
};
51+
52+
exports.wordwrap = function wordwrap(text, max) {
53+
var result = '';
54+
var words = _s.words(text);
55+
var length = 0;
56+
var buffer = [];
57+
_.each(words, function(word) {
58+
if (length + word.length > max) {
59+
// Concat buffer and add it to the result
60+
result += buffer.join(' ') + '\n';
61+
// Reset buffer and length
62+
buffer.length = length = 0;
63+
}
64+
buffer.push(word);
65+
// Add word length + one whitespace
66+
length += word.length + 1;
67+
});
68+
// Add the rest tot the result.
69+
result += buffer.join(' ');
70+
return _s.rstrip(result);
71+
};
72+
73+
exports.arrayZip = function arrayZip(array) {
74+
return _.zip.apply(_, array);
4775
};

lib/html-to-text.js

Lines changed: 11 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ var _s = require('underscore.string');
66
var htmlparser = require("htmlparser");
77

88
var helper = require('./helper');
9+
var format = require('./formatter');
910

1011
function htmlToText(html, options) {
1112
options = options || {};
@@ -42,151 +43,6 @@ function filterBody(dom) {
4243
return result || dom;
4344
}
4445

45-
function zip(array) {
46-
return _.zip.apply(_, array);
47-
}
48-
49-
function wordwrap(text, max) {
50-
var result = '';
51-
var words = _s.words(text);
52-
var length = 0;
53-
var buffer = [];
54-
_.each(words, function(word) {
55-
if (length + word.length <= max) {
56-
buffer.push(word);
57-
// Add word length + one whitespace
58-
length += word.length + 1;
59-
} else {
60-
result += buffer.join(' ') + '\n';
61-
buffer = [word];
62-
length = word.length;
63-
}
64-
});
65-
result += buffer.join(' ');
66-
return _s.rstrip(result);
67-
}
68-
69-
function formatText(elem, options) {
70-
var text = _s.strip(elem.raw);
71-
text = helper.decodeHTMLEntities(text);
72-
return wordwrap(text, options.wordwrap);
73-
}
74-
75-
function formatBreak(elem, fn, options) {
76-
return '\n' + fn(elem.children, options);
77-
}
78-
79-
function formatParagraph(elem, fn, options) {
80-
return fn(elem.children, options) + '\n\n';
81-
}
82-
83-
function formatTitle(elem, fn, options) {
84-
return fn(elem.children, options).toUpperCase() + '\n';
85-
}
86-
87-
function formatAnchor(elem, fn, options) {
88-
return elem.attribs.href.replace(/^mailto\:/, '');
89-
}
90-
91-
function formatHorizontalLine(elem, fn, options) {
92-
return _s.repeat('-', options.wordwrap) + '\n\n';
93-
}
94-
95-
function formatListEntry(prefix, elem, fn, options) {
96-
options = _.clone(options);
97-
// Reduce the wordwrap for sub elements.
98-
options.wordwrap -= prefix.length;
99-
// Process sub elements.
100-
var text = fn(elem.children, options);
101-
// Replace all line breaks with line break + prefix spacing.
102-
text = text.replace(/\n/g, '\n' + _s.repeat(' ', prefix.length));
103-
// Add first prefix and line break at the end.
104-
return prefix + text + '\n';
105-
}
106-
107-
function formatList(elem, fn, options) {
108-
var result = '';
109-
if (elem.name === 'ul') {
110-
_.each(elem.children, function(elem) {
111-
result += formatListEntry(' * ', elem, fn, options);
112-
});
113-
} else if (elem.name === 'ol') {
114-
// Calculate the maximum length to i.
115-
var maxLength = elem.children.length.toString().length;
116-
_.each(elem.children, function(elem, i) {
117-
var index = i + 1;
118-
// Calculate the needed spacing for nice indentation.
119-
var spacing = maxLength - index.toString().length;
120-
var prefix = ' ' + index + '. ' + _s.repeat(' ', spacing);
121-
result += formatListEntry(prefix, elem, fn, options);
122-
});
123-
}
124-
return result + '\n';
125-
}
126-
127-
function tableToString(table) {
128-
// Determine space width per column
129-
// Convert all rows to lengths
130-
var widths = _.map(table, function(row) {
131-
return _.map(row, function(col) {
132-
return col.length;
133-
});
134-
});
135-
// Invert rows with colums
136-
widths = zip(widths);
137-
// Determine the max values for each column
138-
widths = _.map(widths, function(col) {
139-
return _.max(col);
140-
});
141-
142-
// Build the table
143-
var text = '';
144-
_.each(table, function(row) {
145-
var i = 0;
146-
_.each(row, function(col) {
147-
text += _s.rpad(_s.strip(col), widths[i++], ' ') + ' ';
148-
});
149-
text += '\n';
150-
});
151-
return text + '\n';
152-
}
153-
154-
function formatTable(elem, fn, options) {
155-
var table = [];
156-
_.each(elem.children, function(elem) {
157-
if (elem.type === 'tag' && elem.name === 'tr') {
158-
var rows = [];
159-
_.each(elem.children, function(elem) {
160-
var tokens, times;
161-
if (elem.type === 'tag') {
162-
if (elem.name === 'th') {
163-
tokens = formatTitle(elem, fn, options).split('\n');
164-
rows.push(_.compact(tokens));
165-
} else if (elem.name === 'td') {
166-
tokens = fn(elem.children, options).split('\n');
167-
rows.push(_.compact(tokens));
168-
// Fill colspans with empty values
169-
if (elem.attribs && elem.attribs.colspan) {
170-
times = elem.attribs.colspan - 1;
171-
_.times(times, function() {
172-
rows.push(['']);
173-
});
174-
}
175-
}
176-
}
177-
});
178-
rows = zip(rows);
179-
_.each(rows, function(row) {
180-
row = _.map(row, function(col) {
181-
return col || '';
182-
});
183-
table.push(row);
184-
});
185-
}
186-
});
187-
return tableToString(table);
188-
}
189-
19046
function containsTable(attr, tables) {
19147
if (tables === true) return true;
19248

@@ -216,38 +72,40 @@ function walk(dom, options) {
21672
case 'tag':
21773
switch(elem.name) {
21874
case 'a':
219-
result += formatAnchor(elem, walk, options);
75+
result += format.anchor(elem, walk, options);
22076
break;
22177
case 'p':
222-
result += formatParagraph(elem, walk, options);
78+
result += format.paragraph(elem, walk, options);
22379
break;
22480
case 'h1':
22581
case 'h2':
22682
case 'h3':
22783
case 'h4':
228-
result += formatTitle(elem, walk, options);
84+
result += format.heading(elem, walk, options);
22985
break;
23086
case 'br':
231-
result += formatBreak(elem, walk, options);
87+
result += format.lineBreak(elem, walk, options);
23288
break;
23389
case 'hr':
234-
result += formatHorizontalLine(elem, walk, options);
90+
result += format.horizontalLine(elem, walk, options);
23591
break;
23692
case 'ul':
93+
result += format.unorderedList(elem, walk, options);
94+
break;
23795
case 'ol':
238-
result += formatList(elem, walk, options);
96+
result += format.orderedList(elem, walk, options);
23997
break;
24098
case 'table':
24199
if (containsTable(elem.attribs, options.tables)) {
242-
result += formatTable(elem, walk, options);
100+
result += format.table(elem, walk, options);
243101
break;
244102
}
245103
default:
246104
result += walk(elem.children || [], options);
247105
}
248106
break;
249107
case 'text':
250-
if (elem.raw !== '\r\n') result += formatText(elem, options);
108+
if (elem.raw !== '\r\n') result += format.text(elem, options);
251109
break;
252110
default:
253111
result += walk(elem.children || [], options);

0 commit comments

Comments
 (0)