Skip to content

Commit e68afaf

Browse files
committed
feat(options): ignore whitespace while parsing html and wrap top level text nodes in paragraphs
1 parent 0007a4a commit e68afaf

File tree

6 files changed

+88
-17
lines changed

6 files changed

+88
-17
lines changed

src/converters.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,14 @@ export const convertTextNodeToText: TextConverter = (node, marks) => {
104104
data: {},
105105
};
106106
};
107+
108+
export const convertTextNodeToParagraphedText: TagConverter<Block> = (
109+
node,
110+
next,
111+
) => {
112+
return {
113+
nodeType: BLOCKS.PARAGRAPH,
114+
data: {},
115+
content: next(node),
116+
};
117+
};

src/htmlStringToDocument.ts

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,17 @@ import {
1616
BLOCKS,
1717
} from "@contentful/rich-text-types";
1818
import type {
19+
HTMLElementNode,
1920
HTMLNode,
2021
HTMLTagName,
22+
HTMLTextNode,
2123
Next,
2224
Options,
2325
OptionsWithDefaults,
2426
TagConverter,
27+
TextConverter,
2528
} from "./types";
26-
import { createDocumentNode, getAsList } from "./utils";
29+
import { createDocumentNode, getAsList, isWhiteSpace } from "./utils";
2730

2831
const DEFAULT_TAG_CONVERTERS: Partial<
2932
Record<HTMLTagName, TagConverter<Block | Inline | Text>>
@@ -58,11 +61,9 @@ const mapHtmlNodeToRichTextNode = (
5861
node: HTMLNode,
5962
marks: Mark[],
6063
options: OptionsWithDefaults,
64+
isTopLevel = false,
6165
) => {
6266
const { convertText, convertTag } = options;
63-
if (node.type === "text") {
64-
return convertText(node, marks);
65-
}
6667

6768
const mapChildren: Next = (node, mark) => {
6869
const newMarks = mark ? getAsList(mark) : [];
@@ -74,10 +75,11 @@ const mapHtmlNodeToRichTextNode = (
7475
}
7576
return getAsList(mapHtmlNodeToRichTextNode(node, allMarks, options));
7677
};
78+
const next = mapChildren;
7779

78-
const next: Next = (node, marks) => {
79-
return mapChildren(node, marks);
80-
};
80+
if (node.type === "text") {
81+
return convertText(node, marks);
82+
}
8183

8284
const tagConverter = convertTag?.[node.tagName] ?? convertTagToChildren;
8385
const convertedNode = tagConverter(node, next);
@@ -94,10 +96,13 @@ export const htmlStringToDocument = (
9496
...options.convertTag,
9597
},
9698
convertText: options.convertText ?? convertTextNodeToText,
99+
wrapTopLevelTextNodesInParagraphs: false,
100+
ignoreWhiteSpace: false,
101+
isWhiteSpace: options.isWhiteSpace ?? isWhiteSpace,
97102
};
98103
const parsedHtml = parseHtml(htmlString);
99104
const richTextNodes = parsedHtml.flatMap((node) =>
100-
mapHtmlNodeToRichTextNode(node, [], optionsWithDefaults),
105+
mapHtmlNodeToRichTextNode(node, [], optionsWithDefaults, true),
101106
);
102107

103108
const richTextNodesWithTopLevelTextNodesConverted: TopLevelBlock[] =

src/parseHtml.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
TextNode,
77
CommentNode,
88
} from "parse5/dist/tree-adapters/default";
9-
import { isNotNull } from "./utils";
9+
import { isNotNull, isWhiteSpace } from "./utils";
1010

1111
import type { HTMLNode, HTMLTagName } from "./types";
1212

@@ -28,6 +28,10 @@ const isChildNodeDocumentType = (
2828
return childNode.nodeName === "#documentType";
2929
};
3030

31+
const isTextNodePureWhiteSpace = (textNode: TextNode): boolean => {
32+
return isWhiteSpace(textNode.value);
33+
}
34+
3135
const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
3236
if (
3337
isChildNodeComment(childNode) ||
@@ -37,6 +41,9 @@ const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
3741
return null;
3842
}
3943
if (isChildNodeTextNode(childNode)) {
44+
if (isTextNodePureWhiteSpace(childNode)) {
45+
return null;
46+
}
4047
return {
4148
type: "text",
4249
value: childNode.value,

src/test/index.test.ts

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,18 +72,18 @@ describe("Parse HTML string to Contentful Document", () => {
7272

7373
const matchNode = helpers.createBlock(
7474
BLOCKS.PARAGRAPH,
75-
helpers.createText(matchText),
75+
helpers.createText(matchText)
7676
);
7777

7878
expect(htmlNodes).toMatchObject(
79-
createDocumentNode([matchNode as TopLevelBlock]),
79+
createDocumentNode([matchNode as TopLevelBlock])
8080
);
8181
});
8282

8383
it("Handles a complex convert option from 'span' with bold class to 'paragraph' and 'bold' mark", () => {
8484
const styledSpanToMarkedParagraphConverter: TagConverter<Block> = (
8585
node,
86-
next,
86+
next
8787
) => {
8888
const isBold = node.attrs.class === "bold";
8989
const marks = isBold ? ({ type: "bold" } satisfies Mark) : undefined;
@@ -100,13 +100,13 @@ describe("Parse HTML string to Contentful Document", () => {
100100
convertTag: {
101101
span: styledSpanToMarkedParagraphConverter,
102102
},
103-
},
103+
}
104104
);
105105

106106
const matchNode = createDocumentNode([
107107
helpers.createBlock(
108108
BLOCKS.PARAGRAPH,
109-
helpers.createText(matchText, { type: "bold" }),
109+
helpers.createText(matchText, { type: "bold" })
110110
),
111111
] as TopLevelBlock[]);
112112

@@ -115,7 +115,7 @@ describe("Parse HTML string to Contentful Document", () => {
115115

116116
it("converts an invalid top level text node to a paragraph node", () => {
117117
expect(
118-
htmlStringToDocument("<div>Text under top level div</div>"),
118+
htmlStringToDocument("<div>Text under top level div</div>")
119119
).toMatchObject({
120120
content: [
121121
{
@@ -141,8 +141,8 @@ describe("Parse HTML string to Contentful Document", () => {
141141
htmlStringToDocument(
142142
"Some unwrapped text prefixing a p tag." +
143143
"<p>Paragraph content <span>I am a text node</span></p>" +
144-
"Some unwrapped text suffixing a p tag",
145-
),
144+
"Some unwrapped text suffixing a p tag"
145+
)
146146
).toMatchObject({
147147
content: [
148148
{
@@ -192,4 +192,47 @@ describe("Parse HTML string to Contentful Document", () => {
192192
nodeType: "document",
193193
});
194194
});
195+
196+
it("Handles text nodes with only whitespace by ingoring them", () => {
197+
const html = `<h2>Heading on the first line</h2>\n\n<p>Text on the third line.</p>`;
198+
199+
const htmlNodes = htmlStringToDocument(html, {
200+
ignoreWhiteSpace: true,
201+
});
202+
203+
const matchNode = createDocumentNode([
204+
helpers.createBlock(
205+
BLOCKS.HEADING_2,
206+
helpers.createText("Heading on the first line")
207+
),
208+
helpers.createBlock(
209+
BLOCKS.PARAGRAPH,
210+
helpers.createText("Text on the third line.")
211+
),
212+
] as TopLevelBlock[]);
213+
214+
expect(htmlNodes).toMatchObject(matchNode);
215+
});
216+
217+
it("Handles text nodes with only whitespace by including them", () => {
218+
const html = `<h2>Heading on the first line</h2>\n\n<p>Text on the third line.</p>`;
219+
220+
const htmlNodes = htmlStringToDocument(html, {
221+
ignoreWhiteSpace: false,
222+
});
223+
224+
const matchNode = createDocumentNode([
225+
helpers.createBlock(
226+
BLOCKS.HEADING_2,
227+
helpers.createText("Heading on the first line")
228+
),
229+
helpers.createText("\n\n"),
230+
helpers.createBlock(
231+
BLOCKS.PARAGRAPH,
232+
helpers.createText("Text on the third line.")
233+
),
234+
] as TopLevelBlock[]);
235+
236+
expect(htmlNodes).toMatchObject(matchNode);
237+
});
195238
});

src/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ export type ConvertTagOptions = Record<HTMLTagName | string, TagConverter>;
5858
export interface OptionsWithDefaults {
5959
convertTag: ConvertTagOptions;
6060
convertText: TextConverter;
61+
wrapTopLevelTextNodesInParagraphs: boolean;
62+
ignoreWhiteSpace: boolean;
63+
isWhiteSpace: (content: string) => boolean;
6164
}
6265

6366
export type Options = Partial<OptionsWithDefaults>;

src/utils.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ import { MARK_TYPES, INLINE_TYPES, BLOCK_TYPES } from "./constants";
1414
export const isNotNull = <T>(value: T): value is Exclude<T, null> =>
1515
value !== null;
1616

17+
/**
 * Tells whether a string contains nothing but whitespace.
 *
 * The check is anchored at both ends, so a single non-whitespace character
 * anywhere in `content` makes the result `false`. An empty string counts as
 * whitespace because the pattern allows zero characters.
 *
 * @param content - The text to inspect.
 * @returns `true` when `content` is empty or made up solely of characters
 *   matched by the regular-expression `\s` class; otherwise `false`.
 */
export const isWhiteSpace = (content: string): boolean => {
  const whiteSpaceOnly = /^\s*$/;
  return whiteSpaceOnly.test(content);
};
18+
1719
export const getAsList = <T>(value: T | T[]): T[] => {
1820
if (Array.isArray(value)) {
1921
return value;

0 commit comments

Comments
 (0)