Skip to content

Commit b967c7c

Browse files
authored
Merge pull request #242 from oleast/feat-ingore-whitespace-wrap-top-level-test-2
2 parents bc1a08b + 13d8d97 commit b967c7c

File tree

9 files changed

+456
-52
lines changed

9 files changed

+456
-52
lines changed

README.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,3 +270,67 @@ htmlStringToDocument(htmlString, options);
270270
// ],
271271
// };
272272
```
273+
274+
## Invalid Rich Text Documents
275+
276+
The Contentful Rich Text format requires the `Document` to adhere to a specific format.
277+
The full ruleset can be found in the [Contentful Documentation](https://www.contentful.com/developers/docs/concepts/rich-text/#rules-of-rich-text).
278+
279+
By default, this library converts the HTML node by node to create a rich text document, without validating the result. This means that the result can be an invalid document.
280+
281+
Uploading an invalid document to Contentful will result in an error. The `@contentful/rich-text-types` package from Contentful includes a `validateRichTextDocument` function as of version `17.0.0`, which can be used to check a document before uploading it.
282+
283+
**To mitigate invalid documents you have a few options:**
284+
285+
- Use the built-in `parserOptions` and/or `postProcessing` options (currently useful for removing whitespace and fixing top level nodes).
286+
- Add a custom `TagConverter` or `TextConverter` that handles your case (for example, wrong child elements of `Inline` nodes, list elements, or tables).
287+
- Change your HTML to a valid format before converting it.
288+
289+
### Handling invalid top level nodes
290+
291+
Some elements cannot be at the top level of a `Document`. This includes `Text`-nodes, `Inline`-nodes, `li`-elements, and any child element of `table` (like a `tr` or `td`).
292+
293+
To handle cases where such nodes appear at the top level, this library includes a few utilities that process the document after it has been created.
294+
295+
These options are:
296+
297+
- `options.postProcessing.handleTopLevelText: "preserve" | "remove" | "wrap-paragraph"`. Default: `"preserve"`.
298+
- `options.postProcessing.handleTopLevelInlines: "preserve" | "remove" | "wrap-paragraph"`. Default: `"preserve"`.
299+
300+
Examples of usage:
301+
302+
```typescript
303+
const htmlNodes = htmlStringToDocument(html, {
304+
postProcessing: {
305+
handleTopLevelText: "wrap-paragraph",
306+
handleTopLevelInlines: "remove",
307+
},
308+
});
309+
```
310+
311+
How it works:
312+
313+
- `"preserve"`: Keep top level nodes as they are, even if it results in an invalid `Document`.
314+
- `"remove"`: Remove the node with all its child nodes from the document.
315+
- `"wrap-paragraph"`: Wrap the node in a simple `paragraph`-node to make it valid.
316+
317+
### Handling extra whitespace nodes
318+
319+
A formatted HTML string might include whitespace that will be parsed and added to the document output. This can result in unwanted text nodes or an invalid document.
320+
321+
Whitespace can be removed by using the `handleWhitespaceNodes` option.
322+
323+
- `options.parserOptions.handleWhitespaceNodes: "preserve" | "remove"`. Default: `"preserve"`.
324+
325+
```typescript
326+
const htmlNodes = htmlStringToDocument(html, {
327+
parserOptions: {
328+
handleWhitespaceNodes: "remove",
329+
},
330+
});
331+
```
332+
333+
How it works:
334+
335+
- `"preserve"`: Keep all whitespace text nodes as they are in the original html string.
336+
- `"remove"`: Remove any text node that consists purely of whitespace from the HTML node tree. A text node counts as whitespace when it matches the regular expression `/^\s*$/`.

src/converters.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,14 @@ export const convertTextNodeToText: TextConverter = (node, marks) => {
104104
data: {},
105105
};
106106
};
107+
108+
export const convertTextNodeToParagraphedText: TagConverter<Block> = (
109+
node,
110+
next,
111+
) => {
112+
return {
113+
nodeType: BLOCKS.PARAGRAPH,
114+
data: {},
115+
content: next(node),
116+
};
117+
};

src/htmlStringToDocument.ts

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@ import {
55
convertTextNodeToText,
66
convertTagToChildren,
77
} from "./converters";
8-
import { parseHtml } from "./parseHtml";
8+
import { parseHtml, ParserOptions } from "./parseHtml";
99
import {
10-
TopLevelBlock,
1110
Document,
1211
Mark,
1312
Text,
@@ -22,7 +21,8 @@ import type {
2221
OptionsWithDefaults,
2322
TagConverter,
2423
} from "./types";
25-
import { createDocumentNode, getAsList } from "./utils";
24+
import { createDocumentNode, getAsList, isNotNull } from "./utils";
25+
import { processConvertedNodesFromTopLevel } from "./processConvertedNodesFromTopLevel";
2626

2727
const DEFAULT_TAG_CONVERTERS: Partial<
2828
Record<HTMLTagName, TagConverter<Block | Inline | Text>>
@@ -59,9 +59,6 @@ const mapHtmlNodeToRichTextNode = (
5959
options: OptionsWithDefaults,
6060
) => {
6161
const { convertText, convertTag } = options;
62-
if (node.type === "text") {
63-
return convertText(node, marks);
64-
}
6562

6663
const mapChildren: Next = (node, mark) => {
6764
const newMarks = mark ? getAsList(mark) : [];
@@ -73,10 +70,11 @@ const mapHtmlNodeToRichTextNode = (
7370
}
7471
return getAsList(mapHtmlNodeToRichTextNode(node, allMarks, options));
7572
};
73+
const next = mapChildren;
7674

77-
const next: Next = (node, marks) => {
78-
return mapChildren(node, marks);
79-
};
75+
if (node.type === "text") {
76+
return convertText(node, marks);
77+
}
8078

8179
const tagConverter = convertTag?.[node.tagName] ?? convertTagToChildren;
8280
const convertedNode = tagConverter(node, next);
@@ -93,10 +91,30 @@ export const htmlStringToDocument = (
9391
...options.convertTag,
9492
},
9593
convertText: options.convertText ?? convertTextNodeToText,
94+
parserOptions: {
95+
handleWhitespaceNodes:
96+
options?.parserOptions?.handleWhitespaceNodes ?? "preserve",
97+
},
98+
postProcessing: {
99+
handleTopLevelInlines:
100+
options?.postProcessing?.handleTopLevelInlines ?? "preserve",
101+
handleTopLevelText:
102+
options?.postProcessing?.handleTopLevelText ?? "preserve",
103+
},
104+
};
105+
106+
const parserOptions: ParserOptions = {
107+
ignoreWhiteSpace:
108+
optionsWithDefaults.parserOptions.handleWhitespaceNodes == "remove",
96109
};
97-
const parsedHtml = parseHtml(htmlString);
110+
111+
const parsedHtml = parseHtml(htmlString, parserOptions);
98112
const richTextNodes = parsedHtml.flatMap((node) =>
99113
mapHtmlNodeToRichTextNode(node, [], optionsWithDefaults),
100114
);
101-
return createDocumentNode(richTextNodes as TopLevelBlock[]);
115+
const processedRichTextNodes = richTextNodes
116+
.map((node) => processConvertedNodesFromTopLevel(node, optionsWithDefaults))
117+
.filter(isNotNull);
118+
119+
return createDocumentNode(processedRichTextNodes);
102120
};

src/parseHtml.ts

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,14 @@ import {
66
TextNode,
77
CommentNode,
88
} from "parse5/dist/tree-adapters/default";
9-
import { isNotNull } from "./utils";
9+
import { isNotNull, isWhiteSpace } from "./utils";
1010

1111
import type { HTMLNode, HTMLTagName } from "./types";
1212

13+
export interface ParserOptions {
14+
ignoreWhiteSpace: boolean;
15+
}
16+
1317
const isChildNodeComment = (childNode: ChildNode): childNode is CommentNode => {
1418
return childNode.nodeName === "#comment";
1519
};
@@ -28,7 +32,14 @@ const isChildNodeDocumentType = (
2832
return childNode.nodeName === "#documentType";
2933
};
3034

31-
const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
35+
const isTextNodePureWhiteSpace = (textNode: TextNode): boolean => {
36+
return isWhiteSpace(textNode.value);
37+
};
38+
39+
const mapChildNodeToHtmlNode = (
40+
childNode: ChildNode,
41+
options: ParserOptions,
42+
): HTMLNode | null => {
3243
if (
3344
isChildNodeComment(childNode) ||
3445
isChildNodeDocumentType(childNode) ||
@@ -37,6 +48,9 @@ const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
3748
return null;
3849
}
3950
if (isChildNodeTextNode(childNode)) {
51+
if (options.ignoreWhiteSpace && isTextNodePureWhiteSpace(childNode)) {
52+
return null;
53+
}
4054
return {
4155
type: "text",
4256
value: childNode.value,
@@ -47,17 +61,20 @@ const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
4761
type: "element",
4862
tagName: childNode.tagName as HTMLTagName,
4963
children: childNode.childNodes
50-
.map((c) => mapChildNodeToHtmlNode(c))
64+
.map((c) => mapChildNodeToHtmlNode(c, options))
5165
.filter(isNotNull),
5266
attrs: Object.fromEntries(
5367
childNode.attrs.map((attr) => [attr.name, attr.value]),
5468
),
5569
};
5670
};
5771

58-
export const parseHtml = (htmlString: string): HTMLNode[] => {
72+
export const parseHtml = (
73+
htmlString: string,
74+
options: ParserOptions,
75+
): HTMLNode[] => {
5976
const parsedHtml = parseFragment(htmlString);
6077
return parsedHtml.childNodes
61-
.map((node) => mapChildNodeToHtmlNode(node))
78+
.map((node) => mapChildNodeToHtmlNode(node, options))
6279
.filter(isNotNull);
6380
};
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import {
2+
Block,
3+
BLOCKS,
4+
Inline,
5+
Text,
6+
TopLevelBlock,
7+
} from "@contentful/rich-text-types";
8+
import type { OptionsWithDefaults } from "./types";
9+
import {
10+
isNodeTypeBlock,
11+
isNodeTypeInline,
12+
isNodeTypeText,
13+
isNodeTypeTopLevelBlock,
14+
} from "./utils";
15+
16+
export const processConvertedNodesFromTopLevel = (
17+
node: Block | Inline | Text,
18+
options: OptionsWithDefaults,
19+
): TopLevelBlock | null => {
20+
if (isNodeTypeBlock(node)) {
21+
if (isNodeTypeTopLevelBlock(node)) {
22+
return node;
23+
}
24+
// Block types that can not be at the top level are: BLOCKS.DOCUMENT | BLOCKS.LIST_ITEM | BLOCKS.TABLE_ROW | BLOCKS.TABLE_CELL | BLOCKS.TABLE_HEADER_CELL
25+
if (node.nodeType === BLOCKS.DOCUMENT) {
26+
return null;
27+
}
28+
// TODO: Handle top level list items and table elements
29+
return node as unknown as TopLevelBlock;
30+
}
31+
if (isNodeTypeInline(node)) {
32+
if (options.postProcessing.handleTopLevelInlines === "remove") {
33+
return null;
34+
}
35+
if (options.postProcessing.handleTopLevelInlines === "wrap-paragraph") {
36+
return {
37+
nodeType: BLOCKS.PARAGRAPH,
38+
data: {},
39+
content: [node],
40+
};
41+
}
42+
return node as unknown as TopLevelBlock;
43+
}
44+
if (isNodeTypeText(node)) {
45+
if (options.postProcessing.handleTopLevelText === "remove") {
46+
return null;
47+
}
48+
if (options.postProcessing.handleTopLevelText === "wrap-paragraph") {
49+
return {
50+
nodeType: BLOCKS.PARAGRAPH,
51+
data: {},
52+
content: [node],
53+
};
54+
}
55+
return node as unknown as TopLevelBlock;
56+
}
57+
return null;
58+
};

src/test/helpers.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
import { Block, BLOCKS, Inline, Mark, Text } from "@contentful/rich-text-types";
1+
import {
2+
Block,
3+
BLOCKS,
4+
Inline,
5+
INLINES,
6+
Mark,
7+
Text,
8+
} from "@contentful/rich-text-types";
29
import { getAsList } from "../utils";
310

411
export const createText = (value: string, marks?: Mark | Mark[]): Text => {
@@ -20,3 +27,15 @@ export const createBlock = (
2027
data: {},
2128
};
2229
};
30+
31+
export const createInline = (
32+
nodeType: INLINES,
33+
content: Text | Inline | Array<Text | Inline>,
34+
data: { [key: string]: string } = {},
35+
): Inline => {
36+
return {
37+
nodeType,
38+
content: getAsList(content),
39+
data,
40+
};
41+
};

0 commit comments

Comments
 (0)