Merge pull request #242 from oleast/feat-ingore-whitespace-wrap-top-level-test-2

oleast · web-flow · commit b967c7cb8cfa · 2024-12-02T09:55:43.000+01:00
diff --git a/README.md b/README.md
@@ -270,3 +270,67 @@ htmlStringToDocument(htmlString, options);
 //   ],
 // };
 ```
+
+## Invalid Rich Text Documents
+
+The Contentful Rich Text format requires the `Document` to adhere to a specific format.
+The full ruleset can be found in the [Contentful Documentation](https://www.contentful.com/developers/docs/concepts/rich-text/#rules-of-rich-text).
+
+By default, this library converts the HTML node by node to create a rich text document. This means that the result can be an invalid document.
+
+Uploading an invalid document to Contentful will result in an error. The `@contentful/rich-text-types` package from Contentful includes a `validateRichTextDocument` function as of version `17.0.0`.
+
+**To mitigate invalid documents you have a few options:**
+
+- Use the built-in `parserOptions` and/or `postProcessing` options. (Currently useful for removing whitespace and fixing top level nodes.)
+- Add a custom `TagConverter` or `TextConverter` that handles your case. (To handle cases like wrong child elements of `Inline` nodes, list elements, or tables.)
+- Change your HTML to a valid format before converting it.
+
+### Handling invalid top level nodes
+
+Some elements can not be at the top level of a `Document`. This includes `Text`-nodes, `Inline`-nodes, `li`-elements, and any child element of `table` (like a `tr` or `td`).
+
+To handle cases where this occurs, this library includes a few utilities that process the document after it has been created.
+
+These options are:
+
+- `options.postProcessing.handleTopLevelText: "preserve" | "remove" | "wrap-paragraph"`. Default: `"preserve"`.
+- `options.postProcessing.handleTopLevelInlines: "preserve" | "remove" | "wrap-paragraph"`. Default: `"preserve"`.
+
+Examples of usage:
+
+```typescript
+const htmlNodes = htmlStringToDocument(html, {
+  postProcessing: {
+    handleTopLevelText: "wrap-paragraph",
+    handleTopLevelInlines: "remove",
+  },
+});
+```
+
+How it works:
+
+- `"preserve"`: Keep top level nodes as they are, even if it results in an invalid `Document`.
+- `"remove"`: Remove the node with all its child nodes from the document.
+- `"wrap-paragraph"`: Wrap the node in a simple `paragraph`-node to make it valid.
+
+### Handling extra whitespace nodes
+
+A formatted HTML string might include whitespace that will be parsed and added to the document output. This can result in unwanted text nodes or an invalid document.
+
+Whitespace can be removed by using the `handleWhitespaceNodes` option.
+
+- `options.parserOptions.handleWhitespaceNodes: "preserve" | "remove"`. Default: `"preserve"`.
+
+```typescript
+const htmlNodes = htmlStringToDocument(html, {
+  parserOptions: {
+    handleWhitespaceNodes: "remove",
+  },
+});
+```
+
+How it works:
+
+- `"preserve"`: Keep all whitespace text nodes as they are in the original html string.
+- `"remove"`: Remove any text node that consists purely of whitespace from the HTML node tree. Uses the following regex: `/^\s*$/`.
diff --git a/src/converters.ts b/src/converters.ts
@@ -104,3 +104,14 @@ export const convertTextNodeToText: TextConverter = (node, marks) => {
     data: {},
   };
 };
+
+export const convertTextNodeToParagraphedText: TagConverter<Block> = (
+  node,
+  next,
+) => {
+  return {
+    nodeType: BLOCKS.PARAGRAPH,
+    data: {},
+    content: next(node),
+  };
+};
diff --git a/src/htmlStringToDocument.ts b/src/htmlStringToDocument.ts
@@ -5,9 +5,8 @@ import {
   convertTextNodeToText,
   convertTagToChildren,
 } from "./converters";
-import { parseHtml } from "./parseHtml";
+import { parseHtml, ParserOptions } from "./parseHtml";
 import {
-  TopLevelBlock,
   Document,
   Mark,
   Text,
@@ -22,7 +21,8 @@ import type {
   OptionsWithDefaults,
   TagConverter,
 } from "./types";
-import { createDocumentNode, getAsList } from "./utils";
+import { createDocumentNode, getAsList, isNotNull } from "./utils";
+import { processConvertedNodesFromTopLevel } from "./processConvertedNodesFromTopLevel";
 
 const DEFAULT_TAG_CONVERTERS: Partial<
   Record<HTMLTagName, TagConverter<Block | Inline | Text>>
@@ -59,9 +59,6 @@ const mapHtmlNodeToRichTextNode = (
   options: OptionsWithDefaults,
 ) => {
   const { convertText, convertTag } = options;
-  if (node.type === "text") {
-    return convertText(node, marks);
-  }
 
   const mapChildren: Next = (node, mark) => {
     const newMarks = mark ? getAsList(mark) : [];
@@ -73,10 +70,11 @@ const mapHtmlNodeToRichTextNode = (
     }
     return getAsList(mapHtmlNodeToRichTextNode(node, allMarks, options));
   };
+  const next = mapChildren;
 
-  const next: Next = (node, marks) => {
-    return mapChildren(node, marks);
-  };
+  if (node.type === "text") {
+    return convertText(node, marks);
+  }
 
   const tagConverter = convertTag?.[node.tagName] ?? convertTagToChildren;
   const convertedNode = tagConverter(node, next);
@@ -93,10 +91,30 @@ export const htmlStringToDocument = (
       ...options.convertTag,
     },
     convertText: options.convertText ?? convertTextNodeToText,
+    parserOptions: {
+      handleWhitespaceNodes:
+        options?.parserOptions?.handleWhitespaceNodes ?? "preserve",
+    },
+    postProcessing: {
+      handleTopLevelInlines:
+        options?.postProcessing?.handleTopLevelInlines ?? "preserve",
+      handleTopLevelText:
+        options?.postProcessing?.handleTopLevelText ?? "preserve",
+    },
+  };
+
+  const parserOptions: ParserOptions = {
+    ignoreWhiteSpace:
+      optionsWithDefaults.parserOptions.handleWhitespaceNodes == "remove",
   };
-  const parsedHtml = parseHtml(htmlString);
+
+  const parsedHtml = parseHtml(htmlString, parserOptions);
   const richTextNodes = parsedHtml.flatMap((node) =>
     mapHtmlNodeToRichTextNode(node, [], optionsWithDefaults),
   );
-  return createDocumentNode(richTextNodes as TopLevelBlock[]);
+  const processedRichTextNodes = richTextNodes
+    .map((node) => processConvertedNodesFromTopLevel(node, optionsWithDefaults))
+    .filter(isNotNull);
+
+  return createDocumentNode(processedRichTextNodes);
 };
diff --git a/src/parseHtml.ts b/src/parseHtml.ts
@@ -6,10 +6,14 @@ import {
   TextNode,
   CommentNode,
 } from "parse5/dist/tree-adapters/default";
-import { isNotNull } from "./utils";
+import { isNotNull, isWhiteSpace } from "./utils";
 
 import type { HTMLNode, HTMLTagName } from "./types";
 
+export interface ParserOptions {
+  ignoreWhiteSpace: boolean;
+}
+
 const isChildNodeComment = (childNode: ChildNode): childNode is CommentNode => {
   return childNode.nodeName === "#comment";
 };
@@ -28,7 +32,14 @@ const isChildNodeDocumentType = (
   return childNode.nodeName === "#documentType";
 };
 
-const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
+const isTextNodePureWhiteSpace = (textNode: TextNode): boolean => {
+  return isWhiteSpace(textNode.value);
+};
+
+const mapChildNodeToHtmlNode = (
+  childNode: ChildNode,
+  options: ParserOptions,
+): HTMLNode | null => {
   if (
     isChildNodeComment(childNode) ||
     isChildNodeDocumentType(childNode) ||
@@ -37,6 +48,9 @@ const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
     return null;
   }
   if (isChildNodeTextNode(childNode)) {
+    if (options.ignoreWhiteSpace && isTextNodePureWhiteSpace(childNode)) {
+      return null;
+    }
     return {
       type: "text",
       value: childNode.value,
@@ -47,17 +61,20 @@ const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
     type: "element",
     tagName: childNode.tagName as HTMLTagName,
     children: childNode.childNodes
-      .map((c) => mapChildNodeToHtmlNode(c))
+      .map((c) => mapChildNodeToHtmlNode(c, options))
       .filter(isNotNull),
     attrs: Object.fromEntries(
       childNode.attrs.map((attr) => [attr.name, attr.value]),
     ),
   };
 };
 
-export const parseHtml = (htmlString: string): HTMLNode[] => {
+export const parseHtml = (
+  htmlString: string,
+  options: ParserOptions,
+): HTMLNode[] => {
   const parsedHtml = parseFragment(htmlString);
   return parsedHtml.childNodes
-    .map((node) => mapChildNodeToHtmlNode(node))
+    .map((node) => mapChildNodeToHtmlNode(node, options))
     .filter(isNotNull);
 };
diff --git a/src/processConvertedNodesFromTopLevel.ts b/src/processConvertedNodesFromTopLevel.ts
@@ -0,0 +1,58 @@
+import {
+  Block,
+  BLOCKS,
+  Inline,
+  Text,
+  TopLevelBlock,
+} from "@contentful/rich-text-types";
+import type { OptionsWithDefaults } from "./types";
+import {
+  isNodeTypeBlock,
+  isNodeTypeInline,
+  isNodeTypeText,
+  isNodeTypeTopLevelBlock,
+} from "./utils";
+
+export const processConvertedNodesFromTopLevel = (
+  node: Block | Inline | Text,
+  options: OptionsWithDefaults,
+): TopLevelBlock | null => {
+  if (isNodeTypeBlock(node)) {
+    if (isNodeTypeTopLevelBlock(node)) {
+      return node;
+    }
+    // Block types that can not be at the top level are: BLOCKS.DOCUMENT | BLOCKS.LIST_ITEM | BLOCKS.TABLE_ROW | BLOCKS.TABLE_CELL | BLOCKS.TABLE_HEADER_CELL
+    if (node.nodeType === BLOCKS.DOCUMENT) {
+      return null;
+    }
+    // TODO: Handle top level list items and table elements
+    return node as unknown as TopLevelBlock;
+  }
+  if (isNodeTypeInline(node)) {
+    if (options.postProcessing.handleTopLevelInlines === "remove") {
+      return null;
+    }
+    if (options.postProcessing.handleTopLevelInlines === "wrap-paragraph") {
+      return {
+        nodeType: BLOCKS.PARAGRAPH,
+        data: {},
+        content: [node],
+      };
+    }
+    return node as unknown as TopLevelBlock;
+  }
+  if (isNodeTypeText(node)) {
+    if (options.postProcessing.handleTopLevelText === "remove") {
+      return null;
+    }
+    if (options.postProcessing.handleTopLevelText === "wrap-paragraph") {
+      return {
+        nodeType: BLOCKS.PARAGRAPH,
+        data: {},
+        content: [node],
+      };
+    }
+    return node as unknown as TopLevelBlock;
+  }
+  return null;
+};
diff --git a/src/test/helpers.ts b/src/test/helpers.ts
@@ -1,4 +1,11 @@
-import { Block, BLOCKS, Inline, Mark, Text } from "@contentful/rich-text-types";
+import {
+  Block,
+  BLOCKS,
+  Inline,
+  INLINES,
+  Mark,
+  Text,
+} from "@contentful/rich-text-types";
 import { getAsList } from "../utils";
 
 export const createText = (value: string, marks?: Mark | Mark[]): Text => {
@@ -20,3 +27,15 @@ export const createBlock = (
     data: {},
   };
 };
+
+export const createInline = (
+  nodeType: INLINES,
+  content: Text | Inline | Array<Text | Inline>,
+  data: { [key: string]: string } = {},
+): Inline => {
+  return {
+    nodeType,
+    content: getAsList(content),
+    data,
+  };
+};
diff --git a/src/test/index.test.ts b/src/test/index.test.ts
diff --git a/src/types.ts b/src/types.ts
diff --git a/src/utils.ts b/src/utils.ts