diff --git a/web/libs/editor/src/tags/object/RichText/domManager.md b/web/libs/editor/src/tags/object/RichText/domManager.md new file mode 100644 index 000000000000..54c19d7e9042 --- /dev/null +++ b/web/libs/editor/src/tags/object/RichText/domManager.md @@ -0,0 +1,252 @@ +# DomManager + +The primary role of `DomManager` is to provide an interface to the Document Object Model (DOM) +and cache as much information as possible about it for optimization. Most of this data is collected +right during the initialization phase when an instance is created, while the remaining details +are fetched during interaction processes. + +Simplified structure of `DomManager`: +```mermaid +erDiagram + DomManager ||--|| DomData : contains + DomData ||--o{ DDStaticElement: includes + DomData ||--|{ DDDynamicBlock: includes + DomData ||--o{ DDExtraText: includes + DDDynamicBlock ||--o{ DDSpanElement: includes + DDSpanElement ||--o{ DDSpanElement: includes + DDDynamicBlock ||--o{ DDTextElement: includes + DDSpanElement ||--o{ DDTextElement: includes + + + DomData { + number endPos + number displayedText + number displayedTextPos + Array~DDStaticElement|DDDynamicBlock|DDExtraText~ elements + } + DDStaticElement { + HTMLElement node + number start + string path + } + DDDynamicBlock { + number start + number end + string path + Array~DDSpanElement|DDTextElement~ children + } + DDSpanElement { + number start + number end + HTMLSpanElement node + Array~DDSpanElement|DDTextElement~ children + } + DDTextElement { + Text node + number start + number end + string[] content + } +``` + + +### DomData + + +`DomManager` stores all needed data in the object being an instance of `DomData` class. +This object stores representation of displayed content in the way it can be seen by users, +which was achieved by using Selection Api. It also keeps in mind positions coming +after the last processed element in displayed text and in the content of `RichText`. 
+But all this is relevant only during the initialization stage. + +The last field `DomData` contains is an array of elements that represents the DOM tree itself. +And that is the point of interest. + +All elements contained in `DomData` can be logically divided into two groups. +The first group is static and never changes; its elements reflect the structure of the DOM. +The second group is where we store all highlight spans and texts. + + +### Structural elements + +The first group consists of elements of types `DDStaticElement`, `DDDynamicBlock` and `DDExtraText`. +It’s a flat list on the first level of descendants and represents the default state +of `RichText`’s content. + + +- `DDStaticElement` contains information about its related tag in DOM. It contains a reference +to its html-node, its start position calculated as a global offset and its xpath. +The last two fields are used to search for the right +elements in DomData. +- `DDExtraText` is just a string. It has no real analog in DOM but it is what we get +when we work with Selection Api to collect text representation. This happens, for example, +when the content itself has some block elements or other line breaks. +This exists only to be sure that all symbols of displayed text are accounted for +in the region's text field. +- `DDDynamicBlock` is a container for managing all real text elements and highlighting spans +that belong to regions. It provides slots for dynamically changing content. On initialization +it is related to only one text node in DOM. It stores the start and end +of the editable block in terms of global offsets, the xpath of its original text element +and a set of child elements. + + +### Content elements + + +The second group consists of sets of elements that dynamically change when regions are created and deleted. +It is represented by elements of types `DDSpanElement` and `DDTextElement`. 
+ + +- `DDSpanElement` is similar to `DDDynamicBlock` but it can also be created / deleted +during annotation, stores the reference to its highlighting span html-node and +has a method to remove this span itself from DOM. +- The content of `DDTextElement` is an array of strings where each element +on the one hand is a thing that is counted by global offsets as one symbol +and on the other hand is a substring of displayed text +so that there is no character here that the browser does not provide as visible. + + +### Examples + +#### Simple Html +The simple data `

The HTML

` will be converted in this way: + + +```mermaid +flowchart TD + content["<p>The <b>HTML</b></p>"] + body["DDStaticElement
path: '/'"] + p["DDStaticElement
path: '/p[1]'"] + the["DDDynamicBlock 4
path: '/p[1]/text()[1]'"] + b["DDStaticElement
path: '/p[1]/b[1]'"] + html["DDDynamicBlock 8
path: '/p[1]/b[1]/text()[1]'"] + content --> body + content--> p + content--> the + content--> b + content--> html + t_the["DDTextElement 4
content: ['T', 'h', 'e', ' ']"] + the--> t_the + t_html["DDTextElement 8
content: ['H', 'T', 'M', 'L']"] + html--> t_html +``` + +#### A text with a region + +A text `“Text"` with region over `“x”` would be represented as: + + +```mermaid +flowchart TD + content["Texlabel_xt"] + body["0 DDStaticElement\npath: '/'"] + text["0 DDDynamicBlock 4
path: '/text()[1]'"] + content --> body + content --> text + t_text_Te["0 DDTextElement 2
content: ['T','e']"] + text --> t_text_Te + s_span_x["2 DDSpanElement 3"] + text --> s_span_x + t_text_x["2 DDTextElement 3
content: ['x']"] + s_span_x --> t_text_x + t_text_t["3 DDTextElement 4
content: ['t']"] + text--> t_text_t +``` + +#### Replacing characters + +The tricky content `a
b\nc` will be: + +```mermaid +flowchart TD + content["a<br/>b#92;nc"] + body["0 DDStaticElement
path: '/'"] + a["0 DDDynamicBlock 1
path: '/text()[1]'"] + n["#92;#92;n"] + bc["2 DDDynamicBlock 5
path: '/text()[2]'"] + content --> body + content --> a + content --> n + content --> bc + t_a["0 DDTextElement 1
content: ['a']"] + a --> t_a + t_bc["DDTextElement 5
content: ['b', ' ', 'c']"] + bc --> t_bc +``` +- `\n` is converted to a space character as it is displayed in the browser. +- `<br/>` becomes extra text element `\n` as it will be displayed as a line break. + +#### Edge cases +There could be more complicated cases, for example when HTML is not well-formed. +```html +

This +is part
of +HTML +

+``` +Is displayed in browser as: + +This is part
+of HTML + +And results in: +```mermaid +flowchart TD + content["

This is part
of HTML +

"] + body["0 DDStaticElement
path: '/'"] + p["0 DDStaticElement
path: '/p[1]'"] + ThisIsPart["0 DDDynamicBlock 4
path: '/p[1]/text()[1]'"] + ThisIsPart_text["0 DDTextElement 12
content: ['T','h','i','s',' ','i','s',' ','p','a','r','t']"] + extra1["#92;#92;n"] + of["13 DDDynamicBlock 18
path: '/p[1]/text()[2]'"] + of_text["13 DDTextElement 18
content: ['','o','f',' ','']"] + abbr["18 DDStaticElement
path: '/p[1]/abbr[1]'"] + b["18 DDStaticElement
path: '/p[1]/abbr[1]/b[1]'"] + html["18 DDDynamicBlock 22
path: '/p[1]/abbr[1]/b[1]/text()[1]'"] + html_text["18 DDTextElement 22
content: ['H','T','M','L']"] + empty["22 DDDynamicBlock 23
path: '/p[1]/text()[3]'"] + empty_text["22 DDTextElement 23
content: ['']"] + content --> body + content --> p + content --> ThisIsPart + ThisIsPart --> ThisIsPart_text + content --> extra1 + content --> of + of --> of_text + content --> abbr + content --> b + content --> html + html --> html_text + content --> empty + empty --> empty_text +``` + +In the second text node we have a content `['','o','f',' ','']` + +An empty string as a first element is a result of the fact that the browser does not display +space at the beginning of the tag content. + +An empty string as a last element is a result of the fact that the browser knows about the line break +in original html and also considers it as a character, but it does not display it. + +### Content field +Displayed text is stored in the `content` field of elements. It is represented as an array of strings. +Each item in the array is a character displayed in the browser. + +Some of the characters are empty strings, that means that they are not displayed in the browser +and cannot be got by Selection Api. But there are met in DOM's text nodes in `textContent`. +So to keep that information we store them in the `content` field as a placeholder. +But in the same time it can be used to calculate the global offset or range offset in the displayed text. + +In case if we have text for annotating: `

🐱\nmeans cat

` the whole content will be: +`['🐱', ' ', 'm', 'e', 'a', 'n', 's', ' ', 'c', 'a', 't', '.']` +When we create region over the word `cat` we can: +- get the displayed text of the region by joining the content array from the 9th to the 11th element. +(it is how it is displayed in the browser) +- get the global offset of the region. It is exactly the number of elements in the content array till +the region. ([8, 11]) +- get an offset of the range related to the region. For that we need to sum the length +of the content of all elements and in case of empty string consider it as a one +(even if it is hidden) character. ([9, 12]) + diff --git a/web/libs/editor/src/tags/object/RichText/domManager.ts b/web/libs/editor/src/tags/object/RichText/domManager.ts index 397df4da3d5a..863247953a68 100644 --- a/web/libs/editor/src/tags/object/RichText/domManager.ts +++ b/web/libs/editor/src/tags/object/RichText/domManager.ts @@ -7,6 +7,18 @@ const CF = "\r"; type DDExtraText = string; +/** + * Array of all characters and dummy placeholders + * + * Content is a way to store information about the displayed text + * and be able to restore global offsets and relative offsets in the same time. + * All hidden characters as "\n" or spaces at the start/end are stored as "" (dummy) + * but we keep in mind that it is a character with `length` == 1, + * and it affects both global and relative offsets. + * @see ./domManager.md + */ +type Content = string[]; + /** * Normalize text for displaying it. * It replaces all line breaks with '\n' symbol. 
@@ -21,11 +33,10 @@ class DDTextElement { public node: Text; public start: number; public end: number; - // array of all characters and dummy placeholders - public content: string[]; + public content: Content; public path?: string; - constructor(node: Text, start: number, end: number, content: string[], path?: string) { + constructor(node: Text, start: number, end: number, content: Content, path?: string) { this.node = node; this.start = start; this.end = end; @@ -54,7 +65,7 @@ class DDTextElement { const content = this.getContent(start, end); if (newNode.textContent) { - newNode.textContent = newNode.textContent.substring(start - this.start, end - this.start); + newNode.textContent = [...newNode.textContent].slice(start - this.start, end - this.start).join(""); } return new DDTextElement(newNode, start, end, content); @@ -302,16 +313,46 @@ class DDSpanElement extends DDBlock { class DDDynamicBlock extends DDBlock { public path: string; + public content: Content = []; constructor(start: number, path: string) { super(start); this.path = path; } - addTextNode(textNode: Text, start: number, end: number, content: string[], path: string) { + addTextNode(textNode: Text, start: number, end: number, content: Content, path: string) { + // There might be only one text node per DDDynamicBlock + this.content = content; this.children.push(new DDTextElement(textNode, start, end, content, path)); this.end = end; } + + getRelativeOffsetByGlobal(offset: number) { + return ( + this.content + .slice(0, offset - this.start) + //restore the size of skipped symbols (mostly \n) to 1 to get the correct text offset + .map((ch) => (ch === "" ? " " : ch)) + .join("").length + ); + } + + getGlobalOffsetByRelative(offset: number) { + let counter = offset; + const len = + offset === 0 + ? 
0 + : 1 + + this.content.findIndex((ch) => { + if (ch === "") { + counter--; + } else { + counter -= ch.length; + } + return counter <= 0; + }); + return this.start + len; + } } class DDStaticElement { @@ -379,11 +420,11 @@ class DomData { fromIdx++; } let toIdx = fromIdx; - for (const char of text) { - if (displayedText[toIdx] === char || (displayedText[toIdx] === " " && char === LF)) { - contentParts.push(displayedText[toIdx]); - toIdx++; + const displayedChar = displayedText.substring(toIdx, toIdx + char.length); + if (displayedChar === char || (displayedChar === " " && char === LF)) { + contentParts.push(displayedChar); + toIdx += char.length; } else { contentParts.push(""); } @@ -408,9 +449,11 @@ class DomData { const contentLength = content.length; let displayedTextLength = text.length; - if (pos === -1) { + // When `pos - this.displayedTextPos > 1` it most probably means + // that `text` is too simple (f.e. " ") and it possible to find its duplicates not at the right place. + if (pos === -1 || pos - this.displayedTextPos > 1) { // text doesn't match any parts of displayedText - // that means that it contains some \n or other symbols that are trimmed by browser + // it means that it contains some \n or other symbols that are trimmed by browser // calc the offsets of the part of displayedText that matches the text in terms of displayed symbols const { fromIdx, toIdx, content: newContent } = this.findProjectionOnDisplayedText(text); @@ -443,7 +486,7 @@ class DomData { return this.findTextBlock(pos, avoid)?.findTextElement(pos, avoid); } - findElementByPath(path: string) { + findElementByPath(path: string): DDStaticElement | DDDynamicBlock | undefined { for (const el of this.elements) { if (typeof el !== "string" && el.path === path) { return el; @@ -452,6 +495,36 @@ class DomData { return undefined; } + getNextElement(element: DDStaticElement | DDDynamicBlock): DDStaticElement | DDDynamicBlock | undefined { + let idx = this.elements.indexOf(element); + + while 
( + !(this.elements[idx + 1] instanceof DDStaticElement) && + !(this.elements[idx + 1] instanceof DDDynamicBlock) + ) { + idx++; + if (idx >= this.elements.length - 1) { + return void 0; + } + } + + return this.elements[idx + 1] as DDStaticElement | DDDynamicBlock; + } + + getEndOf(element: DDStaticElement | DDDynamicBlock | DDSpanElement | DDTextElement) { + if (element instanceof DDSpanElement || element instanceof DDTextElement) { + return element.end; + } + + const nextElement = this.getNextElement(element); + + if (nextElement) { + return nextElement.start; + } + + return this.endPos; + } + findElementByNode(node: Node) { for (const el of this.elements) { if (el instanceof DDStaticElement) { @@ -504,11 +577,7 @@ class DomData { collectBlocks(start: number, end: number) { const startIdx = this.indexOfTextBlock(start, "end"); const endIdx = Math.max(this.indexOfTextBlock(end, "start"), startIdx); - const blocks: DDDynamicBlock[] = this.elements - .slice(startIdx, endIdx + 1) - .filter((el) => el instanceof DDDynamicBlock) as DDDynamicBlock[]; - - return blocks; + return this.elements.slice(startIdx, endIdx + 1).filter((el) => el instanceof DDDynamicBlock) as DDDynamicBlock[]; } createSpans(start: number, end: number) { @@ -720,14 +789,21 @@ export default class DomManager { } relativeOffsetsToGlobalOffsets(start: string, startOffset: number, end: string, endOffset: number) { - const startEl = this.domData.findElementByPath(start); - const endEl = this.domData.findElementByPath(end); + let startEl = this.domData.findElementByPath(start); + let endEl = this.domData.findElementByPath(end); if (!startEl || !endEl) { return undefined; } + if (!(startEl instanceof DDDynamicBlock)) { + startEl = this.domData.findTextBlock(startEl.start, "end") as DDDynamicBlock; + } + if (!(endEl instanceof DDDynamicBlock)) { + // It really should be "end" and not "start" as we are looking for the exact container by the start position + endEl = this.domData.findTextBlock(endEl.start, 
"end") as DDDynamicBlock; + } - return [startOffset + startEl.start, endOffset + endEl.start]; + return [startEl.getGlobalOffsetByRelative(startOffset), endEl.getGlobalOffsetByRelative(endOffset)]; } globalOffsetsToRelativeOffsets(start: number, end: number) { @@ -737,9 +813,9 @@ export default class DomManager { if (startElement && endElement) { return { start: startElement.path, - startOffset: start - startElement.start, + startOffset: startElement.getRelativeOffsetByGlobal(start), end: endElement.path, - endOffset: end - endElement.start, + endOffset: endElement.getRelativeOffsetByGlobal(end), }; } @@ -754,7 +830,14 @@ export default class DomManager { return undefined; } - return [range.startOffset + startEl.start, range.endOffset + endEl.start]; + const startBlock = this.domData.findTextBlock(startEl.start, "end") as DDDynamicBlock; + // It really should be "end" and not "start" as we are looking for the exact container by the start position + const endBlock = this.domData.findTextBlock(endEl.start, "end") as DDDynamicBlock; + + return [ + startBlock.getGlobalOffsetByRelative(range.startOffset), + endBlock.getGlobalOffsetByRelative(range.endOffset), + ]; } getText(start: number, end: number) { diff --git a/web/libs/editor/src/tags/object/RichText/model.js b/web/libs/editor/src/tags/object/RichText/model.js index 95bebc95c6a4..87653e3ac7d0 100644 --- a/web/libs/editor/src/tags/object/RichText/model.js +++ b/web/libs/editor/src/tags/object/RichText/model.js @@ -337,30 +337,74 @@ const Model = types domManager?.removeStyles(ids); }, + /** + * Converts global offsets to relative offsets. + * + * @param {Object} start - The start global offset in codepoints. + * @param {Object} end - The end global offset in codepoints. + * @returns {undefined|{start: string, startOffset: number, end: string, endOffset: number}} - The relative offsets. 
+ */ globalOffsetsToRelativeOffsets({ start, end }) { return domManager.globalOffsetsToRelativeOffsets(start, end); }, + /** + * Calculates relative offsets to global offsets for a given range in the document. + * + * @param {Node} start - The starting node of the range. + * @param {number} startOffset - The offset within the starting node. + * @param {Node} end - The ending node of the range. + * @param {number} endOffset - The offset within the ending node. + * @return {undefined|[number,number]} - An array containing the calculated global offsets in codepoints in the form [startGlobalOffset, endGlobalOffset]. + */ relativeOffsetsToGlobalOffsets(start, startOffset, end, endOffset) { return domManager.relativeOffsetsToGlobalOffsets(start, startOffset, end, endOffset); }, + /** + * Converts the given range to its global offset. + * + * @param {Range} range - The range to convert. + * @returns {[number, number]|undefined} - The global offsets of the range. + */ rangeToGlobalOffset(range) { return domManager.rangeToGlobalOffset(range); }, - createRangeByGlobalOffsets({ start, end }) { - return domManager.createRange(start, end); - }, - + /** + * Creates spans in the DOM for a given range of global offsets. + * + * @param {Object} offsets - The start and end offsets of the range. + * @param {number} offsets.start - The starting offset in codepoints. + * @param {number} offsets.end - The ending offset in codepoints. + * + * @returns {Array} - An array of DOM spans created for the range. + */ createSpansByGlobalOffsets({ start, end }) { return domManager.createSpans(start, end); }, + /** + * Removes spans from the given array based on the provided start and end global offsets. + * + * @param {Array} spans - The array of spans to be modified. + * @param {Object} offsets - The start and end global offsets. + * @param {number} offsets.start - The start global offset in codepoints. + * @param {number} offsets.end - The end global offset in codepoints. 
+ * @returns {void} - Nothing is returned. + */ removeSpansInGlobalOffsets(spans, { start, end }) { return domManager?.removeSpans(spans, start, end); }, + /** + * Get text content at the position set by global offsets. + * + * @param {Object} offsets - The start and end global offsets. + * @param {number} offsets.start - The start global offset in codepoints. + * @param {number} offsets.end - The end global offset in codepoints. + * @returns {string} - The text content between the start and end offsets. + */ getTextFromGlobalOffsets({ start, end }) { return domManager.getText(start, end); }, diff --git a/web/libs/editor/tests/integration/data/ner/emoji.ts b/web/libs/editor/tests/integration/data/ner/emoji.ts new file mode 100644 index 000000000000..e41da05f4f4d --- /dev/null +++ b/web/libs/editor/tests/integration/data/ner/emoji.ts @@ -0,0 +1,27 @@ +export const simpleTextConfig = ` + + + +`; + +export const simpleHyperTextConfig = ` + + + +`; + +export const simpleTextData = { + // It should be some warning emoji but biome hates them + text: "🐱 Warning: This is a test text", +}; + +export const multilineTextData = { + // It should be some warning emoji but biome hates them + text: "🐱 Warning:\n🐱 This is a test text", +}; + +export const simpleHyperTextData = { + text: "

🐱 Warning:

🐱 This is a test text

", +}; diff --git a/web/libs/editor/tests/integration/e2e/ner/emoji.cy.ts b/web/libs/editor/tests/integration/e2e/ner/emoji.cy.ts new file mode 100644 index 000000000000..9a7511d38143 --- /dev/null +++ b/web/libs/editor/tests/integration/e2e/ner/emoji.cy.ts @@ -0,0 +1,338 @@ +import { Labels, LabelStudio, Sidebar } from "@humansignal/frontend-test/helpers/LSF"; +import { RichText } from "@humansignal/frontend-test/helpers/LSF/RichText"; +import { FF_LSDV_4620_3 } from "../../../../src/utils/feature-flags"; +import { + multilineTextData, + simpleHyperTextConfig, + simpleHyperTextData, + simpleTextConfig, + simpleTextData, +} from "../../data/ner/emoji"; + +describe("NER - Emoji - Text", () => { + const refTextResultValue = { + start: 21, + end: 25, + text: "test", + }; + + it("Should calculate offsets by code points in text (previous version)", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: false, + }); + LabelStudio.params().config(simpleTextConfig).data(simpleTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectText("test"); + RichText.hasRegionWithText("test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refTextResultValue.start); + expect(resultValue.end).to.eq(refTextResultValue.end); + expect(resultValue.text).to.eq(refTextResultValue.text); + + LabelStudio.params().config(simpleTextConfig).data(simpleTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + RichText.hasRegionWithText("test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refTextResultValue.start); + expect(resultValue.end).to.eq(refTextResultValue.end); + expect(resultValue.text).to.eq(refTextResultValue.text); + }); + }); + }); + + it("Should calculate offsets by code points in text", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + 
[FF_LSDV_4620_3]: true, + }); + LabelStudio.params().config(simpleTextConfig).data(simpleTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectText("test"); + RichText.hasRegionWithText("test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refTextResultValue.start); + expect(resultValue.end).to.eq(refTextResultValue.end); + expect(resultValue.text).to.eq(refTextResultValue.text); + + LabelStudio.params().config(simpleTextConfig).data(simpleTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + RichText.hasRegionWithText("test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refTextResultValue.start); + expect(resultValue.end).to.eq(refTextResultValue.end); + expect(resultValue.text).to.eq(refTextResultValue.text); + }); + }); + }); + + const refMultilineTextResultValue = { + start: 2, + end: 27, + text: "Warning:\\n🐱 This is a test", + }; + + it("Should calculate offsets by code points in multiline text (previous version)", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: false, + }); + LabelStudio.params().config(simpleTextConfig).data(multilineTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectBetweenTexts("Warning", "test"); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refMultilineTextResultValue.start); + expect(resultValue.end).to.eq(refMultilineTextResultValue.end); + expect(resultValue.text).to.eq(refMultilineTextResultValue.text); + + LabelStudio.params().config(simpleTextConfig).data(multilineTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + 
RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refMultilineTextResultValue.start); + expect(resultValue.end).to.eq(refMultilineTextResultValue.end); + expect(resultValue.text).to.eq(refMultilineTextResultValue.text); + }); + }); + }); + + it("Should calculate offsets by code points in multiline text", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: true, + }); + LabelStudio.params().config(simpleTextConfig).data(multilineTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectBetweenTexts("Warning", "test"); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refMultilineTextResultValue.start); + expect(resultValue.end).to.eq(refMultilineTextResultValue.end); + expect(resultValue.text).to.eq(refMultilineTextResultValue.text); + + LabelStudio.params().config(simpleTextConfig).data(multilineTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refMultilineTextResultValue.start); + expect(resultValue.end).to.eq(refMultilineTextResultValue.end); + expect(resultValue.text).to.eq(refMultilineTextResultValue.text); + }); + }); + }); + + const refHyperTextResultValue = { + start: "/article[1]/p[1]/text()[1]", + end: "/article[1]/p[1]/text()[1]", + text: "test", + globalOffsets: { + start: 23, + end: 27, + }, + startOffset: 13, + endOffset: 17, + }; + + it("Should calculate global offsets by code points and relative offsets by string length in 
hypertext (previous version)", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: false, + }); + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectText("test"); + RichText.hasRegionWithText("test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refHyperTextResultValue.start); + expect(resultValue.end).to.eq(refHyperTextResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextResultValue.text); + + // This functionality is broken but it also is considered as outdated + // LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult(results).init(); + // LabelStudio.waitForObjectsReady(); + // RichText.hasRegionWithText("test"); + // + // LabelStudio.serialize().then((results) => { + // const resultValue = results[0].value; + // expect(resultValue.start).to.eq(refHyperTextResultValue.start); + // expect(resultValue.end).to.eq(refHyperTextResultValue.end); + // expect(resultValue.globalOffsets.start).to.eq(refHyperTextResultValue.globalOffsets.start); + // expect(resultValue.globalOffsets.end).to.eq(refHyperTextResultValue.globalOffsets.end); + // expect(resultValue.startOffset).to.eq(refHyperTextResultValue.startOffset); + // expect(resultValue.endOffset).to.eq(refHyperTextResultValue.endOffset); + // expect(resultValue.text).to.eq(refHyperTextResultValue.text); + // }); + }); + }); + + it("Should calculate global offsets by code points and relative offsets by string length in 
hypertext", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: true, + }); + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectText("test"); + RichText.hasRegionWithText("test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refHyperTextResultValue.start); + expect(resultValue.end).to.eq(refHyperTextResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextResultValue.text); + + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + RichText.hasRegionWithText("test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refHyperTextResultValue.start); + expect(resultValue.end).to.eq(refHyperTextResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextResultValue.text); + }); + }); + }); + + const refHyperTextMultilineResultValue = { + start: "/article[1]/h2[1]/text()[1]", + end: "/article[1]/p[1]/text()[1]", + text: "Warning:\\n🐱 This is a test", + globalOffsets: { + // this is offset in codepoints ("🐱" + " " = 2 
codepoints) + start: 2, + end: 27, + }, + // this is offset in in-browser characters ("🐱" is 2 characters + " " = 3) + startOffset: 3, + endOffset: 17, + }; + + /* Legacy mode (FF_LSDV_4620_3 is false): selects from Warning to test, checks that highlights for both text fragments exist and compares the first serialized result with refHyperTextMultilineResultValue. The reload round trip is commented out below. */ it("Should calculate global offsets by code points and relative offsets by string length in multiline hypertext (previous version)", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: false, + }); + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectBetweenTexts("Warning", "test"); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refHyperTextMultilineResultValue.start); + expect(resultValue.end).to.eq(refHyperTextMultilineResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextMultilineResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextMultilineResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextMultilineResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextMultilineResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextMultilineResultValue.text); + + // This functionality is broken but it also is considered as outdated + // LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult(results).init(); + // LabelStudio.waitForObjectsReady(); + // RichText.hasRegionWithText("Warning:"); + // RichText.hasRegionWithText("🐱 This is a test"); + // + // LabelStudio.serialize().then((results) => { + // const resultValue = results[0].value; + // expect(resultValue.start).to.eq(refHyperTextMultilineResultValue.start); + // expect(resultValue.end).to.eq(refHyperTextMultilineResultValue.end); + // 
expect(resultValue.globalOffsets.start).to.eq(refHyperTextMultilineResultValue.globalOffsets.start); + // expect(resultValue.globalOffsets.end).to.eq(refHyperTextMultilineResultValue.globalOffsets.end); + // expect(resultValue.startOffset).to.eq(refHyperTextMultilineResultValue.startOffset); + // expect(resultValue.endOffset).to.eq(refHyperTextMultilineResultValue.endOffset); + // expect(resultValue.text).to.eq(refHyperTextMultilineResultValue.text); + // }); + }); + }); + + /* Same multiline selection with FF_LSDV_4620_3 set to true; additionally re-initializes with the serialized results and asserts that the highlights and the serialized values are unchanged. */ it("Should calculate global offsets by code points and relative offsets by string length in multiline hypertext", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: true, + }); + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectBetweenTexts("Warning", "test"); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq(refHyperTextMultilineResultValue.start); + expect(resultValue.end).to.eq(refHyperTextMultilineResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextMultilineResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextMultilineResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextMultilineResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextMultilineResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextMultilineResultValue.text); + + LabelStudio.params().config(simpleHyperTextConfig).data(simpleHyperTextData).withResult(results).init(); + LabelStudio.waitForObjectsReady(); + RichText.hasRegionWithText("Warning:"); + RichText.hasRegionWithText("🐱 This is a test"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + 
expect(resultValue.start).to.eq(refHyperTextMultilineResultValue.start); + expect(resultValue.end).to.eq(refHyperTextMultilineResultValue.end); + expect(resultValue.globalOffsets.start).to.eq(refHyperTextMultilineResultValue.globalOffsets.start); + expect(resultValue.globalOffsets.end).to.eq(refHyperTextMultilineResultValue.globalOffsets.end); + expect(resultValue.startOffset).to.eq(refHyperTextMultilineResultValue.startOffset); + expect(resultValue.endOffset).to.eq(refHyperTextMultilineResultValue.endOffset); + expect(resultValue.text).to.eq(refHyperTextMultilineResultValue.text); + }); + }); + }); + + /* Selects the word means in text that follows the cat emoji and \n; expects globalOffsets 2..7 but startOffset and endOffset 3..8, both anchored at /p[1]/text()[1]. */ it("Heuristic edge case", () => { + LabelStudio.addFeatureFlagsOnPageLoad({ + [FF_LSDV_4620_3]: true, + }); + LabelStudio.params().config(simpleHyperTextConfig).data({ text: "

🐱\nmeans cat

" }).withResult([]).init(); + LabelStudio.waitForObjectsReady(); + Labels.select("region"); + RichText.selectText("means"); + RichText.hasRegionWithText("means"); + + LabelStudio.serialize().then((results) => { + const resultValue = results[0].value; + expect(resultValue.start).to.eq("/p[1]/text()[1]"); + expect(resultValue.end).to.eq("/p[1]/text()[1]"); + expect(resultValue.globalOffsets.start).to.eq(2); + expect(resultValue.globalOffsets.end).to.eq(7); + expect(resultValue.startOffset).to.eq(3); + expect(resultValue.endOffset).to.eq(8); + expect(resultValue.text).to.eq("means"); + }); + }); +}); diff --git a/web/libs/frontend-test/src/helpers/LSF/LabelStudio.ts b/web/libs/frontend-test/src/helpers/LSF/LabelStudio.ts index 6f7b3ff660fc..f3d9f0157ae4 100644 --- a/web/libs/frontend-test/src/helpers/LSF/LabelStudio.ts +++ b/web/libs/frontend-test/src/helpers/LSF/LabelStudio.ts @@ -128,7 +128,7 @@ class LSParamsBuilder { export const LabelStudio = { /** - * Initializes LabelStudio intance with given configuration + * Initializes LabelStudio instance with given configuration */ init(params: LSParams) { cy.log("Initialize LSF"); diff --git a/web/libs/frontend-test/src/helpers/LSF/RichText.ts b/web/libs/frontend-test/src/helpers/LSF/RichText.ts new file mode 100644 index 000000000000..3189a7542f72 --- /dev/null +++ b/web/libs/frontend-test/src/helpers/LSF/RichText.ts @@ -0,0 +1,97 @@ +class RichTextHelper { + private get _baseRootSelector() { + return ".lsf-htx-richtext"; + } + + private _rootSelector: string; + + constructor(rootSelector) { + this._rootSelector = rootSelector.replace(/^\&/, this._baseRootSelector); + } + + get root() { + return cy.get(this._rootSelector); + } + + get content() { + return this.root.then(($el) => { + if ($el[0].tagName === "IFRAME") { + return cy.wrap($el[0].contentDocument.body); + } + }); + } + + _selectRange(range: Range) { + const el: HTMLElement = ( + range.commonAncestorContainer.nodeType === Node.TEXT_NODE + ? 
range.commonAncestorContainer.parentElement + : range.commonAncestorContainer + ) as HTMLElement; + const elRect = el.getBoundingClientRect(); + /* Clones of the range are collapsed onto its start and end boundaries so that their rects give pointer coordinates relative to el. */ const startEdgeRange = range.cloneRange(); + startEdgeRange.setEnd(range.startContainer, range.startOffset); + const endEdgeRange = range.cloneRange(); + endEdgeRange.setStart(range.endContainer, range.endOffset); + const startRect = startEdgeRange.getBoundingClientRect(); + const endRect = endEdgeRange.getBoundingClientRect(); + const x = startRect.left - elRect.left; + const y = startRect.top - elRect.top; + const x2 = endRect.right - elRect.left; + const y2 = endRect.bottom - elRect.top; + const eventOptions = { + eventConstructor: "MouseEvent", + buttons: 1, + }; + /* Simulated drag: mousedown at the start point, mousemove to the end point, then the document selection is replaced with range before mouseup is triggered. */ cy.wrap(el) + .trigger("mousedown", x, y, eventOptions) + .trigger("mousemove", x2, y2, eventOptions) + .then(() => { + const document = el.ownerDocument; + const selection = document.getSelection(); + selection.removeAllRanges(); + selection.addRange(range); + }) + .trigger("mouseup", x2, y2, eventOptions); + } + + /** Locates the element containing text through content.contains, builds a Range over the first occurrence of text and passes it to _selectRange. NOTE(review): the offset is computed on el.textContent but applied to el.childNodes[0]; this presumably assumes the first child node is the text node holding the text - confirm. */ selectText(text) { + return this.content.contains(text).then(($el) => { + const el = $el[0]; + const textElement = el.childNodes[0]; + const startOffset = el.textContent.indexOf(text); + const endOffset = startOffset + text.length; + const document = el.ownerDocument; + const range = document.createRange(); + range.setStart(textElement, startOffset); + range.setEnd(textElement, endOffset); + this._selectRange(range); + }); + } + /** Builds a Range that starts at the first occurrence of startText in its element and ends right after endText in its element, then passes it to _selectRange. NOTE(review): as in selectText, offsets come from textContent but are applied to childNodes[0] of each element - confirm this holds for the markup under test. */ selectBetweenTexts(startText, endText) { + return this.content.contains(startText).then(($elA) => { + this.content.contains(endText).then(($elB) => { + const elA = $elA[0]; + const elB = $elB[0]; + const textElementA = elA.childNodes[0]; + const textElementB = elB.childNodes[0]; + const startOffset = elA.textContent.indexOf(startText); + const endOffset = elB.textContent.indexOf(endText) + endText.length; + const document = elA.ownerDocument; + const range = document.createRange(); + range.setStart(textElementA, 
startOffset); + range.setEnd(textElementB, endOffset); + this._selectRange(range); + }); + }); + } + /** Asserts that an element matching .htx-highlight and containing text exists inside content. */ hasRegionWithText(text) { + this.content.find(".htx-highlight").contains(text).should("exist"); + } +} + +/** Default helper instance: the constructor replaces the leading & with the base selector, so this one resolves to .lsf-htx-richtext:eq(0). */ const RichText = new RichTextHelper("&:eq(0)"); +/** Creates a helper bound to a caller-supplied root selector; a leading & is expanded to the base selector in the same way. */ const useRichText = (rootSelector: string) => { + return new RichTextHelper(rootSelector); +}; + +export { RichText, useRichText };