Skip to content

Commit 29f1441

Browse files
author
yihao03
committed
Refactor XML parsing logic to use strict mode and enhance text escaping
1 parent 9f21e76 commit 29f1441

File tree

1 file changed

+32
-13
lines changed

1 file changed

+32
-13
lines changed

i18n/controllers/recurTranslate.ts

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import dotenv from "dotenv";
66
import sax from "sax";
77
import { Readable } from "stream";
88
import { fileURLToPath } from "url";
9+
import { strict } from "assert";
910

1011
dotenv.config();
1112

@@ -21,13 +22,15 @@ const ai = new OpenAI({
2122

2223
const MAXLEN = 5000;
2324

25+
/**
 * Creates a SAX stream parser in strict mode.
 *
 * `sax.createStream` accepts exactly two arguments: the strict flag and a
 * single options object. `strictEntities` therefore has to live in the same
 * object as `trim`; passed as a separate third argument it is ignored.
 *
 * Options used:
 * - `trim: false` keeps whitespace in text nodes untouched.
 * - `strictEntities: true` restricts entity handling to the predefined XML
 *   entities.
 */
const createParser = () =>
  (sax as any).createStream(true, { trim: false, strictEntities: true });
26+
2427
async function translate(language: string, filePath: string): Promise<void> {
2528
try {
2629
// Pipe the XML file into the parser.
2730
const input_dir = fileURLToPath(
2831
import.meta.resolve("../../xml" + filePath)
2932
);
30-
console.log("Translating file: " + input_dir);
33+
3134
const translated: string = await recursivelyTranslate(language, input_dir);
3235

3336
const output_path = fileURLToPath(
@@ -52,19 +55,15 @@ async function recursivelyTranslate(
5255
): Promise<string> {
5356
// Recursive function to split and translate
5457
async function helper(ori: string, force: boolean): Promise<string> {
55-
ori = escapeXML(ori);
56-
5758
if (ori.length < MAXLEN && !force) {
58-
console.log("Translating chunk: " + ori.substring(0, 50) + "...");
5959
return await translateChunk(ori); // translate the chunk
6060
}
6161

62-
console.log("Chunk too large, splitting...");
6362
let subTranslated = "";
6463
// continue splitting the chunk
6564
// Create a SAX parser in strict mode to split source into chunks.
6665
await new Promise<void>((resolve, reject) => {
67-
const subParser = (sax as any).createStream(true, { trim: false });
66+
const subParser = createParser();
6867

6968
let subCurrentDepth = 0;
7069
let subCurrentSegment = "";
@@ -87,12 +86,22 @@ async function recursivelyTranslate(
8786
});
8887

8988
subParser.on("text", text => {
89+
text = strongEscapeXML(text);
9090
if (subIsRecording) {
91-
subCurrentSegment += `${text}`;
91+
subCurrentSegment += text;
9292
} else {
93-
if (subSegments.length > 0 && subSegments[subSegments.length - 1][1] != undefined) {
93+
if (
94+
subSegments.length > 0 &&
95+
subSegments[subSegments.length - 1][1] != undefined
96+
) {
9497
subSegments[subSegments.length - 1][1] += text;
9598
subSegments[subSegments.length - 1][0] = true;
99+
100+
// if (text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n") {
101+
// subSegments.push([false, text]);
102+
// } else {
103+
// subSegments.push([true, text]);
104+
// }
96105
} else {
97106
subSegments.push([true, text]);
98107
}
@@ -141,7 +150,6 @@ async function recursivelyTranslate(
141150
subTranslated += segment[1];
142151
}
143152
}
144-
console.log(`Completed chunk translation, continuing...`);
145153
resolve();
146154
});
147155

@@ -154,7 +162,7 @@ async function recursivelyTranslate(
154162
}
155163

156164
// Create a SAX parser in strict mode to split source into chunks.
157-
const parser = (sax as any).createStream(true, { trim: false });
165+
const parser = createParser();
158166

159167
// const assistant = await createAssistant(language, ai);
160168
const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
@@ -191,8 +199,9 @@ async function recursivelyTranslate(
191199
});
192200

193201
parser.on("text", text => {
202+
text = strongEscapeXML(text);
194203
if (isRecording) {
195-
currentSegment += `${text}`;
204+
currentSegment += text;
196205
} else {
197206
segments.push([false, text]);
198207
}
@@ -287,18 +296,19 @@ async function recursivelyTranslate(
287296
const text = messageContent.text;
288297

289298
const safeText = escapeXML(text.value);
299+
console.log(safeText);
290300
const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");
291301

292302
await new Promise<void>((resolve, reject) => {
293303
// Create a SAX parser in strict mode for cleaning up translations.
294-
const clean = (sax as any).createStream(true, { trim: false });
304+
const clean = createParser();
295305

296306
// SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
297307
let currDepth = -1;
298308

299309
clean.on("text", text => {
300310
if (currDepth >= 1) {
301-
translatedChunk += escapeXML(text);
311+
translatedChunk += strongEscapeXML(text);
302312
}
303313
});
304314

@@ -368,3 +378,12 @@ function formatAttributes(attrs) {
368378
/**
 * Escapes bare ampersands so the string can be embedded in XML.
 *
 * An `&` is left untouched when it already starts a well-formed reference:
 * one of the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&apos;`,
 * `&quot;`) or a numeric character reference (`&#160;`, `&#x4E2D;`).
 * Every other `&` becomes `&amp;`. No other character is modified.
 *
 * @param str Text that may contain unescaped ampersands.
 * @returns The text with every bare `&` replaced by `&amp;`.
 */
function escapeXML(str: string): string {
  return str.replace(
    /&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)/g,
    "&amp;"
  );
}
381+
382+
/**
 * Escapes all five XML special characters in a piece of text.
 *
 * `<`, `>`, `"` and `'` are always replaced by their predefined entities.
 * An `&` is replaced by `&amp;` unless it already begins one of the five
 * predefined entities (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`), which
 * are left as they are so already-escaped text is not escaped twice.
 *
 * The work is done in a single pass over the string instead of one pass per
 * character class; the output is the same as applying the five replacements
 * one after another.
 *
 * @param str Text to escape.
 * @returns The text with XML special characters replaced by entities.
 */
function strongEscapeXML(str: string): string {
  const entityFor: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  return str.replace(
    /&(?!(?:amp;|lt;|gt;|apos;|quot;))|[<>"']/g,
    ch => entityFor[ch] ?? ch
  );
}

0 commit comments

Comments
 (0)