Skip to content

Commit 098de27

Browse files
author
yihao03
committed
Attempt to create a translate function that recursively splits the text into manageable chunks
1 parent 7e793de commit 098de27

File tree

1 file changed

+362
-0
lines changed

1 file changed

+362
-0
lines changed

i18n/controllers/translate copy.ts

Lines changed: 362 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,362 @@
1+
import fs from "fs";
2+
import OpenAI from "openai";
3+
import path from "path";
4+
import createAssistant from "../initializers/initialize";
5+
import dotenv from "dotenv";
6+
import sax from "sax";
7+
import { Readable } from "stream";
8+
import { fileURLToPath } from "url";
9+
10+
// Pull settings from a local .env file into process.env before anything reads them.
dotenv.config();

// The model name and the API key are both mandatory; refuse to start without them.
if (
  [process.env.AI_MODEL, process.env.API_KEY].some(
    value => value === undefined
  )
) {
  throw Error("Please specify AI_MODEL and API_KEY!");
}

// Shared OpenAI client. AI_BASEURL is optional and is passed through as-is.
const ai = new OpenAI({
  baseURL: process.env.AI_BASEURL,
  apiKey: process.env.API_KEY
});

// Chunks shorter than this many characters are sent to the model in one piece;
// anything longer is split further first.
const MAXLEN = 2000;
23+
24+
/**
 * Translates one XML file and writes the result to the mirrored location in
 * the `xml_cn` tree.
 *
 * The source is resolved as `../../xml<filePath>` and the destination as
 * `../../xml_cn<filePath>`, both relative to this module. Missing destination
 * directories are created. Any failure is logged and swallowed; nothing is
 * rethrown to the caller.
 *
 * @param language Target language handed to the translator.
 * @param filePath Path of the file inside the `xml` tree, appended verbatim
 *                 to the tree root.
 */
async function translate(language: string, filePath: string): Promise<void> {
  try {
    // Locate the source document relative to this module.
    const sourceUrl = import.meta.resolve("../../xml" + filePath);
    const sourceFile = fileURLToPath(sourceUrl);
    console.log(sourceFile);

    const translated: string = await recursivelyTranslate(language, sourceFile);

    // Mirror the same relative path under the output tree.
    const targetUrl = import.meta.resolve("../../xml_cn" + filePath);
    const targetFile = fileURLToPath(targetUrl);

    // The target directory may not exist yet.
    fs.mkdirSync(path.dirname(targetFile), { recursive: true });

    fs.writeFileSync(targetFile, translated);
    console.log(`Translation saved to ${targetFile}`);
  } catch (failure) {
    console.error("Error parsing XML:", failure);
  }
}
47+
48+
// TODO: change the toTranslate to a file path, read the file and translate the content
49+
/**
 * Translates the XML file at `path` into `language` and returns the
 * translated document as a string.
 *
 * The document is split into the direct children of its root element. A child
 * shorter than MAXLEN characters is sent to the assistant as one chunk; a
 * longer child is re-parsed and split again, one nesting level at a time.
 * Markup outside those children (root tags, text and comments directly under
 * the root) is copied through untranslated.
 *
 * All chunks share one assistant thread and are sent strictly one after
 * another.
 *
 * @param language Target language, interpolated into the prompt.
 * @param path     File-system path of the XML file to translate.
 * @returns The translated XML. If reading or splitting fails, whatever was
 *          produced so far followed by an error comment.
 */
async function recursivelyTranslate(
  language: string,
  path: string
): Promise<string> {
  // const assistant = await createAssistant(language, ai);
  const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
  const thread = await ai.beta.threads.create();
  let translated = "";

  try {
    console.log("Translating " + path + " at " + thread.id);

    // Splitting finishes before any translation starts, so a failure while
    // translating surfaces here instead of being lost inside a stream event
    // handler (which previously left the promise pending forever).
    const segments = await collectSegments(fs.createReadStream(path));
    for (const [translatable, xml] of segments) {
      translated += translatable ? await helper(xml, false) : xml;
    }
    console.log(`Done translating all segments.`);

    return translated;
  } catch (parseErr) {
    console.error("Error parsing XML:", parseErr);
    return translated + "<!-- Error parsing this section -->";
  }

  /**
   * Translates one well-formed XML fragment, splitting it further when it is
   * too long.
   *
   * NOTE(review): when a fragment is split, text that sits directly inside
   * its outermost element (rather than inside a child element) is copied
   * through untranslated.
   *
   * @param ori   XML fragment with a single root element.
   * @param force When true, split even if the fragment is shorter than
   *              MAXLEN. Every call in this file passes false.
   */
  async function helper(ori: string, force: boolean): Promise<string> {
    if (ori.length < MAXLEN && !force) {
      return await translateChunk(ori); // small enough: translate in one go
    }

    // Too long: split one level deeper and translate each child in order.
    let subTranslated = "";
    const subSegments = await collectSegments(Readable.from(ori));
    for (const [translatable, xml] of subSegments) {
      subTranslated += translatable ? await helper(xml, false) : xml;
    }
    console.log(`Done translating all segments.`);

    return subTranslated;
  }

  /**
   * Sends one chunk to the assistant and returns the cleaned-up reply.
   *
   * The reply is wrapped in a synthetic <WRAPPER> element and re-parsed so
   * that anything the model wrote outside the XML is dropped. On any failure
   * the untranslated source chunk is returned followed by an error comment,
   * so the output stays well-formed and no content is lost or repeated.
   */
  async function translateChunk(chunk: string): Promise<string> {
    try {
      await ai.beta.threads.messages.create(thread.id, {
        role: "user",
        content: `Translate this content to ${language}.
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
If a term exists in the reference file, use that translation without deviation.
Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
Content to translate:
${chunk}`
      });
      const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
        assistant_id: assistant_id
      });

      const messages = await ai.beta.threads.messages.list(thread.id, {
        run_id: run.id
      });
      const message = messages.data.pop();
      const messageContent = message?.content[0];

      if (messageContent === undefined) {
        throw new Error(
          `Run ${run.id} ended with status ${run.status} and produced no message content`
        );
      }
      if (messageContent.type !== "text") {
        throw new Error(
          `Unexpected message content type: ${messageContent.type}`
        );
      }

      // Escape stray ampersands so the reply has a chance of parsing.
      const safeText = escapeXML(messageContent.text.value);

      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the file uses sax through an untyped handle
      const clean = (sax as any).createStream(true, { trim: false });
      let translatedChunk = "";
      // -1 before <WRAPPER>, 0 inside it, >= 1 inside the translated markup.
      let currDepth = -1;

      clean.on("text", (text: string) => {
        // Text directly inside <WRAPPER> is model chatter; keep only text
        // that lives inside real elements. The parser has already decoded
        // entities, so escape again before writing the text back out.
        if (currDepth >= 1) {
          translatedChunk += escapeTextNode(text);
        }
      });

      clean.on(
        "opentag",
        (node: { name: string; attributes: Record<string, string> }) => {
          currDepth++;
          if (node.name != "WRAPPER") {
            translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
          }
        }
      );

      clean.on("closetag", (tagName: string) => {
        if (tagName != "WRAPPER") {
          translatedChunk += `</${tagName}>`;
        }
        currDepth--;
      });

      clean.on("cdata", (cdata: string) => {
        translatedChunk += `<![CDATA[${cdata}]]>`;
      });

      clean.on("comment", (comment: string) => {
        // The comment text already carries its own surrounding whitespace.
        translatedChunk += `<!--${comment}-->`;
      });

      await new Promise<void>((resolve, reject) => {
        clean.once("end", resolve);
        clean.once("error", (error: Error) => {
          console.log(
            "error encountered when validating XML: " +
              error +
              "\nvalidating section: " +
              chunk.substring(0, 100) +
              "..."
          );
          reject(error);
        });
        Readable.from("<WRAPPER>" + safeText + "</WRAPPER>").pipe(clean);
      });

      return translatedChunk;
    } catch (err) {
      console.log(`Error occured while translating ${path}:\n ${err}`);
      // Fall back to the source chunk. The previous code returned the whole
      // document-level accumulator here, duplicating prior output.
      return chunk + "<!-- Error translating this section -->";
    }
  }
}

/**
 * Parses an XML stream in strict mode and splits it into ordered segments.
 *
 * Each segment is a `[translatable, xml]` pair. Every direct child element of
 * the root (depth 2) becomes one translatable segment containing its full
 * serialized subtree. Everything else (the root's own tags, and text, CDATA
 * and comments outside those children) becomes a non-translatable segment
 * that callers copy through unchanged. Processing instructions and doctype
 * declarations are not re-emitted.
 *
 * @param source Stream yielding the XML text.
 * @returns The segments in document order; rejects on a read or parse error.
 */
function collectSegments(source: Readable): Promise<[boolean, string][]> {
  return new Promise<[boolean, string][]>((resolve, reject) => {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the file uses sax through an untyped handle
    const parser = (sax as any).createStream(true, { trim: false });

    const segments: [boolean, string][] = [];
    // Depth 0: before the root, 1: inside the root, 2: inside a segment element.
    let depth = 0;
    let current = "";
    let recording = false;

    // Append to the segment being recorded, or pass through as-is.
    const emit = (xml: string): void => {
      if (recording) {
        current += xml;
      } else {
        segments.push([false, xml]);
      }
    };

    parser.on(
      "opentag",
      (node: { name: string; attributes: Record<string, string> }) => {
        depth++;
        if (depth === 2) {
          recording = true; // a direct child of the root starts a new segment
        }
        emit(`<${node.name}${formatAttributes(node.attributes)}>`);
      }
    );

    // The parser delivers text with entities decoded; re-escape it so the
    // serialized segment is well-formed again.
    parser.on("text", (text: string) => emit(escapeTextNode(text)));

    parser.on("cdata", (cdata: string) => emit(`<![CDATA[${cdata}]]>`));

    // The comment text already carries its own surrounding whitespace.
    parser.on("comment", (comment: string) => emit(`<!--${comment}-->`));

    parser.on("closetag", (tagName: string) => {
      if (recording) {
        current += `</${tagName}>`;
      }

      if (depth === 2) {
        // Closing a segment element: hand it over for translation.
        segments.push([true, current]);
        current = "";
        recording = false;
      }

      if (depth === 1) {
        // Closing the root element.
        segments.push([false, `</${tagName}>`]);
      }

      depth--;
    });

    parser.on("end", () => resolve(segments));
    parser.on("error", reject);
    // Without this, a missing or unreadable file is an unhandled stream error.
    source.on("error", reject);

    source.pipe(parser);
  });
}

/** Escapes the characters that may not appear literally in XML text content. */
function escapeTextNode(text: string): string {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}
349+
350+
export default translate;
351+
352+
// Helper function to format attributes into a string.
353+
/**
 * Serializes an attribute map into the text that follows a tag name.
 *
 * Values are escaped before being written. The SAX parser supplies attribute
 * values with entities already decoded, so writing them back verbatim would
 * produce malformed XML whenever a value contains `&`, `<`, `>` or `"`.
 *
 * @param attrs Attribute name → value map, as found on a parsed tag.
 * @returns `""` when there are no attributes; otherwise a leading space
 *          followed by space-separated `name="value"` pairs in insertion order.
 */
function formatAttributes(attrs: Record<string, unknown>): string {
  const escapeValue = (value: unknown): string =>
    String(value)
      .replace(/&/g, "&amp;") // must run first so later escapes are not re-escaped
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");

  const attrStr = Object.entries(attrs)
    .map(([key, val]) => `${key}="${escapeValue(val)}"`)
    .join(" ");
  return attrStr ? " " + attrStr : "";
}
359+
360+
/**
 * Escapes bare ampersands in text that is expected to be mostly valid XML.
 *
 * An `&` is left alone when it begins one of the five predefined entities
 * (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`) or a numeric character
 * reference (`&#160;`, `&#xA0;`). Every other `&` becomes `&amp;`. No other
 * character is touched, so existing markup passes through unchanged.
 *
 * @param str Text that may contain unescaped ampersands.
 * @returns The text with stray ampersands escaped.
 */
function escapeXML(str: string): string {
  return str.replace(
    /&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)/g,
    "&amp;"
  );
}

0 commit comments

Comments
 (0)