Skip to content

Commit 4922e98

Browse files
author
yihao03
committed
Update .gitignore and refactor translation logic to use recurTranslate function
1 parent 10856a1 commit 4922e98

File tree

6 files changed: 629 additions and 71 deletions.

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@ test_node_env/node_modules
1212
.DS_Store
1313
.env
1414
*.icloud
15+
16+
/xml_*

i18n/controllers/translate copy.ts renamed to i18n/controllers/recurTranslate.ts

Lines changed: 71 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ const ai = new OpenAI({
1919
baseURL: process.env.AI_BASEURL
2020
});
2121

22-
const MAXLEN = 2000;
22+
const MAXLEN = 5000;
2323

2424
async function translate(language: string, filePath: string): Promise<void> {
2525
try {
2626
// Pipe the XML file into the parser.
2727
const input_dir = fileURLToPath(
2828
import.meta.resolve("../../xml" + filePath)
2929
);
30-
console.log(input_dir);
30+
console.log("Translating file: " + input_dir);
3131
const translated: string = await recursivelyTranslate(language, input_dir);
3232

3333
const output_path = fileURLToPath(
@@ -52,10 +52,14 @@ async function recursivelyTranslate(
5252
): Promise<string> {
5353
// Recursive function to split and translate
5454
async function helper(ori: string, force: boolean): Promise<string> {
55+
ori = escapeXML(ori);
56+
5557
if (ori.length < MAXLEN && !force) {
58+
console.log("Translating chunk: " + ori.substring(0, 50) + "...");
5659
return await translateChunk(ori); // translate the chunk
5760
}
5861

62+
console.log("Chunk too large, splitting...");
5963
let subTranslated = "";
6064
// continue splitting the chunk
6165
// Create a SAX parser in strict mode to split source into chunks.
@@ -86,7 +90,11 @@ async function recursivelyTranslate(
8690
if (subIsRecording) {
8791
subCurrentSegment += `${text}`;
8892
} else {
89-
subSegments.push([false, text]);
93+
if (text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n") {
94+
subSegments.push([false, text]);
95+
} else {
96+
subSegments.push([true, text]);
97+
}
9098
}
9199
});
92100

@@ -132,7 +140,7 @@ async function recursivelyTranslate(
132140
subTranslated += segment[1];
133141
}
134142
}
135-
console.log(`Done translating all segments.`);
143+
console.log(`Completed chunk translation, continuing...`);
136144
resolve();
137145
});
138146

@@ -232,7 +240,7 @@ async function recursivelyTranslate(
232240
}
233241
}
234242
console.log(`Done translating all segments.`);
235-
resolve()
243+
resolve();
236244
});
237245

238246
parser.on("error", reject);
@@ -247,69 +255,17 @@ async function recursivelyTranslate(
247255
}
248256

249257
async function translateChunk(chunk: string): Promise<string> {
250-
// console.log("translating chunk: " + chunk);
251-
// Create a SAX parser in strict mode for cleaning up translations.
252-
const clean = (sax as any).createStream(true, { trim: false });
253-
254-
// SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
255-
let currDepth = -1;
256-
257-
clean.on("text", text => {
258-
if (currDepth >= 1) {
259-
translatedChunk += escapeXML(text);
260-
}
261-
});
262-
263-
clean.on("opentag", node => {
264-
currDepth++;
265-
if (node.name != "WRAPPER") {
266-
translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
267-
}
268-
});
269-
270-
clean.on("closetag", tagName => {
271-
if (tagName != "WRAPPER") {
272-
translatedChunk += `</${tagName}>`;
273-
}
274-
currDepth--;
275-
});
276-
277-
clean.on("cdata", cdata => {
278-
translatedChunk += `<![CDATA[${cdata}]]>`;
279-
});
280-
281-
clean.on("comment", comment => {
282-
translatedChunk += `<!-- ${comment} -->`;
283-
});
284-
285-
clean.on("error", error => {
286-
console.log(
287-
"error encountered when validating XML: " +
288-
error +
289-
"\nvalidating section: " +
290-
chunk.substring(0, 100) +
291-
"..."
292-
);
293-
294-
// Attempt to recover using the internal parser
295-
try {
296-
clean._parser.resume();
297-
} catch (e) {
298-
console.log("Failed to resume parser:", e);
299-
}
300-
});
301-
302258
let translatedChunk = "";
303259

304260
try {
305261
await ai.beta.threads.messages.create(thread.id, {
306262
role: "user",
307263
content: `Translate this content to ${language}.
308-
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
309-
If a term exists in the reference file, use that translation without deviation.
310-
Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
311-
Content to translate:
312-
${chunk}`
264+
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
265+
If a term exists in the reference file, use that translation without deviation.
266+
Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
267+
Content to translate:
268+
${chunk}`
313269
});
314270
const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
315271
assistant_id: assistant_id
@@ -328,14 +284,65 @@ async function recursivelyTranslate(
328284
}
329285

330286
const text = messageContent.text;
331-
// console.log(text.value);
332287

333288
const safeText = escapeXML(text.value);
334289
const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");
335290

336291
await new Promise<void>((resolve, reject) => {
292+
// Create a SAX parser in strict mode for cleaning up translations.
293+
const clean = (sax as any).createStream(true, { trim: false });
294+
295+
// SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
296+
let currDepth = -1;
297+
298+
clean.on("text", text => {
299+
if (currDepth >= 1) {
300+
translatedChunk += escapeXML(text);
301+
}
302+
});
303+
304+
clean.on("opentag", node => {
305+
currDepth++;
306+
if (node.name != "WRAPPER") {
307+
translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
308+
}
309+
});
310+
311+
clean.on("closetag", tagName => {
312+
if (tagName != "WRAPPER") {
313+
translatedChunk += `</${tagName}>`;
314+
}
315+
currDepth--;
316+
});
317+
318+
clean.on("cdata", cdata => {
319+
translatedChunk += `<![CDATA[${cdata}]]>`;
320+
});
321+
322+
clean.on("comment", comment => {
323+
translatedChunk += `<!-- ${comment} -->`;
324+
});
325+
326+
clean.on("error", error => {
327+
console.log(
328+
"error encountered when validating XML: " +
329+
error +
330+
"\nvalidating section: " +
331+
chunk.substring(0, 100) +
332+
"..."
333+
);
334+
335+
// Attempt to recover using the internal parser
336+
try {
337+
clean._parser.resume();
338+
} catch (e) {
339+
console.log("Failed to resume parser:", e);
340+
reject;
341+
}
342+
});
343+
337344
clean.once("end", resolve);
338-
clean.once("error", reject);
345+
339346
textStream.pipe(clean);
340347
});
341348

i18n/controllers/translate.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,6 @@ async function translate(language: string, filePath: string) {
188188
await ai.beta.threads.messages.create(thread.id, {
189189
role: "user",
190190
content: `Translate this content to ${language}.
191-
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
192-
If a term exists in the reference file, use that translation without deviation.
193-
Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
194-
Content to translate:
195191
${chunk}`
196192
});
197193
const run = await ai.beta.threads.runs.createAndPoll(thread.id, {

i18n/index.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { getSource } from "./controllers/gitComm.ts";
22
import PathGenerator from "./controllers/path.ts";
3-
import translate from "./controllers/translate.ts";
3+
import translate from "./controllers/recurTranslate.ts";
44

55
export default async function fancyName(path: string) {
66
const startTime = new Date().getTime();
@@ -16,11 +16,11 @@ export default async function fancyName(path: string) {
1616

1717

1818
await Promise.all([
19-
// fancyName("2")
19+
fancyName("2")
2020
// fancyName("1.1"),
2121
// fancyName("1.1.2"),
2222
// fancyName("1.1.3"),
23-
fancyName("1.1.4"),
23+
// fancyName("1.1.4"),
2424
// fancyName("1.1.5"),
2525
// fancyName("1.1.6"),
2626
// fancyName("1.1.7"),

0 commit comments

Comments
 (0)