|
| 1 | +import fs from "fs"; |
| 2 | +import OpenAI from "openai"; |
| 3 | +import path from "path"; |
| 4 | +import createAssistant from "../initializers/initialize"; |
| 5 | +import dotenv from "dotenv"; |
| 6 | +import sax from "sax"; |
| 7 | +import { Readable } from "stream"; |
| 8 | +import { fileURLToPath } from "url"; |
| 9 | + |
// Populate process.env from a local .env file before any setting is read.
dotenv.config();

// Refuse to load the module when either required setting is absent.
if ([process.env.AI_MODEL, process.env.API_KEY].includes(undefined)) {
  throw Error("Please specify AI_MODEL and API_KEY!");
}

/** OpenAI client shared by every request issued from this module. */
const ai = new OpenAI({
  baseURL: process.env.AI_BASEURL,
  apiKey: process.env.API_KEY
});

/**
 * Size threshold, in string length units: a segment shorter than this is sent
 * for translation as a single chunk, anything else is split further first.
 */
const MAXLEN = 2000;
| 23 | + |
/**
 * Translates one XML source file and writes the result to the same relative
 * location under the `xml_cn` directory.
 *
 * @param language - Target language, passed through to `recursivelyTranslate`.
 * @param filePath - Location of the file relative to the `xml` directory. It is
 *   concatenated directly onto the directory name, so it presumably has to
 *   start with "/" — confirm against callers.
 * @returns Resolves when the output has been written. Failures are logged
 *   rather than thrown.
 */
async function translate(language: string, filePath: string): Promise<void> {
  try {
    const inputPath = fileURLToPath(
      import.meta.resolve("../../xml" + filePath)
    );
    console.log(inputPath);
    const translated = await recursivelyTranslate(language, inputPath);

    const outputPath = fileURLToPath(
      import.meta.resolve("../../xml_cn" + filePath)
    );

    // Create any missing parent directories before writing the result.
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    fs.writeFileSync(outputPath, translated);
    console.log(`Translation saved to ${outputPath}`);
  } catch (err: unknown) {
    // XML parse errors are handled inside recursivelyTranslate; what arrives
    // here is path resolution, API set-up or file-system failure, so the
    // message names the file instead of claiming a parse error.
    console.error(`Error translating ${filePath}:`, err);
  }
}
| 47 | + |
/**
 * Translates the XML document stored at `path` into `language`, piece by piece.
 *
 * The document is split with a strict SAX parser: every direct child of the
 * root element becomes a segment that is sent for translation, while the root
 * tags and anything else outside those children are copied through unchanged.
 * A segment whose serialized length is `MAXLEN` or more is split again in the
 * same way (its own direct children become the new segments) until the pieces
 * are short enough for one request.
 *
 * All requests are posted to a single assistant thread created at the start of
 * the call.
 *
 * NOTE: when an oversized segment is split, text sitting directly inside it
 * (outside its child elements) is copied through untranslated.
 *
 * @param language - Target language, interpolated into the prompt.
 * @param path - File-system path of the XML file to read. Shadows the `path`
 *   module inside this function.
 * @returns The translated document. If splitting fails, whatever was translated
 *   up to that point followed by `<!-- Error parsing this section -->`.
 */
async function recursivelyTranslate(
  language: string,
  path: string
): Promise<string> {
  // const assistant = await createAssistant(language, ai);
  const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
  const thread = await ai.beta.threads.create();
  let translated = "";

  /**
   * Re-escapes text reported by the SAX parser. The parser delivers text with
   * entities already decoded, so markup characters must be escaped again
   * before the text is written back into XML. `>` is included so a literal
   * `]]>` in text stays well-formed.
   */
  function escapeText(text: string): string {
    return text
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");
  }

  /**
   * Parses `source` and returns its pieces in document order. Each entry is
   * `[true, xml]` for a complete depth-2 element (to be translated) or
   * `[false, xml]` for markup to copy through as-is.
   *
   * Depths: 0 = before any element, 1 = the root element, 2 = each direct
   * child of the root.
   */
  function splitIntoSegments(source: Readable): Promise<[boolean, string][]> {
    return new Promise<[boolean, string][]>((resolve, reject) => {
      // Strict mode, whitespace preserved.
      const splitter = (sax as any).createStream(true, { trim: false });

      const segments: [boolean, string][] = [];
      let depth = 0;
      let current = "";
      let recording = false;

      // Append to the segment being recorded, or pass the markup through.
      const emit = (xml: string): void => {
        if (recording) {
          current += xml;
        } else {
          segments.push([false, xml]);
        }
      };

      splitter.on(
        "opentag",
        (node: { name: string; attributes: Record<string, string> }) => {
          depth++;
          // Entering depth 2 starts a new segment.
          if (depth === 2) {
            recording = true;
          }
          emit(`<${node.name}${formatAttributes(node.attributes)}>`);
        }
      );

      splitter.on("text", (text: string) => {
        emit(escapeText(text));
      });

      // CDATA outside a segment is passed through instead of being dropped.
      splitter.on("cdata", (cdata: string) => {
        emit(`<![CDATA[${cdata}]]>`);
      });

      splitter.on("closetag", (tagName: string) => {
        emit(`</${tagName}>`);

        if (depth === 2) {
          // The segment element is complete.
          segments.push([true, current]);
          current = "";
          recording = false;
        }

        depth--;
      });

      // The comment text already carries its own surrounding whitespace, so it
      // is written back verbatim rather than padded again on every pass.
      splitter.on("comment", (comment: string) => {
        emit(`<!--${comment}-->`);
      });

      // Only the split result is delivered here; translation happens in the
      // caller so that its failures propagate through normal await/try-catch.
      splitter.on("end", () => resolve(segments));
      splitter.on("error", reject);
      // A failing source (e.g. a missing file) must reject too.
      source.on("error", reject);

      source.pipe(splitter);
    });
  }

  /**
   * Translates one serialized element: directly when it is shorter than
   * `MAXLEN` (and `force` is false), otherwise by splitting it and recursing
   * into each of its segments.
   */
  async function helper(ori: string, force: boolean): Promise<string> {
    if (ori.length < MAXLEN && !force) {
      return await translateChunk(ori); // translate the chunk
    }

    // Too long for one request: split one level deeper.
    const subSegments = await splitIntoSegments(Readable.from(ori));
    let subTranslated = "";
    for (const [shouldTranslate, xml] of subSegments) {
      subTranslated += shouldTranslate ? await helper(xml, false) : xml;
    }
    console.log(`Done translating all segments.`);
    return subTranslated;
  }

  /**
   * Sends `chunk` to the assistant and returns the reply with any text the
   * model added outside the chunk's own element removed. On failure, returns
   * whatever was cleaned so far followed by an error marker comment.
   */
  async function translateChunk(chunk: string): Promise<string> {
    // Strict SAX parser used to validate and clean up the model's reply.
    const clean = (sax as any).createStream(true, { trim: false });
    let translatedChunk = "";

    // The reply is wrapped in <WRAPPER>, which sits at depth 0; text is only
    // kept once we are inside the chunk's own element (depth >= 1).
    let currDepth = -1;

    clean.on("text", (text: string) => {
      if (currDepth >= 1) {
        translatedChunk += escapeText(text);
      }
    });

    clean.on(
      "opentag",
      (node: { name: string; attributes: Record<string, string> }) => {
        currDepth++;
        if (node.name !== "WRAPPER") {
          translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
        }
      }
    );

    clean.on("closetag", (tagName: string) => {
      if (tagName !== "WRAPPER") {
        translatedChunk += `</${tagName}>`;
      }
      currDepth--;
    });

    clean.on("cdata", (cdata: string) => {
      translatedChunk += `<![CDATA[${cdata}]]>`;
    });

    clean.on("comment", (comment: string) => {
      translatedChunk += `<!--${comment}-->`;
    });

    clean.on("error", (error: unknown) => {
      console.log(
        "error encountered when validating XML: " +
          error +
          "\nvalidating section: " +
          chunk.substring(0, 100) +
          "..."
      );

      // Attempt to recover using the internal parser
      try {
        clean._parser.resume();
      } catch (e) {
        console.log("Failed to resume parser:", e);
      }
    });

    try {
      await ai.beta.threads.messages.create(thread.id, {
        role: "user",
        content: `Translate this content to ${language}.
 IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
 If a term exists in the reference file, use that translation without deviation.
 Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
 Content to translate:
 ${chunk}`
      });
      const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
        assistant_id: assistant_id
      });

      const messages = await ai.beta.threads.messages.list(thread.id, {
        run_id: run.id
      });
      const message = messages.data.pop();
      const messageContent = message?.content[0];

      if (messageContent === undefined) {
        throw new Error("Assistant run returned no message content");
      }
      if (messageContent.type !== "text") {
        throw new Error(
          `Unexpected message content type: ${messageContent.type}`
        );
      }

      // Escape stray ampersands so the reply can be parsed in strict mode.
      const safeText = escapeXML(messageContent.text.value);
      const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");

      await new Promise<void>((resolve, reject) => {
        clean.once("end", resolve);
        clean.once("error", reject);
        textStream.pipe(clean);
      });

      return translatedChunk;
    } catch (err) {
      console.log(`Error occured while translating ${path}:\n ` + err);
      // Return this chunk's own partial output. Returning the document-level
      // accumulator here would duplicate everything translated so far.
      return translatedChunk + "<!-- Error translating this section -->";
    }
  }

  try {
    console.log("Translating " + path + " at " + thread.id);

    const segments = await splitIntoSegments(fs.createReadStream(path));
    for (const [shouldTranslate, xml] of segments) {
      translated += shouldTranslate ? await helper(xml, false) : xml;
    }
    console.log(`Done translating all segments.`);

    return translated;
  } catch (parseErr) {
    console.error("Error parsing XML:", parseErr);
    return translated + "<!-- Error parsing this section -->";
  }
}
| 349 | + |
| 350 | +export default translate; |
| 351 | + |
/**
 * Serializes an attribute map into the text that follows a tag name.
 *
 * Values are escaped (`&`, `<`, `>`, `"`) because the SAX parser reports
 * attribute values with entities decoded; writing them back raw would produce
 * malformed XML whenever a value contains one of those characters.
 *
 * @param attrs - Attribute names mapped to their values.
 * @returns `""` when there are no attributes, otherwise the `name="value"`
 *   pairs joined by single spaces and preceded by one leading space.
 */
function formatAttributes(attrs: Record<string, unknown>): string {
  const attrStr = Object.entries(attrs)
    .map(([key, val]) => {
      // `&` must be replaced first so the entities added below stay intact.
      const escaped = String(val)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;");
      return `${key}="${escaped}"`;
    })
    .join(" ");
  return attrStr ? " " + attrStr : "";
}
| 359 | + |
/**
 * Escapes every bare `&` in `str` as `&amp;` so the text can be fed to a
 * strict XML parser. Ampersands that already start one of the five predefined
 * XML entities (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`) or a numeric
 * character reference (`&#169;`, `&#xA9;`) are left untouched.
 *
 * @param str - Text that may contain unescaped ampersands.
 * @returns The text with stray ampersands escaped.
 */
function escapeXML(str: string): string {
  return str.replace(
    /&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)/g,
    "&amp;"
  );
}
0 commit comments