@@ -19,15 +19,15 @@ const ai = new OpenAI({
19
19
baseURL : process . env . AI_BASEURL
20
20
} ) ;
21
21
22
- const MAXLEN = 2000 ;
22
+ const MAXLEN = 5000 ;
23
23
24
24
async function translate ( language : string , filePath : string ) : Promise < void > {
25
25
try {
26
26
// Pipe the XML file into the parser.
27
27
const input_dir = fileURLToPath (
28
28
import . meta. resolve ( "../../xml" + filePath )
29
29
) ;
30
- console . log ( input_dir ) ;
30
+ console . log ( "Translating file: " + input_dir ) ;
31
31
const translated : string = await recursivelyTranslate ( language , input_dir ) ;
32
32
33
33
const output_path = fileURLToPath (
@@ -52,10 +52,14 @@ async function recursivelyTranslate(
52
52
) : Promise < string > {
53
53
// Recursive function to split and translate
54
54
async function helper ( ori : string , force : boolean ) : Promise < string > {
55
+ ori = escapeXML ( ori ) ;
56
+
55
57
if ( ori . length < MAXLEN && ! force ) {
58
+ console . log ( "Translating chunk: " + ori . substring ( 0 , 50 ) + "..." ) ;
56
59
return await translateChunk ( ori ) ; // translate the chunk
57
60
}
58
61
62
+ console . log ( "Chunk too large, splitting..." ) ;
59
63
let subTranslated = "" ;
60
64
// continue splitting the chunk
61
65
// Create a SAX parser in strict mode to split source into chunks.
@@ -86,7 +90,11 @@ async function recursivelyTranslate(
86
90
if ( subIsRecording ) {
87
91
subCurrentSegment += `${ text } ` ;
88
92
} else {
89
- subSegments . push ( [ false , text ] ) ;
93
+ if ( text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n" ) {
94
+ subSegments . push ( [ false , text ] ) ;
95
+ } else {
96
+ subSegments . push ( [ true , text ] ) ;
97
+ }
90
98
}
91
99
} ) ;
92
100
@@ -132,7 +140,7 @@ async function recursivelyTranslate(
132
140
subTranslated += segment [ 1 ] ;
133
141
}
134
142
}
135
- console . log ( `Done translating all segments .` ) ;
143
+ console . log ( `Completed chunk translation, continuing.. .` ) ;
136
144
resolve ( ) ;
137
145
} ) ;
138
146
@@ -232,7 +240,7 @@ async function recursivelyTranslate(
232
240
}
233
241
}
234
242
console . log ( `Done translating all segments.` ) ;
235
- resolve ( )
243
+ resolve ( ) ;
236
244
} ) ;
237
245
238
246
parser . on ( "error" , reject ) ;
@@ -247,69 +255,17 @@ async function recursivelyTranslate(
247
255
}
248
256
249
257
async function translateChunk ( chunk : string ) : Promise < string > {
250
- // console.log("translating chunk: " + chunk);
251
- // Create a SAX parser in strict mode for cleaning up translations.
252
- const clean = ( sax as any ) . createStream ( true , { trim : false } ) ;
253
-
254
- // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
255
- let currDepth = - 1 ;
256
-
257
- clean . on ( "text" , text => {
258
- if ( currDepth >= 1 ) {
259
- translatedChunk += escapeXML ( text ) ;
260
- }
261
- } ) ;
262
-
263
- clean . on ( "opentag" , node => {
264
- currDepth ++ ;
265
- if ( node . name != "WRAPPER" ) {
266
- translatedChunk += `<${ node . name } ${ formatAttributes ( node . attributes ) } >` ;
267
- }
268
- } ) ;
269
-
270
- clean . on ( "closetag" , tagName => {
271
- if ( tagName != "WRAPPER" ) {
272
- translatedChunk += `</${ tagName } >` ;
273
- }
274
- currDepth -- ;
275
- } ) ;
276
-
277
- clean . on ( "cdata" , cdata => {
278
- translatedChunk += `<![CDATA[${ cdata } ]]>` ;
279
- } ) ;
280
-
281
- clean . on ( "comment" , comment => {
282
- translatedChunk += `<!-- ${ comment } -->` ;
283
- } ) ;
284
-
285
- clean . on ( "error" , error => {
286
- console . log (
287
- "error encountered when validating XML: " +
288
- error +
289
- "\nvalidating section: " +
290
- chunk . substring ( 0 , 100 ) +
291
- "..."
292
- ) ;
293
-
294
- // Attempt to recover using the internal parser
295
- try {
296
- clean . _parser . resume ( ) ;
297
- } catch ( e ) {
298
- console . log ( "Failed to resume parser:" , e ) ;
299
- }
300
- } ) ;
301
-
302
258
let translatedChunk = "" ;
303
259
304
260
try {
305
261
await ai . beta . threads . messages . create ( thread . id , {
306
262
role : "user" ,
307
263
content : `Translate this content to ${ language } .
308
- IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
309
- If a term exists in the reference file, use that translation without deviation.
310
- Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
311
- Content to translate:
312
- ${ chunk } `
264
+ IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
265
+ If a term exists in the reference file, use that translation without deviation.
266
+ Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
267
+ Content to translate:
268
+ ${ chunk } `
313
269
} ) ;
314
270
const run = await ai . beta . threads . runs . createAndPoll ( thread . id , {
315
271
assistant_id : assistant_id
@@ -328,14 +284,65 @@ async function recursivelyTranslate(
328
284
}
329
285
330
286
const text = messageContent . text ;
331
- // console.log(text.value);
332
287
333
288
const safeText = escapeXML ( text . value ) ;
334
289
const textStream = Readable . from ( "<WRAPPER>" + safeText + "</WRAPPER>" ) ;
335
290
336
291
await new Promise < void > ( ( resolve , reject ) => {
292
+ // Create a SAX parser in strict mode for cleaning up translations.
293
+ const clean = ( sax as any ) . createStream ( true , { trim : false } ) ;
294
+
295
+ // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
296
+ let currDepth = - 1 ;
297
+
298
+ clean . on ( "text" , text => {
299
+ if ( currDepth >= 1 ) {
300
+ translatedChunk += escapeXML ( text ) ;
301
+ }
302
+ } ) ;
303
+
304
+ clean . on ( "opentag" , node => {
305
+ currDepth ++ ;
306
+ if ( node . name != "WRAPPER" ) {
307
+ translatedChunk += `<${ node . name } ${ formatAttributes ( node . attributes ) } >` ;
308
+ }
309
+ } ) ;
310
+
311
+ clean . on ( "closetag" , tagName => {
312
+ if ( tagName != "WRAPPER" ) {
313
+ translatedChunk += `</${ tagName } >` ;
314
+ }
315
+ currDepth -- ;
316
+ } ) ;
317
+
318
+ clean . on ( "cdata" , cdata => {
319
+ translatedChunk += `<![CDATA[${ cdata } ]]>` ;
320
+ } ) ;
321
+
322
+ clean . on ( "comment" , comment => {
323
+ translatedChunk += `<!-- ${ comment } -->` ;
324
+ } ) ;
325
+
326
// Log XML validation errors raised by the cleanup parser and try to keep
// parsing; the enclosing promise is rejected only when the parser cannot
// be resumed.
clean.on("error", error => {
  console.log(
    "error encountered when validating XML: " +
      error +
      "\nvalidating section: " +
      chunk.substring(0, 100) +
      "..."
  );

  // Attempt to recover using the internal parser
  try {
    clean._parser.resume();
  } catch (e) {
    console.log("Failed to resume parser:", e);
    // `reject` must actually be called: a bare `reject;` expression is a
    // no-op, so the failure would never reach the awaiting caller.
    reject(e);
  }
});
343
+
337
344
clean . once ( "end" , resolve ) ;
338
- clean . once ( "error" , reject ) ;
345
+
339
346
textStream . pipe ( clean ) ;
340
347
} ) ;
341
348
0 commit comments