@@ -6,6 +6,7 @@ import dotenv from "dotenv";
6
6
import sax from "sax" ;
7
7
import { Readable } from "stream" ;
8
8
import { fileURLToPath } from "url" ;
9
+ import { strict } from "assert" ;
9
10
10
11
dotenv . config ( ) ;
11
12
@@ -21,13 +22,15 @@ const ai = new OpenAI({
21
22
22
23
const MAXLEN = 5000 ;
23
24
25
/**
 * Builds a strict-mode SAX stream parser with the settings shared by every
 * parser in this file.
 *
 * `sax.createStream` takes exactly two arguments, `(strict, options)`; any
 * further argument is ignored. `strictEntities` therefore has to live in the
 * same options object as `trim`, otherwise it never takes effect.
 *
 * - `trim: false` keeps whitespace in text nodes untouched.
 * - `strictEntities: true` restricts entity decoding to the five predefined
 *   XML entities (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`).
 *
 * The `as any` cast is kept from the original call sites.
 */
const createParser = () =>
  (sax as any).createStream(true, { trim: false, strictEntities: true });
26
+
24
27
async function translate ( language : string , filePath : string ) : Promise < void > {
25
28
try {
26
29
// Pipe the XML file into the parser.
27
30
const input_dir = fileURLToPath (
28
31
import . meta. resolve ( "../../xml" + filePath )
29
32
) ;
30
- console . log ( "Translating file: " + input_dir ) ;
33
+
31
34
const translated : string = await recursivelyTranslate ( language , input_dir ) ;
32
35
33
36
const output_path = fileURLToPath (
@@ -52,19 +55,15 @@ async function recursivelyTranslate(
52
55
) : Promise < string > {
53
56
// Recursive function to split and translate
54
57
async function helper ( ori : string , force : boolean ) : Promise < string > {
55
- ori = escapeXML ( ori ) ;
56
-
57
58
if ( ori . length < MAXLEN && ! force ) {
58
- console . log ( "Translating chunk: " + ori . substring ( 0 , 50 ) + "..." ) ;
59
59
return await translateChunk ( ori ) ; // translate the chunk
60
60
}
61
61
62
- console . log ( "Chunk too large, splitting..." ) ;
63
62
let subTranslated = "" ;
64
63
// continue splitting the chunk
65
64
// Create a SAX parser in strict mode to split source into chunks.
66
65
await new Promise < void > ( ( resolve , reject ) => {
67
- const subParser = ( sax as any ) . createStream ( true , { trim : false } ) ;
66
+ const subParser = createParser ( ) ;
68
67
69
68
let subCurrentDepth = 0 ;
70
69
let subCurrentSegment = "" ;
@@ -87,12 +86,22 @@ async function recursivelyTranslate(
87
86
} ) ;
88
87
89
88
subParser . on ( "text" , text => {
89
+ text = strongEscapeXML ( text ) ;
90
90
if ( subIsRecording ) {
91
- subCurrentSegment += ` ${ text } ` ;
91
+ subCurrentSegment += text ;
92
92
} else {
93
- if ( subSegments . length > 0 && subSegments [ subSegments . length - 1 ] [ 1 ] != undefined ) {
93
+ if (
94
+ subSegments . length > 0 &&
95
+ subSegments [ subSegments . length - 1 ] [ 1 ] != undefined
96
+ ) {
94
97
subSegments [ subSegments . length - 1 ] [ 1 ] += text ;
95
98
subSegments [ subSegments . length - 1 ] [ 0 ] = true ;
99
+
100
+ // if (text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n") {
101
+ // subSegments.push([false, text]);
102
+ // } else {
103
+ // subSegments.push([true, text]);
104
+ // }
96
105
} else {
97
106
subSegments . push ( [ true , text ] ) ;
98
107
}
@@ -141,7 +150,6 @@ async function recursivelyTranslate(
141
150
subTranslated += segment [ 1 ] ;
142
151
}
143
152
}
144
- console . log ( `Completed chunk translation, continuing...` ) ;
145
153
resolve ( ) ;
146
154
} ) ;
147
155
@@ -154,7 +162,7 @@ async function recursivelyTranslate(
154
162
}
155
163
156
164
// Create a SAX parser in strict mode to split source into chunks.
157
- const parser = ( sax as any ) . createStream ( true , { trim : false } ) ;
165
+ const parser = createParser ( ) ;
158
166
159
167
// const assistant = await createAssistant(language, ai);
160
168
const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r" ;
@@ -191,8 +199,9 @@ async function recursivelyTranslate(
191
199
} ) ;
192
200
193
201
parser . on ( "text" , text => {
202
+ text = strongEscapeXML ( text ) ;
194
203
if ( isRecording ) {
195
- currentSegment += ` ${ text } ` ;
204
+ currentSegment += text ;
196
205
} else {
197
206
segments . push ( [ false , text ] ) ;
198
207
}
@@ -287,18 +296,19 @@ async function recursivelyTranslate(
287
296
const text = messageContent . text ;
288
297
289
298
const safeText = escapeXML ( text . value ) ;
299
+ console . log ( safeText ) ;
290
300
const textStream = Readable . from ( "<WRAPPER>" + safeText + "</WRAPPER>" ) ;
291
301
292
302
await new Promise < void > ( ( resolve , reject ) => {
293
303
// Create a SAX parser in strict mode for cleaning up translations.
294
- const clean = ( sax as any ) . createStream ( true , { trim : false } ) ;
304
+ const clean = createParser ( ) ;
295
305
296
306
// SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
297
307
let currDepth = - 1 ;
298
308
299
309
clean . on ( "text" , text => {
300
310
if ( currDepth >= 1 ) {
301
- translatedChunk += escapeXML ( text ) ;
311
+ translatedChunk += strongEscapeXML ( text ) ;
302
312
}
303
313
} ) ;
304
314
@@ -368,3 +378,12 @@ function formatAttributes(attrs) {
368
378
/**
 * Escapes bare ampersands so the string can be embedded in XML.
 *
 * An `&` is rewritten to `&amp;` unless it already starts one of the five
 * predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`), so
 * already-escaped input is not double-escaped. Angle brackets and quotes are
 * deliberately left alone, which keeps any markup in the string intact.
 *
 * @param str Text that may contain unescaped ampersands.
 * @returns The text with every stray `&` replaced by `&amp;`.
 */
function escapeXML(str: string): string {
  // Negative lookahead: skip ampersands that already begin a known entity.
  return str.replace(/&(?!(?:amp;|lt;|gt;|apos;|quot;))/g, "&amp;");
}
381
+
382
/**
 * Fully escapes a text node for safe embedding in XML.
 *
 * Unlike an ampersand-only escape, this also converts `<`, `>`, `"` and `'`
 * to their predefined entities, so the result can never be read as markup.
 * Intended for text content only, not for strings that contain tags which
 * must survive.
 *
 * @param str Raw text content.
 * @returns The text with all five XML special characters escaped.
 */
function strongEscapeXML(str: string): string {
  return (
    str
      // Ampersands first, so the entities produced below are not re-escaped;
      // ampersands that already begin a predefined entity are left alone.
      .replace(/&(?!(?:amp;|lt;|gt;|apos;|quot;))/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;")
  );
}
0 commit comments