import fs from "fs";
import OpenAI from "openai";
import path from "path";
// NOTE(review): imported but unused here (see the commented-out call in
// translate()) — presumably kept for when assistant creation is re-enabled.
import createAssistant from "../initializers/initialize";
import dotenv from "dotenv";
import sax from "sax";
import { Readable } from "stream";
import { fileURLToPath } from "url";

// Load .env before reading any environment variables below.
dotenv.config();

// Fail fast: this module is unusable without a model name and an API key.
if (process.env.AI_MODEL === undefined || process.env.API_KEY === undefined) {
  throw Error("Please specify AI_MODEL and API_KEY!");
}

// initialize OpenAI API
const ai = new OpenAI({
  apiKey: process.env.API_KEY,
  // AI_BASEURL is optional; when undefined the SDK's default endpoint is used.
  baseURL: process.env.AI_BASEURL
});
// TODO: change the toTranslate to a file path, read the file and translate the content
/**
 * Translates the XML file at `../../xml` + filePath into `language` and
 * writes the result to `../../xml/translations` + filePath.
 *
 * The file is streamed through a strict SAX parser and split into the root
 * element's direct children (depth-2 segments). Each segment is sent to an
 * OpenAI assistant thread for translation; everything outside those segments
 * (root tags, inter-element text, comments) is passed through untouched.
 *
 * NOTE(review): the promise returned by this function resolves as soon as the
 * read stream has been piped — translation and the file write happen later in
 * the parser's "end" handler, so callers cannot await completion. Also, no
 * "error" handler is registered on `parser`, so a malformed input file would
 * surface as an unhandled stream error. TODO confirm both are intended.
 */
async function translate(language: string, filePath: string) {
  // Create a SAX parser in strict mode to split source into chunks.
  const parser = (sax as any).createStream(true, { trim: false });

  // const assistant = await createAssistant(language, ai);
  // Hard-coded assistant id — presumably created once with the reference
  // glossary file attached. TODO confirm it exists for the configured key.
  const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
  // One conversation thread is reused for every chunk of this file.
  const thread = await ai.beta.threads.create();
  // Accumulates the final, fully stitched output document.
  let translated = "";

  console.dir(thread);
  // Variables to track current depth and segments.
  let currentDepth = 0;
  let currentSegment = "";
  // Each entry is [needsTranslation, xmlText]: `true` entries are depth-2
  // segments to send to the model, `false` entries are copied through as-is.
  const segments: [boolean, string][] = [];

  // In this context:
  // - Depth 0: Before any element is opened.
  // - Depth 1: The root element (<CHAPTER>).
  // - Depth 2: Each direct child of the root that we want to capture.
  let isRecording = false;

  parser.on("opentag", node => {
    currentDepth++;

    // If we're at depth 2, this is the start of a new segment.
    if (currentDepth === 2 || isRecording) {
      isRecording = true;
      currentSegment += `<${node.name}${formatAttributes(node.attributes)}>`;
    } else {
      // Depth 1 (the root's open tag): pass through untranslated.
      segments.push([
        false,
        `<${node.name}${formatAttributes(node.attributes)}>`
      ]);
    }
  });

  parser.on("text", text => {
    if (isRecording) {
      currentSegment += `${text}`;
    } else {
      // Inter-segment whitespace/text is preserved verbatim.
      segments.push([false, text]);
    }
  });

  parser.on("cdata", cdata => {
    // NOTE(review): CDATA outside a recorded segment is silently dropped —
    // confirm the source files never contain CDATA directly under the root.
    if (isRecording) {
      currentSegment += `<![CDATA[${cdata}]]>`;
    }
  });

  parser.on("closetag", tagName => {
    if (isRecording) {
      currentSegment += `</${tagName}>`;
    }

    if (currentDepth === 2) {
      // We are closing a segment element.
      segments.push([true, currentSegment]);
      currentSegment = "";
      isRecording = false;
    }

    if (currentDepth === 1) {
      // We are closing the root element.
      segments.push([false, `</${tagName}>`]);
    }

    currentDepth--;
  });

  parser.on("comment", comment => {
    if (isRecording) {
      currentSegment += `<!--${comment}-->`;
    } else {
      segments.push([false, `<!--${comment}-->`]);
    }
  });

  // Once the whole file has been tokenized, translate the flagged segments
  // in document order and write the reassembled result.
  parser.on("end", async () => {
    for (const segment of segments) {
      if (segment[0]) {
        // Sequential on purpose? All chunks share one assistant thread, so
        // parallel runs on the same thread would conflict — TODO confirm.
        translated += await translateChunk(segment[1]);
      } else {
        translated += segment[1];
      }
    }
    console.log(`Done translating all segments.`);
    const output_path = fileURLToPath(
      import.meta.resolve("../../xml/translations" + filePath)
    );

    // Ensure directory exists
    const dir = path.dirname(output_path);
    fs.mkdirSync(dir, { recursive: true });

    fs.writeFileSync(output_path, translated);
    console.log(`Translation saved to ${output_path}`);
  });

  try {
    // Pipe the XML file into the parser.
    const input_dir = fileURLToPath(
      import.meta.resolve("../../xml" + filePath)
    );
    console.log(input_dir);
    fs.createReadStream(input_dir).pipe(parser);
  } catch (parseErr) {
    // Only synchronous setup errors land here; asynchronous stream/parse
    // errors are NOT caught by this try/catch.
    console.error("Error parsing XML:", parseErr);
  }

  /**
   * Sends one XML segment to the assistant and returns the translated XML.
   *
   * The model's reply is wrapped in <WRAPPER> and re-parsed with a second SAX
   * stream so any chatter the LLM emits outside the XML tags is discarded.
   *
   * NOTE(review): on failure this logs and falls through, returning
   * `undefined` — the "end" handler above then appends the literal string
   * "undefined" to the output. Consider rethrowing or returning "".
   */
  async function translateChunk(chunk: string) {
    // console.log("translating chunk: " + chunk);
    // Create a SAX parser in strict mode for cleaning up translations.
    const clean = (sax as any).createStream(true, { trim: false });

    // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
    // Depth -1 = outside <WRAPPER>; 0 = directly inside it. Only text at
    // depth >= 1 (i.e. inside a real element) is kept.
    let currDepth = -1;

    clean.on("text", text => {
      if (currDepth >= 1) {
        translated += text;
      }
    });

    clean.on("opentag", node => {
      currDepth++;
      // The synthetic WRAPPER element is never emitted to the output.
      if (node.name != "WRAPPER") {
        translated += `<${node.name}${formatAttributes(node.attributes)}>`;
      }
    });

    clean.on("closetag", tagName => {
      if (tagName != "WRAPPER") {
        translated += `</${tagName}>`;
      }
      currDepth--;
    });

    clean.on("cdata", cdata => {
      translated += `<![CDATA[${cdata}]]>`;
    });

    clean.on("comment", comment => {
      translated += `<!--${comment}-->`;
    });

    // Per-chunk accumulator; deliberately shadows the outer `translated`.
    // The handlers above close over THIS binding, and they only run after
    // this declaration (during the pipe below), so there is no TDZ issue.
    let translated = "";

    try {
      await ai.beta.threads.messages.create(thread.id, {
        role: "user",
        content: `Translate this content to ${language}.
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
If a term exists in the reference file, use that translation without deviation.
Do not modify XML tags, content of XML tags and structure. Do not say anything else. Only translate the content and return the xml as is.
Content to translate:
${chunk}`
      });
      const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
        assistant_id: assistant_id
      });

      // Restrict the listing to messages produced by this run, i.e. the
      // assistant's reply to the message sent above.
      const messages = await ai.beta.threads.messages.list(thread.id, {
        run_id: run.id
      });
      const message = messages.data.pop()!;
      const messageContent = message.content[0];

      if (messageContent.type !== "text") {
        throw new Error(
          `Unexpected message content type: ${messageContent.type}`
        );
      }

      const text = messageContent.text;
      // console.log(text.value);

      // Escape stray ampersands so the strict parser doesn't choke, then
      // wrap the reply so it has a single root element for the cleanup pass.
      const safeText = escapeXML(text.value);
      const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");

      await new Promise<void>((resolve, reject) => {
        clean.once("end", resolve);
        clean.once("error", reject);
        textStream.pipe(clean);
      });

      return translated;
    } catch (err) {
      console.log(`Error occured while translating ${filePath}:\n ` + err);
    }
  }
}

export default translate;
+
218
+ // Helper function to format attributes into a string.
219
+ function formatAttributes ( attrs ) {
220
+ const attrStr = Object . entries ( attrs )
221
+ . map ( ( [ key , val ] ) => `${ key } ="${ val } "` )
222
+ . join ( " " ) ;
223
+ return attrStr ? " " + attrStr : "" ;
224
+ }
225
+
226
+ function escapeXML ( str : string ) : string {
227
+ return str . replace ( / & (? ! (?: a m p ; | l t ; | g t ; | a p o s ; | q u o t ; ) ) / g, "&" ) ;
228
+ }
0 commit comments