Skip to content

Commit e82194a

Browse files
author
yihao03
committed
integrated i18n into sicp repository
1 parent 1bb8737 commit e82194a

File tree

7 files changed

+389
-2
lines changed

7 files changed

+389
-2
lines changed

i18n/controllers/gitComm.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { Octokit } from "octokit";
2+
import dotenv from "dotenv";
3+
import fs from "fs";
4+
import path from "path";
5+
// Load environment configuration through dotenv before reading process.env.
dotenv.config();

// Refuse to start unless both repository coordinates are configured.
const requiredRepoVars = [process.env.GITHUB_OWNER, process.env.GITHUB_REPO];
if (requiredRepoVars.some(value => value === undefined)) {
  throw Error("Please specify GITHUB_OWNER, GITHUB_REPO to pull EN XML from!");
}

// initialize GitHub API
const octokit = new Octokit();
13+
14+
/**
 * Fetches the raw contents of `filePath` from the repository named by
 * GITHUB_OWNER / GITHUB_REPO and mirrors it to `../../ori/<filePath>`
 * (relative to this module).
 *
 * Failures are logged rather than thrown (best-effort). In that case the
 * returned value is `undefined` at runtime despite the declared type, so
 * callers should check the result before using it.
 *
 * @param filePath - Path of the file inside the repository.
 * @returns The file contents as returned by the GitHub contents API.
 */
async function getSource(filePath: string): Promise<string> {
  let toTranslate;

  try {
    const result = await octokit.request(
      "GET /repos/{owner}/{repo}/contents/{path}",
      {
        owner: process.env.GITHUB_OWNER!,
        repo: process.env.GITHUB_REPO!,
        path: filePath,
        headers: {
          accept: "application/vnd.github.raw+json",
        },
      }
    );

    toTranslate = result.data;
    const output_dir = path.join(import.meta.dirname, "../../ori");
    const fullPath = path.resolve(path.join(output_dir, filePath));

    // Ensure directory exists
    fs.mkdirSync(path.dirname(fullPath), { recursive: true });
    fs.writeFileSync(fullPath, toTranslate);

    console.log(
      `Successfully retrieved ${filePath} from GitHub, retrieval status: ${result.status}`
    );
  } catch (error) {
    // Not every failure is an HTTP error: network and filesystem errors carry
    // no `response`, so each field is read defensively to keep the handler
    // itself from throwing and hiding the original problem.
    const err = error as
      | {
          status?: unknown;
          message?: unknown;
          response?: {
            headers?: Record<string, unknown>;
            data?: { message?: unknown };
          };
        }
      | null
      | undefined;
    const rateLimitRemaining = err?.response?.headers?.["x-ratelimit-remaining"];
    const message = err?.response?.data?.message ?? err?.message;

    console.log(
      `Error retrieving ${filePath} from GitHub.\n Status: ${err?.status}.\n Rate limit remaining: ${rateLimitRemaining}.\n Message: ${message}`
    );
  }

  return toTranslate as string;
}
51+
52+
/**
 * Placeholder for committing translated output; currently performs no work
 * and resolves immediately.
 */
async function commitTranslated(): Promise<void> {
  // Intentionally empty: not implemented yet.
}
53+
54+
export { getSource, commitTranslated };

i18n/controllers/path.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/**
 * Converts a dotted index such as "1.1.4" into an XML path relative to the
 * xml root.
 *
 * Up to three components are used, labelled chapter / section / subsection;
 * any further components are ignored. When fewer than three components are
 * given, the deepest one is repeated as the file name:
 *
 *   "1.1.4" -> "/chapter1/section1/subsection4.xml"
 *   "1.1"   -> "/chapter1/section1/section1.xml"
 *   "2"     -> "/chapter2/chapter2.xml"
 *
 * @param path - Dotted chapter/section/subsection index.
 * @returns The generated path, starting with "/" and ending in ".xml".
 */
export default function PathGenerator(path: string) {
  const levels = ["chapter", "section", "subsection"];
  const parts = path.split(".");

  const pieces = parts
    .slice(0, levels.length)
    .map((part, idx) => `/${levels[idx]}${part}`);

  // Chapters and sections are addressed as "<dir>/<same name>.xml", so the
  // deepest piece is emitted a second time to form the file name.
  if (parts.length < levels.length) {
    pieces.push(...pieces.slice(-1));
  }

  return pieces.join("") + ".xml";
}

i18n/controllers/translate.ts

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
import fs from "fs";
2+
import OpenAI from "openai";
3+
import path from "path";
4+
import createAssistant from "../initializers/initialize";
5+
import dotenv from "dotenv";
6+
import sax from "sax";
7+
import { Readable } from "stream";
8+
import { fileURLToPath } from "url";
9+
10+
// Load environment configuration through dotenv before reading process.env.
dotenv.config();

// Both the model name and the API key are mandatory; AI_BASEURL is optional.
if ([process.env.AI_MODEL, process.env.API_KEY].includes(undefined)) {
  throw Error("Please specify AI_MODEL and API_KEY!");
}

// initialize OpenAI API
const ai = new OpenAI({
  baseURL: process.env.AI_BASEURL,
  apiKey: process.env.API_KEY
});
21+
22+
// TODO: change the toTranslate to a file path, read the file and translate the content
23+
/**
 * Translates one XML source file into `language` and writes the result to
 * `../../xml/translations<filePath>`.
 *
 * The source (`../../xml<filePath>`) is streamed through a strict SAX parser.
 * Every direct child of the root element is collected as one segment and sent
 * to the assistant; the root tags and anything between segments are copied
 * through untranslated.
 *
 * The returned promise settles only after the translated file has been
 * written, so callers can reliably await completion. If the source cannot be
 * read or parsed, the error is logged and nothing is written. If a single
 * segment fails to translate, the original segment is kept in its place.
 *
 * @param language - Target language name inserted into the prompt.
 * @param filePath - Source path relative to `../../xml`, beginning with "/".
 */
async function translate(language: string, filePath: string): Promise<void> {
  type SaxTag = { name: string; attributes: Record<string, string> };

  // sax reports text nodes with entities already decoded, so "&" and "<" have
  // to be escaped again before the text is written back out as XML.
  const escapeText = (text: string): string =>
    text.replace(/&/g, "&amp;").replace(/</g, "&lt;");

  // Create a SAX parser in strict mode to split source into chunks.
  const parser = (sax as any).createStream(true, { trim: false });

  // const assistant = await createAssistant(language, ai);
  const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
  const thread = await ai.beta.threads.create();
  let translated = "";

  console.dir(thread);
  // Variables to track current depth and segments.
  let currentDepth = 0;
  let currentSegment = "";
  // Each entry is [needsTranslation, xmlText].
  const segments: [boolean, string][] = [];

  // In this context:
  // - Depth 0: Before any element is opened.
  // - Depth 1: The root element (<CHAPTER>).
  // - Depth 2: Each direct child of the root that we want to capture.
  let isRecording = false;

  parser.on("opentag", (node: SaxTag) => {
    currentDepth++;
    const tag = `<${node.name}${formatAttributes(node.attributes)}>`;

    // If we're at depth 2, this is the start of a new segment.
    if (currentDepth === 2 || isRecording) {
      isRecording = true;
      currentSegment += tag;
    } else {
      segments.push([false, tag]);
    }
  });

  parser.on("text", (text: string) => {
    const escaped = escapeText(text);
    if (isRecording) {
      currentSegment += escaped;
    } else {
      segments.push([false, escaped]);
    }
  });

  parser.on("cdata", (cdata: string) => {
    const section = `<![CDATA[${cdata}]]>`;
    if (isRecording) {
      currentSegment += section;
    } else {
      // Keep CDATA found outside a segment instead of silently dropping it.
      segments.push([false, section]);
    }
  });

  parser.on("closetag", (tagName: string) => {
    if (isRecording) {
      currentSegment += `</${tagName}>`;
    }

    if (currentDepth === 2) {
      // We are closing a segment element.
      segments.push([true, currentSegment]);
      currentSegment = "";
      isRecording = false;
    }

    if (currentDepth === 1) {
      // We are closing the root element.
      segments.push([false, `</${tagName}>`]);
    }

    currentDepth--;
  });

  parser.on("comment", (comment: string) => {
    if (isRecording) {
      currentSegment += `<!-- ${comment} -->`;
    } else {
      segments.push([false, `<!-- ${comment} -->`]);
    }
  });

  /**
   * Sends one segment to the assistant and returns the cleaned translation.
   * Falls back to the untranslated segment when the request or the clean-up
   * parse fails, so the output document stays well-formed.
   */
  async function translateChunk(chunk: string): Promise<string> {
    // console.log("translating chunk: " + chunk);
    // Create a SAX parser in strict mode for cleaning up translations.
    const clean = (sax as any).createStream(true, { trim: false });

    // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
    let currDepth = -1;
    let cleaned = "";

    clean.on("text", (text: string) => {
      // Depth 0 is directly inside <WRAPPER>: text there is not part of the
      // translated element and is discarded.
      if (currDepth >= 1) {
        cleaned += escapeText(text);
      }
    });

    clean.on("opentag", (node: SaxTag) => {
      currDepth++;
      if (node.name != "WRAPPER") {
        cleaned += `<${node.name}${formatAttributes(node.attributes)}>`;
      }
    });

    clean.on("closetag", (tagName: string) => {
      if (tagName != "WRAPPER") {
        cleaned += `</${tagName}>`;
      }
      currDepth--;
    });

    clean.on("cdata", (cdata: string) => {
      cleaned += `<![CDATA[${cdata}]]>`;
    });

    clean.on("comment", (comment: string) => {
      cleaned += `<!-- ${comment} -->`;
    });

    try {
      await ai.beta.threads.messages.create(thread.id, {
        role: "user",
        content: `Translate this content to ${language}.
IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
If a term exists in the reference file, use that translation without deviation.
Do not modify XML tags, content of XML tags and structure. Do not say anything else. Only translate the content and return the xml as is.
Content to translate:
${chunk}`
      });
      const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
        assistant_id: assistant_id
      });

      const messages = await ai.beta.threads.messages.list(thread.id, {
        run_id: run.id
      });
      const message = messages.data.pop()!;
      const messageContent = message.content[0];

      if (messageContent.type !== "text") {
        throw new Error(
          `Unexpected message content type: ${messageContent.type}`
        );
      }

      const text = messageContent.text;
      // console.log(text.value);

      const safeText = escapeXML(text.value);
      const textStream = Readable.from("<WRAPPER>" + safeText + "</WRAPPER>");

      await new Promise<void>((resolve, reject) => {
        clean.once("end", resolve);
        // `on`, not `once`: a malformed reply can raise several parse errors
        // within one chunk, and an 'error' event without a listener throws.
        clean.on("error", reject);
        textStream.pipe(clean);
      });

      return cleaned;
    } catch (err) {
      console.log(`Error occured while translating ${filePath}:\n ` + err);
      // Keep the untranslated segment rather than corrupting the output.
      return chunk;
    }
  }

  try {
    // Pipe the XML file into the parser and wait until it has been consumed.
    const input_dir = fileURLToPath(
      import.meta.resolve("../../xml" + filePath)
    );
    console.log(input_dir);

    await new Promise<void>((resolve, reject) => {
      const source = fs.createReadStream(input_dir);
      const fail = (err: unknown) => {
        source.destroy();
        reject(err);
      };

      parser.once("end", resolve);
      // Stream and parser failures are reported through 'error' events, not
      // synchronous exceptions, so they must be bridged into the promise.
      parser.on("error", fail);
      source.once("error", fail);
      source.pipe(parser);
    });
  } catch (parseErr) {
    console.error("Error parsing XML:", parseErr);
    return;
  }

  // Segments are translated one after another because they share one thread.
  for (const [needsTranslation, content] of segments) {
    translated += needsTranslation ? await translateChunk(content) : content;
  }
  console.log(`Done translating all segments.`);

  const output_path = fileURLToPath(
    import.meta.resolve("../../xml/translations" + filePath)
  );

  // Ensure directory exists
  fs.mkdirSync(path.dirname(output_path), { recursive: true });

  fs.writeFileSync(output_path, translated);
  console.log(`Translation saved to ${output_path}`);
}
215+
216+
export default translate;
217+
218+
/**
 * Serialises an attribute map into the text that follows a tag name.
 *
 * Values are escaped (`&`, `<`, `"`) so that an attribute whose parsed value
 * contains one of those characters still produces well-formed XML.
 *
 * @param attrs - Attribute names mapped to their (unescaped) values.
 * @returns `' key="value" ...'` with a leading space, or "" when there are no
 *   attributes.
 */
function formatAttributes(attrs: Record<string, string>): string {
  const attrStr = Object.entries(attrs)
    .map(([key, val]) => {
      const escaped = String(val)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/"/g, "&quot;");
      return `${key}="${escaped}"`;
    })
    .join(" ");
  return attrStr ? " " + attrStr : "";
}
225+
226+
/**
 * Escapes bare ampersands so the string can be parsed as XML.
 *
 * An `&` is left untouched when it already starts one of the five predefined
 * XML entities or a numeric character reference (`&#955;`, `&#x3BB;`); every
 * other `&` becomes `&amp;`.
 *
 * @param str - Text that may contain unescaped ampersands.
 * @returns The text with stray ampersands escaped.
 */
function escapeXML(str: string): string {
  return str.replace(
    /&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)/g,
    "&amp;"
  );
}

i18n/index.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import { getSource } from "./controllers/gitComm.ts";
2+
import PathGenerator from "./controllers/path.ts";
3+
import translate from "./controllers/translate.ts";
4+
5+
/**
 * Translates the file addressed by a dotted index (e.g. "1.1.4") into Chinese
 * and logs how long the call took.
 *
 * @param path - Dotted chapter/section/subsection index passed to
 *   PathGenerator.
 */
export default async function fancyName(path: string) {
  const startedAt = Date.now();
  const fullPath = PathGenerator(path);

  console.log(`Translating: ${fullPath}`);
  await translate("Chinese", fullPath);

  const seconds = (Date.now() - startedAt) / 1000;
  console.log(`${fullPath} took ${seconds} seconds`);
}
15+
16+
17+
18+
// Translation jobs started by this run; uncomment an entry to include it.
const pendingTranslations = [
  // fancyName("2")
  // fancyName("1.1"),
  // fancyName("1.1.2"),
  // fancyName("1.1.3"),
  fancyName("1.1.4"),
  // fancyName("1.1.5"),
  // fancyName("1.1.6"),
  // fancyName("1.1.7"),
  // fancyName("1.1.8"),
  // translate("Chinese", "1"),
];

// Wait for every started job before the module finishes loading.
await Promise.all(pendingTranslations);

i18n/initializers/initialize.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import fs from "fs";
2+
import OpenAI from "openai/index.mjs";
3+
4+
/**
 * Creates the "SICP Translator" assistant, uploads the reference files into a
 * new vector store and attaches that store to the assistant's file search.
 *
 * @param language - Target language. Currently unused by this function; kept
 *   for interface compatibility.
 * @param ai - OpenAI client used for every API call.
 * @param referenceFiles - Paths of the reference files to upload. Defaults to
 *   the single file the original implementation hard-coded, so existing
 *   callers behave as before.
 * @returns The assistant with the vector store attached.
 */
export default async function createAssistant(
  language: string,
  ai: OpenAI,
  referenceFiles: string[] = [
    "/home/yihao/projects/XML_translater/metadatas/try.txt"
  ]
) {
  const assistant = await ai.beta.assistants.create({
    name: "SICP Translator",
    instructions: `You are a professional translator with high technical skills in computer science.
You MUST adhere to the following rules strictly:
1. ALWAYS use the exact translations for technical terms found in the uploaded reference file.
2. If a term appears in the reference file, you MUST use the provided translation without exception.
3. Preserve all XML tags and structure exactly as given.
4. Do not add any additional information or explanatory notes.
5. When translating, search the reference file first for any technical terms before translating them.
6. Do not format your response using markdown or any other formatting.`,
    model: process.env.AI_MODEL as string,
    tools: [{ type: "file_search" }]
  });

  const fileStreams = referenceFiles.map(file => fs.createReadStream(file));

  // Create a vector store holding the reference files.
  const vectorStore = await ai.beta.vectorStores.create({
    name: "Translation instructions"
  });

  await ai.beta.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, {
    files: fileStreams
  });

  // The update call already returns the modified assistant, so no separate
  // retrieve round trip is needed.
  return ai.beta.assistants.update(assistant.id, {
    tool_resources: { file_search: { vector_store_ids: [vectorStore.id] } }
  });
}

i18n/package.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"type": "module",
3+
"contributors": [
4+
{
5+
"name": "yihao",
6+
"url": "https://github.com/yihao03/"
7+
},
8+
{
9+
"name": "Haodong",
10+
"url": "https://github.com/coder114514"
11+
}
12+
],
13+
"dependencies": {
14+
}
15+
}

0 commit comments

Comments
 (0)