苦于工作中大量多语言翻译造成错误, 所以有了这个工具.
递归遍历工具所在目录的文件, 根据文件名中的语言国家信息, 把文件内容从 UTF-8 转换成其本地语言, 或者从本地语言转换成 UTF-8.
能在配置文件中设定什么格式的文件名对应什么格式的编码. 无法匹配的统一视为 UTF-8 编码.
multi-encoding from-utf8 -i <UTF8源文件路径> -o <转本地编码保存文件路径>
将文件以 UTF-8 编码读取, 以本地编码重新保存.
multi-encoding to-utf8 -i <本地编码文件路径> -o <转UTF8编码保存文件路径>
将文件以本地编码读取, 以 UTF-8 编码保存.
# Entries are matched in order: the earlier an expression appears, the higher its priority.
[[code_map]]
encoding_label = "windows-1251"
# Literal string: the pattern is taken verbatim, with no TOML escape processing.
# NOTE(review): whether the pattern must cover the whole file name or may match
# any part of it depends on the matcher — confirm against the consuming code.
name_regex = 'RU.*|ru.*'
# When several names map to the same encoding, prefer combining them into a single regular expression.
[[code_map]]
encoding_label = "windows-1252"
# Each line-ending backslash removes the newline that follows it, so the
# alternatives below are joined into one single-line pattern.
name_regex = """\
DE.*|de.*|EN.*|en.*|ES.*|es.*|FI.*|fi.*|\
FR.*|fr.*|IT.*|it.*|PT.*|pt.*|NL.*|nl.*|\
DA.*|da.*|SR.*|sr.*|SV.*|sv.*\
"""
- v0.1.0 实现基本的软件入口, 编码解码.
- v0.2.0 实现多文件和按目录深度遍历编码解码
- v0.3.0 ???
- v1.0.0 支持将文件拖动到应用上, 或直接点击应用后, 交互式选择编码/解码方式并运行; 通过足够多的实践来发现并解决 bug.
第二列的值可以设置为 `config.toml` 中 `encoding_label` 的值. 多个不同的标签会被解释为同一种编码.
见 rust_encoding 库的文档
| Name | Labels |
| --- | --- |
| **The Encoding** | |
| UTF-8 | "unicode-1-1-utf-8" |
| | "unicode11utf8" |
| | "unicode20utf8" |
| | "utf-8" |
| | "utf8" |
| | "x-unicode20utf8" |
| **Legacy single-byte encodings** | |
| IBM866 | "866" |
| | "cp866" |
| | "csibm866" |
| | "ibm866" |
| ISO-8859-2 | "csisolatin2" |
| | "iso-8859-2" |
| | "iso-ir-101" |
| | "iso8859-2" |
| | "iso88592" |
| | "iso_8859-2" |
| | "iso_8859-2:1987" |
| | "l2" |
| | "latin2" |
| ISO-8859-3 | "csisolatin3" |
| | "iso-8859-3" |
| | "iso-ir-109" |
| | "iso8859-3" |
| | "iso88593" |
| | "iso_8859-3" |
| | "iso_8859-3:1988" |
| | "l3" |
| | "latin3" |
| ISO-8859-4 | "csisolatin4" |
| | "iso-8859-4" |
| | "iso-ir-110" |
| | "iso8859-4" |
| | "iso88594" |
| | "iso_8859-4" |
| | "iso_8859-4:1988" |
| | "l4" |
| | "latin4" |
| ISO-8859-5 | "csisolatincyrillic" |
| | "cyrillic" |
| | "iso-8859-5" |
| | "iso-ir-144" |
| | "iso8859-5" |
| | "iso88595" |
| | "iso_8859-5" |
| | "iso_8859-5:1988" |
| ISO-8859-6 | "arabic" |
| | "asmo-708" |
| | "csiso88596e" |
| | "csiso88596i" |
| | "csisolatinarabic" |
| | "ecma-114" |
| | "iso-8859-6" |
| | "iso-8859-6-e" |
| | "iso-8859-6-i" |
| | "iso-ir-127" |
| | "iso8859-6" |
| | "iso88596" |
| | "iso_8859-6" |
| | "iso_8859-6:1987" |
| ISO-8859-7 | "csisolatingreek" |
| | "ecma-118" |
| | "elot_928" |
| | "greek" |
| | "greek8" |
| | "iso-8859-7" |
| | "iso-ir-126" |
| | "iso8859-7" |
| | "iso88597" |
| | "iso_8859-7" |
| | "iso_8859-7:1987" |
| | "sun_eu_greek" |
| ISO-8859-8 | "csiso88598e" |
| | "csisolatinhebrew" |
| | "hebrew" |
| | "iso-8859-8" |
| | "iso-8859-8-e" |
| | "iso-ir-138" |
| | "iso8859-8" |
| | "iso88598" |
| | "iso_8859-8" |
| | "iso_8859-8:1988" |
| | "visual" |
| ISO-8859-8-I | "csiso88598i" |
| | "iso-8859-8-i" |
| | "logical" |
| ISO-8859-10 | "csisolatin6" |
| | "iso-8859-10" |
| | "iso-ir-157" |
| | "iso8859-10" |
| | "iso885910" |
| | "l6" |
| | "latin6" |
| ISO-8859-13 | "iso-8859-13" |
| | "iso8859-13" |
| | "iso885913" |
| ISO-8859-14 | "iso-8859-14" |
| | "iso8859-14" |
| | "iso885914" |
| ISO-8859-15 | "csisolatin9" |
| | "iso-8859-15" |
| | "iso8859-15" |
| | "iso885915" |
| | "iso_8859-15" |
| | "l9" |
| ISO-8859-16 | "iso-8859-16" |
| KOI8-R | "cskoi8r" |
| | "koi" |
| | "koi8" |
| | "koi8-r" |
| | "koi8_r" |
| KOI8-U | "koi8-ru" |
| | "koi8-u" |
| macintosh | "csmacintosh" |
| | "mac" |
| | "macintosh" |
| | "x-mac-roman" |
| windows-874 | "dos-874" |
| | "iso-8859-11" |
| | "iso8859-11" |
| | "iso885911" |
| | "tis-620" |
| | "windows-874" |
| windows-1250 | "cp1250" |
| | "windows-1250" |
| | "x-cp1250" |
| windows-1251 | "cp1251" |
| | "windows-1251" |
| | "x-cp1251" |
| windows-1252 | |
| ^ See below for the relationship to historical "Latin1" and "ASCII" concepts. | |
| | "ansi_x3.4-1968" |
| | "ascii" |
| | "cp1252" |
| | "cp819" |
| | "csisolatin1" |
| | "ibm819" |
| | "iso-8859-1" |
| | "iso-ir-100" |
| | "iso8859-1" |
| | "iso88591" |
| | "iso_8859-1" |
| | "iso_8859-1:1987" |
| | "l1" |
| | "latin1" |
| | "us-ascii" |
| | "windows-1252" |
| | "x-cp1252" |
| windows-1253 | "cp1253" |
| | "windows-1253" |
| | "x-cp1253" |
| windows-1254 | "cp1254" |
| | "csisolatin5" |
| | "iso-8859-9" |
| | "iso-ir-148" |
| | "iso8859-9" |
| | "iso88599" |
| | "iso_8859-9" |
| | "iso_8859-9:1989" |
| | "l5" |
| | "latin5" |
| | "windows-1254" |
| | "x-cp1254" |
| windows-1255 | "cp1255" |
| | "windows-1255" |
| | "x-cp1255" |
| windows-1256 | "cp1256" |
| | "windows-1256" |
| | "x-cp1256" |
| windows-1257 | "cp1257" |
| | "windows-1257" |
| | "x-cp1257" |
| windows-1258 | "cp1258" |
| | "windows-1258" |
| | "x-cp1258" |
| x-mac-cyrillic | "x-mac-cyrillic" |
| | "x-mac-ukrainian" |
| **Legacy multi-byte Chinese (simplified) encodings** | |
| GBK | "chinese" |
| | "csgb2312" |
| | "csiso58gb231280" |
| | "gb2312" |
| | "gb_2312" |
| | "gb_2312-80" |
| | "gbk" |
| | "iso-ir-58" |
| | "x-gbk" |
| gb18030 | "gb18030" |
| **Legacy multi-byte Chinese (traditional) encodings** | |
| Big5 | "big5" |
| | "big5-hkscs" |
| | "cn-big5" |
| | "csbig5" |
| | "x-x-big5" |
| **Legacy multi-byte Japanese encodings** | |
| EUC-JP | "cseucpkdfmtjapanese" |
| | "euc-jp" |
| | "x-euc-jp" |
| ISO-2022-JP | "csiso2022jp" |
| | "iso-2022-jp" |
| Shift_JIS | "csshiftjis" |
| | "ms932" |
| | "ms_kanji" |
| | "shift-jis" |
| | "shift_jis" |
| | "sjis" |
| | "windows-31j" |
| | "x-sjis" |
| **Legacy multi-byte Korean encodings** | |
| EUC-KR | "cseuckr" |
| | "csksc56011987" |
| | "euc-kr" |
| | "iso-ir-149" |
| | "korean" |
| | "ks_c_5601-1987" |
| | "ks_c_5601-1989" |
| | "ksc5601" |
| | "ksc_5601" |
| | "windows-949" |
| **Legacy miscellaneous encodings** | |
| replacement | "csiso2022kr" |
| | "hz-gb-2312" |
| | "iso-2022-cn" |
| | "iso-2022-cn-ext" |
| | "iso-2022-kr" |
| | "replacement" |
| UTF-16BE | "unicodefffe" |
| | "utf-16be" |
| UTF-16LE | "csunicode" |
| | "iso-10646-ucs-2" |
| | "ucs-2" |
| | "unicode" |
| | "unicodefeff" |
| | "utf-16" |
| | "utf-16le" |
| x-user-defined | "x-user-defined" |