Merge pull request atomvm#1708 from jgonet/jgonet/unicode-option

bettio · bettio · commit f629e56cb276 · 2025-06-15T20:14:18.000+02:00
Add `unicode` as an alias for the `utf8` encoding option in `unicode:characters_to_list/2` and `unicode:characters_to_binary/2,3`

These changes are made under both the "Apache 2.0" and the "GNU Lesser General
Public License 2.1 or later" license terms (dual license).

SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
diff --git a/libs/estdlib/src/unicode.erl b/libs/estdlib/src/unicode.erl
@@ -47,7 +47,7 @@
     char() | unicode_binary() | charlist(), unicode_binary() | []
 ).
 
--type encoding() :: utf8 | latin1.
+-type encoding() :: utf8 | unicode | latin1.
 
 -export_type([
     unicode_binary/0,
diff --git a/src/libAtomVM/nifs.c b/src/libAtomVM/nifs.c
@@ -5341,13 +5341,28 @@ static term nif_maps_next(Context *ctx, int argc, term argv[])
     return ret;
 }
 
+static bool encoding_from_atom(term encoding_atom, enum CharDataEncoding *encoding)
+{
+    switch (encoding_atom) {
+        case LATIN1_ATOM:
+            *encoding = Latin1Encoding;
+            return true;
+        case UTF8_ATOM:
+        case UNICODE_ATOM:
+            *encoding = UTF8Encoding;
+            return true;
+        default:
+            return false;
+    }
+}
+
 static term nif_unicode_characters_to_list(Context *ctx, int argc, term argv[])
 {
     enum CharDataEncoding in_encoding = UTF8Encoding;
-    if (argc == 2) {
-        if (argv[1] == LATIN1_ATOM) {
-            in_encoding = Latin1Encoding;
-        } else if (UNLIKELY((argv[1] != UTF8_ATOM))) {
+    bool has_in_encoding = argc == 2;
+    if (has_in_encoding) {
+        term in_encoding_atom = argv[1];
+        if (UNLIKELY(!encoding_from_atom(in_encoding_atom, &in_encoding))) {
             RAISE_ERROR(BADARG_ATOM);
         }
     }
@@ -5406,18 +5421,18 @@ static term nif_unicode_characters_to_binary(Context *ctx, int argc, term argv[]
 {
     enum CharDataEncoding in_encoding = UTF8Encoding;
     enum CharDataEncoding out_encoding = UTF8Encoding;
-    if (argc > 1) {
-        if (argv[1] == LATIN1_ATOM) {
-            in_encoding = Latin1Encoding;
-        } else if (UNLIKELY((argv[1] != UTF8_ATOM))) {
+    bool has_in_encoding = argc > 1;
+    bool has_out_encoding = argc == 3;
+    if (has_in_encoding) {
+        term in_encoding_atom = argv[1];
+        if (UNLIKELY(!encoding_from_atom(in_encoding_atom, &in_encoding))) {
             RAISE_ERROR(BADARG_ATOM);
         }
-        if (argc == 3) {
-            if (argv[2] == LATIN1_ATOM) {
-                out_encoding = Latin1Encoding;
-            } else if (UNLIKELY((argv[2] != UTF8_ATOM))) {
-                RAISE_ERROR(BADARG_ATOM);
-            }
+    }
+    if (has_out_encoding) {
+        term out_encoding_atom = argv[2];
+        if (UNLIKELY(!encoding_from_atom(out_encoding_atom, &out_encoding))) {
+            RAISE_ERROR(BADARG_ATOM);
         }
     }
     size_t len;
diff --git a/tests/erlang_tests/test_unicode.erl b/tests/erlang_tests/test_unicode.erl
@@ -52,12 +52,17 @@ test_to_list_latin1() ->
 test_to_list_utf8() ->
     "hello" = unicode:characters_to_list(<<"hello">>),
     "hello" = unicode:characters_to_list("hello", utf8),
+    "hello" = unicode:characters_to_list("hello", unicode),
     "hé" = unicode:characters_to_list(<<"hé"/utf8>>),
     "hé" = unicode:characters_to_list("hé"),
     "hé" = unicode:characters_to_list(<<"hé"/utf8>>, utf8),
+    "hé" = unicode:characters_to_list(<<"hé"/utf8>>, unicode),
     {error, "h", [-1]} = unicode:characters_to_list([$h, -1], utf8),
+    {error, "h", [-1]} = unicode:characters_to_list([$h, -1], unicode),
     [$h, 16#10ffff] = unicode:characters_to_list([$h, 16#10ffff], utf8),
+    [$h, 16#10ffff] = unicode:characters_to_list([$h, 16#10ffff], unicode),
     {error, "h", [16#110000]} = unicode:characters_to_list([$h, 16#110000], utf8),
+    {error, "h", [16#110000]} = unicode:characters_to_list([$h, 16#110000], unicode),
     {incomplete, "h", <<"é">>} = unicode:characters_to_list(<<"hé">>),
     {error, [], <<16#A0, 16#A1>>} = unicode:characters_to_list(<<16#A0, 16#A1>>),
     % Erlang/OTP documentation writes: "The last part is mostly for debugging"
@@ -131,10 +136,21 @@ test_to_binary_latin1() ->
 
 test_to_binary_utf8() ->
     <<"hello">> = unicode:characters_to_binary("hello", utf8, utf8),
+    <<"hello">> = unicode:characters_to_binary("hello", unicode, unicode),
+    <<"hello">> = unicode:characters_to_binary("hello", utf8, unicode),
+    <<"hello">> = unicode:characters_to_binary("hello", unicode, utf8),
     <<"hello">> = unicode:characters_to_binary(<<"hello">>, utf8, utf8),
+    <<"hello">> = unicode:characters_to_binary(<<"hello">>, unicode, unicode),
+    <<"hello">> = unicode:characters_to_binary(<<"hello">>, utf8, unicode),
+    <<"hello">> = unicode:characters_to_binary(<<"hello">>, unicode, utf8),
     <<"hé"/utf8>> = unicode:characters_to_binary("hé", latin1, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary("hé", latin1, unicode),
     <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, unicode, unicode),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8, unicode),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, unicode, utf8),
     <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, utf8),
+    <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>, unicode),
     <<"hé"/utf8>> = unicode:characters_to_binary(<<"hé"/utf8>>),
     {error, <<"h">>, [-1]} = unicode:characters_to_binary([$h, -1]),
     <<"h", 244, 143, 191, 191>> = unicode:characters_to_binary([$h, 16#10FFFF]),