Skip to content

Commit 876a5bc

Browse files
committed
add test for variation selectors
1 parent 614d3dc commit 876a5bc

File tree

2 files changed

+132
-0
lines changed

2 files changed

+132
-0
lines changed

apps/language_server/test/source_file_test.exs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,63 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
659659
assert {1, 8} == SourceFile.lsp_position_to_elixir("Hello 🙌 World", {0, 8})
660660
end
661661

662+
test "lsp_position_to_elixir single line index inside supplementary variation selector surrogate pair" do
  # Byte 20 is >= 16, so the encoder maps it to a supplementary-plane
  # variation selector: 0xE0100 + (20 - 16) = 0xE0104.
  #
  # UTF-16 layout of the resulting line:
  #   code unit 0        "A" (BMP character, a single code unit)
  #   code units 1 and 2 variation selector (surrogate pair)
  #   code unit 3        "B"
  line = VariationSelectorEncoder.encode("A", <<20>>) <> "B"

  # "A" together with its variation selector is one grapheme cluster, so the
  # expected column stays at 2 until the offset also takes in "B".

  # Offset 1 covers just "A": one grapheme, column 1 + 1 = 2.
  after_base = SourceFile.lsp_position_to_elixir(line, {0, 1})
  # Offset 2 falls between the two surrogates and is expected to be clamped
  # back to "A" only.
  inside_pair = SourceFile.lsp_position_to_elixir(line, {0, 2})
  # Offset 3 covers the whole surrogate pair: still the same single grapheme.
  after_selector = SourceFile.lsp_position_to_elixir(line, {0, 3})
  # Offset 4 additionally covers "B": two graphemes, column 3.
  after_b = SourceFile.lsp_position_to_elixir(line, {0, 4})

  assert {1, 2} == after_base
  assert {1, 2} == inside_pair
  assert {1, 2} == after_selector
  assert {1, 3} == after_b
end
692+
693+
test "lsp_position_to_elixir with BMP variation selector" do
  # Byte 10 is < 16, so the encoder maps it to a BMP variation selector:
  # 0xFE00 + 10 = 0xFE0A.
  #
  # UTF-16 layout of the resulting line (every character is one code unit):
  #   code unit 0  "A"
  #   code unit 1  variation selector
  #   code unit 2  "B"
  line = VariationSelectorEncoder.encode("A", <<10>>) <> "B"

  # "A" and its variation selector form one grapheme cluster, so the column
  # only advances past 2 once "B" is included.

  # Offset 1 covers just "A": one grapheme, column 2.
  after_base = SourceFile.lsp_position_to_elixir(line, {0, 1})
  # Offset 2 covers "A" plus the selector: still one grapheme, column 2.
  after_selector = SourceFile.lsp_position_to_elixir(line, {0, 2})
  # Offset 3 also covers "B": two graphemes, column 3.
  after_b = SourceFile.lsp_position_to_elixir(line, {0, 3})

  assert {1, 2} == after_base
  assert {1, 2} == after_selector
  assert {1, 3} == after_b
end
718+
662719
test "lsp_position_to_elixir multi line" do
  # Two-line document; the queried LSP position {1, 1} lies on the second line.
  source = "abcde\n1234"
  lsp_position = {1, 1}

  assert {2, 2} == SourceFile.lsp_position_to_elixir(source, lsp_position)
end
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
defmodule VariationSelectorEncoder do
  @moduledoc """
  Encodes arbitrary bytes as Unicode variation selectors appended to a base string.

  Every byte is represented by exactly one variation selector codepoint:

    * bytes `0..15` map to `U+FE00..U+FE0F` (`U+FE00 + byte`), which lie in the BMP;
    * bytes `16..255` map to `U+E0100..U+E01EF` (`U+E0100 + (byte - 16)`), which lie
      outside the BMP.
  """

  @doc """
  Appends one variation selector per byte of `bytes` to `base`.

  `base` (e.g. a letter or an emoji) is kept unchanged at the start of the result;
  the selectors follow it in the same order as the bytes they represent.

  ## Examples

      iex> VariationSelectorEncoder.encode("A", <<10, 20>>)
      <<"A", 0xFE0A::utf8, 0xE0104::utf8>>

  """
  @spec encode(String.t(), binary()) :: String.t()
  def encode(base, bytes) when is_binary(base) and is_binary(bytes) do
    for <<byte::8 <- bytes>>, into: base do
      byte_to_variation_selector(byte)
    end
  end

  @doc """
  Decodes a string created by `encode/2` and returns the hidden bytes as a binary.

  Codepoints that are not variation selectors (such as the base character) are
  ignored, so the result contains one byte per variation selector found in `encoded`.

  ## Examples

      iex> "A" |> VariationSelectorEncoder.encode("hi") |> VariationSelectorEncoder.decode()
      "hi"

  """
  @spec decode(String.t()) :: binary()
  def decode(encoded) when is_binary(encoded) do
    for <<codepoint::utf8 <- encoded>>, into: <<>> do
      variation_selector_to_byte(codepoint)
    end
  end

  # Converts a single byte into the UTF-8 encoding of its variation selector.
  # The guards spell out the 0..255 contract that `encode/2` guarantees.
  defp byte_to_variation_selector(byte) when byte in 0..15 do
    <<0xFE00 + byte::utf8>>
  end

  defp byte_to_variation_selector(byte) when byte in 16..255 do
    <<0xE0100 + (byte - 16)::utf8>>
  end

  # Converts a codepoint back into the one-byte binary it encodes. Anything that
  # is not a variation selector contributes an empty binary, i.e. nothing.
  defp variation_selector_to_byte(codepoint) when codepoint in 0xFE00..0xFE0F do
    <<codepoint - 0xFE00>>
  end

  defp variation_selector_to_byte(codepoint) when codepoint in 0xE0100..0xE01EF do
    <<codepoint - 0xE0100 + 16>>
  end

  defp variation_selector_to_byte(_other), do: <<>>
end
52+
53+
# Example usage:
54+
# defmodule Main do
55+
# def run do
56+
# # Encode the bytes corresponding to "hello" using the base character 😊.
57+
# original = "hello"
58+
# char = "😊"
59+
# IO.puts("Original: " <> original)
60+
# IO.puts("Original bytes: " <> Base.encode16(original))
61+
# IO.puts("Char: " <> char)
62+
# IO.puts("Char bytes: " <> Base.encode16(char))
63+
# encoded = VariationSelectorEncoder.encode(char, original)
64+
65+
# IO.puts("Encoded: " <> encoded)
66+
# IO.puts("Encoded bytes: " <> Base.encode16(encoded))
67+
68+
# # Decode the message back into bytes.
69+
# decoded_bytes = VariationSelectorEncoder.decode(encoded)
70+
# IO.puts("Decoded: " <> decoded_bytes)
71+
# IO.puts("Decoded bytes: " <> Base.encode16(decoded_bytes))
72+
# end
73+
# end
74+
75+
# Main.run()

0 commit comments

Comments
 (0)