@@ -659,6 +659,63 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
659
659
assert { 1 , 8 } == SourceFile . lsp_position_to_elixir ( "Hello 🙌 World" , { 0 , 8 } )
660
660
end
661
661
662
test "lsp_position_to_elixir single line index inside supplementary variation selector surrogate pair" do
  # Byte 20 is >= 16, so the encoder is expected to emit a supplementary-plane
  # variation selector: 0xE0100 + (20 - 16) = 0xE0104. The encoder prepends the
  # base character "A"; a trailing "B" is appended here.
  #
  # UTF-16 layout of the resulting line:
  #   code unit 0       -> "A" (BMP, one code unit)
  #   code units 1 & 2  -> variation selector (surrogate pair)
  #   code unit 3       -> "B"
  text = VariationSelectorEncoder.encode("A", <<20>>) <> "B"

  # Convert every UTF-16 offset from 1 through 4 up front, in ascending order,
  # before asserting anything.
  positions =
    Enum.map(1..4, fn utf16_offset ->
      SourceFile.lsp_position_to_elixir(text, {0, utf16_offset})
    end)

  # "A" plus its variation selector form one grapheme cluster, so:
  #   offset 1 (just "A")                  -> 1 grapheme            -> column 2
  #   offset 2 (inside the surrogate pair) -> clamped back to "A"   -> column 2
  #   offset 3 (the full surrogate pair)   -> still 1 grapheme      -> column 2
  #   offset 4 (the cluster plus "B")      -> 2 graphemes           -> column 3
  assert positions == [{1, 2}, {1, 2}, {1, 2}, {1, 3}]
end
692
+
693
test "lsp_position_to_elixir with BMP variation selector" do
  # Byte 10 is < 16, so the encoder is expected to emit a BMP variation
  # selector: 0xFE00 + 10 = 0xFE0A. Both "A" and the selector then occupy a
  # single UTF-16 code unit each.
  #
  # UTF-16 layout of the resulting line:
  #   code unit 0 -> "A"
  #   code unit 1 -> variation selector
  #   code unit 2 -> "B"
  text = VariationSelectorEncoder.encode("A", <<10>>) <> "B"

  # Convert every UTF-16 offset from 1 through 3 up front, in ascending order,
  # before asserting anything.
  positions =
    Enum.map(1..3, fn utf16_offset ->
      SourceFile.lsp_position_to_elixir(text, {0, utf16_offset})
    end)

  # "A" and its BMP variation selector form one grapheme cluster, so:
  #   offset 1 (just "A")                -> 1 grapheme       -> column 2
  #   offset 2 ("A" plus the selector)   -> still 1 grapheme -> column 2
  #   offset 3 (the cluster plus "B")    -> 2 graphemes      -> column 3
  assert positions == [{1, 2}, {1, 2}, {1, 3}]
end
718
+
662
719
test "lsp_position_to_elixir multi line" do
  # LSP positions are zero-based {line, character}; the expected Elixir
  # position for the second line of the text is {2, 2}.
  text = "abcde\n 1234"
  lsp_position = {1, 1}

  assert {2, 2} == SourceFile.lsp_position_to_elixir(text, lsp_position)
end
0 commit comments