Skip to content

Commit 09e0f39

Browse files
committed
fix crash when LSP cursor position falls in the middle of a UTF-16 surrogate pair
1 parent b904fb4 commit 09e0f39

File tree

4 files changed

+80
-58
lines changed

4 files changed

+80
-58
lines changed

apps/debug_adapter/lib/debug_adapter/utils.ex

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,39 @@ defmodule ElixirLS.DebugAdapter.Utils do
2828
def dap_character_to_elixir(_utf8_line, dap_character) when dap_character <= 0, do: 0
2929

3030
def dap_character_to_elixir(utf8_line, dap_character) do
31-
utf16_line =
32-
utf8_line
33-
|> characters_to_binary!(:utf8, :utf16)
34-
35-
byte_size = byte_size(utf16_line)
36-
37-
utf8_character =
38-
utf16_line
39-
|> (&binary_part(
40-
&1,
41-
0,
42-
min(dap_character * 2, byte_size)
43-
)).()
44-
|> characters_to_binary!(:utf16, :utf8)
45-
|> String.length()
46-
47-
utf8_character
31+
utf16_line = characters_to_binary!(utf8_line, :utf8, :utf16)
32+
max_bytes = byte_size(utf16_line)
33+
34+
# DAP character -> UTF-16 code units -> bytes (2 bytes per code unit)
35+
offset0 = dap_character * 2
36+
offset = clamp_offset_to_surrogate_boundary(utf16_line, offset0, max_bytes)
37+
38+
partial = binary_part(utf16_line, 0, offset)
39+
partial_utf8 = characters_to_binary!(partial, :utf16, :utf8)
40+
String.length(partial_utf8)
41+
end
42+
43+
# “Clamp” helper.
44+
# - If offset is out of bounds, keep it within [0, max_bytes].
45+
# - Then check if we landed *immediately* after a high surrogate (0xD800..0xDBFF);
46+
# if so, subtract 2 to avoid slicing in the middle.
47+
defp clamp_offset_to_surrogate_boundary(_bin, offset, max_bytes) when offset >= max_bytes,
48+
do: max_bytes
49+
50+
defp clamp_offset_to_surrogate_boundary(_bin, offset, _max_bytes) when offset <= 0,
51+
do: 0
52+
53+
defp clamp_offset_to_surrogate_boundary(bin, offset, _max_bytes) do
54+
# We know 0 < offset < max_bytes at this point
55+
# Look at the 2 bytes immediately before `offset`
56+
<<_::binary-size(offset - 2), maybe_high::binary-size(2), _::binary>> = bin
57+
code_unit = :binary.decode_unsigned(maybe_high, :big)
58+
59+
# If that 16-bit code unit is a high surrogate, the cut falls between the two halves of a surrogate pair, so step back one code unit (2 bytes)
60+
if code_unit in 0xD800..0xDBFF do
61+
offset - 2
62+
else
63+
offset
64+
end
4865
end
4966
end

apps/debug_adapter/test/utils_test.exs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,5 +62,11 @@ defmodule ElixirLS.DebugAdapter.UtilsTest do
6262
test "dap_character_to_elixir utf8" do
6363
assert 1 == Utils.dap_character_to_elixir("🏳️‍🌈abcde", 6)
6464
end
65+
66+
test "dap_character_to_elixir index inside high surrogate pair" do
67+
assert 6 == Utils.dap_character_to_elixir("Hello 🙌 World", 6)
68+
assert 6 == Utils.dap_character_to_elixir("Hello 🙌 World", 7)
69+
assert 7 == Utils.dap_character_to_elixir("Hello 🙌 World", 8)
70+
end
6571
end
6672
end

apps/language_server/lib/language_server/source_file.ex

Lines changed: 34 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -44,30 +44,6 @@ defmodule ElixirLS.LanguageServer.SourceFile do
4444
do_lines_with_endings(rest, line <> <<char::utf8>>)
4545
end
4646

47-
def text_before(text, position_line, position_character) do
48-
text
49-
|> lines
50-
|> Enum.with_index()
51-
|> Enum.reduce_while([], fn
52-
{line, count}, acc when count < position_line ->
53-
{:cont, [line, ?\n | acc]}
54-
55-
{line, count}, acc when count == position_line ->
56-
slice =
57-
characters_to_binary!(line, :utf8, :utf16)
58-
|> (&binary_part(
59-
&1,
60-
0,
61-
min(position_character * 2, byte_size(&1))
62-
)).()
63-
|> characters_to_binary!(:utf16, :utf8)
64-
65-
{:halt, [slice, ?\n | acc]}
66-
end)
67-
|> Enum.reverse()
68-
|> IO.iodata_to_binary()
69-
end
70-
7147
def apply_content_changes(%__MODULE__{} = source_file, []) do
7248
source_file
7349
end
@@ -339,23 +315,40 @@ defmodule ElixirLS.LanguageServer.SourceFile do
339315
def lsp_character_to_elixir(_utf8_line, lsp_character) when lsp_character <= 0, do: 1
340316

341317
def lsp_character_to_elixir(utf8_line, lsp_character) do
342-
utf16_line =
343-
utf8_line
344-
|> characters_to_binary!(:utf8, :utf16)
345-
346-
byte_size = byte_size(utf16_line)
347-
348-
utf8_character =
349-
utf16_line
350-
|> (&binary_part(
351-
&1,
352-
0,
353-
min(lsp_character * 2, byte_size)
354-
)).()
355-
|> characters_to_binary!(:utf16, :utf8)
356-
|> String.length()
357-
358-
utf8_character + 1
318+
utf16_line = characters_to_binary!(utf8_line, :utf8, :utf16)
319+
max_bytes = byte_size(utf16_line)
320+
321+
# LSP character -> code units -> bytes
322+
offset0 = lsp_character * 2
323+
offset = clamp_offset_to_surrogate_boundary(utf16_line, offset0, max_bytes)
324+
325+
partial = binary_part(utf16_line, 0, offset)
326+
partial_utf8 = characters_to_binary!(partial, :utf16, :utf8)
327+
String.length(partial_utf8) + 1
328+
end
329+
330+
# “Clamp” helper.
331+
# - If offset is out of bounds, keep it within [0, max_bytes].
332+
# - Then check if we landed *immediately* after a high surrogate (0xD800..0xDBFF);
333+
# if so, subtract 2 to avoid slicing in the middle.
334+
defp clamp_offset_to_surrogate_boundary(_bin, offset, max_bytes) when offset >= max_bytes,
335+
do: max_bytes
336+
337+
defp clamp_offset_to_surrogate_boundary(_bin, offset, _max_bytes) when offset <= 0,
338+
do: 0
339+
340+
defp clamp_offset_to_surrogate_boundary(bin, offset, _max_bytes) do
341+
# We know 0 < offset < max_bytes at this point
342+
# Look at the 2 bytes immediately before `offset`
343+
<<_::binary-size(offset - 2), maybe_high::binary-size(2), _::binary>> = bin
344+
code_unit = :binary.decode_unsigned(maybe_high, :big)
345+
346+
# If that 16-bit code unit is a high surrogate, the cut falls between the two halves of a surrogate pair, so step back one code unit (2 bytes)
347+
if code_unit in 0xD800..0xDBFF do
348+
offset - 2
349+
else
350+
offset
351+
end
359352
end
360353

361354
def lsp_position_to_elixir(_urf8_text_or_lines, {lsp_line, _lsp_character}) when lsp_line < 0,

apps/language_server/test/source_file_test.exs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,12 @@ defmodule ElixirLS.LanguageServer.SourceFileTest do
653653
assert {1, 2} == SourceFile.lsp_position_to_elixir("🏳️‍🌈abcde", {0, 6})
654654
end
655655

656+
test "lsp_position_to_elixir single line index inside high surrogate pair" do
657+
assert {1, 7} == SourceFile.lsp_position_to_elixir("Hello 🙌 World", {0, 6})
658+
assert {1, 7} == SourceFile.lsp_position_to_elixir("Hello 🙌 World", {0, 7})
659+
assert {1, 8} == SourceFile.lsp_position_to_elixir("Hello 🙌 World", {0, 8})
660+
end
661+
656662
test "lsp_position_to_elixir multi line" do
657663
assert {2, 2} == SourceFile.lsp_position_to_elixir("abcde\n1234", {1, 1})
658664
end

0 commit comments

Comments
 (0)