Skip to content

Commit c6fa8b8

Browse files
committed
Prevent text extractors from adding unnecessary spaces
1 parent 47d061b commit c6fa8b8

File tree

5 files changed

+205
-8
lines changed

5 files changed

+205
-8
lines changed

lib/meeseeks/extractor/data.ex

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,52 @@ defmodule Meeseeks.Extractor.Data do
1717
def from_node(%Element{id: id}, document) do
1818
Helpers.child_nodes(document, id)
1919
|> Enum.filter(&Helpers.data_node?/1)
20-
|> Enum.map(&from_node(&1, document))
21-
|> Enum.intersperse(" ")
20+
|> join_nodes(document)
2221
end
2322

2423
def from_node(%{__struct__: struct}, _) when struct in @other_nodes, do: []
24+
25+
# join_nodes
26+
27+
defp join_nodes([], _document), do: []
28+
29+
defp join_nodes([node], document), do: from_node(node, document)
30+
31+
defp join_nodes(nodes, document), do: join_nodes(nodes, [], document)
32+
33+
defp join_nodes([], acc, _document), do: :lists.reverse(acc)
34+
35+
defp join_nodes([node | nodes], acc, document) do
36+
acc = join_node(node, acc, document)
37+
join_nodes(nodes, acc, document)
38+
end
39+
40+
# Head
41+
defp join_node(node, [], document) do
42+
case from_node(node, document) do
43+
[] -> []
44+
"" -> []
45+
iodata -> [iodata]
46+
end
47+
end
48+
49+
# Tail
50+
defp join_node(node, acc, document) do
51+
case from_node(node, document) do
52+
[] ->
53+
acc
54+
55+
"" ->
56+
acc
57+
58+
iodata ->
59+
[previous | _] = acc
60+
61+
if Helpers.ends_in_whitespace?(previous) do
62+
[iodata | acc]
63+
else
64+
[iodata, " " | acc]
65+
end
66+
end
67+
end
2568
end

lib/meeseeks/extractor/helpers.ex

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,72 @@ defmodule Meeseeks.Extractor.Helpers do
5656
defp html_escape_char("&"), do: "&amp;"
5757
defp html_escape_char("\""), do: "&quot;"
5858
defp html_escape_char("'"), do: "&#39;"
59+
60+
# ends_in_whitespace?
61+
62+
# Adapted from trim_trailing in String.Break, which is found in
63+
# elixir/unicode/properties.ex
64+
65+
@whitespace_max_size 3
66+
67+
whitespace = List.flatten([
68+
Enum.map(String.to_integer("0009", 16)..String.to_integer("000D", 16), fn int -> <<int::utf8>> end),
69+
<<String.to_integer("0020", 16)::utf8>>,
70+
<<String.to_integer("0085", 16)::utf8>>,
71+
<<String.to_integer("00A0", 16)::utf8>>,
72+
<<String.to_integer("1680", 16)::utf8>>,
73+
Enum.map(String.to_integer("2000", 16)..String.to_integer("200A", 16), fn int -> <<int::utf8>> end),
74+
<<String.to_integer("2028", 16)::utf8>>,
75+
<<String.to_integer("2029", 16)::utf8>>,
76+
<<String.to_integer("202F", 16)::utf8>>,
77+
<<String.to_integer("205F", 16)::utf8>>,
78+
<<String.to_integer("3000", 16)::utf8>>
79+
])
80+
81+
def ends_in_whitespace?(iodata)
82+
def ends_in_whitespace?(l) when is_list(l), do: list_ends_in_whitespace?(l)
83+
def ends_in_whitespace?(b) when is_binary(b), do: bin_ends_in_whitespace?(b)
84+
85+
defp list_ends_in_whitespace?([]), do: false
86+
defp list_ends_in_whitespace?([x]), do: ends_in_whitespace?(x)
87+
defp list_ends_in_whitespace?([_, x]), do: ends_in_whitespace?(x)
88+
defp list_ends_in_whitespace?([_, _, x]), do: ends_in_whitespace?(x)
89+
90+
defp list_ends_in_whitespace?(l) do
91+
[x | _] = :lists.reverse(l)
92+
ends_in_whitespace?(x)
93+
end
94+
95+
defp bin_ends_in_whitespace?(""), do: false
96+
defp bin_ends_in_whitespace?(b) do
97+
bin_ends_in_whitespace?(b, byte_size(b))
98+
end
99+
100+
defp bin_ends_in_whitespace?(b, size) when size < @whitespace_max_size do
101+
s_bin_ends_in_whitespace?(b)
102+
end
103+
104+
defp bin_ends_in_whitespace?(b, size) do
105+
b_end = binary_part(b, size, -@whitespace_max_size)
106+
l_bin_ends_in_whitespace?(b_end)
107+
end
108+
109+
for cp <- whitespace do
110+
case byte_size(cp) do
111+
3 ->
112+
defp l_bin_ends_in_whitespace?(unquote(cp)), do: true
113+
114+
2 ->
115+
defp l_bin_ends_in_whitespace?(<<_, unquote(cp)>>), do: true
116+
defp s_bin_ends_in_whitespace?(unquote(cp)), do: true
117+
118+
1 ->
119+
defp l_bin_ends_in_whitespace?(<<_, _, unquote(cp)>>), do: true
120+
defp s_bin_ends_in_whitespace?(<<_, unquote(cp)>>), do: true
121+
defp s_bin_ends_in_whitespace?(unquote(cp)), do: true
122+
end
123+
end
124+
125+
defp l_bin_ends_in_whitespace?(_), do: false
126+
defp s_bin_ends_in_whitespace?(_), do: false
59127
end

lib/meeseeks/extractor/own_text.ex

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,54 @@ defmodule Meeseeks.Extractor.OwnText do
1515
def from_node(%Element{id: id}, document) do
1616
Helpers.child_nodes(document, id)
1717
|> Enum.filter(&Helpers.text_node?/1)
18-
|> Enum.map(&from_node(&1, document))
19-
|> Enum.intersperse(" ")
18+
|> join_nodes(document)
2019
end
2120

2221
def from_node(%Text{content: content}, _), do: content
2322

2423
def from_node(%{__struct__: struct}, _) when struct in @other_nodes, do: []
24+
25+
# join_nodes
26+
27+
defp join_nodes([], _document), do: []
28+
29+
defp join_nodes([node], document), do: from_node(node, document)
30+
31+
defp join_nodes(nodes, document), do: join_nodes(nodes, [], document)
32+
33+
defp join_nodes([], acc, _document), do: :lists.reverse(acc)
34+
35+
defp join_nodes([node | nodes], acc, document) do
36+
acc = join_node(node, acc, document)
37+
join_nodes(nodes, acc, document)
38+
end
39+
40+
# Head
41+
defp join_node(node, [], document) do
42+
case from_node(node, document) do
43+
[] -> []
44+
"" -> []
45+
iodata -> [iodata]
46+
end
47+
end
48+
49+
# Tail
50+
defp join_node(node, acc, document) do
51+
case from_node(node, document) do
52+
[] ->
53+
acc
54+
55+
"" ->
56+
acc
57+
58+
iodata ->
59+
[previous | _] = acc
60+
61+
if Helpers.ends_in_whitespace?(previous) do
62+
[iodata | acc]
63+
else
64+
[iodata, " " | acc]
65+
end
66+
end
67+
end
2568
end

lib/meeseeks/extractor/text.ex

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,54 @@ defmodule Meeseeks.Extractor.Text do
1414

1515
def from_node(%Element{id: id}, document) do
1616
Helpers.child_nodes(document, id)
17-
|> Enum.map(&from_node(&1, document))
18-
|> Enum.intersperse(" ")
17+
|> join_nodes(document)
1918
end
2019

2120
def from_node(%Text{content: content}, _), do: content
2221

2322
def from_node(%{__struct__: struct}, _) when struct in @other_nodes, do: []
23+
24+
# join_nodes
25+
26+
defp join_nodes([], _document), do: []
27+
28+
defp join_nodes([node], document), do: from_node(node, document)
29+
30+
defp join_nodes(nodes, document), do: join_nodes(nodes, [], document)
31+
32+
defp join_nodes([], acc, _document), do: :lists.reverse(acc)
33+
34+
defp join_nodes([node | nodes], acc, document) do
35+
acc = join_node(node, acc, document)
36+
join_nodes(nodes, acc, document)
37+
end
38+
39+
# Head
40+
defp join_node(node, [], document) do
41+
case from_node(node, document) do
42+
[] -> []
43+
"" -> []
44+
iodata -> [iodata]
45+
end
46+
end
47+
48+
# Tail
49+
defp join_node(node, acc, document) do
50+
case from_node(node, document) do
51+
[] ->
52+
acc
53+
54+
"" ->
55+
acc
56+
57+
iodata ->
58+
[previous | _] = acc
59+
60+
if Helpers.ends_in_whitespace?(previous) do
61+
[iodata | acc]
62+
else
63+
[iodata, " " | acc]
64+
end
65+
end
66+
end
2467
end

test/meeseeks/document/node_test.exs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ defmodule Meeseeks.Document.NodeTest do
6565

6666
test "get data without collapsing whitespace when node has data" do
6767
node = Document.get_node(@document, 7)
68-
expected = "3 4 5"
68+
expected = "3 4 5"
6969
assert Node.data(node, @document, collapse_whitespace: false) == expected
7070
end
7171

@@ -142,7 +142,7 @@ defmodule Meeseeks.Document.NodeTest do
142142

143143
test "get text without collapsing whitespace when node has text" do
144144
node = Document.get_node(@document, 7)
145-
expected = "0 0.5\n 1"
145+
expected = "0 0.5\n 1"
146146
assert Node.text(node, @document, collapse_whitespace: false) == expected
147147
end
148148

0 commit comments

Comments
 (0)