Skip to content

Commit 01752c8

Browse files
committed
fix html2text
1 parent 6865221 commit 01752c8

File tree

3 files changed

+11
-6
lines changed

3 files changed

+11
-6
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "WordCloud"
22
uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
33
authors = ["guoyongzhi <guo-yong-zhi@outlook.com>"]
4-
version = "0.13.1"
4+
version = "0.13.2"
55

66
[deps]
77
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"

src/textprocessing.jl

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,22 +294,24 @@ function processtext(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}
294294
end
295295
function html2text(content::AbstractString)
296296
patterns = [
297+
r"\"[\s\S]*?\"" => " ",
297298
r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ",
298299
r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ",
299-
r"<!--[\s\S]*?-->" => " ",
300300
"<br>" => "\n",
301301
r"<[\s\S]*?>" => " ",
302+
]
303+
for p in patterns
304+
content = replace(content, p) # a single pass does not work
305+
end
306+
patterns = [
302307
"&nbsp;" => " ",
303308
"&quot;" => "\"",
304309
"&amp;" => "&",
305310
"&lt;" => "<",
306311
"&gt;" => ">",
307312
r"&#?\w{1,6};" => " ",
308313
]
309-
for p in patterns
310-
content = replace(content, p)
311-
end
312-
content
314+
replace(content, patterns...)
313315
end
314316
html2text(file::IO) = html2text(read(file, String))
315317
end

test/test_textprocessing.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,7 @@
7575
@test abs(pm(12.5, 2-1e-8) - sqrt(12.5^2/2+1/2)) < 1e-6
7676
@test pm(π, Inf) ≈ π
7777
@test pm(7π, -Inf) == 1.
78+
79+
htstr = """&pound;abcd<div x-component-name="DisasterSokuho" x-component-data="{&quot;earthquake&quot;:&quot;&lt;!-- 地震速報のメッセージを消しました (2024-04-25 12:00:08)-->\n&quot;,&quot;tsunami&quot;:&quot;&lt;!-- 津波速報のメッセージを消しました (2024-04-25 12:05:35)-->\n&quot;}"><div class="tYQVs"><div>"""
80+
@test strip(html2text(htstr)) == "abcd"
7881
end

0 commit comments

Comments
 (0)