Skip to content

Commit 364a2ba

Browse files
committed
fix html2text
1 parent b185354 commit 364a2ba

File tree

3 files changed

+5
-2
lines changed

3 files changed

+5
-2
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
name = "WordCloud"
22
uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
33
authors = ["guoyongzhi <guo-yong-zhi@outlook.com>"]
4-
version = "1.1.0"
4+
version = "1.1.1"
55

66
[deps]
77
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"

src/textprocessing.jl

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -294,9 +294,10 @@ function processtext(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}
294294
end
295295
function html2text(content::AbstractString)
296296
patterns = [
297-
r"\"[\s\S]*?\"" => " ",
298297
r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ",
299298
r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ",
299+
r"<!--[\s\S]*?-->" => " ",
300+
r"<[\s\S]*?=\s*?\"[\s\S]*?\"\s*?>" => " ",
300301
"<br>" => "\n",
301302
r"<[\s\S]*?>" => " ",
302303
]

test/test_textprocessing.jl

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,4 +78,6 @@
7878

7979
htstr = """&pound;abcd<div x-component-name="DisasterSokuho" x-component-data="{&quot;earthquake&quot;:&quot;&lt;!-- 地震速報のメッセージを消しました (2024-04-25 12:00:08)-->\n&quot;,&quot;tsunami&quot;:&quot;&lt;!-- 津波速報のメッセージを消しました (2024-04-25 12:05:35)-->\n&quot;}"><div class="tYQVs"><div>"""
8080
@test strip(html2text(htstr)) == "abcd"
81+
htstr= """<span class="reference-text">"<a rel="nofollow" class="external text" href="不应该出现">something</a>." <i>"""
82+
@test replace(html2text(htstr), r"\s"=>"") == "\"something.\""
8183
end

0 commit comments

Comments
 (0)