Skip to content

Commit bc4713f

Browse files
Kill those fuckers
Remove all broken HTML entities from Aniworld and SerienStream
1 parent be43940 commit bc4713f

File tree

4 files changed

+32
-19
lines changed

4 files changed

+32
-19
lines changed

src/gucken/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
import warnings
22
warnings.filterwarnings('ignore', message='Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
33

4-
__version__ = "0.3.0"
4+
__version__ = "0.3.1"

src/gucken/provider/aniworld.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from asyncio import gather
22
from dataclasses import dataclass
3-
from html import unescape
43
from typing import Union
54

65
from bs4 import BeautifulSoup
@@ -16,7 +15,7 @@
1615
from ..hoster.streamtape import StreamtapeHoster
1716
from .common import Episode, Hoster, Language, Provider, SearchResult, Series
1817
from ..utils import json_loads
19-
18+
from ..utils import fully_unescape
2019

2120
def provider_to_hoster(provider: str, url: str) -> Hoster:
2221
if provider == "VOE":
@@ -157,9 +156,9 @@ async def search(keyword: str) -> Union[list[AniWorldSearchResult], None]:
157156
search_results.append(
158157
AniWorldSearchResult(
159158
provider_name="aniworld.to",
160-
name=unescape(series.get("name")).strip(),
159+
name=fully_unescape(series.get("name")).strip(),
161160
link=series.get("link"),
162-
description=unescape(series.get("description")),
161+
description=fully_unescape(series.get("description")),
163162
cover=f"https://{AniWorldProvider.host}{series.get('cover')}",
164163
production_year=series.get("productionYear"),
165164
host=AniWorldProvider.host,
@@ -227,15 +226,15 @@ async def get_series(search_result: AniWorldSearchResult) -> AniWorldSeries:
227226

228227
return AniWorldSeries(
229228
# cover=f"https://{search_result.host}" + soup.find("div", class_="seriesCoverBox").find("img").attrs.get("data-src"),
230-
name=unescape(
229+
name=fully_unescape(
231230
soup.find("h1", attrs={"itemprop": "name"}).find("span").text
232231
).strip(),
233-
production_year=unescape(
232+
production_year=fully_unescape(
234233
soup.find("div", class_="series-title").find("small").text
235234
).strip(),
236235
# age=int(soup.find("div", class_="fsk").find("span").text),
237236
# imdb_link=soup.find("a", class_="imdb-link").attrs.get("href"),
238-
full_description=unescape(
237+
full_description=fully_unescape(
239238
soup.find("p", class_="seri_des").attrs.get("data-full-description")
240239
),
241240
regisseure=directors,
@@ -298,8 +297,8 @@ async def get_episodes_from_soup(
298297
hoster.add(VidmolyHoster)
299298

300299
e_count += 1
301-
title_en = title.find("span").text.strip()
302-
title_de = title.find("strong").text.strip()
300+
title_en = fully_unescape(title.find("span").text.strip())
301+
title_de = fully_unescape(title.find("strong").text.strip())
303302
title = (
304303
f"{title_en} - {title_de}"
305304
if title_en and title_de

src/gucken/provider/serienstream.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from asyncio import gather
22
from dataclasses import dataclass
3-
from html import unescape
43
from typing import Union
54

65
from bs4 import BeautifulSoup
@@ -16,6 +15,7 @@
1615
from ..hoster.streamtape import StreamtapeHoster
1716
from .common import Episode, Hoster, Language, Provider, SearchResult, Series
1817
from ..utils import json_loads
18+
from ..utils import fully_unescape
1919

2020
# TODO: Timeouts
2121
# TODO: use base_url
@@ -167,9 +167,9 @@ async def search(keyword: str) -> Union[list[SerienStreamSearchResult], None]:
167167
search_results.append(
168168
SerienStreamSearchResult(
169169
provider_name="serienstream.to",
170-
name=unescape(series.get("name")).strip(),
170+
name=fully_unescape(series.get("name")).strip(),
171171
link=series.get("link"),
172-
description=unescape(series.get("description")),
172+
description=fully_unescape(series.get("description")),
173173
cover=f"https://s.to{series.get('cover')}",
174174
production_year=series.get("productionYear"),
175175
host=SerienStreamProvider.host,
@@ -237,15 +237,15 @@ async def get_series(search_result: SerienStreamSearchResult) -> SerienStreamSer
237237

238238
return SerienStreamSeries(
239239
# cover=f"https://{search_result.host}" + soup.find("div", class_="seriesCoverBox").find("img").attrs.get("data-src"),
240-
name=unescape(
240+
name=fully_unescape(
241241
soup.find("h1", attrs={"itemprop": "name"}).find("span").text
242242
).strip(),
243-
production_year=unescape(
243+
production_year=fully_unescape(
244244
soup.find("div", class_="series-title").find("small").text
245245
).strip(),
246246
# age=int(soup.find("div", class_="fsk").find("span").text),
247247
# imdb_link=soup.find("a", class_="imdb-link").attrs.get("href"),
248-
full_description=unescape(
248+
full_description=fully_unescape(
249249
soup.find("p", class_="seri_des").attrs.get("data-full-description")
250250
),
251251
regisseure=directors,
@@ -308,10 +308,10 @@ async def get_episodes_from_soup(
308308
hoster.add(VidmolyHoster)
309309

310310
e_count += 1
311-
title_en = title.find("span").text.strip()
312-
title_de = title.find("strong").text.strip()
311+
title_en = fully_unescape(title.find("span").text.strip())
312+
title_de = fully_unescape(title.find("strong").text.strip())
313313
title = (
314-
f"{title_en} - {title_de}"
314+
f"{title_en} - {title_de}"
315315
if title_en and title_de
316316
else title_en or title_de
317317
)

src/gucken/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#import logging
22
import os
33
import sys
4+
from html import unescape
45

56
from typing import Union
67
from typing import NamedTuple
@@ -103,3 +104,16 @@ def get_vlc_intf_user_path(player_path: str) -> VLCPaths:
103104

104105
def set_default_vlc_interface_cfg(key: str, value: any) -> str:
    """Build a config line that gives ``config[key]`` a default value.

    The generated text has the form
    ``config["<key>"] = config["<key>"] or <value>``, so an entry that is
    already set wins and ``value`` only acts as the fallback.

    Args:
        key: Name of the config entry; inserted verbatim between double quotes.
        value: Fallback value; rendered with ``str()``. If that rendering is
            empty, the literal ``nil`` is emitted instead.

    Returns:
        The assignment statement as a single line of text.
    """
    # The fallback must be resolved *before* concatenation: `+` binds tighter
    # than `or`, so `prefix + str(value) or "nil"` could never yield "nil"
    # and an empty value produced a dangling `... or ` statement.
    fallback = str(value) or "nil"
    return f'config["{key}"] = config["{key}"] or {fallback}'
107+
108+
109+
def fully_unescape(s: str) -> str:
    """Decode HTML entities repeatedly until the text stops changing.

    Aniworld and SerienStream deliver text whose entities are escaped more
    than once (e.g. ``&amp;amp;`` or ``&amp;#39;``), so a single
    ``html.unescape`` pass leaves entities behind.

    Args:
        s: Text that may contain (multiply) escaped HTML entities. ``None``
            is tolerated because callers pass the result of ``.get(...)``
            lookups, which yield ``None`` when a field is missing.

    Returns:
        The fully decoded text, or ``""`` when ``s`` is ``None``.
    """
    if s is None:
        # A missing field would otherwise raise TypeError inside unescape().
        return ""
    # Each pass peels off one escaping layer; stop at the fixed point.
    while (decoded := unescape(s)) != s:
        s = decoded
    return s

0 commit comments

Comments
 (0)