Skip to content

Commit 1cf116a

Browse files
authored
Merge pull request #408 from jdepoix/feature/improve-js-var-parsing
Improve JS var parsing
2 parents 090a5db + 4238bc7 commit 1cf116a

File tree

6 files changed

+160
-7
lines changed

6 files changed

+160
-7
lines changed

youtube_transcript_api/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@
2424
YouTubeRequestFailed,
2525
InvalidVideoId,
2626
AgeRestricted,
27+
YouTubeDataUnparsable,
2728
)

youtube_transcript_api/_api.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ def __init__(
3232
http_client: Optional[Session] = None,
3333
):
3434
"""
35+
Note on thread-safety: As this class will initialize a `requests.Session`
36+
object, it is not thread-safe. Make sure to initialize an instance of
37+
`YouTubeTranscriptApi` per thread, if used in a multi-threading scenario!
38+
3539
:param cookie_path: Path to a text file containing YouTube authorization cookies
3640
:param proxy_config: an optional ProxyConfig object, defining proxies used for
3741
all network requests. This can be used to work around your IP being blocked

youtube_transcript_api/_errors.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,13 @@ def __str__(self) -> str:
6969
return self._build_error_message()
7070

7171

72+
class YouTubeDataUnparsable(CouldNotRetrieveTranscript):
    """Raised when the page data needed to fetch a transcript cannot be parsed."""

    CAUSE_MESSAGE = (
        "The data required to fetch the transcript is not parsable. "
        "This should not happen, please open an issue "
        "(make sure to include the video ID)!"
    )
77+
78+
7279
class YouTubeRequestFailed(CouldNotRetrieveTranscript):
7380
CAUSE_MESSAGE = "Request to YouTube failed: {reason}"
7481

youtube_transcript_api/_transcripts.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
RequestBlocked,
2727
AgeRestricted,
2828
VideoUnplayable,
29+
YouTubeDataUnparsable,
2930
)
3031
from ._settings import WATCH_URL
3132

@@ -367,15 +368,14 @@ def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
367368
raise exception.with_proxy_config(self._proxy_config)
368369

369370
def _extract_captions_json(self, html: str, video_id: str) -> Dict:
370-
splitted_html = html.split("var ytInitialPlayerResponse = ")
371-
372-
if len(splitted_html) <= 1:
371+
var_parser = _JsVarParser("ytInitialPlayerResponse")
372+
try:
373+
video_data = var_parser.parse(html, video_id)
374+
except YouTubeDataUnparsable as e:
373375
if 'class="g-recaptcha"' in html:
374376
raise IpBlocked(video_id)
375-
376-
video_data = json.loads(
377-
splitted_html[1].split("</script>")[0].strip().rstrip(";")
378-
)
377+
# This should never happen!
378+
raise e # pragma: no cover
379379

380380
self._assert_playability(video_data.get("playabilityStatus"), video_id)
381381

@@ -474,3 +474,47 @@ def parse(self, raw_data: str) -> List[FetchedTranscriptSnippet]:
474474
for xml_element in ElementTree.fromstring(raw_data)
475475
if xml_element.text is not None
476476
]
477+
478+
479+
class _JsVarParser:
480+
def __init__(self, var_name: str):
481+
self._var_name = var_name
482+
483+
def parse(self, raw_html: str, video_id: str) -> Dict:
484+
char_iterator = self._create_var_char_iterator(raw_html, video_id)
485+
var_string = self._find_var_substring(char_iterator, video_id)
486+
return json.loads(var_string)
487+
488+
def _create_var_char_iterator(self, raw_html: str, video_id: str) -> Iterator[str]:
489+
splitted_html = raw_html.split(f"var {self._var_name}")
490+
if len(splitted_html) <= 1:
491+
raise YouTubeDataUnparsable(video_id)
492+
char_iterator = iter(splitted_html[1])
493+
while next(char_iterator) != "{":
494+
pass
495+
return char_iterator
496+
497+
def _find_var_substring(self, char_iterator: Iterator[str], video_id: str) -> str:
498+
escaped = False
499+
in_quotes = False
500+
depth = 1
501+
chars = ["{"]
502+
503+
for char in char_iterator:
504+
chars.append(char)
505+
if escaped:
506+
escaped = False
507+
elif char == "\\":
508+
escaped = True
509+
elif char == '"':
510+
in_quotes = not in_quotes
511+
elif not in_quotes:
512+
if char == "{":
513+
depth += 1
514+
elif char == "}":
515+
depth -= 1
516+
if depth == 0:
517+
return "".join(chars)
518+
519+
# This should never happen!
520+
raise YouTubeDataUnparsable(video_id) # pragma: no cover

youtube_transcript_api/test/assets/youtube_altered_user_agent.html.static

Lines changed: 83 additions & 0 deletions
Large diffs are not rendered by default.

youtube_transcript_api/test/test_api.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,20 @@ def test_fetch_formatted(self):
104104
self.ref_transcript,
105105
)
106106

107+
def test_fetch__with_altered_user_agent(self):
    """Fetching works for the watch page served to an altered user agent."""
    watch_page = load_asset("youtube_altered_user_agent.html.static")
    httpretty.register_uri(
        httpretty.GET,
        "https://www.youtube.com/watch",
        body=watch_page,
    )

    fetched = YouTubeTranscriptApi().fetch("GJLlxj_dtq8")

    self.assertEqual(fetched, self.ref_transcript)
120+
107121
def test_list(self):
108122
transcript_list = YouTubeTranscriptApi().list("GJLlxj_dtq8")
109123

0 commit comments

Comments
 (0)