Merge pull request #408 from jdepoix/feature/improve-js-var-parsing

jdepoix · web-flow · commit 1cf116a341d5 · 2025-03-25T19:09:10.000+01:00
Improve JS var parsing
diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py
@@ -24,4 +24,5 @@
     YouTubeRequestFailed,
     InvalidVideoId,
     AgeRestricted,
+    YouTubeDataUnparsable,
 )
diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py
@@ -32,6 +32,10 @@ def __init__(
         http_client: Optional[Session] = None,
     ):
         """
+        Note on thread-safety: As this class will initialize a `requests.Session`
+        object, it is not thread-safe. Make sure to initialize an instance of
+        `YouTubeTranscriptApi` per thread, if used in a multi-threading scenario!
+
         :param cookie_path: Path to a text file containing YouTube authorization cookies
         :param proxy_config: an optional ProxyConfig object, defining proxies used for
             all network requests. This can be used to work around your IP being blocked
diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py
@@ -69,6 +69,13 @@ def __str__(self) -> str:
         return self._build_error_message()
 
 
+class YouTubeDataUnparsable(CouldNotRetrieveTranscript):
+    # Raised when the data embedded in the watch page can't be located/parsed.
+    # The message asks for a bug report, as this is not an expected failure.
+    CAUSE_MESSAGE = (
+        "The data required to fetch the transcript is not parsable. "
+        "This should not happen, please open an issue "
+        "(make sure to include the video ID)!"
+    )
+
+
 class YouTubeRequestFailed(CouldNotRetrieveTranscript):
     CAUSE_MESSAGE = "Request to YouTube failed: {reason}"
 
diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
@@ -26,6 +26,7 @@
     RequestBlocked,
     AgeRestricted,
     VideoUnplayable,
+    YouTubeDataUnparsable,
 )
 from ._settings import WATCH_URL
 
@@ -367,15 +368,14 @@ def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
             raise exception.with_proxy_config(self._proxy_config)
 
     def _extract_captions_json(self, html: str, video_id: str) -> Dict:
-        splitted_html = html.split("var ytInitialPlayerResponse = ")
-
-        if len(splitted_html) <= 1:
+        var_parser = _JsVarParser("ytInitialPlayerResponse")
+        try:
+            video_data = var_parser.parse(html, video_id)
+        except YouTubeDataUnparsable as e:
             if 'class="g-recaptcha"' in html:
                 raise IpBlocked(video_id)
-
-        video_data = json.loads(
-            splitted_html[1].split("</script>")[0].strip().rstrip(";")
-        )
+            # This should never happen!
+            raise e  # pragma: no cover
 
         self._assert_playability(video_data.get("playabilityStatus"), video_id)
 
@@ -474,3 +474,47 @@ def parse(self, raw_data: str) -> List[FetchedTranscriptSnippet]:
             for xml_element in ElementTree.fromstring(raw_data)
             if xml_element.text is not None
         ]
+
+
+class _JsVarParser:
+    """
+    Extracts the object literal assigned to a JavaScript variable which is
+    embedded in raw HTML (i.e. `var <var_name> = {...}`).
+
+    The object is located by skipping to the first `{` after the declaration and
+    then tracking the brace depth until the matching `}` has been consumed.
+    """
+
+    def __init__(self, var_name: str):
+        """
+        :param var_name: name of the JS variable whose value should be extracted
+        """
+        self._var_name = var_name
+
+    def parse(self, raw_html: str, video_id: str) -> Dict:
+        """
+        :param raw_html: the HTML document containing the variable declaration
+        :param video_id: only used to build the exception, if parsing fails
+        :return: the value of the variable, decoded using `json.loads` (errors
+            raised by `json.loads` itself are not translated)
+        :raises YouTubeDataUnparsable: if the declaration, the opening `{` or the
+            matching closing `}` can't be found
+        """
+        char_iterator = self._create_var_char_iterator(raw_html, video_id)
+        var_string = self._find_var_substring(char_iterator, video_id)
+        return json.loads(var_string)
+
+    def _create_var_char_iterator(self, raw_html: str, video_id: str) -> Iterator[str]:
+        """
+        Returns a character iterator over the text following the variable
+        declaration, positioned right behind the first opening `{`.
+
+        :raises YouTubeDataUnparsable: if `var <var_name>` does not occur in
+            `raw_html`, or if no `{` follows it
+        """
+        splitted_html = raw_html.split(f"var {self._var_name}")
+        if len(splitted_html) <= 1:
+            raise YouTubeDataUnparsable(video_id)
+        char_iterator = iter(splitted_html[1])
+        # Consume everything up to and including the opening brace. A `for` loop
+        # is used instead of bare `next()` calls, so that running out of input
+        # ends up as YouTubeDataUnparsable instead of leaking a StopIteration,
+        # which callers handling YouTubeDataUnparsable would not catch.
+        for char in char_iterator:
+            if char == "{":
+                return char_iterator
+        raise YouTubeDataUnparsable(video_id)  # pragma: no cover
+
+    def _find_var_substring(self, char_iterator: Iterator[str], video_id: str) -> str:
+        """
+        Collects characters until the brace opened before `char_iterator` starts
+        has been closed again.
+
+        :param char_iterator: iterator positioned right behind the opening `{`
+        :return: the object literal, including the enclosing braces
+        :raises YouTubeDataUnparsable: if the input ends before the object is closed
+        """
+        escaped = False
+        in_quotes = False
+        # The opening brace has already been consumed by the caller, so the scan
+        # starts one level deep with that brace as the first collected character.
+        depth = 1
+        chars = ["{"]
+
+        for char in char_iterator:
+            chars.append(char)
+            if escaped:
+                # The character following a backslash never has structural meaning
+                escaped = False
+            elif char == "\\":
+                escaped = True
+            elif char == '"':
+                in_quotes = not in_quotes
+            elif not in_quotes:
+                # Braces only count towards the nesting depth outside of strings
+                if char == "{":
+                    depth += 1
+                elif char == "}":
+                    depth -= 1
+            if depth == 0:
+                return "".join(chars)
+
+        # This should never happen!
+        raise YouTubeDataUnparsable(video_id)  # pragma: no cover
diff --git a/youtube_transcript_api/test/assets/youtube_altered_user_agent.html.static b/youtube_transcript_api/test/assets/youtube_altered_user_agent.html.static
diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py
@@ -104,6 +104,20 @@ def test_fetch_formatted(self):
             self.ref_transcript,
         )
 
+    def test_fetch__with_altered_user_agent(self):
+        # Serve the "altered user agent" watch page asset and make sure that the
+        # reference transcript is still fetched from it.
+        watch_page_html = load_asset("youtube_altered_user_agent.html.static")
+        httpretty.register_uri(
+            httpretty.GET, "https://www.youtube.com/watch", body=watch_page_html
+        )
+
+        fetched = YouTubeTranscriptApi().fetch("GJLlxj_dtq8")
+
+        self.assertEqual(fetched, self.ref_transcript)
+
     def test_list(self):
         transcript_list = YouTubeTranscriptApi().list("GJLlxj_dtq8")
 

Original file line number	Diff line number	Diff line change
`@@ -24,4 +24,5 @@`
`24`	`24`	`YouTubeRequestFailed,`
`25`	`25`	`InvalidVideoId,`
`26`	`26`	`AgeRestricted,`
	`27`	`+ YouTubeDataUnparsable,`
`27`	`28`	`)`