Skip to content

Commit 76ee440

Browse files
authored
Merge pull request #33 from dermasmid/sort-by
fix sort by
2 parents 2f8c2ac + 8dc8967 commit 76ee440

File tree

1 file changed

+61
-20
lines changed

1 file changed

+61
-20
lines changed

scrapetube/scrapetube.py

Lines changed: 61 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ def get_channel(
5858
``"streams"``: Streams
5959
"""
6060

61-
sort_by_map = {"newest": "dd", "oldest": "da", "popular": "p"}
62-
6361
base_url = ""
6462
if channel_url:
6563
base_url = channel_url
@@ -68,13 +66,12 @@ def get_channel(
6866
elif channel_username:
6967
base_url = f"https://www.youtube.com/@{channel_username}"
7068

71-
url = "{base_url}/{content_type}?view=0&sort={sort_by}&flow=grid".format(
69+
url = "{base_url}/{content_type}?view=0&flow=grid".format(
7270
base_url=base_url,
7371
content_type=content_type,
74-
sort_by=sort_by_map[sort_by],
7572
)
7673
api_endpoint = "https://www.youtube.com/youtubei/v1/browse"
77-
videos = get_videos(url, api_endpoint, type_property_map[content_type], limit, sleep)
74+
videos = get_videos(url, api_endpoint, type_property_map[content_type], limit, sleep, sort_by)
7875
for video in videos:
7976
yield video
8077

@@ -162,15 +159,39 @@ def get_search(
162159
yield video
163160

164161

162+
163+
def get_video(
    id: str,
) -> dict:
    """Get a single video.

    Parameters:
        id (``str``):
            The video id from the video you want to get.

    Returns:
        The first ``videoPrimaryInfoRenderer`` object found in the
        ``ytInitialData`` payload embedded in the video's watch page.

    Raises:
        StopIteration: If the page data contains no
            ``videoPrimaryInfoRenderer`` entry.
    """

    session = get_session()
    # The session is only needed for this single lookup; close it on every
    # exit path (including parse errors) so the connection pool is released.
    try:
        url = f"https://www.youtube.com/watch?v={id}"
        html = get_initial_data(session, url)
        # The extracted slice stops right before the '"}},' marker, so the
        # closing '"}}' is appended again to make it valid JSON.
        client = json.loads(
            get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}'
        )["client"]
        session.headers["X-YouTube-Client-Name"] = "1"
        session.headers["X-YouTube-Client-Version"] = client["clientVersion"]
        # Same trick for the initial data blob: the "};" stop marker drops the
        # final brace, which is re-added before parsing.
        data = json.loads(
            get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}"
        )
        return next(search_dict(data, "videoPrimaryInfoRenderer"))
    finally:
        session.close()
186+
187+
188+
165189
def get_videos(
166-
url: str, api_endpoint: str, selector: str, limit: int, sleep: int
190+
url: str, api_endpoint: str, selector: str, limit: int, sleep: int, sort_by: str = None
167191
) -> Generator[dict, None, None]:
168-
session = requests.Session()
169-
session.headers[
170-
"User-Agent"
171-
] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36"
192+
session = get_session()
172193
is_first = True
173-
quit = False
194+
quit_it = False
174195
count = 0
175196
while True:
176197
if is_first:
@@ -184,8 +205,10 @@ def get_videos(
184205
data = json.loads(
185206
get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}"
186207
)
187-
next_data = get_next_data(data)
208+
next_data = get_next_data(data, sort_by)
188209
is_first = False
210+
if sort_by and sort_by != "newest":
211+
continue
189212
else:
190213
data = get_ajax_data(session, api_endpoint, api_key, next_data, client)
191214
next_data = get_next_data(data)
@@ -194,20 +217,28 @@ def get_videos(
194217
count += 1
195218
yield result
196219
if count == limit:
197-
quit = True
220+
quit_it = True
198221
break
199222
except GeneratorExit:
200-
quit = True
223+
quit_it = True
201224
break
202225

203-
if not next_data or quit:
226+
if not next_data or quit_it:
204227
break
205228

206229
time.sleep(sleep)
207230

208231
session.close()
209232

210233

234+
def get_session() -> requests.Session:
    """Create a ``requests.Session`` preconfigured with browser-like headers.

    Returns:
        A new session whose ``User-Agent`` is a desktop Chrome string and
        whose ``Accept-Language`` is ``"en"``.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Language": "en",
    }
    new_session = requests.Session()
    new_session.headers.update(browser_headers)
    return new_session
241+
211242
def get_initial_data(session: requests.Session, url: str) -> str:
212243
session.cookies.set("CONSENT", "YES+cb", domain=".youtube.com")
213244
response = session.get(url)
@@ -237,13 +268,23 @@ def get_json_from_html(html: str, key: str, num_chars: int = 2, stop: str = '"')
237268
return html[pos_begin:pos_end]
238269

239270

240-
def get_next_data(data: dict) -> dict:
241-
raw_next_data = next(search_dict(data, "continuationEndpoint"), None)
242-
if not raw_next_data:
271+
def get_next_data(data: dict, sort_by: str = None) -> dict:
    """Extract the continuation token needed to request the next page.

    Parameters:
        data (``dict``):
            Parsed response data to search for a continuation endpoint.
        sort_by (``str``, *optional*):
            ``"newest"``, ``"popular"`` or ``"oldest"``. When set to anything
            other than ``"newest"``, the endpoint is taken from the matching
            chip of the ``feedFilterChipBarRenderer`` instead of the first
            ``continuationEndpoint`` found in ``data``.

    Returns:
        A dict with the keys ``"token"`` and ``"click_params"``, or ``None``
        when ``data`` contains no usable endpoint (including the case where
        a non-default sort was requested but no filter chip bar is present).

    Raises:
        KeyError: If ``sort_by`` is not one of the supported values.
    """
    # Youtube, please don't change the order of these
    # (each value is the position of the chip inside the chip bar's contents).
    sort_by_map = {
        "newest": 0,
        "popular": 1,
        "oldest": 2,
    }
    if sort_by and sort_by != "newest":
        chip_bar = next(search_dict(data, "feedFilterChipBarRenderer"), None)
        if not chip_bar:
            # No sort chips in this response: report "nothing found" rather
            # than failing with a TypeError from subscripting None.
            return None
        chip = chip_bar["contents"][sort_by_map[sort_by]]
        endpoint = chip["chipCloudChipRenderer"]["navigationEndpoint"]
    else:
        endpoint = next(search_dict(data, "continuationEndpoint"), None)
    if not endpoint:
        return None
    next_data = {
        "token": endpoint["continuationCommand"]["token"],
        "click_params": {"clickTrackingParams": endpoint["clickTrackingParams"]},
    }

    return next_data

0 commit comments

Comments
 (0)