This repository was archived by the owner on Mar 22, 2025. It is now read-only.

Commit cb6c30d

feat: Add parallel processing, refactor (#11)

1 parent 401f189 commit cb6c30d

File tree

14 files changed: +871 −397 lines

.github/workflows/bump-version.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ jobs:
   bump-version:
     uses: janw/workflows/.github/workflows/commitizen-bump-version.yaml@main
     secrets:
-      personal-access-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+      personal-access-token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}
       gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
       gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}

.github/workflows/linters.yaml

Lines changed: 46 additions & 0 deletions
@@ -6,3 +6,49 @@ on:
 jobs:
   commitizen:
     uses: janw/workflows/.github/workflows/commitizen.yaml@main
+
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out
+        uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}
+
+      - name: Install poetry
+        run: pipx install poetry
+
+      - name: Set up python environment
+        uses: actions/setup-python@v5
+        with:
+          cache: poetry
+          python-version: 3.x
+
+      - name: Install dependencies
+        run: poetry install --sync
+
+      - id: cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: ~/.cache/pre-commit
+          key: pre-commit-v0|${{ steps.setup-python.outputs.python-version }}|${{ hashFiles('.pre-commit-config.yaml') }}
+
+      - run: poetry run pre-commit run --show-diff-on-failure --color=always --all-files
+        shell: bash
+
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        if: >
+          always()
+          && !startsWith(github.event.head_commit.message, 'build(autofix):')
+        with:
+          commit_message: "build(autofix): Auto-fix linting issues"
+          commit_user_name: "Jan Willhaus [bot]"
+          commit_user_email: "bot@janw.xyz"
+          commit_author: Jan Willhaus [bot] <bot@janw.xyz>
+
+      - id: cache-save
+        uses: actions/cache/save@v4
+        if: always() && steps.cache-restore.outputs.cache-hit != 'true'
+        with:
+          key: ${{ steps.cache-restore.outputs.cache-primary-key }}
+          path: ~/.cache/pre-commit

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ ipython_config.py
 
 # pyenv
 .python-version
+.tool-versions
 
 # celery beat schedule file
 celerybeat-schedule

.pre-commit-config.yaml

Lines changed: 23 additions & 11 deletions
@@ -1,16 +1,15 @@
-ci:
-  autoupdate_commit_msg: 'build(pre-commit): pre-commit.ci autoupdate'
-  autoupdate_schedule: weekly
-  autofix_commit_msg: 'ci(pre-commit): auto fixes from pre-commit hooks'
-  autofix_prs: true
-
-default_install_hook_types:
-  - pre-commit
-default_stages:
-  - pre-commit
 repos:
+  - repo: meta
+    hooks:
+      - id: check-hooks-apply
+
+  - repo: https://github.com/janw/pre-commit-hooks
+    rev: v0.1.0
+    hooks:
+      - id: sync_ruff_version
+
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.9'
+    rev: 'v0.7.1'
     hooks:
       - id: ruff
         args: [ --fix, --exit-non-zero-on-fix ]
@@ -28,4 +27,17 @@ repos:
   - repo: https://github.com/python-poetry/poetry
     rev: '1.8.0'
     hooks:
+      - id: poetry-lock
+        args:
+          - --no-update
       - id: poetry-check
+
+  - repo: local
+    hooks:
+      - id: mypy
+        name: mypy
+        entry: poetry run mypy
+        language: system
+        require_serial: true
+        pass_filenames: false
+        types: [python]

letterboxd_rss/__init__.py

Lines changed: 1 addition & 111 deletions
@@ -1,111 +1 @@
-import re
-
-from requests import session
-from bs4 import BeautifulSoup
-from feedgen.feed import FeedGenerator
-
-match_imdb = re.compile(r"^https?://www.imdb.com")
-match_tmdb = re.compile(r"^https?://www.themoviedb.org")
-
-base_url = "https://letterboxd.com/"
-
-s = session()
-
-
-def process(args):
-    watchlist_url = args.letterboxd_url.rstrip("/")
-    if not watchlist_url.startswith("https://"):
-        watchlist_url = f"{base_url}{watchlist_url}"
-    if not watchlist_url.endswith("watchlist"):
-        watchlist_url += "/watchlist"
-    watchlist_url += "/"
-
-    feedlen = args.max_length
-    output_file = args.output
-    page_title = "The Dude's Watchlist"
-
-    feed = FeedGenerator()
-    feed.title(page_title)
-    feed.id(watchlist_url)
-    feed.link(href=watchlist_url, rel="alternate")
-    feed.description(page_title + " from Letterboxd")
-
-    # Get first page, gather general data
-    r = s.get(watchlist_url)
-    r.raise_for_status()
-    soup = BeautifulSoup(r.text, "html.parser")
-
-    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
-    page_title = watchlist_title.attrs["content"]
-
-    m = soup.find("span", attrs={"class": "js-watchlist-count"})
-    if len(m) > 0:
-        total_movies = int(m.text.split()[0])
-        print(f"Found a total of {total_movies} movies")
-
-    paginator = soup.find_all("li", attrs={"class": "paginate-page"})
-    page_count = int(paginator[-1].text) if paginator else 1
-    last_page_index = page_count + 1
-
-    movies_added = 0
-    for page in range(1, last_page_index):
-        if page > 1:
-            r = s.get(watchlist_url + "/page/%i/" % page)
-            soup = BeautifulSoup(r.text, "html.parser")
-            print()
-
-        ul = soup.find("ul", attrs={"class": "poster-list"})
-        movies = ul.find_all("li")
-        movies_on_page = len(movies)
-
-        print(f"Gathering on page {page} (contains {movies_on_page} movies)\n")
-
-        for movie in movies:
-            added = extract_metadata(movie, feed)
-
-            # Update total counter
-            movies_added += added
-            if feedlen > 0 and movies_added >= feedlen:
-                print("\nReached desired maximum feed length")
-                break
-
-        if feedlen > 0 and movies_added >= feedlen:
-            break
-
-    if movies_added > 0:
-        print(f"Writing feed to {output_file}")
-        feed.rss_file(output_file)
-
-
-def extract_metadata(movie, feed):
-    movie_url = base_url + "film/" + movie.div.attrs["data-film-slug"]
-    movie_page = s.get(movie_url)
-    movie_soup = BeautifulSoup(movie_page.text, "html.parser")
-
-    try:
-        movie_title = movie_soup.find("meta", attrs={"property": "og:title"}).attrs[
-            "content"
-        ]
-        print("Adding", movie_title)
-        movie_link = movie_soup.find(
-            "a", attrs={"href": [match_imdb, match_tmdb]}
-        ).attrs["href"]
-        if movie_link.endswith("/maindetails"):
-            movie_link = movie_link[:-11]
-        movie_description = movie_soup.find(
-            "meta", attrs={"property": "og:description"}
-        )
-        if movie_description is not None:
-            movie_description = movie_description.text.strip()
-
-        item = feed.add_item()
-        item.title(movie_title)
-        item.description(movie_description)
-        item.link(href=movie_link, rel="alternate")
-        item.guid(movie_link)
-
-        return 1
-    except Exception:
-        print("Parsing failed on", movie_url)
-
-        return 0
+__version__ = "v0.3.0"

letterboxd_rss/base.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from __future__ import annotations
2+
3+
from concurrent.futures import Future, ThreadPoolExecutor, wait
4+
from typing import TYPE_CHECKING, Dict, List, Optional
5+
6+
from bs4 import BeautifulSoup
7+
from bs4.element import Tag
8+
9+
from letterboxd_rss.feed import create_feed
10+
from letterboxd_rss.parsing import parse_page
11+
from letterboxd_rss.session import session
12+
from letterboxd_rss.utils import make_watchlist_url
13+
14+
if TYPE_CHECKING:
15+
from feedgen.feed import FeedEntry
16+
17+
18+
def process(
19+
letterboxd_url: str,
20+
output_file: str,
21+
max_length: int,
22+
) -> None:
23+
page_title = ""
24+
watchlist_url = make_watchlist_url(letterboxd_url)
25+
next_url: Optional[str] = watchlist_url + "page/1/"
26+
remaining_count = max_length
27+
with ThreadPoolExecutor(max_workers=4) as pool:
28+
future_to_url: Dict[Future[FeedEntry], str] = {}
29+
30+
while next_url and remaining_count > 0:
31+
r = session.get_and_raise(next_url)
32+
soup = BeautifulSoup(r.text, "html.parser")
33+
34+
next_url, _futures = parse_page(soup, max_movies=remaining_count, pool=pool)
35+
future_to_url.update(_futures)
36+
remaining_count -= len(_futures)
37+
38+
entries: List[FeedEntry] = []
39+
for future in wait(future_to_url).done:
40+
url = future_to_url[future]
41+
try:
42+
entry = future.result()
43+
except Exception as exc:
44+
print("%r generated an exception: %s" % (url, exc))
45+
else:
46+
entries.append(entry)
47+
48+
watchlist_title = soup.find("meta", attrs={"property": "og:title"})
49+
page_title = watchlist_title.attrs["content"] if isinstance(watchlist_title, Tag) else "The Dude's Watchlist"
50+
51+
if entries:
52+
create_feed(
53+
entries,
54+
page_title=page_title,
55+
watchlist_url=watchlist_url,
56+
output_file=output_file,
57+
)

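For context, the new process() fans per-movie scrape jobs out to a thread pool and joins them with concurrent.futures.wait(); mapping each Future back to its URL is what allows failures to be reported per item. Below is a minimal, self-contained sketch of that fan-out/fan-in pattern. fetch_entry and the URL list are hypothetical stand-ins for the jobs that parse_page() actually submits to the pool.

from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Dict


def fetch_entry(url: str) -> str:
    # Stand-in for the real per-movie work (fetch the page, build a feed entry).
    return f"entry for {url}"


urls = ["https://letterboxd.com/film/example-a/", "https://letterboxd.com/film/example-b/"]

with ThreadPoolExecutor(max_workers=4) as pool:
    # Map each Future back to its URL so a failure can be reported per item.
    future_to_url: Dict[Future[str], str] = {pool.submit(fetch_entry, u): u for u in urls}

# The pool has joined on exiting the with-block, so wait() returns immediately
# with every future already in the done set.
results = []
for future in wait(future_to_url).done:
    url = future_to_url[future]
    try:
        results.append(future.result())
    except Exception as exc:
        print("%r generated an exception: %s" % (url, exc))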
letterboxd_rss/__main__.py renamed to letterboxd_rss/cli.py

Lines changed: 11 additions & 7 deletions
@@ -1,9 +1,12 @@
-import sys
+from __future__ import annotations
+
 import argparse
-from letterboxd_rss import process
+from typing import List, Optional
+
+from letterboxd_rss.base import process
 
 
-def main(argv=None):
+def main(argv: Optional[List[str]] = None) -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "letterboxd_url",
@@ -26,7 +29,8 @@ def main(argv=None):
         help="Maximum number of watchlist items to keep in the feed",
     )
     args = parser.parse_args(argv)
-    process(args)
-
-
-main(sys.argv[1:])
+    process(
+        letterboxd_url=args.letterboxd_url,
+        output_file=args.output,
+        max_length=args.max_length,
+    )

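With the module-level main(sys.argv[1:]) call removed, the CLI now runs via a packaged entry point or a programmatic call. A hypothetical invocation, assuming the option flags mirror the args.output and args.max_length destinations seen above (the flag spellings, username, and filename are examples, not taken from the diff):

from letterboxd_rss.cli import main

# Flag names are assumptions inferred from the argparse destinations.
main(["dude", "--output", "watchlist.xml", "--max-length", "50"])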
letterboxd_rss/constants.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+from letterboxd_rss import __version__
+
+PROG_NAME = "letterboxd-rss"
+USER_AGENT = f"{PROG_NAME}/{__version__} (https://github.com/janw/{PROG_NAME})"
+
+REQUESTS_TIMEOUT = 30
+
+
+BASE_URL = "https://letterboxd.com"

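A hypothetical sketch of how these constants could be wired into a requests session. The repo's real session object lives in letterboxd_rss/session.py, whose contents this commit does not show, so this is an assumption about typical usage rather than the project's actual code:

import requests

from letterboxd_rss.constants import BASE_URL, REQUESTS_TIMEOUT, USER_AGENT

session = requests.Session()
session.headers["User-Agent"] = USER_AGENT  # identify the scraper to the site

response = session.get(f"{BASE_URL}/dude/watchlist/", timeout=REQUESTS_TIMEOUT)
response.raise_for_status()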
letterboxd_rss/feed.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from typing import List
+
+from feedgen.feed import FeedEntry, FeedGenerator
+
+
+def create_feed(entries: List[FeedEntry], page_title: str, watchlist_url: str, output_file: str) -> None:
+    feed = FeedGenerator()
+    feed.title(page_title)
+    feed.id(watchlist_url)
+    feed.link(href=watchlist_url, rel="alternate")
+    feed.description(page_title + " from Letterboxd")
+    for entry in entries:
+        feed.add_entry(entry)
+
+    print(f"Writing feed to {output_file}")
+    feed.rss_file(output_file)

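A minimal usage sketch for create_feed() with one hand-built entry; in the real flow the entries come out of the parsing futures in base.py. The title, URLs, and filename here are examples only:

from feedgen.feed import FeedEntry

from letterboxd_rss.feed import create_feed

# Build a single entry by hand, the way the parsing step would.
entry = FeedEntry()
entry.title("The Big Lebowski")
entry.link(href="https://www.imdb.com/title/tt0118715/", rel="alternate")
entry.guid("https://www.imdb.com/title/tt0118715/")
entry.description("Example description for the feed item.")

create_feed(
    [entry],
    page_title="The Dude's Watchlist",
    watchlist_url="https://letterboxd.com/dude/watchlist/",
    output_file="watchlist.xml",
)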