Skip to content

Commit 3736369

Browse files
committed
Initial upload
0 parents  commit 3736369

File tree

5 files changed

+105
-0
lines changed

5 files changed

+105
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
.idea
out

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# mikucolle-scrape

A simple scraper to download all card images from https://mikucolle.gamerch.com/

![scraper screenshot](docs/mikuscrape.png)

## Usage

- Install the requirements from requirements.txt: ```pip install -r requirements.txt```
- Run main.py: ```python main.py```

docs/mikuscrape.png

417 KB
Loading

main.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Scraper that downloads every card image from the MikuColle wiki."""

import os
import urllib.request

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Wiki root, and the card-list page (the path is URL-encoded Japanese
# for "card list").
BASE_URL = "https://mikucolle.gamerch.com"
CARD_PAGE = "https://mikucolle.gamerch.com/%E3%82%AB%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7"

# Directory all downloaded images are written under.
OUTPUT_FOLDER = "out"

# Braille-art banner printed at startup.
MIKU = '''⠄⠄⠄⠄⠄⠄⣀⣀⠄⠄⠄⠄⣀⣀⣀⣀⣀⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄
⠄⠄⠄⣠⣤⠞⡋⠉⠧⠶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⢀⠏⠲⣄⠄⠄⠄⠄
⠄⢀⡴⠋⢁⢐⣵⣶⣿⠟⣛⣿⣿⣿⠿⢿⣿⣦⣝⡻⣿⢇⡟⠄⣠⣿⣿⣷⣦⡀
⠄⠸⢳⡜⢱⣿⣿⠛⡅⣿⣿⣿⡟⣱⣿⣦⡙⣿⣿⣿⡆⡜⠄⣀⢹⣿⣿⣿⣿⣿
⠄⢰⣧⢱⣿⣿⢃⠾⣃⢿⣿⣿⢰⣿⣿⣿⠳⠘⣿⣿⣦⡙⢤⡻⠸⡿⠿⣿⣿⣿
⠄⣿⡟⣼⣿⡏⣴⣿⣿⡜⣿⣿⢸⣿⣿⣿⣿⣷⠸⣿⣿⣿⢲⣙⢦⠄⠄⣼⣿⣿
⢸⣿⡇⣿⣿⡇⣿⡏⠈⣷⣜⢿⢸⣿⣿⡟⠈⣿⣆⢹⣿⣿⠄⠙⣷⠄⠄⣿⣿⣿
⣾⣿⡇⣿⣿⠃⣿⡇⠰⣿⣿⣶⣸⣿⣿⣇⠰⣿⣿⡆⣿⡟⠄⠄⡏⠄⢸⣿⣿⡟
⠟⣵⣦⢹⣿⢸⣿⣿⣶⣿⣿⣥⣿⣿⣿⣿⣶⣿⣿⡇⣿⡇⣀⣤⠃⠄⡀⠟⠋⠄
⡘⣿⡰⠊⠇⢾⣿⣿⣿⣿⣟⠻⣿⡿⣻⣿⣿⣿⣿⢃⡿⢰⡿⠋⠄⠄⠄⠄⣠⣾
⣿⣌⠵⠋⠈⠈⠻⢿⣿⣿⣿⣿⣶⣾⣿⣿⣿⣿⡇⠸⣑⡥⢂⣼⡷⠂⠄⢸⣿⣿
⣿⣿⡀⠄⠄⠄⠄⠄⢌⣙⡛⢛⠛⣛⠛⣛⢋⣥⡂⢴⡿⣱⣿⠟⠄⠄⠄⠘⣿⣿
⣿⣿⣿⣷⣦⣄⣀⣀⡼⡿⣷⡜⡗⠴⠸⠟⣼⡿⣴⡓⢎⣛⠁⠄⠄⠄⠄⠄⢿⣿
⣿⣿⣿⣿⣿⣿⠄⠙⠻⢧⣿⣿⡜⣼⢸⣎⣭⣹⢸⡿⣣⠞⢷⡀⠄⠄⠄⠄⢸⣿
⣿⣿⣿⣿⣿⣿⠄⠄⠄⠄⣿⣿⡇⣿⢸⣿⣿⣿⡗⢨⠁⠄⠄⢳⡄⠄⠄⠄⢸⣿'''
27+
28+
29+
def download_html(page, timeout=30):
    """Fetch *page* over HTTP and return its HTML text.

    Args:
        page: URL to fetch.
        timeout: seconds before the request is abandoned. The original
            call had no timeout, so a stalled server would hang the
            scraper forever.

    Returns:
        The response body as text on HTTP 200, or ``None`` when the
        request fails or the server answers with any other status —
        callers (``main``/``download_images``) treat ``None`` as
        "page unavailable".
    """
    try:
        req = requests.get(page, timeout=timeout)
    except requests.RequestException:
        # Network errors become the same None the status check returns,
        # keeping a single failure contract for callers.
        return None
    if req.status_code == 200:
        return req.text
    return None
35+
36+
37+
def extract_urls(html):
    """Parse the card-list page and return absolute URLs of all card pages.

    Args:
        html: HTML text of the card-list page.

    Returns:
        List of absolute card-page URLs, in table order.

    Each row of the first table is expected to carry a link in its
    ``data-col="4"`` cell. Rows without that cell or without a usable
    link (e.g. header or malformed rows) are skipped — the original
    code raised AttributeError on such rows.
    """
    urls = []

    bs = BeautifulSoup(html, "html.parser")
    table = bs.find("table").tbody

    for tr in table.find_all("tr"):
        td = tr.find("td", {"data-col": "4"})
        if td is None:
            continue
        link = td.find("a")
        if link is None or not link.has_attr('href'):
            continue

        urls.append(BASE_URL + link['href'])

    return urls
51+
52+
53+
def download_images(url_list):
    """Download the card image from every card page in *url_list*.

    Images are saved as ``out/<character>/<card name>.jpg``. Pages that
    fail to download, or that lack the expected image/character/name
    elements, are skipped with a warning instead of aborting the whole
    run (the original raised AttributeError on an unexpected layout).

    Args:
        url_list: iterable of card-page URLs (progress shown via tqdm).
    """
    for url in tqdm(url_list):
        html = download_html(url)
        if html is None:
            continue

        bs = BeautifulSoup(html, "html.parser")
        image = bs.find("img", {"class": "ui_wikidb_main_img"})
        character = bs.find("a", {"class": "ui_page_match"})
        name = bs.find("h2", {"id": "js_wikidb_main_name"})

        # Guard against pages that don't follow the card layout rather
        # than crashing mid-run on attribute access below.
        if image is None or character is None or name is None:
            tqdm.write(f"skipping {url}: unexpected page layout")
            continue

        character_dir = os.path.join(OUTPUT_FOLDER, character['title'])
        # exist_ok=True already tolerates an existing directory; the
        # original's separate os.path.exists() pre-check was redundant.
        os.makedirs(character_dir, exist_ok=True)

        output_file = os.path.join(character_dir, f"{name.text}.jpg")
        urllib.request.urlretrieve(image['src'], filename=output_file)
73+
74+
75+
def main():
    """Entry point: fetch the card list, then download every card image."""
    print("MikuColle scraper!")
    print(MIKU)
    print()
    print("downloading main page")
    html = download_html(CARD_PAGE)
    # download_html returns None on failure; the original passed that
    # None straight into BeautifulSoup and crashed with a TypeError.
    if html is None:
        print("failed to download the card list page, aborting")
        return
    print("extracting card urls")
    urls = extract_urls(html)
    print("downloading card images")
    download_images(urls)
    print("done!")
86+
87+
88+
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
requests~=2.28.1
2+
beautifulsoup4~=4.11.1
3+
tqdm~=4.64.1

0 commit comments

Comments
 (0)