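"""Scrape card images from the MikuColle wiki at mikucolle.gamerch.com.

Third-party dependencies: requests, beautifulsoup4, tqdm.
"""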
import os
import urllib.request

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


BASE_URL = "https://mikucolle.gamerch.com"
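# The path below is the URL-encoded Japanese page title "カード一覧" ("card list").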
CARD_PAGE = "https://mikucolle.gamerch.com/%E3%82%AB%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7"
OUTPUT_FOLDER = "out"
MIKU = '''⠄⠄⠄⠄⠄⠄⣀⣀⠄⠄⠄⠄⣀⣀⣀⣀⣀⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄
⠄⠄⠄⣠⣤⠞⡋⠉⠧⠶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⢀⠏⠲⣄⠄⠄⠄⠄
⠄⢀⡴⠋⢁⢐⣵⣶⣿⠟⣛⣿⣿⣿⠿⢿⣿⣦⣝⡻⣿⢇⡟⠄⣠⣿⣿⣷⣦⡀
⠄⠸⢳⡜⢱⣿⣿⠛⡅⣿⣿⣿⡟⣱⣿⣦⡙⣿⣿⣿⡆⡜⠄⣀⢹⣿⣿⣿⣿⣿
⠄⢰⣧⢱⣿⣿⢃⠾⣃⢿⣿⣿⢰⣿⣿⣿⠳⠘⣿⣿⣦⡙⢤⡻⠸⡿⠿⣿⣿⣿
⠄⣿⡟⣼⣿⡏⣴⣿⣿⡜⣿⣿⢸⣿⣿⣿⣿⣷⠸⣿⣿⣿⢲⣙⢦⠄⠄⣼⣿⣿
⢸⣿⡇⣿⣿⡇⣿⡏⠈⣷⣜⢿⢸⣿⣿⡟⠈⣿⣆⢹⣿⣿⠄⠙⣷⠄⠄⣿⣿⣿
⣾⣿⡇⣿⣿⠃⣿⡇⠰⣿⣿⣶⣸⣿⣿⣇⠰⣿⣿⡆⣿⡟⠄⠄⡏⠄⢸⣿⣿⡟
⠟⣵⣦⢹⣿⢸⣿⣿⣶⣿⣿⣥⣿⣿⣿⣿⣶⣿⣿⡇⣿⡇⣀⣤⠃⠄⡀⠟⠋⠄
⡘⣿⡰⠊⠇⢾⣿⣿⣿⣿⣟⠻⣿⡿⣻⣿⣿⣿⣿⢃⡿⢰⡿⠋⠄⠄⠄⠄⣠⣾
⣿⣌⠵⠋⠈⠈⠻⢿⣿⣿⣿⣿⣶⣾⣿⣿⣿⣿⡇⠸⣑⡥⢂⣼⡷⠂⠄⢸⣿⣿
⣿⣿⡀⠄⠄⠄⠄⠄⢌⣙⡛⢛⠛⣛⠛⣛⢋⣥⡂⢴⡿⣱⣿⠟⠄⠄⠄⠘⣿⣿
⣿⣿⣿⣷⣦⣄⣀⣀⡼⡿⣷⡜⡗⠴⠸⠟⣼⡿⣴⡓⢎⣛⠁⠄⠄⠄⠄⠄⢿⣿
⣿⣿⣿⣿⣿⣿⠄⠙⠻⢧⣿⣿⡜⣼⢸⣎⣭⣹⢸⡿⣣⠞⢷⡀⠄⠄⠄⠄⢸⣿
⣿⣿⣿⣿⣿⣿⠄⠄⠄⠄⣿⣿⡇⣿⢸⣿⣿⣿⡗⢨⠁⠄⠄⢳⡄⠄⠄⠄⢸⣿'''


def download_html(page):
    """Fetch a page and return its HTML, or None on a non-200 response."""
    # A timeout keeps one stalled request from hanging the whole run.
    resp = requests.get(page, timeout=30)
    if resp.status_code == 200:
        return resp.text
    return None


def extract_urls(html):
    """Collect the absolute URL of every card's detail page from the list table."""
    urls = []

    bs = BeautifulSoup(html, "html.parser")
    table = bs.find("table").tbody

    for tr in table.find_all("tr"):
        # The card link lives in the column marked data-col="4"; skip any
        # row that lacks it (e.g. header or separator rows).
        td = tr.find("td", {"data-col": "4"})
        if td is None:
            continue
        link = td.find("a")

        # Hrefs on the page are relative, so prefix the site root.
        full_path = BASE_URL + link['href']
        urls.append(full_path)

    return urls


def download_images(url_list):
    """Download each card's main image into out/<character>/<card name>.jpg."""
    for url in tqdm(url_list):
        html = download_html(url)
        if html is None:
            continue

        bs = BeautifulSoup(html, "html.parser")
        image = bs.find("img", {"class": "ui_wikidb_main_img"})
        character = bs.find("a", {"class": "ui_page_match"})
        name = bs.find("h2", {"id": "js_wikidb_main_name"})

        image_url = image['src']
        character_name = character['title']
        filename = f"{name.text}.jpg"

        # Group images by character; makedirs with exist_ok=True already
        # tolerates an existing directory, so no separate check is needed.
        character_dir = os.path.join(OUTPUT_FOLDER, character_name)
        os.makedirs(character_dir, exist_ok=True)

        output_file = os.path.join(character_dir, filename)
        urllib.request.urlretrieve(image_url, filename=output_file)


def main():
    print("MikuColle scraper!")
    print(MIKU)
    print()
    print("downloading main page")
    html = download_html(CARD_PAGE)
    if html is None:
        print("failed to download main page, aborting")
        return
    print("extracting card urls")
    urls = extract_urls(html)
    print("downloading card images")
    download_images(urls)
    print("done!")


if __name__ == '__main__':
    main()