From 29644a20a290db591a86108481d252c008f258a6 Mon Sep 17 00:00:00 2001
From: rabbitflyer5 <>
Date: Wed, 9 Aug 2023 13:26:42 -0700
Subject: [PATCH 1/3] Added search filter toolbar support for size, color, and type, as well as safesearch-setting functionality.

---
 src/cli.py     | 29 ++++++++++++++--
 src/main.py    |  2 +-
 src/scraper.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index 704d845..d01e1c7 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -1,4 +1,5 @@
 import argparse, os, sys
+from enum import Enum
 
 # Should revisit this to look for xdg_downloads in env
 def get_download_path():
@@ -29,7 +30,6 @@ def check_pos_int(val: int):
     else:
         raise ValueError
 
-
 def get_arguments(argv=sys.argv):
     """
     The cli front end for the scraper.
@@ -40,7 +40,7 @@ def get_arguments(argv=sys.argv):
     Returns:
         parser.parse_args() -- A struct with all required info to run the scraper
     """
-    parser = argparse.ArgumentParser(description="Scrape google for images")
+    parser = argparse.ArgumentParser(description="Scrape Google for images")
     parser.add_argument("keyword",
                         help="the phrase used to find images",
                         type=str,
@@ -59,6 +59,31 @@ def get_arguments(argv=sys.argv):
                         type=check_pos_int,
                         nargs="?",
                         default=1)
+    parser.add_argument("-s", "--size",
+                        help="Restrict your search to a certain size of image. Can be 'large', 'medium', or 'icon'.",
+                        type=str,
+                        nargs="?",
+                        choices=['large','medium','icon'],
+                        default='')
+    parser.add_argument("--color",
+                        help="Search for a certain color of image. Can be 'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown', 'grayscale', or 'transparent'.",
+                        type=str,
+                        nargs="?",
+                        choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown', 'grayscale', 'transparent'],
+                        default='')
+    parser.add_argument("-k", "--type",
+                        help="The type of image to search for. Can be 'clipart', 'lineart', or 'animated'.",
+                        type=str,
+                        nargs="?",
+                        choices=['clipart', 'lineart', 'animated'],
+                        dest="imgtype",
+                        default='')
+    parser.add_argument("-p", "--safesearch",
+                        help="Force the use of a specific safesearch setting. Can be 'on' or 'off'.",
+                        type=str,
+                        nargs="?",
+                        choices=['on', 'off'],
+                        default='')
     args = parser.parse_args(argv[1:])
     # Set default directory
     if args.directory is None:
diff --git a/src/main.py b/src/main.py
index 62e8cba..20e1a5e 100644
--- a/src/main.py
+++ b/src/main.py
@@ -5,7 +5,7 @@
 
 def main():
     args = get_arguments(sys.argv)
-    scrape_images(args.keyword[0], args.count, args.directory, args.threads)
+    scrape_images(args.keyword[0], args.count, args.directory, args.threads, args.size, args.color, args.imgtype, args.safesearch)
 
 if __name__ == "__main__":
     main()
diff --git a/src/scraper.py b/src/scraper.py
index bd1ac66..2b5f04b 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -41,6 +41,91 @@ def add_filetype(file_path: str):
         eprint(err)
         return 1
 
+
+def process_image_size(val: str):
+    if (val == 'large'):
+        return "isz:l"
+    elif (val == 'medium'):
+        return "isz:m"
+    elif (val == 'icon'):
+        return "isz:i"
+    else:
+        return ""
+
+def process_image_color(val: str):
+    if (val == "grayscale"):
+        return "ic:gray"
+    elif (val == "transparent"):
+        return "ic:trans"
+    elif (val == "red"):
+        return "ic:specific%2Cisc:red"
+    elif (val == "orange"):
+        return "ic:specific%2Cisc:orange"
+    elif (val == "yellow"):
+        return "ic:specific%2Cisc:yellow"
+    elif (val == "green"):
+        return "ic:specific%2Cisc:green"
+    elif (val == "teal"):
+        return "ic:specific%2Cisc:teal"
+    elif (val == "blue"):
+        return "ic:specific%2Cisc:blue"
+    elif (val == "purple"):
+        return "ic:specific%2Cisc:purple"
+    elif (val == "pink"):
+        return "ic:specific%2Cisc:pink"
+    elif (val == "white"):
+        return "ic:specific%2Cisc:white"
+    elif (val == "gray"):
+        return "ic:specific%2Cisc:gray"
+    elif (val == "black"):
+        return "ic:specific%2Cisc:black"
+    elif (val == "brown"):
+        return "ic:specific%2Cisc:brown"
+    else:
+        return ""
+
+def process_image_type(val: str):
+    if (val == "clipart"):
+        return "itp:clipart"
+    elif (val == "lineart"):
+        return "itp:lineart"
+    elif (val == "animated"):
+        return "itp:animated"
+    else:
+        return ""
+
+def process_safesearch(val: str):
+    if (val == "on"):
+        return "on"
+    elif (val == "off"):
+        return "off"
+    else:
+        return ""
+
+
+def setup_url(searchurl: str, imgsize: str, imgcolor: str, imgtype: str, safesearch: str):
+    features = [searchurl]
+    subfeatures = [[],[]]
+    if (imgsize != None):
+        subfeatures[0] += [process_image_size(imgsize)]
+    if (imgcolor != None):
+        subfeatures[0] += [process_image_color(imgcolor)]
+    if (imgtype != None):
+        subfeatures[0] += [process_image_type(imgtype)]
+    if (safesearch != None):
+        subfeatures[1] += [process_safesearch(safesearch)]
+
+    delim1 = "&"
+    delim2 = "%2C"
+
+    if (subfeatures[0] != []):
+        features += ["tbs=" + delim2.join(subfeatures[0])]
+    if (subfeatures[1] != []):
+        features += ["safe=" + delim2.join(subfeatures[1])]
+
+    return delim1.join(features)
+
+
 ############################# scraping helpers ################################
 
 def get_image_urls(query: str, page: int):
@@ -159,7 +244,7 @@ def get_manifest(search_key: str, image_cnt: int):
 
 ################################# main api ####################################
 
-def scrape_images(search_key, image_cnt, directory, threads):
+def scrape_images(search_key, image_cnt, directory, threads, size, color, imgtype, safesearch):
     """
     Request manifest, generate paths, save files, get filetype.
     This is the only function that should be called externally.
@@ -174,7 +259,8 @@ def scrape_images(search_key, image_cnt, directory, threads, size, color, imgtyp
     print("savedir: {}".format(directory))
     if not os.path.exists(directory):
         os.makedirs(directory)
-
+    global search_url
+    search_url = setup_url(search_url, size, color, imgtype, safesearch)
    id_url_manifest = get_manifest(search_key, image_cnt)
     with ThreadPoolExecutor(max_workers=threads) as pool:
         with tqdm(total=len(id_url_manifest)) as progress:
From 767c8a83e5d9caac6501a6e11f15be7f2799dc3f Mon Sep 17 00:00:00 2001
From: rabbitflyer5 <>
Date: Wed, 9 Aug 2023 16:38:13 -0700
Subject: [PATCH 2/3] Added most of the remaining advanced image search features.

---
 src/cli.py     |  42 ++++++++++++-------
 src/main.py    |   2 +-
 src/scraper.py | 109 ++++++++++++++++++++++++++++++-------------------
 3 files changed, 96 insertions(+), 57 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index d01e1c7..b5afacd 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -1,5 +1,4 @@
 import argparse, os, sys
-from enum import Enum
 
 # Should revisit this to look for xdg_downloads in env
 def get_download_path():
@@ -60,30 +59,45 @@ def get_arguments(argv=sys.argv):
                         nargs="?",
                         default=1)
     parser.add_argument("-s", "--size",
-                        help="Restrict your search to a certain size of image. Can be 'large', 'medium', or 'icon'.",
+                        help="Restrict your search to a certain size of image.",
                         type=str,
                         nargs="?",
-                        choices=['large','medium','icon'],
-                        default='')
-    parser.add_argument("--color",
-                        help="Search for a certain color of image. Can be 'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown', 'grayscale', or 'transparent'.",
+                        choices=['large','medium','icon', '400x300', '640x480', '800x600', '1024x768', '2mp', '4mp', '8mp', '10mp', '12mp', '15mp', '20mp', '40mp', '70mp'])
+    parser.add_argument("-a", "--aspectratio",
+                        help="Restrict to specific aspect ratios.",
                         type=str,
                         nargs="?",
-                        choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown', 'grayscale', 'transparent'],
-                        default='')
+                        choices=['tall', 'square', 'wide', 'panoramic'])
+    parser.add_argument("-i", "--color",
+                        help="Search for a certain color of image.",
+                        type=str,
+                        nargs="?",
+                        choices=['color', 'grayscale', 'transparent', 'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown'])
     parser.add_argument("-k", "--type",
-                        help="The type of image to search for. Can be 'clipart', 'lineart', or 'animated'.",
+                        help="The type of image to search for.",
+                        type=str,
+                        nargs="?",
+                        choices=['face', 'photo', 'clipart', 'lineart', 'animated'],
+                        dest="imgtype")
+    parser.add_argument("-r", "--region",
+                        help="Get results from a specific region.",
+                        type=str,
+                        nargs="?")
+    parser.add_argument("-f", "--filetype",
+                        help="Search for a specific file extension.",
+                        type=str,
+                        nargs="?",
+                        choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw'])
+    parser.add_argument("-u", "--usage",
+                        help="Specify usage rights.",
                         type=str,
                         nargs="?",
-                        choices=['clipart', 'lineart', 'animated'],
-                        dest="imgtype",
-                        default='')
+                        choices=['cc', 'other'])
     parser.add_argument("-p", "--safesearch",
                         help="Force the use of a specific safesearch setting. Can be 'on' or 'off'.",
                         type=str,
                         nargs="?",
-                        choices=['on', 'off'],
-                        default='')
+                        choices=['on', 'off'])
     args = parser.parse_args(argv[1:])
     # Set default directory
     if args.directory is None:
diff --git a/src/main.py b/src/main.py
index 20e1a5e..2a6679a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -5,7 +5,7 @@
 
 def main():
     args = get_arguments(sys.argv)
-    scrape_images(args.keyword[0], args.count, args.directory, args.threads, args.size, args.color, args.imgtype, args.safesearch)
+    scrape_images(args.keyword[0], args.count, args.directory, args.threads, args.size, args.aspectratio, args.color, args.imgtype, args.region, args.filetype, args.usage, args.safesearch)
 
 if __name__ == "__main__":
     main()
diff --git a/src/scraper.py b/src/scraper.py
index 2b5f04b..b61dc6d 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -43,75 +43,100 @@ def add_filetype(file_path: str):
 
 
 def process_image_size(val: str):
+    key = 'isz:'
     if (val == 'large'):
-        return "isz:l"
+        return key + 'l'
     elif (val == 'medium'):
-        return "isz:m"
+        return key + 'm'
     elif (val == 'icon'):
-        return "isz:i"
+        return key + 'i'
+    elif (val in ['400x300', '640x480', '800x600', '1024x768']):
+        key += 'lt%2Cislt:'
+        if (val == '400x300'):
+            return key + "qsvga"
+        elif (val == '640x480'):
+            return key + "vga"
+        elif (val == '800x600'):
+            return key + "svga"
+        elif (val == '1024x768'):
+            return key + "xga"
+    elif (val in ['2mp','4mp','6mp','8mp','10mp','12mp','15mp','20mp','40mp','70mp']):
+        return key + 'lt%2Cislt:' + val
     else:
         return ""
 
+def process_image_aspectratio(val: str):
+    key = 'iar:'
+    if (val == 'tall'):
+        return key + 't'
+    elif (val == 'square'):
+        return key + 's'
+    elif (val == 'wide'):
+        return key + 'w'
+    elif (val == 'panoramic'):
+        return key + 'xw'
+
 def process_image_color(val: str):
-    if (val == "grayscale"):
+    if (val == "color"):
+        return "ic:color"
+    elif (val == "grayscale"):
         return "ic:gray"
     elif (val == "transparent"):
         return "ic:trans"
-    elif (val == "red"):
-        return "ic:specific%2Cisc:red"
-    elif (val == "orange"):
-        return "ic:specific%2Cisc:orange"
-    elif (val == "yellow"):
-        return "ic:specific%2Cisc:yellow"
-    elif (val == "green"):
-        return "ic:specific%2Cisc:green"
-    elif (val == "teal"):
-        return "ic:specific%2Cisc:teal"
-    elif (val == "blue"):
-        return "ic:specific%2Cisc:blue"
-    elif (val == "purple"):
-        return "ic:specific%2Cisc:purple"
-    elif (val == "pink"):
-        return "ic:specific%2Cisc:pink"
-    elif (val == "white"):
-        return "ic:specific%2Cisc:white"
-    elif (val == "gray"):
-        return "ic:specific%2Cisc:gray"
-    elif (val == "black"):
-        return "ic:specific%2Cisc:black"
-    elif (val == "brown"):
-        return "ic:specific%2Cisc:brown"
+    elif (val in ['red','orange','yellow','green','teal','blue','purple','pink','white','gray','black','brown']):
+        return "ic:specific%2Cisc:" + val
     else:
         return ""
 
 def process_image_type(val: str):
-    if (val == "clipart"):
-        return "itp:clipart"
-    elif (val == "lineart"):
-        return "itp:lineart"
-    elif (val == "animated"):
-        return "itp:animated"
+    if (val in ['face', 'photo', 'clipart', 'lineart', 'animated']):
+        return 'itp:' + val
     else:
         return ""
 
+def process_image_region(val: str):
+    if (val == ''):
+        return ''
+    else:
+        return 'ctr:country' + val.upper()
+
+def process_image_filetype(val: str):
+    if (val in ['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw']):
+        return 'ift:' + val
+
+def process_image_usage(val: str):
+    key = 'sur:'
+    if (val == 'cc'):
+        return key + 'cl'
+    elif (val == 'other'):
+        return key + 'ol'
+    else:
+        return ''
+
 def process_safesearch(val: str):
-    if (val == "on"):
-        return "on"
-    elif (val == "off"):
-        return "off"
+    if (val in ["on", "off"]):
+        return val
     else:
         return ""
 
 
-def setup_url(searchurl: str, imgsize: str, imgcolor: str, imgtype: str, safesearch: str):
+def setup_url(searchurl: str, imgsize: str, imgaspectratio: str, imgcolor: str, imgtype: str, imgregion: str, imgfiletype: str, imgusage: str, safesearch: str):
     features = [searchurl]
     subfeatures = [[],[]]
     if (imgsize != None):
         subfeatures[0] += [process_image_size(imgsize)]
+    if (imgaspectratio != None):
+        subfeatures[0] += [process_image_aspectratio(imgaspectratio)]
     if (imgcolor != None):
         subfeatures[0] += [process_image_color(imgcolor)]
     if (imgtype != None):
         subfeatures[0] += [process_image_type(imgtype)]
+    if (imgregion != None):
+        subfeatures[0] += [process_image_region(imgregion)]
+    if (imgfiletype != None):
+        subfeatures[0] += [process_image_filetype(imgfiletype)]
+    if (imgusage != None):
+        subfeatures[0] += [process_image_usage(imgusage)]
     if (safesearch != None):
         subfeatures[1] += [process_safesearch(safesearch)]
 
@@ -122,7 +147,7 @@ def setup_url(searchurl: str, imgsize: str, imgcolor: str, imgtype: str, safesea
         features += ["tbs=" + delim2.join(subfeatures[0])]
     if (subfeatures[1] != []):
         features += ["safe=" + delim2.join(subfeatures[1])]
-
+    print(delim1.join(features))
     return delim1.join(features)
 
 
@@ -244,7 +269,7 @@ def get_manifest(search_key: str, image_cnt: int):
 
 ################################# main api ####################################
 
-def scrape_images(search_key, image_cnt, directory, threads, size, color, imgtype, safesearch):
+def scrape_images(search_key, image_cnt, directory, threads, size, aspectratio, color, imgtype, region, filetype, usage, safesearch):
     """
     Request manifest, generate paths, save files, get filetype.
     This is the only function that should be called externally.
@@ -260,7 +285,7 @@ def scrape_images(search_key, image_cnt, directory, threads, size, color, imgtyp
     if not os.path.exists(directory):
         os.makedirs(directory)
     global search_url
-    search_url = setup_url(search_url, size, color, imgtype, safesearch)
+    search_url = setup_url(search_url, size, aspectratio, color, imgtype, region, filetype, usage, safesearch)
     id_url_manifest = get_manifest(search_key, image_cnt)
     with ThreadPoolExecutor(max_workers=threads) as pool:
         with tqdm(total=len(id_url_manifest)) as progress:
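With the second patch applied, every new flag funnels into the widened setup_url() signature above. A quick way to sanity-check the composition is to call it directly; this is a sketch under the assumption that src/ is on PYTHONPATH (and that scraper.py's own dependencies such as requests and tqdm are installed), with a placeholder string standing in for the module's search_url:

    # Sketch only: exercises the patch's helpers directly with a placeholder base.
    from scraper import setup_url  # assumes src/ is importable

    url = setup_url("BASE", "large", "wide", "red", "photo", None, "png", None, "on")
    print(url)
    # BASE&tbs=isz:l%2Ciar:w%2Cic:specific%2Cisc:red%2Citp:photo%2Cift:png&safe=on

Unset filters arrive from argparse as None and are skipped by the != None checks, which is why the region and usage slots in the call above drop out of the final string.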
From 7f384ae0f4e2b407c46097f4e716f5e60a29aeb2 Mon Sep 17 00:00:00 2001
From: Ryan Talley
Date: Thu, 12 Oct 2023 01:02:04 -0400
Subject: [PATCH 3/3] began restructuring data/filters into dict

---
 src/cli.py     |  46 +++++++++++++++-------
 src/main.py    |  15 ++++++-
 src/scraper.py | 103 ++++++++++++++++++++++---------------------
 3 files changed, 105 insertions(+), 59 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index b5afacd..16ff88f 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -43,7 +43,7 @@ def get_arguments(argv=sys.argv):
     parser.add_argument("keyword",
                         help="the phrase used to find images",
                         type=str,
-                        nargs=1)
+                        nargs="?")
     parser.add_argument("-c", "--count",
                         help="How many images to try to scrape",
                         type=check_pos_int,
@@ -62,45 +62,65 @@ def get_arguments(argv=sys.argv):
                         help="Restrict your search to a certain size of image.",
                         type=str,
                         nargs="?",
-                        choices=['large','medium','icon', '400x300', '640x480', '800x600', '1024x768', '2mp', '4mp', '8mp', '10mp', '12mp', '15mp', '20mp', '40mp', '70mp'])
+                        default=None,
+                        choices=["large","medium","icon", "400x300", "640x480",
+                                 "800x600", "1024x768", "2mp", "4mp", "8mp",
+                                 "10mp", "12mp", "15mp", "20mp", "40mp", "70mp"])
     parser.add_argument("-a", "--aspectratio",
                         help="Restrict to specific aspect ratios.",
                         type=str,
                         nargs="?",
-                        choices=['tall', 'square', 'wide', 'panoramic'])
+                        default=None,
+                        choices=["tall", "square", "wide", "panoramic"])
     parser.add_argument("-i", "--color",
                         help="Search for a certain color of image.",
                         type=str,
                         nargs="?",
-                        choices=['color', 'grayscale', 'transparent', 'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown'])
+                        default=None,
+                        choices=["color", "grayscale", "transparent", "red",
+                                 "orange", "yellow", "green", "teal", "blue",
+                                 "purple", "pink", "white", "gray", "black",
+                                 "brown"])
     parser.add_argument("-k", "--type",
                         help="The type of image to search for.",
                         type=str,
                         nargs="?",
-                        choices=['face', 'photo', 'clipart', 'lineart', 'animated'],
-                        dest="imgtype")
+                        default=None,
+                        choices=["face", "photo", "clipart",
+                                 "lineart", "animated"],
+                        dest="type")
     parser.add_argument("-r", "--region",
-                        help="Get results from a specific region.",
+                        help="Get results from a specific country.",
                         type=str,
-                        nargs="?")
+                        nargs="?",
+                        default=None)
+    parser.add_argument("-w", "--site",
+                        help="Get results from a specific site or domain.",
+                        type=str,
+                        nargs="?",
+                        default=None)
     parser.add_argument("-f", "--filetype",
                         help="Search for a specific file extension.",
                         type=str,
                         nargs="?",
-                        choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw'])
+                        default=None,
+                        choices=["jpg", "gif", "png", "bmp",
+                                 "svg", "webp", "ico", "raw"])
     parser.add_argument("-u", "--usage",
                         help="Specify usage rights.",
                         type=str,
                         nargs="?",
-                        choices=['cc', 'other'])
+                        default=None,
+                        choices=["cc", "other"])
     parser.add_argument("-p", "--safesearch",
-                        help="Force the use of a specific safesearch setting. Can be 'on' or 'off'.",
+                        help="Specify safesearch usage. Can be 'on' or 'off'.",
                         type=str,
                         nargs="?",
-                        choices=['on', 'off'])
+                        default="off",
+                        choices=["on", "off"])
     args = parser.parse_args(argv[1:])
     # Set default directory
     if args.directory is None:
         print(args.keyword[0])
-        args.directory = get_default_dir(args.keyword[0])
+        args.directory = get_default_dir(args.keyword)
     return args
diff --git a/src/main.py b/src/main.py
index 2a6679a..a823ead 100644
--- a/src/main.py
+++ b/src/main.py
@@ -5,7 +5,20 @@
 
 def main():
     args = get_arguments(sys.argv)
-    scrape_images(args.keyword[0], args.count, args.directory, args.threads, args.size, args.aspectratio, args.color, args.imgtype, args.region, args.filetype, args.usage, args.safesearch)
+    print(args)
+    print(args.keyword)
+    filters = {
+        "size": args.size,
+        "aspectratio": args.aspectratio,
+        "color": args.color,
+        "type": args.type,
+        "region": args.region,
+        "site": args.site,
+        "filetype": args.filetype,
+        "usage": args.usage,
+        "safesearch": args.safesearch,
+    }
+    scrape_images(args.keyword, args.count, args.directory, args.threads, filters)
 
 if __name__ == "__main__":
     main()
diff --git a/src/scraper.py b/src/scraper.py
index b61dc6d..bd695b7 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,5 +1,6 @@
 import json, os, sys
 from concurrent.futures import ThreadPoolExecutor
+from copy import copy
 
 import filetype
 import requests
@@ -43,6 +44,7 @@ def add_filetype(file_path: str):
 
 
 def process_image_size(val: str):
+    # This can be refactored to use maps and eliminate redundancy (test first)
     key = 'isz:'
     if (val == 'large'):
         return key + 'l'
@@ -51,7 +53,7 @@ def process_image_size(val: str):
     elif (val == 'icon'):
         return key + 'i'
     elif (val in ['400x300', '640x480', '800x600', '1024x768']):
-        key += 'lt%2Cislt:'
+        key += 'lt%2Cislt:'
         if (val == '400x300'):
             return key + "qsvga"
         elif (val == '640x480'):
@@ -60,8 +62,9 @@ def process_image_size(val: str):
             return key + "svga"
         elif (val == '1024x768'):
             return key + "xga"
-    elif (val in ['2mp','4mp','6mp','8mp','10mp','12mp','15mp','20mp','40mp','70mp']):
-        return key + 'lt%2Cislt:' + val
+    elif (val in ['2mp','4mp','6mp','8mp','10mp',
+                  '12mp','15mp','20mp','40mp','70mp']):
+        return key + 'lt%2Cislt:' + val
     else:
         return ""
 
@@ -83,7 +86,8 @@ def process_image_color(val: str):
         return "ic:gray"
     elif (val == "transparent"):
         return "ic:trans"
-    elif (val in ['red','orange','yellow','green','teal','blue','purple','pink','white','gray','black','brown']):
+    elif (val in ['red','orange','yellow','green','teal','blue',
+                  'purple','pink','white','gray','black','brown']):
         return "ic:specific%2Cisc:" + val
     else:
         return ""
@@ -95,9 +99,7 @@ def process_image_type(val: str):
         return ""
 
 def process_image_region(val: str):
-    if (val == ''):
-        return ''
-    else:
+    if (val != None):
         return 'ctr:country' + val.upper()
 
 def process_image_filetype(val: str):
@@ -113,42 +115,40 @@ def process_image_usage(val: str):
     else:
         return ''
 
-def process_safesearch(val: str):
-    if (val in ["on", "off"]):
-        return val
-    else:
-        return ""
+def setup_url(filters):
+    global search_url
+    filtered_url = copy(search_url)
 
-def setup_url(searchurl: str, imgsize: str, imgaspectratio: str, imgcolor: str, imgtype: str, imgregion: str, imgfiletype: str, imgusage: str, safesearch: str):
-    features = [searchurl]
-    subfeatures = [[],[]]
-    if (imgsize != None):
-        subfeatures[0] += [process_image_size(imgsize)]
-    if (imgaspectratio != None):
-        subfeatures[0] += [process_image_aspectratio(imgaspectratio)]
-    if (imgcolor != None):
-        subfeatures[0] += [process_image_color(imgcolor)]
-    if (imgtype != None):
-        subfeatures[0] += [process_image_type(imgtype)]
-    if (imgregion != None):
-        subfeatures[0] += [process_image_region(imgregion)]
-    if (imgfiletype != None):
-        subfeatures[0] += [process_image_filetype(imgfiletype)]
-    if (imgusage != None):
-        subfeatures[0] += [process_image_usage(imgusage)]
-    if (safesearch != None):
-        subfeatures[1] += [process_safesearch(safesearch)]
-
-    delim1 = "&"
-    delim2 = "%2C"
-
-    if (subfeatures[0] != []):
-        features += ["tbs=" + delim2.join(subfeatures[0])]
-    if (subfeatures[1] != []):
-        features += ["safe=" + delim2.join(subfeatures[1])]
-    print(delim1.join(features))
-    return delim1.join(features)
+    features = [search_url]
+    url_ids = []
+
+    filter_keys = list(filters.keys())
+    # Need to confirm we can't put these into the tbs tag
+    if filters["safesearch"] == "on":
+        filtered_url += "&safe=on"
+    if filters["site"] != None:
+        filtered_url += ("&as_sitesearch=" + filters["site"])
+    filter_keys.remove("safesearch")
+    filter_keys.remove("site")
+    # if filters["region"] != None:
+    #     filtered_url += ("&" + process_image_region(filters["region"]))
+    #     filter_keys.remove("region")
+
+    # append_val = (lambda l, a: l.append(a) if a is not None)
+    def append_val(l, v):
+        if v is not None:
+            l.append(v)
+    for k in filter_keys:
+        function_name = "process_image_" + k
+        process_function = globals()[function_name]
+        append_val(url_ids, process_function(filters[k]))
+
+    delim = ","
+    if (url_ids[0] != []):
+        filtered_url += "&tbs=" + delim.join(url_ids)
+
+    return filtered_url
 
 
 ############################# scraping helpers ################################
 
 def get_image_urls(query: str, page: int):
@@ -269,7 +269,7 @@ def get_manifest(search_key: str, image_cnt: int):
 
 ################################# main api ####################################
 
-def scrape_images(search_key, image_cnt, directory, threads, size, aspectratio, color, imgtype, region, filetype, usage, safesearch):
+def scrape_images(search_key, image_cnt, directory, threads, filters):
     """
     Request manifest, generate paths, save files, get filetype.
     This is the only function that should be called externally.
@@ -279,13 +279,13 @@ def scrape_images(search_key, image_cnt, directory, threads, size, aspectratio,
     image_cnt -- how many images are we trying to scrape
     directory -- the folder to save scraped images in
     threads -- how many worker threads to spawn
+    filters -- hashmap of image filters to apply to the search results
     """
     if DEBUG:
         print("savedir: {}".format(directory))
     if not os.path.exists(directory):
         os.makedirs(directory)
-    global search_url
-    search_url = setup_url(search_url, size, aspectratio, color, imgtype, region, filetype, usage, safesearch)
+    search_url = setup_url(filters)
     id_url_manifest = get_manifest(search_key, image_cnt)
     with ThreadPoolExecutor(max_workers=threads) as pool:
         with tqdm(total=len(id_url_manifest)) as progress:
@@ -313,7 +313,20 @@ def test():
     directory = get_default_dir(search_key)
     threads = 4
 
-    scrape_images(search_key, image_cnt, directory, threads)
+    filters = {
+        "size": "large",
+        "aspectratio": "panoramic",
+        "color": "green",
+        "type": "clipart",
+        "region": "CA", # Needs a mapping of inputs to country choices
+                        # Use an import statement and roll in another file
+        "site": "laksjdf",
+        "filetype": "png",
+        "usage": "other",
+        "safesearch": "on"
+    }
+
+    scrape_images(search_key, image_cnt, directory, threads, filters)
 
 if __name__ == "__main__":
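The heart of this last patch is the name-based dispatch inside setup_url(): each key left in filter_keys selects a process_image_<key>() helper through globals(). A standalone sketch of that pattern (stub token tables, not the patch's real helpers) shows the control flow, and why values that can come back as None need the append_val-style guard:

    # Sketch of the dict-plus-dispatch idea; helper bodies here are illustrative stubs.
    def process_image_size(val):
        return {"large": "isz:l", "medium": "isz:m", "icon": "isz:i"}.get(val)

    def process_image_color(val):
        return {"grayscale": "ic:gray", "transparent": "ic:trans"}.get(val)

    def build_tbs(filters):
        tokens = []
        for name, value in filters.items():
            handler = globals().get("process_image_" + name)
            if handler is None or value is None:
                continue                      # unknown filter name or unset flag
            token = handler(value)
            if token is not None:             # helpers may return None for odd input
                tokens.append(token)
        return "&tbs=" + ",".join(tokens) if tokens else ""

    print(build_tbs({"size": "large", "color": "grayscale", "usage": None}))
    # &tbs=isz:l,ic:gray

One design note: looking helpers up with globals().get(), rather than indexing globals()[...] as the patch does, turns a mistyped filter name into a skipped filter instead of a KeyError, which matters once the dict keys come straight from the CLI layer.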