# Recovered from a cgit-rendered git patch (cgit v1.2.3-70-g09d2) that had been
# flattened onto a single line:
#   commit  353b7de86f790a2ee03f2573218d15d51a855043
#   author  Jules Laplace
#   date    Mon, 26 Nov 2018 22:31:46 +0100
#   subject scrape vam
#   change  browser.py created (new file, mode 100644)
"""Small HTTP helpers for scraping: fetch files/text/JSON and open images.

Every fetcher sends the module-level ``headers`` and returns ``None`` when the
request fails or the server answers with a status other than 200.
"""
from multiprocessing import Pool
import os
import requests
import time
from PIL import Image

# Sent with every request issued by this module.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def _get(url, params):
    """Issue a GET for *url*; return the response on HTTP 200, else ``None``.

    *params* is a dict passed through as the query string.
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # Kept as in the original; confirm it is really required for the
        # target host before reusing this helper elsewhere.
        resp = requests.get(url, params=params, headers=headers, verify=False)
    except requests.RequestException:
        # Network-level failure (DNS, connection, invalid URL, ...). Catching
        # the requests base class instead of a bare ``except`` keeps Ctrl-C
        # and SystemExit working during a long scrape.
        return None
    if resp.status_code != 200:
        return None
    return resp


def fetch_file(url, fn, **kwargs):
    """Download *url* to the path *fn* unless *fn* already exists.

    Extra keyword arguments are sent as query parameters. Prints the number
    of kilobytes written together with the file name on success.

    Returns:
        Always ``None`` (also when the download is skipped or fails).
    """
    # Check before hitting the network: an existing file means there is
    # nothing to do, so do not spend a request on it.
    if os.path.exists(fn):
        return None
    resp = _get(url, kwargs)
    if resp is None:
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    print("{} kb. {}".format(round(size / 1024), fn))
    return None


def fetch_raw(url, **kwargs):
    """Return the response body of *url* as text, or ``None`` on failure.

    Extra keyword arguments are sent as query parameters.
    """
    resp = _get(url, kwargs)
    if resp is None:
        return None
    return resp.text


def fetch_json(url, **kwargs):
    """Return the decoded JSON body of *url*, or ``None`` on failure.

    Extra keyword arguments are sent as query parameters. A 200 response
    whose body is not valid JSON still raises from ``resp.json()``, exactly
    as before.
    """
    resp = _get(url, kwargs)
    if resp is None:
        return None
    return resp.json()


def parallel_fetch(dataset, agents=5, chunksize=3):
    """Run ``fetch_file`` over *dataset* in a process pool until finished.

    Args:
        dataset: sized sequence of argument tuples for ``fetch_file``,
            i.e. ``(url, fn)`` pairs.
        agents: number of worker processes (default 5).
        chunksize: number of tasks handed to a worker at a time (default 3).
    """
    count = len(dataset)
    print("Fetching {} tiles".format(count))
    if count == 0:
        # Nothing to download; do not spawn worker processes for no work.
        return
    with Pool(processes=agents) as pool:
        pool.starmap(fetch_file, dataset, chunksize)


def load_image(fn):
    """Open the image at *fn* with PIL.

    Returns:
        ``(image, width, height)`` on success, ``(None, 0, 0)`` if the file
        cannot be opened as an image.
    """
    try:
        image = Image.open(fn)
        width, height = image.size
        return image, width, height
    except Exception:
        # Best-effort loader: any failure to open/identify the file maps to
        # the "no image" result, but Ctrl-C / SystemExit still propagate.
        return None, 0, 0