summary | refs | log | tree | commit | diff
path: root/browser.py
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2018-11-26 22:31:46 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-26 22:31:46 +0100
commit: 353b7de86f790a2ee03f2573218d15d51a855043 (patch)
tree: ef291ee6b223a4babdc20d5c6dfebf760369032d /browser.py
parent: 78a670cce0a717c52d1641ee01d8295e38eb367b (diff)
scrape vam
Diffstat (limited to 'browser.py')
-rw-r--r-- browser.py 62
1 files changed, 62 insertions, 0 deletions
diff --git a/browser.py b/browser.py
new file mode 100644
index 0000000..f808f9b
--- /dev/null
+++ b/browser.py
@@ -0,0 +1,62 @@
+from multiprocessing import Pool
+import os
+import requests
+import time
+from PIL import Image
+
+# Default HTTP request headers passed to every requests.get() call in this
+# module; the User-Agent is a desktop Chrome 70 (macOS) browser string.
+headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
+}
+
+def fetch_file(url, fn, **kwargs):
+    """Download *url* and write the response body to the file *fn*.
+
+    Extra keyword arguments are sent as query-string parameters.
+    Nothing is written when *fn* already exists, when the request fails,
+    or when the server answers with a status other than 200.
+
+    Always returns None; after a successful download the size (in kb)
+    and the file name are printed.
+    """
+    # Check for an existing file first so no request is made for it.
+    if os.path.exists(fn):
+        return None
+    try:
+        # NOTE(review): verify=False turns off TLS certificate checking;
+        # kept from the original -- confirm this is intentional.
+        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
+    except requests.RequestException:
+        # Best-effort download: a failed request is skipped, but
+        # KeyboardInterrupt / SystemExit are no longer swallowed.
+        return None
+    if resp.status_code != 200:
+        return None
+    size = 0
+    with open(fn, 'wb') as f:
+        for chunk in resp.iter_content(chunk_size=1024):
+            if chunk:
+                size += len(chunk)
+                f.write(chunk)
+    print("{} kb. {}".format(round(size / 1024), fn))
+    return None
+
+def fetch_raw(url, **kwargs):
+    """Fetch *url* and return the response body as text.
+
+    Extra keyword arguments are sent as query-string parameters.
+    Returns None when the request fails or the status code is not 200.
+    """
+    try:
+        # NOTE(review): verify=False turns off TLS certificate checking;
+        # kept from the original -- confirm this is intentional.
+        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
+    except requests.RequestException:
+        # Only request errors are treated as "no result"; interrupts and
+        # programming errors propagate instead of being swallowed.
+        return None
+    if resp.status_code != 200:
+        return None
+    return resp.text
+
+def fetch_json(url, **kwargs):
+    """Fetch *url* and return the response body decoded as JSON.
+
+    Extra keyword arguments are sent as query-string parameters.
+    Returns None when the request fails or the status code is not 200.
+    The decode step is not guarded, so an error raised by resp.json()
+    propagates to the caller.
+    """
+    try:
+        # NOTE(review): verify=False turns off TLS certificate checking;
+        # kept from the original -- confirm this is intentional.
+        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
+    except requests.RequestException:
+        # Only request errors are treated as "no result"; interrupts and
+        # programming errors propagate instead of being swallowed.
+        return None
+    if resp.status_code != 200:
+        return None
+    return resp.json()
+
+def parallel_fetch(dataset, agents=5, chunksize=3):
+    """Download a batch of files with a pool of worker processes.
+
+    dataset   -- sized sequence of argument tuples; each tuple is unpacked
+                 into one fetch_file call (e.g. ``(url, fn)``).
+    agents    -- number of worker processes in the pool (default 5).
+    chunksize -- number of tasks handed to a worker at a time (default 3).
+
+    Blocks until every item has been processed.
+    """
+    print("Fetching {} tiles".format(len(dataset)))
+    with Pool(processes=agents) as pool:
+        pool.starmap(fetch_file, dataset, chunksize)
+
+def load_image(fn):
+    """Open the image *fn* with PIL and report its dimensions.
+
+    Returns a tuple ``(image, width, height)``; when the image cannot be
+    opened the result is ``(None, 0, 0)``.
+    """
+    try:
+        image = Image.open(fn)
+        width, height = image.size
+    except Exception:
+        # Best-effort: any failure to open the image yields the empty
+        # result, but KeyboardInterrupt / SystemExit are not swallowed.
+        return None, 0, 0
+    return image, width, height
+