summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2018-11-26 22:31:46 +0100
committerJules Laplace <julescarbon@gmail.com>2018-11-26 22:31:46 +0100
commit353b7de86f790a2ee03f2573218d15d51a855043 (patch)
treeef291ee6b223a4babdc20d5c6dfebf760369032d
parent78a670cce0a717c52d1641ee01d8295e38eb367b (diff)
scrape vam
-rw-r--r--browser.py62
-rw-r--r--scrape-codex.py1
-rw-r--r--scrape-vam.py75
3 files changed, 137 insertions, 1 deletions
diff --git a/browser.py b/browser.py
new file mode 100644
index 0000000..f808f9b
--- /dev/null
+++ b/browser.py
@@ -0,0 +1,62 @@
+from multiprocessing import Pool
+import os
+import requests
+import time
+from PIL import Image
+
# Spoof a desktop Chrome User-Agent: some image/tile servers refuse or
# throttle requests that arrive with the default python-requests UA string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
+
def fetch_file(url, fn, **kwargs):
    """Download *url* to local path *fn*, streaming in 1 KiB chunks.

    Extra keyword arguments are forwarded as query-string parameters.
    Always returns None; a pre-existing file, a non-200 response, or a
    network error all cause a silent early exit (best-effort scraper).
    """
    # Skip the network round-trip entirely when the tile is already on
    # disk. (The original issued the HTTP request first and only then
    # checked for the file, wasting a full download per cached tile.)
    if os.path.exists(fn):
        return None
    try:
        # stream=True so iter_content() below actually streams instead of
        # buffering the whole response body in memory before we write it.
        # verify=False: TLS verification deliberately disabled upstream —
        # NOTE(review): confirm the tile hosts really need this.
        resp = requests.get(url, params=kwargs, headers=headers,
                            verify=False, stream=True)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # DNS failure, timeout, connection reset, ... — skip this tile.
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                size += len(chunk)
                f.write(chunk)
    print("{} kb. {}".format(round(size / 1024), fn))
    return None
+
def fetch_raw(url, **kwargs):
    """GET *url* (kwargs become query parameters) and return the response
    body as text, or None on a non-200 status or any network failure.
    """
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # Was a bare `except:` — that also swallowed KeyboardInterrupt and
        # programming errors. Catch only requests' own failure hierarchy.
        return None
    return resp.text
+
def fetch_json(url, **kwargs):
    """GET *url* (kwargs become query parameters) and return the decoded
    JSON body, or None on a non-200 status or any network failure.

    NOTE: a 200 response whose body is not valid JSON will still raise
    from resp.json(); callers currently only hit JSON endpoints.
    """
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # Narrowed from a bare `except:` so real bugs are not hidden.
        return None
    return resp.json()
+
def parallel_fetch(dataset, agents=5, chunksize=3):
    """Download every (url, filename) pair in *dataset* via fetch_file,
    spread across a pool of worker processes.

    agents / chunksize were hard-coded locals; they are now keyword
    parameters with the same defaults, so existing callers are unchanged
    while new callers can tune pool size per workload.
    """
    print("Fetching {} tiles".format(len(dataset)))
    with Pool(processes=agents) as pool:
        # starmap unpacks each (url, fn) tuple into fetch_file's arguments.
        pool.starmap(fetch_file, dataset, chunksize)
+
def load_image(fn):
    """Open the image at path *fn* with PIL.

    Returns (image, width, height) on success, or (None, 0, 0) when the
    file is missing or cannot be decoded.
    """
    try:
        image = Image.open(fn)
    except (OSError, ValueError):
        # FileNotFoundError and PIL's UnidentifiedImageError are both
        # OSError subclasses; ValueError covers PIL's bad-mode paths.
        # Replaces a bare `except:` that also hid genuine bugs.
        return None, 0, 0
    width, height = image.size
    return image, width, height
+
diff --git a/scrape-codex.py b/scrape-codex.py
index 3bd6ec0..6cc1ba0 100644
--- a/scrape-codex.py
+++ b/scrape-codex.py
@@ -90,7 +90,6 @@ def grab(s, n):
TILE_W = round((max_width / ZOOM) / 256) + 1
TILE_H = round((max_height / ZOOM) / 256) + 1
if TILE_W < TILE_H:
- TILE_W += 1
TILE_H += 1
print("{}x{}".format(TILE_W, TILE_H))
diff --git a/scrape-vam.py b/scrape-vam.py
new file mode 100644
index 0000000..8ded1f6
--- /dev/null
+++ b/scrape-vam.py
@@ -0,0 +1,75 @@
+#!python
+
+from multiprocessing import Pool
+from math import floor
+import os
+import requests
+import time
+from PIL import Image
+from browser import *
+
# V&A collection accession prefix; page numbers are appended to form the
# full object id (e.g. 2006AW1773).
TAG = '2006AW'
# Inclusive range of page ids to scrape.
FIRST_PAGE = 1773
LAST_PAGE = 1879
# IIIF zoom divisor: 1 = full resolution.
ZOOM = 1
+
# Fetch all the tiles from a tile server and then compose them into a single image
def grab(page):
    """Download every 256x256 IIIF tile for object TAG+page from the V&A
    image server and stitch them into ./TAG/ZOOM/page_ZOOM.jpg.

    Skips pages whose composite already exists; prints and returns early
    when the server's info.json cannot be fetched.
    """
    out_fn = "./{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, ZOOM)
    if os.path.exists(out_fn):
        return

    print("Fetching {}...".format(page))
    os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)

    # Use TAG instead of repeating the literal '2006AW' (the original
    # hard-coded it here, so changing TAG alone silently broke the URLs);
    # also drop the stray double slash after the hostname.
    json_url = "https://framemark.vam.ac.uk/collections/{}{}/info.json".format(TAG, page)
    data = fetch_json(json_url)
    if data is None:
        print("error with page {}".format(page))
        return
    # IIIF info.json advertises the full-resolution pixel dimensions.
    max_width = data['width']
    max_height = data['height']

    TILE_W = round((max_width / ZOOM) / 256) + 1
    TILE_H = round((max_height / ZOOM) / 256) + 1
    if TILE_W < TILE_H:
        TILE_H += 1
    print("{}x{}".format(TILE_W, TILE_H))

    # Build the work list of (tile URL, local filename) pairs. The loops
    # run to TILE_W/TILE_H inclusive, deliberately over-requesting an edge
    # row/column; missing tiles simply 404 and are skipped by fetch_file.
    dataset = []
    for i in range(0, TILE_W + 1):
        for j in range(0, TILE_H + 1):
            # e.g. https://framemark.vam.ac.uk/collections/2006AW1773/768,256,256,256/256,/0/default.jpg
            url = "https://framemark.vam.ac.uk/collections/{}{}/{},{},256,256/256,/0/default.jpg".format(TAG, page, i * 256, j * 256)
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            if not os.path.exists(fn):
                dataset.append((url, fn,))

    # Fetch all the images we don't have
    if len(dataset):
        parallel_fetch(dataset)

    # Get the dimensions of the final image (interior plus corners)
    ww = max_width
    hh = max_height

    # Build the new canvas by pasting the tiles across it; tiles that
    # failed to download load as (None, 0, 0) and leave a black square.
    canvas = Image.new('RGB', (ww, hh,))
    x = 0
    for i in range(0, TILE_W + 1):
        y = 0
        for j in range(0, TILE_H + 1):
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            image, width, height = load_image(fn)
            if image:
                canvas.paste(image, (x, y))
            y += 256
        x += 256
    canvas.save(out_fn)
+
if __name__ == '__main__':
    # Ensure the output directory for this tag/zoom exists, then scrape
    # every page in the inclusive range FIRST_PAGE..LAST_PAGE.
    os.makedirs("{}/{}".format(TAG, ZOOM), exist_ok=True)
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        grab(page)
+