summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2018-11-26 22:31:46 +0100
committerJules Laplace <julescarbon@gmail.com>2018-11-26 22:31:46 +0100
commit353b7de86f790a2ee03f2573218d15d51a855043 (patch)
treeef291ee6b223a4babdc20d5c6dfebf760369032d
parent78a670cce0a717c52d1641ee01d8295e38eb367b (diff)
scrape vam
-rw-r--r--browser.py62
-rw-r--r--scrape-codex.py1
-rw-r--r--scrape-vam.py75
3 files changed, 137 insertions, 1 deletions
diff --git a/browser.py b/browser.py
new file mode 100644
index 0000000..f808f9b
--- /dev/null
+++ b/browser.py
@@ -0,0 +1,62 @@
+from multiprocessing import Pool
+import os
+import requests
+import time
+from PIL import Image
+
# Spoof a desktop Chrome User-Agent: some image/tile servers refuse or
# throttle requests that arrive with the default python-requests UA string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
+
def fetch_file(url, fn, **kwargs):
    """Download *url* to local path *fn*, streaming in 1 KiB chunks.

    Extra keyword arguments are forwarded as query-string parameters.
    Always returns None; a pre-existing file, a non-200 response, or a
    network error all cause a silent early exit (best-effort scraper).
    """
    # Skip the network round-trip entirely when the tile is already on
    # disk. (The original issued the HTTP request first and only then
    # checked for the file, wasting a full download per cached tile.)
    if os.path.exists(fn):
        return None
    try:
        # stream=True so iter_content() below actually streams instead of
        # buffering the whole response body in memory before we write it.
        # verify=False: TLS verification deliberately disabled upstream —
        # NOTE(review): confirm the tile hosts really need this.
        resp = requests.get(url, params=kwargs, headers=headers,
                            verify=False, stream=True)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # DNS failure, timeout, connection reset, ... — skip this tile.
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                size += len(chunk)
                f.write(chunk)
    print("{} kb. {}".format(round(size / 1024), fn))
    return None
+
def fetch_raw(url, **kwargs):
    """GET *url* (kwargs become query parameters) and return the response
    body as text, or None on a non-200 status or any network failure.
    """
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # Was a bare `except:` — that also swallowed KeyboardInterrupt and
        # programming errors. Catch only requests' own failure hierarchy.
        return None
    return resp.text
+
def fetch_json(url, **kwargs):
    """GET *url* (kwargs become query parameters) and return the decoded
    JSON body, or None on a non-200 status or any network failure.

    NOTE: a 200 response whose body is not valid JSON will still raise
    from resp.json(); callers currently only hit JSON endpoints.
    """
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # Narrowed from a bare `except:` so real bugs are not hidden.
        return None
    return resp.json()
+
def parallel_fetch(dataset, agents=5, chunksize=3):
    """Download every (url, filename) pair in *dataset* via fetch_file,
    spread across a pool of worker processes.

    agents / chunksize were hard-coded locals; they are now keyword
    parameters with the same defaults, so existing callers are unchanged
    while new callers can tune pool size per workload.
    """
    print("Fetching {} tiles".format(len(dataset)))
    with Pool(processes=agents) as pool:
        # starmap unpacks each (url, fn) tuple into fetch_file's arguments.
        pool.starmap(fetch_file, dataset, chunksize)
+
def load_image(fn):
    """Open the image at path *fn* with PIL.

    Returns (image, width, height) on success, or (None, 0, 0) when the
    file is missing or cannot be decoded.
    """
    try:
        image = Image.open(fn)
    except (OSError, ValueError):
        # FileNotFoundError and PIL's UnidentifiedImageError are both
        # OSError subclasses; ValueError covers PIL's bad-mode paths.
        # Replaces a bare `except:` that also hid genuine bugs.
        return None, 0, 0
    width, height = image.size
    return image, width, height
+
diff --git a/scrape-codex.py b/scrape-codex.py
index 3bd6ec0..6cc1ba0 100644
--- a/scrape-codex.py
+++ b/scrape-codex.py
@@ -90,7 +90,6 @@ def grab(s, n):
TILE_W = round((max_width / ZOOM) / 256) + 1
TILE_H = round((max_height / ZOOM) / 256) + 1
if TILE_W < TILE_H:
- TILE_W += 1
TILE_H += 1
print("{}x{}".format(TILE_W, TILE_H))
diff --git a/scrape-vam.py b/scrape-vam.py
new file mode 100644
index 0000000..8ded1f6
--- /dev/null
+++ b/scrape-vam.py
@@ -0,0 +1,75 @@
+#!python
+
+from multiprocessing import Pool
+from math import floor
+import os
+import requests
+import time
+from PIL import Image
+from browser import *
+
# V&A collection accession prefix; page numbers are appended to form the
# full object id (e.g. 2006AW1773).
TAG = '2006AW'
# Inclusive range of page ids to scrape.
FIRST_PAGE = 1773
LAST_PAGE = 1879
# IIIF zoom divisor: 1 = full resolution.
ZOOM = 1
+
# Fetch all the tiles from a tile server and then compose them into a single image
def grab(page):
    """Download every 256x256 IIIF tile for object TAG+page from the V&A
    image server and stitch them into ./TAG/ZOOM/page_ZOOM.jpg.

    Skips pages whose composite already exists; prints and returns early
    when the server's info.json cannot be fetched.
    """
    out_fn = "./{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, ZOOM)
    if os.path.exists(out_fn):
        return

    print("Fetching {}...".format(page))
    os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)

    # Use TAG instead of repeating the literal '2006AW' (the original
    # hard-coded it here, so changing TAG alone silently broke the URLs);
    # also drop the stray double slash after the hostname.
    json_url = "https://framemark.vam.ac.uk/collections/{}{}/info.json".format(TAG, page)
    data = fetch_json(json_url)
    if data is None:
        print("error with page {}".format(page))
        return
    # IIIF info.json advertises the full-resolution pixel dimensions.
    max_width = data['width']
    max_height = data['height']

    TILE_W = round((max_width / ZOOM) / 256) + 1
    TILE_H = round((max_height / ZOOM) / 256) + 1
    if TILE_W < TILE_H:
        TILE_H += 1
    print("{}x{}".format(TILE_W, TILE_H))

    # Build the work list of (tile URL, local filename) pairs. The loops
    # run to TILE_W/TILE_H inclusive, deliberately over-requesting an edge
    # row/column; missing tiles simply 404 and are skipped by fetch_file.
    dataset = []
    for i in range(0, TILE_W + 1):
        for j in range(0, TILE_H + 1):
            # e.g. https://framemark.vam.ac.uk/collections/2006AW1773/768,256,256,256/256,/0/default.jpg
            url = "https://framemark.vam.ac.uk/collections/{}{}/{},{},256,256/256,/0/default.jpg".format(TAG, page, i * 256, j * 256)
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            if not os.path.exists(fn):
                dataset.append((url, fn,))

    # Fetch all the images we don't have
    if len(dataset):
        parallel_fetch(dataset)

    # Get the dimensions of the final image (interior plus corners)
    ww = max_width
    hh = max_height

    # Build the new canvas by pasting the tiles across it; tiles that
    # failed to download load as (None, 0, 0) and leave a black square.
    canvas = Image.new('RGB', (ww, hh,))
    x = 0
    for i in range(0, TILE_W + 1):
        y = 0
        for j in range(0, TILE_H + 1):
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            image, width, height = load_image(fn)
            if image:
                canvas.paste(image, (x, y))
            y += 256
        x += 256
    canvas.save(out_fn)
+
if __name__ == '__main__':
    # Ensure the output directory for this tag/zoom exists, then scrape
    # every page in the inclusive range FIRST_PAGE..LAST_PAGE.
    os.makedirs("{}/{}".format(TAG, ZOOM), exist_ok=True)
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        grab(page)
+