diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 17:07:58 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 17:07:58 +0100 |
| commit | 52baa8002c9d711f860be2dc468df4c3812b4956 (patch) | |
| tree | 6ae142c9b07869bf0f732c0e25876e6350177b1c /scrape-codex.py | |
scrape codex
Diffstat (limited to 'scrape-codex.py')
| -rw-r--r-- | scrape-codex.py | 106 |
1 file changed, 106 insertions, 0 deletions
#!python
"""Scrape a tiled manuscript viewer (British Library Proxy.ashx) and stitch
each page's tiles into a single JPEG per page side."""

from multiprocessing import Pool
import os
import requests
import time
from PIL import Image

# Manuscript identifier in the tile server's URL scheme, and total page count.
TAG = 'arundel_ms_263'
LAST_PAGE = 283

# Zoom level and the maximum tile column/row index at that zoom
# (the grid is (TILE_W + 1) x (TILE_H + 1) tiles, indices 0..TILE_W / 0..TILE_H).
ZOOM = 11
TILE_W = 4
TILE_H = 3

# ZOOM = 13
# TILE_W = 16
# TILE_H = 12

# ZOOM = 14
# TILE_W = 33
# TILE_H = 24

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

def fetch_file(url, fn, **kwargs):
    """Download ``url`` to local path ``fn``, streaming in 1 KiB chunks.

    Extra keyword arguments are passed to requests as query parameters.
    Returns None in every case; failures (non-200, network error) are
    silent by design so that a partial scrape can simply be re-run.
    """
    # Skip files we already have BEFORE issuing the request.  The original
    # performed this check only after the GET completed, which cost a full
    # network round-trip (and downloaded body) for every cached tile.
    if os.path.exists(fn):
        return None
    try:
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # as-is because the scraped host may serve a broken chain -- confirm
        # before removing.
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit (making Ctrl-C unreliable
        # inside the worker pool).
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    print("{} kb. {}".format(round(size / 1024), fn))
    return None

# Run this with a pool of 5 agents having a chunksize of 3 until finished
def parallel_fetch(dataset):
    """Fetch every (url, fn) pair in ``dataset`` with a small process pool."""
    print("Fetching {} tiles".format(len(dataset)))
    agents = 5
    chunksize = 3
    with Pool(processes=agents) as pool:
        pool.starmap(fetch_file, dataset, chunksize)

# Fetch all the tiles from a tile server and then compose them into a single image
def grab(s, n):
    """Download all tiles for page ``n``, side ``s`` ('r' recto / 'v' verso),
    and paste them into one composite JPEG saved under ./TAG/ZOOM/."""
    page = "{:03d}{}".format(n, s)
    print("Fetching {}...".format(page))
    os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)

    # Build the work list of tiles we don't have on disk yet.
    dataset = []
    for i in range(0, TILE_W + 1):
        for j in range(0, TILE_H + 1):
            url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}_files/{}/{}_{}.jpg".format(TAG, page, ZOOM, i, j)
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            if not os.path.exists(fn):
                dataset.append((url, fn,))

    # Fetch all the images we don't have
    if len(dataset):
        parallel_fetch(dataset)

    # Compute the composite dimensions: interior tiles are 258px square;
    # the first and last tiles (top-left and bottom-right corners) may be
    # smaller, so measure them individually and add their real sizes.
    # NOTE(review): if any tile failed to download, Image.open below raises
    # FileNotFoundError and aborts this page -- re-running resumes the scrape.
    ww = 258 * (TILE_W - 1)
    hh = 258 * (TILE_H - 1)

    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, 0, 0)
    image = Image.open(fn)
    width, height = image.size
    ww += width
    hh += height

    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, TILE_W, TILE_H)
    image = Image.open(fn)
    width, height = image.size
    ww += width
    hh += height

    # Build the new canvas by pasting the tiles across it, column by column.
    canvas = Image.new('RGB', (ww, hh,))
    x = 0
    for i in range(0, TILE_W + 1):
        y = 0
        for j in range(0, TILE_H + 1):
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            image = Image.open(fn)
            width, height = image.size
            canvas.paste(image, (x, y))
            y += height
        x += width
    canvas.save("./{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, ZOOM))

if __name__ == '__main__':
    os.makedirs("{}/{}".format(TAG, ZOOM), exist_ok=True)
    # Each page has a verso and a recto side.
    for n in range(1, LAST_PAGE + 1):
        grab('v', n)
        grab('r', n)
