summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- scrape-codex.py 36
1 files changed, 26 insertions, 10 deletions
diff --git a/scrape-codex.py b/scrape-codex.py
index 6a7f9b9..c452a62 100644
--- a/scrape-codex.py
+++ b/scrape-codex.py
@@ -43,6 +43,15 @@ def fetch_file(url, fn, **kwargs):
print("{} kb. {}".format(round(size / 1024), fn))
return None
+def fetch_raw(url, fn=None, **kwargs):
+    """GET url (kwargs become query params); return body text, or None on request error or non-200."""
+    # fn is never read; it is optional so that fetch_raw(url) is a valid call.
+    try:
+        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
+    except requests.RequestException:
+        return None
+    return resp.text if resp.status_code == 200 else None
+
# Run this with a pool of 5 agents having a chunksize of 3 until finished
def parallel_fetch(dataset):
print("Fetching {} tiles".format(len(dataset)))
@@ -68,6 +77,19 @@ def grab(s, n):
print("Fetching {}...".format(page))
os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)
+
+ xml_url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}.xml".format(TAG, page)
+ xml = fetch_raw(xml_url)
+ if xml is None:
+ print("error with page {}".format(page))
+ return
+ max_width = int(xml.split('Width="')[1].split('"')[0])
+ max_height = int(xml.split('Height="')[1].split('"')[0])
+
+ TILE_W = floor(max_width / ZOOM) + 1
+ TILE_H = floor(max_height / ZOOM) + 1
+ print("{}x{}".format(TILE_W, TILE_H))
+
dataset = []
for i in range(0, TILE_W + 1):
for j in range(0, TILE_H + 1):
@@ -81,8 +103,8 @@ def grab(s, n):
parallel_fetch(dataset)
# Get the dimensions of the final image (interior plus corners)
- ww = 258 * (TILE_W - 1)
- hh = 258 * (TILE_H - 1)
+ ww = 256 * (TILE_W - 1)
+ hh = 256 * (TILE_H - 1)
start_w = 0
end_w = 0
@@ -120,14 +142,8 @@ def grab(s, n):
image, width, height = load_image(fn)
if image:
canvas.paste(image, (x, y))
- if j == 0:
- y += start_h
- else:
- y += height
- if i == 0:
- x += start_w
- else:
- x += width
+ y += height
+ x += width
canvas.save(out_fn)
if __name__ == '__main__':