diff options
Diffstat (limited to 'scrape-codex.py')
| -rw-r--r-- | scrape-codex.py | 36 |
1 files changed, 26 insertions, 10 deletions
diff --git a/scrape-codex.py b/scrape-codex.py index 6a7f9b9..c452a62 100644 --- a/scrape-codex.py +++ b/scrape-codex.py @@ -43,6 +43,15 @@ def fetch_file(url, fn, **kwargs): print("{} kb. {}".format(round(size / 1024), fn)) return None +def fetch_raw(url, fn, **kwargs): + try: + resp = requests.get(url, params=kwargs, headers=headers, verify=False) + if resp.status_code != 200: + return None + except: + return None + return resp.text + # Run this with a pool of 5 agents having a chunksize of 3 until finished def parallel_fetch(dataset): print("Fetching {} tiles".format(len(dataset))) @@ -68,6 +77,19 @@ def grab(s, n): print("Fetching {}...".format(page)) os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True) + + xml_url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}.xml".format(TAG, page) + xml = fetch_raw(xml_url) + if xml is None: + print("error with page {}".format(page)) + return + max_width = int(xml.split('Width="')[1].split('"')[0]) + max_height = int(xml.split('Height="')[1].split('"')[0]) + + TILE_W = floor(max_width / ZOOM) + 1 + TILE_H = floor(max_height / ZOOM) + 1 + print("{}x{}".format(TILE_W, TILE_H)) + dataset = [] for i in range(0, TILE_W + 1): for j in range(0, TILE_H + 1): @@ -81,8 +103,8 @@ def grab(s, n): parallel_fetch(dataset) # Get the dimensions of the final image (interior plus corners) - ww = 258 * (TILE_W - 1) - hh = 258 * (TILE_H - 1) + ww = 256 * (TILE_W - 1) + hh = 256 * (TILE_H - 1) start_w = 0 end_w = 0 @@ -120,14 +142,8 @@ def grab(s, n): image, width, height = load_image(fn) if image: canvas.paste(image, (x, y)) - if j == 0: - y += start_h - else: - y += height - if i == 0: - x += start_w - else: - x += width + y += height + x += width canvas.save(out_fn) if __name__ == '__main__': |
