#!python
from math import floor
from multiprocessing import Pool
import os

import requests
from PIL import Image

TAG = 'arundel_ms_263'
LAST_PAGE = 283
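
# Tile-grid sizes at each zoom level: each zoom step roughly doubles the
# grid in both directions. grab() recomputes the grid for every page from
# its XML descriptor, so the values below are just a reference.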
ZOOM = 11
TILE_W = 4
TILE_H = 3
# ZOOM = 13
# TILE_W = 16
# TILE_H = 12
# ZOOM = 14
# TILE_W = 33
# TILE_H = 24

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

def fetch_file(url, fn, **kwargs):
    # Skip tiles that are already on disk before touching the network.
    if os.path.exists(fn):
        return None
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    print("{} KB. {}".format(round(size / 1024), fn))
    return None
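
# A note on fetch_file: without stream=True, requests buffers the whole
# response in memory first, and iter_content just replays it in 1 KB
# chunks; that's fine for 256-pixel tiles.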

def fetch_raw(url, **kwargs):
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        return None
    return resp.text

# Download tiles in parallel: a pool of five workers, each handed three
# (url, fn) pairs at a time until the dataset is exhausted.
def parallel_fetch(dataset):
    print("Fetching {} tiles".format(len(dataset)))
    agents = 5
    chunksize = 3
    with Pool(processes=agents) as pool:
        pool.starmap(fetch_file, dataset, chunksize)
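
# load_image returns (None, 0, 0) for a missing or unreadable tile, so
# absent edge tiles simply contribute zero width and height downstream.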
def load_image(fn):
    try:
        image = Image.open(fn)
        width, height = image.size
        return image, width, height
    except OSError:
        return None, 0, 0

# Fetch all the tiles for one page from the tile server, then compose them
# into a single image. n is the folio number and s the side: 'r' for recto,
# 'v' for verso.
def grab(s, n):
    page = "{:03d}{}".format(n, s)
    out_fn = "./{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, ZOOM)
    if os.path.exists(out_fn):
        return
    print("Fetching {}...".format(page))
    os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)
    xml_url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}.xml".format(TAG, page)
    xml = fetch_raw(xml_url)
    if xml is None:
        print("error with page {}".format(page))
        return
    # Pull the page's full dimensions out of the XML descriptor and derive
    # the tile-grid size from them (these locals supersede the TILE_W and
    # TILE_H defaults above).
    max_width = int(xml.split('Width="')[1].split('"')[0])
    max_height = int(xml.split('Height="')[1].split('"')[0])
    tile_w = floor(max_width / ZOOM) + 1
    tile_h = floor(max_height / ZOOM) + 1
    print("{}x{}".format(tile_w, tile_h))
    # Queue up every tile we don't already have on disk.
    dataset = []
    for i in range(0, tile_w + 1):
        for j in range(0, tile_h + 1):
            url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}_files/{}/{}_{}.jpg".format(TAG, page, ZOOM, i, j)
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            if not os.path.exists(fn):
                dataset.append((url, fn))
    # Fetch all the images we don't have yet.
    if len(dataset):
        parallel_fetch(dataset)
    # Get the dimensions of the final image: interior tiles are 256px
    # square, so count 256 for each interior column and row, then add the
    # actual sizes of the corner tiles (edge tiles are usually smaller).
    ww = 256 * (tile_w - 1)
    hh = 256 * (tile_h - 1)
    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, 0, 0)
    image, width, height = load_image(fn)
    ww += width
    hh += height
    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w, tile_h)
    image, width, height = load_image(fn)
    ww += width
    hh += height
    if image is None:
        # The bottom-right corner tile is missing: estimate the remaining
        # width and height from its neighbours in the last row and column.
        fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w - 1, tile_h)
        image, width, height = load_image(fn)
        ww += width
        fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w, tile_h - 1)
        image, width, height = load_image(fn)
        hh += height
    # Build the new canvas by pasting the tiles across it, column by
    # column, advancing by each tile's actual size.
    canvas = Image.new('RGB', (ww, hh))
    x = 0
    for i in range(0, tile_w + 1):
        y = 0
        for j in range(0, tile_h + 1):
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            image, width, height = load_image(fn)
            if image:
                canvas.paste(image, (x, y))
            y += height
        x += width
    canvas.save(out_fn)

if __name__ == '__main__':
    os.makedirs("{}/{}".format(TAG, ZOOM), exist_ok=True)
    # Grab the verso and recto side of every folio.
    for n in range(1, LAST_PAGE + 1):
        grab('v', n)
        grab('r', n)
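
# A usage sketch (the script filename is hypothetical): tiles are cached
# under ./arundel_ms_263/11/<page>/ and each stitched page is written to
# ./arundel_ms_263/11/<page>_11.jpg, where <page> is e.g. 001r or 001v:
#
#     python grab_arundel.py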