#!python
from math import floor
from multiprocessing import Pool
import os

import requests
from PIL import Image

TAG = 'arundel_ms_263'
LAST_PAGE = 283
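
# Tile-grid sizes at each zoom level: each zoom step roughly doubles the
# grid in both directions. grab() recomputes the grid for every page from
# its XML descriptor, so the values below are just a reference.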
ZOOM = 11
TILE_W = 4
TILE_H = 3
# ZOOM = 13
# TILE_W = 16
# TILE_H = 12
# ZOOM = 14
# TILE_W = 33
# TILE_H = 24

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

def fetch_file(url, fn, **kwargs):
    # Skip tiles that are already on disk before touching the network.
    if os.path.exists(fn):
        return None
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        return None
    size = 0
    with open(fn, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    print("{} KB. {}".format(round(size / 1024), fn))
    return None
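
# A note on fetch_file: without stream=True, requests buffers the whole
# response in memory first, and iter_content just replays it in 1 KB
# chunks; that's fine for 256-pixel tiles.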

def fetch_raw(url, **kwargs):
    try:
        resp = requests.get(url, params=kwargs, headers=headers, verify=False)
        if resp.status_code != 200:
            return None
    except requests.RequestException:
        return None
    return resp.text

# Download tiles in parallel: a pool of five workers, each handed three
# (url, fn) pairs at a time until the dataset is exhausted.
def parallel_fetch(dataset):
    print("Fetching {} tiles".format(len(dataset)))
    agents = 5
    chunksize = 3
    with Pool(processes=agents) as pool:
        pool.starmap(fetch_file, dataset, chunksize)
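
# load_image returns (None, 0, 0) for a missing or unreadable tile, so
# absent edge tiles simply contribute zero width and height downstream.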
def load_image(fn):
    try:
        image = Image.open(fn)
        width, height = image.size
        return image, width, height
    except OSError:
        return None, 0, 0

# Fetch all the tiles for one page from the tile server, then compose them
# into a single image. n is the folio number and s the side: 'r' for recto,
# 'v' for verso.
def grab(s, n):
    page = "{:03d}{}".format(n, s)
    out_fn = "./{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, ZOOM)
    if os.path.exists(out_fn):
        return
    print("Fetching {}...".format(page))
    os.makedirs("./{}/{}/{}".format(TAG, ZOOM, page), exist_ok=True)
    xml_url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}.xml".format(TAG, page)
    xml = fetch_raw(xml_url)
    if xml is None:
        print("error with page {}".format(page))
        return
    # Pull the page's full dimensions out of the XML descriptor and derive
    # the tile-grid size from them (these locals supersede the TILE_W and
    # TILE_H defaults above).
    max_width = int(xml.split('Width="')[1].split('"')[0])
    max_height = int(xml.split('Height="')[1].split('"')[0])
    tile_w = floor(max_width / ZOOM) + 1
    tile_h = floor(max_height / ZOOM) + 1
    print("{}x{}".format(tile_w, tile_h))
    # Queue up every tile we don't already have on disk.
    dataset = []
    for i in range(0, tile_w + 1):
        for j in range(0, tile_h + 1):
            url = "http://www.bl.uk/manuscripts/Proxy.ashx?view={}_f{}_files/{}/{}_{}.jpg".format(TAG, page, ZOOM, i, j)
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            if not os.path.exists(fn):
                dataset.append((url, fn))
    # Fetch all the images we don't have yet.
    if len(dataset):
        parallel_fetch(dataset)
    # Get the dimensions of the final image: interior tiles are 256px
    # square, so count 256 for each interior column and row, then add the
    # actual sizes of the corner tiles (edge tiles are usually smaller).
    ww = 256 * (tile_w - 1)
    hh = 256 * (tile_h - 1)
    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, 0, 0)
    image, width, height = load_image(fn)
    ww += width
    hh += height
    fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w, tile_h)
    image, width, height = load_image(fn)
    ww += width
    hh += height
    if image is None:
        # The bottom-right corner tile is missing: estimate the remaining
        # width and height from its neighbours in the last row and column.
        fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w - 1, tile_h)
        image, width, height = load_image(fn)
        ww += width
        fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, tile_w, tile_h - 1)
        image, width, height = load_image(fn)
        hh += height
    # Build the new canvas by pasting the tiles across it, column by
    # column, advancing by each tile's actual size.
    canvas = Image.new('RGB', (ww, hh))
    x = 0
    for i in range(0, tile_w + 1):
        y = 0
        for j in range(0, tile_h + 1):
            fn = "./{}/{}/{}/{}_{}.jpg".format(TAG, ZOOM, page, i, j)
            image, width, height = load_image(fn)
            if image:
                canvas.paste(image, (x, y))
            y += height
        x += width
    canvas.save(out_fn)

if __name__ == '__main__':
    os.makedirs("{}/{}".format(TAG, ZOOM), exist_ok=True)
    # Grab the verso and recto side of every folio.
    for n in range(1, LAST_PAGE + 1):
        grab('v', n)
        grab('r', n)
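
# A usage sketch (the script filename is hypothetical): tiles are cached
# under ./arundel_ms_263/11/<page>/ and each stitched page is written to
# ./arundel_ms_263/11/<page>_11.jpg, where <page> is e.g. 001r or 001v:
#
#     python grab_arundel.py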