1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
{
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from pathlib import Path\n",
"from os.path import join"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"data = \"\"\"\n",
"<table><tr><th><img src=\"/icons/blank.gif\" alt=\"[ICO]\"></th><th><a href=\"?C=N;O=D\">Name</a></th><th><a href=\"?C=M;O=A\">Last modified</a></th><th><a href=\"?C=S;O=A\">Size</a></th><th><a href=\"?C=D;O=A\">Description</a></th></tr><tr><th colspan=\"5\"><hr></th></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/back.gif\" alt=\"[DIR]\"></td><td><a href=\"/traces/mmsys/2015/paper-5/\">Parent Directory</a></td><td> </td><td align=\"right\"> - </td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desccred.zip\">desccred.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\">133M</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desctxt.zip\">desctxt.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\"> 18M</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"descvis.zip\">descvis.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 60M</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"devset_topics.xml\">devset_topics.xml</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\">6.0K</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"gt.zip\">gt.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 91K</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"img.zip\">img.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">1.1G</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"imgwiki.zip\">imgwiki.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">341M</td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"poiNameCorrespondences.txt\">poiNameCorrespondences.txt</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">905 </td><td> </td></tr>\n",
"<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"xml.zip\">xml.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">811K</td><td> </td></tr>\n",
"<tr><th colspan=\"5\"><hr></th></tr>\n",
"</table>\n",
"\"\"\""
]
},
{
 "cell_type": "code",
 "execution_count": 53,
 "metadata": {},
 "outputs": [],
 "source": [
  "def parse_urls(url, exts, doc=None):\n",
  "    \"\"\"Collect download URLs for anchors whose file extension is in `exts`.\n",
  "\n",
  "    Args:\n",
  "        url: Base URL; each matching href is appended to it verbatim\n",
  "            (plain string concatenation, no path joining is performed).\n",
  "        exts: Container of extensions without the leading dot,\n",
  "            e.g. ['zip', 'txt']. Matching is case-sensitive.\n",
  "        doc: Optional parsed document exposing `find_all`. When omitted,\n",
  "            the notebook-level `soup` variable is used, so `soup` must be\n",
  "            defined before this function is called.\n",
  "\n",
  "    Returns:\n",
  "        list of str: `url + href` for every matching anchor, in the order\n",
  "        `find_all` yields them.\n",
  "    \"\"\"\n",
  "    tree = soup if doc is None else doc\n",
  "    # href=True skips anchors that carry no href attribute; indexing\n",
  "    # a['href'] on those would raise KeyError.\n",
  "    return [\n",
  "        '{}{}'.format(url, a['href'])\n",
  "        for a in tree.find_all('a', href=True)\n",
  "        if Path(a['href']).suffix[1:] in exts\n",
  "    ]"
 ]
},
{
 "cell_type": "code",
 "execution_count": 55,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desccred.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desctxt.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/descvis.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/gt.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/img.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/imgwiki.zip\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/poiNameCorrespondences.txt\n",
    "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/xml.zip\n"
   ]
  }
 ],
 "source": [
  "# Parse the directory-listing HTML held in `data` using the lxml backend.\n",
  "soup = BeautifulSoup(data, features='lxml')\n",
  "\n",
  "# Base URL that every matching href is appended to.\n",
  "burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n",
  "\n",
  "# Keep only the .zip and .txt links and print one full URL per line.\n",
  "urls = parse_urls(burl, ['zip', 'txt'])\n",
  "for u in urls:\n",
  "    print(u)"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install lxml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|