summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/bs4_scratch.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/bs4_scratch.ipynb')
-rw-r--r--megapixels/notebooks/bs4_scratch.ipynb127
1 files changed, 127 insertions, 0 deletions
diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb
new file mode 100644
index 00000000..dce0ddc2
--- /dev/null
+++ b/megapixels/notebooks/bs4_scratch.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from os.path import join\n",
+ "from pathlib import Path\n",
+ "from urllib.parse import urljoin\n",
+ "\n",
+ "from bs4 import BeautifulSoup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sample HTML directory-listing table (Name / Last modified / Size / Description columns)\n",
+ "# kept inline so the link-extraction cells below can run without a network request.\n",
+ "data = \"\"\"\n",
+ "<table><tr><th><img src=\"/icons/blank.gif\" alt=\"[ICO]\"></th><th><a href=\"?C=N;O=D\">Name</a></th><th><a href=\"?C=M;O=A\">Last modified</a></th><th><a href=\"?C=S;O=A\">Size</a></th><th><a href=\"?C=D;O=A\">Description</a></th></tr><tr><th colspan=\"5\"><hr></th></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/back.gif\" alt=\"[DIR]\"></td><td><a href=\"/traces/mmsys/2015/paper-5/\">Parent Directory</a></td><td>&nbsp;</td><td align=\"right\"> - </td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desccred.zip\">desccred.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\">133M</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desctxt.zip\">desctxt.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\"> 18M</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"descvis.zip\">descvis.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 60M</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"devset_topics.xml\">devset_topics.xml</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\">6.0K</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"gt.zip\">gt.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 91K</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"img.zip\">img.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">1.1G</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"imgwiki.zip\">imgwiki.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">341M</td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"poiNameCorrespondences.txt\">poiNameCorrespondences.txt</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">905 </td><td>&nbsp;</td></tr>\n",
+ "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"xml.zip\">xml.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">811K</td><td>&nbsp;</td></tr>\n",
+ "<tr><th colspan=\"5\"><hr></th></tr>\n",
+ "</table>\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_urls(url, exts, doc=None):\n",
+ "    \"\"\"Collect absolute URLs of links whose file extension is in `exts`.\n",
+ "\n",
+ "    Args:\n",
+ "        url: Base URL each href is resolved against with `urljoin`; a\n",
+ "            directory URL must end with '/' to keep its last path segment.\n",
+ "        exts: One extension or an iterable of extensions, with or without\n",
+ "            the leading dot (e.g. 'zip' or ['.zip', 'txt']). Case-sensitive.\n",
+ "        doc: Parsed document exposing `find_all`. Defaults to the\n",
+ "            notebook-level `soup` so existing two-argument calls still work.\n",
+ "\n",
+ "    Returns:\n",
+ "        List of resolved URLs, in document order.\n",
+ "    \"\"\"\n",
+ "    tree = soup if doc is None else doc\n",
+ "    # A bare string would turn `in` into a substring test ('' in 'zip' is True),\n",
+ "    # so wrap it before normalising away any leading dots.\n",
+ "    wanted = {e.lstrip('.') for e in ([exts] if isinstance(exts, str) else exts)}\n",
+ "    urls = []\n",
+ "    # href=True skips anchors without an href instead of raising KeyError.\n",
+ "    for a in tree.find_all('a', href=True):\n",
+ "        href = a['href']\n",
+ "        if Path(href).suffix[1:] in wanted:\n",
+ "            # urljoin also handles root-relative and absolute hrefs correctly.\n",
+ "            urls.append(urljoin(url, href))\n",
+ "    return urls"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desccred.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desctxt.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/descvis.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/gt.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/img.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/imgwiki.zip\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/poiNameCorrespondences.txt\n",
+ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/xml.zip\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Base URL used to turn the hrefs found in the listing into absolute download links.\n",
+ "burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n",
+ "soup = BeautifulSoup(data, features='lxml')\n",
+ "\n",
+ "# Keep only archive and text files, then show one URL per line.\n",
+ "urls = parse_urls(burl, exts=['zip', 'txt'])\n",
+ "for link in urls:\n",
+ "    print(link)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# lxml is the parser BeautifulSoup is asked to use in the parsing cell above;\n",
+ "# run this once, before that cell, if the environment does not already provide it.\n",
+ "%pip install lxml"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}