diff options
Diffstat (limited to 'megapixels/notebooks/bs4_scratch.ipynb')
| -rw-r--r-- | megapixels/notebooks/bs4_scratch.ipynb | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb
new file mode 100644
index 00000000..dce0ddc2
--- /dev/null
+++ b/megapixels/notebooks/bs4_scratch.ipynb
@@ -0,0 +1,137 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install lxml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "from pathlib import Path\n",
+    "from os.path import join\n",
+    "from urllib.parse import urljoin"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = \"\"\"\n",
+    "<table><tr><th><img src=\"/icons/blank.gif\" alt=\"[ICO]\"></th><th><a href=\"?C=N;O=D\">Name</a></th><th><a href=\"?C=M;O=A\">Last modified</a></th><th><a href=\"?C=S;O=A\">Size</a></th><th><a href=\"?C=D;O=A\">Description</a></th></tr><tr><th colspan=\"5\"><hr></th></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/back.gif\" alt=\"[DIR]\"></td><td><a href=\"/traces/mmsys/2015/paper-5/\">Parent Directory</a></td><td> </td><td align=\"right\"> - </td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desccred.zip\">desccred.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\">133M</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"desctxt.zip\">desctxt.zip</a></td><td align=\"right\">17-Nov-2015 02:21 </td><td align=\"right\"> 18M</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"descvis.zip\">descvis.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 60M</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"devset_topics.xml\">devset_topics.xml</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\">6.0K</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"gt.zip\">gt.zip</a></td><td align=\"right\">17-Nov-2015 02:22 </td><td align=\"right\"> 91K</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"img.zip\">img.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">1.1G</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"imgwiki.zip\">imgwiki.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">341M</td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/text.gif\" alt=\"[TXT]\"></td><td><a href=\"poiNameCorrespondences.txt\">poiNameCorrespondences.txt</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">905 </td><td> </td></tr>\n",
+    "<tr><td valign=\"top\"><img src=\"/icons/compressed.gif\" alt=\"[ ]\"></td><td><a href=\"xml.zip\">xml.zip</a></td><td align=\"right\">17-Nov-2015 02:24 </td><td align=\"right\">811K</td><td> </td></tr>\n",
+    "<tr><th colspan=\"5\"><hr></th></tr>\n",
+    "</table>\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_urls(url, exts, doc=None):\n",
+    "    \"\"\"Return absolute URLs for the links in `doc` whose extension is in `exts`.\n",
+    "\n",
+    "    url  -- base URL of the listing page; each href is resolved against it\n",
+    "    exts -- extensions to keep, without the leading dot, e.g. ['zip', 'txt']\n",
+    "    doc  -- parsed BeautifulSoup document to search; defaults to the\n",
+    "            notebook-level `soup` so two-argument calls keep working\n",
+    "    \"\"\"\n",
+    "    if doc is None:\n",
+    "        doc = soup\n",
+    "    urls = []\n",
+    "    # href=True skips anchors without an href instead of raising KeyError\n",
+    "    for a in doc.find_all('a', href=True):\n",
+    "        href = a['href']\n",
+    "        if Path(href).suffix[1:] in exts:\n",
+    "            urls.append(urljoin(url, href))\n",
+    "    return urls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desccred.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desctxt.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/descvis.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/gt.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/img.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/imgwiki.zip\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/poiNameCorrespondences.txt\n",
+      "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/xml.zip\n"
+     ]
+    }
+   ],
+   "source": [
+    "soup = BeautifulSoup(data,'lxml')\n",
+    "burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n",
+    "urls = parse_urls(burl, ['zip', 'txt'], soup)\n",
+    "for u in urls:\n",
+    "    print(u)\n",
+    "\n",
+    "# for row in rows:\n",
+    "#     ahrefs = row.find_all(href=True)\n",
+    "#     for a in ahrefs:\n",
+    "#         href = a['href']\n",
+    "#         if 'zip' in href:\n",
+    "#             url = 'http://{}'.format(Path(join(url_root, Path(href).name)))\n",
+    "#             print(url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:megapixels]",
+   "language": "python",
+   "name": "conda-env-megapixels-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
