{ "cells": [ { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from pathlib import Path\n", "from os.path import join" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "data = \"\"\"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
\"[ICO]\"NameLast modifiedSizeDescription

\"[DIR]\"Parent Directory  -  
\"[desccred.zip17-Nov-2015 02:21 133M 
\"[desctxt.zip17-Nov-2015 02:21 18M 
\"[descvis.zip17-Nov-2015 02:22 60M 
\"[TXT]\"devset_topics.xml17-Nov-2015 02:22 6.0K 
\"[gt.zip17-Nov-2015 02:22 91K 
\"[img.zip17-Nov-2015 02:24 1.1G 
\"[imgwiki.zip17-Nov-2015 02:24 341M 
\"[TXT]\"poiNameCorrespondences.txt17-Nov-2015 02:24 905  
\"[xml.zip17-Nov-2015 02:24 811K 

\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "def parse_urls(url, exts, doc=None):\n", "    \"\"\"Return absolute URLs for links whose file extension is in exts.\n", "\n", "    url:  base URL prefix prepended to each matching href.\n", "    exts: iterable of extensions without the leading dot, e.g. ['zip', 'txt'].\n", "    doc:  parsed BeautifulSoup document; defaults to the notebook-global\n", "          soup so existing two-argument calls keep working.\n", "    \"\"\"\n", "    doc = doc if doc is not None else soup\n", "    urls = []\n", "    for a in doc.find_all('a'):\n", "        href = a.get('href')  # skip anchors that carry no href attribute\n", "        if href and Path(href).suffix[1:] in exts:\n", "            urls.append('{}{}'.format(url, href))\n", "    return urls" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desccred.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desctxt.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/descvis.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/gt.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/img.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/imgwiki.zip\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/poiNameCorrespondences.txt\n", "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/xml.zip\n" ] } ], "source": [ "soup = BeautifulSoup(data, 'lxml')\n", "burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n", "urls = parse_urls(burl, ['zip', 'txt'], doc=soup)\n", "for u in urls:\n", "    print(u)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install lxml" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": 
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }