From 90abf459d1df1f21960c1d653a1f936d1ec30256 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Wed, 5 Dec 2018 12:00:15 +0100 Subject: . --- megapixels/notebooks/bs4_scratch.ipynb | 127 +++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 megapixels/notebooks/bs4_scratch.ipynb (limited to 'megapixels/notebooks/bs4_scratch.ipynb') diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb new file mode 100644 index 00000000..dce0ddc2 --- /dev/null +++ b/megapixels/notebooks/bs4_scratch.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "from pathlib import Path\n", + "from os.path import join" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "data = \"\"\"\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\"[ICO]\"NameLast modifiedSizeDescription

\"[DIR]\"Parent Directory  -  
\"[desccred.zip17-Nov-2015 02:21 133M 
\"[desctxt.zip17-Nov-2015 02:21 18M 
\"[descvis.zip17-Nov-2015 02:22 60M 
\"[TXT]\"devset_topics.xml17-Nov-2015 02:22 6.0K 
\"[gt.zip17-Nov-2015 02:22 91K 
\"[img.zip17-Nov-2015 02:24 1.1G 
\"[imgwiki.zip17-Nov-2015 02:24 341M 
\"[TXT]\"poiNameCorrespondences.txt17-Nov-2015 02:24 905  
\"[xml.zip17-Nov-2015 02:24 811K 

\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_urls(url, exts):\n", + " ahrefs = soup.find_all('a')\n", + " urls = []\n", + " for a in ahrefs:\n", + " href = a['href']\n", + " if Path(href).suffix[1:] in exts:\n", + " urls.append('{}{}'.format(url, href))\n", + " return urls" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desccred.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/desctxt.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/descvis.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/gt.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/img.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/imgwiki.zip\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/poiNameCorrespondences.txt\n", + "http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/xml.zip\n" + ] + } + ], + "source": [ + "soup = BeautifulSoup(data,'lxml')\n", + "burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n", + "urls = parse_urls(burl, ['zip', 'txt'])\n", + "for u in urls:\n", + " print(u)\n", + "\n", + "# for row in rows:\n", + "# ahrefs = row.find_all(href=True)\n", + "# for a in ahrefs:\n", + "# href = a['href']\n", + "# if 'zip' in href:\n", + "# url = 'http://{}'.format(Path(join(url_root, Path(href).name)))\n", + "# print(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3-70-g09d2