diff options
Diffstat (limited to 'megapixels/notebooks/bs4_scratch.ipynb')
| -rw-r--r-- | megapixels/notebooks/bs4_scratch.ipynb | 290 |
1 files changed, 290 insertions, 0 deletions
diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb new file mode 100644 index 00000000..e63d286f --- /dev/null +++ b/megapixels/notebooks/bs4_scratch.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "from pathlib import Path\n", + "from os.path import join" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "data = \"\"\"\n", + "<div class=\"entry-content\">\n", + "<h2 id=\"annotations\" class=\"offset\">Annotations</h2>\n", + "<p>Below are examples of our different annotations within the dataset. Every pedestrian, cyclist and motorcyclist (higher than 50px) in every frame is annotated with a bounding box, along side with three attributes: occlusion, difficult (low contrast or unusual posture) and pose. People on posters, sculptures and groups where individuals are hard to seperate are marked as “ignore”.</p>\n", + "<style type=\"text/css\">\n", + "\t\t\t#gallery-3 {\n", + "\t\t\t\tmargin: auto;\n", + "\t\t\t}\n", + "\t\t\t#gallery-3 .gallery-item {\n", + "\t\t\t\tfloat: left;\n", + "\t\t\t\tmargin-top: 10px;\n", + "\t\t\t\ttext-align: center;\n", + "\t\t\t\twidth: 25%;\n", + "\t\t\t}\n", + "\t\t\t#gallery-3 img {\n", + "\t\t\t\tborder: 2px solid #cfcfcf;\n", + "\t\t\t}\n", + "\t\t\t#gallery-3 .gallery-caption {\n", + "\t\t\t\tmargin-left: 0;\n", + "\t\t\t}\n", + "\t\t\t/* see gallery_shortcode() in wp-includes/media.php */\n", + "\t\t</style>\n", + "<h2 id=\"pedestrian\" class=\"offset\">Pedestrian</h2>\n", + "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" 
data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-81\">\n", + "\t\t\t\tHigh frequency of pedestrians\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-82\">\n", + "\t\t\t\tDark scenes with low contrast\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-83\">\n", + "\t\t\t\tOccluded pedestrians\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" 
alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-85\">\n", + "\t\t\t\tSideview of crossing pedestrians\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<p><br style=\"clear: both\">\n", + "\t\t</p></div>\n", + "<h2 id=\"bicycle\" class=\"offset\">Bicycledriver and Motorbikedriver</h2>\n", + "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", + "\t\t\t</dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-86\">\n", + "\t\t\t\tBicycle drivers from back including glare\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-87\">\n", + "\t\t\t\tBicycledriver sideways\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" 
href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-88\">\n", + "\t\t\t\tScenes with mixed annotations\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-89\">\n", + "\t\t\t\tMultiple bicycle drivers\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<p><br style=\"clear: both\">\n", + "\t\t</p></div>\n", + "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", + "\t\t\t</dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-90\">\n", + "\t\t\t\tMotorbikedrivers from back\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", 
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-91\">\n", + "\t\t\t\tMultiple motorbikedrives in a scene\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-92\">\n", + "\t\t\t\tMotorbikedrivers in traffic including glare\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-93\">\n", + "\t\t\t\tMotorbike driver followed during several frames\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<p><br style=\"clear: both\">\n", + "\t\t</p></div>\n", + "<h2 id=\"ignore\" class=\"offset\">Ignore</h2>\n", + "<div 
id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", + "\t\t\t</dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-94\">\n", + "\t\t\t\tIgnore larger group of pedestrians that can not be distinguished\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-95\">\n", + "\t\t\t\tIgnore confusing traffic signs\n", + "\t\t\t\t</dd>\n", + "</dl>\n", + "<dl class=\"gallery-item\">\n", + "<dt class=\"gallery-icon landscape\">\n", + "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", + "\t\t\t</p></dt>\n", + "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-96\">\n", + "\t\t\t\tIgnore irrelevant people like passenger\n", + 
def parse_urls(url, exts):
    """Collect link targets whose file extension is listed in ``exts``.

    Reads the module-level ``soup`` (the parsed document), which must be
    assigned before this function is called.

    Args:
        url: Prefix prepended verbatim to every matching ``href``. Pass ``''``
            when the document already contains absolute URLs.
        exts: Container of extensions without the leading dot, e.g.
            ``['jpg', 'txt']``. Matching is case-sensitive.

    Returns:
        List of ``url + href`` strings in document order.
    """
    # Links live on <a> tags. The previous selector was 'img', and indexing
    # an <img> with ['href'] raises KeyError. ``href=True`` additionally skips
    # anchors that carry no href attribute at all.
    anchors = soup.find_all('a', href=True)
    return [
        '{}{}'.format(url, anchor['href'])
        for anchor in anchors
        if Path(anchor['href']).suffix[1:] in exts
    ]


def parse_images(url, exts):
    """Collect image sources whose file extension is listed in ``exts``.

    Reads the module-level ``soup`` (the parsed document), which must be
    assigned before this function is called.

    Args:
        url: Prefix prepended verbatim to every matching ``src``. Pass ``''``
            when the document already contains absolute URLs.
        exts: Container of extensions without the leading dot, e.g.
            ``['jpg', 'png', 'gif']``. Matching is case-sensitive.

    Returns:
        List of ``url + src`` strings in document order.
    """
    # ``src=True`` skips <img> tags without a src attribute, which would
    # otherwise raise KeyError on the lookup below.
    images = soup.find_all('img', src=True)
    return [
        '{}{}'.format(url, image['src'])
        for image in images
        if Path(image['src']).suffix[1:] in exts
    ]
# Build the parse tree; parse_urls() and parse_images() read it through the
# module-level name `soup`.
soup = BeautifulSoup(data, 'lxml')

# Prefix prepended to every match. Empty here; alternative value kept for reference:
#   burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'
burl = ''

# Alternative: collect link targets instead of image sources.
#   tags = parse_urls(burl, ['jpg', 'txt'])
tags = parse_images(burl, ['jpg', 'png', 'gif'])
for image_url in tags:
    print(image_url)

# Disabled experiment, kept for reference:
# for row in rows:
#     ahrefs = row.find_all(href=True)
#     for a in ahrefs:
#         href = a['href']
#         if 'zip' in href:
#             url = 'http://{}'.format(Path(join(url_root, Path(href).name)))
#             print(url)
+ "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
