summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/bs4_scratch.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/bs4_scratch.ipynb')
-rw-r--r--megapixels/notebooks/bs4_scratch.ipynb290
1 files changed, 290 insertions, 0 deletions
diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb
new file mode 100644
index 00000000..e63d286f
--- /dev/null
+++ b/megapixels/notebooks/bs4_scratch.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from bs4 import BeautifulSoup\n",
+ "from pathlib import Path\n",
+ "from os.path import join"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = \"\"\"\n",
+ "<div class=\"entry-content\">\n",
+ "<h2 id=\"annotations\" class=\"offset\">Annotations</h2>\n",
+ "<p>Below are examples of our different annotations within the dataset. Every pedestrian, cyclist and motorcyclist (higher than 50px) in every frame is annotated with a bounding box, along side with three attributes: occlusion, difficult (low contrast or unusual posture) and pose. People on posters, sculptures and groups where individuals are hard to seperate are marked as “ignore”.</p>\n",
+ "<style type=\"text/css\">\n",
+ "\t\t\t#gallery-3 {\n",
+ "\t\t\t\tmargin: auto;\n",
+ "\t\t\t}\n",
+ "\t\t\t#gallery-3 .gallery-item {\n",
+ "\t\t\t\tfloat: left;\n",
+ "\t\t\t\tmargin-top: 10px;\n",
+ "\t\t\t\ttext-align: center;\n",
+ "\t\t\t\twidth: 25%;\n",
+ "\t\t\t}\n",
+ "\t\t\t#gallery-3 img {\n",
+ "\t\t\t\tborder: 2px solid #cfcfcf;\n",
+ "\t\t\t}\n",
+ "\t\t\t#gallery-3 .gallery-caption {\n",
+ "\t\t\t\tmargin-left: 0;\n",
+ "\t\t\t}\n",
+ "\t\t\t/* see gallery_shortcode() in wp-includes/media.php */\n",
+ "\t\t</style>\n",
+ "<h2 id=\"pedestrian\" class=\"offset\">Pedestrian</h2>\n",
+ "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-81\">\n",
+ "\t\t\t\tHigh frequency of pedestrians\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-82\">\n",
+ "\t\t\t\tDark scenes with low contrast\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-83\">\n",
+ "\t\t\t\tOccluded pedestrians\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-85\">\n",
+ "\t\t\t\tSideview of crossing pedestrians\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<p><br style=\"clear: both\">\n",
+ "\t\t</p></div>\n",
+ "<h2 id=\"bicycle\" class=\"offset\">Bicycledriver and Motorbikedriver</h2>\n",
+ "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n",
+ "\t\t\t</dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-86\">\n",
+ "\t\t\t\tBicycle drivers from back including glare\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-87\">\n",
+ "\t\t\t\tBicycledriver sideways\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-88\">\n",
+ "\t\t\t\tScenes with mixed annotations\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-89\">\n",
+ "\t\t\t\tMultiple bicycle drivers\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<p><br style=\"clear: both\">\n",
+ "\t\t</p></div>\n",
+ "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n",
+ "\t\t\t</dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-90\">\n",
+ "\t\t\t\tMotorbikedrivers from back\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-91\">\n",
+ "\t\t\t\tMultiple motorbikedrives in a scene\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-92\">\n",
+ "\t\t\t\tMotorbikedrivers in traffic including glare\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-93\">\n",
+ "\t\t\t\tMotorbike driver followed during several frames\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<p><br style=\"clear: both\">\n",
+ "\t\t</p></div>\n",
+ "<h2 id=\"ignore\" class=\"offset\">Ignore</h2>\n",
+ "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n",
+ "\t\t\t</dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-94\">\n",
+ "\t\t\t\tIgnore larger group of pedestrians that can not be distinguished\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-95\">\n",
+ "\t\t\t\tIgnore confusing traffic signs\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-96\">\n",
+ "\t\t\t\tIgnore irrelevant people like passenger\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<dl class=\"gallery-item\">\n",
+ "<dt class=\"gallery-icon landscape\">\n",
+ "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n",
+ "\t\t\t</p></dt>\n",
+ "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-97\">\n",
+ "\t\t\t\tIgnore advertisements and billboards that may include target objects\n",
+ "\t\t\t\t</dd>\n",
+ "</dl>\n",
+ "<p><br style=\"clear: both\">\n",
+ "\t\t</p></div>\n",
+ "<p>&nbsp;</p>\n",
+ "</div>\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_urls(url, exts):\n",
+ "    \"\"\"Collect link targets from the global ``soup``, filtered by file extension.\n",
+ "\n",
+ "    Reads the module-level ``soup`` object, which must be assigned before the call.\n",
+ "\n",
+ "    Args:\n",
+ "        url: Prefix prepended verbatim to every matching ``href``.\n",
+ "        exts: Container of extensions without the leading dot, e.g. ``['jpg', 'txt']``.\n",
+ "            The comparison is case-sensitive.\n",
+ "\n",
+ "    Returns:\n",
+ "        List of ``url + href`` strings in document order.\n",
+ "    \"\"\"\n",
+ "    # Anchors are the tags that carry href; img tags only have src, so indexing\n",
+ "    # them with ['href'] raises KeyError. href=True also skips anchors that\n",
+ "    # have no href attribute at all.\n",
+ "    anchors = soup.find_all('a', href=True)\n",
+ "    urls = []\n",
+ "    for a in anchors:\n",
+ "        href = a['href']\n",
+ "        # Path(...).suffix includes the leading dot; strip it before comparing.\n",
+ "        if Path(href).suffix[1:] in exts:\n",
+ "            urls.append('{}{}'.format(url, href))\n",
+ "    return urls\n",
+ "\n",
+ "def parse_images(url, exts):\n",
+ "    \"\"\"Return the ``url``-prefixed ``src`` of every img tag in the global ``soup``.\n",
+ "\n",
+ "    A source is kept only when its file suffix (leading dot stripped) is a\n",
+ "    member of ``exts``. Results follow document order.\n",
+ "    \"\"\"\n",
+ "    sources = (tag['src'] for tag in soup.find_all('img'))\n",
+ "    return ['{}{}'.format(url, src) for src in sources if Path(src).suffix[1:] in exts]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\n",
+ "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Parse the sample markup held in `data` using the lxml backend.\n",
+ "soup = BeautifulSoup(data, 'lxml')\n",
+ "\n",
+ "# Prefix prepended to every extracted path. Left empty here because the src\n",
+ "# values in `data` are already absolute URLs.\n",
+ "burl = ''\n",
+ "\n",
+ "# Disabled invocation from an earlier experiment against another page\n",
+ "# (kept for reference):\n",
+ "#   burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n",
+ "#   tags = parse_urls(burl, ['jpg', 'txt'])\n",
+ "tags = parse_images(burl, ['jpg', 'png', 'gif'])\n",
+ "for t in tags:\n",
+ "    print(t)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}