From a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Tue, 12 Feb 2019 15:18:46 +0100 Subject: reorder nbs --- megapixels/notebooks/bs4_scratch.ipynb | 577 +++++++++++++++++++++------------ 1 file changed, 377 insertions(+), 200 deletions(-) (limited to 'megapixels/notebooks/bs4_scratch.ipynb') diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb index e63d286f..eeaa618d 100644 --- a/megapixels/notebooks/bs4_scratch.ipynb +++ b/megapixels/notebooks/bs4_scratch.ipynb @@ -2,189 +2,107 @@ "cells": [ { "cell_type": "code", - "execution_count": 56, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from pathlib import Path\n", - "from os.path import join" + "from os.path import join\n", + "import urllib.request\n", + "import lxml\n", + "from functools import partial\n", + "from multiprocessing.dummy import Pool as ThreadPool\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "%reload_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ - "data = \"\"\"\n", - "
\n", - "

Annotations

\n", - "

Below are examples of our different annotations within the dataset. Every pedestrian, cyclist and motorcyclist (higher than 50px) in every frame is annotated with a bounding box, along side with three attributes: occlusion, difficult (low contrast or unusual posture) and pose. People on posters, sculptures and groups where individuals are hard to seperate are marked as “ignore”.

\n", - "\n", - "

Pedestrian

\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tHigh frequency of pedestrians\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tDark scenes with low contrast\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tOccluded pedestrians\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tSideview of crossing pedestrians\n", - "\t\t\t\t
\n", - "
\n", - "


\n", - "\t\t

\n", - "

Bicycledriver and Motorbikedriver

\n", - "
\n", - "
\n", - "
\n", - "\t\t\t\t\t\t\t\t\"\"\n", - "\t\t\t
\n", - "
\n", - "\t\t\t\tBicycle drivers from back including glare\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tBicycledriver sideways\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tScenes with mixed annotations\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tMultiple bicycle drivers\n", - "\t\t\t\t
\n", - "
\n", - "


\n", - "\t\t

\n", - "
\n", - "
\n", - "
\n", - "\t\t\t\t\t\t\t\t\"\"\n", - "\t\t\t
\n", - "
\n", - "\t\t\t\tMotorbikedrivers from back\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tMultiple motorbikedrives in a scene\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tMotorbikedrivers in traffic including glare\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tMotorbike driver followed during several frames\n", - "\t\t\t\t
\n", - "
\n", - "


\n", - "\t\t

\n", - "

Ignore

\n", - "
\n", - "
\n", - "
\n", - "\t\t\t\t\t\t\t\t\"\"\n", - "\t\t\t
\n", - "
\n", - "\t\t\t\tIgnore larger group of pedestrians that can not be distinguished\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tIgnore confusing traffic signs\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tIgnore irrelevant people like passenger\n", - "\t\t\t\t
\n", - "
\n", - "
\n", - "
\n", - "

\t\t\t\t\"\"\n", - "\t\t\t

\n", - "
\n", - "\t\t\t\tIgnore advertisements and billboards that may include target objects\n", - "\t\t\t\t
\n", - "
\n", - "


\n", - "\t\t

\n", - "

 

\n", - "
\n", - "\"\"\"" + "fp_html = '/data_store/datasets/people/vgg_face/downloads/celeb_list.html'\n", + "with open(fp_html,'r') as fp:\n", + " data = fp.readlines()\n", + "data = data[0]\n", + "#data = urllib.request.urlopen('https://www.youtube.com/watch?v=396sRuzVrHQ').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "lines = data.split('
')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "names = [x.split('  ')[1].strip() for x in lines]" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "data_error2 = urllib.request.urlopen('https://www.youtube.com/watch?v=XAuCRa3ySfU').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no desc\n", + "\n", + "\n", + "\n", + "\n", + " This video is unavailable.\n", + "\n", + " \n", + "\n", + "Sorry about that.\n", + " \n", + "\n", + "\n" + ] + } + ], + "source": [ + "soup = BeautifulSoup(data_error2, 'lxml')\n", + "desc_result = soup.find('p', attrs={'id': 'eow-description'})\n", + "if desc_result:\n", + " print(desc_result.text)\n", + "else:\n", + " print('no desc')\n", + "\n", + "error_result = soup.find('div', attrs={'id': 'player-unavailable'})\n", + "if error_result:\n", + " print(error_result.text)" ] }, { @@ -192,6 +110,43 @@ "execution_count": 71, "metadata": {}, "outputs": [], + "source": [ + "data = \"\"\"\n", + "\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "If a recognizable person appears in this video, use for commercial purposes may infringe a right of privacy or publicity. It may not be used to state or imply the endorsement by NASA employees of a commercial product, process or service, or used in any other manner that might mislead. Accordingly, it is requested that if this video is used in advertising and other commercial promotion, layout and copy be submitted to NASA prior to release.\n" + ] + } + ], + "source": [ + "soup = BeautifulSoup(data, 'lxml')\n", + "d = soup.find('div', attrs={'id': 'description', 'slot': 'content'})\n", + "print(d.text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], "source": [ "def parse_urls(url, exts):\n", " ahrefs = soup.find_all('img')\n", @@ -209,37 +164,44 @@ " src = img['src']\n", " if Path(src).suffix[1:] in exts:\n", " tags.append('{}{}'.format(url, src))\n", - " return tags" + " return tags\n", + "\n", + "metavars = [\n", + " {'name': ('title','title')},\n", + " {'name': ('description', 'description')},\n", + " {'name': ('keywords', 'keywords')},\n", + " {'itemprop': ('paid', 'paid')},\n", + " {'itemprop': ('videoId', 'video_id')},\n", + " {'itemprop': ('duration', 'duration')},\n", + " {'itemprop': ('width', 'width')},\n", + " {'itemprop': ('height', 'height')},\n", + " {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},\n", + " {'itemprop': ('interactionCount', 'views')},\n", + " {'itemprop': ('datePublished', 'date_published')},\n", + " {'itemprop': ('genre', 'genre')},\n", + " {'itemprop': ('unlisted', 'genre')}\n", + "]\n", + "\n", + "def parse_yt_meta(soup, prop, propvals):\n", + " content = soup.find('meta', attrs={prop:propvals[0]}).get('content','')\n", + " return {propvals[1]: content}\n", + "\n", + "def parse_yt_page(soup):\n", + " #BeautifulSoup(data, 'html.parser').find(attrs={'name': 'description'})\n", + " results = []\n", + " for metavar in metavars:\n", + " prop, propvals = list(metavar.items())[0]\n", + " result = parse_yt_meta(soup, prop, propvals)\n", + " #print(prop, propvals, result)\n", + " results.append(result)\n", + " return results" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\n", - "http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\n" - ] - } - ], + "outputs": [], "source": [ "soup = BeautifulSoup(data,'lxml')\n", "#burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n", @@ -258,6 +220,221 @@ "# print(url)" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_papers(paper_id, sort_type='is-influential'):\n", + " pages = 21\n", + " results = []\n", + " page_limit = 10\n", + " num_pages = 2142 // page_limit\n", + " num_pages = 990\n", + " for page_num in range(num_pages):\n", + " url = gen_url(paper_id, page_num, sort_type, page_limit=page_limit)\n", + " print(f'get page {page_num}, {page_limit*page_num} - {(page_num+1)*page_limit}, for {url}')\n", + " data = urllib.request.urlopen(url).read()\n", + " soup = BeautifulSoup(data,'lxml')\n", + " titles = soup.find_all('h2', attrs={'class': 'citation__title'})\n", + " page_results = []\n", + " for t in titles:\n", + " page_results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})\n", + " print(f'page contained {len(page_results)}')\n", + " results += page_results\n", + " print(len(results), 'return3ed results')\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def gen_url(paper_id, page_num, sort_type, page_limit=10):\n", + " url = 'https://www.semanticscholar.org/paper/'\n", + " params = {\n", + " 'tab': 'abstract',\n", + " 'citingPapersSort': sort_type, # citingPapersSort=year,is-influential\n", + " 'citingPapersLimit': page_limit,\n", + " 'citingPapersOffset': page_num * page_limit,\n", + " 'citedPapersSort': 'is-influential',\n", + " 'citedPapersLimit': 1,\n", + " 'citedPapersOffset': 0,\n", + " }\n", + " url_args = urllib.parse.urlencode(params)\n", + " url = join(url, paper_id, f'?{url_args}')\n", + " return url\n", + "\n", + "def get_papers_mt(paper_id, sort_type='is-influential', opt_threads=4):\n", + " \n", + " def pool_process(url_obj):\n", + " # threaded function\n", + " results = []\n", + " try:\n", + " #print(f'get page {page_num}, {page_limit*page_num} - {(page_num+1)*page_limit}, for {url}')\n", + " data = urllib.request.urlopen(url_obj['url'], timeout=30).read()\n", + " soup = BeautifulSoup(data,'lxml')\n", + " titles = soup.find_all('h2', attrs={'class': 'citation__title'})\n", + " page_results = []\n", + " for t in titles:\n", + " page_results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})\n", + " results += page_results\n", + " except Exception as e:\n", + " print(f'Error: {e}, {url_obj[\"url\"]}')\n", + " pbar.update(1)\n", + " return results # list of paper title and id\n", + "\n", + " # pregenerate URLs\n", + " page_limit = 10\n", + " num_pages = 990 // page_limit\n", + " url_objs = []\n", + " for page_num in range(num_pages):\n", + " url = gen_url(paper_id, page_num, sort_type, page_limit=page_limit)\n", + " url_objs.append({'url': url, 'sort_type':sort_type})\n", + "\n", + " results = []\n", + " opt_threads = 20\n", + " pbar = tqdm(total=len(url_objs))\n", + " pool_process = partial(pool_process)\n", + " pool = ThreadPool(opt_threads) \n", + " with tqdm(total=len(url_objs)) as pbar:\n", + " results = pool.map(pool_process, url_objs)\n", + " \n", + " pbar.close()\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "26a6ee5a7b9b42fd91581c45134ab9c2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "471db3dfbc3b4f25a304b11491db8725", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "paper_id = '370b5757a5379b15e30d619e4d3fb9e8e13f3256'\n", + "papers_influ = get_papers_mt(paper_id, sort_type='is-influential', opt_threads=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bcca087a063945eca3f92cac38cd3fad", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7d761ddff25f4802bb45902eefa3acb6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "papers_year = get_papers_mt(paper_id, sort_type='year', opt_threads=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "papers_all = []\n", + "for pl in papers_influ:\n", + " for p in pl:\n", + " papers_" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "papers_all = []\n", + "papers_ids = []\n", + "for paper_list in papers_influ:\n", + " for paper in paper_list:\n", + " paper_id = paper['paper_id']\n", + " if not paper_id in papers_ids:\n", + " papers_ids.append(paper_id)\n", + " papers_all.append(p)\n", + "for paper_list in papers_year:\n", + " for paper in paper_list:\n", + " paper_id = paper['paper_id']\n", + " if not paper_id in papers_ids:\n", + " papers_ids.append(paper_id)\n", + " papers_all.append(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1317\n" + ] + } + ], + "source": [ + "print(len(papers_all))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -268,9 +445,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:megapixels]", + "display_name": "megapixels", "language": "python", - "name": "conda-env-megapixels-py" + "name": "megapixels" }, "language_info": { "codemirror_mode": { @@ -282,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.8" } }, "nbformat": 4, -- cgit v1.2.3-70-g09d2