diff options
| author | adamhrv <adam@ahprojects.com> | 2019-02-12 15:18:46 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-12 15:18:46 +0100 |
| commit | a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch) | |
| tree | 1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/bs4_scratch.ipynb | |
| parent | e95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff) | |
reorder nbs
Diffstat (limited to 'megapixels/notebooks/bs4_scratch.ipynb')
| -rw-r--r-- | megapixels/notebooks/bs4_scratch.ipynb | 577 |
1 files changed, 377 insertions, 200 deletions
diff --git a/megapixels/notebooks/bs4_scratch.ipynb b/megapixels/notebooks/bs4_scratch.ipynb index e63d286f..eeaa618d 100644 --- a/megapixels/notebooks/bs4_scratch.ipynb +++ b/megapixels/notebooks/bs4_scratch.ipynb @@ -2,189 +2,107 @@ "cells": [ { "cell_type": "code", - "execution_count": 56, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from pathlib import Path\n", - "from os.path import join" + "from os.path import join\n", + "import urllib.request\n", + "import lxml\n", + "from functools import partial\n", + "from multiprocessing.dummy import Pool as ThreadPool\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "%reload_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ - "data = \"\"\"\n", - "<div class=\"entry-content\">\n", - "<h2 id=\"annotations\" class=\"offset\">Annotations</h2>\n", - "<p>Below are examples of our different annotations within the dataset. Every pedestrian, cyclist and motorcyclist (higher than 50px) in every frame is annotated with a bounding box, along side with three attributes: occlusion, difficult (low contrast or unusual posture) and pose. 
People on posters, sculptures and groups where individuals are hard to seperate are marked as “ignore”.</p>\n", - "<style type=\"text/css\">\n", - "\t\t\t#gallery-3 {\n", - "\t\t\t\tmargin: auto;\n", - "\t\t\t}\n", - "\t\t\t#gallery-3 .gallery-item {\n", - "\t\t\t\tfloat: left;\n", - "\t\t\t\tmargin-top: 10px;\n", - "\t\t\t\ttext-align: center;\n", - "\t\t\t\twidth: 25%;\n", - "\t\t\t}\n", - "\t\t\t#gallery-3 img {\n", - "\t\t\t\tborder: 2px solid #cfcfcf;\n", - "\t\t\t}\n", - "\t\t\t#gallery-3 .gallery-caption {\n", - "\t\t\t\tmargin-left: 0;\n", - "\t\t\t}\n", - "\t\t\t/* see gallery_shortcode() in wp-includes/media.php */\n", - "\t\t</style>\n", - "<h2 id=\"pedestrian\" class=\"offset\">Pedestrian</h2>\n", - "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/many_peds3.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-81\">\n", - "\t\t\t\tHigh frequency of pedestrians\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_dark_low_contrast.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-82\">\n", 
- "\t\t\t\tDark scenes with low contrast\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/peds_occluded.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-83\">\n", - "\t\t\t\tOccluded pedestrians\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ped_sideways.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-85\">\n", - "\t\t\t\tSideview of crossing pedestrians\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<p><br style=\"clear: both\">\n", - "\t\t</p></div>\n", - "<h2 id=\"bicycle\" class=\"offset\">Bicycledriver and Motorbikedriver</h2>\n", - "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_back_glare.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" 
width=\"300\" height=\"184\"></a>\n", - "\t\t\t</dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-86\">\n", - "\t\t\t\tBicycle drivers from back including glare\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_driver_sideways.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-87\">\n", - "\t\t\t\tBicycledriver sideways\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_mixed_with_ped.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-88\">\n", - "\t\t\t\tScenes with mixed annotations\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/bicycle_multiple_drivers2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", 
- "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-89\">\n", - "\t\t\t\tMultiple bicycle drivers\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<p><br style=\"clear: both\">\n", - "\t\t</p></div>\n", - "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", - "\t\t\t</dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-90\">\n", - "\t\t\t\tMotorbikedrivers from back\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_drivers_crossing.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-91\">\n", - "\t\t\t\tMultiple motorbikedrives in a scene\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_traffic_glare.png\" alt=\"\" class=\"alignnone size-medium 
wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-92\">\n", - "\t\t\t\tMotorbikedrivers in traffic including glare\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/mb_driver_back2.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-93\">\n", - "\t\t\t\tMotorbike driver followed during several frames\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<p><br style=\"clear: both\">\n", - "\t\t</p></div>\n", - "<h2 id=\"ignore\" class=\"offset\">Ignore</h2>\n", - "<div id=\"gallery-3\" class=\"gallery galleryid-56 gallery-columns-4 gallery-size-medium\">\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "\t\t\t\t\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ignore_group.png\" alt=\"\" class=\"alignnone size-medium wp-image-43\" width=\"300\" height=\"184\"></a>\n", - "\t\t\t</dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-94\">\n", - "\t\t\t\tIgnore larger group of pedestrians that can not be distinguished\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" 
href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_traffic_signs.png\" alt=\"\" class=\"alignnone size-medium wp-image-28\" width=\"300\" height=\"186\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-95\">\n", - "\t\t\t\tIgnore confusing traffic signs\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign_passengers.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-96\">\n", - "\t\t\t\tIgnore irrelevant people like passenger\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<dl class=\"gallery-item\">\n", - "<dt class=\"gallery-icon landscape\">\n", - "<p>\t\t\t\t<a class=\"grouped_elements\" rel=\"tc-fancybox-group\" href=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\" data-lb-type=\"grouped-post\"><img src=\"http://www.nightowls-dataset.org/wp-content/uploads/2018/03/ign.ads_.png\" alt=\"\" class=\"alignnone size-medium wp-image-44\" width=\"300\" height=\"193\"></a>\n", - "\t\t\t</p></dt>\n", - "<dd class=\"wp-caption-text gallery-caption\" id=\"gallery-3-97\">\n", - "\t\t\t\tIgnore advertisements and billboards that may include target objects\n", - "\t\t\t\t</dd>\n", - "</dl>\n", - "<p><br style=\"clear: both\">\n", - "\t\t</p></div>\n", - "<p> </p>\n", - "</div>\n", - "\"\"\"" + "fp_html = '/data_store/datasets/people/vgg_face/downloads/celeb_list.html'\n", + "with open(fp_html,'r') as fp:\n", + " 
data = fp.readlines()\n", + "data = data[0]\n", + "#data = urllib.request.urlopen('https://www.youtube.com/watch?v=396sRuzVrHQ').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "lines = data.split('<br>')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "names = [x.split(' ')[1].strip() for x in lines]" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "data_error2 = urllib.request.urlopen('https://www.youtube.com/watch?v=XAuCRa3ySfU').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no desc\n", + "\n", + "\n", + "\n", + "\n", + " This video is unavailable.\n", + "\n", + " \n", + "\n", + "Sorry about that.\n", + " \n", + "\n", + "\n" + ] + } + ], + "source": [ + "soup = BeautifulSoup(data_error2, 'lxml')\n", + "desc_result = soup.find('p', attrs={'id': 'eow-description'})\n", + "if desc_result:\n", + " print(desc_result.text)\n", + "else:\n", + " print('no desc')\n", + "\n", + "error_result = soup.find('div', attrs={'id': 'player-unavailable'})\n", + "if error_result:\n", + " print(error_result.text)" ] }, { @@ -193,6 +111,43 @@ "metadata": {}, "outputs": [], "source": [ + "data = \"\"\"\n", + "\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "If a recognizable person appears in this video, use for commercial purposes may infringe a right of privacy or publicity. 
# <meta> tags scraped from a YouTube watch page.
# Each entry is a single-item dict: {attribute name: (attribute value, output key)},
# e.g. {'itemprop': ('videoId', 'video_id')} reads <meta itemprop="videoId" content="...">
# and reports its content under 'video_id'.
metavars = [
    {'name': ('title', 'title')},
    {'name': ('description', 'description')},
    {'name': ('keywords', 'keywords')},
    {'itemprop': ('paid', 'paid')},
    {'itemprop': ('videoId', 'video_id')},
    {'itemprop': ('duration', 'duration')},
    {'itemprop': ('width', 'width')},
    {'itemprop': ('height', 'height')},
    {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
    {'itemprop': ('interactionCount', 'views')},
    {'itemprop': ('datePublished', 'date_published')},
    {'itemprop': ('genre', 'genre')},
    # Previously reported under 'genre', which duplicated the key of the real
    # genre entry above; the unlisted flag now gets its own key.
    {'itemprop': ('unlisted', 'unlisted')},
]


def parse_yt_meta(soup, prop, propvals):
    """Read one <meta> tag from a parsed page.

    Args:
        soup: parsed document exposing ``find(name, attrs=...)``
            (a BeautifulSoup object in this notebook).
        prop: attribute used to select the tag, e.g. ``'name'`` or ``'itemprop'``.
        propvals: ``(attribute value to match, key to use in the result)``.

    Returns:
        A single-item dict ``{output key: content}``. The content is ``''``
        when the tag is missing or has no ``content`` attribute.
    """
    tag = soup.find('meta', attrs={prop: propvals[0]})
    # find() yields None for absent tags (e.g. on "video unavailable" pages);
    # fall back to an empty string instead of raising AttributeError.
    content = tag.get('content', '') if tag is not None else ''
    return {propvals[1]: content}


def parse_yt_page(soup):
    """Collect every field listed in ``metavars`` from a parsed page.

    Args:
        soup: parsed document, passed through to ``parse_yt_meta``.

    Returns:
        A list of single-item dicts, one per ``metavars`` entry, in the same order.
    """
    results = []
    for metavar in metavars:
        # Each entry holds exactly one (attribute, (value, key)) pair.
        prop, propvals = next(iter(metavar.items()))
        results.append(parse_yt_meta(soup, prop, propvals))
    return results
def gen_url(paper_id, page_num, sort_type, page_limit=10):
    """Build the Semantic Scholar URL for one page of a paper's citing papers.

    Args:
        paper_id: Semantic Scholar paper identifier (hex string).
        page_num: zero-based page index; the offset is ``page_num * page_limit``.
        sort_type: value for ``citingPapersSort`` (e.g. ``'year'``, ``'is-influential'``).
        page_limit: number of citing papers requested per page.

    Returns:
        The full URL as a string.
    """
    # Local import: the notebook only imports urllib.request, so make the
    # dependency on urllib.parse explicit instead of relying on a side effect.
    from urllib.parse import urlencode

    params = {
        'tab': 'abstract',
        'citingPapersSort': sort_type,  # citingPapersSort=year,is-influential
        'citingPapersLimit': page_limit,
        'citingPapersOffset': page_num * page_limit,
        'citedPapersSort': 'is-influential',
        'citedPapersLimit': 1,
        'citedPapersOffset': 0,
    }
    # Plain string formatting rather than os.path.join: URLs always use '/',
    # whereas os.path.join uses the platform path separator.
    return f'https://www.semanticscholar.org/paper/{paper_id}/?{urlencode(params)}'


def _parse_citation_titles(html):
    """Extract ``{'title', 'paper_id'}`` records from one citation page's HTML."""
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.find_all('h2', attrs={'class': 'citation__title'})
    return [{'title': t.text, 'paper_id': t['data-heap-paper-id']} for t in titles]


def _fetch_citations(url, timeout):
    """Download one citation page and return its parsed title records."""
    # Context manager closes the HTTP response instead of leaving it to the GC.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        html = response.read()
    return _parse_citation_titles(html)


def get_papers(paper_id, sort_type='is-influential', num_pages=990, page_limit=10, timeout=30):
    """Sequentially fetch the papers citing ``paper_id``.

    Args:
        paper_id: Semantic Scholar paper identifier.
        sort_type: citing-paper sort order passed to ``gen_url``.
        num_pages: number of pages to request. The default of 990 keeps the
            previous hard-coded value; note that ``get_papers_mt`` requests
            990 *results* (99 pages) by default.
        page_limit: citing papers per page.
        timeout: per-request timeout in seconds.

    Returns:
        A flat list of ``{'title': ..., 'paper_id': ...}`` dicts.

    Raises:
        urllib.error.URLError (or a socket timeout) if any request fails;
        unlike ``get_papers_mt`` this function does not swallow errors.
    """
    results = []
    for page_num in range(num_pages):
        url = gen_url(paper_id, page_num, sort_type, page_limit=page_limit)
        print(f'get page {page_num}, {page_limit*page_num} - {(page_num+1)*page_limit}, for {url}')
        page_results = _fetch_citations(url, timeout)
        print(f'page contained {len(page_results)}')
        results += page_results
    print(len(results), 'returned results')
    return results


def get_papers_mt(paper_id, sort_type='is-influential', opt_threads=4,
                  max_results=990, page_limit=10, timeout=30):
    """Fetch the papers citing ``paper_id`` using a pool of worker threads.

    Args:
        paper_id: Semantic Scholar paper identifier.
        sort_type: citing-paper sort order passed to ``gen_url``.
        opt_threads: number of worker threads. This value is now honoured;
            it used to be overwritten with 20 inside the function.
        max_results: total number of citing papers to request; the number of
            pages is ``max_results // page_limit``.
        page_limit: citing papers per page.
        timeout: per-request timeout in seconds.

    Returns:
        A list with one entry per page, in page order; each entry is a list of
        ``{'title': ..., 'paper_id': ...}`` dicts. A page that failed to
        download or parse contributes an empty list (the error is printed).
    """
    # Pregenerate URLs so the workers only need to download and parse.
    urls = [
        gen_url(paper_id, page_num, sort_type, page_limit=page_limit)
        for page_num in range(max_results // page_limit)
    ]

    # One progress bar and one pool, both released on exit.
    with tqdm(total=len(urls)) as pbar, ThreadPool(opt_threads) as pool:

        def fetch_page(url):
            # Runs in a worker thread; best-effort so one bad page does not
            # abort the whole crawl.
            try:
                return _fetch_citations(url, timeout)
            except Exception as e:
                print(f'Error: {e}, {url}')
                return []
            finally:
                pbar.update(1)

        return pool.map(fetch_page, urls)
papers_year = get_papers_mt(paper_id, sort_type='year', opt_threads=4)


def merge_unique_papers(*page_lists):
    """Flatten several paged crawl results into one de-duplicated paper list.

    Args:
        *page_lists: any number of results in the shape returned by
            ``get_papers_mt`` - a list of pages, each page a list of dicts
            carrying at least a ``'paper_id'`` key.

    Returns:
        A list of paper dicts, keeping the first occurrence of each
        ``paper_id`` in the order encountered.
    """
    seen_ids = set()  # set membership keeps the merge linear in the number of papers
    merged = []
    for pages in page_lists:
        for page in pages:
            for paper in page:
                pid = paper['paper_id']
                if pid not in seen_ids:
                    seen_ids.add(pid)
                    # Append the paper being visited. The earlier version appended
                    # a stale variable `p` left over from an abandoned cell.
                    merged.append(paper)
    return merged


# Influential-sorted results first, then year-sorted ones that were not already seen.
# The merge no longer rebinds the global `paper_id` used by the query cells.
papers_all = merge_unique_papers(papers_influ, papers_year)
papers_ids = [paper['paper_id'] for paper in papers_all]
print(len(papers_all))
"name": "conda-env-megapixels-py" + "name": "megapixels" }, "language_info": { "codemirror_mode": { @@ -282,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.8" } }, "nbformat": 4, |
