{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from pathlib import Path\n", "from os.path import join\n", "import urllib.request\n", "import lxml\n", "from functools import partial\n", "from multiprocessing.dummy import Pool as ThreadPool\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "fp_html = '/data_store/datasets/people/vgg_face/downloads/celeb_list.html'\n", "with open(fp_html,'r') as fp:\n", " data = fp.readlines()\n", "data = data[0]\n", "#data = urllib.request.urlopen('https://www.youtube.com/watch?v=396sRuzVrHQ').read()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "lines = data.split('
')" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "names = [x.split('  ')[1].strip() for x in lines]" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "\n", "data_error2 = urllib.request.urlopen('https://www.youtube.com/watch?v=XAuCRa3ySfU').read()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "no desc\n", "\n", "\n", "\n", "\n", " This video is unavailable.\n", "\n", " \n", "\n", "Sorry about that.\n", " \n", "\n", "\n" ] } ], "source": [ "soup = BeautifulSoup(data_error2, 'lxml')\n", "desc_result = soup.find('p', attrs={'id': 'eow-description'})\n", "if desc_result:\n", " print(desc_result.text)\n", "else:\n", " print('no desc')\n", "\n", "error_result = soup.find('div', attrs={'id': 'player-unavailable'})\n", "if error_result:\n", " print(error_result.text)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "data = \"\"\"\n", "\n", " \"\"\"" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "If a recognizable person appears in this video, use for commercial purposes may infringe a right of privacy or publicity. It may not be used to state or imply the endorsement by NASA employees of a commercial product, process or service, or used in any other manner that might mislead. Accordingly, it is requested that if this video is used in advertising and other commercial promotion, layout and copy be submitted to NASA prior to release.\n" ] } ], "source": [ "soup = BeautifulSoup(data, 'lxml')\n", "d = soup.find('div', attrs={'id': 'description', 'slot': 'content'})\n", "print(d.text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def parse_urls(url, exts):\n", " ahrefs = soup.find_all('img')\n", " urls = []\n", " for a in ahrefs:\n", " href = a['href']\n", " if Path(href).suffix[1:] in exts:\n", " urls.append('{}{}'.format(url, href))\n", " return urls\n", "\n", "def parse_images(url, exts):\n", " imgs = soup.find_all('img')\n", " tags = []\n", " for img in imgs:\n", " src = img['src']\n", " if Path(src).suffix[1:] in exts:\n", " tags.append('{}{}'.format(url, src))\n", " return tags\n", "\n", "metavars = [\n", " {'name': ('title','title')},\n", " {'name': ('description', 'description')},\n", " {'name': ('keywords', 'keywords')},\n", " {'itemprop': ('paid', 'paid')},\n", " {'itemprop': ('videoId', 'video_id')},\n", " {'itemprop': ('duration', 'duration')},\n", " {'itemprop': ('width', 'width')},\n", " {'itemprop': ('height', 'height')},\n", " {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},\n", " {'itemprop': ('interactionCount', 'views')},\n", " {'itemprop': ('datePublished', 'date_published')},\n", " {'itemprop': ('genre', 'genre')},\n", " {'itemprop': ('unlisted', 'genre')}\n", "]\n", "\n", "def parse_yt_meta(soup, prop, propvals):\n", " content = soup.find('meta', attrs={prop:propvals[0]}).get('content','')\n", " return {propvals[1]: content}\n", "\n", "def parse_yt_page(soup):\n", " #BeautifulSoup(data, 'html.parser').find(attrs={'name': 'description'})\n", " results = []\n", " for metavar in metavars:\n", " prop, propvals = 
"        result = parse_yt_meta(soup, prop, propvals)\n", "        #print(prop, propvals, result)\n", "        results.append(result)\n", "    return results" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "soup = BeautifulSoup(data,'lxml')\n", "#burl = 'http://skuld.cs.umass.edu/traces/mmsys/2015/paper-5/devset/'\n", "burl = ''\n", "#tags = parse_urls(burl, ['jpg', 'txt'])\n", "tags = parse_images(burl, ['jpg', 'png', 'gif'])\n", "for t in tags:\n", "    print(t)\n", "\n", "# for row in rows:\n", "#     ahrefs = row.find_all(href=True)\n", "#     for a in ahrefs:\n", "#         href = a['href']\n", "#         if 'zip' in href:\n", "#             url = 'http://{}'.format(Path(join(url_root, Path(href).name)))\n", "#             print(url)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_papers(paper_id, sort_type='is-influential'):\n", "    # single-threaded version: page through the citing papers of a Semantic Scholar paper\n", "    results = []\n", "    page_limit = 10\n", "    num_pages = 990 // page_limit  # the paper has ~2142 citations, but only the first 990 are paged through\n", "    for page_num in range(num_pages):\n", "        url = gen_url(paper_id, page_num, sort_type, page_limit=page_limit)\n", "        print(f'get page {page_num}, {page_limit*page_num} - {(page_num+1)*page_limit}, for {url}')\n", "        data = urllib.request.urlopen(url).read()\n", "        soup = BeautifulSoup(data,'lxml')\n", "        titles = soup.find_all('h2', attrs={'class': 'citation__title'})\n", "        page_results = []\n", "        for t in titles:\n", "            page_results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})\n", "        print(f'page contained {len(page_results)}')\n", "        results += page_results\n", "    print(len(results), 'returned results')\n", "    return results" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def gen_url(paper_id, page_num, sort_type, page_limit=10):\n", "    # build a Semantic Scholar paper URL with citation paging parameters\n", "    url = 'https://www.semanticscholar.org/paper/'\n", "    params = {\n", "        'tab': 'abstract',\n", "        'citingPapersSort': sort_type,  # citingPapersSort=year,is-influential\n", "        'citingPapersLimit': page_limit,\n", "        'citingPapersOffset': page_num * page_limit,\n", "        'citedPapersSort': 'is-influential',\n", "        'citedPapersLimit': 1,\n", "        'citedPapersOffset': 0,\n", "    }\n", "    url_args = urllib.parse.urlencode(params)\n", "    url = join(url, paper_id, f'?{url_args}')\n", "    return url\n", "\n", "def get_papers_mt(paper_id, sort_type='is-influential', opt_threads=4):\n", "\n", "    def pool_process(url_obj):\n", "        # threaded worker: fetch one page and parse out the citing paper titles and ids\n", "        results = []\n", "        try:\n", "            data = urllib.request.urlopen(url_obj['url'], timeout=30).read()\n", "            soup = BeautifulSoup(data,'lxml')\n", "            titles = soup.find_all('h2', attrs={'class': 'citation__title'})\n", "            page_results = []\n", "            for t in titles:\n", "                page_results.append({'title': t.text, 'paper_id': t['data-heap-paper-id']})\n", "            results += page_results\n", "        except Exception as e:\n", "            print(f'Error: {e}, {url_obj[\"url\"]}')\n", "        pbar.update(1)\n", "        return results  # list of paper title and id\n", "\n", "    # pregenerate URLs\n", "    page_limit = 10\n", "    num_pages = 990 // page_limit\n", "    url_objs = []\n", "    for page_num in range(num_pages):\n", "        url = gen_url(paper_id, page_num, sort_type, page_limit=page_limit)\n", "        url_objs.append({'url': url, 'sort_type': sort_type})\n", "\n", "    pool = ThreadPool(opt_threads)\n", "    with tqdm(total=len(url_objs)) as pbar:\n",
"        results = pool.map(pool_process, url_objs)\n", "    pool.close()\n", "    pool.join()\n", "    return results" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "26a6ee5a7b9b42fd91581c45134ab9c2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "471db3dfbc3b4f25a304b11491db8725", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "paper_id = '370b5757a5379b15e30d619e4d3fb9e8e13f3256'\n", "papers_influ = get_papers_mt(paper_id, sort_type='is-influential', opt_threads=4)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bcca087a063945eca3f92cac38cd3fad", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7d761ddff25f4802bb45902eefa3acb6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=99), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "papers_year = get_papers_mt(paper_id, sort_type='year', opt_threads=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# flatten the per-page result lists\n", "papers_all = []\n", "for pl in papers_influ:\n", "    for p in pl:\n", "        papers_all.append(p)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# merge both sort orders, keeping each paper id only once\n", "papers_all = []\n", "papers_ids = []\n", "for paper_list in papers_influ:\n", "    for paper in paper_list:\n", "        paper_id = paper['paper_id']\n", "        if paper_id not in papers_ids:\n", "            papers_ids.append(paper_id)\n", "            papers_all.append(paper)\n", "for paper_list in papers_year:\n", "    for paper in paper_list:\n", "        paper_id = paper['paper_id']\n", "        if paper_id not in papers_ids:\n", "            papers_ids.append(paper_id)\n", "            papers_all.append(paper)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1317\n" ] } ], "source": [ "print(len(papers_all))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "megapixels", "language": "python", "name": "megapixels" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }