From 14727041f2b54dea9a37ff6e2dfef161b6243556 Mon Sep 17 00:00:00 2001
From: adamhrv
Date: Sun, 10 Feb 2019 20:39:03 +0100
Subject: add csv converter for citations

---
 megapixels/app/models/citations.py               |  16 ++
 megapixels/commands/datasets/citations_to_csv.py |  77 +++++++++++
 megapixels/commands/datasets/ijb_youtube_meta.py | 129 ++++++++++++++++++
 3 files changed, 222 insertions(+)
 create mode 100644 megapixels/app/models/citations.py
 create mode 100644 megapixels/commands/datasets/citations_to_csv.py
 create mode 100644 megapixels/commands/datasets/ijb_youtube_meta.py

diff --git a/megapixels/app/models/citations.py b/megapixels/app/models/citations.py
new file mode 100644
index 00000000..b0e02fc7
--- /dev/null
+++ b/megapixels/app/models/citations.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+from mashumaro import DataClassJSONMixin
+
+@dataclass
+class Paper(DataClassJSONMixin):
+  key: str
+  dataset_name: str
+  paper_id: str
+  title: str
+  paper_type: str
+  year: int
+  paper_url: str = ''
+  loc: str = ''
+  loc_type: str = ''
+  lat: float = 0.0
+  lng: float = 0.0
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
new file mode 100644
index 00000000..431ee4cd
--- /dev/null
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -0,0 +1,77 @@
+import click
+
+from app.utils.logger_utils import Logger
+from app.models.citations import Paper
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input citations JSON file')
+@click.option('-o', '--output', 'opt_fp_out',
+  help='Output CSV filepath (defaults to the input path with a .csv extension)')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+  """Convert citations JSON to CSV"""
+
+  import json
+
+  import pandas as pd
+
+  log.info('Convert JSON to CSV')
+
+  # load
+  with open(opt_fp_in, 'r') as fp:
+    json_data = json.load(fp)
+
+  # parse: the dataset's original paper first, then its citations
+  papers = []
+  dataset_key = json_data['paper']['key']
+  dataset_name = json_data['paper']['name']
+  papers += get_orig_paper(json_data)
+  papers += get_citations(dataset_key, dataset_name, json_data)
+  papers = [p.to_dict() for p in papers]
+
+  # save
+  fp_out = opt_fp_out if opt_fp_out else opt_fp_in.replace('.json', '.csv')
+  log.info(fp_out)
+
+  df_papers = pd.DataFrame.from_dict(papers)
+  df_papers.index.name = 'index'
+  df_papers.to_csv(fp_out)
+
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+def get_citations(dataset_key, dataset_name, json_data):
+  """Build one Paper per citation, duplicated for each affiliation address"""
+  papers = []
+  d_type = 'citation'
+  for p in json_data['citations']:
+    year = 0 if p.get('year', 0) == '' else p.get('year', 0)
+    addresses = p.get('addresses', [])
+    if addresses:
+      for a in addresses:
+        papers.append(Paper(dataset_key, dataset_name, p['id'], p['title'], d_type,
+          year, p['pdf'],
+          a['address'], a['type'], a['lat'], a['lng']))
+    else:
+      papers.append(Paper(dataset_key, dataset_name, p['id'], p['title'], d_type, year, p['pdf']))
+  return papers
+
+def get_orig_paper(json_data):
+  """Build the Paper entry for the dataset's own publication"""
+  papers = []
+  p = json_data['paper']
+  d_type = 'main'
+  year = 0 if p.get('year', 0) == '' else p.get('year', 0)
+  addresses = p.get('address', [])
+  if addresses:
+    for a in addresses:
+      papers.append(Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year,
+        p['pdf'],
+        a['address'], a['type'], a['lat'], a['lng']))
+  else:
+    papers.append(Paper(p['key'], p['name'], p['paper_id'], p['title'], d_type, year, p['pdf']))
+  return papers
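The converter expects one JSON document per dataset, shaped along the lines of the minimal sketch below. The field names are inferred from the parsing code above; the real citation JSON schema, and how these click commands are mounted on a CLI entry point, are assumptions not confirmed by this commit.

    import json

    # Hypothetical input document; field names mirror what the parser reads.
    sample = {
      'paper': {
        'key': 'example_dataset',
        'name': 'Example Dataset',
        'paper_id': 'example-2015',
        'title': 'An Example Dataset Paper',
        'year': 2015,
        'pdf': 'https://example.org/paper.pdf',
        'address': [  # geocoded author affiliations; may be empty
          {'address': 'Example City, Country', 'type': 'edu', 'lat': 0.0, 'lng': 0.0}
        ],
      },
      'citations': [
        {'id': 'c1', 'title': 'A Citing Paper', 'year': 2018, 'pdf': '', 'addresses': []}
      ],
    }

    with open('example_dataset.json', 'w') as fp:
      json.dump(sample, fp, indent=2)

    # Running the converter on this file would yield example_dataset.csv with
    # one row per (paper, address) pair, since each affiliation becomes its
    # own Paper record.
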
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py
new file mode 100644
index 00000000..87df390c
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_youtube_meta.py
@@ -0,0 +1,129 @@
+"""Collect metadata for YouTube.com URLs in the IJB dataset
+
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
+"""
+
+import click
+
+fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
+fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
+fps_default_in = [fp_default_in_a, fp_default_in_b]
+fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
+  help='Input license data CSV (repeatable)')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out,
+  help='Output CSV filepath')
+@click.option('-t', '--threads', 'opt_threads', default=4,
+  help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+  """Collect YouTube metadata for IJB-C media URLs"""
+
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+
+  import lxml
+  from bs4 import BeautifulSoup
+  import pandas as pd
+  from tqdm import tqdm
+
+  from app.utils import file_utils, logger_utils
+
+  log = logger_utils.Logger.getLogger()
+
+  # each entry maps a <meta> attribute name to (attribute value, output column)
+  metavars = [
+    {'name': ('title', 'title')},
+    {'name': ('description', 'description')},
+    {'name': ('keywords', 'keywords')},
+    {'itemprop': ('paid', 'paid')},
+    {'itemprop': ('videoId', 'video_id')},
+    {'itemprop': ('duration', 'duration')},
+    {'itemprop': ('width', 'width')},
+    {'itemprop': ('height', 'height')},
+    {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
+    {'itemprop': ('interactionCount', 'views')},
+    {'itemprop': ('datePublished', 'date_published')},
+    {'itemprop': ('genre', 'genre')},
+    {'itemprop': ('unlisted', 'unlisted')}
+  ]
+
+  def pool_process(media_item):
+    # threaded worker: fetch the YouTube page and copy meta tag values into the record
+    try:
+      url = media_item['media_url'].strip()
+      url = url.replace('http:', 'https:')
+      url = url.replace('www.youtube', 'youtube')
+      log.debug(f'get: {url}')
+      data = urllib.request.urlopen(url, timeout=60).read()
+      soup = BeautifulSoup(data, 'lxml')
+      for metavar in metavars:
+        propname, propvals = list(metavar.items())[0]
+        content = soup.find('meta', attrs={propname: propvals[0]})
+        if content:
+          media_item[propvals[1]] = content.get('content', '')
+      if 'duration' in media_item.keys():
+        # convert ISO 8601 'PT<m>M<s>S' duration to seconds
+        duration = media_item['duration']
+        mins = int(duration.split('M')[0].replace('PT', ''))
+        secs = int(duration.split('M')[1].replace('S', ''))
+        media_item['duration'] = (60 * mins) + secs
+      if 'paid' in media_item.keys():
+        media_item['paid'] = int(media_item['paid'] == 'True')
+      if 'is_family_friendly' in media_item.keys():
+        media_item['is_family_friendly'] = int(media_item['is_family_friendly'] == 'True')
+    except Exception as e:
+      log.debug(f'Error: {e}, {media_item["media_url"]}')
+    pbar.update(1)
+    return media_item  # the input dict augmented with meta tag values
+
+  # read CSVs and merge into one DataFrame
+  df_media = None
+  for fp in opt_fp_in:
+    df = pd.read_csv(fp)
+    log.info(f'reading {len(df)} rows')
+    if df_media is None:
+      df_media = df
+    else:
+      df_media = df_media.append(df, ignore_index=True)
+
+  name_maps = {
+    'Media ID': 'media_id',
+    'Media URL': 'media_url',
+    'Source URL': 'source_url',
+    'Attribution': 'attribution',
+    'CC License': 'cc_license',
+  }
+  df_media.rename(columns=name_maps, inplace=True)
+  log.info(f'{len(df_media)} rows total')
+  df_media = df_media[df_media.media_id.str.contains("video/")]
+  log.info(f'{len(df_media)} video rows')
+  df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True)
+  log.info(f'{len(df_media)} rows after deduplication')
+  media_items = df_media.to_dict('records')
+
+  # fetch pages in a thread pool
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=len(media_items)) as pbar:
+    results = pool.map(pool_process, media_items)
+
+  # create DataFrame and save to CSV
+  file_utils.mkdirs(opt_fp_out)
+  df = pd.DataFrame.from_dict(results)
+  df.index.name = 'index'
+  df.to_csv(opt_fp_out)
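One caveat on the duration conversion above: it assumes YouTube always reports durations as PT<minutes>M<seconds>S. Values such as 'PT45S' (no minutes part) or 'PT1H2M3S' (with hours) would raise inside the split-based parser, and the record would fall through to the generic exception handler. A more defensive standard-library variant, offered as a sketch rather than part of the commit:

    import re

    def iso8601_duration_to_secs(duration):
      # Handles 'PT1H2M3S', 'PT4M10S', and 'PT45S'; returns 0 for anything
      # that doesn't match, mirroring the script's swallow-and-log behavior.
      m = re.match(r'^PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$', duration)
      if not m:
        return 0
      hours, mins, secs = (int(g) if g else 0 for g in m.groups())
      return hours * 3600 + mins * 60 + secs

    assert iso8601_duration_to_secs('PT4M10S') == 250
    assert iso8601_duration_to_secs('PT45S') == 45
    assert iso8601_duration_to_secs('PT1H2M3S') == 3723
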
--
cgit v1.2.3-70-g09d2