Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/app/models/citations.py                 18
-rw-r--r--  megapixels/commands/datasets/citations_to_csv.py   83
-rw-r--r--  megapixels/commands/datasets/ijb_youtube_meta.py   136
3 files changed, 237 insertions, 0 deletions
diff --git a/megapixels/app/models/citations.py b/megapixels/app/models/citations.py
new file mode 100644
index 00000000..b0e02fc7
--- /dev/null
+++ b/megapixels/app/models/citations.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from mashumaro import DataClassJSONMixin
+
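+# One row per paper/location pair: either the dataset's original publication
+# (paper_type='main') or a paper that cites it (paper_type='citation')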
+@dataclass
+class Paper(DataClassJSONMixin):
+    key: str
+    dataset_name: str
+    paper_id: str
+    title: str
+    paper_type: str
+    year: int
+    paper_url: str = ''
+    loc: str = ''
+    loc_type: str = ''
+    lat: float = 0.0
+    lng: float = 0.0
\ No newline at end of file
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
new file mode 100644
index 00000000..431ee4cd
--- /dev/null
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -0,0 +1,83 @@
+import click
+
+from app.utils import click_utils
+from app.utils.logger_utils import Logger
+from app.models.citations import Paper
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input citation data JSON')
+@click.option('-o', '--output', 'opt_fp_out',
+    help='Output CSV filepath (defaults to the input path with a .csv extension)')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Convert JSON to CSV"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+
+ import json
+ import pandas as pd
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+
+ log = Logger.getLogger()
+ log.info('Convert JSON to CSV')
+
+    # load
+    with open(opt_fp_in, 'r') as fp:
+        json_data = json.load(fp)
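+    # assumed shape (inferred from the lookups below):
+    # {'paper': {'key', 'name', 'paper_id', 'title', 'year', 'pdf', 'address', ...},
+    #  'citations': [{'id', 'title', 'year', 'pdf', 'addresses', ...}, ...]}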
+
+    # parse
+    papers = []
+    dataset_key = json_data['paper']['key']
+    dataset_name = json_data['paper']['name']
+    papers += get_orig_paper(json_data)
+    papers += get_citations(dataset_key, dataset_name, json_data)
+    papers = [p.to_dict() for p in papers]
+
+    # save
+    fp_out = opt_fp_out if opt_fp_out else opt_fp_in.replace('.json', '.csv')
+    log.info(fp_out)
+
+    df_papers = pd.DataFrame(papers)
+    df_papers.index.name = 'index'
+    df_papers.to_csv(fp_out)
+
+
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+def get_citations(dataset_key, dataset_name, json_data):
+    papers = []
+    d_type = 'citation'
+    for p in json_data['citations']:
+        year = p.get('year') or 0
+        addresses = p.get('addresses', [])
+        if addresses:
+            # one Paper per author address so each location becomes its own row
+            for a in addresses:
+                papers.append(Paper(dataset_key, dataset_name, p['id'], p['title'],
+                    d_type, year, p['pdf'],
+                    a['address'], a['type'], a['lat'], a['lng']))
+        else:
+            papers.append(Paper(dataset_key, dataset_name, p['id'], p['title'],
+                d_type, year, p['pdf']))
+    return papers
+
+def get_orig_paper(json_data):
+    papers = []
+    p = json_data['paper']
+    d_type = 'main'
+    year = p.get('year') or 0
+    addresses = p.get('address', [])
+    if addresses:
+        for a in addresses:
+            papers.append(Paper(p['key'], p['name'], p['paper_id'], p['title'],
+                d_type, year, p['pdf'],
+                a['address'], a['type'], a['lat'], a['lng']))
+    else:
+        papers.append(Paper(p['key'], p['name'], p['paper_id'], p['title'],
+            d_type, year, p['pdf']))
+    return papers
\ No newline at end of file
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py
new file mode 100644
index 00000000..87df390c
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_youtube_meta.py
@@ -0,0 +1,136 @@
+"""Create screenshots for YouTube.com URLs in the IJB dataset
+
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
+"""
+
+import click
+
+from app.settings import app_cfg
+
+fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
+fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
+fps_default_in = [fp_default_in_a, fp_default_in_b]
+fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
+    help='Input license data CSVs')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out,
+    help='Output CSV filepath')
+@click.option('-t', '--threads', 'opt_threads', default=4,
+    help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+ """IJB-C screenshot sources"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+ from functools import partial
+ from multiprocessing.dummy import Pool as ThreadPool
+ import urllib.request
+
+ import lxml
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+
+ from app.utils import file_utils, im_utils, logger_utils
+
+ log = logger_utils.Logger.getLogger()
+
+
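+    # each entry maps a <meta> tag lookup ({attribute: value}) to the
+    # output CSV column name it is stored under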
+    metavars = [
+        {'name': ('title', 'title')},
+        {'name': ('description', 'description')},
+        {'name': ('keywords', 'keywords')},
+        {'itemprop': ('paid', 'paid')},
+        {'itemprop': ('videoId', 'video_id')},
+        {'itemprop': ('duration', 'duration')},
+        {'itemprop': ('width', 'width')},
+        {'itemprop': ('height', 'height')},
+        {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
+        {'itemprop': ('interactionCount', 'views')},
+        {'itemprop': ('datePublished', 'date_published')},
+        {'itemprop': ('genre', 'genre')},
+        {'itemprop': ('unlisted', 'unlisted')}
+    ]
+
+    def pool_process(media_item):
+        # threaded worker: fetch the YouTube page and scrape its <meta> tags
+        try:
+            url = media_item['media_url'].strip()
+            url = url.replace('http:', 'https:')
+            url = url.replace('www.youtube', 'youtube')
+            log.debug(f'get: {url}')
+            data = urllib.request.urlopen(url, timeout=60).read()
+            soup = BeautifulSoup(data, 'lxml')
+            for metavar in metavars:
+                propname, propvals = list(metavar.items())[0]
+                content = soup.find('meta', attrs={propname: propvals[0]})
+                if content:
+                    media_item[propvals[1]] = content.get('content', '')
+            if 'duration' in media_item:
+                # convert ISO 8601 duration, eg 'PT4M13S', to total seconds
+                duration = media_item['duration']
+                mins = int(duration.split('M')[0].replace('PT', ''))
+                secs = int(duration.split('M')[1].replace('S', ''))
+                media_item['duration'] = (60 * mins) + secs
+            if 'paid' in media_item:
+                media_item['paid'] = int(media_item['paid'] == 'True')
+            if 'is_family_friendly' in media_item:
+                media_item['is_family_friendly'] = int(media_item['is_family_friendly'] == 'True')
+        except Exception as e:
+            log.debug(f'Error: {e}, {media_item["media_url"]}')
+        pbar.update(1)
+        return media_item  # the input dict, extended with any scraped fields
+
+    # read CSVs and concatenate
+    dfs = []
+    for fp in opt_fp_in:
+        df = pd.read_csv(fp)
+        log.info(f'reading {len(df)} rows from {fp}')
+        dfs.append(df)
+    df_media = pd.concat(dfs, ignore_index=True)
+
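+    # normalize the IJB-C CSV column headers to snake_case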
+    name_maps = {
+        'Media ID': 'media_id',
+        'Media URL': 'media_url',
+        'Source URL': 'source_url',
+        'Attribution': 'attribution',
+        'CC License': 'cc_license',
+    }
+    df_media.rename(columns=name_maps, inplace=True)
+    log.info(f'{len(df_media)} rows total')
+    df_media = df_media[df_media.media_id.str.contains("video/")]
+    log.info(f'{len(df_media)} rows after filtering to videos')
+    df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True)
+    log.info(f'{len(df_media)} rows after deduplicating URLs')
+    media_items = df_media.to_dict('records')
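+    # one dict per CSV row; pool_process extends each with scraped fields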
+
+    # scrape in parallel
+    pool = ThreadPool(opt_threads)
+    with tqdm(total=len(media_items)) as pbar:
+        results = pool.map(pool_process, media_items)
+    pool.close()
+    pool.join()
+
+    # create DataFrame and save to CSV
+    file_utils.mkdirs(opt_fp_out)
+    df = pd.DataFrame(results)
+    df.index.name = 'index'
+    df.to_csv(opt_fp_out)
\ No newline at end of file