Diffstat (limited to 'megapixels/commands/datasets/ijb_youtube_meta.py')
 megapixels/commands/datasets/ijb_youtube_meta.py | 146 ++++++++++++++++++++++
 1 file changed, 146 insertions(+)
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py
new file mode 100644
index 00000000..87df390c
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_youtube_meta.py
@@ -0,0 +1,146 @@
+"""Create screenshots for YouTube.com URLs in the IJB dataset
+
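+Usage (the entry point shown is an assumption; adapt to however this repo wires its click commands):
+  python -m megapixels datasets ijb_youtube_meta -i cs3_media.csv -o ytmeta.csv -t 8
+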
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
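+
+After downloading, unpack the driver and make sure it is on your PATH, e.g.:
+unzip chromedriver_linux64.zip && sudo mv chromedriver /usr/local/bin/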
+"""
+
+import click
+
+from app.settings import app_cfg
+
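+# defaults: IJB-C CS3/CS4 media license CSVs in, scraped YouTube metadata CSV out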
+fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
+fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
+fps_default_in = [fp_default_in_a, fp_default_in_b]
+fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
+  help='Input license data CSV (repeatable)')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out,
+  help='Output CSV file path')
+@click.option('-t', '--threads', 'opt_threads', default=4,
+ help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+  """Scrape YouTube metadata for IJB-C media URLs"""
+
+  import re
+  import urllib.request
+  from multiprocessing.dummy import Pool as ThreadPool
+
+  import lxml  # required for BeautifulSoup's 'lxml' parser
+  from bs4 import BeautifulSoup
+  import pandas as pd
+  from tqdm import tqdm
+
+  from app.utils import file_utils, logger_utils
+
+  log = logger_utils.Logger.getLogger()
+
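+  # map <meta> tag attributes to output columns: {attr_name: (attr_value, column)}
+  # e.g. a watch page carries <meta itemprop="duration" content="PT4M13S">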
+ metavars = [
+ {'name': ('title','title')},
+ {'name': ('description', 'description')},
+ {'name': ('keywords', 'keywords')},
+ {'itemprop': ('paid', 'paid')},
+ {'itemprop': ('videoId', 'video_id')},
+ {'itemprop': ('duration', 'duration')},
+ {'itemprop': ('width', 'width')},
+ {'itemprop': ('height', 'height')},
+ {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
+ {'itemprop': ('interactionCount', 'views')},
+ {'itemprop': ('datePublished', 'date_published')},
+ {'itemprop': ('genre', 'genre')},
+    {'itemprop': ('unlisted', 'unlisted')}
+ ]
+
+  def pool_process(media_item):
+    # threaded worker: fetch the YouTube watch page and scrape its <meta> tags
+    try:
+ url = media_item['media_url'].strip()
+ url = url.replace('http:', 'https:')
+ url = url.replace('www.youtube','youtube')
+ log.debug(f'get: {url}')
+ data = urllib.request.urlopen(url, timeout=60).read()
+ soup = BeautifulSoup(data,'lxml')
+      for metavar in metavars:
+        propname, propvals = list(metavar.items())[0]
+        content = soup.find('meta', attrs={propname: propvals[0]})
+        if content:
+          media_item[propvals[1]] = content.get('content', '')
+      if 'duration' in media_item.keys():
+        # convert ISO 8601 duration (e.g. 'PT1H4M13S') to total seconds
+        m = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', media_item['duration'])
+        if m:
+          hrs, mins, secs = (int(g) if g else 0 for g in m.groups())
+          media_item['duration'] = hrs * 3600 + mins * 60 + secs
+      if 'paid' in media_item.keys():
+        media_item['paid'] = int(media_item['paid'] == 'True')
+      if 'is_family_friendly' in media_item.keys():
+        media_item['is_family_friendly'] = int(media_item['is_family_friendly'] == 'True')
+ except Exception as e:
+ log.debug(f'Error: {e}, {media_item["media_url"]}')
+ pbar.update(1)
+    return media_item  # dict of scraped metadata for this item
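+
+  # a scraped item might look like this (illustrative values only):
+  #   {'media_id': 'video/123', 'media_url': 'https://youtube.com/watch?v=...',
+  #    'title': '...', 'duration': 253, 'views': '10432', 'date_published': '2013-05-01'}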
+
+  # read the input CSVs and concatenate the media records
+  dfs = []
+  for fp in opt_fp_in:
+    df = pd.read_csv(fp)
+    log.info(f'read {len(df)} rows from {fp}')
+    dfs.append(df)
+  df_media = pd.concat(dfs, ignore_index=True)
+
+ name_maps = {
+ 'Media ID': 'media_id',
+ 'Media URL': 'media_url',
+ 'Source URL': 'source_url',
+ 'Attribution': 'attribution',
+ 'CC License': 'cc_license',
+ }
+  df_media.rename(columns=name_maps, inplace=True)
+  log.info(f'{len(df_media)} total rows')
+  df_media = df_media[df_media.media_id.str.contains('video/')]
+  log.info(f'{len(df_media)} rows after filtering to videos')
+  df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True)
+  log.info(f'{len(df_media)} rows after dropping duplicate URLs')
+  # one dict per row; the thread workers add scraped fields in place
+  media_items = df_media.to_dict('records')
+
+  # scrape pages concurrently; the progress bar is shared across worker threads
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=len(media_items)) as pbar:
+    results = pool.map(pool_process, media_items)
+  pool.close()
+  pool.join()
+
+ # create DataFrame and save to CSV
+ file_utils.mkdirs(opt_fp_out)
+  df = pd.DataFrame(results)
+ df.index.name = 'index'
+  df.to_csv(opt_fp_out)
\ No newline at end of file