"""Create screenshots for YouTube.com URLs in the IJB dataset TODO - grey out boxes in sidebar - resize driver screenshot area to include author text Installing webdrivers: Chrome wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip Firefox wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz PhantomJS npm install -g phantomjs """ import click from app.settings import app_cfg fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv' fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv' fps_default_in = [fp_default_in_a, fp_default_in_b] fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv' @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True, help='Input license data CSV') @click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out, help='Output directory') @click.option('-t', '--threads', 'opt_threads', default=4, help='Number of threads') @click.pass_context def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): """IJB-C screenshot sources""" import sys from glob import glob from os.path import join from pathlib import Path import time from functools import partial from multiprocessing.dummy import Pool as ThreadPool import urllib.request import lxml from bs4 import BeautifulSoup import pandas as pd import cv2 as cv from tqdm import tqdm from app.utils import file_utils, im_utils, logger_utils log = logger_utils.Logger.getLogger() metavars = [ {'name': ('title','yt_title')}, {'name': ('description', 'yt_description')}, {'name': ('keywords', 'yt_keywords')}, {'itemprop': ('paid', 'yt_paid')}, {'itemprop': ('videoId', 'yt_video_id')}, {'itemprop': ('duration', 'yt_duration')}, {'itemprop': ('width', 'yt_width')}, {'itemprop': ('height', 'yt_height')}, {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')}, {'itemprop': ('interactionCount', 'yt_views')}, {'itemprop': ('datePublished', 'yt_date_published')}, {'itemprop': ('genre', 'yt_genre')}, {'itemprop': ('unlisted', 'yt_unlisted')} ] # from pprint import pprint def pool_process(media_item): # threaded function global parse_yt_page results = [] try: url = media_item['ijb_media_url'].strip() url = url.replace('http:', 'https:') url = url.replace('www.youtube','youtube') data = urllib.request.urlopen(url, timeout=30).read() soup = BeautifulSoup(data,'lxml') for metavar in metavars: propname, propvals = list(metavar.items())[0] #result = parse_yt_meta(soup, propname, propvals) content = soup.find('meta', attrs={propname:propvals[0]}) if content: media_item[propvals[1]] = content.get('content','') # description, or error, is not metavar because it can be truncated desc_result = soup.find('p', attrs={'id': 'eow-description'}) description = desc_result.text if desc_result else '' if not 'yt_duration' in media_item.keys(): error_result = soup.find('div', attrs={'id': 'player-unavailable'}) description = error_result.text if error_result else 'Video unavailable' media_item['yt_description'] = description log.debug(f'url: {url}, description: {description}') if 'yt_duration' in media_item.keys(): # fix values duration = media_item['yt_duration'] mins = int(duration.split('M')[0].replace('PT','')) secs = int(duration.split('M')[1].replace('S','')) media_item['yt_duration'] = mins + (60 * secs) if 'yt_paid' in media_item.keys(): media_item['yt_paid'] = 
int(bool(media_item['yt_paid'] == 'True')) if 'yt_is_family_friendly' in media_item.keys(): media_item['yt_is_family_friendly'] = int(bool(media_item['yt_is_family_friendly'] == 'True')) except Exception as e: log.debug(f'Error: {e}, {media_item["ijb_media_url"]}') pbar.update(1) return media_item # a list of dict key:val dicts # read CSV and get URLs df_media = None for fp in fps_default_in: df = pd.read_csv(fp) log.info(f'reading {len(df)} rows') if df_media is None: df_media = df else: df_media = df_media.append(df, ignore_index=True) name_maps = { 'Media ID': 'ijb_media_id', 'Media URL': 'ijb_media_url', 'Source URL': 'ijb_source_url', 'Attribution': 'ijb_attribution', 'CC License': 'ijb_cc_license', } df_media.rename(columns=name_maps, inplace=True) log.info(f'{len(df_media)} rows') df_media = df_media[df_media.ijb_media_id.str.contains("video/")] log.info(f'{len(df_media)} rows') df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True) log.info(f'{len(df_media)} rows') media_items = df_media.to_dict('records') results = [] pbar = tqdm(total=len(media_items)) pool_process = partial(pool_process) pool = ThreadPool(opt_threads) with tqdm(total=len(media_items)) as pbar: results = pool.map(pool_process, media_items) pbar.close() # create DataFrame and save to CSV file_utils.mkdirs(opt_fp_out) df = pd.DataFrame.from_dict(results) df.index.name = 'index' df.to_csv(opt_fp_out)
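

# A minimal entry-point sketch, assuming the script is run directly; the repo
# may instead register `cli` through a click command group, in which case this
# guard is redundant but harmless. The filename in the example is hypothetical.
if __name__ == '__main__':
  # Example invocation (all options fall back to the defaults above if omitted):
  #   python scrape_yt_meta.py \
  #     -i /path/to/cs3_media.csv -i /path/to/cs4_media.csv \
  #     -o /path/to/cs3_4_media_youtube_meta.csv -t 8
  cli()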