"""Create screenshots for YouTube.com URLs in the IJB dataset

TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text

Installing webdrivers:

Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip

Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz

PhantomJS
npm install -g phantomjs
"""

import click

from app.settings import app_cfg

# Example input:
# /data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory')
@click.option('-t', '--threads', 'opt_threads', default=20,
              help='Number of threads')
@click.option('--dry-run', 'opt_dry_run', is_flag=True,
              help='Only log the URL/destination pairs, do not take screenshots')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_dry_run=False):
    """IJB-C screenshot sources"""
    # Reads the license CSV at `opt_fp_in`, keeps the rows whose 'Media ID'
    # contains 'video/', and saves a headless-Chrome screenshot of each row's
    # 'Media URL' to `<opt_fp_out>/<Media ID with a .png suffix>`.
    #
    # opt_threads: number of worker threads; each job starts its own browser.
    # opt_dry_run: when set, only log the planned url -> dst pairs and return.
    #
    # NOTE(review): earlier revisions returned unconditionally right after
    # logging the routes, which left the screenshot stage unreachable. That
    # behaviour is now opt-in via --dry-run.

    # Imports are kept function-local, as in the original command.
    from os.path import join
    from pathlib import Path
    import time
    from multiprocessing.dummy import Pool as ThreadPool

    import pandas as pd
    from tqdm import tqdm
    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By

    from app.utils import file_utils, logger_utils

    log = logger_utils.Logger.getLogger()

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')

    def pool_process(route):
        """Screenshot one route ({'url': ..., 'dst': ...}); return True on success."""
        url = route['url']
        fp_out = route['dst']
        driver = None
        try:
            # `options=` replaces the deprecated `chrome_options=` keyword.
            driver = webdriver.Chrome(options=chrome_options)
            driver.set_window_size(1920, 1080)
            log.debug(f'url: {url}, dst: {fp_out}')
            driver.get(url)
            if 'youtube.com' in url:
                try:
                    # Best effort: wait for the video info panel to render,
                    # but still take the screenshot if it never shows up.
                    WebDriverWait(driver, 30).until(
                        EC.visibility_of_element_located(
                            (By.CLASS_NAME, 'ytd-video-secondary-info-renderer')))
                except Exception as e:
                    log.debug(f'error: {e}')
            else:
                time.sleep(1)  # wait for element
            time.sleep(10)  # wait for element
            file_utils.mkdirs(fp_out)
            log.debug(f'save to: {fp_out}')
            return bool(driver.get_screenshot_as_file(fp_out))
        except Exception as e:
            # One broken URL must not abort the whole batch.
            log.info(f'error: {e}, url: {url}')
            return False
        finally:
            # Count the job only once it has finished, and always shut the
            # browser down so failed jobs do not leak Chrome processes.
            pbar.update(1)
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    log.debug(f'error: {e}')

    # load
    df_licenses = pd.read_csv(opt_fp_in)
    log.info(f'{len(df_licenses)} rows')

    routes = []
    for _, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
        media_id = df_license['Media ID']
        media_url = df_license['Media URL']
        # Empty CSV cells are read as NaN; such rows cannot be screenshotted.
        if pd.isna(media_id) or pd.isna(media_url):
            continue
        filepath = str(media_id)
        if 'video/' not in filepath:
            continue
        url = str(media_url).strip()
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        # with_suffix() swaps only the final extension. str.replace() on the
        # suffix rewrote every occurrence, and mangled IDs with no extension.
        fp_media = str(Path(filepath).with_suffix('.png'))
        routes.append({'url': url, 'dst': join(opt_fp_out, fp_media)})

    for route in routes:
        log.debug(f'url: {route["url"]}, dst: {route["dst"]}')

    if opt_dry_run:
        return

    if not routes:
        log.info('no video rows to screenshot')
        return

    # setup multithreading
    # `pbar` is read by pool_process through the enclosing scope.
    with tqdm(total=len(routes)) as pbar, ThreadPool(opt_threads) as pool:
        results = pool.map(pool_process, routes)

    log.info(f'{sum(results)} of {len(routes)} screenshots saved')


# Alternative wait strategies that were tried (kept for reference):
#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
#driver.execute_script("document.getElementById('related').style.display = 'None';")

# Available selenium expected_conditions:
#   title_is
#   title_contains
#   presence_of_element_located
#   visibility_of_element_located
#   visibility_of
#   presence_of_all_elements_located
#   text_to_be_present_in_element
#   text_to_be_present_in_element_value
#   frame_to_be_available_and_switch_to_it
#   invisibility_of_element_located
#   element_to_be_clickable - it is Displayed and Enabled.
#   staleness_of
#   element_to_be_selected
#   element_located_to_be_selected
#   element_selection_state_to_be
#   element_located_selection_state_to_be
#   alert_is_present