diff options
Diffstat (limited to 'megapixels/commands/datasets/ijb_screenshot_mt.py')
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot_mt.py | 156 |
1 files changed, 0 insertions, 156 deletions
# File: megapixels/commands/datasets/ijb_screenshot_mt.py
"""Create screenshots for YouTube.com URLs in the IJB dataset

TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text

Installing webdrivers:

Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip

Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz

PhantomJS
npm install -g phantomjs
"""

import click

from app.settings import app_cfg

#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv


# NOTE: the docstring of `cli` is shown by click as the command help text, so
# it is kept short; the details live in these comments.
#
# Reads the license CSV given by --input, keeps the rows whose 'Media ID'
# contains 'video/', and for each of them opens 'Media URL' in headless Chrome
# and saves a PNG screenshot under --output, mirroring the 'Media ID' path.
# Pages are processed concurrently by a pool of --threads worker threads, each
# job starting (and always quitting) its own Chrome instance.
# With --dry-run the url/destination pairs are only logged.
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory')
@click.option('-t', '--threads', 'opt_threads', default=20,
              help='Number of threads')
@click.option('--dry-run', 'opt_dry_run', is_flag=True,
              help='Only log the url/destination pairs, take no screenshots')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_dry_run):
    """IJB-C screenshot sources"""

    # Heavy/optional dependencies are imported lazily, inside the command.
    from os.path import join, splitext
    import time
    from functools import partial
    from multiprocessing.dummy import Pool as ThreadPool

    import pandas as pd
    from tqdm import tqdm

    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By

    from app.utils import file_utils, logger_utils

    log = logger_utils.Logger.getLogger()

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')

    def pool_process(route, chrome_options):
        """Screenshot one route; return True if the image file was written.

        `route` is a dict with the page address under 'url' and the output
        image path under 'dst'. Any failure is logged and reported as False
        so that one bad page does not abort the whole pool.
        """
        url = route['url']
        fp_out = route['dst']
        log.debug(f'url: {url}, dst: {fp_out}')

        driver = None
        try:
            # `options=` replaces the deprecated `chrome_options=` keyword.
            driver = webdriver.Chrome(options=chrome_options)
            driver.set_window_size(1920, 1080)
            driver.get(url)

            if 'youtube.com' in url:
                # Best effort: wait for the video info panel, but still take
                # the screenshot if it never becomes visible.
                try:
                    WebDriverWait(driver, 30).until(
                        EC.visibility_of_element_located(
                            (By.CLASS_NAME, 'ytd-video-secondary-info-renderer')))
                except Exception as e:
                    log.debug(f'error: {e}')
            else:
                time.sleep(1)  # wait for element

            time.sleep(10)  # wait for element
            file_utils.mkdirs(fp_out)
            log.debug(f'save to: {fp_out}')
            # get_screenshot_as_file reports write failures via its result.
            return bool(driver.get_screenshot_as_file(fp_out))
        except Exception as e:
            log.error(f'failed: {url}: {e}')
            return False
        finally:
            # Always release the browser, otherwise failed jobs leak Chrome
            # processes.
            if driver is not None:
                driver.quit()

    # load
    routes = []
    df_licenses = pd.read_csv(opt_fp_in)
    log.info(f'{len(df_licenses)} rows')
    for _, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
        # str() guards against empty cells, which pandas loads as float NaN.
        filepath = str(df_license['Media ID'])
        if 'video/' not in filepath:
            continue
        raw_url = df_license['Media URL']
        if pd.isna(raw_url):
            log.debug(f'no url for: {filepath}')
            continue
        url = str(raw_url)
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        # Swap only the trailing extension; str.replace() on the suffix would
        # also hit earlier occurrences and breaks when there is no suffix.
        fp_media = splitext(filepath)[0] + '.png'
        fp_out = join(opt_fp_out, fp_media)
        routes.append({'url': url, 'dst': fp_out})

    if opt_dry_run:
        for route in routes:
            log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
        return

    # setup multithreading
    worker = partial(pool_process, chrome_options=chrome_options)
    with ThreadPool(opt_threads) as pool:
        # imap yields results as jobs finish (in input order), so the bar
        # advances on completed work.
        results = list(tqdm(pool.imap(worker, routes), total=len(routes)))

    log.info(f'{sum(results)}/{len(routes)} screenshots saved')


# Alternative wait strategies (not currently used):
#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
#driver.execute_script("document.getElementById('related').style.display = 'None';")

# Condition names available in selenium.webdriver.support.expected_conditions:
# title_is
# title_contains
# presence_of_element_located
# visibility_of_element_located
# visibility_of
# presence_of_all_elements_located
# text_to_be_present_in_element
# text_to_be_present_in_element_value
# frame_to_be_available_and_switch_to_it
# invisibility_of_element_located
# element_to_be_clickable - it is Displayed and Enabled.
# staleness_of
# element_to_be_selected
# element_located_to_be_selected
# element_selection_state_to_be
# element_located_selection_state_to_be
# alert_is_present
\ No newline at end of file |
