diff options
Diffstat (limited to 'megapixels/commands/datasets/ijb_screenshot.py')
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot.py | 111 |
1 files changed, 81 insertions, 30 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py index e6940d88..616893c7 100644 --- a/megapixels/commands/datasets/ijb_screenshot.py +++ b/megapixels/commands/datasets/ijb_screenshot.py @@ -1,9 +1,20 @@ -# Chrome -# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip -# Firefox -# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz -# PhantomJS -# npm install -g phantomjs +"""Create screenshots for YouTube.com URLs in the IJB dataset + +TODO +- grey out boxes in sidebar +- resize driver screenshot area to include author text + +Installing webdrivers: + +Chrome +wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip + +Firefox +wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz + +PhantomJS +npm install -g phantomjs +""" import click @@ -16,8 +27,10 @@ from app.settings import app_cfg help='Input license data CSV') @click.option('-o', '--output', 'opt_fp_out', required=True, help='Output directory') +@click.option('-t', '--threads', 'opt_threads', default=20, + help='Number of threads') @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out): +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): """IJB-C screenshot sources""" import sys @@ -25,6 +38,8 @@ def cli(ctx, opt_fp_in, opt_fp_out): from os.path import join from pathlib import Path import time + from functools import partial + from multiprocessing.dummy import Pool as ThreadPool import pandas as pd import cv2 as cv @@ -43,39 +58,75 @@ def cli(ctx, opt_fp_in, opt_fp_out): chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-dev-shm-usage') - driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_window_size(1920,1080) - + + + def pool_process(route, chrome_options): + # Threaded image resize function + try: + pbar.update(1) + + driver = webdriver.Chrome(chrome_options=chrome_options) + driver.set_window_size(1920,1080) + + url = route['url'] + fp_out = route['dst'] + log.debug(f'url: {url}, dst: {fp_out}') + driver.get(url) + + if 'youtube.com' in url: + try: + wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + except Exception as e: + log.debug(f'error: {e}') + pass + else: + wait = WebDriverWait(driver,10) + time.sleep(1) # wait for element + + time.sleep(10) # wait for element + file_utils.mkdirs(fp_out) + log.debug(f'save to: {fp_out}') + driver.get_screenshot_as_file(fp_out) + driver.quit() + + return True + except: + return False + + # load + routes = [] df_licenses = pd.read_csv(opt_fp_in) log.info(f'{len(df_licenses)} rows') - for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)): filepath = df_license['Media ID'] - if 'frames/' in filepath or 'img/' in filepath: + if not 'video/' in filepath: continue - url = df_license['Media URL'] + url = str(df_license['Media URL']) if not ('http://' in url or 'https://' in url): url = 'http://' + url - log.debug(f'getting: {url}') - driver.get(url) - if 'youtube.com' in url: - try: - wait = WebDriverWait(driver,3).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) - time.sleep(1) # wait for element - except Exception as e: - log.debug(f'error: {e}') - pass - else: - wait = WebDriverWait(driver,5) - time.sleep(1) # wait for element - fp_media = filepath.replace(Path(filepath).suffix, '.png') fp_out = join(opt_fp_out, fp_media) - file_utils.mkdirs(fp_out) - log.debug(f'save to: {fp_out}') - driver.get_screenshot_as_file(fp_out) + obj = {'url': url, 'dst': fp_out} + routes.append(obj) + + # setup multithreading + for route in routes: + log.debug(f'url: {route["url"]}, dst: {route["dst"]}') + + return + results = [] + pbar = tqdm(total=len(routes)) + pool_process = partial(pool_process, chrome_options=chrome_options) + pool = ThreadPool(opt_threads) + with tqdm(total=len(routes)) as pbar: + results = pool.map(pool_process, routes) + pbar.close() - driver.quit() + + + + + |
