diff options
Diffstat (limited to 'megapixels')
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot.py | 111 | ||||
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot_mt.py | 156 | ||||
| -rw-r--r-- | megapixels/commands/templates/basic.py (renamed from megapixels/commands/datasets/template.py) | 0 |
3 files changed, 81 insertions, 186 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py index e6940d88..616893c7 100644 --- a/megapixels/commands/datasets/ijb_screenshot.py +++ b/megapixels/commands/datasets/ijb_screenshot.py @@ -1,9 +1,20 @@ -# Chrome -# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip -# Firefox -# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz -# PhantomJS -# npm install -g phantomjs +"""Create screenshots for YouTube.com URLs in the IJB dataset + +TODO +- grey out boxes in sidebar +- resize driver screenshot area to include author text + +Installing webdrivers: + +Chrome +wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip + +Firefox +wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz + +PhantomJS +npm install -g phantomjs +""" import click @@ -16,8 +27,10 @@ from app.settings import app_cfg help='Input license data CSV') @click.option('-o', '--output', 'opt_fp_out', required=True, help='Output directory') +@click.option('-t', '--threads', 'opt_threads', default=20, + help='Number of threads') @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out): +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): """IJB-C screenshot sources""" import sys @@ -25,6 +38,8 @@ def cli(ctx, opt_fp_in, opt_fp_out): from os.path import join from pathlib import Path import time + from functools import partial + from multiprocessing.dummy import Pool as ThreadPool import pandas as pd import cv2 as cv @@ -43,39 +58,75 @@ def cli(ctx, opt_fp_in, opt_fp_out): chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-dev-shm-usage') - driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_window_size(1920,1080) - + + + def pool_process(route, chrome_options): + # Threaded image resize function + try: + pbar.update(1) + + driver = webdriver.Chrome(chrome_options=chrome_options) + driver.set_window_size(1920,1080) + + url = route['url'] + fp_out = route['dst'] + log.debug(f'url: {url}, dst: {fp_out}') + driver.get(url) + + if 'youtube.com' in url: + try: + wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + except Exception as e: + log.debug(f'error: {e}') + pass + else: + wait = WebDriverWait(driver,10) + time.sleep(1) # wait for element + + time.sleep(10) # wait for element + file_utils.mkdirs(fp_out) + log.debug(f'save to: {fp_out}') + driver.get_screenshot_as_file(fp_out) + driver.quit() + + return True + except: + return False + + # load + routes = [] df_licenses = pd.read_csv(opt_fp_in) log.info(f'{len(df_licenses)} rows') - for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)): filepath = df_license['Media ID'] - if 'frames/' in filepath or 'img/' in filepath: + if not 'video/' in filepath: continue - url = df_license['Media URL'] + url = str(df_license['Media URL']) if not ('http://' in url or 'https://' in url): url = 'http://' + url - log.debug(f'getting: {url}') - driver.get(url) - if 'youtube.com' in url: - try: - wait = WebDriverWait(driver,3).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) - time.sleep(1) # wait for element - except Exception as e: - log.debug(f'error: {e}') - pass - else: - wait = WebDriverWait(driver,5) - time.sleep(1) # wait for element - fp_media = filepath.replace(Path(filepath).suffix, '.png') fp_out = join(opt_fp_out, fp_media) - file_utils.mkdirs(fp_out) - log.debug(f'save to: {fp_out}') - driver.get_screenshot_as_file(fp_out) + obj = {'url': url, 'dst': fp_out} + routes.append(obj) + + # setup multithreading + for route in routes: + log.debug(f'url: {route["url"]}, dst: {route["dst"]}') + + return + results = [] + pbar = tqdm(total=len(routes)) + pool_process = partial(pool_process, chrome_options=chrome_options) + pool = ThreadPool(opt_threads) + with tqdm(total=len(routes)) as pbar: + results = pool.map(pool_process, routes) + pbar.close() - driver.quit() + + + + + diff --git a/megapixels/commands/datasets/ijb_screenshot_mt.py b/megapixels/commands/datasets/ijb_screenshot_mt.py deleted file mode 100644 index 616893c7..00000000 --- a/megapixels/commands/datasets/ijb_screenshot_mt.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Create screenshots for YouTube.com URLs in the IJB dataset - -TODO -- grey out boxes in sidebar -- resize driver screenshot area to include author text - -Installing webdrivers: - -Chrome -wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip - -Firefox -wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz - -PhantomJS -npm install -g phantomjs -""" - -import click - -from app.settings import app_cfg - -#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', required=True, - help='Input license data CSV') -@click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output directory') -@click.option('-t', '--threads', 'opt_threads', default=20, - help='Number of threads') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): - """IJB-C screenshot sources""" - - import sys - from glob import glob - from os.path import join - from pathlib import Path - import time - from functools import partial - from multiprocessing.dummy import Pool as ThreadPool - - import pandas as pd - import cv2 as cv - from tqdm import tqdm - - from selenium import webdriver - from selenium.webdriver.support import expected_conditions as EC - from selenium.webdriver.support.wait import WebDriverWait - from selenium.webdriver.common.by import By - - from app.utils import file_utils, im_utils, logger_utils - - log = logger_utils.Logger.getLogger() - - chrome_options = webdriver.ChromeOptions() - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--headless') - chrome_options.add_argument('--disable-dev-shm-usage') - - - def pool_process(route, chrome_options): - # Threaded image resize function - try: - pbar.update(1) - - driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_window_size(1920,1080) - - url = route['url'] - fp_out = route['dst'] - log.debug(f'url: {url}, dst: {fp_out}') - driver.get(url) - - if 'youtube.com' in url: - try: - wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) - except Exception as e: - log.debug(f'error: {e}') - pass - else: - wait = WebDriverWait(driver,10) - time.sleep(1) # wait for element - - time.sleep(10) # wait for element - file_utils.mkdirs(fp_out) - log.debug(f'save to: {fp_out}') - driver.get_screenshot_as_file(fp_out) - driver.quit() - - return True - except: - return False - - # load - routes = [] - df_licenses = pd.read_csv(opt_fp_in) - log.info(f'{len(df_licenses)} rows') - for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)): - filepath = df_license['Media ID'] - if not 'video/' in filepath: - continue - url = str(df_license['Media URL']) - if not ('http://' in url or 'https://' in url): - url = 'http://' + url - fp_media = filepath.replace(Path(filepath).suffix, '.png') - fp_out = join(opt_fp_out, fp_media) - obj = {'url': url, 'dst': fp_out} - routes.append(obj) - - # setup multithreading - for route in routes: - log.debug(f'url: {route["url"]}, dst: {route["dst"]}') - - return - results = [] - pbar = tqdm(total=len(routes)) - pool_process = partial(pool_process, chrome_options=chrome_options) - pool = ThreadPool(opt_threads) - with tqdm(total=len(routes)) as pbar: - results = pool.map(pool_process, routes) - pbar.close() - - - - - - - - - -#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer'))) -#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show')) -#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) -#driver.execute_script("document.getElementById('related').style.display = 'None';") - -''' -title_is -title_contains -presence_of_element_located -visibility_of_element_located -visibility_of -presence_of_all_elements_located -text_to_be_present_in_element -text_to_be_present_in_element_value -frame_to_be_available_and_switch_to_it -invisibility_of_element_located -element_to_be_clickable - it is Displayed and Enabled. -staleness_of -element_to_be_selected -element_located_to_be_selected -element_selection_state_to_be -element_located_selection_state_to_be -alert_is_present -'''
\ No newline at end of file diff --git a/megapixels/commands/datasets/template.py b/megapixels/commands/templates/basic.py index 2e952896..2e952896 100644 --- a/megapixels/commands/datasets/template.py +++ b/megapixels/commands/templates/basic.py |
