summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/ijb_screenshot.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/ijb_screenshot.py')
-rw-r--r-- megapixels/commands/datasets/ijb_screenshot.py 111
1 files changed, 81 insertions, 30 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py
index e6940d88..616893c7 100644
--- a/megapixels/commands/datasets/ijb_screenshot.py
+++ b/megapixels/commands/datasets/ijb_screenshot.py
@@ -1,9 +1,20 @@
-# Chrome
-# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
-# Firefox
-# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
-# PhantomJS
-# npm install -g phantomjs
+"""Create screenshots for YouTube.com URLs in the IJB dataset
+
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
+"""
import click
@@ -16,8 +27,10 @@ from app.settings import app_cfg
help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=20,
+ help='Number of threads')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
"""IJB-C screenshot sources"""
import sys
@@ -25,6 +38,8 @@ def cli(ctx, opt_fp_in, opt_fp_out):
from os.path import join
from pathlib import Path
import time
+ from functools import partial
+ from multiprocessing.dummy import Pool as ThreadPool
import pandas as pd
import cv2 as cv
@@ -43,39 +58,75 @@ def cli(ctx, opt_fp_in, opt_fp_out):
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
- driver = webdriver.Chrome(chrome_options=chrome_options)
- driver.set_window_size(1920,1080)
-
+
+
+ def pool_process(route, chrome_options):
+ # Threaded screenshot worker: loads route['url'] in Chrome and saves a screenshot to route['dst']
+ try:
+ pbar.update(1)
+
+ driver = webdriver.Chrome(chrome_options=chrome_options)
+ driver.set_window_size(1920,1080)
+
+ url = route['url']
+ fp_out = route['dst']
+ log.debug(f'url: {url}, dst: {fp_out}')
+ driver.get(url)
+
+ if 'youtube.com' in url:
+ try:
+ wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+ except Exception as e:
+ log.debug(f'error: {e}')
+ pass
+ else:
+ wait = WebDriverWait(driver,10)
+ time.sleep(1) # wait for element
+
+ time.sleep(10) # wait for element
+ file_utils.mkdirs(fp_out)
+ log.debug(f'save to: {fp_out}')
+ driver.get_screenshot_as_file(fp_out)
+ driver.quit()
+
+ return True
+ except:
+ return False
+
+ # load the license CSV and build the list of {'url', 'dst'} routes for 'video/' entries
+ routes = []
df_licenses = pd.read_csv(opt_fp_in)
log.info(f'{len(df_licenses)} rows')
-
for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
filepath = df_license['Media ID']
- if 'frames/' in filepath or 'img/' in filepath:
+ if not 'video/' in filepath:
continue
- url = df_license['Media URL']
+ url = str(df_license['Media URL'])
if not ('http://' in url or 'https://' in url):
url = 'http://' + url
- log.debug(f'getting: {url}')
- driver.get(url)
- if 'youtube.com' in url:
- try:
- wait = WebDriverWait(driver,3).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
- time.sleep(1) # wait for element
- except Exception as e:
- log.debug(f'error: {e}')
- pass
- else:
- wait = WebDriverWait(driver,5)
- time.sleep(1) # wait for element
-
fp_media = filepath.replace(Path(filepath).suffix, '.png')
fp_out = join(opt_fp_out, fp_media)
- file_utils.mkdirs(fp_out)
- log.debug(f'save to: {fp_out}')
- driver.get_screenshot_as_file(fp_out)
+ obj = {'url': url, 'dst': fp_out}
+ routes.append(obj)
+
+ # setup multithreading
+ for route in routes:
+ log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
+
+ return
+ results = []
+ pbar = tqdm(total=len(routes))
+ pool_process = partial(pool_process, chrome_options=chrome_options)
+ pool = ThreadPool(opt_threads)
+ with tqdm(total=len(routes)) as pbar:
+ results = pool.map(pool_process, routes)
+ pbar.close()
- driver.quit()
+
+
+
+
+