3 files changed, 81 insertions, 186 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py
index e6940d88..616893c7 100644
--- a/megapixels/commands/datasets/ijb_screenshot.py
+++ b/megapixels/commands/datasets/ijb_screenshot.py
@@ -1,9 +1,20 @@
-# Chrome
-# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
-# Firefox
-# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
-# PhantomJS
-# npm install -g phantomjs
+"""Create screenshots for YouTube.com URLs in the IJB dataset
+
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
+"""
 
 import click
 
@@ -16,8 +27,10 @@ from app.settings import app_cfg
   help='Input license data CSV')
 @click.option('-o', '--output', 'opt_fp_out', required=True,
   help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=20,
+  help='Number of threads')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
   """IJB-C screenshot sources"""
   
   import sys
@@ -25,6 +38,8 @@ def cli(ctx, opt_fp_in, opt_fp_out):
   from os.path import join
   from pathlib import Path
   import time
+  from functools import partial
+  from multiprocessing.dummy import Pool as ThreadPool
 
   import pandas as pd
   import cv2 as cv
@@ -43,39 +58,75 @@ def cli(ctx, opt_fp_in, opt_fp_out):
   chrome_options.add_argument('--no-sandbox')
   chrome_options.add_argument('--headless')
   chrome_options.add_argument('--disable-dev-shm-usage')
-  driver = webdriver.Chrome(chrome_options=chrome_options)
-  driver.set_window_size(1920,1080)
-  
+
+
+  def pool_process(route, chrome_options):
+    """Capture a screenshot of route['url'] and save it to route['dst'].
+
+    Called from worker threads; every call starts and quits its own Chrome
+    driver. Returns True if the screenshot file was written, else False.
+    """
+    driver = None
+    try:
+      url = route['url']
+      fp_out = route['dst']
+      log.debug(f'url: {url}, dst: {fp_out}')
+      driver = webdriver.Chrome(chrome_options=chrome_options)
+      driver.set_window_size(1920,1080)
+      driver.get(url)
+      if 'youtube.com' in url:
+        try:
+          WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+        except Exception as e:
+          log.debug(f'error: {e}')  # best effort: capture whatever rendered
+      else:
+        time.sleep(1)  # no known element to wait for; short fixed pause
+      time.sleep(10)  # fixed settle time before capturing the page
+      file_utils.mkdirs(fp_out)
+      log.debug(f'save to: {fp_out}')
+      return bool(driver.get_screenshot_as_file(fp_out))
+    except Exception as e:
+      log.info(f'screenshot failed for {route}: {e}')
+      return False
+    finally:
+      if driver is not None:
+        driver.quit()  # always release the browser, even after a failure
+      pbar.update(1)  # count the route only once its work has finished
+
+  # load
+  routes = []
   df_licenses = pd.read_csv(opt_fp_in)
   log.info(f'{len(df_licenses)} rows')
-
   for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
     filepath = df_license['Media ID']
-    if 'frames/' in filepath or 'img/' in filepath:
+    if not 'video/' in filepath:
       continue
-    url = df_license['Media URL']
+    url = str(df_license['Media URL'])
     if not ('http://' in url or 'https://' in url):
       url = 'http://' + url 
-    log.debug(f'getting: {url}')
-    driver.get(url)
-    if 'youtube.com' in url:
-      try:
-        wait = WebDriverWait(driver,3).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
-        time.sleep(1)  # wait for element
-      except Exception as e:
-        log.debug(f'error: {e}')
-        pass
-    else:
-      wait = WebDriverWait(driver,5)
-      time.sleep(1)  # wait for element
-
     fp_media = filepath.replace(Path(filepath).suffix, '.png')
     fp_out = join(opt_fp_out, fp_media)
-    file_utils.mkdirs(fp_out)
-    log.debug(f'save to: {fp_out}')
-    driver.get_screenshot_as_file(fp_out)
+    obj = {'url': url, 'dst': fp_out}
+    routes.append(obj)
+  
+  # log the planned work, then capture the screenshots on a thread pool
+  for route in routes:
+    log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
+
+  # bind the shared Chrome options so pool.map only has to supply the route
+  pool_fn = partial(pool_process, chrome_options=chrome_options)
+  # pool_process advances this bar through its closure over pbar
+  with tqdm(total=len(routes)) as pbar, ThreadPool(opt_threads) as pool:
+    results = pool.map(pool_fn, routes)
+  pool.join()  # the with-block has terminated the pool; wait for workers to exit
+  n_ok = sum(1 for ok in results if ok)
+  log.info(f'{n_ok} of {len(routes)} screenshots saved')
 
-  driver.quit()
+
+
+
+
+  
 
 
 
diff --git a/megapixels/commands/datasets/ijb_screenshot_mt.py b/megapixels/commands/datasets/ijb_screenshot_mt.py
deleted file mode 100644
index 616893c7..00000000
--- a/megapixels/commands/datasets/ijb_screenshot_mt.py
+++ /dev/null
@@ -1,156 +0,0 @@
-"""Create screenshots for YouTube.com URLs in the IJB dataset
-
-TODO
-- grey out boxes in sidebar
-- resize driver screenshot area to include author text
-
-Installing webdrivers:
-
-Chrome
-wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
-
-Firefox
-wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
-
-PhantomJS
-npm install -g phantomjs
-"""
-
-import click
-
-from app.settings import app_cfg
-
-#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out', required=True,
-  help='Output directory')
-@click.option('-t', '--threads', 'opt_threads', default=20,
-  help='Number of threads')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
-  """IJB-C screenshot sources"""
-  
-  import sys
-  from glob import glob
-  from os.path import join
-  from pathlib import Path
-  import time
-  from functools import partial
-  from multiprocessing.dummy import Pool as ThreadPool
-
-  import pandas as pd
-  import cv2 as cv
-  from tqdm import tqdm
-
-  from selenium import webdriver
-  from selenium.webdriver.support import expected_conditions as EC
-  from selenium.webdriver.support.wait import WebDriverWait
-  from selenium.webdriver.common.by import By 
-
-  from app.utils import file_utils, im_utils, logger_utils
-
-  log = logger_utils.Logger.getLogger()
-
-  chrome_options = webdriver.ChromeOptions()
-  chrome_options.add_argument('--no-sandbox')
-  chrome_options.add_argument('--headless')
-  chrome_options.add_argument('--disable-dev-shm-usage')
-
-
-  def pool_process(route, chrome_options):
-    # Threaded image resize function
-    try:
-      pbar.update(1)
-      
-      driver = webdriver.Chrome(chrome_options=chrome_options)
-      driver.set_window_size(1920,1080)
-      
-      url = route['url']
-      fp_out = route['dst']
-      log.debug(f'url: {url}, dst: {fp_out}')
-      driver.get(url)
-
-      if 'youtube.com' in url:
-        try:
-          wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
-        except Exception as e:
-          log.debug(f'error: {e}')
-          pass
-      else:
-        wait = WebDriverWait(driver,10)
-        time.sleep(1)  # wait for element
-
-      time.sleep(10)  # wait for element
-      file_utils.mkdirs(fp_out)
-      log.debug(f'save to: {fp_out}')
-      driver.get_screenshot_as_file(fp_out)
-      driver.quit()
-
-      return True
-    except:
-      return False
-
-  # load
-  routes = []
-  df_licenses = pd.read_csv(opt_fp_in)
-  log.info(f'{len(df_licenses)} rows')
-  for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
-    filepath = df_license['Media ID']
-    if not 'video/' in filepath:
-      continue
-    url = str(df_license['Media URL'])
-    if not ('http://' in url or 'https://' in url):
-      url = 'http://' + url 
-    fp_media = filepath.replace(Path(filepath).suffix, '.png')
-    fp_out = join(opt_fp_out, fp_media)
-    obj = {'url': url, 'dst': fp_out}
-    routes.append(obj)
-  
-  # setup multithreading
-  for route in routes:
-    log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
-
-  return
-  results = []
-  pbar = tqdm(total=len(routes))
-  pool_process = partial(pool_process, chrome_options=chrome_options)
-  pool = ThreadPool(opt_threads) 
-  with tqdm(total=len(routes)) as pbar:
-    results = pool.map(pool_process, routes)
-  pbar.close()
-
-
-
-
-
-  
-
-
-
-#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
-#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
-#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
-#driver.execute_script("document.getElementById('related').style.display = 'None';")
-
-'''
-title_is
-title_contains
-presence_of_element_located
-visibility_of_element_located
-visibility_of
-presence_of_all_elements_located
-text_to_be_present_in_element
-text_to_be_present_in_element_value
-frame_to_be_available_and_switch_to_it
-invisibility_of_element_located
-element_to_be_clickable - it is Displayed and Enabled.
-staleness_of
-element_to_be_selected
-element_located_to_be_selected
-element_selection_state_to_be
-element_located_selection_state_to_be
-alert_is_present
-'''
-\ No newline at end of file
diff --git a/megapixels/commands/datasets/template.py b/megapixels/commands/templates/basic.py
index 2e952896..2e952896 100644
--- a/megapixels/commands/datasets/template.py
+++ b/megapixels/commands/templates/basic.py