summaryrefslogtreecommitdiff
path: root/megapixels/commands
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands')
-rw-r--r--megapixels/commands/datasets/ijb_screenshot.py111
-rw-r--r--megapixels/commands/datasets/ijb_screenshot_mt.py156
-rw-r--r--megapixels/commands/templates/basic.py (renamed from megapixels/commands/datasets/template.py)0
3 files changed, 81 insertions, 186 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py
index e6940d88..616893c7 100644
--- a/megapixels/commands/datasets/ijb_screenshot.py
+++ b/megapixels/commands/datasets/ijb_screenshot.py
@@ -1,9 +1,20 @@
-# Chrome
-# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
-# Firefox
-# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
-# PhantomJS
-# npm install -g phantomjs
+"""Create screenshots for YouTube.com URLs in the IJB dataset
+
+TODO
+- grey out boxes in sidebar
+- resize driver screenshot area to include author text
+
+Installing webdrivers:
+
+Chrome
+wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
+
+Firefox
+wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+
+PhantomJS
+npm install -g phantomjs
+"""
import click
@@ -16,8 +27,10 @@ from app.settings import app_cfg
help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=20,
+ help='Number of threads')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
"""IJB-C screenshot sources"""
import sys
@@ -25,6 +38,8 @@ def cli(ctx, opt_fp_in, opt_fp_out):
from os.path import join
from pathlib import Path
import time
+ from functools import partial
+ from multiprocessing.dummy import Pool as ThreadPool
import pandas as pd
import cv2 as cv
@@ -43,39 +58,75 @@ def cli(ctx, opt_fp_in, opt_fp_out):
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
- driver = webdriver.Chrome(chrome_options=chrome_options)
- driver.set_window_size(1920,1080)
-
+
+
+ def pool_process(route, chrome_options):
+ # Threaded image resize function
+ try:
+ pbar.update(1)
+
+ driver = webdriver.Chrome(chrome_options=chrome_options)
+ driver.set_window_size(1920,1080)
+
+ url = route['url']
+ fp_out = route['dst']
+ log.debug(f'url: {url}, dst: {fp_out}')
+ driver.get(url)
+
+ if 'youtube.com' in url:
+ try:
+ wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+ except Exception as e:
+ log.debug(f'error: {e}')
+ pass
+ else:
+ wait = WebDriverWait(driver,10)
+ time.sleep(1) # wait for element
+
+ time.sleep(10) # wait for element
+ file_utils.mkdirs(fp_out)
+ log.debug(f'save to: {fp_out}')
+ driver.get_screenshot_as_file(fp_out)
+ driver.quit()
+
+ return True
+ except:
+ return False
+
+ # load
+ routes = []
df_licenses = pd.read_csv(opt_fp_in)
log.info(f'{len(df_licenses)} rows')
-
for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
filepath = df_license['Media ID']
- if 'frames/' in filepath or 'img/' in filepath:
+ if not 'video/' in filepath:
continue
- url = df_license['Media URL']
+ url = str(df_license['Media URL'])
if not ('http://' in url or 'https://' in url):
url = 'http://' + url
- log.debug(f'getting: {url}')
- driver.get(url)
- if 'youtube.com' in url:
- try:
- wait = WebDriverWait(driver,3).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
- time.sleep(1) # wait for element
- except Exception as e:
- log.debug(f'error: {e}')
- pass
- else:
- wait = WebDriverWait(driver,5)
- time.sleep(1) # wait for element
-
fp_media = filepath.replace(Path(filepath).suffix, '.png')
fp_out = join(opt_fp_out, fp_media)
- file_utils.mkdirs(fp_out)
- log.debug(f'save to: {fp_out}')
- driver.get_screenshot_as_file(fp_out)
+ obj = {'url': url, 'dst': fp_out}
+ routes.append(obj)
+
+ # setup multithreading
+ for route in routes:
+ log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
+
+ return
+ results = []
+ pbar = tqdm(total=len(routes))
+ pool_process = partial(pool_process, chrome_options=chrome_options)
+ pool = ThreadPool(opt_threads)
+ with tqdm(total=len(routes)) as pbar:
+ results = pool.map(pool_process, routes)
+ pbar.close()
- driver.quit()
+
+
+
+
+
diff --git a/megapixels/commands/datasets/ijb_screenshot_mt.py b/megapixels/commands/datasets/ijb_screenshot_mt.py
deleted file mode 100644
index 616893c7..00000000
--- a/megapixels/commands/datasets/ijb_screenshot_mt.py
+++ /dev/null
@@ -1,156 +0,0 @@
-"""Create screenshots for YouTube.com URLs in the IJB dataset
-
-TODO
-- grey out boxes in sidebar
-- resize driver screenshot area to include author text
-
-Installing webdrivers:
-
-Chrome
-wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
-
-Firefox
-wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
-
-PhantomJS
-npm install -g phantomjs
-"""
-
-import click
-
-from app.settings import app_cfg
-
-#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
- help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out', required=True,
- help='Output directory')
-@click.option('-t', '--threads', 'opt_threads', default=20,
- help='Number of threads')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
- """IJB-C screenshot sources"""
-
- import sys
- from glob import glob
- from os.path import join
- from pathlib import Path
- import time
- from functools import partial
- from multiprocessing.dummy import Pool as ThreadPool
-
- import pandas as pd
- import cv2 as cv
- from tqdm import tqdm
-
- from selenium import webdriver
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
-
- from app.utils import file_utils, im_utils, logger_utils
-
- log = logger_utils.Logger.getLogger()
-
- chrome_options = webdriver.ChromeOptions()
- chrome_options.add_argument('--no-sandbox')
- chrome_options.add_argument('--headless')
- chrome_options.add_argument('--disable-dev-shm-usage')
-
-
- def pool_process(route, chrome_options):
- # Threaded image resize function
- try:
- pbar.update(1)
-
- driver = webdriver.Chrome(chrome_options=chrome_options)
- driver.set_window_size(1920,1080)
-
- url = route['url']
- fp_out = route['dst']
- log.debug(f'url: {url}, dst: {fp_out}')
- driver.get(url)
-
- if 'youtube.com' in url:
- try:
- wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
- except Exception as e:
- log.debug(f'error: {e}')
- pass
- else:
- wait = WebDriverWait(driver,10)
- time.sleep(1) # wait for element
-
- time.sleep(10) # wait for element
- file_utils.mkdirs(fp_out)
- log.debug(f'save to: {fp_out}')
- driver.get_screenshot_as_file(fp_out)
- driver.quit()
-
- return True
- except:
- return False
-
- # load
- routes = []
- df_licenses = pd.read_csv(opt_fp_in)
- log.info(f'{len(df_licenses)} rows')
- for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
- filepath = df_license['Media ID']
- if not 'video/' in filepath:
- continue
- url = str(df_license['Media URL'])
- if not ('http://' in url or 'https://' in url):
- url = 'http://' + url
- fp_media = filepath.replace(Path(filepath).suffix, '.png')
- fp_out = join(opt_fp_out, fp_media)
- obj = {'url': url, 'dst': fp_out}
- routes.append(obj)
-
- # setup multithreading
- for route in routes:
- log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
-
- return
- results = []
- pbar = tqdm(total=len(routes))
- pool_process = partial(pool_process, chrome_options=chrome_options)
- pool = ThreadPool(opt_threads)
- with tqdm(total=len(routes)) as pbar:
- results = pool.map(pool_process, routes)
- pbar.close()
-
-
-
-
-
-
-
-
-
-#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
-#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
-#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
-#driver.execute_script("document.getElementById('related').style.display = 'None';")
-
-'''
-title_is
-title_contains
-presence_of_element_located
-visibility_of_element_located
-visibility_of
-presence_of_all_elements_located
-text_to_be_present_in_element
-text_to_be_present_in_element_value
-frame_to_be_available_and_switch_to_it
-invisibility_of_element_located
-element_to_be_clickable - it is Displayed and Enabled.
-staleness_of
-element_to_be_selected
-element_located_to_be_selected
-element_selection_state_to_be
-element_located_selection_state_to_be
-alert_is_present
-''' \ No newline at end of file
diff --git a/megapixels/commands/datasets/template.py b/megapixels/commands/templates/basic.py
index 2e952896..2e952896 100644
--- a/megapixels/commands/datasets/template.py
+++ b/megapixels/commands/templates/basic.py