summary refs log tree commit diff
path: root/megapixels/commands/datasets/ijb_screenshot.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/ijb_screenshot.py')
-rw-r--r-- megapixels/commands/datasets/ijb_screenshot.py 66
1 files changed, 53 insertions, 13 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py
index 616893c7..7778978e 100644
--- a/megapixels/commands/datasets/ijb_screenshot.py
+++ b/megapixels/commands/datasets/ijb_screenshot.py
@@ -29,11 +29,16 @@ from app.settings import app_cfg
help='Output directory')
@click.option('-t', '--threads', 'opt_threads', default=20,
help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('--verify', 'opt_verify', is_flag=True,
+ help='Only verify files')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice, opt_verify):
"""IJB-C screenshot sources"""
import sys
+ import os
from glob import glob
from os.path import join
from pathlib import Path
@@ -42,7 +47,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
from multiprocessing.dummy import Pool as ThreadPool
import pandas as pd
- import cv2 as cv
+ from PIL import Image
+ import io
from tqdm import tqdm
from selenium import webdriver
@@ -66,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
pbar.update(1)
driver = webdriver.Chrome(chrome_options=chrome_options)
- driver.set_window_size(1920,1080)
+ driver.set_window_size(1920,3600) # accommodate vertical videos
url = route['url']
fp_out = route['dst']
@@ -75,18 +81,40 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
if 'youtube.com' in url:
try:
- wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+ #wait = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+ wait = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID,'player-container-outer')))
except Exception as e:
- log.debug(f'error: {e}')
+ log.debug(f'WebDriver error: {e}')
pass
else:
wait = WebDriverWait(driver,10)
time.sleep(1) # wait for element
- time.sleep(10) # wait for element
+ time.sleep(5) # wait for element
+ #el_vid = driver.find_element_by_id('player-container-outer')
+ el_shelf = driver.find_element_by_id('ticket-shelf')
+ el_related = driver.find_element_by_id('related')
+ el_primary = driver.find_element_by_id('primary')
+ err = False
+ try:
+ el_error = driver.find_element_by_id('error-screen')
+ if not(el_error.location['x'] == 0 and el_error.location['width'] == 0):
+ err = True
+ except:
+ pass
+
+ margin_left = 24
+ margin_bottom = 24 if err else 0
+ box = (el_primary.location['x'] - margin_left, el_primary.location['y'],
+ el_primary.location['x'] + el_primary.size['width'], el_shelf.location['y'] + margin_bottom)
+ im_bytes = driver.get_screenshot_as_png()
+ im = Image.open(io.BytesIO(im_bytes))
+ im = im.crop(box)
+
file_utils.mkdirs(fp_out)
log.debug(f'save to: {fp_out}')
- driver.get_screenshot_as_file(fp_out)
+ #driver.get_screenshot_as_file(fp_out)
+ im.save(fp_out)
driver.quit()
return True
@@ -95,6 +123,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
# load
routes = []
+ video_ids = []
+
df_licenses = pd.read_csv(opt_fp_in)
log.info(f'{len(df_licenses)} rows')
for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
@@ -102,18 +132,28 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
if not 'video/' in filepath:
continue
url = str(df_license['Media URL'])
+ try:
+ video_id = url.split('?v=')[1]
+ except Exception as e:
+ log.debug(f'error parsing url: "{url}"')
+ if video_id in video_ids:
+ continue
+ video_ids.append(video_id)
if not ('http://' in url or 'https://' in url):
url = 'http://' + url
- fp_media = filepath.replace(Path(filepath).suffix, '.png')
- fp_out = join(opt_fp_out, fp_media)
+ #fp_media = filepath.replace(Path(filepath).suffix, '.png')
+ #fp_out = join(opt_fp_out, fp_media)
+ fp_out = join(opt_fp_out, f'{video_id}.png')
+ if Path(fp_out).exists() and (os.stat(fp_out).st_size // 1000) > 13:
+ continue
obj = {'url': url, 'dst': fp_out}
routes.append(obj)
- # setup multithreading
- for route in routes:
- log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
+ if opt_slice:
+ routes = routes[opt_slice[0]:opt_slice[1]]
+ log.debug(f'processing: {len(routes)}')
- return
+ # setup multithreading
results = []
pbar = tqdm(total=len(routes))
pool_process = partial(pool_process, chrome_options=chrome_options)