diff options
Diffstat (limited to 'megapixels/commands/datasets/ijb_screenshot.py')
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot.py | 66 |
1 files changed, 53 insertions, 13 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py index 616893c7..7778978e 100644 --- a/megapixels/commands/datasets/ijb_screenshot.py +++ b/megapixels/commands/datasets/ijb_screenshot.py @@ -29,11 +29,16 @@ from app.settings import app_cfg help='Output directory') @click.option('-t', '--threads', 'opt_threads', default=20, help='Number of threads') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--verify', 'opt_verify', is_flag=True, + help='Only verify files') @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice, opt_verify): """IJB-C screenshot sources""" import sys + import os from glob import glob from os.path import join from pathlib import Path @@ -42,7 +47,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): from multiprocessing.dummy import Pool as ThreadPool import pandas as pd - import cv2 as cv + from PIL import Image + import io from tqdm import tqdm from selenium import webdriver @@ -66,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): pbar.update(1) driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_window_size(1920,1080) + driver.set_window_size(1920,3600) # accommodate vertical videos url = route['url'] fp_out = route['dst'] @@ -75,18 +81,40 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): if 'youtube.com' in url: try: - wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + #wait = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + wait = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID,'player-container-outer'))) except Exception as e: - log.debug(f'error: {e}') + log.debug(f'WebDriver error: {e}') pass else: wait = WebDriverWait(driver,10) time.sleep(1) # wait for element - time.sleep(10) # wait for element + time.sleep(5) # wait for element + #el_vid = driver.find_element_by_id('player-container-outer') + el_shelf = driver.find_element_by_id('ticket-shelf') + el_related = driver.find_element_by_id('related') + el_primary = driver.find_element_by_id('primary') + err = False + try: + el_error = driver.find_element_by_id('error-screen') + if not(el_error.location['x'] == 0 and el_error.location['width'] == 0): + err = True + except: + pass + + margin_left = 24 + margin_bottom = 24 if err else 0 + box = (el_primary.location['x'] - margin_left, el_primary.location['y'], + el_primary.location['x'] + el_primary.size['width'], el_shelf.location['y'] + margin_bottom) + im_bytes = driver.get_screenshot_as_png() + im = Image.open(io.BytesIO(im_bytes)) + im = im.crop(box) + file_utils.mkdirs(fp_out) log.debug(f'save to: {fp_out}') - driver.get_screenshot_as_file(fp_out) + #driver.get_screenshot_as_file(fp_out) + im.save(fp_out) driver.quit() return True @@ -95,6 +123,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): # load routes = [] + video_ids = [] + df_licenses = pd.read_csv(opt_fp_in) log.info(f'{len(df_licenses)} rows') for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)): @@ -102,18 +132,28 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): if not 'video/' in filepath: continue url = str(df_license['Media URL']) + try: + video_id = url.split('?v=')[1] + except Exception as e: + log.debug(f'error parsing url: "{url}"') + if video_id in video_ids: + continue + video_ids.append(video_id) if not ('http://' in url or 'https://' in url): url = 'http://' + url - fp_media = filepath.replace(Path(filepath).suffix, '.png') - fp_out = join(opt_fp_out, fp_media) + #fp_media = filepath.replace(Path(filepath).suffix, '.png') + #fp_out = join(opt_fp_out, fp_media) + fp_out = join(opt_fp_out, f'{video_id}.png') + if Path(fp_out).exists() and (os.stat(fp_out).st_size // 1000) > 13: + continue obj = {'url': url, 'dst': fp_out} routes.append(obj) - # setup multithreading - for route in routes: - log.debug(f'url: {route["url"]}, dst: {route["dst"]}') + if opt_slice: + routes = routes[opt_slice[0]:opt_slice[1]] + log.debug(f'processing: {len(routes)}') - return + # setup multithreading results = [] pbar = tqdm(total=len(routes)) pool_process = partial(pool_process, chrome_options=chrome_options) |
