diff options
| author | Adam Harvey <adam@ahprojects.com> | 2019-06-03 13:15:37 +0200 |
|---|---|---|
| committer | Adam Harvey <adam@ahprojects.com> | 2019-06-03 13:15:37 +0200 |
| commit | 144c425749bbc7d092b31977439c911185eb6f33 (patch) | |
| tree | 46bbaf16adb6bb8bbc29bba92e451629858489c7 /megapixels/commands | |
| parent | 74d6c74ff7f85125a32d3710d8620893cd5e9aa7 (diff) | |
grapoh
Diffstat (limited to 'megapixels/commands')
| -rw-r--r-- | megapixels/commands/datasets/msc_flickr_embassies.py | 195 |
1 files changed, 195 insertions, 0 deletions
# diff --git a/megapixels/commands/datasets/msc_flickr_embassies.py b/megapixels/commands/datasets/msc_flickr_embassies.py
# new file mode 100644
# index 00000000..945f037c
"""Create screenshots for YouTube.com URLs in the IJB dataset

NOTE(review): this docstring and the command help text mention the IJB
dataset, while the file name and the ``--input`` help refer to Flickr
embassies -- confirm which is intended.

TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text

Installing webdrivers:

Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip

Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz

PhantomJS
npm install -g phantomjs
"""

import click

from app.settings import app_cfg

# The search result title must contain one of these words
valid_title_words = ['embassy', 'botschaft']


def _parse_video_id(url):
    """Return the text following the first '?v=' in *url*, or None if absent."""
    parts = url.split('?v=')
    return parts[1] if len(parts) > 1 else None


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input CSV with list of embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory for PNG screenshots')
@click.option('-t', '--threads', 'opt_threads', default=20,
              help='Number of threads')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice, opt_verify=False):
    """IJB-C screenshot sources"""
    # Parameters (kept out of the docstring because click prints it as --help):
    #   opt_fp_in   -- CSV read with pandas; 'Media ID' and 'Media URL' columns are used
    #   opt_fp_out  -- directory that receives one '<video_id>.png' per unique video
    #   opt_threads -- size of the thread pool (one headless Chrome per task)
    #   opt_slice   -- (start, stop) slice applied to the list of pending screenshots
    #   opt_verify  -- unused; no click option supplies it, so it needs a default
    #                  or every invocation fails with a TypeError

    import os
    from os.path import join
    from pathlib import Path
    import time
    from multiprocessing.dummy import Pool as ThreadPool

    import pandas as pd
    from PIL import Image
    import io
    from tqdm import tqdm

    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By

    from app.utils import file_utils, im_utils, logger_utils

    log = logger_utils.Logger.getLogger()

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')

    def pool_process(route):
        """Screenshot route['url'] into route['dst']; return True on success."""
        driver = None
        try:
            driver = webdriver.Chrome(chrome_options=chrome_options)
            driver.set_window_size(1920, 3600)  # accommodate vertical videos

            url = route['url']
            fp_out = route['dst']
            log.debug(f'url: {url}, dst: {fp_out}')
            driver.get(url)

            if 'youtube.com' in url:
                try:
                    WebDriverWait(driver, 20).until(
                        EC.visibility_of_element_located((By.ID, 'player-container-outer')))
                except WebDriverException as e:
                    # best effort: continue and try the screenshot anyway
                    log.debug(f'WebDriver error: {e}')
            else:
                time.sleep(1)  # wait for element

            time.sleep(5)  # wait for element
            el_shelf = driver.find_element_by_id('results_links_deep')
            # Looked up only for its side effect: a page without this element
            # raises here and the route is reported as failed.
            driver.find_element_by_id('related')
            el_primary = driver.find_element_by_id('primary')

            # The error screen counts as shown unless it sits at x == 0 with zero width.
            # Width lives in WebElement.size; WebElement.location only has 'x' and 'y'.
            err = False
            try:
                el_error = driver.find_element_by_id('error-screen')
                if not (el_error.location['x'] == 0 and el_error.size['width'] == 0):
                    err = True
            except WebDriverException:
                pass  # no error screen on the page

            margin_left = 24
            margin_bottom = 24 if err else 0
            box = (el_primary.location['x'] - margin_left,
                   el_primary.location['y'],
                   el_primary.location['x'] + el_primary.size['width'],
                   el_shelf.location['y'] + margin_bottom)
            im_bytes = driver.get_screenshot_as_png()
            im = Image.open(io.BytesIO(im_bytes))
            im = im.crop(box)

            file_utils.mkdirs(fp_out)
            log.debug(f'save to: {fp_out}')
            im.save(fp_out)
            return True
        except Exception as e:
            # one failed page must not abort the whole pool
            log.debug(f'screenshot failed for {route}: {e}')
            return False
        finally:
            # always release the browser, otherwise failed routes leak Chrome processes
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    log.debug(f'error closing driver: {e}')
            pbar.update(1)

    # load
    routes = []
    video_ids = set()

    df_licenses = pd.read_csv(opt_fp_in)
    log.info(f'{len(df_licenses)} rows')
    for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
        filepath = str(df_license['Media ID'])
        if 'video/' not in filepath:
            continue
        url = str(df_license['Media URL'])
        video_id = _parse_video_id(url)
        if video_id is None:
            # skip the row instead of reusing an undefined or stale video_id
            log.debug(f'error parsing url: "{url}"')
            continue
        if video_id in video_ids:
            continue
        video_ids.add(video_id)
        if not ('http://' in url or 'https://' in url):
            url = 'http://' + url
        fp_out = join(opt_fp_out, f'{video_id}.png')
        # skip screenshots that already exist and are larger than ~13 kB
        if Path(fp_out).exists() and (os.stat(fp_out).st_size // 1000) > 13:
            continue
        routes.append({'url': url, 'dst': fp_out})

    if opt_slice:
        routes = routes[opt_slice[0]:opt_slice[1]]
    log.debug(f'processing: {len(routes)}')

    # setup multithreading
    with tqdm(total=len(routes)) as pbar, ThreadPool(opt_threads) as pool:
        results = pool.map(pool_process, routes)
    log.info(f'{sum(results)} of {len(routes)} screenshots saved')


# Scratch notes: alternative waits that were tried
# wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
# wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
# wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
# driver.execute_script("document.getElementById('related').style.display = 'None';")
#
# Names listed in the original notes (selenium expected_conditions):
#   title_is
#   title_contains
#   presence_of_element_located
#   visibility_of_element_located
#   visibility_of
#   presence_of_all_elements_located
#   text_to_be_present_in_element
#   text_to_be_present_in_element_value
#   frame_to_be_available_and_switch_to_it
#   invisibility_of_element_located
#   element_to_be_clickable - it is Displayed and Enabled.
#   staleness_of
#   element_to_be_selected
#   element_located_to_be_selected
#   element_selection_state_to_be
#   element_located_selection_state_to_be
#   alert_is_present
\ No newline at end of file |
