# cgit navigation: summary | refs | log | tree | commit | diff
# path: root/megapixels/commands/datasets/ijb_screenshot.py
# blob: e6940d88e7dd81cfce5e81a9628bae74a6ddaaec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Chrome
# wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
# Firefox
# wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
# PhantomJS
# npm install -g phantomjs

import click

from app.settings import app_cfg

#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output directory')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """IJB-C screenshot sources

  Reads the license CSV, opens each row's 'Media URL' in headless Chrome and
  saves a PNG screenshot below the output directory. The output path mirrors
  the row's 'Media ID' with its file extension swapped for '.png'. Rows whose
  'Media ID' contains 'frames/' or 'img/' are skipped, as are rows with a
  missing 'Media ID' or 'Media URL' and rows whose URL cannot be loaded.
  """

  from os.path import join
  from pathlib import Path
  import time

  import pandas as pd
  from tqdm import tqdm

  from selenium import webdriver
  from selenium.common.exceptions import WebDriverException
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.support.wait import WebDriverWait
  from selenium.webdriver.common.by import By

  from app.utils import file_utils, logger_utils

  log = logger_utils.Logger.getLogger()

  # Read the CSV before launching Chrome so a bad input path cannot leave a
  # browser process behind.
  df_licenses = pd.read_csv(opt_fp_in)
  log.info(f'{len(df_licenses)} rows')

  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--no-sandbox')
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--disable-dev-shm-usage')
  # `options=` replaces the deprecated `chrome_options=` keyword
  driver = webdriver.Chrome(options=chrome_options)

  try:
    driver.set_window_size(1920, 1080)

    for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
      filepath = df_license['Media ID']
      # empty CSV cells are read by pandas as NaN (a float), not as a string
      if not isinstance(filepath, str) or not filepath.strip():
        log.info(f'skipping row {df_idx}: missing Media ID')
        continue
      if 'frames/' in filepath or 'img/' in filepath:
        continue

      url = df_license['Media URL']
      if not isinstance(url, str) or not url.strip():
        log.info(f'skipping row {df_idx}: missing Media URL')
        continue
      url = url.strip()
      # Only a leading scheme counts; 'http://' inside a query string must not
      # prevent the prefix from being added.
      if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

      log.debug(f'getting: {url}')
      try:
        driver.get(url)
      except WebDriverException as e:
        # one unreachable URL should not abort the whole batch
        log.info(f'could not load {url}: {e}')
        continue

      if 'youtube.com' in url:
        try:
          WebDriverWait(driver, 3).until(EC.visibility_of_element_located(
            (By.CLASS_NAME, 'ytd-video-secondary-info-renderer')))
          time.sleep(1)  # wait for element
        except WebDriverException as e:
          # best effort: a timeout here still allows a screenshot to be taken
          log.debug(f'error: {e}')
      else:
        time.sleep(1)  # give the page a moment to render

      # with_suffix swaps only the final extension; str.replace would touch
      # every occurrence and misbehaves when the suffix is empty
      fp_media = str(Path(filepath).with_suffix('.png'))
      fp_out = join(opt_fp_out, fp_media)
      file_utils.mkdirs(fp_out)
      log.debug(f'save to: {fp_out}')
      # get_screenshot_as_file reports write failures via its return value
      if not driver.get_screenshot_as_file(fp_out):
        log.info(f'could not save screenshot: {fp_out}')
  finally:
    # always shut the browser down, even when a row raises
    driver.quit()



#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
#driver.execute_script("document.getElementById('related').style.display = 'None';")

'''
Selenium expected_conditions helpers (for use with WebDriverWait.until):
title_is
title_contains
presence_of_element_located
visibility_of_element_located
visibility_of
presence_of_all_elements_located
text_to_be_present_in_element
text_to_be_present_in_element_value
frame_to_be_available_and_switch_to_it
invisibility_of_element_located
element_to_be_clickable - it is Displayed and Enabled.
staleness_of
element_to_be_selected
element_located_to_be_selected
element_selection_state_to_be
element_located_selection_state_to_be
alert_is_present
'''