# path: megapixels/commands/datasets/ijb_screenshot.py (blob 616893c7394c75b877bf2f24dd955e7878b1bb27)
"""Create screenshots for YouTube.com URLs in the IJB dataset

TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text

Installing webdrivers:

Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip

Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz

PhantomJS
npm install -g phantomjs
"""

import click

from app.settings import app_cfg

# Example input CSV (presumably): /data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output directory')
@click.option('-t', '--threads', 'opt_threads', default=20,
  help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
  """IJB-C screenshot sources"""
  # NOTE: the docstring above doubles as the click --help text, so the longer
  # description is kept in comments.
  #
  # Reads the CSV at opt_fp_in, keeps the rows whose 'Media ID' contains
  # 'video/', loads each row's 'Media URL' in headless Chrome and saves a PNG
  # screenshot to opt_fp_out/<Media ID with its suffix replaced by .png>.
  # opt_threads is the maximum number of concurrent browser sessions.
  # ctx is accepted because of @click.pass_context but is not used here.

  # imports are kept function-local so that loading the command stays cheap
  from os.path import join, splitext
  import time
  from functools import partial
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm

  from selenium import webdriver
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.support.wait import WebDriverWait
  from selenium.webdriver.common.by import By

  from app.utils import file_utils, logger_utils

  log = logger_utils.Logger.getLogger()

  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--no-sandbox')
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--disable-dev-shm-usage')


  def pool_process(route, chrome_options):
    # Threaded worker: screenshot route['url'] into route['dst'].
    # Returns True when the screenshot was written, False otherwise.
    url = route['url']
    fp_out = route['dst']
    driver = None
    try:
      # `options=` is the supported keyword; `chrome_options=` is deprecated
      driver = webdriver.Chrome(options=chrome_options)
      driver.set_window_size(1920, 1080)
      log.debug(f'url: {url}, dst: {fp_out}')
      driver.get(url)

      if 'youtube.com' in url:
        try:
          WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
            (By.CLASS_NAME, 'ytd-video-secondary-info-renderer')))
        except Exception as e:
          # best effort: a timeout still leaves a page worth capturing
          log.debug(f'error: {e}')
      else:
        time.sleep(1)  # wait for element

      time.sleep(10)  # fixed settle time before capturing
      file_utils.mkdirs(fp_out)
      log.debug(f'save to: {fp_out}')
      # get_screenshot_as_file reports write failures through its return value
      return bool(driver.get_screenshot_as_file(fp_out))
    except Exception as e:
      log.error(f'failed: {url}, error: {e}')
      return False
    finally:
      # always release the browser, including on failure
      if driver is not None:
        try:
          driver.quit()
        except Exception as e:
          log.debug(f'error: {e}')
      pbar.update(1)

  # load
  routes = []
  df_licenses = pd.read_csv(opt_fp_in)
  log.info(f'{len(df_licenses)} rows')
  for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
    # str() guards against NaN cells, which pandas yields as floats
    filepath = str(df_license['Media ID'])
    if 'video/' not in filepath:
      continue
    media_url = df_license['Media URL']
    if pd.isna(media_url):
      log.debug(f'no url for: {filepath}')
      continue
    url = str(media_url)
    if not url.startswith(('http://', 'https://')):
      url = 'http://' + url
    # swap only the trailing extension; str.replace() on the suffix would hit
    # every occurrence and mangles the path when there is no suffix at all
    fp_media = splitext(filepath)[0] + '.png'
    fp_out = join(opt_fp_out, fp_media)
    routes.append({'url': url, 'dst': fp_out})

  for route in routes:
    log.debug(f'url: {route["url"]}, dst: {route["dst"]}')

  if not routes:
    log.info('0 routes')
    return

  # setup multithreading
  worker = partial(pool_process, chrome_options=chrome_options)
  # never start more threads than there is work; a pool needs at least one
  num_threads = max(1, min(int(opt_threads), len(routes)))
  with tqdm(total=len(routes)) as pbar:
    with ThreadPool(num_threads) as pool:
      results = pool.map(worker, routes)

  log.info(f'{sum(results)} of {len(routes)} screenshots saved')





  



# Disabled alternative wait / page-tweak calls, kept for reference
# (presumably tried during development - confirm before removing):
#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
#driver.execute_script("document.getElementById('related').style.display = 'None';")

# The bare string below is never assigned or used; it is a reference list of
# names that look like selenium expected_conditions helpers.
'''
title_is
title_contains
presence_of_element_located
visibility_of_element_located
visibility_of
presence_of_all_elements_located
text_to_be_present_in_element
text_to_be_present_in_element_value
frame_to_be_available_and_switch_to_it
invisibility_of_element_located
element_to_be_clickable - it is Displayed and Enabled.
staleness_of
element_to_be_selected
element_located_to_be_selected
element_selection_state_to_be
element_located_selection_state_to_be
alert_is_present
'''