1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
# WebDriver install notes (commands to fetch a driver binary, per browser):
# Chrome:
#   wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
# Firefox:
#   wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
# PhantomJS:
#   npm install -g phantomjs
import click
from app.settings import app_cfg
#/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input license data CSV')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """IJB-C screenshot sources"""
    # NOTE: the docstring above is emitted by click as the command's --help
    # text, so it is intentionally left unchanged; details live in comments.
    #
    # Reads the CSV at `opt_fp_in`, and for each row loads the page named in
    # its 'Media URL' column in headless Chrome and writes a PNG screenshot
    # under `opt_fp_out`, at the row's 'Media ID' path with the extension
    # replaced by '.png'. Rows whose 'Media ID' contains 'frames/' or 'img/'
    # are skipped. `ctx` is accepted for click.pass_context and is not used.
    from os.path import join, splitext
    import time

    import pandas as pd
    from tqdm import tqdm
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.common.by import By

    from app.utils import file_utils, logger_utils

    log = logger_utils.Logger.getLogger()

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # `options=` replaces the deprecated `chrome_options=` keyword, which
    # current Selenium releases no longer accept.
    driver = webdriver.Chrome(options=chrome_options)

    # Everything after the browser is started runs under try/finally so the
    # Chrome process is shut down even when a row (or the CSV read) fails.
    try:
        driver.set_window_size(1920, 1080)

        df_licenses = pd.read_csv(opt_fp_in)
        log.info(f'{len(df_licenses)} rows')

        for _, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
            filepath = df_license['Media ID']
            url = df_license['Media URL']

            # pandas yields NaN (a float) for blank cells; the substring
            # tests below would raise TypeError on it, so skip such rows.
            if not isinstance(filepath, str) or not isinstance(url, str):
                log.debug(f'skipping row with missing Media ID/URL: {filepath!r}, {url!r}')
                continue

            if 'frames/' in filepath or 'img/' in filepath:
                continue

            # Only a leading scheme counts: a scheme-less URL that merely
            # contains "http://" in its query string still needs the prefix.
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url

            log.debug(f'getting: {url}')
            try:
                driver.get(url)
            except WebDriverException as e:
                # One unreachable page should not abort the whole batch.
                log.info(f'could not load {url}: {e}')
                continue

            if 'youtube.com' in url:
                try:
                    # Give the video info panel up to 3 seconds to appear.
                    WebDriverWait(driver, 3).until(
                        EC.visibility_of_element_located(
                            (By.CLASS_NAME, 'ytd-video-secondary-info-renderer')))
                    time.sleep(1)  # wait for element
                except WebDriverException as e:
                    # Best effort: screenshot whatever has loaded (this also
                    # covers the TimeoutException raised by the wait).
                    log.debug(f'error: {e}')
            else:
                time.sleep(1)  # wait for element

            # Swap only the trailing extension. str.replace() on the suffix
            # would also rewrite earlier occurrences of it in the path and
            # mangles the whole string when there is no extension.
            fp_media = splitext(filepath)[0] + '.png'
            fp_out = join(opt_fp_out, fp_media)
            file_utils.mkdirs(fp_out)
            log.debug(f'save to: {fp_out}')
            # get_screenshot_as_file reports write failures by returning
            # False rather than raising.
            if not driver.get_screenshot_as_file(fp_out):
                log.info(f'could not save screenshot: {fp_out}')
    finally:
        driver.quit()
#wait = WebDriverWait(driver,3).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-watch-next-secondary-results-renderer')))
#wait = WebDriverWait(driver,3).until(EC.text_to_be_present_in_element_value((By.CLASS_NAME,'yt-next-continuation'), 'show'))
#wait = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
#driver.execute_script("document.getElementById('related').style.display = 'None';")
'''
title_is
title_contains
presence_of_element_located
visibility_of_element_located
visibility_of
presence_of_all_elements_located
text_to_be_present_in_element
text_to_be_present_in_element_value
frame_to_be_available_and_switch_to_it
invisibility_of_element_located
element_to_be_clickable - it is Displayed and Enabled.
staleness_of
element_to_be_selected
element_located_to_be_selected
element_selection_state_to_be
element_located_selection_state_to_be
alert_is_present
'''
|