"""Create screenshots for YouTube.com URLs in the IJB dataset
TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text
Installing webdrivers:
Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
PhantomJS
npm install -g phantomjs
"""
import click
from app.settings import app_cfg
fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
fps_default_in = [fp_default_in_a, fp_default_in_b]
fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv'
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
  help='Input license data CSV (repeatable)')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out,
  help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
  """IJB-C screenshot sources"""

  import sys
  from glob import glob
  from os.path import join
  from pathlib import Path
  import time
  from functools import partial
  from multiprocessing.dummy import Pool as ThreadPool
  import urllib.request

  import lxml  # imported explicitly to fail early if the parser used below is missing
  from bs4 import BeautifulSoup
  import pandas as pd
  import cv2 as cv
  from tqdm import tqdm

  from app.utils import file_utils, im_utils, logger_utils

  log = logger_utils.Logger.getLogger()
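
  # Each entry maps a <meta> attribute name to (attribute value to match,
  # output column name). For example, {'itemprop': ('duration', 'yt_duration')}
  # matches a tag like <meta itemprop="duration" content="PT1M30S"> on the
  # (legacy) YouTube watch page and stores 'PT1M30S' under 'yt_duration'.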
  metavars = [
    {'name': ('title', 'yt_title')},
    {'name': ('description', 'yt_description')},
    {'name': ('keywords', 'yt_keywords')},
    {'itemprop': ('paid', 'yt_paid')},
    {'itemprop': ('videoId', 'yt_video_id')},
    {'itemprop': ('duration', 'yt_duration')},
    {'itemprop': ('width', 'yt_width')},
    {'itemprop': ('height', 'yt_height')},
    {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')},
    {'itemprop': ('interactionCount', 'yt_views')},
    {'itemprop': ('datePublished', 'yt_date_published')},
    {'itemprop': ('genre', 'yt_genre')},
    {'itemprop': ('unlisted', 'yt_unlisted')},
  ]
  def pool_process(media_item):
    """Threaded worker: fetch one YouTube watch page and attach parsed metadata"""
    try:
      url = media_item['ijb_media_url'].strip()
      # normalize to https without the www prefix
      url = url.replace('http:', 'https:')
      url = url.replace('www.youtube', 'youtube')
      data = urllib.request.urlopen(url, timeout=30).read()
      soup = BeautifulSoup(data, 'lxml')
      for metavar in metavars:
        propname, propvals = list(metavar.items())[0]
        content = soup.find('meta', attrs={propname: propvals[0]})
        if content:
          media_item[propvals[1]] = content.get('content', '')
      # the description is scraped from the page body rather than a metavar
      # because the <meta> description can be truncated
      desc_result = soup.find('p', attrs={'id': 'eow-description'})
      description = desc_result.text if desc_result else ''
      if 'yt_duration' not in media_item:
        # a missing duration meta tag usually means the video has been removed
        error_result = soup.find('div', attrs={'id': 'player-unavailable'})
        description = error_result.text if error_result else 'Video unavailable'
      media_item['yt_description'] = description
      log.debug(f'url: {url}, description: {description}')
      if 'yt_duration' in media_item:
        # convert ISO 8601 duration to seconds (assumes the 'PT#M#S' form)
        duration = media_item['yt_duration']
        mins = int(duration.split('M')[0].replace('PT', ''))
        secs = int(duration.split('M')[1].replace('S', ''))
        media_item['yt_duration'] = (60 * mins) + secs
      if 'yt_paid' in media_item:
        media_item['yt_paid'] = int(media_item['yt_paid'] == 'True')
      if 'yt_is_family_friendly' in media_item:
        media_item['yt_is_family_friendly'] = int(media_item['yt_is_family_friendly'] == 'True')
    except Exception as e:
      log.debug(f'Error: {e}, {media_item["ijb_media_url"]}')
    pbar.update(1)
    return media_item  # the input record, augmented with yt_* columns
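
  # A returned record might look like this (values are illustrative,
  # not taken from the dataset):
  #   {'ijb_media_id': 'video/123', 'ijb_media_url': 'https://youtube.com/watch?v=...',
  #    'yt_title': '...', 'yt_duration': 90, 'yt_paid': 0, ...}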
  # read CSVs and merge into one DataFrame
  df_media = None
  for fp in opt_fp_in:
    df = pd.read_csv(fp)
    log.info(f'reading {len(df)} rows')
    if df_media is None:
      df_media = df
    else:
      df_media = pd.concat([df_media, df], ignore_index=True)
  name_maps = {
    'Media ID': 'ijb_media_id',
    'Media URL': 'ijb_media_url',
    'Source URL': 'ijb_source_url',
    'Attribution': 'ijb_attribution',
    'CC License': 'ijb_cc_license',
  }
  df_media.rename(columns=name_maps, inplace=True)
  log.info(f'{len(df_media)} rows total')
  # keep only video media, then deduplicate URLs
  df_media = df_media[df_media.ijb_media_id.str.contains('video/')]
  log.info(f'{len(df_media)} rows after filtering to videos')
  df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True)
  log.info(f'{len(df_media)} rows after deduplication')
  media_items = df_media.to_dict('records')
  # fetch pages in parallel; pool_process updates the shared progress bar
  pool = ThreadPool(opt_threads)
  with tqdm(total=len(media_items)) as pbar:
    results = pool.map(pool_process, media_items)
  pool.close()
  pool.join()
  # create DataFrame and save to CSV
  file_utils.mkdirs(opt_fp_out)
  df = pd.DataFrame.from_dict(results)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)
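
# Example invocation (filename is illustrative):
#   python yt_meta.py -i cs3_media.csv -i cs4_media.csv -o yt_meta.csv -t 8
# Minimal direct-run entry point, assuming this module is not already
# registered as a subcommand of the app's click group:
if __name__ == '__main__':
  cli()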