"""Create screenshots for YouTube.com URLs in the IJB dataset
TODO
- grey out boxes in sidebar
- resize driver screenshot area to include author text
Installing webdrivers:
Chrome
wget https://chromedriver.storage.googleapis.com/73.0.3683.20/chromedriver_linux64.zip
Firefox
wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
PhantomJS
npm install -g phantomjs
"""
import click
from app.settings import app_cfg
fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
fps_default_in = [fp_default_in_a, fp_default_in_b]
fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv'
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
  help='Input license data CSV (repeatable)')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_default_out,
  help='Output CSV filepath')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
  """IJB-C screenshot sources"""

  import sys
  from glob import glob
  from os.path import join
  from pathlib import Path
  import time
  from functools import partial
  from multiprocessing.dummy import Pool as ThreadPool
  import urllib.request

  import lxml  # imported explicitly to fail early if the parser used below is missing
  from bs4 import BeautifulSoup
  import pandas as pd
  import cv2 as cv
  from tqdm import tqdm

  from app.utils import file_utils, im_utils, logger_utils

  log = logger_utils.Logger.getLogger()
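
  # Each entry maps a <meta> attribute name to (attribute value to match,
  # output column name). For example, {'itemprop': ('duration', 'yt_duration')}
  # matches a tag like <meta itemprop="duration" content="PT1M30S"> on the
  # (legacy) YouTube watch page and stores 'PT1M30S' under 'yt_duration'.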
  metavars = [
    {'name': ('title', 'yt_title')},
    {'name': ('description', 'yt_description')},
    {'name': ('keywords', 'yt_keywords')},
    {'itemprop': ('paid', 'yt_paid')},
    {'itemprop': ('videoId', 'yt_video_id')},
    {'itemprop': ('duration', 'yt_duration')},
    {'itemprop': ('width', 'yt_width')},
    {'itemprop': ('height', 'yt_height')},
    {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')},
    {'itemprop': ('interactionCount', 'yt_views')},
    {'itemprop': ('datePublished', 'yt_date_published')},
    {'itemprop': ('genre', 'yt_genre')},
    {'itemprop': ('unlisted', 'yt_unlisted')},
  ]
  def pool_process(media_item):
    """Threaded worker: fetch one YouTube watch page and attach parsed metadata"""
    try:
      url = media_item['ijb_media_url'].strip()
      # normalize to https without the www prefix
      url = url.replace('http:', 'https:')
      url = url.replace('www.youtube', 'youtube')
      data = urllib.request.urlopen(url, timeout=30).read()
      soup = BeautifulSoup(data, 'lxml')
      for metavar in metavars:
        propname, propvals = list(metavar.items())[0]
        content = soup.find('meta', attrs={propname: propvals[0]})
        if content:
          media_item[propvals[1]] = content.get('content', '')
      # the description is scraped from the page body rather than a metavar
      # because the <meta> description can be truncated
      desc_result = soup.find('p', attrs={'id': 'eow-description'})
      description = desc_result.text if desc_result else ''
      if 'yt_duration' not in media_item:
        # a missing duration meta tag usually means the video has been removed
        error_result = soup.find('div', attrs={'id': 'player-unavailable'})
        description = error_result.text if error_result else 'Video unavailable'
      media_item['yt_description'] = description
      log.debug(f'url: {url}, description: {description}')
      if 'yt_duration' in media_item:
        # convert ISO 8601 duration to seconds (assumes the 'PT#M#S' form)
        duration = media_item['yt_duration']
        mins = int(duration.split('M')[0].replace('PT', ''))
        secs = int(duration.split('M')[1].replace('S', ''))
        media_item['yt_duration'] = (60 * mins) + secs
      if 'yt_paid' in media_item:
        media_item['yt_paid'] = int(media_item['yt_paid'] == 'True')
      if 'yt_is_family_friendly' in media_item:
        media_item['yt_is_family_friendly'] = int(media_item['yt_is_family_friendly'] == 'True')
    except Exception as e:
      log.debug(f'Error: {e}, {media_item["ijb_media_url"]}')
    pbar.update(1)
    return media_item  # the input record, augmented with yt_* columns
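
  # A returned record might look like this (values are illustrative,
  # not taken from the dataset):
  #   {'ijb_media_id': 'video/123', 'ijb_media_url': 'https://youtube.com/watch?v=...',
  #    'yt_title': '...', 'yt_duration': 90, 'yt_paid': 0, ...}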
  # read CSVs and merge into one DataFrame
  df_media = None
  for fp in opt_fp_in:
    df = pd.read_csv(fp)
    log.info(f'reading {len(df)} rows')
    if df_media is None:
      df_media = df
    else:
      df_media = pd.concat([df_media, df], ignore_index=True)
  name_maps = {
    'Media ID': 'ijb_media_id',
    'Media URL': 'ijb_media_url',
    'Source URL': 'ijb_source_url',
    'Attribution': 'ijb_attribution',
    'CC License': 'ijb_cc_license',
  }
  df_media.rename(columns=name_maps, inplace=True)
  log.info(f'{len(df_media)} rows total')
  # keep only video media, then deduplicate URLs
  df_media = df_media[df_media.ijb_media_id.str.contains('video/')]
  log.info(f'{len(df_media)} rows after filtering to videos')
  df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True)
  log.info(f'{len(df_media)} rows after deduplication')
  media_items = df_media.to_dict('records')
  # fetch pages in parallel; pool_process updates the shared progress bar
  pool = ThreadPool(opt_threads)
  with tqdm(total=len(media_items)) as pbar:
    results = pool.map(pool_process, media_items)
  pool.close()
  pool.join()
  # create DataFrame and save to CSV
  file_utils.mkdirs(opt_fp_out)
  df = pd.DataFrame.from_dict(results)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)
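
# Example invocation (filename is illustrative):
#   python yt_meta.py -i cs3_media.csv -i cs4_media.csv -o yt_meta.csv -t 8
# Minimal direct-run entry point, assuming this module is not already
# registered as a subcommand of the app's click group:
if __name__ == '__main__':
  cli()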