fix bug to skip existing files

author: adamhrv <adam@ahprojects.com> 2019-02-11 23:25:13 +0100
committer: adamhrv <adam@ahprojects.com> 2019-02-11 23:25:13 +0100
commit: 9115c4b920a6155f8b66ac64c71d008f67058e7e (patch)
tree: 1590c105104f2da4b53f7963550b3cbc243364ae
parent: 6f0a583de3a2fce438ecc424dd52c6a559088e87 (diff)
3 files changed, 146 insertions, 46 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py
index 616893c7..7778978e 100644
--- a/megapixels/commands/datasets/ijb_screenshot.py
+++ b/megapixels/commands/datasets/ijb_screenshot.py
@@ -29,11 +29,16 @@ from app.settings import app_cfg
   help='Output directory')
 @click.option('-t', '--threads', 'opt_threads', default=20,
   help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('--verify', 'opt_verify', is_flag=True,
+  help='Only verify files')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice, opt_verify):
   """IJB-C screenshot sources"""
   
   import sys
+  import os
   from glob import glob
   from os.path import join
   from pathlib import Path
@@ -42,7 +47,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
   from multiprocessing.dummy import Pool as ThreadPool
 
   import pandas as pd
-  import cv2 as cv
+  from PIL import Image
+  import io
   from tqdm import tqdm
 
   from selenium import webdriver
@@ -66,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
       pbar.update(1)
       
       driver = webdriver.Chrome(chrome_options=chrome_options)
-      driver.set_window_size(1920,1080)
+      driver.set_window_size(1920,3600)  # accommodate vertical videos
       
       url = route['url']
       fp_out = route['dst']
@@ -75,18 +81,40 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
 
       if 'youtube.com' in url:
         try:
-          wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+          #wait = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer')))
+          wait = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID,'player-container-outer')))
         except Exception as e:
-          log.debug(f'error: {e}')
+          log.debug(f'WebDriver error: {e}')
           pass
       else:
         wait = WebDriverWait(driver,10)
         time.sleep(1)  # wait for element
 
-      time.sleep(10)  # wait for element
+      time.sleep(5)  # wait for element
+      #el_vid = driver.find_element_by_id('player-container-outer')
+      el_shelf = driver.find_element_by_id('ticket-shelf')
+      el_related = driver.find_element_by_id('related')
+      el_primary = driver.find_element_by_id('primary')
+      err = False
+      try:
+        el_error = driver.find_element_by_id('error-screen')
+        if not(el_error.location['x'] == 0 and el_error.location['width'] == 0):
+          err = True
+      except:
+        pass
+      
+      margin_left = 24
+      margin_bottom = 24 if err else 0
+      box = (el_primary.location['x'] - margin_left, el_primary.location['y'], 
+        el_primary.location['x'] + el_primary.size['width'], el_shelf.location['y'] + margin_bottom)
+      im_bytes = driver.get_screenshot_as_png()
+      im = Image.open(io.BytesIO(im_bytes))
+      im = im.crop(box)
+
       file_utils.mkdirs(fp_out)
       log.debug(f'save to: {fp_out}')
-      driver.get_screenshot_as_file(fp_out)
+      #driver.get_screenshot_as_file(fp_out)
+      im.save(fp_out)
       driver.quit()
 
       return True
@@ -95,6 +123,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
 
   # load
   routes = []
+  video_ids = []
+
   df_licenses = pd.read_csv(opt_fp_in)
   log.info(f'{len(df_licenses)} rows')
   for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)):
@@ -102,18 +132,28 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
     if not 'video/' in filepath:
       continue
     url = str(df_license['Media URL'])
+    try:
+      video_id = url.split('?v=')[1]
+    except Exception as e:
+      log.debug(f'error parsing url: "{url}"')
+    if video_id in video_ids:
+      continue
+    video_ids.append(video_id)
     if not ('http://' in url or 'https://' in url):
       url = 'http://' + url 
-    fp_media = filepath.replace(Path(filepath).suffix, '.png')
-    fp_out = join(opt_fp_out, fp_media)
+    #fp_media = filepath.replace(Path(filepath).suffix, '.png')
+    #fp_out = join(opt_fp_out, fp_media)
+    fp_out = join(opt_fp_out, f'{video_id}.png')
+    if Path(fp_out).exists() and (os.stat(fp_out).st_size // 1000) > 13:
+      continue
     obj = {'url': url, 'dst': fp_out}
     routes.append(obj)
   
-  # setup multithreading
-  for route in routes:
-    log.debug(f'url: {route["url"]}, dst: {route["dst"]}')
+  if opt_slice:
+    routes = routes[opt_slice[0]:opt_slice[1]]
+  log.debug(f'processing: {len(routes)}')  
 
-  return
+  # setup multithreading
   results = []
   pbar = tqdm(total=len(routes))
   pool_process = partial(pool_process, chrome_options=chrome_options)
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py
index 87df390c..374f651c 100644
--- a/megapixels/commands/datasets/ijb_youtube_meta.py
+++ b/megapixels/commands/datasets/ijb_youtube_meta.py
@@ -23,7 +23,7 @@ from app.settings import app_cfg
 fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
 fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
 fps_default_in = [fp_default_in_a, fp_default_in_b]
-fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv'
+fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv'
 
 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
@@ -58,32 +58,31 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
 
   
   metavars = [
-    {'name': ('title','title')},
-    {'name': ('description', 'description')},
-    {'name': ('keywords', 'keywords')},
-    {'itemprop': ('paid', 'paid')},
-    {'itemprop': ('videoId', 'video_id')},
-    {'itemprop': ('duration', 'duration')},
-    {'itemprop': ('width', 'width')},
-    {'itemprop': ('height', 'height')},
-    {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
-    {'itemprop': ('interactionCount', 'views')},
-    {'itemprop': ('datePublished', 'date_published')},
-    {'itemprop': ('genre', 'genre')},
-    {'itemprop': ('unlisted', 'genre')}
+    {'name': ('title','yt_title')},
+    {'name': ('description', 'yt_description')},
+    {'name': ('keywords', 'yt_keywords')},
+    {'itemprop': ('paid', 'yt_paid')},
+    {'itemprop': ('videoId', 'yt_video_id')},
+    {'itemprop': ('duration', 'yt_duration')},
+    {'itemprop': ('width', 'yt_width')},
+    {'itemprop': ('height', 'yt_height')},
+    {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')},
+    {'itemprop': ('interactionCount', 'yt_views')},
+    {'itemprop': ('datePublished', 'yt_date_published')},
+    {'itemprop': ('genre', 'yt_genre')},
+    {'itemprop': ('unlisted', 'yt_unlisted')}
   ]
 
-  from pprint import pprint
+  # from pprint import pprint
   def pool_process(media_item):
     # threaded function
     global parse_yt_page
     results = []
     try:
-      url = media_item['media_url'].strip()
+      url = media_item['ijb_media_url'].strip()
       url = url.replace('http:', 'https:')
       url = url.replace('www.youtube','youtube')
-      log.debug(f'get: {url}')
-      data = urllib.request.urlopen(url, timeout=60).read()
+      data = urllib.request.urlopen(url, timeout=30).read()
       soup = BeautifulSoup(data,'lxml')
       for metavar in metavars:
         propname, propvals = list(metavar.items())[0]
@@ -91,18 +90,30 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
         content = soup.find('meta', attrs={propname:propvals[0]})
         if content:
           media_item[propvals[1]] = content.get('content','')
-      if 'duration' in media_item.keys():
+      
+      # description, or error, is not metavar because it can be truncated
+      desc_result = soup.find('p', attrs={'id': 'eow-description'})
+      description = desc_result.text if desc_result else ''
+      
+      if not 'yt_duration' in media_item.keys():
+        error_result = soup.find('div', attrs={'id': 'player-unavailable'})
+        description = error_result.text if error_result else 'Video unavailable'
+
+      media_item['yt_description'] = description
+      log.debug(f'url: {url}, description: {description}')
+
+      if 'yt_duration' in media_item.keys():
         # fix values
-        duration = media_item['duration']
+        duration = media_item['yt_duration']
         mins = int(duration.split('M')[0].replace('PT',''))
         secs = int(duration.split('M')[1].replace('S',''))
-        media_item['duration'] = mins + (60 * secs)
-      if 'paid' in media_item.keys():
-        media_item['paid'] = int(bool(media_item['paid'] == 'True'))
-      if 'is_family_friendly' in media_item.keys():
-        media_item['is_family_friendly'] = int(bool(media_item['is_family_friendly'] == 'True'))
+        media_item['yt_duration'] = mins + (60 * secs)
+      if 'yt_paid' in media_item.keys():
+        media_item['yt_paid'] = int(bool(media_item['yt_paid'] == 'True'))
+      if 'yt_is_family_friendly' in media_item.keys():
+        media_item['yt_is_family_friendly'] = int(bool(media_item['yt_is_family_friendly'] == 'True'))
     except Exception as e:
-      log.debug(f'Error: {e}, {media_item["media_url"]}')
+      log.debug(f'Error: {e}, {media_item["ijb_media_url"]}')
     pbar.update(1)
     return media_item  # a list of dict key:val dicts
 
@@ -117,17 +128,17 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
       df_media = df_media.append(df, ignore_index=True)
 
   name_maps = {
-    'Media ID': 'media_id',
-    'Media URL': 'media_url',
-    'Source URL': 'source_url',
-    'Attribution': 'attribution',
-    'CC License': 'cc_license',
+    'Media ID': 'ijb_media_id',
+    'Media URL': 'ijb_media_url',
+    'Source URL': 'ijb_source_url',
+    'Attribution': 'ijb_attribution',
+    'CC License': 'ijb_cc_license',
   }
   df_media.rename(columns=name_maps, inplace=True)
   log.info(f'{len(df_media)} rows')
-  df_media = df_media[df_media.media_id.str.contains("video/")]
+  df_media = df_media[df_media.ijb_media_id.str.contains("video/")]
   log.info(f'{len(df_media)} rows')
-  df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True)
+  df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True)
   log.info(f'{len(df_media)} rows')
   media_items = df_media.to_dict('records')
 
diff --git a/megapixels/commands/templates/multithreaded.py b/megapixels/commands/templates/multithreaded.py
new file mode 100644
index 00000000..fec3dac4
--- /dev/null
+++ b/megapixels/commands/templates/multithreaded.py
@@ -0,0 +1,49 @@
+import click
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file')
+@click.option('-t', '--threads', 'opt_threads', default=4,
+  help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+  """Template for multithreading"""
+  
+  from functools import partial
+  from multiprocessing.dummy import Pool as ThreadPool
+
+  from tqdm import tqdm
+  
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+  log.info('multithreaded template')
+
+  # setup multithreading function
+  def pool_process(data_obj):
+    # threaded function
+    global parse_yt_page
+    results = []
+    try:
+      # do something here with data_obj
+    except Exception as e:
+      log.debug(f'Error: {e}')
+    pbar.update(1)
+    return results
+
+  # setup multithreading data holds
+  items = []  # list of dicts to process
+  results = []
+  num_items = len(items)
+
+  # run the multithreading with progress bar
+  pbar = tqdm(total=num_items)
+  pool_process = partial(pool_process)
+  pool = ThreadPool(opt_threads) 
+  with tqdm(total=num_items) as pbar:
+    results = pool.map(pool_process, media_items)
+  
+  pbar.close()
+
author	adamhrv <adam@ahprojects.com>	2019-02-11 23:25:13 +0100
committer	adamhrv <adam@ahprojects.com>	2019-02-11 23:25:13 +0100
commit	9115c4b920a6155f8b66ac64c71d008f67058e7e (patch)
tree	1590c105104f2da4b53f7963550b3cbc243364ae
parent	6f0a583de3a2fce438ecc424dd52c6a559088e87 (diff)