.

author: adamhrv <adam@ahprojects.com> 2018-12-05 12:00:15 +0100
committer: adamhrv <adam@ahprojects.com> 2018-12-05 12:00:15 +0100
commit: 90abf459d1df1f21960c1d653a1f936d1ec30256 (patch)
tree: facab8e9bac6c56e69c369c2140cdbea218a01df /megapixels/commands/datasets/ytmu.py
parent: 0529d4cd1618016319e995c37aa118bf8c2d501b (diff)
1 files changed, 205 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/ytmu.py b/megapixels/commands/datasets/ytmu.py
new file mode 100644
index 00000000..66680ed0
--- /dev/null
+++ b/megapixels/commands/datasets/ytmu.py
@@ -0,0 +1,205 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import dlib
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out',
+  help='Output directory')
+@click.option('--videos', 'opt_dir_videos',
+  help='Output directory')
+@click.option('--action', 'opt_action', 
+  type=click.Choice(['info', 'faces', 'rename', 'download', 'metadata', 'split_frames']),
+  default='info',
+  help='Command action')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_videos, opt_action):
+  """YTMU utils"""
+
+  
+  from tqdm import tqdm
+
+  # -------------------------------------------------
+  # process  
+  
+  if opt_action == 'metadata':
+    # downloads video metadata with ytdl
+    handle_metadata(opt_fp_in, opt_fp_out)
+  elif opt_action == 'download':
+    # downloads video files with ytdl
+    handle_download(opt_fp_in, opt_fp_out)
+  elif opt_action == 'info':
+    # converts original data file to clean CSV
+    handle_info()
+  elif opt_action == 'rename':
+    # rename the videos to video ID
+    handle_rename(opt_fp_in, opt_fp_out, opt_dir_videos)
+  elif opt_action == 'split_frames':
+    # rename the videos to video ID
+    handle_split_frames(opt_fp_in, opt_fp_out, opt_dir_videos)
+  
+
+
+
+# ----------------------------------------------------
+# handlers
+
+def handle_split_frames(fp_in, dir_out, dir_videos):
+  if not dir_out or not dir_videos:
+    log.error('-o/--output and --videos required')
+    return
+  import cv2 as cv
+  from tqdm import tqdm
+  from app.processors import face_detector
+  detector = face_detector.DetectorDLIBCNN()
+
+  # get file list
+  fp_videos = glob(join(dir_videos, '*.mp4'))
+  fp_videos += glob(join(dir_videos, '*.webm'))
+  fp_videos += glob(join(dir_videos, '*.mkv'))
+  face_interval = 30
+  frame_interval_count = 0
+  frame_count = 0
+
+  file_utils.mkdirs(dir_out)
+
+  for fp_video in tqdm(fp_videos):
+    # log.debug('opening: {}'.format(fp_video))
+    video = cv.VideoCapture(fp_video)
+    while video.isOpened():
+      res, frame = video.read()
+      if not res:
+        break
+
+      frame_count += 1  # for naming
+      frame_interval_count += 1  # for interval
+      bboxes = detector.detect(frame, opt_size=(320, 240), opt_pyramids=0)
+      if len(bboxes) > 0 and frame_interval_count >= face_interval:
+        # save frame
+        fp_frame = join(dir_out, '{}_{}.jpg'.format(Path(fp_video).stem, file_utils.zpad(frame_count)))
+        cv.imwrite(fp_frame, frame)
+        frame_interval_count = 0
+
+
+def handle_metadata(fp_in, fp_out):
+  
+  keys = ['description', 'average_rating', 'dislike_count', 'categories', 
+  'thumbnail', 'title', 'upload_date', 'uploader_url', 'uploader_id',
+  'fps', 'height', 'width', 'like_count', 'license', 'tags']
+
+  import youtube_dl
+
+  ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s%(ext)s'})
+
+  df = pd.read_csv(fp_in)
+  data_exp = []
+
+  for i, row in df.iterrows():
+    video_data = {'url': row['url'], 'id': row['id']}
+    try:
+      with ydl:
+        url = 'http://www.youtube.com/watch?v={}'.format(row['id'])
+        result = ydl.extract_info(url, download=False)
+      video = result['entries'][0] if 'entries' in result else result
+      for k in keys:
+        val = video[k]
+        if k == 'title':
+          log.debug(val)
+        if type(val) == list:
+          val = '; '.join(val)
+        if type(val) == str:
+          video_data[k] = str(val).replace(',',';')
+      # log.debug('video_data: {}'.format(video_data))
+    except Exception as e:
+      log.warn('video unavilable: {}'.format(row['url']))
+      log.error(e)
+      continue
+    data_exp.append(video_data)
+
+  df_exp = pd.DataFrame.from_dict(data_exp)
+  df_exp.to_csv(fp_out)
+
+
+def handle_download(fp_in, dir_out):
+  import youtube_dl
+  df = pd.read_csv(fp_in)
+  fp_videos = glob(join(dir_out, '*.mp4'))
+  fp_videos += glob(join(dir_out, '*.webm'))
+  fp_videos += glob(join(dir_out, '*.mkv'))
+  
+  ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s%(ext)s'})
+
+  for i, row in df.iterrows():
+    vid = row['id']
+    found = False
+    for fp_video in fp_videos:
+      if vid in fp_video:
+        log.debug('skip: {}'.format(vid))
+        found = True
+    if not found:
+      try:
+        with ydl:
+          ydl.download(['http://www.youtube.com/watch?v={}'.format(vid)])
+      except:
+        log.error('could not dl: {}'.format(vid))
+
+
+def handle_info(fp_in, fp_out):
+  if not fp_out:
+    log.error('--output required')
+    return
+  urls = file_utils.load_text(fp_in)
+  videos  = []
+  for url in urls:
+    splits = url.split('v=')
+    try:
+      vid = splits[1]
+      vid = vid.split('&')[0]
+      videos.append({'url': url, 'id': vid})
+    except:
+      log.warn('no video id for {}'.format(url))
+  # convert to df
+  df = pd.DataFrame.from_dict(videos)
+  df.to_csv(opt_fp_out)
+
+  
+def handle_rename(fp_in, fp_out, dir_videos):
+  import shutil
+  
+  if not dir_videos:
+    log.error('--videos required')
+    return
+
+  fp_videos = glob(join(dir_videos, '*.mp4'))
+  fp_videos += glob(join(dir_videos, '*.webm'))
+  fp_videos += glob(join(dir_videos, '*.mkv'))
+
+  df = pd.read_csv(fp_in)
+
+  for i, row in df.iterrows():
+    vid = row['id']
+    fp_videos_copy = fp_videos.copy()
+    for fp_video in fp_videos:
+      if vid in fp_video:
+        dst = join(dir_videos, '{}{}'.format(vid, Path(fp_video).suffix))
+        shutil.move(fp_video, dst)
+        log.debug('move {} to {}'.format(fp_video, dst))
+        fp_videos.remove(fp_video)
+        break
+\ No newline at end of file
author	adamhrv <adam@ahprojects.com>	2018-12-05 12:00:15 +0100
committer	adamhrv <adam@ahprojects.com>	2018-12-05 12:00:15 +0100
commit	90abf459d1df1f21960c1d653a1f936d1ec30256 (patch)
tree	facab8e9bac6c56e69c369c2140cdbea218a01df /megapixels/commands/datasets/ytmu.py
parent	0529d4cd1618016319e995c37aa118bf8c2d501b (diff)