summary | refs | log | tree | commit | diff
path: root/megapixels/commands/datasets/ytmu.py
diff options
context:
space:
mode:
author: adamhrv <adam@ahprojects.com> 2018-12-05 12:00:15 +0100
committer: adamhrv <adam@ahprojects.com> 2018-12-05 12:00:15 +0100
commit: 90abf459d1df1f21960c1d653a1f936d1ec30256 (patch)
tree: facab8e9bac6c56e69c369c2140cdbea218a01df /megapixels/commands/datasets/ytmu.py
parent: 0529d4cd1618016319e995c37aa118bf8c2d501b (diff)
.
Diffstat (limited to 'megapixels/commands/datasets/ytmu.py')
-rw-r--r-- megapixels/commands/datasets/ytmu.py 205
1 files changed, 205 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/ytmu.py b/megapixels/commands/datasets/ytmu.py
new file mode 100644
index 00000000..66680ed0
--- /dev/null
+++ b/megapixels/commands/datasets/ytmu.py
@@ -0,0 +1,205 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import dlib
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
# Shared module-level logger used by the handler functions in this file.
+log = logger_utils.Logger.getLogger()
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output directory')
@click.option('--videos', 'opt_dir_videos',
              help='Directory containing the video files')
@click.option('--action', 'opt_action',
              type=click.Choice(['info', 'faces', 'rename', 'download', 'metadata', 'split_frames']),
              default='info',
              help='Command action')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_videos, opt_action):
    """YTMU utils"""
    # NOTE: the docstring above is what click prints for --help, so it is kept
    # short; details live in these comments.
    #
    # Dispatches to one handler per --action value:
    #   metadata     -> handle_metadata(opt_fp_in, opt_fp_out)
    #   download     -> handle_download(opt_fp_in, opt_fp_out)
    #   info         -> handle_info(opt_fp_in, opt_fp_out)
    #   rename       -> handle_rename(opt_fp_in, opt_fp_out, opt_dir_videos)
    #   split_frames -> handle_split_frames(opt_fp_in, opt_fp_out, opt_dir_videos)
    # `ctx` is injected by @click.pass_context and is not used here.

    if opt_action == 'metadata':
        # downloads video metadata with ytdl
        handle_metadata(opt_fp_in, opt_fp_out)
    elif opt_action == 'download':
        # downloads video files with ytdl
        handle_download(opt_fp_in, opt_fp_out)
    elif opt_action == 'info':
        # converts original data file to clean CSV
        # (handle_info takes both paths; calling it bare raised TypeError)
        handle_info(opt_fp_in, opt_fp_out)
    elif opt_action == 'rename':
        # rename the videos to video ID
        handle_rename(opt_fp_in, opt_fp_out, opt_dir_videos)
    elif opt_action == 'split_frames':
        # save the frames of each video that contain a face
        handle_split_frames(opt_fp_in, opt_fp_out, opt_dir_videos)
    else:
        # 'faces' is an accepted choice but has no handler in this module;
        # report it instead of exiting silently.
        log.error('action not implemented: {}'.format(opt_action))
+
+
+
+
+# ----------------------------------------------------
+# handlers
+
def handle_split_frames(fp_in, dir_out, dir_videos):
    """Write frames that contain a detected face to ``dir_out`` as JPEGs.

    Every ``*.mp4``, ``*.webm`` and ``*.mkv`` file directly inside
    ``dir_videos`` is read frame by frame. Once at least ``face_interval``
    frames have been read since the last saved frame, the next frame in which
    the detector reports one or more boxes is written to
    ``<dir_out>/<video stem>_<frame number>.jpg``.

    Args:
        fp_in: Unused; accepted so all handlers share the CLI call shape.
        dir_out: Directory that receives the JPEG frames (created if needed).
        dir_videos: Directory that is searched for video files.

    Returns:
        None. Logs an error and returns early when ``dir_out`` or
        ``dir_videos`` is missing.
    """
    if not dir_out or not dir_videos:
        log.error('-o/--output and --videos required')
        return
    # Heavy dependencies are imported lazily so the other actions do not pay
    # for them.
    import cv2 as cv
    from tqdm import tqdm
    from app.processors import face_detector
    detector = face_detector.DetectorDLIBCNN()

    # get file list
    fp_videos = []
    for ext in ('mp4', 'webm', 'mkv'):
        fp_videos += glob(join(dir_videos, '*.{}'.format(ext)))

    face_interval = 30  # minimum number of frames between two saved frames
    # NOTE(review): both counters run across ALL videos (they are not reset
    # per video), so the number in a file name is a global frame counter.
    # Kept as in the original to avoid changing output names - confirm intent.
    frame_interval_count = 0
    frame_count = 0

    file_utils.mkdirs(dir_out)

    for fp_video in tqdm(fp_videos):
        video = cv.VideoCapture(fp_video)
        try:
            while video.isOpened():
                res, frame = video.read()
                if not res:
                    break

                frame_count += 1  # for naming
                frame_interval_count += 1  # for interval

                # Skip the (expensive) detection while the interval has not
                # elapsed: its result would be discarded anyway.
                # Assumes detector.detect() keeps no state between calls.
                if frame_interval_count < face_interval:
                    continue

                bboxes = detector.detect(frame, opt_size=(320, 240), opt_pyramids=0)
                if len(bboxes) > 0:
                    # save frame
                    fn_frame = '{}_{}.jpg'.format(
                        Path(fp_video).stem, file_utils.zpad(frame_count))
                    cv.imwrite(join(dir_out, fn_frame), frame)
                    frame_interval_count = 0
        finally:
            # Always free the decoder/file handle, even if detection or
            # writing raises.
            video.release()
+
+
def handle_metadata(fp_in, fp_out):
    """Fetch YouTube metadata for every video in a CSV and save it as CSV.

    Reads ``fp_in`` with pandas (the ``url`` and ``id`` columns are used),
    asks youtube_dl for each video's info without downloading it, and writes
    one row per successfully queried video to ``fp_out``.

    Args:
        fp_in: Path to the input CSV with ``url`` and ``id`` columns.
        fp_out: Path of the CSV file to write.

    Returns:
        None. Logs an error and returns early when ``fp_out`` is missing.
        Videos whose lookup fails are logged and left out of the output.
    """
    if not fp_out:
        # DataFrame.to_csv(None) would return a string and write nothing.
        log.error('--output required')
        return

    keys = ['description', 'average_rating', 'dislike_count', 'categories',
            'thumbnail', 'title', 'upload_date', 'uploader_url', 'uploader_id',
            'fps', 'height', 'width', 'like_count', 'license', 'tags']

    import youtube_dl

    # The template is irrelevant while download=False, but keep it well formed
    # (youtube_dl's %(ext)s does not include the dot).
    ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})

    df = pd.read_csv(fp_in)
    data_exp = []

    for _, row in df.iterrows():
        video_data = {'url': row['url'], 'id': row['id']}
        try:
            with ydl:
                url = 'http://www.youtube.com/watch?v={}'.format(row['id'])
                result = ydl.extract_info(url, download=False)
            video = result['entries'][0] if 'entries' in result else result
            for k in keys:
                # .get(): one absent field must not discard the whole video.
                val = video.get(k)
                if k == 'title':
                    log.debug(val)
                if val is None:
                    continue
                if isinstance(val, list):
                    val = '; '.join(str(v) for v in val)
                if isinstance(val, str):
                    # keep free text free of commas, as the original did
                    val = val.replace(',', ';')
                # Non-string values (counts, fps, sizes, ratings) are kept
                # as-is; previously they were silently dropped.
                video_data[k] = val
        except Exception as e:
            # Deliberate best effort: one failing video must not abort the run.
            log.warn('video unavilable: {}'.format(row['url']))
            log.error(e)
            continue
        data_exp.append(video_data)

    df_exp = pd.DataFrame(data_exp)
    df_exp.to_csv(fp_out)
+
+
def handle_download(fp_in, dir_out):
    """Download every video listed in a CSV that is not in ``dir_out`` yet.

    Reads the ``id`` column of ``fp_in`` with pandas. An ID is skipped when it
    occurs in the file name of an existing ``*.mp4``, ``*.webm`` or ``*.mkv``
    file in ``dir_out``; otherwise the video is downloaded with youtube_dl to
    ``<dir_out>/<id>.<ext>``.

    Args:
        fp_in: Path to the input CSV with an ``id`` column.
        dir_out: Directory that holds / receives the video files.

    Returns:
        None. Logs an error and returns early when ``dir_out`` is missing.
        Failed downloads are logged and do not stop the run.
    """
    if not dir_out:
        log.error('--output required')
        return

    import youtube_dl
    df = pd.read_csv(fp_in)

    fp_videos = []
    for ext in ('mp4', 'webm', 'mkv'):
        fp_videos += glob(join(dir_out, '*.{}'.format(ext)))
    # Match IDs against file names only, so an ID that happens to occur in a
    # parent directory name is not mistaken for a downloaded video.
    fn_videos = [Path(fp).name for fp in fp_videos]

    # Save into dir_out (where the skip check looks) and put the dot before
    # the extension: youtube_dl's %(ext)s is e.g. "mp4", not ".mp4".
    # A literal '%' in the directory has to be doubled inside a template.
    outtmpl = join(dir_out.replace('%', '%%'), '%(id)s.%(ext)s')
    ydl = youtube_dl.YoutubeDL({'outtmpl': outtmpl})

    for _, row in df.iterrows():
        vid = str(row['id'])
        if any(vid in fn for fn in fn_videos):
            log.debug('skip: {}'.format(vid))
            continue
        try:
            with ydl:
                ydl.download(['http://www.youtube.com/watch?v={}'.format(vid)])
        except Exception as e:
            # Best effort per video; `Exception` (not a bare except) so that
            # Ctrl-C still interrupts a long batch.
            log.error('could not dl: {}'.format(vid))
            log.error(e)
+
+
def handle_info(fp_in, fp_out):
    """Convert a list of YouTube URLs into a CSV of ``url`` / ``id`` rows.

    The URLs are loaded with ``file_utils.load_text(fp_in)``. The video ID is
    the text after the first ``v=`` up to the next ``&``. URLs without an ID
    are logged and skipped.

    Args:
        fp_in: Path to the text file with the URLs.
        fp_out: Path of the CSV file to write.

    Returns:
        None. Logs an error and returns early when ``fp_out`` is missing.
    """
    if not fp_out:
        log.error('--output required')
        return
    urls = file_utils.load_text(fp_in)
    videos = []
    for url in urls:
        splits = url.split('v=')
        # Strip in case the loaded line still carries a trailing newline
        # (load_text's exact behavior is not visible here).
        vid = splits[1].split('&')[0].strip() if len(splits) > 1 else ''
        if not vid:
            log.warn('no video id for {}'.format(url))
            continue
        videos.append({'url': url, 'id': vid})
    # convert to df
    df = pd.DataFrame(videos)
    # was `opt_fp_out`, a name that does not exist in this scope (NameError)
    df.to_csv(fp_out)
+
+
def handle_rename(fp_in, fp_out, dir_videos):
    """Rename downloaded videos to ``<video id><original suffix>``.

    Reads the ``id`` column of ``fp_in`` with pandas. For each ID, the first
    ``*.mp4``, ``*.webm`` or ``*.mkv`` file in ``dir_videos`` whose file name
    contains the ID is moved to ``<dir_videos>/<id><suffix>``. Each file is
    used for at most one ID.

    Args:
        fp_in: Path to the input CSV with an ``id`` column.
        fp_out: Unused; accepted so all handlers share the CLI call shape.
        dir_videos: Directory containing the video files.

    Returns:
        None. Logs an error and returns early when ``dir_videos`` is missing.
    """
    import shutil

    if not dir_videos:
        log.error('--videos required')
        return

    fp_videos = []
    for ext in ('mp4', 'webm', 'mkv'):
        fp_videos += glob(join(dir_videos, '*.{}'.format(ext)))

    df = pd.read_csv(fp_in)

    for _, row in df.iterrows():
        # str(): pandas parses an all-digit ID column as integers, which
        # would make the `in` test below raise TypeError.
        vid = str(row['id'])
        for fp_video in fp_videos:
            # Compare against the file name only so that an ID occurring in
            # the directory path cannot match every file.
            if vid not in Path(fp_video).name:
                continue
            dst = join(dir_videos, '{}{}'.format(vid, Path(fp_video).suffix))
            if fp_video != dst:
                # nothing to do when the file already has its final name
                shutil.move(fp_video, dst)
                log.debug('move {} to {}'.format(fp_video, dst))
            # Safe although we are iterating fp_videos: we leave the loop
            # immediately after the removal.
            fp_videos.remove(fp_video)
            break