diff options
| author | adamhrv <adam@ahprojects.com> | 2018-12-05 12:00:15 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2018-12-05 12:00:15 +0100 |
| commit | 90abf459d1df1f21960c1d653a1f936d1ec30256 (patch) | |
| tree | facab8e9bac6c56e69c369c2140cdbea218a01df /megapixels/commands/datasets/ytmu.py | |
| parent | 0529d4cd1618016319e995c37aa118bf8c2d501b (diff) | |
.
Diffstat (limited to 'megapixels/commands/datasets/ytmu.py')
| -rw-r--r-- | megapixels/commands/datasets/ytmu.py | 205 |
1 files changed, 205 insertions, 0 deletions
# Source patch: diff --git a/megapixels/commands/datasets/ytmu.py b/megapixels/commands/datasets/ytmu.py
# new file mode 100644, index 00000000..66680ed0 (--- /dev/null, +++ b/megapixels/commands/datasets/ytmu.py)
from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import dlib
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


log = logger_utils.Logger.getLogger()

# Container formats searched for when listing a directory of videos.
_VIDEO_EXTS = ('mp4', 'webm', 'mkv')


def _glob_videos(dir_videos):
    """Return the paths of all video files directly inside *dir_videos*.

    Files are grouped by extension in the order given by ``_VIDEO_EXTS``.
    """
    fp_videos = []
    for ext in _VIDEO_EXTS:
        fp_videos += glob(join(dir_videos, '*.{}'.format(ext)))
    return fp_videos


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input file (URL list for "info", CSV for other actions)')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output directory')
@click.option('--videos', 'opt_dir_videos',
              help='Videos directory')
@click.option('--action', 'opt_action',
              type=click.Choice(['info', 'faces', 'rename', 'download', 'metadata', 'split_frames']),
              default='info',
              help='Command action')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_videos, opt_action):
    """YTMU utils"""

    # -------------------------------------------------
    # process

    if opt_action == 'metadata':
        # downloads video metadata with ytdl
        handle_metadata(opt_fp_in, opt_fp_out)
    elif opt_action == 'download':
        # downloads video files with ytdl
        handle_download(opt_fp_in, opt_fp_out)
    elif opt_action == 'info':
        # converts original data file to clean CSV
        handle_info(opt_fp_in, opt_fp_out)
    elif opt_action == 'rename':
        # rename the videos to video ID
        handle_rename(opt_fp_in, opt_fp_out, opt_dir_videos)
    elif opt_action == 'split_frames':
        # save video frames that contain faces
        handle_split_frames(opt_fp_in, opt_fp_out, opt_dir_videos)
    else:
        # 'faces' is an accepted choice but has no handler in this module;
        # report it instead of silently doing nothing.
        log.error('action not implemented: {}'.format(opt_action))


# ----------------------------------------------------
# handlers

def handle_split_frames(fp_in, dir_out, dir_videos):
    """Save frames containing at least one detected face as JPEG files.

    Every video found in *dir_videos* is read frame by frame. Once at least
    ``face_interval`` frames have passed since the last saved frame, the face
    detector is run and the frame is written to *dir_out* as
    ``<video stem>_<zero-padded frame index>.jpg`` if any face is found.

    :param fp_in: unused; kept so all handlers share a similar signature
    :param dir_out: directory the JPEG frames are written to (created if needed)
    :param dir_videos: directory containing the source videos
    """
    if not dir_out or not dir_videos:
        log.error('-o/--output and --videos required')
        return
    import cv2 as cv
    from tqdm import tqdm
    from app.processors import face_detector
    detector = face_detector.DetectorDLIBCNN()

    fp_videos = _glob_videos(dir_videos)
    face_interval = 30
    # frames seen since the last saved frame; keeps running across videos
    frame_interval_count = 0

    file_utils.mkdirs(dir_out)

    for fp_video in tqdm(fp_videos):
        video = cv.VideoCapture(fp_video)
        # frame index within this video, used for naming
        frame_count = 0
        try:
            while video.isOpened():
                res, frame = video.read()
                if not res:
                    break

                frame_count += 1
                frame_interval_count += 1
                if frame_interval_count < face_interval:
                    # skip the (expensive) detector until the interval elapsed
                    continue
                bboxes = detector.detect(frame, opt_size=(320, 240), opt_pyramids=0)
                if len(bboxes) > 0:
                    fp_frame = join(dir_out, '{}_{}.jpg'.format(
                        Path(fp_video).stem, file_utils.zpad(frame_count)))
                    cv.imwrite(fp_frame, frame)
                    frame_interval_count = 0
        finally:
            # always free the decoder handle, even if detection/writing fails
            video.release()


def handle_metadata(fp_in, fp_out):
    """Fetch YouTube metadata for every video in a CSV and write it to a CSV.

    :param fp_in: CSV file with at least the columns ``url`` and ``id``
    :param fp_out: path of the CSV file to write

    Videos whose metadata cannot be retrieved are logged and left out.
    Metadata fields missing for a video are left empty for that row.
    """
    if not fp_out:
        log.error('-o/--output required')
        return

    keys = ['description', 'average_rating', 'dislike_count', 'categories',
            'thumbnail', 'title', 'upload_date', 'uploader_url', 'uploader_id',
            'fps', 'height', 'width', 'like_count', 'license', 'tags']

    import youtube_dl

    ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})

    df = pd.read_csv(fp_in)
    data_exp = []

    for _, row in df.iterrows():
        video_data = {'url': row['url'], 'id': row['id']}
        try:
            with ydl:
                url = 'http://www.youtube.com/watch?v={}'.format(row['id'])
                result = ydl.extract_info(url, download=False)
            # a playlist-like result wraps the video in 'entries'
            video = result['entries'][0] if 'entries' in result else result
        except Exception as e:
            log.warning('video unavilable: {}'.format(row['url']))
            log.error(e)
            continue

        for k in keys:
            val = video.get(k)
            if val is None:
                # field not provided for this video
                continue
            if k == 'title':
                log.debug(val)
            if isinstance(val, list):
                val = '; '.join(str(v) for v in val)
            if isinstance(val, str):
                # commas are replaced to keep the text comma-free
                val = val.replace(',', ';')
            # numeric fields (fps, height, ...) are stored unchanged
            video_data[k] = val
        data_exp.append(video_data)

    df_exp = pd.DataFrame(data_exp)
    df_exp.to_csv(fp_out)


def handle_download(fp_in, dir_out):
    """Download every video listed in a CSV into *dir_out*.

    :param fp_in: CSV file with at least the column ``id`` (YouTube video ID)
    :param dir_out: directory the videos are saved to as ``<id>.<ext>``

    A video is skipped when a video file whose name contains its ID already
    exists in *dir_out*. Download failures are logged and do not stop the run.
    """
    if not dir_out:
        log.error('-o/--output required')
        return
    import youtube_dl
    df = pd.read_csv(fp_in)
    # file names only, so a directory name can never match a video ID
    existing = [Path(fp).name for fp in _glob_videos(dir_out)]

    ydl = youtube_dl.YoutubeDL({'outtmpl': join(dir_out, '%(id)s.%(ext)s')})

    for _, row in df.iterrows():
        vid = str(row['id'])
        if any(vid in name for name in existing):
            log.debug('skip: {}'.format(vid))
            continue
        try:
            with ydl:
                ydl.download(['http://www.youtube.com/watch?v={}'.format(vid)])
        except Exception as e:
            log.error('could not dl: {}'.format(vid))
            log.error(e)


def handle_info(fp_in, fp_out):
    """Convert a list of YouTube URLs into a CSV with ``url`` and ``id`` columns.

    :param fp_in: text file of URLs, read with ``file_utils.load_text``
        (assumed to yield one URL string per item -- TODO confirm)
    :param fp_out: path of the CSV file to write

    The ID is the text after the first ``v=`` up to the next ``&``. URLs
    without a video ID are logged and skipped.
    """
    if not fp_out:
        log.error('--output required')
        return
    urls = file_utils.load_text(fp_in)
    videos = []
    for url in urls:
        _, sep, tail = url.partition('v=')
        vid = tail.split('&')[0]
        if not sep or not vid:
            log.warning('no video id for {}'.format(url))
            continue
        videos.append({'url': url, 'id': vid})
    # convert to df
    df = pd.DataFrame(videos)
    df.to_csv(fp_out)


def handle_rename(fp_in, fp_out, dir_videos):
    """Rename downloaded videos to ``<video id><original suffix>``.

    :param fp_in: CSV file with at least the column ``id``
    :param fp_out: unused; kept so all handlers share a similar signature
    :param dir_videos: directory containing the videos to rename

    For each ID, the first video whose file name contains the ID is moved
    to ``<dir_videos>/<id><suffix>``.
    """
    import shutil

    if not dir_videos:
        log.error('--videos required')
        return

    fp_videos = _glob_videos(dir_videos)

    df = pd.read_csv(fp_in)

    for _, row in df.iterrows():
        vid = str(row['id'])
        for fp_video in fp_videos:
            if vid in Path(fp_video).name:
                dst = join(dir_videos, '{}{}'.format(vid, Path(fp_video).suffix))
                shutil.move(fp_video, dst)
                log.debug('move {} to {}'.format(fp_video, dst))
                # safe to mutate the list here: iteration stops immediately
                fp_videos.remove(fp_video)
                break
\ No newline at end of file |
