from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import dlib
import pandas as pd
from PIL import Image, ImageOps, ImageFilter

from app.utils import file_utils, im_utils

log = logger_utils.Logger.getLogger()

# Video container extensions searched for when listing a video directory.
VIDEO_EXTS = ('mp4', 'webm', 'mkv')


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input file (URL list or CSV, depending on action)')
@click.option('-o', '--output', 'opt_fp_out',
              help='Output file or directory (depending on action)')
@click.option('--videos', 'opt_dir_videos',
              help='Directory containing video files')
@click.option('--action', 'opt_action',
              type=click.Choice(['info', 'faces', 'rename', 'download',
                                 'metadata', 'split_frames']),
              default='info',
              help='Command action')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dir_videos, opt_action):
    """YTMU utils"""
    # NOTE: the docstring above is rendered by click as the command help text,
    # so it is intentionally kept short.

    # -------------------------------------------------
    # process

    if opt_action == 'metadata':
        # downloads video metadata with ytdl
        handle_metadata(opt_fp_in, opt_fp_out)
    elif opt_action == 'download':
        # downloads video files with ytdl
        handle_download(opt_fp_in, opt_fp_out)
    elif opt_action == 'info':
        # converts original data file to clean CSV
        handle_info(opt_fp_in, opt_fp_out)
    elif opt_action == 'rename':
        # rename the videos to video ID
        handle_rename(opt_fp_in, opt_fp_out, opt_dir_videos)
    elif opt_action == 'split_frames':
        # save frames that contain faces as JPEGs
        handle_split_frames(opt_fp_in, opt_fp_out, opt_dir_videos)
    else:
        # 'faces' is an accepted choice but has no handler in this module
        log.error('action not implemented: {}'.format(opt_action))


# ----------------------------------------------------
# helpers

def _list_videos(dir_videos):
    """Return paths of all files in `dir_videos` ending in one of VIDEO_EXTS."""
    fp_videos = []
    for ext in VIDEO_EXTS:
        fp_videos += glob(join(dir_videos, '*.{}'.format(ext)))
    return fp_videos


def _parse_video_id(url):
    """Return the value of the `v=` query parameter in `url`, or '' if absent."""
    _, sep, tail = url.partition('v=')
    if not sep:
        return ''
    return tail.split('&')[0].strip()


# ----------------------------------------------------
# handlers

def handle_split_frames(fp_in, dir_out, dir_videos):
    """Save frames containing at least one detected face as JPEG files.

    Every video found in `dir_videos` is read frame by frame. A frame is
    written to `dir_out` as `<video stem>_<zero-padded frame count>.jpg` when
    at least `face_interval` frames have been read since the last saved frame
    and the face detector returns at least one box for it.

    :param fp_in: unused; kept so all handlers share a parallel signature
    :param dir_out: directory the JPEG frames are written to (created if needed)
    :param dir_videos: directory searched for .mp4/.webm/.mkv files
    """
    if not dir_out or not dir_videos:
        log.error('-o/--output and --videos required')
        return

    import cv2 as cv
    from tqdm import tqdm
    from app.processors import face_detector

    detector = face_detector.DetectorDLIBCNN()

    # get file list
    fp_videos = _list_videos(dir_videos)

    face_interval = 30  # minimum number of frames between two saved frames
    # NOTE: both counters intentionally keep running across videos, so the
    # number in a filename is a global frame count, not a per-video index.
    frame_interval_count = 0
    frame_count = 0

    file_utils.mkdirs(dir_out)

    for fp_video in tqdm(fp_videos):
        # log.debug('opening: {}'.format(fp_video))
        video = cv.VideoCapture(fp_video)
        try:
            while video.isOpened():
                res, frame = video.read()
                if not res:
                    break

                frame_count += 1  # for naming
                frame_interval_count += 1  # for interval

                # A frame can only be saved once the interval has elapsed, so
                # skip the (expensive) detector for frames that cannot qualify.
                if frame_interval_count < face_interval:
                    continue

                bboxes = detector.detect(frame, opt_size=(320, 240), opt_pyramids=0)
                if len(bboxes) > 0:
                    # save frame
                    fname = '{}_{}.jpg'.format(Path(fp_video).stem,
                                               file_utils.zpad(frame_count))
                    cv.imwrite(join(dir_out, fname), frame)
                    frame_interval_count = 0
        finally:
            # free the decoder/file handle even if detection or writing fails
            video.release()


def handle_metadata(fp_in, fp_out):
    """Fetch YouTube metadata for every video listed in a CSV and save it as CSV.

    :param fp_in: CSV file with at least `url` and `id` columns
    :param fp_out: path of the CSV file to write; required
    """
    if not fp_out:
        # to_csv(None) would only return a string and discard all fetched data
        log.error('-o/--output required')
        return

    keys = ['description', 'average_rating', 'dislike_count', 'categories',
            'thumbnail', 'title', 'upload_date', 'uploader_url', 'uploader_id',
            'fps', 'height', 'width', 'like_count', 'license', 'tags']

    import youtube_dl

    ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})

    df = pd.read_csv(fp_in)
    data_exp = []

    for _, row in df.iterrows():
        video_data = {'url': row['url'], 'id': row['id']}
        try:
            with ydl:
                url = 'http://www.youtube.com/watch?v={}'.format(row['id'])
                result = ydl.extract_info(url, download=False)
            # a playlist-style result wraps the video in 'entries'
            video = result['entries'][0] if 'entries' in result else result
        except Exception as e:
            log.warn('video unavilable: {}'.format(row['url']))
            log.error(e)
            continue

        for k in keys:
            # a missing field must not discard the whole video
            val = video.get(k)
            if val is None:
                continue
            if k == 'title':
                log.debug(val)
            if isinstance(val, list):
                val = '; '.join(str(v) for v in val)
            if isinstance(val, str):
                # keep free text comma-free for naive CSV consumers
                val = val.replace(',', ';')
            # numeric fields (fps, height, like_count, ...) are stored as-is
            video_data[k] = val
        # log.debug('video_data: {}'.format(video_data))

        data_exp.append(video_data)

    df_exp = pd.DataFrame(data_exp)
    df_exp.to_csv(fp_out)


def handle_download(fp_in, dir_out):
    """Download every video listed in a CSV that is not already in `dir_out`.

    A video counts as already downloaded when its id occurs in the filename of
    any .mp4/.webm/.mkv file in `dir_out`. New files are written to `dir_out`
    as `<id>.<ext>`, so a later run skips them.

    :param fp_in: CSV file with an `id` column
    :param dir_out: directory holding the downloaded videos; required
    """
    if not dir_out:
        log.error('-o/--output required')
        return

    import youtube_dl

    df = pd.read_csv(fp_in)
    # match against filenames only, so a directory name can never match an id
    existing = [Path(fp).name for fp in _list_videos(dir_out)]

    ydl = youtube_dl.YoutubeDL({'outtmpl': join(dir_out, '%(id)s.%(ext)s')})

    for _, row in df.iterrows():
        vid = str(row['id'])
        if any(vid in name for name in existing):
            log.debug('skip: {}'.format(vid))
            continue
        try:
            with ydl:
                ydl.download(['http://www.youtube.com/watch?v={}'.format(vid)])
        except Exception as e:
            # best effort: report and continue, but let Ctrl-C propagate
            log.error('could not dl: {}'.format(vid))
            log.error(e)


def handle_info(fp_in, fp_out):
    """Convert a text file of YouTube URLs into a CSV with `url` and `id` columns.

    :param fp_in: text file loaded with `file_utils.load_text`
    :param fp_out: path of the CSV file to write; required
    """
    if not fp_out:
        log.error('--output required')
        return

    urls = file_utils.load_text(fp_in)
    videos = []
    for url in urls:
        vid = _parse_video_id(url)
        if not vid:
            log.warn('no video id for {}'.format(url))
            continue
        videos.append({'url': url, 'id': vid})

    # convert to df
    df = pd.DataFrame(videos)
    df.to_csv(fp_out)


def handle_rename(fp_in, fp_out, dir_videos):
    """Rename video files in `dir_videos` to `<video id><original suffix>`.

    For each id in the CSV, the first not-yet-claimed video file whose
    filename contains that id is moved to its id-only name.

    :param fp_in: CSV file with an `id` column
    :param fp_out: unused; kept so all handlers share a parallel signature
    :param dir_videos: directory searched for .mp4/.webm/.mkv files; required
    """
    import shutil

    if not dir_videos:
        log.error('--videos required')
        return

    fp_videos = _list_videos(dir_videos)

    df = pd.read_csv(fp_in)
    for _, row in df.iterrows():
        vid = str(row['id'])
        for fp_video in fp_videos:
            if vid not in Path(fp_video).name:
                continue
            dst = join(dir_videos, '{}{}'.format(vid, Path(fp_video).suffix))
            if fp_video != dst:  # already named by id: nothing to move
                shutil.move(fp_video, dst)
                log.debug('move {} to {}'.format(fp_video, dst))
            # each file is claimed by at most one id; safe to mutate the list
            # here because we leave the loop immediately afterwards
            fp_videos.remove(fp_video)
            break