diff options
| author | adamhrv <adam@ahprojects.com> | 2019-02-11 23:25:13 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-02-11 23:25:13 +0100 |
| commit | 9115c4b920a6155f8b66ac64c71d008f67058e7e (patch) | |
| tree | 1590c105104f2da4b53f7963550b3cbc243364ae | |
| parent | 6f0a583de3a2fce438ecc424dd52c6a559088e87 (diff) | |
fix bug to skip existing files
| -rw-r--r-- | megapixels/commands/datasets/ijb_screenshot.py | 66 | ||||
| -rw-r--r-- | megapixels/commands/datasets/ijb_youtube_meta.py | 77 | ||||
| -rw-r--r-- | megapixels/commands/templates/multithreaded.py | 49 |
3 files changed, 146 insertions, 46 deletions
diff --git a/megapixels/commands/datasets/ijb_screenshot.py b/megapixels/commands/datasets/ijb_screenshot.py index 616893c7..7778978e 100644 --- a/megapixels/commands/datasets/ijb_screenshot.py +++ b/megapixels/commands/datasets/ijb_screenshot.py @@ -29,11 +29,16 @@ from app.settings import app_cfg help='Output directory') @click.option('-t', '--threads', 'opt_threads', default=20, help='Number of threads') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--verify', 'opt_verify', is_flag=True, + help='Only verify files') @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice, opt_verify): """IJB-C screenshot sources""" import sys + import os from glob import glob from os.path import join from pathlib import Path @@ -42,7 +47,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): from multiprocessing.dummy import Pool as ThreadPool import pandas as pd - import cv2 as cv + from PIL import Image + import io from tqdm import tqdm from selenium import webdriver @@ -66,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): pbar.update(1) driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_window_size(1920,1080) + driver.set_window_size(1920,3600) # accommodate vertical videos url = route['url'] fp_out = route['dst'] @@ -75,18 +81,40 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): if 'youtube.com' in url: try: - wait = WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + #wait = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME,'ytd-video-secondary-info-renderer'))) + wait = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID,'player-container-outer'))) except Exception as e: - log.debug(f'error: {e}') + log.debug(f'WebDriver error: {e}') pass else: wait = WebDriverWait(driver,10) time.sleep(1) # wait for element - time.sleep(10) # wait for element + time.sleep(5) # wait for element + #el_vid = driver.find_element_by_id('player-container-outer') + el_shelf = driver.find_element_by_id('ticket-shelf') + el_related = driver.find_element_by_id('related') + el_primary = driver.find_element_by_id('primary') + err = False + try: + el_error = driver.find_element_by_id('error-screen') + if not(el_error.location['x'] == 0 and el_error.location['width'] == 0): + err = True + except: + pass + + margin_left = 24 + margin_bottom = 24 if err else 0 + box = (el_primary.location['x'] - margin_left, el_primary.location['y'], + el_primary.location['x'] + el_primary.size['width'], el_shelf.location['y'] + margin_bottom) + im_bytes = driver.get_screenshot_as_png() + im = Image.open(io.BytesIO(im_bytes)) + im = im.crop(box) + file_utils.mkdirs(fp_out) log.debug(f'save to: {fp_out}') - driver.get_screenshot_as_file(fp_out) + #driver.get_screenshot_as_file(fp_out) + im.save(fp_out) driver.quit() return True @@ -95,6 +123,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): # load routes = [] + video_ids = [] + df_licenses = pd.read_csv(opt_fp_in) log.info(f'{len(df_licenses)} rows') for df_idx, df_license in tqdm(df_licenses.iterrows(), total=len(df_licenses)): @@ -102,18 +132,28 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): if not 'video/' in filepath: continue url = str(df_license['Media URL']) + try: + video_id = url.split('?v=')[1] + except Exception as e: + log.debug(f'error parsing url: "{url}"') + if video_id in video_ids: + continue + video_ids.append(video_id) if not ('http://' in url or 'https://' in url): url = 'http://' + url - fp_media = filepath.replace(Path(filepath).suffix, '.png') - fp_out = join(opt_fp_out, fp_media) + #fp_media = filepath.replace(Path(filepath).suffix, '.png') + #fp_out = join(opt_fp_out, fp_media) + fp_out = join(opt_fp_out, f'{video_id}.png') + if Path(fp_out).exists() and (os.stat(fp_out).st_size // 1000) > 13: + continue obj = {'url': url, 'dst': fp_out} routes.append(obj) - # setup multithreading - for route in routes: - log.debug(f'url: {route["url"]}, dst: {route["dst"]}') + if opt_slice: + routes = routes[opt_slice[0]:opt_slice[1]] + log.debug(f'processing: {len(routes)}') - return + # setup multithreading results = [] pbar = tqdm(total=len(routes)) pool_process = partial(pool_process, chrome_options=chrome_options) diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py index 87df390c..374f651c 100644 --- a/megapixels/commands/datasets/ijb_youtube_meta.py +++ b/megapixels/commands/datasets/ijb_youtube_meta.py @@ -23,7 +23,7 @@ from app.settings import app_cfg fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv' fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv' fps_default_in = [fp_default_in_a, fp_default_in_b] -fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv' +fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv' @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True, @@ -58,32 +58,31 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): metavars = [ - {'name': ('title','title')}, - {'name': ('description', 'description')}, - {'name': ('keywords', 'keywords')}, - {'itemprop': ('paid', 'paid')}, - {'itemprop': ('videoId', 'video_id')}, - {'itemprop': ('duration', 'duration')}, - {'itemprop': ('width', 'width')}, - {'itemprop': ('height', 'height')}, - {'itemprop': ('isFamilyFriendly', 'is_family_friendly')}, - {'itemprop': ('interactionCount', 'views')}, - {'itemprop': ('datePublished', 'date_published')}, - {'itemprop': ('genre', 'genre')}, - {'itemprop': ('unlisted', 'genre')} + {'name': ('title','yt_title')}, + {'name': ('description', 'yt_description')}, + {'name': ('keywords', 'yt_keywords')}, + {'itemprop': ('paid', 'yt_paid')}, + {'itemprop': ('videoId', 'yt_video_id')}, + {'itemprop': ('duration', 'yt_duration')}, + {'itemprop': ('width', 'yt_width')}, + {'itemprop': ('height', 'yt_height')}, + {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')}, + {'itemprop': ('interactionCount', 'yt_views')}, + {'itemprop': ('datePublished', 'yt_date_published')}, + {'itemprop': ('genre', 'yt_genre')}, + {'itemprop': ('unlisted', 'yt_unlisted')} ] - from pprint import pprint + # from pprint import pprint def pool_process(media_item): # threaded function global parse_yt_page results = [] try: - url = media_item['media_url'].strip() + url = media_item['ijb_media_url'].strip() url = url.replace('http:', 'https:') url = url.replace('www.youtube','youtube') - log.debug(f'get: {url}') - data = urllib.request.urlopen(url, timeout=60).read() + data = urllib.request.urlopen(url, timeout=30).read() soup = BeautifulSoup(data,'lxml') for metavar in metavars: propname, propvals = list(metavar.items())[0] @@ -91,18 +90,30 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): content = soup.find('meta', attrs={propname:propvals[0]}) if content: media_item[propvals[1]] = content.get('content','') - if 'duration' in media_item.keys(): + + # description, or error, is not metavar because it can be truncated + desc_result = soup.find('p', attrs={'id': 'eow-description'}) + description = desc_result.text if desc_result else '' + + if not 'yt_duration' in media_item.keys(): + error_result = soup.find('div', attrs={'id': 'player-unavailable'}) + description = error_result.text if error_result else 'Video unavailable' + + media_item['yt_description'] = description + log.debug(f'url: {url}, description: {description}') + + if 'yt_duration' in media_item.keys(): # fix values - duration = media_item['duration'] + duration = media_item['yt_duration'] mins = int(duration.split('M')[0].replace('PT','')) secs = int(duration.split('M')[1].replace('S','')) - media_item['duration'] = mins + (60 * secs) - if 'paid' in media_item.keys(): - media_item['paid'] = int(bool(media_item['paid'] == 'True')) - if 'is_family_friendly' in media_item.keys(): - media_item['is_family_friendly'] = int(bool(media_item['is_family_friendly'] == 'True')) + media_item['yt_duration'] = mins + (60 * secs) + if 'yt_paid' in media_item.keys(): + media_item['yt_paid'] = int(bool(media_item['yt_paid'] == 'True')) + if 'yt_is_family_friendly' in media_item.keys(): + media_item['yt_is_family_friendly'] = int(bool(media_item['yt_is_family_friendly'] == 'True')) except Exception as e: - log.debug(f'Error: {e}, {media_item["media_url"]}') + log.debug(f'Error: {e}, {media_item["ijb_media_url"]}') pbar.update(1) return media_item # a list of dict key:val dicts @@ -117,17 +128,17 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): df_media = df_media.append(df, ignore_index=True) name_maps = { - 'Media ID': 'media_id', - 'Media URL': 'media_url', - 'Source URL': 'source_url', - 'Attribution': 'attribution', - 'CC License': 'cc_license', + 'Media ID': 'ijb_media_id', + 'Media URL': 'ijb_media_url', + 'Source URL': 'ijb_source_url', + 'Attribution': 'ijb_attribution', + 'CC License': 'ijb_cc_license', } df_media.rename(columns=name_maps, inplace=True) log.info(f'{len(df_media)} rows') - df_media = df_media[df_media.media_id.str.contains("video/")] + df_media = df_media[df_media.ijb_media_id.str.contains("video/")] log.info(f'{len(df_media)} rows') - df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True) + df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True) log.info(f'{len(df_media)} rows') media_items = df_media.to_dict('records') diff --git a/megapixels/commands/templates/multithreaded.py b/megapixels/commands/templates/multithreaded.py new file mode 100644 index 00000000..fec3dac4 --- /dev/null +++ b/megapixels/commands/templates/multithreaded.py @@ -0,0 +1,49 @@ +import click + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input file') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file') +@click.option('-t', '--threads', 'opt_threads', default=4, + help='Number of threads') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): + """Template for multithreading""" + + from functools import partial + from multiprocessing.dummy import Pool as ThreadPool + + from tqdm import tqdm + + from app.utils.logger_utils import Logger + + log = Logger.getLogger() + log.info('multithreaded template') + + # setup multithreading function + def pool_process(data_obj): + # threaded function + global parse_yt_page + results = [] + try: + # do something here with data_obj + except Exception as e: + log.debug(f'Error: {e}') + pbar.update(1) + return results + + # setup multithreading data holds + items = [] # list of dicts to process + results = [] + num_items = len(items) + + # run the multithreading with progress bar + pbar = tqdm(total=num_items) + pool_process = partial(pool_process) + pool = ThreadPool(opt_threads) + with tqdm(total=num_items) as pbar: + results = pool.map(pool_process, media_items) + + pbar.close() + |
