diff options
Diffstat (limited to 'megapixels/commands/datasets/ijb_youtube_meta.py')
| -rw-r--r-- | megapixels/commands/datasets/ijb_youtube_meta.py | 77 |
1 file changed, 44 insertions, 33 deletions
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py index 87df390c..374f651c 100644 --- a/megapixels/commands/datasets/ijb_youtube_meta.py +++ b/megapixels/commands/datasets/ijb_youtube_meta.py @@ -23,7 +23,7 @@ from app.settings import app_cfg fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv' fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv' fps_default_in = [fp_default_in_a, fp_default_in_b] -fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv' +fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv' @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True, @@ -58,32 +58,31 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): metavars = [ - {'name': ('title','title')}, - {'name': ('description', 'description')}, - {'name': ('keywords', 'keywords')}, - {'itemprop': ('paid', 'paid')}, - {'itemprop': ('videoId', 'video_id')}, - {'itemprop': ('duration', 'duration')}, - {'itemprop': ('width', 'width')}, - {'itemprop': ('height', 'height')}, - {'itemprop': ('isFamilyFriendly', 'is_family_friendly')}, - {'itemprop': ('interactionCount', 'views')}, - {'itemprop': ('datePublished', 'date_published')}, - {'itemprop': ('genre', 'genre')}, - {'itemprop': ('unlisted', 'genre')} + {'name': ('title','yt_title')}, + {'name': ('description', 'yt_description')}, + {'name': ('keywords', 'yt_keywords')}, + {'itemprop': ('paid', 'yt_paid')}, + {'itemprop': ('videoId', 'yt_video_id')}, + {'itemprop': ('duration', 'yt_duration')}, + {'itemprop': ('width', 'yt_width')}, + {'itemprop': ('height', 'yt_height')}, + {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')}, + {'itemprop': ('interactionCount', 'yt_views')}, + {'itemprop': ('datePublished', 'yt_date_published')}, + {'itemprop': ('genre', 
'yt_genre')}, + {'itemprop': ('unlisted', 'yt_unlisted')} ] - from pprint import pprint + # from pprint import pprint def pool_process(media_item): # threaded function global parse_yt_page results = [] try: - url = media_item['media_url'].strip() + url = media_item['ijb_media_url'].strip() url = url.replace('http:', 'https:') url = url.replace('www.youtube','youtube') - log.debug(f'get: {url}') - data = urllib.request.urlopen(url, timeout=60).read() + data = urllib.request.urlopen(url, timeout=30).read() soup = BeautifulSoup(data,'lxml') for metavar in metavars: propname, propvals = list(metavar.items())[0] @@ -91,18 +90,30 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): content = soup.find('meta', attrs={propname:propvals[0]}) if content: media_item[propvals[1]] = content.get('content','') - if 'duration' in media_item.keys(): + + # description, or error, is not metavar because it can be truncated + desc_result = soup.find('p', attrs={'id': 'eow-description'}) + description = desc_result.text if desc_result else '' + + if not 'yt_duration' in media_item.keys(): + error_result = soup.find('div', attrs={'id': 'player-unavailable'}) + description = error_result.text if error_result else 'Video unavailable' + + media_item['yt_description'] = description + log.debug(f'url: {url}, description: {description}') + + if 'yt_duration' in media_item.keys(): # fix values - duration = media_item['duration'] + duration = media_item['yt_duration'] mins = int(duration.split('M')[0].replace('PT','')) secs = int(duration.split('M')[1].replace('S','')) - media_item['duration'] = mins + (60 * secs) - if 'paid' in media_item.keys(): - media_item['paid'] = int(bool(media_item['paid'] == 'True')) - if 'is_family_friendly' in media_item.keys(): - media_item['is_family_friendly'] = int(bool(media_item['is_family_friendly'] == 'True')) + media_item['yt_duration'] = mins + (60 * secs) + if 'yt_paid' in media_item.keys(): + media_item['yt_paid'] = int(bool(media_item['yt_paid'] == 
'True')) + if 'yt_is_family_friendly' in media_item.keys(): + media_item['yt_is_family_friendly'] = int(bool(media_item['yt_is_family_friendly'] == 'True')) except Exception as e: - log.debug(f'Error: {e}, {media_item["media_url"]}') + log.debug(f'Error: {e}, {media_item["ijb_media_url"]}') pbar.update(1) return media_item # a list of dict key:val dicts @@ -117,17 +128,17 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): df_media = df_media.append(df, ignore_index=True) name_maps = { - 'Media ID': 'media_id', - 'Media URL': 'media_url', - 'Source URL': 'source_url', - 'Attribution': 'attribution', - 'CC License': 'cc_license', + 'Media ID': 'ijb_media_id', + 'Media URL': 'ijb_media_url', + 'Source URL': 'ijb_source_url', + 'Attribution': 'ijb_attribution', + 'CC License': 'ijb_cc_license', } df_media.rename(columns=name_maps, inplace=True) log.info(f'{len(df_media)} rows') - df_media = df_media[df_media.media_id.str.contains("video/")] + df_media = df_media[df_media.ijb_media_id.str.contains("video/")] log.info(f'{len(df_media)} rows') - df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True) + df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True) log.info(f'{len(df_media)} rows') media_items = df_media.to_dict('records') |
