summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/ijb_youtube_meta.py
diff options
context:
space:
mode:
author    adamhrv <adam@ahprojects.com>  2019-02-11 23:25:13 +0100
committer adamhrv <adam@ahprojects.com>  2019-02-11 23:25:13 +0100
commit 9115c4b920a6155f8b66ac64c71d008f67058e7e (patch)
tree   1590c105104f2da4b53f7963550b3cbc243364ae /megapixels/commands/datasets/ijb_youtube_meta.py
parent 6f0a583de3a2fce438ecc424dd52c6a559088e87 (diff)
fix bug to skip existing files
Diffstat (limited to 'megapixels/commands/datasets/ijb_youtube_meta.py')
-rw-r--r--  megapixels/commands/datasets/ijb_youtube_meta.py  77
1 file changed, 44 insertions, 33 deletions
diff --git a/megapixels/commands/datasets/ijb_youtube_meta.py b/megapixels/commands/datasets/ijb_youtube_meta.py
index 87df390c..374f651c 100644
--- a/megapixels/commands/datasets/ijb_youtube_meta.py
+++ b/megapixels/commands/datasets/ijb_youtube_meta.py
@@ -23,7 +23,7 @@ from app.settings import app_cfg
fp_default_in_a = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'
fp_default_in_b = '/data_store/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'
fps_default_in = [fp_default_in_a, fp_default_in_b]
-fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_media_ytmeta.csv'
+fp_default_out = '/data_store/datasets/people/ijb_c/research/cs3_4_media_youtube_meta.csv'
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fps_default_in, multiple=True,
@@ -58,32 +58,31 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
metavars = [
- {'name': ('title','title')},
- {'name': ('description', 'description')},
- {'name': ('keywords', 'keywords')},
- {'itemprop': ('paid', 'paid')},
- {'itemprop': ('videoId', 'video_id')},
- {'itemprop': ('duration', 'duration')},
- {'itemprop': ('width', 'width')},
- {'itemprop': ('height', 'height')},
- {'itemprop': ('isFamilyFriendly', 'is_family_friendly')},
- {'itemprop': ('interactionCount', 'views')},
- {'itemprop': ('datePublished', 'date_published')},
- {'itemprop': ('genre', 'genre')},
- {'itemprop': ('unlisted', 'genre')}
+ {'name': ('title','yt_title')},
+ {'name': ('description', 'yt_description')},
+ {'name': ('keywords', 'yt_keywords')},
+ {'itemprop': ('paid', 'yt_paid')},
+ {'itemprop': ('videoId', 'yt_video_id')},
+ {'itemprop': ('duration', 'yt_duration')},
+ {'itemprop': ('width', 'yt_width')},
+ {'itemprop': ('height', 'yt_height')},
+ {'itemprop': ('isFamilyFriendly', 'yt_is_family_friendly')},
+ {'itemprop': ('interactionCount', 'yt_views')},
+ {'itemprop': ('datePublished', 'yt_date_published')},
+ {'itemprop': ('genre', 'yt_genre')},
+ {'itemprop': ('unlisted', 'yt_unlisted')}
]
- from pprint import pprint
+ # from pprint import pprint
def pool_process(media_item):
# threaded function
global parse_yt_page
results = []
try:
- url = media_item['media_url'].strip()
+ url = media_item['ijb_media_url'].strip()
url = url.replace('http:', 'https:')
url = url.replace('www.youtube','youtube')
- log.debug(f'get: {url}')
- data = urllib.request.urlopen(url, timeout=60).read()
+ data = urllib.request.urlopen(url, timeout=30).read()
soup = BeautifulSoup(data,'lxml')
for metavar in metavars:
propname, propvals = list(metavar.items())[0]
@@ -91,18 +90,30 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
content = soup.find('meta', attrs={propname:propvals[0]})
if content:
media_item[propvals[1]] = content.get('content','')
- if 'duration' in media_item.keys():
+
+ # description, or error, is not metavar because it can be truncated
+ desc_result = soup.find('p', attrs={'id': 'eow-description'})
+ description = desc_result.text if desc_result else ''
+
+ if not 'yt_duration' in media_item.keys():
+ error_result = soup.find('div', attrs={'id': 'player-unavailable'})
+ description = error_result.text if error_result else 'Video unavailable'
+
+ media_item['yt_description'] = description
+ log.debug(f'url: {url}, description: {description}')
+
+ if 'yt_duration' in media_item.keys():
# fix values
- duration = media_item['duration']
+ duration = media_item['yt_duration']
mins = int(duration.split('M')[0].replace('PT',''))
secs = int(duration.split('M')[1].replace('S',''))
- media_item['duration'] = mins + (60 * secs)
- if 'paid' in media_item.keys():
- media_item['paid'] = int(bool(media_item['paid'] == 'True'))
- if 'is_family_friendly' in media_item.keys():
- media_item['is_family_friendly'] = int(bool(media_item['is_family_friendly'] == 'True'))
+ media_item['yt_duration'] = mins + (60 * secs)
+ if 'yt_paid' in media_item.keys():
+ media_item['yt_paid'] = int(bool(media_item['yt_paid'] == 'True'))
+ if 'yt_is_family_friendly' in media_item.keys():
+ media_item['yt_is_family_friendly'] = int(bool(media_item['yt_is_family_friendly'] == 'True'))
except Exception as e:
- log.debug(f'Error: {e}, {media_item["media_url"]}')
+ log.debug(f'Error: {e}, {media_item["ijb_media_url"]}')
pbar.update(1)
return media_item # a list of dict key:val dicts
@@ -117,17 +128,17 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
df_media = df_media.append(df, ignore_index=True)
name_maps = {
- 'Media ID': 'media_id',
- 'Media URL': 'media_url',
- 'Source URL': 'source_url',
- 'Attribution': 'attribution',
- 'CC License': 'cc_license',
+ 'Media ID': 'ijb_media_id',
+ 'Media URL': 'ijb_media_url',
+ 'Source URL': 'ijb_source_url',
+ 'Attribution': 'ijb_attribution',
+ 'CC License': 'ijb_cc_license',
}
df_media.rename(columns=name_maps, inplace=True)
log.info(f'{len(df_media)} rows')
- df_media = df_media[df_media.media_id.str.contains("video/")]
+ df_media = df_media[df_media.ijb_media_id.str.contains("video/")]
log.info(f'{len(df_media)} rows')
- df_media.drop_duplicates(subset=['media_url'], keep='first', inplace=True)
+ df_media.drop_duplicates(subset=['ijb_media_url'], keep='first', inplace=True)
log.info(f'{len(df_media)} rows')
media_items = df_media.to_dict('records')