diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
17 files changed, 2319 insertions, 215 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py index 98d36b5f..5ce0a678 100644 --- a/megapixels/app/settings/app_cfg.py +++ b/megapixels/app/settings/app_cfg.py @@ -95,6 +95,7 @@ DIR_COMMANDS_FAISS = 'commands/faiss' DIR_COMMANDS_MISC = 'commands/misc' DIR_COMMANDS_SITE = 'commands/site' DIR_COMMANDS_DEMO = 'commands/demo' +DIR_COMMANDS_MSC = 'commands/msc' # ----------------------------------------------------------------------------- # Filesystem settings diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py index ed717662..0b81fef6 100644 --- a/megapixels/commands/datasets/download_ibmdif.py +++ b/megapixels/commands/datasets/download_ibmdif.py @@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t help='Output path') @click.option('-t', '--threads', 'opt_threads', default=8, help='Number of threads') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') @click.option('--agents', 'opt_fp_agents', default=fp_user_agents) @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents): """Threaded image/file downloader""" """ @@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): urllib.request.urlretrieve(item['url'], fp_out) item['status'] = True except Exception as e: + log.debug(f'Failed: user: {item["username"]}, url: {url}') if str(e) != 'HTTP Error 403: Forbidden': log.debug(f'Error: {e}') fp_error = f'{fp_out}_error.txt' @@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): # setup multithreading data holders log.debug(f'loading {opt_fp_in}') df_records = pd.read_csv(opt_fp_in) + if opt_slice: + df_records = df_records[opt_slice[0]:opt_slice[1]] log.debug(f'loaded {len(df_records):,} csv records') log.debug('deduplicating') df_records = df_records.drop_duplicates(subset='sha256', keep="last") @@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): for x in tqdm(records): sha256 = x['sha256'] - + username = x['username'] fp_dst = join(opt_fp_out, f"{sha256}.json") fp_dst_is_file = Path(fp_dst).is_file() fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file() @@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): if not (fp_dst_is_file or fp_dst_is_err): url = url_prefix + sha256 + '.json' user_agent = user_agents[randint(0, len(user_agents)) - 1] - pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent}) + pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent}) else: n_skipped += 1 diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py index c64afbba..45ca8f6e 100644 --- a/megapixels/commands/datasets/download_images.py +++ b/megapixels/commands/datasets/download_images.py @@ -6,9 +6,9 @@ import click help='Input') @click.option('-o', '--output', 'opt_fp_out', required=True, help='Output') -@click.option('-t', '--threads', 'opt_threads', default=8, +@click.option('-t', '--threads', 'opt_threads', default=8, show_default=True, help='Number of threads') -@click.option('--wayback', 'opt_wayback', is_flag=True, +@click.option('--wayback', 'opt_wayback', is_flag=True, default=False, help='Check Wayback archive for URL and download cached image') @click.pass_context def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): @@ -52,7 +52,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): estr = str(e) if item['opt_wayback']: if 'HTTP Error' in estr: - # check + # TODO add/parse/handle request for wayback machine archive url_wayback = url_wayback_base + item['url'] fp_error = f'{fp_out}_error.txt' with open(fp_error, 'w') as fp: @@ -67,6 +67,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): pool_items = [] + log.debug(f'Initializing multithreaded pool...') for x in tqdm(records): fp_dst = join(opt_fp_out, x['filepath']) fp_dst_is_file = Path(fp_dst).is_file() @@ -75,7 +76,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback}) num_items = len(pool_items) - log.info(f'processing {num_items:,} items') + log.info(f'Going to download {num_items:,} files') pool_results = [] # run the multithreading with progress bar diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py index 780ede49..f09f3089 100644 --- a/megapixels/commands/datasets/flickr_api.py +++ b/megapixels/commands/datasets/flickr_api.py @@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter from app.utils import file_utils, im_utils -query_types = ['photo_id', 'album_id', 'flickr_id'] +query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile'] +# ??? +# photo_id: 123456789 # flickr_id: 123456789@N01 -# photo_id: log = logger_utils.Logger.getLogger() @@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger() help='Output directory') @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), help='Slice list of files') -@click.option('--query-type', 'opt_query_type', default='photo_id', +@click.option('-q', '--query', 'opt_query_type', required=True, type=click.Choice(query_types), help='API query type') @click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1') @@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, # process if not opt_api_key or not opt_api_secret: - log.error('source .env vars for Flickr API and try again') + log.error('source env/flickr.env vars for Flickr API and try again') return # check how many flickr keys api_keys = [] api_secrets = [] - for i in range(1,10): + for i in range(1,20): try: var_name_key = f'FLICKR_API_KEY_{i}' var_name_secret = f'FLICKR_API_SECRET_{i}' @@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, log.info(f'Shuffling between: {len(api_keys)} api keys') # read in CSV - # | query, filepath | - - records = pd.read_csv(opt_fp_in).to_dict('records') + # | query, filename, count | + df_records = pd.read_csv(opt_fp_in) + log.info(f'Dedpuplicating {len(df_records)}') + if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile': + df_records = df_records.drop_duplicates(subset='nsid', keep="last") + else: + df_records = df_records.drop_duplicates(subset='photo_id', keep="last") + log.info(f'After deduplication: {len(df_records)}') + records = df_records.to_dict('records') + if opt_slice: records = records[opt_slice[0]:opt_slice[1]] @@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, for record in tqdm(records): - fp_out = join(opt_fp_out, record['filepath']) + if 'nsid' in opt_query_type: + fp_out = join(opt_fp_out, f"{record['nsid']}.json") + else: + fp_out = join(opt_fp_out, f'{record["photo_id"]}.json') + fp_out_err = fp_out + '_error.txt' if Path(fp_out).is_file() or Path(fp_out_err).is_file(): continue - # append relevant data try: # shuffle the api keys to avoid rate limiting rand_int = randint(0,len(api_keys)-1) api_key = api_keys[rand_int] api_secret = api_secrets[rand_int] - - #flickr_api.set_keys(api_key=api_key, api_secret=api_secret) - #photo = flickr_api.Photo(id=record['query']) - # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1 - photo_id = record['query'] - flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo' - flickr_url += f'&api_key={api_key}' - flickr_url += f'&photo_id={photo_id}' - flickr_url += '&format=json' - flickr_url += '&nojsoncallback=1' + # https://www.flickr.com/services/rest/ + if opt_query_type == 'nsid_url': + # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1 + # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00 + nsid = record['nsid'] + nsid_encoded = urllib.parse.quote_plus(nsid) + flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&user_id={nsid_encoded}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' + # https://www.flickr.com/services/rest/ + if opt_query_type == 'nsid_profile': + # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1 + # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00 + nsid = record['nsid'] + nsid_encoded = urllib.parse.quote_plus(nsid) + flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&user_id={nsid_encoded}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' + elif opt_query_type == 'photo_id': + # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1 + fp_out + photo_id = record['photo_id'] + flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&photo_id={photo_id}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' with urllib.request.urlopen(flickr_url) as url: data = json.loads(url.read().decode()) if data['stat'] =='fail': - raise Exception('failed') + error_msg = data["message"] + log.error(f'Failed. Message: {error_msg}, url: {flickr_url}') + if error_msg == 'Service currently unavailable': + time.sleep(10) + raise Exception(error_msg) elif data['stat'] =='ok': with open(fp_out, 'w') as fp: json.dump(data, fp, sort_keys=True, indent=2) - #except FlickrAPIError as e: except Exception as e: - # if "HTTP Server Error 500" in str(e): - log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}') - if "not found" in str(e) or 'failed' in str(e): + log.error(f'{e}') + if "not found" in str(e) or 'Invalid NSID provided' in str(e): with open(fp_out_err, 'w') as fp: fp.write('') diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py new file mode 100644 index 00000000..5b5f0ce3 --- /dev/null +++ b/megapixels/commands/datasets/flickr_api_to_csv.py @@ -0,0 +1,382 @@ +""" +Converts directory of JSON API output files to CSV format +""" + +from glob import glob +import os +from os.path import join +from pathlib import Path + +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg +from app.utils import logger_utils + +import pandas as pd +from PIL import Image, ImageOps, ImageFilter +from app.utils import file_utils, im_utils + + +query_types = ['nsid_profile', 'nsid_url', 'photo_id'] + +log = logger_utils.Logger.getLogger() + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite') +@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True) +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type): + """Fetches Flickr API for user info. Saves to JSON""" + + from tqdm import tqdm + from glob import glob + import json + + + # ------------------------------------------------- + # process + if Path(opt_fp_out).is_file() and not opt_force: + log.error('File exists. Use "--force" to overwrite it') + return + + fp_files = glob(join(opt_fp_in, '*.json')) + fp_files = [f for f in fp_files if 'error' not in f] + if opt_slice: + fp_files = fp_files[opt_slice[0]:opt_slice[1]] + + log.debug(f'Found {len(fp_files)} files') + items = [] + + for fp_file in tqdm(fp_files): + + if opt_query_type == 'photo_id': + try: + photo = file_utils.load_json(fp_file).get('photo') + except Exception as e: + log.error(f'{e}, skipping: {fp_file}') + continue + dates = photo.get('dates') + posted = dates.get('posted') + taken = dates.get('taken') + description = photo.get('description').get('_content') + location = photo.get('location', {}) + country = location.get('country', {}) + location_country = country.get('_country', '') + location_place = country.get('place_id', '') + location_woeid = country.get('woeid', '') + location_lat = location.get('latitude', '') + location_lon = location.get('longitude', '') + location_place_id = location.get('place_id', '') + owner = photo.get('owner') + nsid = owner.get('nsid') + path_alias = owner.get('path_alias') + owner_realname = owner.get('realname') + owner_username = owner.get('username') + owner_location = owner.get('location') + photo_id = Path(fp_file).stem + server = photo.get('server') + farm = photo.get('farm') + secret = photo.get('secret') + # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg + image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg' + + obj = { + 'posted': posted, + 'taken': taken, + 'description': description, + 'country': location_country, + 'place': location_place, + 'woeid': location_woeid, + 'lat': location_lat, + 'lon': location_lon, + 'place_id': location_place_id, + 'nsid': nsid, + 'path_alias': path_alias, + 'realname': owner_realname, + 'username': owner_username, + 'owner_location': owner_location, + 'photo_id': photo_id, + 'secret': secret, + 'url': image_url + } + + + elif opt_query_type == 'nsid_profile': + obj = file_utils.load_json(fp_file).get('profile') + obj.pop('showcase_set') + obj.pop('showcase_set_title') + obj.pop('pinterest') + obj.pop('tumblr') + elif opt_query_type == 'nsid_url': + obj = file_utils.load_json(fp_file).get('user') + elif opt_query_type == 'user_profile': + metadata = file_utils.load_json(fp_file).get('photo') + owner = metadata.get('owner') + path_alias = owner.get('path_alias') + nsid = owner.get('nsid') + username = owner.get('username') + realname = owner.get('realname') + description = metadata.get('description').get('_content') + title = metadata.get('title').get('_content') + location = metadata.get('location') + dates = metadata.get('dates') + date_taken = dates.get('taken') + date_posted = dates.get('posted') + fname = Path(fp_file).stem + obj = { + 'photo_id': fname, + 'nsid': nsid, + 'path_alias': path_alias, + 'username': username, + 'realname': realname, + 'title': title, + 'description': description, + 'location': location, + 'date_taken': date_taken, + 'date_posted': date_posted + } + + items.append(obj) + + # conver to DataFrame + df = pd.DataFrame.from_dict(items) + df.to_csv(opt_fp_out, index=False) + log.info(f'Wrote {len(df)} to {opt_fp_out}') + +""" +nsid_url + { + "stat": "ok", + "user": { + "nsid": "7153718@N04", + "url": "https://www.flickr.com/people/babyfish4/" + } +} +""" +""" + location: of the owner + dateuploaded + license + "dates": + "lastupdate": "1416447096" + "posted": "1112900873" + "taken": "2005-04-06 18:37:38" + description: + _content: playing in a field + title: + _content: jessica + location: cornwall, uk +""" + +""" + { + "profile": { + "city": null, + "country": null, + "facebook": "", + "first_name": null, + "hometown": "", + "id": "7153718@N04", + "instagram": "", + "join_date": "1172669959", + "last_name": null, + "nsid": "7153718@N04", + "occupation": "", + "pinterest": "", + "profile_description": "", + "showcase_set": "72157680616398790", + "showcase_set_title": "Profile Showcase", + "tumblr": "", + "twitter": "" + }, + "stat": "ok" +} +""" + +""" +photo_id + + + { + "photo": { + "comments": { + "_content": "0" + }, + "dates": { + "lastupdate": "0", + "posted": "1094612969", + "taken": "2004-09-04 22:41:18", + "takengranularity": "0", + "takenunknown": 0 + }, + "dateuploaded": "1094612969", + "description": { + "_content": "" + }, + "editability": { + "canaddmeta": 0, + "cancomment": 0 + }, + "farm": 1, + "geoperms": { + "iscontact": 0, + "isfamily": 0, + "isfriend": 0, + "ispublic": 1 + }, + "id": "371498", + "isfavorite": 0, + "license": "1", + "location": { + "accuracy": "15", + "context": "0", + "country": { + "_content": "United States", + "place_id": "nz.gsghTUb4c2WAecA", + "woeid": "23424977" + }, + "county": { + "_content": "Tompkins", + "place_id": "1uCJJtBQUL80G6hbPw", + "woeid": "12589366" + }, + "latitude": "42.399028", + "longitude": "-76.652519", + "place_id": "1uCJJtBQUL80G6hbPw", + "region": { + "_content": "New York", + "place_id": "ODHTuIhTUb75gdBu", + "woeid": "2347591" + }, + "woeid": "12589366" + }, + "media": "photo", + "notes": { + "note": [] + }, + "originalformat": "jpg", + "originalsecret": "704f392686", + "owner": { + "iconfarm": 1, + "iconserver": "1", + "location": "Los Angeles, CA, USA", + "nsid": "48600072071@N01", + "path_alias": "barb", + "realname": "Barb Dybwad", + "username": "doctor paradox" + }, + "people": { + "haspeople": 0 + }, + "publiceditability": { + "canaddmeta": 0, + "cancomment": 1 + }, + "rotation": 0, + "safety_level": "0", + "secret": "704f392686", + "server": "1", + "tags": { + "tag": [ + { + "_content": "unfound", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-9017", + "machine_tag": 0, + "raw": "unfound" + }, + { + "_content": "digicam", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-40406", + "machine_tag": 0, + "raw": "digicam" + }, + { + "_content": "upstateny", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-9655", + "machine_tag": 0, + "raw": "upstateny" + }, + { + "_content": "musefest", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-72456", + "machine_tag": 0, + "raw": "musefest" + }, + { + "_content": "musicfestival", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-72628", + "machine_tag": 0, + "raw": "musicfestival" + }, + { + "_content": "people", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-290", + "machine_tag": 0, + "raw": "people" + }, + { + "_content": "portrait", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-278", + "machine_tag": 0, + "raw": "portrait" + }, + { + "_content": "maco", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-19439", + "machine_tag": 0, + "raw": "maco" + } + ] + }, + "title": { + "_content": "maco2" + }, + "urls": { + "url": [ + { + "_content": "https://www.flickr.com/photos/barb/371498/", + "type": "photopage" + } + ] + }, + "usage": { + "canblog": 0, + "candownload": 1, + "canprint": 0, + "canshare": 1 + }, + "views": "290", + "visibility": { + "isfamily": 0, + "isfriend": 0, + "ispublic": 1 + } + }, + "stat": "ok" +} +"""
\ No newline at end of file diff --git a/megapixels/commands/msc/count.py b/megapixels/commands/msc/count.py new file mode 100644 index 00000000..3c242bc6 --- /dev/null +++ b/megapixels/commands/msc/count.py @@ -0,0 +1,123 @@ +from os.path import join + +import click + +from app.utils.logger_utils import Logger + +log = Logger.getLogger() + +# datasets +dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face'] + + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input file for embassies') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite') +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force): + """Cross reference""" + + import sys + from os.path import join + from glob import glob + from pathlib import Path + import time + + import pandas as pd + from tqdm import tqdm + + log = Logger.getLogger() + log.info('Cross reference embassy list') + + + fp_counts = {} + fp_filepaths = {} + fp_dataset_base = '/data_store/datasets/people/' + + for dk in dataset_keys: + fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv') + fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv') + + df_embassies = pd.read_csv(opt_fp_in) + df_embassies.fillna('', inplace=True) + embassy_nsids = list(df_embassies['nsid']) + + match_items = [] + embassy_images = [] + malta_images = [] + + for dataset_key, fp_dataset in tqdm(fp_counts.items()): + df_counts = pd.read_csv(fp_dataset) + log.debug(f'loading: {fp_filepaths[dataset_key]}') + df_filepaths = pd.read_csv(fp_filepaths[dataset_key]) + nsids = list(df_counts['nsid']) + for nsid in nsids: + if nsid in embassy_nsids: + # add to matches, and count + count = df_counts[df_counts['nsid'] == nsid]['count'].values[0] + first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0] + last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0] + path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0] + page_url = f'https://flickr.com/photos/{path_alias}' + embassy_name = f'{first_name} {last_name}' + embassy_meta = df_embassies[df_embassies['nsid'] == nsid].iloc[0] + + match_obj = { + 'count': count, + 'path_alias': path_alias, + 'name': embassy_name, + 'dataset_key': dataset_key, + 'nsid': nsid, + 'page_url': page_url, + 'embassy_type': embassy_meta.type, + 'username': embassy_meta.username + } + match_items.append(match_obj) + + # add photo ids or url + df_nsids = df_filepaths[df_filepaths['nsid'] == nsid] + nsid_records = df_nsids.to_dict('records') + for nsid_record in nsid_records: + photo_id = nsid_record.get('photo_id') + im_obj = { + 'nsid': nsid, + 'url': nsid_record.get('url'), + 'photo_id': photo_id, + 'dataset_key': dataset_key, + 'path_alias': path_alias, + 'name': embassy_name, + 'page_url': page_url, + 'username': embassy_meta.username, + 'filepath': f'{photo_id}.jpg' + } + + embassy_images.append(im_obj) + if nsid == '51226353@N03': + malta_images.append(im_obj) + + # Save embassy matches + df = pd.DataFrame.from_dict(match_items) + df.to_csv(opt_fp_out, index=False) + total = df['count'].sum() + + # Save image matches + df = pd.DataFrame.from_dict(embassy_images) + fp_out = opt_fp_out.replace('.csv', '_images.csv') + df.to_csv(fp_out, index=False) + total = len(embassy_images) + log.debug(f'wrote {fp_out}') + log.debug(f'Found {total:,} embassy images') + + # Save malta images + df = pd.DataFrame.from_dict(malta_images) + fp_out = opt_fp_out.replace('.csv', '_images_malta.csv') + df.to_csv(fp_out, index=False) + total = len(malta) + log.debug(f'wrote {fp_out}') + log.debug(f'Found {total:,} malta embassy images')
\ No newline at end of file diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py deleted file mode 100644 index d4457945..00000000 --- a/megapixels/commands/msc/cross_reference.py +++ /dev/null @@ -1,78 +0,0 @@ -from os.path import join - -import click - -from app.utils.logger_utils import Logger - -log = Logger.getLogger() - -# source file for Embassy NSIDs -fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv' - -# list of datasets to cross reference -dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there'] -fp_dataset_base = '/data_store/datasets/people/' -fp_datasets = {} -for dk in dataset_keys: - fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv') - - -# output file -fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv' - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies, - help='Input file for embassies') -@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out, - help='Output file') -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force): - """Cross reference""" - - import sys - from os.path import join - from glob import glob - from pathlib import Path - import time - - import pandas as pd - from tqdm import tqdm - - log = Logger.getLogger() - log.info('Cross reference embassy list') - - df_embassies = pd.read_csv(opt_fp_in) - df_embassies.fillna('', inplace=True) - embassy_nsids = list(df_embassies['nsid']) - - match_items = [] - for dataset_key, fp_dataset in fp_datasets.items(): - df_dataset = pd.read_csv(fp_dataset) - nsids = list(df_dataset['nsid']) - for nsid in nsids: - if nsid in embassy_nsids: - # add to matches, and count - count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0] - first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0] - last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0] - path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0] - log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}') - match_obj = { - 'count': count, - 'path_alias': path_alias, - 'name': f'{first_name} {last_name}', - 'dataset_key': dataset_key, - 'nsid': nsid - } - match_items.append(match_obj) - - df = pd.DataFrame.from_dict(match_items) - df.to_csv(opt_fp_out, index=False) - - total = df['count'].sum() - - log.debug(f'Found {total} embassy photos')
\ No newline at end of file diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py index d5d251db..045e3b69 100644 --- a/megapixels/commands/msc/summarize.py +++ b/megapixels/commands/msc/summarize.py @@ -29,7 +29,7 @@ def cli(ctx, opt_fp_in, opt_fp_out): log = Logger.getLogger() - dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs'] + dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'brainwash', 'msceleb', 'duke_mtmc', 'uccs'] df = pd.DataFrame() fp_out = opt_fp_out.replace('.csv', '_citations.csv') @@ -37,10 +37,11 @@ def cli(ctx, opt_fp_in, opt_fp_out): fp_csv = join(opt_fp_in, f'{dataset_name}.csv') _df = pd.read_csv(fp_csv) _df = _df[_df.lat != 0] + _df.drop('id', axis=1, inplace=True) print(dataset_name, len(_df)) df = df.append(_df, ignore_index=True) - df.to_csv(opt_fp_out, index=False) + df.to_csv(fp_out, index=False) # create country summary fp_out = opt_fp_out.replace('.csv', '_countries.csv') diff --git a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb index a35c3b24..8d3b4251 100644 --- a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb @@ -29,41 +29,145 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create CSV for API" + "## Cleanup filepaths CSV" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in_photo_ids = '/data_store_hdd/datasets/people/adience/research/adience_photo_ids.csv'\n", + "fp_in_flickr_api_dump = '/data_store_hdd/datasets/people/adience/research/adience_flickr_api_dump.csv'\n", + "fp_in_flickr_api_dump_photo_ids = '/data_store_hdd/datasets/people/adience/research/flickr_api_dump_photo_id.csv'\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/adience/research/adience_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 44, "metadata": {}, + "outputs": [], "source": [ - "| filepath | query | count |\n", - "|:---|:---|:---|\n", - "| 12234 | 12234@123| 10 |" + "# photo id list\n", + "df = pd.read_csv(fp_in_photo_ids)\n", + "records = df.to_dict('records')\n", + "\n", + "# photo id --> url list\n", + "df_api_urls = pd.read_csv(fp_in_flickr_api_dump_photo_ids)\n", + "api_urls = df_api_urls.to_dict('records')\n", + "\n", + "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api_dump)\n", + "flickr_api_dump = df_flickr_api_dump.to_dict('records')\n", + "\n", + "# create lookup table for user info?\n", + "flickr_api_lookup = {}\n", + "for api_item in flickr_api_dump:\n", + " nsid = api_item['nsid']\n", + " flickr_api_lookup[nsid] = api_item\n", + " \n", + "# create lookup table for user info?\n", + "api_url_lookup = {}\n", + "for api_url_item in api_urls:\n", + " photo_id = api_url_item['photo_id']\n", + " api_url_lookup[photo_id] = api_url_item\n", + " \n", + "results = []\n", + "for record in records:\n", + " photo_id = record['photo_id']\n", + " if photo_id in api_url_lookup.keys():\n", + " api_item = api_url_lookup.get(photo_id)\n", + " url = api_item.get('url')\n", + " nsid = api_item.get('nsid')\n", + " obj = {\n", + " 'filepath': f'{photo_id}.jpg',\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url\n", + " }\n", + " results.append(obj)\n", + " \n", + "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create the file meta csv\n", + "results = []\n", + "results_download = []\n", + "for flickr_meta_record in flickr_meta_records:\n", + " # farm, server, photo id, secret\n", + " photo_id = str(flickr_meta_record['photo_id'])\n", + " nsid = flickr_meta_record.get('nsid')\n", + " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n", + " json_data = file_utils.load_json(fp_json)\n", + " photo_meta = json_data.get('photo')\n", + " farm = photo_meta.get('farm')\n", + " server = photo_meta.get('server')\n", + " secret = photo_meta.get('secret')\n", + " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n", + " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n", + " obj = {\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url,\n", + " 'filepath': f'{photo_id}.jpg'\n", + " }\n", + " results.append(obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "fp_in_dir = '/data_store/datasets/people/adience/dataset/'\n", - "fp_out_queries = '/data_store/datasets/people/adience/research/adience_flickr_api_queries.csv'" + "## Create Photo ID list" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9\n" - ] - } - ], + "outputs": [], "source": [ "fp_files = glob(join(fp_in_dir, '*.txt'))\n", "print(len(fp_files))" @@ -71,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -123,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -132,24 +236,16 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10804\n" - ] - } - ], + "outputs": [], "source": [ "print(len(df_images))" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb new file mode 100644 index 00000000..3d571aff --- /dev/null +++ b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download FFHQ Images\n", + "\n", + "- https://github.com/NVlabs/ffhq-dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import requests\n", + "import json\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# load JSON\n", + "fp_in = '/data_store/datasets/people/ffhq/ffhq-dataset-v1.json'\n", + "fp_out = '/data_store/datasets/people/ffhq/research/flickr_api_urls.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(fp_in, 'r') as fp:\n", + " ffhq_items = json.load(fp)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "217c694742e8408d871c3b41183676fb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "results = []\n", + "# get photos urls\n", + "for idx, ffhq_item in tqdm(ffhq_items.items()):\n", + " url = ffhq_item.get('metadata').get('photo_url')\n", + " photo_id = Path(url).stem\n", + " obj = {'photo_id': photo_id}\n", + " results.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'photo_id': '1133484654'}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame.from_dict(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(fp_out, index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb index 311d3462..140b6361 100644 --- a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb @@ -40,6 +40,110 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create filepaths CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "fp_flickr_meta = '/data_store_hdd/datasets/people/helen/research/helen_flickr_api_dump.csv'\n", + "fp_photo_ids = '/data_store_hdd/datasets/people/helen/research/helen_flickr_photo_ids.csv'\n", + "fp_filepaths = '/data_store_hdd/datasets/people/helen/research/helen_file_meta.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "df_photo_ids = pd.read_csv(fp_photo_ids)\n", + "photo_ids = df_photo_ids.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_flickr_meta, dtype={'photo_id': str})\n", + "flickr_meta_records = df_flickr_meta.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1854\n", + "2122\n" + ] + } + ], + "source": [ + "print(len(flickr_meta_records))\n", + "print(len(df_photo_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "# create the file meta csv\n", + "results = []\n", + "results_download = []\n", + "for flickr_meta_record in flickr_meta_records:\n", + " # farm, server, photo id, secret\n", + " photo_id = str(flickr_meta_record['photo_id'])\n", + " nsid = flickr_meta_record.get('nsid')\n", + " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n", + " json_data = file_utils.load_json(fp_json)\n", + " photo_meta = json_data.get('photo')\n", + " farm = photo_meta.get('farm')\n", + " server = photo_meta.get('server')\n", + " secret = photo_meta.get('secret')\n", + " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n", + " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n", + " obj = {\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url,\n", + " 'filepath': f'{photo_id}.jpg'\n", + " }\n", + " results.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = pd.DataFrame.from_dict(results)\n", + "df_out.to_csv(fp_filepaths, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { "cell_type": "code", "execution_count": 16, "metadata": {}, diff --git a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb index ff41e799..6d2b768a 100644 --- a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb @@ -29,70 +29,353 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create CSV for API" + "## IBM DiF clean CSVs\n", + "\n", + "- 2283 files could not be downloaded or accessed in the API\n", + "- these images were downloaded, but possibly no longer exist" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 60, "metadata": {}, + "outputs": [], + "source": [ + "# flickr api data\n", + "fp_in_meta_flickr = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_flickr.csv'\n", + "\n", + "# api query dump\n", + "fp_in_flickr_api = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n", + "\n", + "# ibm count data\n", + "fp_in_meta_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths.csv'\n", + "fp_meta_filepaths_adj = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths_adj.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], "source": [ - "| photo_id |\n", - "|:---|\n", - "| 12234 |" + "df_meta_filepaths = pd.read_csv(fp_in_meta_filepaths)\n", + "meta_filepaths = df_meta_filepaths.to_dict('records')\n", + "df_meta_flickr = pd.read_csv(fp_in_meta_flickr)\n", + "meta_flickr = df_meta_flickr.to_dict('records')\n", + "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98155\n", + "98155\n", + "98153\n", + "100438\n", + "98154\n" + ] + } + ], + "source": [ + "print(len(df_flickr_api_dump))\n", + "print(len(df_flickr_api_dump.drop_duplicates(subset='nsid')))\n", + "print(len(df_meta_flickr))\n", + "print(len(df_meta_filepaths))\n", + "print(len(df_meta_filepaths.drop_duplicates(subset='nsid')))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ - "# flickr api data\n", - "fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n", - "# ibm count data\n", - "fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n", - "# output\n", - "fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'" + "# drop epmty NSIDs\n", + "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ - "# load ibm data and create count lookup with photoid\n", - "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n", - "ibm_meta_records = df_ibm_meta.to_dict('records')\n", - "count_lookup = {}\n", - "for ibm_meta_record in ibm_meta_records:\n", - " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n", - " count_lookup[photo_id] = ibm_meta_record['count']" + "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "nsid_filepaths = {}\n", + "dupes = []\n", + "for meta_filepath in meta_filepaths:\n", + " nsid = meta_filepath['nsid']\n", + " if nsid not in nsid_filepaths.keys():\n", + " nsid_filepaths[nsid] = meta_filepath\n", + " else:\n", + " dupes.append(meta_filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98154\n", + "2284\n" + ] + } + ], + "source": [ + "print(len(nsid_filepaths))\n", + "print(len(dupes))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n", + "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n", + "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n", + "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n", + "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n", + "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n", + "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n", + "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n", + "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n", + "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n" + ] + } + ], + "source": [ + "for dupe in dupes[:10]:\n", + " print(dupe)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100438\n" + ] + } + ], + "source": [ + "print(len(dupes))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98153\n" + ] + } + ], + "source": [ + "print(len(nsid_groups))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100436\n" + ] + } + ], + "source": [ + "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n", + "print(len(fp_ims))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9314013316\n" + ] + } + ], + "source": [ + "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n", + "print(photo_ids[0])" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'photo_id'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m: 'photo_id'" + ] + } + ], + "source": [ + "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7a9a78bf0e442a5b8445906bc85da99", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "100438" + "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))" ] }, - "execution_count": 69, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# find which photo IDs are no longer accessible\n", + "missing_photo_ids = []\n", + "for photo_id in tqdm(photo_ids):\n", + " photo_id = int(photo_id)\n", + " if photo_id not in filepath_photo_ids:\n", + " missing_photo_ids.append(photo_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "[]\n" + ] } ], "source": [ + "print(len(missing_photo_ids))\n", + "print(missing_photo_ids[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df_flickr_meta' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined" + ] + } + ], + "source": [ + "total = df_flickr_meta['count'].sum()\n", + "print(total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load ibm data and create count lookup with photoid\n", + "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n", + "ibm_meta_records = df_ibm_meta.to_dict('records')\n", + "count_lookup = {}\n", + "for ibm_meta_record in ibm_meta_records:\n", + " photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n", + " count_lookup[photo_id] = ibm_meta_record['count']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "len(count_lookup)" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -101,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -111,18 +394,9 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error: invalid literal for int() with base 10: '', {'country': '', 'description': 'Haircut Next...', 'lat': '', 'lon': '', 'nsid': '', 'owner_location': '', 'path_alias': '', 'photo_id': '', 'place': '', 'place_id': '', 'posted': '', 'realname': '', 'taken': '', 'username': '', 'woeid': ''}\n", - "Error: invalid literal for int() with base 10: '', {'country': '', 'description': '', 'lat': '86085317@N00', 'lon': 'New York', 'nsid': 'anonymousthomas', 'owner_location': '4975598', 'path_alias': '', 'photo_id': '', 'place': '1108685469', 'place_id': 'Thomas', 'posted': '2005-02-18 00:11:09', 'realname': 'anonymousthomas', 'taken': '', 'username': '', 'woeid': ''}\n" - ] - } - ], + "outputs": [], "source": [ "# load flickr data\n", "for flickr_meta_record in flickr_meta_records:\n", @@ -143,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -160,55 +434,99 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# photo ids and nsids\n", "fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n", - "fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'" + "\n", + "# file urls\n", + "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n", + "\n", + "# flickr meta\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" - ] - } - ], + "outputs": [], "source": [ - "df = pd.read_csv(fp_flickr_api_dump)\n", - "groups = df.groupby('nsid')" + "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n", + "df_flickr_meta.fillna('', inplace=True)\n", + "flickr_metas = df_flickr_meta.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "|filepath|nsid|photo_id|url|\n", + "```" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "results = []\n", - "for nsid, group in groups:\n", - " obj = {\n", - " 'nsid': nsid,\n", - " 'count': len(group)\n", - " }\n", - " results.append(obj)" + "photo_id_to_nsid = {}\n", + "for flickr_meta in flickr_metas:\n", + " photo_id = flickr_meta.get('photo_id')\n", + " if photo_id:\n", + " photo_id = str(int(photo_id))\n", + " photo_id_to_nsid[photo_id] = flickr_meta['nsid']" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(list(photo_id_to_nsid.keys())[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n", + "ibm_urls = df_ibm_urls.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "photo_id_to_url = {}\n", + "missed = []\n", + "for ibm_url in ibm_urls:\n", + " photo_id = str(ibm_url['filepath'].split('_')[0])\n", + " try:\n", + " ibm_url['photo_id'] = photo_id\n", + " ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n", + " except Exception as e:\n", + "# print(e, photo_id)\n", + " missed.append(photo_id)\n", + "print(f'missed: {len(missed)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "pd.DataFrame.from_dict(results).to_csv(fp_out_meta, index=False)" + "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)" ] }, { diff --git a/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb new file mode 100644 index 00000000..b4a29243 --- /dev/null +++ b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Count IJB sources" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import requests\n", + "import json\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# load JSON\n", + "fp_in_cs3 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'\n", + "fp_in_cs4 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'\n", + "fp_in_ijb_b = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-B/ijbb_licenses_and_sources.csv'\n", + "fp_in_ijb_a = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-A/SOURCES.csv'\n", + "fp_out = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/summary.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df_cs3 = pd.read_csv(fp_in_cs3)\n", + "df_cs4 = pd.read_csv(fp_in_cs4)\n", + "df_sources = df_cs3.append(df_cs4)\n", + "df_sources.fillna('', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# ijb b\n", + "#df_sources = pd.read_csv(fp_in_ijb_b).fillna('')\n", + "# ijb a\n", + "df_sources = pd.read_csv(fp_in_ijb_a).fillna('')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "sources = df_sources.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}\n", + "others = []\n", + "keys = ['flickr.com', 'youtube.com', 'wikipedia.org', 'wikimedia.org']\n", + "for k in keys:\n", + " results[k] = []\n", + "for source in sources:\n", + " url = str(source['Media URL'])\n", + " media_id = source['Media ID']\n", + " if 'nonfaces' in media_id:\n", + " continue\n", + " found = False\n", + " for k in keys:\n", + " if k in url:\n", + " results[k].append(url)\n", + " found = True\n", + " if not found:\n", + " if url:\n", + " others.append(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "flickr.com 0\n", + "youtube.com 1388\n", + "wikipedia.org 0\n", + "wikimedia.org 4298\n" + ] + } + ], + "source": [ + "for k,v in results.items():\n", + " print(k, len(set(v)))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "siliconangle.com/files/2011/06/kaz-hirai.jpg\n", + "etnosi.files.wordpress.com/2012/05/sofi-marinova-baku.jpg\n", + "images.coveralia.com/audio/p/Pia_Zadora-When_The_Lights_Go_Out-Interior_Frontal.jpg\n", + "4.bp.blogspot.com/-TFHOJVIW3a8/T_1mD6MdOxI/AAAAAAAADAg/PhKDPx0Aqu0/s1600/ivan_pavlov.jpg\n", + "863793661388437597-a-1802744773732722657-s-sites.googlegroups.com/site/virginmarysite/Home/jackneosex.jpg\n", + "amckiereads.files.wordpress.com/2010/12/darwish.jpg?w=600\n", + "img.interia.pl/komputery/nimg/5/7/Kazuo_Hirai_plan_odbudowe_5726348.jpg\n", + "2.bp.blogspot.com/-JAYvKsHcQPI/T4f3wbCIMDI/AAAAAAAAFDM/lTs3uKlb3A0/s1600/deeksha_seth_launches_chandana_brothers_showroom_Yellow+Saree+smiling+pics+%25285%2529.jpg\n", + "1.bp.blogspot.com/-D3SI27GS7-g/U-iD5fPcFDI/AAAAAAAABOs/VaB_BRRa6OU/s320/news8.jpg\n", + "1.bp.blogspot.com/_ilOjS7A_kk4/SVGCtcyAAmI/AAAAAAAAAH4/9-KKBqYeDBA/s400/playstation-3-grill_12.jpg\n" + ] + } + ], + "source": [ + "for other in others[:10]:\n", + " print(other)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21319" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(results['flickr.com']) +len(results['wikimedia.org']) + len(others)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb index 48133228..3c0dd631 100644 --- a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Prepare Flickr API Batch CSV" + "# MegaFace: Prepare Flickr API Batch CSV" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,6 +29,115 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Create the file meta CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file.csv'\n", + "fp_out_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file_ext.csv'\n", + "fp_out_meta_flickr = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_flickr_02.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_files = pd.read_csv(fp_in_meta_files)\n", + "df_files.rename(columns={'subdir': 'filepath'}, inplace=True)\n", + "file_records = df_files.to_dict('records')\n", + "photo_ids = [x['photo_id'] for x in file_records]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d91329c27b8b4fc4ae68eb817ea82e19", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=4753520), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "for file_record in tqdm(file_records):\n", + " photo_id = Path(file_record['url']).stem.split('_')[0]\n", + " filepath = f'{photo_id}.jpg'\n", + " file_record['filepath'] = filepath\n", + "\n", + "df_meta_file = pd.DataFrame.from_dict(file_records)\n", + "df_meta_file.drop_duplicates(inplace=True)\n", + "df_meta_file.to_csv(fp_out_meta_files, index=False)\n", + "print(f'Wrote {len(df_meta_file)} lines')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the NSID/count CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total users: 48,382\n", + "Total images: 3,311,471\n" + ] + } + ], + "source": [ + "nsid_groups = df_meta_file.groupby('nsid')\n", + "results = []\n", + "for nsid, group in nsid_groups:\n", + " results.append({'nsid': nsid, 'count': len(group)})\n", + "df_meta_flickr = pd.DataFrame.from_dict(results)\n", + "df_meta_flickr.to_csv(fp_out_meta_flickr, index=False)\n", + "\n", + "print(f'Total users: {len(results):,}')\n", + "print(f'Total images: {df_meta_flickr[\"count\"].sum():,}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "## Create CSV for API" ] }, diff --git a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb index 8746a740..57c32bec 100644 --- a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb +++ b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb @@ -38,12 +38,50 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "fp_in = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_ext.csv'\n", - "fp_out = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_test.csv'" + "fp_in_api_photo_id = '/data_store_hdd/datasets/people/pipa/research/flickr_api_photo_id.csv'\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/pipa/research/pipa_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(fp_in_api_photo_id)\n", + "records = df.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "for record in records:\n", + " obj = {\n", + " 'photo_id': record.get('photo_id'),\n", + " 'nsid': record.get('nsid'),\n", + " 'url': record.get('url'),\n", + " 'secret': record.get('secret'),\n", + " 'filepath': f'{photo_id}_{secret}.jpg'\n", + " }\n", + " results.append(obj)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)" ] }, { diff --git a/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb new file mode 100644 index 00000000..99bbe32e --- /dev/null +++ b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# VGG Face (V1) Prepare Flickr API" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob, iglob\n", + "from pathlib import Path\n", + "from tqdm import tqdm_notebook as tqdm\n", + "\n", + "import pandas as pd\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert annotation files to list of photo IDs" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "fp_dir_annos = '/data_store/datasets/people/vgg_face/downloads/vgg_face_dataset/files/'\n", + "fp_photo_ids = '/data_store/datasets/people/vgg_face/research/photo_ids.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b92b24eac4c84f2f96e32f6eba8d2dc0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=2622), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "photo_ids = []\n", + "all_photos = []\n", + "fp_annos = glob(join(fp_dir_annos, '*.txt'))\n", + "for fp_anno in tqdm(fp_annos):\n", + " df_annos = pd.read_csv(fp_anno, delimiter=' ', names=['url', 'a', 'b', 'c', 'd', 'e', 'f', 'g'])\n", + " records = df_annos.to_dict('records')\n", + " for record in records:\n", + " url = record['url']\n", + " all_photos.append(url)\n", + " if 'flickr.com' in url:\n", + " photo_id = Path(url).stem.split('_')[0]\n", + " photo_ids.append({'photo_id': photo_id})" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2604849\n" + ] + } + ], + "source": [ + "print(len(all_photos))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/data_store/datasets/people/vgg_face/research/photo_ids.csv')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_utils.ensure_posixpath(fp_photo_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame.from_dict(photo_ids).to_csv(fp_photo_ids, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "## Convert Flickr API data to filepaths and counts" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in_flickr_api = '/data_store_hdd/datasets/people/vgg_face/research/vgg_flickr_api_photo_ids.csv'\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/vgg_face/research/vgg_filepaths.csv'\n", + "fp_out_counts = '/data_store_hdd/datasets/people/vgg_face/research/vgg_counts.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(fp_in_flickr_api)\n", + "records = df.to_dict('records')\n", + "\n", + "# write filepaths\n", + "results = []\n", + "for record in records:\n", + " photo_id = record['photo_id']\n", + " obj = {\n", + " 'filepath': f'{photo_id}.jpg',\n", + " 'nsid': record['nsid'],\n", + " 'photo_id': photo_id,\n", + " 'url': record['url']\n", + " }\n", + " results.append(obj)\n", + "\n", + "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)\n", + "\n", + "# write counts\n", + "results = []\n", + "nsid_groups = df.groupby('nsid')\n", + "for nsid, group in nsid_groups:\n", + " results.append({'nsid': nsid, 'count': len(group)})\n", + "\n", + "pd.DataFrame.from_dict(results).to_csv(fp_out_counts, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "fp = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n", + "df = pd.read_csv(fp)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "df_match = df[df['nsid'] == '50747072@N03']" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " bureau country nsid path_alias type \\\n", + "0 EUR Russia 50747072@N03 otkroyameriku Consulate \n", + "\n", + " url username \\\n", + "0 http://www.flickr.com/photos/otkroyameriku Генконсульство США в СПб \n", + "\n", + " verified notes \n", + "0 NaN NaN 1\n" + ] + } + ], + "source": [ + "print(df_match, len(df_match))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'50747072@N03'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "match.nsid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb index c2ec5c84..66f803a4 100644 --- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb @@ -37,6 +37,318 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create filepaths CSV for individual lookup" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n", + "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_flickr_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function drop in module pandas.core.frame:\n", + "\n", + "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n", + " Drop specified labels from rows or columns.\n", + " \n", + " Remove rows or columns by specifying label names and corresponding\n", + " axis, or by specifying directly index or column names. When using a\n", + " multi-index, labels on different levels can be removed by specifying\n", + " the level.\n", + " \n", + " Parameters\n", + " ----------\n", + " labels : single label or list-like\n", + " Index or column labels to drop.\n", + " axis : {0 or 'index', 1 or 'columns'}, default 0\n", + " Whether to drop labels from the index (0 or 'index') or\n", + " columns (1 or 'columns').\n", + " index, columns : single label or list-like\n", + " Alternative to specifying axis (``labels, axis=1``\n", + " is equivalent to ``columns=labels``).\n", + " \n", + " .. versionadded:: 0.21.0\n", + " level : int or level name, optional\n", + " For MultiIndex, level from which the labels will be removed.\n", + " inplace : bool, default False\n", + " If True, do operation inplace and return None.\n", + " errors : {'ignore', 'raise'}, default 'raise'\n", + " If 'ignore', suppress error and only existing labels are\n", + " dropped.\n", + " \n", + " Returns\n", + " -------\n", + " dropped : pandas.DataFrame\n", + " \n", + " Raises\n", + " ------\n", + " KeyError\n", + " If none of the labels are found in the selected axis\n", + " \n", + " See Also\n", + " --------\n", + " DataFrame.loc : Label-location based indexer for selection by label.\n", + " DataFrame.dropna : Return DataFrame with labels on given axis omitted\n", + " where (all or any) data are missing.\n", + " DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n", + " removed, optionally only considering certain columns.\n", + " Series.drop : Return Series with specified index labels removed.\n", + " \n", + " Examples\n", + " --------\n", + " >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n", + " ... columns=['A', 'B', 'C', 'D'])\n", + " >>> df\n", + " A B C D\n", + " 0 0 1 2 3\n", + " 1 4 5 6 7\n", + " 2 8 9 10 11\n", + " \n", + " Drop columns\n", + " \n", + " >>> df.drop(['B', 'C'], axis=1)\n", + " A D\n", + " 0 0 3\n", + " 1 4 7\n", + " 2 8 11\n", + " \n", + " >>> df.drop(columns=['B', 'C'])\n", + " A D\n", + " 0 0 3\n", + " 1 4 7\n", + " 2 8 11\n", + " \n", + " Drop a row by index\n", + " \n", + " >>> df.drop([0, 1])\n", + " A B C D\n", + " 2 8 9 10 11\n", + " \n", + " Drop columns and/or rows of MultiIndex DataFrame\n", + " \n", + " >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n", + " ... ['speed', 'weight', 'length']],\n", + " ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n", + " ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n", + " >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n", + " ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n", + " ... [250, 150], [1.5, 0.8], [320, 250],\n", + " ... [1, 0.8], [0.3,0.2]])\n", + " >>> df\n", + " big small\n", + " lama speed 45.0 30.0\n", + " weight 200.0 100.0\n", + " length 1.5 1.0\n", + " cow speed 30.0 20.0\n", + " weight 250.0 150.0\n", + " length 1.5 0.8\n", + " falcon speed 320.0 250.0\n", + " weight 1.0 0.8\n", + " length 0.3 0.2\n", + " \n", + " >>> df.drop(index='cow', columns='small')\n", + " big\n", + " lama speed 45.0\n", + " weight 200.0\n", + " length 1.5\n", + " falcon speed 320.0\n", + " weight 1.0\n", + " length 0.3\n", + " \n", + " >>> df.drop(index='length', level=1)\n", + " big small\n", + " lama speed 45.0 30.0\n", + " weight 200.0 100.0\n", + " cow speed 30.0 20.0\n", + " weight 250.0 150.0\n", + " falcon speed 320.0 250.0\n", + " weight 1.0 0.8\n", + "\n" + ] + } + ], + "source": [ + "help(pd.DataFrame.drop)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['nsid', 'photo_id', 'url'], dtype='object')\n" + ] + } + ], + "source": [ + "df_flickr_meta.drop(labels=['subdir'],axis=1, inplace=True)\n", + "print(df_flickr_meta.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "#df_flickr_meta['subdir'] = ''\n", + "df_flickr_meta['filepath'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame.from_dict(df_flickr_meta).to_csv(fp_filepaths, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>nsid</th>\n", + " <th>photo_id</th>\n", + " <th>url</th>\n", + " <th>filepath</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>51576145@N02</td>\n", + " <td>4762068863</td>\n", + " <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29689383@N02</td>\n", + " <td>5711730606</td>\n", + " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29689383@N02</td>\n", + " <td>5711730606</td>\n", + " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>27982139@N00</td>\n", + " <td>2439203939</td>\n", + " <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>27982139@N00</td>\n", + " <td>2464402099</td>\n", + " <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n", + " <td></td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " nsid photo_id \\\n", + "0 51576145@N02 4762068863 \n", + "1 29689383@N02 5711730606 \n", + "2 29689383@N02 5711730606 \n", + "3 27982139@N00 2439203939 \n", + "4 27982139@N00 2464402099 \n", + "\n", + " url filepath \n", + "0 http://farm5.staticflickr.com/4117/4762068863_... \n", + "1 http://farm3.staticflickr.com/2800/5711730606_... \n", + "2 http://farm3.staticflickr.com/2800/5711730606_... \n", + "3 http://farm3.staticflickr.com/2105/2439203939_... \n", + "4 http://farm4.staticflickr.com/3030/2464402099_... " + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_flickr_meta.head()" + ] + }, + { "cell_type": "code", "execution_count": 31, "metadata": {}, |
