From 53f6e26015e65b8696ed1a6e5c74bdfef14b3ac2 Mon Sep 17 00:00:00 2001
From: adamhrv
Date: Tue, 19 Mar 2019 12:20:38 +0100
Subject: add cmds

---
 megapixels/commands/datasets/decrypt_ibm.py      | 100 ++++++++++
 megapixels/commands/datasets/download_images.py  |  14 +-
 megapixels/commands/datasets/flickr_api.py       | 202 +++++++++++++++++++++
 .../commands/datasets/megaface_flickr_api.py     | 141 --------------
 megapixels/commands/datasets/pull_spreadsheet.py |   6 +-
 megapixels/commands/datasets/whogoesthere.py     |  72 ++++++++
 6 files changed, 388 insertions(+), 147 deletions(-)
 create mode 100644 megapixels/commands/datasets/decrypt_ibm.py
 create mode 100644 megapixels/commands/datasets/flickr_api.py
 delete mode 100644 megapixels/commands/datasets/megaface_flickr_api.py
 create mode 100644 megapixels/commands/datasets/whogoesthere.py

diff --git a/megapixels/commands/datasets/decrypt_ibm.py b/megapixels/commands/datasets/decrypt_ibm.py
new file mode 100644
index 00000000..d25c879a
--- /dev/null
+++ b/megapixels/commands/datasets/decrypt_ibm.py
@@ -0,0 +1,100 @@
+import click
+import subprocess
+
+fp_in = '/data_store_hdd/datasets/people/ibm_dif/research/username_sha.csv'
+fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'
+fp_dir_json = '/data_store_hdd/datasets/people/ibm_dif/research/valid_files/'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
+  help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
+  help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+  help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice):
+  """Threaded Flickr metadata decryption.
+
+  The input CSV should be formatted as:
+
+  |sha256|username|
+  |---|---|
+  |123456789|mruser|
+  """
+
+  from os.path import join
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import json
+
+  import pandas as pd
+  from tqdm import tqdm
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+
+  # setup multithreading function
+  def pool_process(item):
+    # threaded function
+    try:
+      # decrypt via the node CLI
+      cmd = ['/home/adam/.nvm/versions/node/v9.9.0/bin/node',
+             '/data_store/datasets/people/ibm_dif/web_files/decrypt_cli',
+             item['username'], item['sha256']]
+      proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+      o, e = proc.communicate()
+      data = json.loads(o.decode())
+      item['count'] = int(data['count'])
+      item['url'] = data['url']
+      # log.info(f"{data['count']}, {data['url']}")
+      item['status'] = True
+    except Exception as e:
+      log.debug(f'Error: {e}')
+      item['status'] = False
+    pbar.update(1)
+    return item
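+
+  # NOTE (assumption): decrypt_cli is expected to print a single JSON object
+  # to stdout, e.g. {"count": 42, "url": "https://www.flickr.com/photos/..."};
+  # only the "count" and "url" fields are consumed here.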
+
+  # setup multithreading data holders
+  log.debug(f'loading {opt_fp_in}')
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+  if opt_slice:
+    records = records[opt_slice[0]:opt_slice[1]]
+  log.debug(f'Processing {len(records):,}')
+
+  pool_items = []
+  for record in records:
+    fp_json = join(fp_dir_json, f"{record['sha256']}.json")
+    if Path(fp_json).is_file():
+      pool_items.append(record)
+
+  # too many records for RAM
+  del records
+  num_items = len(pool_items)
+  log.info(f'processing {num_items:,} items')
+
+  # run the multithreading with progress bar; workers call pbar.update(1)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+  df_results = pd.DataFrame.from_dict(pool_results)
+  df_results = df_results[df_results.status == True]
+  df_results.drop(['status'], axis=1, inplace=True)
+  df_results.to_csv(opt_fp_out, index=False)
+  log.debug(f'Saved file to: {opt_fp_out}')
+  total = sum([int(x['count']) for x in pool_results if x['status']])
+  log.debug(f'Total: {total:,}')
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index f1519c61..c64afbba 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -8,8 +8,10 @@ import click
   help='Output')
 @click.option('-t', '--threads', 'opt_threads', default=8,
   help='Number of threads')
+@click.option('--wayback', 'opt_wayback', is_flag=True,
+  help='Check Wayback archive for URL and download cached image')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
   """Threaded image downloader"""
 
   """
@@ -38,6 +40,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
   # setup multithreading function
   def pool_process(item):
     # threaded function
+    url_wayback_base = 'https://archive.org/wayback/available?url='
     fp_out = item['filepath']
     try:
       # download image
@@ -45,7 +48,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
       urllib.request.urlretrieve(item['url'], fp_out)
       item['status'] = True
     except Exception as e:
-      log.debug(f'Error: {e}')
-      fp_error = f'{fp_out}_error.txt'
-      with open(fp_error, 'w') as fp:
-        fp.write('')
+      log.debug(f'Error: {e}, url: {item["url"]}')
+      item['status'] = False
+      if item['opt_wayback'] and 'HTTP Error' in str(e):
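+        # The availability endpoint returns JSON shaped like (per archive.org docs):
+        #   {"archived_snapshots": {"closest": {"available": true, "status": "200",
+        #    "url": "http://web.archive.org/web/20130919044612/http://example.com/",
+        #    "timestamp": "20130919044612"}}}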
+        # sketch: retry the download from the closest cached snapshot
+        try:
+          import json  # the module's top-level imports are outside this hunk
+          with urllib.request.urlopen(url_wayback_base + item['url']) as res:
+            closest = json.loads(res.read().decode()).get('archived_snapshots', {}).get('closest', {})
+          if closest.get('available'):
+            urllib.request.urlretrieve(closest['url'], fp_out)
+            item['status'] = True
+        except Exception as we:
+          log.debug(f'Wayback error: {we}')
+      if not item['status']:
+        fp_error = f'{fp_out}_error.txt'
+        with open(fp_error, 'w') as fp:
+          fp.write('')
@@ -64,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
     if not fp_dst_is_file and not fp_dst_is_err:
-      pool_items.append({'url':x['url'], 'filepath': fp_dst})
+      pool_items.append({'url': x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
   log.info(f'processing {num_items:,} items')
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
new file mode 100644
index 00000000..780ede49
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -0,0 +1,202 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+query_types = ['photo_id', 'album_id', 'flickr_id']
+# flickr_id: 123456789@N01
+# photo_id: 6796778203 (the numeric id in a photo page URL)
+
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output directory')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('--query-type', 'opt_query_type', default='photo_id',
+  type=click.Choice(query_types),
+  help='API query type')
+@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
+@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
+@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
+  help='Delay between API calls to prevent rate-limiting')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
+    opt_delay, opt_query_type):
+  """Fetches photo info from the Flickr API. Saves to JSON"""
+
+  from tqdm import tqdm
+  from glob import glob
+  import time
+  import json
+  import os, sys
+  from random import randint
+  import urllib.request
+  import flickr_api  # pip install flickr_api
+  from flickr_api.flickrerrors import FlickrAPIError
+
+
+  # -------------------------------------------------
+  # process
+
+  if not opt_api_key or not opt_api_secret:
+    log.error('source .env vars for Flickr API and try again')
+    return
+
+  # check how many flickr keys are available in the environment
+  api_keys = []
+  api_secrets = []
+  for i in range(1, 10):
+    try:
+      var_name_key = f'FLICKR_API_KEY_{i}'
+      var_name_secret = f'FLICKR_API_SECRET_{i}'
+      if os.environ[var_name_key] and os.environ[var_name_secret]:
+        api_keys.append(os.environ[var_name_key])
+        api_secrets.append(os.environ[var_name_secret])
+    except KeyError:
+      pass
+
+  log.info(f'Shuffling between: {len(api_keys)} api keys')
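+
+  # assumes numbered credentials were sourced from a .env file, e.g.:
+  #   export FLICKR_API_KEY_1=...  export FLICKR_API_SECRET_1=...
+  #   export FLICKR_API_KEY_2=...  export FLICKR_API_SECRET_2=...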
+
+  # read in CSV
+  # | query, filepath |
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+  if opt_slice:
+    records = records[opt_slice[0]:opt_slice[1]]
+
+  log.info('Processing: {:,} items'.format(len(records)))
+
+  for record in tqdm(records):
+    fp_out = join(opt_fp_out, record['filepath'])
+    fp_out_err = fp_out + '_error.txt'
+    if Path(fp_out).is_file() or Path(fp_out_err).is_file():
+      continue
+    try:
+      # shuffle the api keys to avoid rate limiting
+      rand_int = randint(0, len(api_keys) - 1)
+      api_key = api_keys[rand_int]
+      api_secret = api_secrets[rand_int]
+
+      #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
+      #photo = flickr_api.Photo(id=record['query'])
+      # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+      photo_id = record['query']
+      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
+      flickr_url += f'&api_key={api_key}'
+      flickr_url += f'&photo_id={photo_id}'
+      flickr_url += '&format=json'
+      flickr_url += '&nojsoncallback=1'
+
+      with urllib.request.urlopen(flickr_url) as url:
+        data = json.loads(url.read().decode())
+
+      if data['stat'] == 'fail':
+        raise Exception('failed')
+      elif data['stat'] == 'ok':
+        with open(fp_out, 'w') as fp:
+          json.dump(data, fp, sort_keys=True, indent=2)
+
+    #except FlickrAPIError as e:
+    except Exception as e:
+      # if "HTTP Server Error 500" in str(e):
+      log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
+      if 'not found' in str(e) or 'failed' in str(e):
+        with open(fp_out_err, 'w') as fp:
+          fp.write('')
+
+    if opt_delay:
+      time.sleep(opt_delay)
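+
+"""
+Example flickr.photos.getInfo response (truncated; illustrative shape only):
+{"photo": {"id": "6796778203",
+  "owner": {"nsid": "...", "username": "...", "realname": "..."},
+  "title": {"_content": "..."},
+  "dates": {"taken": "...", "posted": "..."},
+  "urls": {"url": [{"type": "photopage", "_content": "https://www.flickr.com/photos/.../6796778203/"}]}},
+ "stat": "ok"}
+"""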
+
+"""
+df_id_dict.update( {
+  'user_name': info.get('username', ''),
+  'location': info.get('location', ''),
+  'real_name': info.get('realname', ''),
+  'time_zone': info.get('timezone', {}).get('timezone_id', ''),
+  'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
+  'photos_count': info.get('photos_info', {}).get('count'),
+  'description': info.get('description', ''),
+  'id': info.get('id'),
+  'path_alias': info.get('path_alias', ''),
+  'is_pro': info.get('ispro', ''),
+  'url_photos': info.get('photosurl', ''),
+  'url_profile': info.get('profileurl', ''),
+  'url_mobile': info.get('mobileurl', ''),
+  })
+"""
+
+"""
+info = photo.getInfo()
+
+# serialize tags
+info['tag_names'] = []
+info['tag_ids'] = []
+tags = info['tags']
+for t in tags:
+  info['tag_names'].append(str(t.text))
+  info['tag_ids'].append(str(t.id))
+
+owner = info['owner']
+info['owner_id'] = str(owner.id)
+info['owner_username'] = str(owner.username)
+
+info.pop('tags')
+info.pop('owner')
+"""
+
+"""
+Example API data:
+{'id': '7124086@N07',
+ 'nsid': '7124086@N07',
+ 'ispro': 1,
+ 'can_buy_pro': 0,
+ 'iconserver': '2325',
+ 'iconfarm': 3,
+ 'path_alias': 'shirleylin',
+ 'has_stats': '1',
+ 'pro_badge': 'standard',
+ 'expire': '0',
+ 'username': 'ShirleyLin',
+ 'realname': 'Shirley Lin',
+ 'location': 'Fremont, California, US',
+ 'timezone': {'label': 'Pacific Time (US & Canada); Tijuana',
+  'offset': '-08:00',
+  'timezone_id': 'PST8PDT'},
+ 'description': '',
+ 'photosurl': 'https://www.flickr.com/photos/shirleylin/',
+ 'profileurl': 'https://www.flickr.com/people/shirleylin/',
+ 'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
+ 'photos_info': {'firstdatetaken': '2004-05-24 12:12:15',
+  'firstdate': '1172556588',
+  'count': 9665}}
+"""
+
+"""
+https://www.flickr.com/services/api/explore/flickr.photosets.getPhotos
+https://www.flickr.com/services/api/explore/flickr.photos.getInfo
+"""
\ No newline at end of file
diff --git a/megapixels/commands/datasets/megaface_flickr_api.py b/megapixels/commands/datasets/megaface_flickr_api.py
deleted file mode 100644
index 62232ab8..00000000
--- a/megapixels/commands/datasets/megaface_flickr_api.py
+++ /dev/null
@@ -1,141 +0,0 @@
-from glob import glob
-import os
-from os.path import join
-from pathlib import Path
-
-import click
-
-from app.settings import types
-from app.utils import click_utils
-from app.settings import app_cfg as cfg
-from app.utils import logger_utils
-
-import dlib
-import pandas as pd
-from PIL import Image, ImageOps, ImageFilter
-from app.utils import file_utils, im_utils
-
-
-log = logger_utils.Logger.getLogger()
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input directory')
-@click.option('-o', '--output', 'opt_fp_out',
-  help='Output directory')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-d', '--delay', 'opt_delay', default=None, type=int,
-  help='Delay between API calls to prevent rate-limiting')
-@click.option('--checkpoints', 'opt_checkpoints', is_flag=True,
-  help='Save checkpoints')
-@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY')
-@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET')
-@click.option('--checkpoint_interval', 'opt_ckpt_interval', default=10000,
-  help='Save checkpoint interval')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
-    opt_delay, opt_checkpoints, opt_ckpt_interval):
-  """Appends Flickr API info to CSV"""
-
-  from tqdm import tqdm
-  from glob import glob
-  import time
-  import flickr_api  # pip install flickr_api
-  from flickr_api.flickrerrors import FlickrAPIError
-
-  # -------------------------------------------------
-  # process
-
-  if not opt_api_key or not opt_api_secret:
-    log.error('source .env vars for Flickr API and try again')
-    return
-
-  # init Flickr API
-  flickr_api.set_keys(api_key=opt_api_key, api_secret=opt_api_secret)
-
-  # reqd in CSV
-  df_ids = pd.read_csv(opt_fp_in)
-  if opt_slice:
-    df_ids = df_ids[opt_slice[0]:opt_slice[1]]
-
-  log.info('Processing: {:,} items'.format(len(df_ids)))
-
-  # iterate MegaFace IDs
-  identities = []
-
-  tqdm.pandas()
-
-  for idx, df_id in tqdm(df_ids.iterrows(), total=len(df_ids)):
-    # a = flickr_api.Person(id='123456789@N01')
-    df_id_dict = dict(df_id)
-
-    # append relevant data
-    try:
-      person = flickr_api.Person(id=df_id['nsid'])
-      info = person.getInfo()
-      df_id_dict.update( {
-        'user_name': info.get('username', ''),
-        'location': info.get('location', ''),
-        'real_name': info.get('realname', ''),
-        'time_zone': info.get('timezone', {}).get('timezone_id', ''),
-        'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
-        'photos_count': info.get('photos_info', {}).get('count'),
-        'description': info.get('description', ''),
-        'id': info.get('id'),
-        'path_alias': info.get('path_alias', ''),
-        'is_pro': info.get('ispro', ''),
-        'url_photos': info.get('photosurl', ''),
-        'url_profile': info.get('photosurl', ''),
-        'url_mobile': info.get('mobileurl', ''),
-        })
-      identities.append(df_id_dict)
-
-    except FlickrAPIError as e:
-      log.error(e)
-
-
-    if opt_checkpoints:
-      if (idx + 1) % opt_ckpt_interval == 0:
-        df = pd.DataFrame.from_dict(identities)
-        fpp_out = Path(opt_fp_out)
-        opt_fp_out_ckpt = join(fpp_out.parent, '{}_ckpt_{}.csv'.format(fpp_out.stem, file_utils.zpad(idx + 1)))
-        log.info('Saving checkpoint {:,} to {}'.format(idx + 1, opt_fp_out_ckpt))
-        df.to_csv(opt_fp_out_ckpt, index=False)
-
-    if opt_delay:
-      time.sleep(opt_delay)
-
-
-  df = pd.DataFrame.from_dict(identities)
-  df.to_csv(opt_fp_out, index=False)
-
-  log.info('Wrote: {:,} lines to {}'.format(len(df), opt_fp_out))
-
-
-"""
-Example API data:
-{'id': '7124086@N07',
- 'nsid': '7124086@N07',
- 'ispro': 1,
- 'can_buy_pro': 0,
- 'iconserver': '2325',
- 'iconfarm': 3,
- 'path_alias': 'shirleylin',
- 'has_stats': '1',
- 'pro_badge': 'standard',
- 'expire': '0',
- 'username': 'ShirleyLin',
- 'realname': 'Shirley Lin',
- 'location': 'Fremont, California, US',
- 'timezone': {'label': 'Pacific Time (US & Canada); Tijuana',
-  'offset': '-08:00',
-  'timezone_id': 'PST8PDT'},
- 'description': '',
- 'photosurl': 'https://www.flickr.com/photos/shirleylin/',
- 'profileurl': 'https://www.flickr.com/people/shirleylin/',
- 'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
- 'photos_info': {'firstdatetaken': '2004-05-24 12:12:15',
-  'firstdate': '1172556588',
-  'count': 9665}}
-"""
\ No newline at end of file
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
index 0094ea59..b8b68094 100644
--- a/megapixels/commands/datasets/pull_spreadsheet.py
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -20,7 +20,7 @@ from app.utils.logger_utils import Logger
 
 log = Logger.getLogger()
 
-opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal']
 
 @click.command()
 @click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
@@ -65,14 +65,14 @@ def clean_datasets_sheet_ft(df):
   # clean data for FT
   df = df[df['ft_share'] == 'Y']
   keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
-  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
   return df[keys]
 
 def clean_datasets_sheet_nyt(df):
   # clean data for FT
   df = df[df['ft_share'] == 'Y']
   keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
-  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
   return df[keys]
 
 def fetch_spreadsheet():
diff --git a/megapixels/commands/datasets/whogoesthere.py b/megapixels/commands/datasets/whogoesthere.py
new file mode 100644
index 00000000..6cf9f009
--- /dev/null
+++ b/megapixels/commands/datasets/whogoesthere.py
@@ -0,0 +1,72 @@
+"""
+Unpack data for:
+
+Z. Bessinger, C. Stauffer, and N. Jacobs, “Who Goes There? Approaches to
+Mapping Facial Appearance Diversity,” in Proceedings of the 24th SIGSPATIAL
+International Conference on Advances in Geographic Information Systems, 2016.
+"""
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+keys_all = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city',
+  'content_length', 'country_code', 'date_taken', 'date_uploaded',
+  'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f',
+  'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original',
+  'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original',
+  'im_server_id', 'index', 'latitude', 'license_name', 'license_url', 'longitude',
+  'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']
+
+keys_keep = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city',
+  'content_length', 'country_code', 'date_taken', 'date_uploaded',
+  'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f',
+  'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original',
+  'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original',
+  'im_server_id', 'index', 'latitude', 'license_name', 'license_url', 'longitude',
+  'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']
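+
+# usage sketch (invocation and paths are hypothetical):
+#   megapixels datasets whogoesthere -i whogoesthere.h5 -o country_codes.txt \
+#     --value country_code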
+
+@click.command()
+@click.pass_context
+@click.option('-i', '--input', 'opt_fp_in', required=True)
+@click.option('-o', '--output', 'opt_fp_out', required=True)
+@click.option('--value', 'opt_value', required=True, type=click.Choice(keys_all))
+def cli(ctx, opt_fp_in, opt_fp_out, opt_value):
+  """Extract unique values of one key from the WhoGoesThere HDF5"""
+
+  from io import BytesIO
+  from base64 import b64decode
+
+  import h5py
+  from scipy import misc  # used by the commented-out face decode example below
+  from tqdm import tqdm
+
+  log.info('Uncompress HDF5')
+
+  key_vals = []
+
+  with h5py.File(opt_fp_in, 'r') as fp:
+    num_items = len(fp['face'])
+    log.info(f'items: {num_items:,}')
+
+    for idx in tqdm(range(num_items)):
+      # face_str = fp['face'][0]
+      # face_im = misc.imread(BytesIO(b64decode(face_str)))
+      # print(fp['face_landmarks_f/x'][0])
+      # age = fp['age'][idx].decode()
+      key_val = fp[opt_value][idx].decode()
+      key_vals.append(key_val)
+
+  key_vals = set(key_vals)
+  with open(opt_fp_out, 'w') as fp:
+    for key_val in sorted(key_vals):
+      fp.write(f'{key_val}\n')
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2