Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r--   megapixels/commands/datasets/decrypt_ibm.py          100
-rw-r--r--   megapixels/commands/datasets/download_ibmdif.py      121
-rw-r--r--   megapixels/commands/datasets/download_images.py       90
-rw-r--r--   megapixels/commands/datasets/flickr_api.py           202
-rw-r--r--   megapixels/commands/datasets/ijb_skin_color.py        32
-rw-r--r--   megapixels/commands/datasets/megaface_flickr_api.py  141
-rw-r--r--   megapixels/commands/datasets/pull_spreadsheet.py     124
-rw-r--r--   megapixels/commands/datasets/whogoesthere.py          72
8 files changed, 741 insertions, 141 deletions
diff --git a/megapixels/commands/datasets/decrypt_ibm.py b/megapixels/commands/datasets/decrypt_ibm.py
new file mode 100644
index 00000000..d25c879a
--- /dev/null
+++ b/megapixels/commands/datasets/decrypt_ibm.py
@@ -0,0 +1,100 @@
+import click
+import subprocess
+
+fp_in = '/data_store_hdd/datasets/people/ibm_dif/research/username_sha.csv'
+fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'
+fp_dir_json = '/data_store_hdd/datasets/people/ibm_dif/research/valid_files/'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
+  help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
+  help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+  help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_slice):
+  """Threaded Flickr metadata decryption"""
+  """
+  CSV should be formatted as
+
+  |sha256|username|
+  |---|---|
+  |123456789|mruser|
+  """
+
+  from os.path import join
+  from functools import partial
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib
+  from random import randint
+  import json
+
+  import pandas as pd
+  from tqdm import tqdm
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+
+  # setup multithreading function
+  def pool_process(item):
+    # threaded function
+    sha256 = item['sha256']
+    try:
+      # decrypt
+      cmd = ['/home/adam/.nvm/versions/node/v9.9.0/bin/node',
+        '/data_store/datasets/people/ibm_dif/web_files/decrypt_cli',
+        item['username'], item['sha256']]
+      proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+      o, e = proc.communicate()
+      data = json.loads(o.decode())
+      item['count'] = int(data['count'])
+      item['url'] = data['url']
+      # log.info(f"{data['count']}, {data['url']}")
+      item['status'] = True
+    except Exception as e:
+      log.debug(f'Error: {e}')
+      item['status'] = False
+    pbar.update(1)
+    return item
+
+  # setup multithreading data holders
+  log.debug(f'loading {opt_fp_in}')
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+  if opt_slice:
+    records = records[opt_slice[0]:opt_slice[1]]
+  log.debug(f'Processing {len(records):,}')
+
+  pool_items = []
+  for record in records:
+    fp_json = join(fp_dir_json, f"{record['sha256']}.json")
+    if Path(fp_json).is_file():
+      pool_items.append(record)
+
+
+  # too many records for RAM
+  del records
+  num_items = len(pool_items)
+  log.info(f'processing {num_items:,} items')
+
+  # run the multithreading with progress bar
+  pool_results = []
+  pbar = tqdm(total=num_items)
+  pool_process = partial(pool_process)
+  pool = ThreadPool(opt_threads)
+
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+
+  pbar.close()
+
+  df_results = pd.DataFrame.from_dict(pool_results)
+  df_results = df_results[df_results.status == True]
+  df_results.drop(['status'], axis=1, inplace=True)
+  df_results.to_csv(opt_fp_out, index=False)
+  log.debug(f'Saved file to: {opt_fp_out}')
+  total = sum([int(x['count']) for x in pool_results])
+  log.debug(f'Total: {total:,}')
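For orientation, the per-item work in decrypt_ibm.py reduces to one subprocess call into the Node decryption helper plus a JSON parse. A minimal sketch of that step, assuming (as the script does) that the helper prints a JSON object containing count and url; the node_bin and decrypt_cli arguments here are placeholders, not the hard-coded paths above:

    import json
    import subprocess

    def decrypt_row(username, sha256, node_bin='node', decrypt_cli='decrypt_cli'):
        # run the Node helper and parse the JSON it prints to stdout
        proc = subprocess.run([node_bin, decrypt_cli, username, sha256],
                              capture_output=True, check=True)
        data = json.loads(proc.stdout.decode())
        return int(data['count']), data['url']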
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..ed717662
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,121 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+  help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+  """Threaded image/file downloader"""
+
+  """
+  CSV should be formatted as
+
+  |url|filepath|
+  |---|---|
+  |https:/site.com/photo.jpg|myfolder/myname.jpg|
+
+  Saves logfile.csv output and uses for errors
+  """
+
+  from os.path import join
+  from functools import partial
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib
+  from random import randint
+
+  import pandas as pd
+  from tqdm import tqdm
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+
+  url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
+  with open(fp_user_agents, 'r') as fp:
+    user_agents = fp.readlines()
+  user_agents = [x.strip() for x in user_agents]
+
+
+  # setup multithreading function
+  def pool_process(item):
+    # threaded function
+    fp_out = item['filepath']
+    try:
+      # download image
+      opener = urllib.request.build_opener()
+      opener.addheaders = [('User-agent', item['user_agent'])]
+      urllib.request.install_opener(opener)
+      urllib.request.urlretrieve(item['url'], fp_out)
+      item['status'] = True
+    except Exception as e:
+      if str(e) != 'HTTP Error 403: Forbidden':
+        log.debug(f'Error: {e}')
+      fp_error = f'{fp_out}_error.txt'
+      with open(fp_error, 'w') as fp:
+        fp.write('')
+      item['status'] = False
+    pbar.update(1)
+    return item
+
+  # setup multithreading data holders
+  log.debug(f'loading {opt_fp_in}')
+  df_records = pd.read_csv(opt_fp_in)
+  log.debug(f'loaded {len(df_records):,} csv records')
+  log.debug('deduplicating')
+  df_records = df_records.drop_duplicates(subset='sha256', keep="last")
+  log.debug(f'unique records {len(df_records):,}')
+  records = df_records.to_dict('records')
+  log.debug(f'loaded {len(records):,} items')
+
+  pool_items = []
+  n_skipped = 0
+  n_valids = 0
+  n_errors = 0
+
+  for x in tqdm(records):
+    sha256 = x['sha256']
+
+    fp_dst = join(opt_fp_out, f"{sha256}.json")
+    fp_dst_is_file = Path(fp_dst).is_file()
+    fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+
+    if fp_dst_is_file:
+      n_valids += 1
+    elif fp_dst_is_err:
+      n_errors += 1
+
+    if not (fp_dst_is_file or fp_dst_is_err):
+      url = url_prefix + sha256 + '.json'
+      user_agent = user_agents[randint(0, len(user_agents)) - 1]
+      pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+    else:
+      n_skipped += 1
+
+  num_items = len(pool_items)
+  log.info(f'Error files: {n_errors:,} items')
+  log.info(f'Valid files: {n_valids:,} items')
+  log.info(f'skipping {n_skipped:,} items')
+  log.info(f'processing {num_items:,} items')
+  pool_results = []
+
+  # too many records for RAM
+  del records
+
+  # run the multithreading with progress bar
+  pbar = tqdm(total=num_items)
+  pool_process = partial(pool_process)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+
+  pbar.close()
+
+
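One detail worth flagging in download_ibmdif.py: randint(0, len(user_agents)) - 1 is inclusive on both ends, so it can return -1, which wraps around to the last user agent and picks it twice as often as the others. A sketch of the same rotation with random.choice, shown as an alternative rather than what the diff does:

    import random

    with open(fp_user_agents) as f:  # fp_user_agents as defined above
        user_agents = [line.strip() for line in f if line.strip()]

    user_agent = random.choice(user_agents)  # uniform over all agents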
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..c64afbba
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,90 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+  help='Number of threads')
+@click.option('--wayback', 'opt_wayback', is_flag=True,
+  help='Check Wayback archive for URL and download cached image')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
+  """Threaded image downloader"""
+
+  """
+  CSV should be formatted as
+
+  |url|filepath|
+  |---|---|
+  |https:/site.com/photo.jpg|myfolder/myname.jpg|
+
+  Saves logfile.csv output and uses for errors
+  """
+
+  from os.path import join
+  from functools import partial
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib
+
+  import pandas as pd
+  from tqdm import tqdm
+  from app.utils import file_utils
+  from app.utils.logger_utils import Logger
+
+  log = Logger.getLogger()
+
+  # setup multithreading function
+  def pool_process(item):
+    # threaded function
+    url_wayback_base = 'https://archive.org/wayback/available?url='
+    fp_out = item['filepath']
+    try:
+      # download image
+      file_utils.mkdirs(item['filepath'])
+      urllib.request.urlretrieve(item['url'], fp_out)
+      item['status'] = True
+    except Exception as e:
+      log.debug(f'Error: {e}, url: {item["url"]}')
+      estr = str(e)
+      if item['opt_wayback']:
+        if 'HTTP Error' in estr:
+          # check
+          url_wayback = url_wayback_base + item['url']
+      fp_error = f'{fp_out}_error.txt'
+      with open(fp_error, 'w') as fp:
+        fp.write('')
+      item['status'] = False
+    pbar.update(1)
+    return item
+
+  # setup multithreading data holds
+  log.debug(f'loading {opt_fp_in}')
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+
+
+  pool_items = []
+  for x in tqdm(records):
+    fp_dst = join(opt_fp_out, x['filepath'])
+    fp_dst_is_file = Path(fp_dst).is_file()
+    fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+    if not fp_dst_is_file and not fp_dst_is_err:
+      pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
+
+  num_items = len(pool_items)
+  log.info(f'processing {num_items:,} items')
+  pool_results = []
+
+  # run the multithreading with progress bar
+  pbar = tqdm(total=num_items)
+  pool_process = partial(pool_process)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+
+  pbar.close()
+
+
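download_images.py declares a --wayback flag and builds url_wayback in the error path, but the lookup itself is left unfinished. A sketch of how that branch could resolve a cached copy via the public archive.org availability endpoint; this completion is an assumption, not part of the diff:

    import json
    import urllib.parse
    import urllib.request

    def wayback_lookup(url):
        # query the Wayback availability API for the closest archived snapshot
        api = 'https://archive.org/wayback/available?url=' + urllib.parse.quote(url, safe='')
        with urllib.request.urlopen(api) as resp:
            data = json.loads(resp.read().decode())
        closest = data.get('archived_snapshots', {}).get('closest', {})
        return closest.get('url') if closest.get('available') else None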
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
new file mode 100644
index 00000000..780ede49
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -0,0 +1,202 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+query_types = ['photo_id', 'album_id', 'flickr_id']
+# flickr_id: 123456789@N01
+# photo_id:
+
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output directory')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('--query-type', 'opt_query_type', default='photo_id',
+  type=click.Choice(query_types),
+  help='API query type')
+@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
+@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
+@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
+  help='Delay between API calls to prevent rate-limiting')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
+  opt_delay, opt_query_type):
+  """Fetches Flickr API for user info. Saves to JSON"""
+
+  from tqdm import tqdm
+  from glob import glob
+  import time
+  import json
+  import os, sys
+  from random import randint
+  import urllib.request
+  import flickr_api  # pip install flickr_api
+  from flickr_api.flickrerrors import FlickrAPIError
+  from requests.compat import urljoin, quote_plus
+
+
+  # -------------------------------------------------
+  # process
+
+  if not opt_api_key or not opt_api_secret:
+    log.error('source .env vars for Flickr API and try again')
+    return
+
+  # check how many flickr keys
+  api_keys = []
+  api_secrets = []
+  for i in range(1,10):
+    try:
+      var_name_key = f'FLICKR_API_KEY_{i}'
+      var_name_secret = f'FLICKR_API_SECRET_{i}'
+      if os.environ[var_name_key] and os.environ[var_name_secret]:
+        api_keys.append(os.environ[var_name_key])
+        api_secrets.append(os.environ[var_name_secret])
+    except Exception as e:
+      pass
+
+  log.info(f'Shuffling between: {len(api_keys)} api keys')
+
+  # read in CSV
+  # | query, filepath |
+
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+  if opt_slice:
+    records = records[opt_slice[0]:opt_slice[1]]
+
+  log.info('Processing: {:,} items'.format(len(records)))
+
+  identities = []
+
+
+  for record in tqdm(records):
+    fp_out = join(opt_fp_out, record['filepath'])
+    fp_out_err = fp_out + '_error.txt'
+    if Path(fp_out).is_file() or Path(fp_out_err).is_file():
+      continue
+    # append relevant data
+    try:
+      # shuffle the api keys to avoid rate limiting
+      rand_int = randint(0,len(api_keys)-1)
+      api_key = api_keys[rand_int]
+      api_secret = api_secrets[rand_int]
+
+      #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
+
+      #photo = flickr_api.Photo(id=record['query'])
+      # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+      photo_id = record['query']
+      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
+      flickr_url += f'&api_key={api_key}'
+      flickr_url += f'&photo_id={photo_id}'
+      flickr_url += '&format=json'
+      flickr_url += '&nojsoncallback=1'
+
+      with urllib.request.urlopen(flickr_url) as url:
+        data = json.loads(url.read().decode())
+
+      if data['stat'] =='fail':
+        raise Exception('failed')
+      elif data['stat'] =='ok':
+        with open(fp_out, 'w') as fp:
+          json.dump(data, fp, sort_keys=True, indent=2)
+
+    #except FlickrAPIError as e:
+    except Exception as e:
+      # if "HTTP Server Error 500" in str(e):
+      log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
+      if "not found" in str(e) or 'failed' in str(e):
+        with open(fp_out_err, 'w') as fp:
+          fp.write('')
+
+    if opt_delay:
+      time.sleep(opt_delay)
+
+
+
+"""
+df_id_dict.update( {
+  'user_name': info.get('username', ''),
+  'location': info.get('location', ''),
+  'real_name': info.get('realname', ''),
+  'time_zone': info.get('timezone', {}).get('timezone_id', ''),
+  'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
+  'photos_count': info.get('photos_info', {}).get('count'),
+  'description': info.get('description', ''),
+  'id': info.get('id'),
+  'path_alias': info.get('path_alias', ''),
+  'is_pro': info.get('ispro', ''),
+  'url_photos': info.get('photosurl', ''),
+  'url_profile': info.get('photosurl', ''),
+  'url_mobile': info.get('mobileurl', ''),
+  })
+"""
+
+"""
+info = photo.getInfo()
+
+# serialize tags
+info['tag_names'] = []
+info['tag_ids'] = []
+tags = info['tags']
+for t in tags:
+  info['tag_names'].append(str(t.text))
+  info['tag_ids'].append(str(t.id))
+
+owner = info['owner']
+info['owner_id'] = str(owner.id)
+info['owner_username'] = str(owner.username)
+
+info.pop('tags')
+info.pop('owner')
+
+"""
+
+"""
+Example API data:
+{'id': '7124086@N07',
+ 'nsid': '7124086@N07',
+ 'ispro': 1,
+ 'can_buy_pro': 0,
+ 'iconserver': '2325',
+ 'iconfarm': 3,
+ 'path_alias': 'shirleylin',
+ 'has_stats': '1',
+ 'pro_badge': 'standard',
+ 'expire': '0',
+ 'username': 'ShirleyLin',
+ 'realname': 'Shirley Lin',
+ 'location': 'Fremont, California, US',
+ 'timezone': {'label': 'Pacific Time (US & Canada); Tijuana',
+  'offset': '-08:00',
+  'timezone_id': 'PST8PDT'},
+ 'description': '',
+ 'photosurl': 'https://www.flickr.com/photos/shirleylin/',
+ 'profileurl': 'https://www.flickr.com/people/shirleylin/',
+ 'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
+ 'photos_info': {'firstdatetaken': '2004-05-24 12:12:15',
+  'firstdate': '1172556588',
+  'count': 9665}}
+"""
+
+"""
+https://www.flickr.com/services/api/explore/flickr.photosets.getPhotos
+https://www.flickr.com/services/api/explore/flickr.photos.getInfo
+"""
\ No newline at end of file
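The JSON files written by flickr_api.py follow the flickr.photos.getInfo response layout, with the payload nested under a top-level photo key alongside stat. A sketch of how a downstream step might read the owner fields back out; the nested key names come from Flickr's documented response, not from this repo:

    import json

    def owner_from_getinfo(fp_json):
        with open(fp_json) as f:
            data = json.load(f)
        owner = data['photo']['owner']  # getInfo nests owner data under 'photo'
        return owner.get('nsid'), owner.get('username'), owner.get('realname')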
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+  """Measure skin color IJB-C"""
+
+  import sys
+  from glob import glob
+  from os.path import join
+  from pathlib import Path
+  import time
+
+  import pandas as pd
+  import cv2 as cv
+  from tqdm import tqdm
+
+  from app.utils import file_utils, im_utils
+  from app.models.data_store import DataStore
+
+  log = Logger.getLogger()
+  log.info('IJBC Skin Color')
diff --git a/megapixels/commands/datasets/megaface_flickr_api.py b/megapixels/commands/datasets/megaface_flickr_api.py
deleted file mode 100644
index 62232ab8..00000000
--- a/megapixels/commands/datasets/megaface_flickr_api.py
+++ /dev/null
@@ -1,141 +0,0 @@
-from glob import glob
-import os
-from os.path import join
-from pathlib import Path
-
-import click
-
-from app.settings import types
-from app.utils import click_utils
-from app.settings import app_cfg as cfg
-from app.utils import logger_utils
-
-import dlib
-import pandas as pd
-from PIL import Image, ImageOps, ImageFilter
-from app.utils import file_utils, im_utils
-
-
-log = logger_utils.Logger.getLogger()
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input directory')
-@click.option('-o', '--output', 'opt_fp_out',
-  help='Output directory')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-d', '--delay', 'opt_delay', default=None, type=int,
-  help='Delay between API calls to prevent rate-limiting')
-@click.option('--checkpoints', 'opt_checkpoints', is_flag=True,
-  help='Save checkpoints')
-@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY')
-@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET')
-@click.option('--checkpoint_interval', 'opt_ckpt_interval', default=10000,
-  help='Save checkpoint interval')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
-  opt_delay, opt_checkpoints, opt_ckpt_interval):
-  """Appends Flickr API info to CSV"""
-
-  from tqdm import tqdm
-  from glob import glob
-  import time
-  import flickr_api  # pip install flickr_api
-  from flickr_api.flickrerrors import FlickrAPIError
-
-  # -------------------------------------------------
-  # process
-
-  if not opt_api_key or not opt_api_secret:
-    log.error('source .env vars for Flickr API and try again')
-    return
-
-  # init Flickr API
-  flickr_api.set_keys(api_key=opt_api_key, api_secret=opt_api_secret)
-
-  # reqd in CSV
-  df_ids = pd.read_csv(opt_fp_in)
-  if opt_slice:
-    df_ids = df_ids[opt_slice[0]:opt_slice[1]]
-
-  log.info('Processing: {:,} items'.format(len(df_ids)))
-
-  # iterate MegaFace IDs
-  identities = []
-
-  tqdm.pandas()
-
-  for idx, df_id in tqdm(df_ids.iterrows(), total=len(df_ids)):
-    # a = flickr_api.Person(id='123456789@N01')
-    df_id_dict = dict(df_id)
-
-    # append relevant data
-    try:
-      person = flickr_api.Person(id=df_id['nsid'])
-      info = person.getInfo()
-      df_id_dict.update( {
-        'user_name': info.get('username', ''),
-        'location': info.get('location', ''),
-        'real_name': info.get('realname', ''),
-        'time_zone': info.get('timezone', {}).get('timezone_id', ''),
-        'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
-        'photos_count': info.get('photos_info', {}).get('count'),
-        'description': info.get('description', ''),
-        'id': info.get('id'),
-        'path_alias': info.get('path_alias', ''),
-        'is_pro': info.get('ispro', ''),
-        'url_photos': info.get('photosurl', ''),
-        'url_profile': info.get('photosurl', ''),
-        'url_mobile': info.get('mobileurl', ''),
-        })
-      identities.append(df_id_dict)
-
-    except FlickrAPIError as e:
-      log.error(e)
-
-
-    if opt_checkpoints:
-      if (idx + 1) % opt_ckpt_interval == 0:
-        df = pd.DataFrame.from_dict(identities)
-        fpp_out = Path(opt_fp_out)
-        opt_fp_out_ckpt = join(fpp_out.parent, '{}_ckpt_{}.csv'.format(fpp_out.stem, file_utils.zpad(idx + 1)))
-        log.info('Saving checkpoint {:,} to {}'.format(idx + 1, opt_fp_out_ckpt))
-        df.to_csv(opt_fp_out_ckpt, index=False)
-
-    if opt_delay:
-      time.sleep(opt_delay)
-
-
-  df = pd.DataFrame.from_dict(identities)
-  df.to_csv(opt_fp_out, index=False)
-
-  log.info('Wrote: {:,} lines to {}'.format(len(df), opt_fp_out))
-
-
-"""
-Example API data:
-{'id': '7124086@N07',
- 'nsid': '7124086@N07',
- 'ispro': 1,
- 'can_buy_pro': 0,
- 'iconserver': '2325',
- 'iconfarm': 3,
- 'path_alias': 'shirleylin',
- 'has_stats': '1',
- 'pro_badge': 'standard',
- 'expire': '0',
- 'username': 'ShirleyLin',
- 'realname': 'Shirley Lin',
- 'location': 'Fremont, California, US',
- 'timezone': {'label': 'Pacific Time (US & Canada); Tijuana',
-  'offset': '-08:00',
-  'timezone_id': 'PST8PDT'},
- 'description': '',
- 'photosurl': 'https://www.flickr.com/photos/shirleylin/',
- 'profileurl': 'https://www.flickr.com/people/shirleylin/',
- 'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
- 'photos_info': {'firstdatetaken': '2004-05-24 12:12:15',
-  'firstdate': '1172556588',
-  'count': 9665}}
-"""
\ No newline at end of file
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..b8b68094
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import os
+import click
+import re
+import os
+import csv
+import string
+import codecs
+import gspread
+from os.path import join
+from pathlib import Path
+from multiprocessing import Pool
+import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils, file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal']
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+  type=click.Choice(opt_sheets),
+  default=['datasets'],
+  help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+  help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+  help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+  """Fetch Google spreadsheet"""
+
+  import sys
+  import pandas as pd
+  from tqdm import tqdm
+
+  log = Logger.getLogger()
+  if opt_all:
+    opt_spreadsheets = opt_sheets
+
+  for sheet_name in opt_spreadsheets:
+    log.info(f'Get spreadsheet: {sheet_name}')
+    sheet_data = fetch_google_sheet_objects(name=sheet_name)
+    df_sheet = pd.DataFrame.from_dict(sheet_data)
+    if sheet_name == 'datasets':
+      df_sheet = clean_datasets_sheet_ft(df_sheet)
+    fpp_out = Path(opt_fp_out)
+    file_utils.mkdirs(fpp_out)
+
+    if opt_all and fpp_out.is_file():
+      fpp_out = fpp_out.parent
+    else:
+      fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
+    df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+  # clean data for FT
+  df = df[df['ft_share'] == 'Y']
+  keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
+  return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+  # clean data for FT
+  df = df[df['ft_share'] == 'Y']
+  keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
+  return df[keys]
+
+def fetch_spreadsheet():
+  """Open the Google Spreadsheet, which contains the individual worksheets"""
+  scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+  fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+  credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+  docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+  client = gspread.authorize(credentials)
+  spreadsheet = client.open_by_key(docid)
+  return spreadsheet
+
+def fetch_worksheet(name="institutions"):
+  """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
+  spreadsheet = fetch_spreadsheet()
+  return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name="institutions"):
+  """Get all the values from a particular worksheet as a list of lists.
+  Returns:
+    :keys - the first row of the document
+    :lines - a list of lists with the rest of the rows"""
+  rows = fetch_worksheet(name).get_all_values()
+  keys = rows[0]
+  lines = rows[1:]
+  return keys, lines
+
+def fetch_google_sheet_objects(name):
+  """Get all the values from a worksheet as a list of dictionaries"""
+  keys, rows = fetch_google_sheet(name)
+  recs = []
+  for row in rows:
+    rec = {}
+    for index, key in enumerate(keys):
+      rec[key] = row[index]
+    recs.append(rec)
+  return recs
+
+def fetch_google_lookup(name, item_key='key'):
+  """Get all the values from a worksheet as a dictionary of dictionaries.
+  Specify which field you want to use as the dictionary key."""
+  keys, rows = fetch_google_sheet(name)
+  lookup = {}
+  for row in rows:
+    rec = {}
+    for index, key in enumerate(keys):
+      rec[key] = row[index]
+    lookup[rec[item_key]] = rec
+  return lookup
\ No newline at end of file
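The row-to-dict helpers in pull_spreadsheet.py can be collapsed with zip, and gspread's Worksheet.get_all_records() returns header-keyed dicts directly; shown here only as an equivalent, not as a change to the code above:

    # equivalent of fetch_google_sheet_objects / fetch_google_lookup
    keys, rows = fetch_google_sheet('datasets')
    recs = [dict(zip(keys, row)) for row in rows]
    lookup = {rec['key']: rec for rec in recs}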
diff --git a/megapixels/commands/datasets/whogoesthere.py b/megapixels/commands/datasets/whogoesthere.py
new file mode 100644
index 00000000..6cf9f009
--- /dev/null
+++ b/megapixels/commands/datasets/whogoesthere.py
@@ -0,0 +1,72 @@
+"""
+Unpack data for:
+
+Z. Bessinger, C. Stauffer, and N. Jacobs, “Who Goes There? Approaches to
+Mapping Facial Appearance Diversity,” in Proceedings of the 24th SIGSPATIAL
+International Conference on Advances in Geographic Information Systems, 2016.
+"""
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+keys_all = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city',
+  'content_length', 'country_code', 'date_taken', 'date_uploaded',
+  'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f',
+  'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original',
+  'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original',
+  'im_server_id', 'index', 'latitude', 'license_name', 'license_url', 'longitude',
+  'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']
+
+keys_keep = ['accuracy', 'admin1', 'admin2', 'age', 'capture_device', 'city',
+  'content_length', 'country_code', 'date_taken', 'date_uploaded',
+  'description', 'face', 'face_bounds', 'face_key', 'face_landmarks_f',
+  'face_landmarks_o', 'gender', 'im_download_url', 'im_extension_original',
+  'im_farm_id', 'im_id', 'im_page_url', 'im_secret', 'im_secret_original',
+  'im_server_id', 'index', 'latitude', 'license_name', 'license_url', 'longitude',
+  'machine_tags', 'title', 'user_nickname', 'user_nsid', 'user_tags']
+
+@click.command()
+@click.pass_context
+@click.option('-i', '--input', 'opt_fp_in', required=True)
+@click.option('-o', '--output', 'opt_fp_out', required=True)
+@click.option('--value', 'opt_value', required=True, type=click.Choice(keys_all))
+def cli(ctx, opt_fp_in, opt_fp_out, opt_value):
+  """Convert WhoGoesThere HDF5"""
+
+  import sys
+  from glob import glob
+  from os.path import join
+  from pathlib import Path
+  import time
+
+  import pandas as pd
+  import h5py
+  from scipy import misc
+  from io import BytesIO
+  from base64 import b64decode
+  from tqdm import tqdm
+
+  log = Logger.getLogger()
+  log.info('Uncompress HDF5')
+
+  key_vals = []
+
+  with h5py.File(opt_fp_in, 'r') as fp:
+    num_items = len(fp['face'])
+    log.info(f'items: {num_items:,}')
+
+    for idx in tqdm(range(0, min(99999999,num_items))):
+      # face_str = fp['face'][0]
+      # face_im = misc.imread(BytesIO(b64decode(face_str)))
+      # print(fo['face_landmarks_f/x'][0])
+      # age = fp['age'][idx].decode()
+      key_val = fp[opt_value][idx].decode()
+      key_vals.append(key_val)
+
+  key_vals = set(key_vals)
+  with open(opt_fp_out, 'w') as fp:
+    for key_val in key_vals:
+      fp.write(f'{key_val}\n')
\ No newline at end of file
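In whogoesthere.py the values are pulled one index at a time, which forces a separate HDF5 read per item. h5py can read the whole column in one slice, which is usually much faster for this kind of unique-value extraction; a sketch under that assumption:

    import h5py

    with h5py.File(opt_fp_in, 'r') as f:
        raw = f[opt_value][:]  # one bulk read instead of num_items indexed reads
        key_vals = {v.decode() if isinstance(v, bytes) else str(v) for v in raw}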
