diff options
| author | adamhrv <adam@ahprojects.com> | 2019-05-29 15:24:30 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-05-29 15:24:30 +0200 |
| commit | 5b916111ee1a012650a586ec07bc9150d66020bc (patch) | |
| tree | 128092857e6a9b6d67877e55e05da4f99ea2f5eb /megapixels/commands | |
| parent | f5141a7b48ee569089b07428bc75cb84a55c4834 (diff) | |
add MSC nbs and cli cmds
Diffstat (limited to 'megapixels/commands')
| -rw-r--r-- | megapixels/commands/msc/append_embassies.py | 126 | ||||
| -rw-r--r-- | megapixels/commands/msc/append_embassy_profile.py | 150 | ||||
| -rw-r--r-- | megapixels/commands/msc/basic.py | 30 | ||||
| -rw-r--r-- | megapixels/commands/msc/cross_reference.py | 78 | ||||
| -rw-r--r-- | megapixels/commands/msc/embassy_flickr_api_data_to_csv.py | 120 | ||||
| -rw-r--r-- | megapixels/commands/msc/flickr_list_to_csv.py | 48 | ||||
| -rw-r--r-- | megapixels/commands/msc/plot_countries.py | 31 | ||||
| -rw-r--r-- | megapixels/commands/msc/summarize.py | 67 |
8 files changed, 650 insertions, 0 deletions
# ---------------------------------------------------------------------------
# megapixels/commands/msc/append_embassies.py  (new file, reconstructed)
#
# Resolves the Flickr NSID + username for each embassy page URL in a CSV and
# appends them as new columns.
# ---------------------------------------------------------------------------

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


log = logger_utils.Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
              help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
        opt_delay):
    """Resolve the Flickr NSID/username for each embassy URL. Saves to CSV.

    Reads a CSV with a `url` column, calls flickr.urls.lookupUser for every
    row that has no `nsid` yet, and writes the augmented rows back out.
    """
    from tqdm import tqdm
    import time
    import json
    from random import randint
    import urllib.request
    from requests.compat import quote_plus

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source ../env/flickr.env vars for Flickr API and try again')
        return

    # Collect every FLICKR_API_KEY_N / FLICKR_API_SECRET_N pair from the
    # environment; requests are spread across them to dodge rate limits.
    api_keys = []
    api_secrets = []
    for i in range(1, 10):
        try:
            var_name_key = f'FLICKR_API_KEY_{i}'
            var_name_secret = f'FLICKR_API_SECRET_{i}'
            if os.environ[var_name_key] and os.environ[var_name_secret]:
                api_keys.append(os.environ[var_name_key])
                api_secrets.append(os.environ[var_name_secret])
        except KeyError:
            # os.environ[...] raises KeyError for a missing slot; skip it
            pass

    log.info(f'Shuffling between: {len(api_keys)} api keys')

    # read in CSV: | username, ... |
    df_records = pd.read_csv(opt_fp_in)
    log.info(f'Deduplicating {len(df_records)}')
    df_records = df_records.drop_duplicates(subset='url', keep="last")
    log.info(f'Deduplicated {len(df_records)}')
    records = df_records.to_dict('records')

    # default slice (None, None) is a no-op copy
    records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    for record in tqdm(records):
        if record.get('nsid', None):
            continue  # already resolved on a previous run
        flickr_url = ''  # defined up-front so the except-handler can log it
        try:
            # pick a random api key to spread the request load
            rand_int = randint(0, len(api_keys) - 1)
            api_key = api_keys[rand_int]

            # Example response:
            # { "user": { "id": "46768316@N07",
            #     "username": { "_content": "U.S. Embassy Tirana Art Contest" } },
            #   "stat": "ok" }
            #
            # https://www.flickr.com/services/rest/
            #   ?method=flickr.urls.lookupUser&api_key=xxx&url=[encoded url]
            #   &format=json&nojsoncallback=1
            url_encoded = quote_plus(record['url'])
            flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.urls.lookupUser'
            flickr_url += f'&api_key={api_key}'
            flickr_url += f'&url={url_encoded}'
            flickr_url += '&format=json'
            flickr_url += '&nojsoncallback=1'
            log.debug(f'{flickr_url}')
            with urllib.request.urlopen(flickr_url) as resp:
                data = json.loads(resp.read().decode())

            if data['stat'] == 'fail':
                error_msg = data["message"]
                log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
                raise Exception(error_msg)
            elif data['stat'] == 'ok':
                user_data = data.get('user')
                record['nsid'] = user_data.get('id')
                record['username'] = user_data.get('username').get('_content')

            if opt_delay:
                time.sleep(opt_delay)  # optional throttle between API calls

        except Exception as e:
            log.error(f'Exception: {e}, url: {flickr_url}')

    # write data
    df_records = pd.DataFrame.from_dict(records)
    df_records.to_csv(opt_fp_out, index=False)


# ---------------------------------------------------------------------------
# megapixels/commands/msc/append_embassy_profile.py  (new file, reconstructed)
#
# Fetches flickr.profile.getProfile for each NSID and appends the profile
# fields (join_date, city, country, social handles, ...) to the CSV.
# ---------------------------------------------------------------------------

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


log = logger_utils.Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
              help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
        opt_delay):
    """Fetch the Flickr profile for each embassy NSID. Saves to CSV."""
    from tqdm import tqdm
    import time
    import json
    from random import randint
    import urllib.parse  # explicit: quote_plus lives here, not in urllib.request
    import urllib.request

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source ../env/flickr.env vars for Flickr API and try again')
        return

    # collect all FLICKR_API_KEY_N / FLICKR_API_SECRET_N pairs
    api_keys = []
    api_secrets = []
    for i in range(1, 20):
        try:
            var_name_key = f'FLICKR_API_KEY_{i}'
            var_name_secret = f'FLICKR_API_SECRET_{i}'
            if os.environ[var_name_key] and os.environ[var_name_secret]:
                api_keys.append(os.environ[var_name_key])
                api_secrets.append(os.environ[var_name_secret])
        except KeyError:
            pass

    log.info(f'Shuffling between: {len(api_keys)} api keys')

    # read in CSV: | username, ... |
    df_records = pd.read_csv(opt_fp_in)
    log.info(f'Deduplicating {len(df_records)}')
    df_records = df_records.drop_duplicates(subset='url', keep="last")
    log.info(f'Deduplicated {len(df_records)}')
    records = df_records.to_dict('records')

    records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    for record in tqdm(records):
        if not record.get('nsid', None):
            log.warning(f'No NSID for {record["url"]}')
            continue
        error_msg = ''
        flickr_url = ''  # defined up-front so the except-handler can log it
        try:
            # shuffle the api keys to avoid rate limiting
            rand_int = randint(0, len(api_keys) - 1)
            api_key = api_keys[rand_int]

            nsid_encoded = urllib.parse.quote_plus(record['nsid'])
            # NOTE(review): the sibling command uses api.flickr.com — confirm
            # this host also serves the REST endpoint
            flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
            flickr_url += f'&api_key={api_key}'
            flickr_url += f'&user_id={nsid_encoded}'
            flickr_url += '&format=json'
            flickr_url += '&nojsoncallback=1'

            with urllib.request.urlopen(flickr_url) as resp:
                data = json.loads(resp.read().decode())

            if data['stat'] == 'fail':
                error_msg = data["message"]
                raise Exception(error_msg)
            elif data['stat'] == 'ok':
                profile = data.get('profile')
                # copy profile fields onto the record without clobbering
                # columns that already have a value (setdefault)
                tags = ['join_date', 'occupation', 'hometown', 'first_name', 'last_name']
                tags += ['profile_description', 'city', 'country', 'twitter', 'facebook', 'instagram']
                for tag in tags:
                    record.setdefault(tag, profile.get(tag))

            if opt_delay:
                time.sleep(opt_delay)  # optional throttle between API calls

        except Exception as e:
            log.error(f'Exception: {e}, message: {error_msg}, url: {flickr_url}')

    # write data
    df_records = pd.DataFrame.from_dict(records)
    df_records.to_csv(opt_fp_out, index=False)


"""
Sample flickr.profile.getProfile response:
{
  "profile": {
    "id": "129819216@N03",
    "nsid": "129819216@N03",
    "join_date": "1417769829",
    "occupation": "",
    "hometown": "",
    "showcase_set": "72157680742231281",
    "showcase_set_title": "Profile Showcase",
    "first_name": "Ambasciata",
    "last_name": "d'Italia a Praga",
    "profile_description": "",
    "city": "",
    "country": "",
    "facebook": "",
    "twitter": "",
    "tumblr": "",
    "instagram": "",
    "pinterest": ""
  },
  "stat": "ok"
}
"""
# ---------------------------------------------------------------------------
# megapixels/commands/msc/basic.py  (new file, reconstructed)
#
# Template command: verifies the CLI plumbing and the standard import set.
# ---------------------------------------------------------------------------

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


@click.command()
@click.pass_context
def cli(ctx):
    """_template_"""
    # function-scope imports kept as the template's standard boilerplate
    import sys
    from glob import glob
    from os.path import join
    from pathlib import Path
    import time

    import pandas as pd
    import cv2 as cv
    from tqdm import tqdm

    from app.utils import file_utils, im_utils
    from app.models.data_store import DataStore

    log = Logger.getLogger()
    log.info('template works')


# ---------------------------------------------------------------------------
# megapixels/commands/msc/cross_reference.py  (new file, reconstructed)
#
# Cross references embassy Flickr accounts (by NSID) against the metadata of
# several face datasets and writes the matches with photo counts.
# ---------------------------------------------------------------------------

from os.path import join

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# source file for Embassy NSIDs
fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'

# list of datasets to cross reference
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
fp_dataset_base = '/data_store/datasets/people/'
fp_datasets = {}
for dk in dataset_keys:
    fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')

# output file
fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
              help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
    """Cross reference"""
    import pandas as pd
    from tqdm import tqdm

    log = Logger.getLogger()
    log.info('Cross reference embassy list')

    df_embassies = pd.read_csv(opt_fp_in)
    df_embassies.fillna('', inplace=True)
    # set membership is O(1) per lookup vs. the O(n) list scan per NSID
    embassy_nsids = set(df_embassies['nsid'])

    match_items = []
    for dataset_key, fp_dataset in fp_datasets.items():
        df_dataset = pd.read_csv(fp_dataset)
        for nsid in df_dataset['nsid']:
            if nsid not in embassy_nsids:
                continue
            # photo count from the dataset side; identity from the embassy side
            count = df_dataset.loc[df_dataset['nsid'] == nsid, 'count'].values[0]
            embassy = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
            first_name = embassy['first_name']
            last_name = embassy['last_name']
            path_alias = embassy['path_alias']
            log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
            match_items.append({
                'count': count,
                'path_alias': path_alias,
                'name': f'{first_name} {last_name}',
                'dataset_key': dataset_key,
                'nsid': nsid,
            })

    df = pd.DataFrame.from_dict(match_items)
    df.to_csv(opt_fp_out, index=False)

    total = df['count'].sum()

    log.debug(f'Found {total} embassy photos')
# ---------------------------------------------------------------------------
# megapixels/commands/msc/embassy_flickr_api_data_to_csv.py
# (new file, reconstructed)
# ---------------------------------------------------------------------------

"""
Converts directory of JSON API output files to CSV format
"""

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils

# supported API payload layouts (see the per-branch comments in cli)
data_types = ['nsid_url', 'nsid_profile']

log = logger_utils.Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-q', '--query', 'opt_query', required=True, type=click.Choice(data_types),
              help='Flickr API data type')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query):
    """Converts a directory of Flickr API JSON files into a single CSV."""
    from tqdm import tqdm
    import json

    # -------------------------------------------------
    # process
    if Path(opt_fp_out).is_file() and not opt_force:
        log.error('File exists. Use "--force" to overwrite it')
        return

    # skip files whose name marks a recorded API error
    fp_files = glob(join(opt_fp_in, '*.json'))
    fp_files = [f for f in fp_files if 'error' not in f]
    fp_files = fp_files[opt_slice[0]:opt_slice[1]]

    log.debug(f'Found {len(fp_files)} files')

    # 'nsid_url' payloads look like:
    #   { "stat": "ok",
    #     "user": { "id": "95216244@N04",
    #               "username": { "_content": "AfghanistanWien" } } }
    #
    # 'nsid_profile' payloads are flickr.profile.getProfile responses, e.g.:
    #   { "profile": { "id": "133886098@N05", "nsid": "133886098@N05",
    #                  "city": "Oslo", "country": "Norway",
    #                  "first_name": "US Embassy", "last_name": "Oslo",
    #                  "join_date": "1436521589", ... },
    #     "stat": "ok" }
    items = []
    for fp_file in tqdm(fp_files):
        metadata = file_utils.load_json(fp_file)

        if opt_query == 'nsid_url':
            # the filename (minus .json) is the flickr path alias
            path_alias = Path(fp_file).stem
            user = metadata.get('user')
            obj = {
                'nsid': user.get('id'),
                'username': user.get('username').get('_content'),
                'url': f'https://www.flickr.com/photos/{path_alias}',
                'path_alias': path_alias,
                'filename': f'{path_alias}.json',
            }
        else:
            # 'nsid_profile' — click.Choice guarantees one of the two values,
            # so an explicit else keeps `obj` always bound
            obj = metadata.get('profile')

        items.append(obj)

    # convert to DataFrame
    df = pd.DataFrame.from_dict(items)
    df.to_csv(opt_fp_out, index=False)
    log.info(f'Wrote {len(df)} to {opt_fp_out}')
# ---------------------------------------------------------------------------
# megapixels/commands/msc/flickr_list_to_csv.py  (new file, reconstructed)
#
# Converts a plain-text "<url> <title...>" embassy list into a CSV.
# ---------------------------------------------------------------------------

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert embassy list to CSV"""
    from pathlib import Path

    import pandas as pd
    from tqdm import tqdm

    from app.utils import file_utils

    log = Logger.getLogger()
    log.info('converting flickr list to CSV')

    items = []

    embassies = file_utils.load_text(opt_fp_in)

    # each line is "<url> <title words...>"; the URL's last path segment is
    # the flickr username / path alias
    for embassy in tqdm(embassies):
        splits = embassy.split(' ')
        url = splits[0].strip()
        title = ' '.join(splits[1:]).strip()
        username = Path(url).stem
        items.append({'title': title, 'url': url, 'username': username})

    df = pd.DataFrame.from_dict(items)
    df.to_csv(opt_fp_out, index=False)
    log.debug(f'Wrote {len(df)} lines')


# ---------------------------------------------------------------------------
# megapixels/commands/msc/plot_countries.py  (new file, reconstructed)
#
# Stub command: imports plotting dependencies; plotting logic not yet added.
# ---------------------------------------------------------------------------

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


@click.command()
@click.pass_context
def cli(ctx):
    """Plot countries"""
    # function-scope imports kept as the template's standard boilerplate
    import sys
    from glob import glob
    from os.path import join
    from pathlib import Path
    import time

    import pandas as pd
    import cv2 as cv
    from tqdm import tqdm
    import matplotlib.pyplot as plt

    from app.utils import file_utils, im_utils
    from app.models.data_store import DataStore

    log = Logger.getLogger()
    log.info('Plot country data')


# ---------------------------------------------------------------------------
# megapixels/commands/msc/summarize.py  (new file, reconstructed)
#
# Concatenates per-dataset citation CSVs and writes summaries grouped by
# country and by location type (sector).
# ---------------------------------------------------------------------------

import click

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True)
@click.option('-o', '--output', 'opt_fp_out', required=True)
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Summarize citation CSVs by country and by sector."""
    from os.path import join
    from pprint import pprint

    import pandas as pd

    log = Logger.getLogger()

    dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']

    # NOTE(review): the original computed a '_citations.csv' output path here
    # but never used it — combined citations go to opt_fp_out as before.

    # concatenate per-dataset CSVs, dropping rows with no geolocation (lat==0)
    frames = []
    for dataset_name in dataset_names:
        fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
        _df = pd.read_csv(fp_csv)
        _df = _df[_df.lat != 0]
        print(dataset_name, len(_df))
        frames.append(_df)
    # pd.concat replaces DataFrame.append (deprecated 1.4, removed in 2.0)
    df = pd.concat(frames, ignore_index=True)

    df.to_csv(opt_fp_out, index=False)

    # create country summary
    fp_out = opt_fp_out.replace('.csv', '_countries.csv')
    summary = []
    for group_name, group in df.groupby('country'):
        summary.append({'country': group_name, 'citations': len(group)})
    df_summary = pd.DataFrame.from_dict(summary)
    df_summary.sort_values(by='citations', ascending=False, inplace=True)
    df_summary.to_csv(fp_out, index=False)
    pprint(df_summary)

    # summary by sector / location type
    summary = []
    fp_out = opt_fp_out.replace('.csv', '_sector.csv')
    for group_name, group in df.groupby('loc_type'):
        summary.append({'type': group_name, 'citations': len(group)})
    df_types = pd.DataFrame.from_dict(summary)
    df_types.sort_values(by='citations', ascending=False, inplace=True)
    df_types.to_csv(fp_out, index=False)
    pprint(df_types)
