summaryrefslogtreecommitdiff
path: root/megapixels/commands
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands')
-rw-r--r--megapixels/commands/msc/append_embassies.py126
-rw-r--r--megapixels/commands/msc/append_embassy_profile.py150
-rw-r--r--megapixels/commands/msc/basic.py30
-rw-r--r--megapixels/commands/msc/cross_reference.py78
-rw-r--r--megapixels/commands/msc/embassy_flickr_api_data_to_csv.py120
-rw-r--r--megapixels/commands/msc/flickr_list_to_csv.py48
-rw-r--r--megapixels/commands/msc/plot_countries.py31
-rw-r--r--megapixels/commands/msc/summarize.py67
8 files changed, 650 insertions, 0 deletions
diff --git a/megapixels/commands/msc/append_embassies.py b/megapixels/commands/msc/append_embassies.py
new file mode 100644
index 00000000..2e659344
--- /dev/null
+++ b/megapixels/commands/msc/append_embassies.py
@@ -0,0 +1,126 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+log = logger_utils.Logger.getLogger()
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
  help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
  opt_delay):
  """Looks up the Flickr NSID and username for each embassy URL. Saves to CSV.

  Reads a CSV with a `url` column, calls the flickr.urls.lookupUser REST
  endpoint for every row that does not yet have an `nsid`, and writes the
  augmented rows back out as CSV.
  """

  from tqdm import tqdm
  import time
  import json
  from random import randint
  import urllib.request
  from urllib.parse import quote_plus  # stdlib; avoids the requests.compat dependency


  # -------------------------------------------------
  # process

  if not opt_api_key or not opt_api_secret:
    log.error('source ../env/flickr.env vars for Flickr API and try again')
    return

  # collect all FLICKR_API_KEY_N / FLICKR_API_SECRET_N pairs from the environment
  api_keys = []
  api_secrets = []
  for i in range(1, 10):
    key = os.environ.get(f'FLICKR_API_KEY_{i}')
    secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
    if key and secret:
      api_keys.append(key)
      api_secrets.append(secret)

  log.info(f'Shuffling between: {len(api_keys)} api keys')

  # read in CSV and drop duplicate profile URLs, keeping the most recent row
  # | username, ... |
  df_records = pd.read_csv(opt_fp_in)
  log.info(f'Deduplicating {len(df_records)}')
  df_records = df_records.drop_duplicates(subset='url', keep="last")
  log.info(f'Deduplicated {len(df_records)}')
  records = df_records.to_dict('records')

  # the default (None, None) slices to the full list
  records = records[opt_slice[0]:opt_slice[1]]

  log.info('Processing: {:,} items'.format(len(records)))

  for record in tqdm(records):
    if record.get('nsid', None):
      continue  # already resolved on a previous run
    flickr_url = None  # defined up-front so the except-clause can log it safely
    try:
      # rotate the api keys randomly to avoid rate limiting
      rand_int = randint(0, len(api_keys) - 1)
      api_key = api_keys[rand_int]
      """
      { "user": { "id": "46768316@N07",
      "username": { "_content": "U.S. Embassy Tirana Art Contest" } }, "stat": "ok" }
      """

      # https://www.flickr.com/services/rest/
      # ?method=flickr.urls.lookupUser&api_key=xxx&url=[encoded url]&format=json&nojsoncallback=1
      url_encoded = quote_plus(record['url'])
      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.urls.lookupUser'
      flickr_url += f'&api_key={api_key}'
      flickr_url += f'&url={url_encoded}'
      flickr_url += '&format=json'
      flickr_url += '&nojsoncallback=1'
      log.debug(f'{flickr_url}')
      # NB: renamed from `url` so we don't shadow the record's url value
      with urllib.request.urlopen(flickr_url) as resp:
        data = json.loads(resp.read().decode())

      if data['stat'] == 'fail':
        error_msg = data["message"]
        log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
        raise Exception(error_msg)
      elif data['stat'] == 'ok':
        user_data = data.get('user')
        record['nsid'] = user_data.get('id')
        record['username'] = user_data.get('username').get('_content')

    except Exception as e:
      log.error(f'Exception: {e}, url: {flickr_url}')

    # honor the --delay option (previously accepted but ignored)
    if opt_delay:
      time.sleep(opt_delay)

  # write data (no index column, consistent with the other msc commands)
  df_records = pd.DataFrame.from_dict(records)
  df_records.to_csv(opt_fp_out, index=False)
diff --git a/megapixels/commands/msc/append_embassy_profile.py b/megapixels/commands/msc/append_embassy_profile.py
new file mode 100644
index 00000000..7d301f06
--- /dev/null
+++ b/megapixels/commands/msc/append_embassy_profile.py
@@ -0,0 +1,150 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+log = logger_utils.Logger.getLogger()
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
  help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
  opt_delay):
  """Fetches the Flickr profile (flickr.profile.getProfile) for each NSID. Saves to CSV.

  Reads a CSV with `nsid` and `url` columns and appends profile fields
  (join_date, occupation, social handles, ...) to each row.
  """

  from tqdm import tqdm
  import time
  import json
  from random import randint
  import urllib.request
  # explicit stdlib import; the original relied on `urllib.parse` being
  # loaded as a side effect of importing urllib.request
  from urllib.parse import quote_plus


  # -------------------------------------------------
  # process

  if not opt_api_key or not opt_api_secret:
    log.error('source ../env/flickr.env vars for Flickr API and try again')
    return

  # collect all FLICKR_API_KEY_N / FLICKR_API_SECRET_N pairs from the environment
  api_keys = []
  api_secrets = []
  for i in range(1, 20):
    key = os.environ.get(f'FLICKR_API_KEY_{i}')
    secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
    if key and secret:
      api_keys.append(key)
      api_secrets.append(secret)

  log.info(f'Shuffling between: {len(api_keys)} api keys')

  # read in CSV and drop duplicate profile URLs, keeping the most recent row
  # | username, ... |
  df_records = pd.read_csv(opt_fp_in)
  log.info(f'Deduplicating {len(df_records)}')
  df_records = df_records.drop_duplicates(subset='url', keep="last")
  log.info(f'Deduplicated {len(df_records)}')
  records = df_records.to_dict('records')

  # the default (None, None) slices to the full list
  records = records[opt_slice[0]:opt_slice[1]]

  log.info('Processing: {:,} items'.format(len(records)))

  for record in tqdm(records):
    nsid = record.get('nsid', None)
    if not nsid:
      # logger.warn is deprecated in favor of warning
      log.warning(f'No NSID for {record["url"]}')
      continue
    flickr_url = None  # defined up-front so the except-clause can log it safely
    error_msg = ''
    try:
      # rotate the api keys randomly to avoid rate limiting
      rand_int = randint(0, len(api_keys) - 1)
      api_key = api_keys[rand_int]

      nsid_encoded = quote_plus(nsid)
      flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
      flickr_url += f'&api_key={api_key}'
      flickr_url += f'&user_id={nsid_encoded}'
      flickr_url += '&format=json'
      flickr_url += '&nojsoncallback=1'

      with urllib.request.urlopen(flickr_url) as resp:
        data = json.loads(resp.read().decode())

      if data['stat'] == 'fail':
        error_msg = data["message"]
        raise Exception(error_msg)
      elif data['stat'] == 'ok':
        profile = data.get('profile')
        # append profile data without clobbering values already in the CSV
        tags = ['join_date', 'occupation', 'hometown', 'first_name', 'last_name']
        tags += ['profile_description', 'city', 'country', 'twitter', 'facebook', 'instagram']
        for tag in tags:
          record.setdefault(tag, profile.get(tag))

    except Exception as e:
      log.error(f'Exception: {e}, message: {error_msg}, url: {flickr_url}')

    # honor the --delay option (previously accepted but ignored)
    if opt_delay:
      time.sleep(opt_delay)

  # write data
  df_records = pd.DataFrame.from_dict(records)
  df_records.to_csv(opt_fp_out, index=False)
+
+
+"""
+{
+ "profile": {
+ "id": "129819216@N03",
+ "nsid": "129819216@N03",
+ "join_date": "1417769829",
+ "occupation": "",
+ "hometown": "",
+ "showcase_set": "72157680742231281",
+ "showcase_set_title": "Profile Showcase",
+ "first_name": "Ambasciata",
+ "last_name": "d'Italia a Praga",
+ "profile_description": "",
+ "city": "",
+ "country": "",
+ "facebook": "",
+ "twitter": "",
+ "tumblr": "",
+ "instagram": "",
+ "pinterest": ""
+ },
+ "stat": "ok"
+}
+""" \ No newline at end of file
diff --git a/megapixels/commands/msc/basic.py b/megapixels/commands/msc/basic.py
new file mode 100644
index 00000000..2e952896
--- /dev/null
+++ b/megapixels/commands/msc/basic.py
@@ -0,0 +1,30 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
@click.command()
@click.pass_context
def cli(ctx, ):
  """_template_"""

  # deferred imports, grouped stdlib / third-party / project
  import sys
  import time
  from glob import glob
  from os.path import join
  from pathlib import Path

  import cv2 as cv
  import pandas as pd
  from tqdm import tqdm

  from app.models.data_store import DataStore
  from app.utils import file_utils, im_utils

  # local re-bind of the logger, matching the sibling commands
  log = Logger.getLogger()
  log.info('template works')
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
new file mode 100644
index 00000000..d4457945
--- /dev/null
+++ b/megapixels/commands/msc/cross_reference.py
@@ -0,0 +1,78 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
# source file for Embassy NSIDs
fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'

# datasets to cross reference; each has a research/<name>_flickr_meta.csv file
dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
fp_dataset_base = '/data_store/datasets/people/'
fp_datasets = {
  dk: join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
  for dk in dataset_keys
}


# output file
fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
  help='Input file for embassies')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, ):
  """Cross reference embassy NSIDs against dataset Flickr metadata CSVs.

  Writes one row per (nsid, dataset) match with the dataset photo count.
  NB: --slice and --force are accepted for interface parity but unused here.
  """

  import pandas as pd
  from tqdm import tqdm

  log = Logger.getLogger()
  log.info('Cross reference embassy list')

  df_embassies = pd.read_csv(opt_fp_in)
  df_embassies.fillna('', inplace=True)
  # set gives O(1) membership tests instead of an O(n) list scan per nsid
  embassy_nsids = set(df_embassies['nsid'])

  match_items = []
  for dataset_key, fp_dataset in fp_datasets.items():
    df_dataset = pd.read_csv(fp_dataset)
    for nsid in df_dataset['nsid']:
      if nsid in embassy_nsids:
        # add to matches, and count
        count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0]
        # one row lookup instead of three identical filters
        embassy_row = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
        first_name = embassy_row['first_name']
        last_name = embassy_row['last_name']
        path_alias = embassy_row['path_alias']
        log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
        match_obj = {
          'count': count,
          'path_alias': path_alias,
          'name': f'{first_name} {last_name}',
          'dataset_key': dataset_key,
          'nsid': nsid
        }
        match_items.append(match_obj)

  df = pd.DataFrame.from_dict(match_items)
  df.to_csv(opt_fp_out, index=False)

  # an empty frame has no 'count' column; avoid a KeyError when nothing matched
  total = df['count'].sum() if not df.empty else 0

  log.debug(f'Found {total} embassy photos')
diff --git a/megapixels/commands/msc/embassy_flickr_api_data_to_csv.py b/megapixels/commands/msc/embassy_flickr_api_data_to_csv.py
new file mode 100644
index 00000000..1a0b6a91
--- /dev/null
+++ b/megapixels/commands/msc/embassy_flickr_api_data_to_csv.py
@@ -0,0 +1,120 @@
+"""
+Converts directory of JSON API output files to CSV format
+"""
+
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+data_types = ['nsid_url', 'nsid_profile']
+
+log = logger_utils.Logger.getLogger()
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-q', '--query', 'opt_query', required=True, type=click.Choice(data_types),
  help='Flickr API data type')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query):
  """Converts a directory of Flickr API JSON responses to a single CSV.

  --query nsid_url expects flickr.urls.lookupUser output named '<path_alias>.json';
  --query nsid_profile expects flickr.profile.getProfile output.
  """

  from tqdm import tqdm
  from glob import glob


  # -------------------------------------------------
  # process
  if Path(opt_fp_out).is_file() and not opt_force:
    log.error('File exists. Use "--force" to overwrite it')
    return

  # skip files flagged as API errors (filename contains 'error')
  fp_files = glob(join(opt_fp_in, '*.json'))
  fp_files = [f for f in fp_files if 'error' not in f]
  fp_files = fp_files[opt_slice[0]:opt_slice[1]]  # default (None, None) keeps everything

  log.debug(f'Found {len(fp_files)} files')
  items = []
  """
  nsid_url sample:
  {
    "stat": "ok",
    "user": {
      "id": "95216244@N04",
      "username": { "_content": "AfghanistanWien" }
    }
  }
  nsid_profile sample:
  {
    "stat": "ok",
    "profile": {
      "id": "133886098@N05", "nsid": "133886098@N05",
      "first_name": "US Embassy", "last_name": "Oslo",
      "city": "Oslo", "country": "Norway", "hometown": "Oslo",
      "join_date": "1436521589", "occupation": "",
      "facebook": "", "instagram": "", "pinterest": "", "tumblr": "", "twitter": "",
      "profile_description": "This is the official Flickr profile of the U.S. Embassy in Oslo, Norway. Contact us: osloirc@state.gov.",
      "showcase_set": "72157677372281094", "showcase_set_title": "Profile Showcase",
      "website": "http://norway.usembassy.gov/index.html"
    }
  }
  """
  # Convert to |nsid|username| rows
  for fp_file in tqdm(fp_files):
    metadata = file_utils.load_json(fp_file)

    if opt_query == 'nsid_url':
      user_data = metadata.get('user')
      if not user_data:
        # malformed response: skip instead of crashing on None.get(...)
        log.warning(f'No "user" payload in {fp_file}, skipping')
        continue
      path_alias = Path(fp_file).stem  # files are named '<path_alias>.json'
      obj = {
        'nsid': user_data.get('id'),
        'username': user_data.get('username').get('_content'),
        'url': f'https://www.flickr.com/photos/{path_alias}',
        'path_alias': path_alias,
        'filename': f'{path_alias}.json'
      }
    else:  # 'nsid_profile' (click.Choice guarantees one of the two values)
      obj = metadata.get('profile')
      if not obj:
        # malformed response: skip instead of appending a None row
        log.warning(f'No "profile" payload in {fp_file}, skipping')
        continue

    items.append(obj)


  # convert to DataFrame
  df = pd.DataFrame.from_dict(items)
  df.to_csv(opt_fp_out, index=False)
  log.info(f'Wrote {len(df)} to {opt_fp_out}')
diff --git a/megapixels/commands/msc/flickr_list_to_csv.py b/megapixels/commands/msc/flickr_list_to_csv.py
new file mode 100644
index 00000000..f107db60
--- /dev/null
+++ b/megapixels/commands/msc/flickr_list_to_csv.py
@@ -0,0 +1,48 @@
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
# default input: plain-text list of '<flickr url> <embassy title>' lines
fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
# default output: CSV with title, url, username columns
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
  help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Convert embassy list to CSV.

  Each input line is '<flickr url> <embassy title>'; the username column is
  derived from the last path component of the url.
  """

  from pathlib import Path

  import pandas as pd
  from tqdm import tqdm

  from app.utils import file_utils

  log = Logger.getLogger()
  log.info('converting flickr list to CSV')

  items = []

  embassies = file_utils.load_text(opt_fp_in)

  for embassy in tqdm(embassies):
    embassy = embassy.strip()
    if not embassy:
      continue  # skip blank lines rather than emitting empty rows
    splits = embassy.split(' ')
    url = splits[0].strip()
    title = ' '.join(splits[1:]).strip()
    username = Path(url).stem
    items.append({'title': title, 'url': url, 'username': username})

  df = pd.DataFrame.from_dict(items)
  df.to_csv(opt_fp_out, index=False)
  log.debug(f'Wrote {len(df)} lines')
+
+
diff --git a/megapixels/commands/msc/plot_countries.py b/megapixels/commands/msc/plot_countries.py
new file mode 100644
index 00000000..9df7dcd5
--- /dev/null
+++ b/megapixels/commands/msc/plot_countries.py
@@ -0,0 +1,31 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
@click.command()
@click.pass_context
def cli(ctx, ):
  """Plot countries"""

  # deferred imports, grouped stdlib / third-party / project
  import sys
  import time
  from glob import glob
  from os.path import join
  from pathlib import Path

  import cv2 as cv
  import matplotlib.pyplot as plt
  import pandas as pd
  from tqdm import tqdm

  from app.models.data_store import DataStore
  from app.utils import file_utils, im_utils

  # local re-bind of the logger, matching the sibling commands
  log = Logger.getLogger()
  log.info('Plot country data')
diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py
new file mode 100644
index 00000000..d5d251db
--- /dev/null
+++ b/megapixels/commands/msc/summarize.py
@@ -0,0 +1,67 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True)
@click.option('-o', '--output', 'opt_fp_out', required=True)
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Summarize dataset citation locations by country and by sector.

  Merges the per-dataset CSVs in the input directory (dropping rows with no
  geocoded lat), writes the merged CSV to --output, and derives
  *_countries.csv and *_sector.csv summary files alongside it.
  """

  from os.path import join
  from pprint import pprint

  import pandas as pd

  log = Logger.getLogger()

  dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']

  # merge the per-dataset CSVs, dropping rows without a geocoded location
  frames = []
  for dataset_name in dataset_names:
    fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
    _df = pd.read_csv(fp_csv)
    _df = _df[_df.lat != 0]
    print(dataset_name, len(_df))
    frames.append(_df)
  # DataFrame.append was removed in pandas 2.0; a single concat is also faster
  df = pd.concat(frames, ignore_index=True)

  df.to_csv(opt_fp_out, index=False)

  # create country summary
  fp_out = opt_fp_out.replace('.csv', '_countries.csv')
  summary = []
  for group_name, group in df.groupby('country'):
    summary.append({'country': group_name, 'citations': len(group)})
  df_summary = pd.DataFrame.from_dict(summary)
  df_summary.sort_values(by='citations', ascending=False, inplace=True)
  df_summary.to_csv(fp_out, index=False)
  pprint(df_summary)

  # summary sector
  summary = []
  fp_out = opt_fp_out.replace('.csv', '_sector.csv')
  for group_name, group in df.groupby('loc_type'):
    summary.append({'type': group_name, 'citations': len(group)})
  df_types = pd.DataFrame.from_dict(summary)
  df_types.sort_values(by='citations', ascending=False, inplace=True)
  df_types.to_csv(fp_out, index=False)
  pprint(df_types)
+
+