author    adamhrv <adam@ahprojects.com>  2019-06-03 03:33:06 +0200
committer adamhrv <adam@ahprojects.com>  2019-06-03 03:33:06 +0200
commit    1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree      86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands
parent    e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
add msc working utils
Diffstat (limited to 'megapixels/commands')
-rw-r--r--  megapixels/commands/datasets/download_ibmdif.py    |  11
-rw-r--r--  megapixels/commands/datasets/download_images.py    |   9
-rw-r--r--  megapixels/commands/datasets/flickr_api.py         |  84
-rw-r--r--  megapixels/commands/datasets/flickr_api_to_csv.py  | 382
-rw-r--r--  megapixels/commands/msc/count.py                   | 123
-rw-r--r--  megapixels/commands/msc/cross_reference.py         |  78
-rw-r--r--  megapixels/commands/msc/summarize.py               |   5
7 files changed, 580 insertions, 112 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index ed717662..0b81fef6 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t
help='Output path')
@click.option('-t', '--threads', 'opt_threads', default=8,
help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents):
"""Threaded image/file downloader"""
"""
@@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
urllib.request.urlretrieve(item['url'], fp_out)
item['status'] = True
except Exception as e:
+ log.debug(f'Failed: user: {item["username"]}, url: {item["url"]}')
if str(e) != 'HTTP Error 403: Forbidden':
log.debug(f'Error: {e}')
fp_error = f'{fp_out}_error.txt'
@@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
# setup multithreading data holders
log.debug(f'loading {opt_fp_in}')
df_records = pd.read_csv(opt_fp_in)
+ if opt_slice:
+ df_records = df_records[opt_slice[0]:opt_slice[1]]
log.debug(f'loaded {len(df_records):,} csv records')
log.debug('deduplicating')
df_records = df_records.drop_duplicates(subset='sha256', keep="last")
@@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
for x in tqdm(records):
sha256 = x['sha256']
-
+ username = x['username']
fp_dst = join(opt_fp_out, f"{sha256}.json")
fp_dst_is_file = Path(fp_dst).is_file()
fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
if not (fp_dst_is_file or fp_dst_is_err):
url = url_prefix + sha256 + '.json'
user_agent = user_agents[randint(0, len(user_agents)) - 1]
- pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+ pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent})
else:
n_skipped += 1
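
The new --slice option forwards a (start, end) pair straight into a DataFrame slice, so the click default of (None, None) selects the full frame. A minimal sketch of the same pattern outside click (frame contents here are illustrative):

    import pandas as pd

    def apply_slice(df, opt_slice):
        # (None, None) slices to the full frame; (0, 1000) keeps the first 1000 rows
        start, end = opt_slice
        return df[start:end]

    df = pd.DataFrame({'sha256': ['a', 'b', 'c', 'd']})
    assert len(apply_slice(df, (None, None))) == 4
    assert len(apply_slice(df, (1, 3))) == 2
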
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index c64afbba..45ca8f6e 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -6,9 +6,9 @@ import click
help='Input')
@click.option('-o', '--output', 'opt_fp_out', required=True,
help='Output')
-@click.option('-t', '--threads', 'opt_threads', default=8,
+@click.option('-t', '--threads', 'opt_threads', default=8, show_default=True,
help='Number of threads')
-@click.option('--wayback', 'opt_wayback', is_flag=True,
+@click.option('--wayback', 'opt_wayback', is_flag=True, default=False,
help='Check Wayback archive for URL and download cached image')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
@@ -52,7 +52,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
estr = str(e)
if item['opt_wayback']:
if 'HTTP Error' in estr:
- # check
+ # TODO add/parse/handle request for wayback machine archive
url_wayback = url_wayback_base + item['url']
fp_error = f'{fp_out}_error.txt'
with open(fp_error, 'w') as fp:
@@ -67,6 +67,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
pool_items = []
+ log.debug('Initializing multithreaded pool...')
for x in tqdm(records):
fp_dst = join(opt_fp_out, x['filepath'])
fp_dst_is_file = Path(fp_dst).is_file()
@@ -75,7 +76,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
num_items = len(pool_items)
- log.info(f'processing {num_items:,} items')
+ log.info(f'Going to download {num_items:,} files')
pool_results = []
# run the multithreading with progress bar
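
The wayback branch above still only records the error; one way to finish the TODO is the Internet Archive availability endpoint, which returns the closest cached snapshot for a URL. A hedged sketch, assuming the public archive.org availability API (not part of this diff):

    import json
    import urllib.parse
    import urllib.request

    def wayback_snapshot_url(url):
        # returns the closest archived snapshot URL, or None if nothing is cached
        api = 'https://archive.org/wayback/available?url=' + urllib.parse.quote(url, safe='')
        with urllib.request.urlopen(api) as resp:
            data = json.loads(resp.read().decode())
        closest = data.get('archived_snapshots', {}).get('closest', {})
        return closest.get('url') if closest.get('available') else None

The snapshot URL could then be retried with the same urlretrieve call used for the original URL.
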
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
index 780ede49..f09f3089 100644
--- a/megapixels/commands/datasets/flickr_api.py
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils
-query_types = ['photo_id', 'album_id', 'flickr_id']
+query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
+# example query values:
+# photo_id: 123456789
# flickr_id: 123456789@N01
-# photo_id:
log = logger_utils.Logger.getLogger()
@@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger()
help='Output directory')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
help='Slice list of files')
-@click.option('--query-type', 'opt_query_type', default='photo_id',
+@click.option('-q', '--query', 'opt_query_type', required=True,
type=click.Choice(query_types),
help='API query type')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
# process
if not opt_api_key or not opt_api_secret:
- log.error('source .env vars for Flickr API and try again')
+ log.error('source env/flickr.env vars for Flickr API and try again')
return
# check how many flickr keys
api_keys = []
api_secrets = []
- for i in range(1,10):
+ for i in range(1,20):
try:
var_name_key = f'FLICKR_API_KEY_{i}'
var_name_secret = f'FLICKR_API_SECRET_{i}'
@@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
log.info(f'Shuffling between: {len(api_keys)} api keys')
# read in CSV
- # | query, filepath |
-
- records = pd.read_csv(opt_fp_in).to_dict('records')
+ # | query, filename, count |
+ df_records = pd.read_csv(opt_fp_in)
+ log.info(f'Deduplicating {len(df_records)} records')
+ if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile':
+ df_records = df_records.drop_duplicates(subset='nsid', keep="last")
+ else:
+ df_records = df_records.drop_duplicates(subset='photo_id', keep="last")
+ log.info(f'After deduplication: {len(df_records)}')
+ records = df_records.to_dict('records')
+
if opt_slice:
records = records[opt_slice[0]:opt_slice[1]]
@@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
for record in tqdm(records):
- fp_out = join(opt_fp_out, record['filepath'])
+ if 'nsid' in opt_query_type:
+ fp_out = join(opt_fp_out, f"{record['nsid']}.json")
+ else:
+ fp_out = join(opt_fp_out, f'{record["photo_id"]}.json')
+
fp_out_err = fp_out + '_error.txt'
if Path(fp_out).is_file() or Path(fp_out_err).is_file():
continue
- # append relevant data
try:
# shuffle the api keys to avoid rate limiting
rand_int = randint(0,len(api_keys)-1)
api_key = api_keys[rand_int]
api_secret = api_secrets[rand_int]
-
- #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
- #photo = flickr_api.Photo(id=record['query'])
- # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
- photo_id = record['query']
- flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
- flickr_url += f'&api_key={api_key}'
- flickr_url += f'&photo_id={photo_id}'
- flickr_url += '&format=json'
- flickr_url += '&nojsoncallback=1'
+ # https://www.flickr.com/services/rest/
+ if opt_query_type == 'nsid_url':
+ # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+ nsid = record['nsid']
+ nsid_encoded = urllib.parse.quote_plus(nsid)
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&user_id={nsid_encoded}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
+ elif opt_query_type == 'nsid_profile':
+ nsid = record['nsid']
+ nsid_encoded = urllib.parse.quote_plus(nsid)
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&user_id={nsid_encoded}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
+ elif opt_query_type == 'photo_id':
+ # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+ photo_id = record['photo_id']
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&photo_id={photo_id}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
with urllib.request.urlopen(flickr_url) as url:
data = json.loads(url.read().decode())
if data['stat'] =='fail':
- raise Exception('failed')
+ error_msg = data["message"]
+ log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
+ if error_msg == 'Service currently unavailable':
+ time.sleep(10)
+ raise Exception(error_msg)
elif data['stat'] =='ok':
with open(fp_out, 'w') as fp:
json.dump(data, fp, sort_keys=True, indent=2)
- #except FlickrAPIError as e:
except Exception as e:
- # if "HTTP Server Error 500" in str(e):
- log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
- if "not found" in str(e) or 'failed' in str(e):
+ log.error(f'{e}')
+ if "not found" in str(e) or 'Invalid NSID provided' in str(e):
with open(fp_out_err, 'w') as fp:
fp.write('')
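
Each branch above assembles its REST URL by string concatenation; urllib.parse.urlencode builds the same query strings and handles the @ in NSIDs without an explicit quote_plus call. A sketch of the equivalent construction (the API key value is a placeholder):

    import urllib.parse

    def flickr_rest_url(method, api_key, **params):
        # e.g. method='flickr.photos.getInfo' with photo_id=..., or
        # method='flickr.profile.getProfile' with user_id=...
        query = {'method': method, 'api_key': api_key,
                 'format': 'json', 'nojsoncallback': 1, **params}
        return 'https://www.flickr.com/services/rest/?' + urllib.parse.urlencode(query)

    url = flickr_rest_url('flickr.urls.getUserProfile', 'API_KEY', user_id='7153718@N04')
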
diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py
new file mode 100644
index 00000000..5b5f0ce3
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api_to_csv.py
@@ -0,0 +1,382 @@
+"""
+Converts directory of JSON API output files to CSV format
+"""
+
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+query_types = ['nsid_profile', 'nsid_url', 'photo_id']
+
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type):
+ """Converts a directory of Flickr API JSON files to CSV"""
+
+ from tqdm import tqdm
+ from glob import glob
+ import json
+
+
+ # -------------------------------------------------
+ # process
+ if Path(opt_fp_out).is_file() and not opt_force:
+ log.error('File exists. Use "--force" to overwrite it')
+ return
+
+ fp_files = glob(join(opt_fp_in, '*.json'))
+ fp_files = [f for f in fp_files if 'error' not in f]
+ if opt_slice:
+ fp_files = fp_files[opt_slice[0]:opt_slice[1]]
+
+ log.debug(f'Found {len(fp_files)} files')
+ items = []
+
+ for fp_file in tqdm(fp_files):
+
+ if opt_query_type == 'photo_id':
+ try:
+ photo = file_utils.load_json(fp_file).get('photo')
+ except Exception as e:
+ log.error(f'{e}, skipping: {fp_file}')
+ continue
+ dates = photo.get('dates')
+ posted = dates.get('posted')
+ taken = dates.get('taken')
+ description = photo.get('description').get('_content')
+ location = photo.get('location', {})
+ country = location.get('country', {})
+ location_country = country.get('_content', '')
+ location_place = country.get('place_id', '')
+ location_woeid = country.get('woeid', '')
+ location_lat = location.get('latitude', '')
+ location_lon = location.get('longitude', '')
+ location_place_id = location.get('place_id', '')
+ owner = photo.get('owner')
+ nsid = owner.get('nsid')
+ path_alias = owner.get('path_alias')
+ owner_realname = owner.get('realname')
+ owner_username = owner.get('username')
+ owner_location = owner.get('location')
+ photo_id = Path(fp_file).stem
+ server = photo.get('server')
+ farm = photo.get('farm')
+ secret = photo.get('secret')
+ # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg
+ image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'
+
+ obj = {
+ 'posted': posted,
+ 'taken': taken,
+ 'description': description,
+ 'country': location_country,
+ 'place': location_place,
+ 'woeid': location_woeid,
+ 'lat': location_lat,
+ 'lon': location_lon,
+ 'place_id': location_place_id,
+ 'nsid': nsid,
+ 'path_alias': path_alias,
+ 'realname': owner_realname,
+ 'username': owner_username,
+ 'owner_location': owner_location,
+ 'photo_id': photo_id,
+ 'secret': secret,
+ 'url': image_url
+ }
+
+
+ elif opt_query_type == 'nsid_profile':
+ obj = file_utils.load_json(fp_file).get('profile')
+ obj.pop('showcase_set', None)
+ obj.pop('showcase_set_title', None)
+ obj.pop('pinterest', None)
+ obj.pop('tumblr', None)
+ elif opt_query_type == 'nsid_url':
+ obj = file_utils.load_json(fp_file).get('user')
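+ # NOTE: 'user_profile' is not in query_types above, so this branch is currently unreachable from the CLI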
+ elif opt_query_type == 'user_profile':
+ metadata = file_utils.load_json(fp_file).get('photo')
+ owner = metadata.get('owner')
+ path_alias = owner.get('path_alias')
+ nsid = owner.get('nsid')
+ username = owner.get('username')
+ realname = owner.get('realname')
+ description = metadata.get('description').get('_content')
+ title = metadata.get('title').get('_content')
+ location = metadata.get('location')
+ dates = metadata.get('dates')
+ date_taken = dates.get('taken')
+ date_posted = dates.get('posted')
+ fname = Path(fp_file).stem
+ obj = {
+ 'photo_id': fname,
+ 'nsid': nsid,
+ 'path_alias': path_alias,
+ 'username': username,
+ 'realname': realname,
+ 'title': title,
+ 'description': description,
+ 'location': location,
+ 'date_taken': date_taken,
+ 'date_posted': date_posted
+ }
+
+ items.append(obj)
+
+ # convert to DataFrame
+ df = pd.DataFrame.from_dict(items)
+ df.to_csv(opt_fp_out, index=False)
+ log.info(f'Wrote {len(df)} rows to {opt_fp_out}')
+
+"""
+nsid_url
+ {
+ "stat": "ok",
+ "user": {
+ "nsid": "7153718@N04",
+ "url": "https://www.flickr.com/people/babyfish4/"
+ }
+}
+"""
+"""
+ location: of the owner
+ dateuploaded
+ license
+ "dates":
+ "lastupdate": "1416447096"
+ "posted": "1112900873"
+ "taken": "2005-04-06 18:37:38"
+ description:
+ _content: playing in a field
+ title:
+ _content: jessica
+ location: cornwall, uk
+"""
+
+"""
+ {
+ "profile": {
+ "city": null,
+ "country": null,
+ "facebook": "",
+ "first_name": null,
+ "hometown": "",
+ "id": "7153718@N04",
+ "instagram": "",
+ "join_date": "1172669959",
+ "last_name": null,
+ "nsid": "7153718@N04",
+ "occupation": "",
+ "pinterest": "",
+ "profile_description": "",
+ "showcase_set": "72157680616398790",
+ "showcase_set_title": "Profile Showcase",
+ "tumblr": "",
+ "twitter": ""
+ },
+ "stat": "ok"
+}
+"""
+
+"""
+photo_id
+
+
+ {
+ "photo": {
+ "comments": {
+ "_content": "0"
+ },
+ "dates": {
+ "lastupdate": "0",
+ "posted": "1094612969",
+ "taken": "2004-09-04 22:41:18",
+ "takengranularity": "0",
+ "takenunknown": 0
+ },
+ "dateuploaded": "1094612969",
+ "description": {
+ "_content": ""
+ },
+ "editability": {
+ "canaddmeta": 0,
+ "cancomment": 0
+ },
+ "farm": 1,
+ "geoperms": {
+ "iscontact": 0,
+ "isfamily": 0,
+ "isfriend": 0,
+ "ispublic": 1
+ },
+ "id": "371498",
+ "isfavorite": 0,
+ "license": "1",
+ "location": {
+ "accuracy": "15",
+ "context": "0",
+ "country": {
+ "_content": "United States",
+ "place_id": "nz.gsghTUb4c2WAecA",
+ "woeid": "23424977"
+ },
+ "county": {
+ "_content": "Tompkins",
+ "place_id": "1uCJJtBQUL80G6hbPw",
+ "woeid": "12589366"
+ },
+ "latitude": "42.399028",
+ "longitude": "-76.652519",
+ "place_id": "1uCJJtBQUL80G6hbPw",
+ "region": {
+ "_content": "New York",
+ "place_id": "ODHTuIhTUb75gdBu",
+ "woeid": "2347591"
+ },
+ "woeid": "12589366"
+ },
+ "media": "photo",
+ "notes": {
+ "note": []
+ },
+ "originalformat": "jpg",
+ "originalsecret": "704f392686",
+ "owner": {
+ "iconfarm": 1,
+ "iconserver": "1",
+ "location": "Los Angeles, CA, USA",
+ "nsid": "48600072071@N01",
+ "path_alias": "barb",
+ "realname": "Barb Dybwad",
+ "username": "doctor paradox"
+ },
+ "people": {
+ "haspeople": 0
+ },
+ "publiceditability": {
+ "canaddmeta": 0,
+ "cancomment": 1
+ },
+ "rotation": 0,
+ "safety_level": "0",
+ "secret": "704f392686",
+ "server": "1",
+ "tags": {
+ "tag": [
+ {
+ "_content": "unfound",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-9017",
+ "machine_tag": 0,
+ "raw": "unfound"
+ },
+ {
+ "_content": "digicam",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-40406",
+ "machine_tag": 0,
+ "raw": "digicam"
+ },
+ {
+ "_content": "upstateny",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-9655",
+ "machine_tag": 0,
+ "raw": "upstateny"
+ },
+ {
+ "_content": "musefest",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-72456",
+ "machine_tag": 0,
+ "raw": "musefest"
+ },
+ {
+ "_content": "musicfestival",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-72628",
+ "machine_tag": 0,
+ "raw": "musicfestival"
+ },
+ {
+ "_content": "people",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-290",
+ "machine_tag": 0,
+ "raw": "people"
+ },
+ {
+ "_content": "portrait",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-278",
+ "machine_tag": 0,
+ "raw": "portrait"
+ },
+ {
+ "_content": "maco",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-19439",
+ "machine_tag": 0,
+ "raw": "maco"
+ }
+ ]
+ },
+ "title": {
+ "_content": "maco2"
+ },
+ "urls": {
+ "url": [
+ {
+ "_content": "https://www.flickr.com/photos/barb/371498/",
+ "type": "photopage"
+ }
+ ]
+ },
+ "usage": {
+ "canblog": 0,
+ "candownload": 1,
+ "canprint": 0,
+ "canshare": 1
+ },
+ "views": "290",
+ "visibility": {
+ "isfamily": 0,
+ "isfriend": 0,
+ "ispublic": 1
+ }
+ },
+ "stat": "ok"
+}
+"""
\ No newline at end of file
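
Several of the photo_id fields above are read with chained .get() calls (photo.get('description').get('_content')) that raise AttributeError when a level is missing, and those lookups sit outside the per-file try/except, so one sparse record would abort the run. A defensive variant under the same JSON shape (the dig helper name is illustrative):

    def dig(obj, *keys, default=''):
        # walk nested dicts, returning `default` at the first missing level
        for key in keys:
            if not isinstance(obj, dict) or key not in obj:
                return default
            obj = obj[key]
        return obj

    photo = {'description': {'_content': 'playing in a field'}}
    assert dig(photo, 'description', '_content') == 'playing in a field'
    assert dig(photo, 'location', 'country', '_content') == ''
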
diff --git a/megapixels/commands/msc/count.py b/megapixels/commands/msc/count.py
new file mode 100644
index 00000000..3c242bc6
--- /dev/null
+++ b/megapixels/commands/msc/count.py
@@ -0,0 +1,123 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input file for embassies')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+ """Cross-references embassy NSIDs against dataset counts and filepaths"""
+
+ import sys
+ from os.path import join
+ from glob import glob
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ log.info('Cross reference embassy list')
+
+
+ fp_counts = {}
+ fp_filepaths = {}
+ fp_dataset_base = '/data_store/datasets/people/'
+
+ for dk in dataset_keys:
+ fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
+ fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+
+ df_embassies = pd.read_csv(opt_fp_in)
+ df_embassies.fillna('', inplace=True)
+ embassy_nsids = list(df_embassies['nsid'])
+
+ match_items = []
+ embassy_images = []
+ malta_images = []
+
+ for dataset_key, fp_dataset in tqdm(fp_counts.items()):
+ df_counts = pd.read_csv(fp_dataset)
+ log.debug(f'loading: {fp_filepaths[dataset_key]}')
+ df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
+ nsids = list(df_counts['nsid'])
+ for nsid in nsids:
+ if nsid in embassy_nsids:
+ # add to matches, and count
+ count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
+ first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
+ last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
+ path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
+ page_url = f'https://flickr.com/photos/{path_alias}'
+ embassy_name = f'{first_name} {last_name}'
+ embassy_meta = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
+
+ match_obj = {
+ 'count': count,
+ 'path_alias': path_alias,
+ 'name': embassy_name,
+ 'dataset_key': dataset_key,
+ 'nsid': nsid,
+ 'page_url': page_url,
+ 'embassy_type': embassy_meta.type,
+ 'username': embassy_meta.username
+ }
+ match_items.append(match_obj)
+
+ # add photo ids or url
+ df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
+ nsid_records = df_nsids.to_dict('records')
+ for nsid_record in nsid_records:
+ photo_id = nsid_record.get('photo_id')
+ im_obj = {
+ 'nsid': nsid,
+ 'url': nsid_record.get('url'),
+ 'photo_id': photo_id,
+ 'dataset_key': dataset_key,
+ 'path_alias': path_alias,
+ 'name': embassy_name,
+ 'page_url': page_url,
+ 'username': embassy_meta.username,
+ 'filepath': f'{photo_id}.jpg'
+ }
+
+ embassy_images.append(im_obj)
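+ # this NSID is collected separately and written to *_images_malta.csv below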
+ if nsid == '51226353@N03':
+ malta_images.append(im_obj)
+
+ # Save embassy matches
+ df = pd.DataFrame.from_dict(match_items)
+ df.to_csv(opt_fp_out, index=False)
+ total = df['count'].sum()
+ log.debug(f'wrote {opt_fp_out}')
+ log.debug(f'Found {total:,} matched embassy photos across datasets')
+
+ # Save image matches
+ df = pd.DataFrame.from_dict(embassy_images)
+ fp_out = opt_fp_out.replace('.csv', '_images.csv')
+ df.to_csv(fp_out, index=False)
+ total = len(embassy_images)
+ log.debug(f'wrote {fp_out}')
+ log.debug(f'Found {total:,} embassy images')
+
+ # Save malta images
+ df = pd.DataFrame.from_dict(malta_images)
+ fp_out = opt_fp_out.replace('.csv', '_images_malta.csv')
+ df.to_csv(fp_out, index=False)
+ total = len(malta_images)
+ log.debug(f'wrote {fp_out}')
+ log.debug(f'Found {total:,} Malta embassy images')
\ No newline at end of file
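
count.py filters df_counts and df_embassies with a fresh boolean mask for every matched NSID; indexing the frames by nsid once avoids rescanning them inside the loop. A sketch of the lookup pattern under the same column names (frame contents are illustrative):

    import pandas as pd

    df_counts = pd.DataFrame({'nsid': ['1@N00', '2@N00'], 'count': [3, 7]})
    counts_by_nsid = df_counts.set_index('nsid')

    # single indexed lookup instead of df_counts[df_counts['nsid'] == nsid]
    assert counts_by_nsid.loc['1@N00', 'count'] == 3
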
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
deleted file mode 100644
index d4457945..00000000
--- a/megapixels/commands/msc/cross_reference.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from os.path import join
-
-import click
-
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# source file for Embassy NSIDs
-fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'
-
-# list of datasets to cross reference
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
-fp_dataset_base = '/data_store/datasets/people/'
-fp_datasets = {}
-for dk in dataset_keys:
- fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
-
-
-# output file
-fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
- help='Input file for embassies')
-@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
- help='Output file')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
- help='Slice list of files')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
- help='Force overwrite')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
- """Cross reference"""
-
- import sys
- from os.path import join
- from glob import glob
- from pathlib import Path
- import time
-
- import pandas as pd
- from tqdm import tqdm
-
- log = Logger.getLogger()
- log.info('Cross reference embassy list')
-
- df_embassies = pd.read_csv(opt_fp_in)
- df_embassies.fillna('', inplace=True)
- embassy_nsids = list(df_embassies['nsid'])
-
- match_items = []
- for dataset_key, fp_dataset in fp_datasets.items():
- df_dataset = pd.read_csv(fp_dataset)
- nsids = list(df_dataset['nsid'])
- for nsid in nsids:
- if nsid in embassy_nsids:
- # add to matches, and count
- count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0]
- first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
- last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
- path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
- log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
- match_obj = {
- 'count': count,
- 'path_alias': path_alias,
- 'name': f'{first_name} {last_name}',
- 'dataset_key': dataset_key,
- 'nsid': nsid
- }
- match_items.append(match_obj)
-
- df = pd.DataFrame.from_dict(match_items)
- df.to_csv(opt_fp_out, index=False)
-
- total = df['count'].sum()
-
- log.debug(f'Found {total} embassy photos')
\ No newline at end of file
diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py
index d5d251db..045e3b69 100644
--- a/megapixels/commands/msc/summarize.py
+++ b/megapixels/commands/msc/summarize.py
@@ -29,7 +29,7 @@ def cli(ctx, opt_fp_in, opt_fp_out):
log = Logger.getLogger()
- dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']
+ dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'brainwash', 'msceleb', 'duke_mtmc', 'uccs']
df = pd.DataFrame()
fp_out = opt_fp_out.replace('.csv', '_citations.csv')
@@ -37,10 +37,11 @@ def cli(ctx, opt_fp_in, opt_fp_out):
fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
_df = pd.read_csv(fp_csv)
_df = _df[_df.lat != 0]
+ _df.drop('id', axis=1, inplace=True)
print(dataset_name, len(_df))
df = df.append(_df, ignore_index=True)
- df.to_csv(opt_fp_out, index=False)
+ df.to_csv(fp_out, index=False)
# create country summary
fp_out = opt_fp_out.replace('.csv', '_countries.csv')
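
summarize.py accumulates the per-dataset frames with DataFrame.append in a loop; on pandas 2.x, where append was removed, the same accumulation is written with pd.concat. A sketch with stand-in frames in place of pd.read_csv:

    import pandas as pd

    frames = []
    for dataset_name in ['helen', 'megaface']:
        _df = pd.DataFrame({'lat': [1.0], 'dataset': [dataset_name]})  # stand-in for pd.read_csv(fp_csv)
        frames.append(_df[_df.lat != 0])

    df = pd.concat(frames, ignore_index=True)
    assert len(df) == 2
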