add msc working utils

author: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
committer: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
commit: 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree: 86c37309ff5bcb62716638562489ddb747c16159
parent: e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
17 files changed, 2319 insertions, 215 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 98d36b5f..5ce0a678 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -95,6 +95,7 @@ DIR_COMMANDS_FAISS = 'commands/faiss'
 DIR_COMMANDS_MISC  = 'commands/misc'
 DIR_COMMANDS_SITE = 'commands/site'
 DIR_COMMANDS_DEMO = 'commands/demo'
+DIR_COMMANDS_MSC = 'commands/msc'
 
 # -----------------------------------------------------------------------------
 # Filesystem settings
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index ed717662..0b81fef6 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t
   help='Output path')
 @click.option('-t', '--threads', 'opt_threads', default=8,
   help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
 @click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents):
   """Threaded image/file downloader"""
   
   """
@@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
       urllib.request.urlretrieve(item['url'], fp_out)
       item['status'] = True
     except Exception as e:
+      log.debug(f'Failed: user: {item["username"]}, url: {url}')
       if str(e) != 'HTTP Error 403: Forbidden':
         log.debug(f'Error: {e}')
       fp_error = f'{fp_out}_error.txt'
@@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
   # setup multithreading data holders
   log.debug(f'loading {opt_fp_in}')
   df_records = pd.read_csv(opt_fp_in)
+  if opt_slice:
+    df_records = df_records[opt_slice[0]:opt_slice[1]]
   log.debug(f'loaded {len(df_records):,} csv records')
   log.debug('deduplicating')
   df_records = df_records.drop_duplicates(subset='sha256', keep="last")
@@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
 
   for x in tqdm(records):
     sha256 = x['sha256']
-    
+    username = x['username']
     fp_dst = join(opt_fp_out, f"{sha256}.json")
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
     if not (fp_dst_is_file or fp_dst_is_err):
       url = url_prefix + sha256 + '.json'
       user_agent = user_agents[randint(0, len(user_agents)) - 1]
-      pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+      pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent})
     else:
       n_skipped += 1
 
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index c64afbba..45ca8f6e 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -6,9 +6,9 @@ import click
   help='Input')
 @click.option('-o', '--output', 'opt_fp_out', required=True,
   help='Output')
-@click.option('-t', '--threads', 'opt_threads', default=8,
+@click.option('-t', '--threads', 'opt_threads', default=8, show_default=True,
   help='Number of threads')
-@click.option('--wayback', 'opt_wayback', is_flag=True,
+@click.option('--wayback', 'opt_wayback', is_flag=True, default=False,
   help='Check Wayback archive for URL and download cached image')
 @click.pass_context
 def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
@@ -52,7 +52,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
       estr = str(e)
       if item['opt_wayback']:
         if 'HTTP Error' in estr:
-          # check 
+          # TODO add/parse/handle request for wayback machine archive
           url_wayback = url_wayback_base + item['url']
       fp_error = f'{fp_out}_error.txt'
       with open(fp_error, 'w') as fp:
@@ -67,6 +67,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
 
 
   pool_items = []
+  log.debug(f'Initializing multithreaded pool...')
   for x in tqdm(records):
     fp_dst = join(opt_fp_out, x['filepath'])
     fp_dst_is_file = Path(fp_dst).is_file()
@@ -75,7 +76,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
       pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
-  log.info(f'processing {num_items:,} items')
+  log.info(f'Going to download {num_items:,} files')
   pool_results = []
 
   # run the multithreading with progress bar
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
index 780ede49..f09f3089 100644
--- a/megapixels/commands/datasets/flickr_api.py
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter
 from app.utils import file_utils, im_utils
 
 
-query_types = ['photo_id', 'album_id', 'flickr_id']
+query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
+# ???
+# photo_id: 123456789
 # flickr_id: 123456789@N01
-# photo_id: 
 
 log = logger_utils.Logger.getLogger()
 
@@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger()
   help='Output directory')
 @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
   help='Slice list of files')
-@click.option('--query-type', 'opt_query_type', default='photo_id',
+@click.option('-q', '--query', 'opt_query_type', required=True,
   type=click.Choice(query_types),
   help='API query type')
 @click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   # process  
   
   if not opt_api_key or not opt_api_secret:
-    log.error('source .env vars for Flickr API and try again')
+    log.error('source env/flickr.env vars for Flickr API and try again')
     return
 
   # check how many flickr keys
   api_keys = []
   api_secrets = []
-  for i in range(1,10):
+  for i in range(1,20):
     try:
       var_name_key = f'FLICKR_API_KEY_{i}'
       var_name_secret = f'FLICKR_API_SECRET_{i}'
@@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   log.info(f'Shuffling between: {len(api_keys)} api keys')
 
   # read in CSV
-  # | query, filepath |
-
-  records = pd.read_csv(opt_fp_in).to_dict('records')
+  # | query, filename, count |
+  df_records = pd.read_csv(opt_fp_in)
+  log.info(f'Dedpuplicating {len(df_records)}')
+  if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile':
+    df_records = df_records.drop_duplicates(subset='nsid', keep="last")
+  else:
+    df_records = df_records.drop_duplicates(subset='photo_id', keep="last")
+  log.info(f'After deduplication: {len(df_records)}')
+  records = df_records.to_dict('records')
+  
   if opt_slice:
     records = records[opt_slice[0]:opt_slice[1]]
   
@@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   
 
   for record in tqdm(records):
-    fp_out = join(opt_fp_out, record['filepath'])
+    if 'nsid' in opt_query_type:
+      fp_out = join(opt_fp_out, f"{record['nsid']}.json")
+    else:
+      fp_out = join(opt_fp_out, f'{record["photo_id"]}.json')
+
     fp_out_err = fp_out + '_error.txt'
     if Path(fp_out).is_file() or Path(fp_out_err).is_file():
       continue
-    # append relevant data
     try:
       # shuffle the api keys to avoid rate limiting
       rand_int = randint(0,len(api_keys)-1)
       api_key = api_keys[rand_int]
       api_secret = api_secrets[rand_int]
-                                      
-      #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
 
-      #photo = flickr_api.Photo(id=record['query'])
-      # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
-      photo_id = record['query']
-      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
-      flickr_url += f'&api_key={api_key}'
-      flickr_url += f'&photo_id={photo_id}'
-      flickr_url += '&format=json'
-      flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_url':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_profile':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      elif opt_query_type == 'photo_id':
+        # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+        fp_out
+        photo_id = record['photo_id']
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&photo_id={photo_id}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
 
       with urllib.request.urlopen(flickr_url) as url:
         data = json.loads(url.read().decode())
 
       if data['stat'] =='fail':
-        raise Exception('failed')
+        error_msg = data["message"]
+        log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
+        if error_msg == 'Service currently unavailable':
+          time.sleep(10)
+        raise Exception(error_msg)
       elif data['stat'] =='ok':
         with open(fp_out, 'w') as fp:
           json.dump(data, fp, sort_keys=True, indent=2)
       
-    #except FlickrAPIError as e:
     except Exception as e:
-      # if "HTTP Server Error 500" in str(e):
-      log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
-      if "not found" in str(e) or 'failed' in str(e):
+      log.error(f'{e}')
+      if "not found" in str(e) or 'Invalid NSID provided' in str(e):
         with open(fp_out_err, 'w') as fp:
           fp.write('')
     
diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py
new file mode 100644
index 00000000..5b5f0ce3
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api_to_csv.py
@@ -0,0 +1,382 @@
+"""
+Converts directory of JSON API output files to CSV format
+"""
+
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+query_types = ['nsid_profile', 'nsid_url', 'photo_id']
+
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+  help='Force overwrite')
+@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type):
+  """Converts a directory of Flickr API JSON files (one per photo or NSID)
+  into a single CSV file; the JSON layout is selected by --query"""
+  from tqdm import tqdm
+  from glob import glob
+  import json
+
+
+  # -------------------------------------------------
+  # process
+  if Path(opt_fp_out).is_file() and not opt_force:
+    log.error('File exists. Use "--force" to overwrite it')
+    return
+
+  fp_files = glob(join(opt_fp_in, '*.json'))
+  fp_files = [f for f in fp_files if 'error' not in Path(f).name]  # test the file name only, not parent dirs
+  if opt_slice:
+    fp_files = fp_files[opt_slice[0]:opt_slice[1]]
+
+  log.debug(f'Found {len(fp_files)} files')
+  items = []
+
+  for fp_file in tqdm(fp_files):
+    # build one flat record (one CSV row) per JSON file
+    if opt_query_type == 'photo_id':
+      try:
+        photo = file_utils.load_json(fp_file).get('photo')
+      except Exception as e:
+        log.error(f'{e}, skipping: {fp_file}')
+        continue
+      dates = photo.get('dates')
+      posted = dates.get('posted')
+      taken = dates.get('taken')
+      description = photo.get('description').get('_content')
+      location = photo.get('location', {})
+      country = location.get('country', {})
+      location_country = country.get('_content', '')  # country name is stored under "_content"
+      location_place = country.get('place_id', '')
+      location_woeid = country.get('woeid', '')
+      location_lat = location.get('latitude', '')
+      location_lon = location.get('longitude', '')
+      location_place_id = location.get('place_id', '')
+      owner = photo.get('owner')
+      nsid = owner.get('nsid')
+      path_alias = owner.get('path_alias')
+      owner_realname = owner.get('realname')
+      owner_username = owner.get('username')
+      owner_location = owner.get('location')
+      photo_id = Path(fp_file).stem
+      server = photo.get('server')
+      farm = photo.get('farm')
+      secret = photo.get('secret')
+      # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg
+      image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'
+
+      obj = {
+        'posted': posted,
+        'taken': taken,
+        'description': description,
+        'country': location_country,
+        'place': location_place,
+        'woeid': location_woeid,
+        'lat': location_lat,
+        'lon': location_lon,
+        'place_id': location_place_id,
+        'nsid': nsid,
+        'path_alias': path_alias,
+        'realname': owner_realname,
+        'username': owner_username,
+        'owner_location': owner_location,
+        'photo_id': photo_id,
+        'secret': secret,
+        'url': image_url
+      }
+
+
+    elif opt_query_type == 'nsid_profile':
+      obj = file_utils.load_json(fp_file).get('profile')
+      obj.pop('showcase_set', None)  # tolerate profiles that lack these optional keys
+      obj.pop('showcase_set_title', None)
+      obj.pop('pinterest', None)
+      obj.pop('tumblr', None)
+    elif opt_query_type == 'nsid_url':
+      obj = file_utils.load_json(fp_file).get('user')
+    elif opt_query_type == 'user_profile':  # NOTE(review): not in query_types, so unreachable via the CLI
+      metadata = file_utils.load_json(fp_file).get('photo')
+      owner = metadata.get('owner')
+      path_alias = owner.get('path_alias')
+      nsid = owner.get('nsid')
+      username = owner.get('username')
+      realname = owner.get('realname')
+      description = metadata.get('description').get('_content')
+      title = metadata.get('title').get('_content')
+      location = metadata.get('location')
+      dates = metadata.get('dates')
+      date_taken = dates.get('taken')
+      date_posted = dates.get('posted')
+      fname = Path(fp_file).stem
+      obj = {
+        'photo_id': fname,
+        'nsid': nsid,
+        'path_alias': path_alias,
+        'username': username,
+        'realname': realname,
+        'title': title,
+        'description': description,
+        'location': location,
+        'date_taken': date_taken,
+        'date_posted': date_posted
+        }
+
+    items.append(obj)
+
+  # convert to DataFrame and write all records out as CSV
+  df = pd.DataFrame.from_dict(items)
+  df.to_csv(opt_fp_out, index=False)
+  log.info(f'Wrote {len(df)} to {opt_fp_out}')
+
+"""
+nsid_url
+  {
+  "stat": "ok",
+  "user": {
+    "nsid": "7153718@N04",
+    "url": "https://www.flickr.com/people/babyfish4/"
+  }
+}
+"""
+"""
+  location: of the owner
+  dateuploaded
+  license
+  "dates":
+    "lastupdate": "1416447096"
+    "posted": "1112900873"
+    "taken": "2005-04-06 18:37:38"
+  description:
+    _content: playing in a field
+  title:
+    _content: jessica
+  location: cornwall, uk
+"""
+
+"""
+  {
+  "profile": {
+    "city": null,
+    "country": null,
+    "facebook": "",
+    "first_name": null,
+    "hometown": "",
+    "id": "7153718@N04",
+    "instagram": "",
+    "join_date": "1172669959",
+    "last_name": null,
+    "nsid": "7153718@N04",
+    "occupation": "",
+    "pinterest": "",
+    "profile_description": "",
+    "showcase_set": "72157680616398790",
+    "showcase_set_title": "Profile Showcase",
+    "tumblr": "",
+    "twitter": ""
+  },
+  "stat": "ok"
+}
+"""
+
+"""
+photo_id
+
+
+  {
+  "photo": {
+    "comments": {
+      "_content": "0"
+    },
+    "dates": {
+      "lastupdate": "0",
+      "posted": "1094612969",
+      "taken": "2004-09-04 22:41:18",
+      "takengranularity": "0",
+      "takenunknown": 0
+    },
+    "dateuploaded": "1094612969",
+    "description": {
+      "_content": ""
+    },
+    "editability": {
+      "canaddmeta": 0,
+      "cancomment": 0
+    },
+    "farm": 1,
+    "geoperms": {
+      "iscontact": 0,
+      "isfamily": 0,
+      "isfriend": 0,
+      "ispublic": 1
+    },
+    "id": "371498",
+    "isfavorite": 0,
+    "license": "1",
+    "location": {
+      "accuracy": "15",
+      "context": "0",
+      "country": {
+        "_content": "United States",
+        "place_id": "nz.gsghTUb4c2WAecA",
+        "woeid": "23424977"
+      },
+      "county": {
+        "_content": "Tompkins",
+        "place_id": "1uCJJtBQUL80G6hbPw",
+        "woeid": "12589366"
+      },
+      "latitude": "42.399028",
+      "longitude": "-76.652519",
+      "place_id": "1uCJJtBQUL80G6hbPw",
+      "region": {
+        "_content": "New York",
+        "place_id": "ODHTuIhTUb75gdBu",
+        "woeid": "2347591"
+      },
+      "woeid": "12589366"
+    },
+    "media": "photo",
+    "notes": {
+      "note": []
+    },
+    "originalformat": "jpg",
+    "originalsecret": "704f392686",
+    "owner": {
+      "iconfarm": 1,
+      "iconserver": "1",
+      "location": "Los Angeles, CA, USA",
+      "nsid": "48600072071@N01",
+      "path_alias": "barb",
+      "realname": "Barb Dybwad",
+      "username": "doctor paradox"
+    },
+    "people": {
+      "haspeople": 0
+    },
+    "publiceditability": {
+      "canaddmeta": 0,
+      "cancomment": 1
+    },
+    "rotation": 0,
+    "safety_level": "0",
+    "secret": "704f392686",
+    "server": "1",
+    "tags": {
+      "tag": [
+        {
+          "_content": "unfound",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-9017",
+          "machine_tag": 0,
+          "raw": "unfound"
+        },
+        {
+          "_content": "digicam",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-40406",
+          "machine_tag": 0,
+          "raw": "digicam"
+        },
+        {
+          "_content": "upstateny",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-9655",
+          "machine_tag": 0,
+          "raw": "upstateny"
+        },
+        {
+          "_content": "musefest",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-72456",
+          "machine_tag": 0,
+          "raw": "musefest"
+        },
+        {
+          "_content": "musicfestival",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-72628",
+          "machine_tag": 0,
+          "raw": "musicfestival"
+        },
+        {
+          "_content": "people",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-290",
+          "machine_tag": 0,
+          "raw": "people"
+        },
+        {
+          "_content": "portrait",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-278",
+          "machine_tag": 0,
+          "raw": "portrait"
+        },
+        {
+          "_content": "maco",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-19439",
+          "machine_tag": 0,
+          "raw": "maco"
+        }
+      ]
+    },
+    "title": {
+      "_content": "maco2"
+    },
+    "urls": {
+      "url": [
+        {
+          "_content": "https://www.flickr.com/photos/barb/371498/",
+          "type": "photopage"
+        }
+      ]
+    },
+    "usage": {
+      "canblog": 0,
+      "candownload": 1,
+      "canprint": 0,
+      "canshare": 1
+    },
+    "views": "290",
+    "visibility": {
+      "isfamily": 0,
+      "isfriend": 0,
+      "ispublic": 1
+    }
+  },
+  "stat": "ok"
+}
+"""
+\ No newline at end of file
diff --git a/megapixels/commands/msc/count.py b/megapixels/commands/msc/count.py
new file mode 100644
index 00000000..3c242bc6
--- /dev/null
+++ b/megapixels/commands/msc/count.py
@@ -0,0 +1,123 @@
+from os.path import join
+
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+# datasets
+dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there', 'vgg_face']
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input file for embassies')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+  help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
+  """Cross references embassy Flickr NSIDs against each dataset's counts and
+  filepaths CSVs; writes match, image, and Malta-only image CSVs"""
+  import sys
+  from os.path import join
+  from glob import glob
+  from pathlib import Path
+  import time
+
+  import pandas as pd
+  from tqdm import tqdm
+
+  log = Logger.getLogger()
+  log.info('Cross reference embassy list')
+
+
+  fp_counts = {}
+  fp_filepaths = {}
+  fp_dataset_base = '/data_store/datasets/people/'
+
+  for dk in dataset_keys:
+    fp_counts[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_counts.csv')
+    fp_filepaths[dk] = join(fp_dataset_base, dk, f'metadata/{dk}_filepaths.csv')
+
+  df_embassies = pd.read_csv(opt_fp_in)
+  df_embassies.fillna('', inplace=True)
+  embassy_nsids = set(df_embassies['nsid'])  # set: O(1) membership test in the loop below
+
+  match_items = []
+  embassy_images = []
+  malta_images = []
+
+  for dataset_key, fp_dataset in tqdm(fp_counts.items()):
+    df_counts = pd.read_csv(fp_dataset)
+    log.debug(f'loading: {fp_filepaths[dataset_key]}')
+    df_filepaths = pd.read_csv(fp_filepaths[dataset_key])
+    nsids = list(df_counts['nsid'])
+    for nsid in nsids:
+      if nsid in embassy_nsids:
+        # add to matches, and count
+        count = df_counts[df_counts['nsid'] == nsid]['count'].values[0]
+        first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
+        last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
+        path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
+        page_url = f'https://flickr.com/photos/{path_alias}'
+        embassy_name = f'{first_name} {last_name}'
+        embassy_meta = df_embassies[df_embassies['nsid'] == nsid].iloc[0]
+
+        match_obj = {
+          'count': count,
+          'path_alias': path_alias,
+          'name': embassy_name,
+          'dataset_key': dataset_key,
+          'nsid': nsid,
+          'page_url': page_url,
+          'embassy_type': embassy_meta.type,
+          'username': embassy_meta.username
+        }
+        match_items.append(match_obj)
+
+        # add photo ids or url
+        df_nsids = df_filepaths[df_filepaths['nsid'] == nsid]
+        nsid_records = df_nsids.to_dict('records')
+        for nsid_record in nsid_records:
+          photo_id = nsid_record.get('photo_id')
+          im_obj = {
+            'nsid': nsid,
+            'url': nsid_record.get('url'),
+            'photo_id': photo_id,
+            'dataset_key': dataset_key,
+            'path_alias': path_alias,
+            'name': embassy_name,
+            'page_url': page_url,
+            'username': embassy_meta.username,
+            'filepath': f'{photo_id}.jpg'
+          }
+
+          embassy_images.append(im_obj)
+          if nsid == '51226353@N03':
+            malta_images.append(im_obj)
+
+  # Save embassy matches (an empty match list has no 'count' column to sum)
+  df = pd.DataFrame.from_dict(match_items)
+  df.to_csv(opt_fp_out, index=False)
+  total = int(df['count'].sum()) if match_items else 0
+  log.debug(f'Found {total:,} embassy photos across matched datasets')
+  # Save image matches
+  df = pd.DataFrame.from_dict(embassy_images)
+  fp_out = opt_fp_out.replace('.csv', '_images.csv')
+  df.to_csv(fp_out, index=False)
+  total = len(embassy_images)
+  log.debug(f'wrote {fp_out}')
+  log.debug(f'Found {total:,} embassy images')
+
+  # Save malta images
+  df = pd.DataFrame.from_dict(malta_images)
+  fp_out = opt_fp_out.replace('.csv', '_images_malta.csv')
+  df.to_csv(fp_out, index=False)
+  total = len(malta_images)
+  log.debug(f'wrote {fp_out}')
+  log.debug(f'Found {total:,} malta embassy images')
+\ No newline at end of file
diff --git a/megapixels/commands/msc/cross_reference.py b/megapixels/commands/msc/cross_reference.py
deleted file mode 100644
index d4457945..00000000
--- a/megapixels/commands/msc/cross_reference.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from os.path import join
-
-import click
-
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# source file for Embassy NSIDs
-fp_in_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr_ext.csv'
-
-# list of datasets to cross reference
-dataset_keys = ['pipa', 'megaface', 'helen', 'ibm_dif', 'adience', 'who_goes_there']
-fp_dataset_base = '/data_store/datasets/people/'
-fp_datasets = {}
-for dk in dataset_keys:
-  fp_datasets[dk] = join(fp_dataset_base, dk, f'research/{dk}_flickr_meta.csv')
-
-
-# output file
-fp_out = '/data_store/datasets/msc/embassies/embassies_scores.csv'
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in_embassies,
-  help='Input file for embassies')
-@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
-  help='Output file')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-  help='Force overwrite')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force):
-  """Cross reference"""
-  
-  import sys
-  from os.path import join
-  from glob import glob
-  from pathlib import Path
-  import time
-
-  import pandas as pd
-  from tqdm import tqdm
-
-  log = Logger.getLogger()
-  log.info('Cross reference embassy list')
-
-  df_embassies = pd.read_csv(opt_fp_in)
-  df_embassies.fillna('', inplace=True)
-  embassy_nsids = list(df_embassies['nsid'])
-
-  match_items = []
-  for dataset_key, fp_dataset in fp_datasets.items():
-    df_dataset = pd.read_csv(fp_dataset)
-    nsids = list(df_dataset['nsid'])
-    for nsid in nsids:
-      if nsid in embassy_nsids:
-        # add to matches, and count
-        count = df_dataset[df_dataset['nsid'] == nsid]['count'].values[0]
-        first_name = df_embassies[df_embassies['nsid'] == nsid]['first_name'].values[0]
-        last_name = df_embassies[df_embassies['nsid'] == nsid]['last_name'].values[0]
-        path_alias = df_embassies[df_embassies['nsid'] == nsid]['path_alias'].values[0]
-        log.debug(f'{first_name} {last_name}, {path_alias} count: {count}, in {dataset_key}')
-        match_obj = {
-          'count': count,
-          'path_alias': path_alias,
-          'name': f'{first_name} {last_name}',
-          'dataset_key': dataset_key,
-          'nsid': nsid
-        }
-        match_items.append(match_obj)
-
-  df = pd.DataFrame.from_dict(match_items)
-  df.to_csv(opt_fp_out, index=False)
-  
-  total = df['count'].sum()
-  
-  log.debug(f'Found {total} embassy photos')
-\ No newline at end of file
diff --git a/megapixels/commands/msc/summarize.py b/megapixels/commands/msc/summarize.py
index d5d251db..045e3b69 100644
--- a/megapixels/commands/msc/summarize.py
+++ b/megapixels/commands/msc/summarize.py
@@ -29,7 +29,7 @@ def cli(ctx, opt_fp_in, opt_fp_out):
 
   log = Logger.getLogger()
 
-  dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'duke_mtmc', 'brainwash', 'msceleb', 'uccs']
+  dataset_names = ['helen', 'megaface', 'adience', 'pipa', 'lfpw', 'brainwash', 'msceleb', 'duke_mtmc', 'uccs']
   
   df = pd.DataFrame()
   fp_out = opt_fp_out.replace('.csv', '_citations.csv')
@@ -37,10 +37,11 @@ def cli(ctx, opt_fp_in, opt_fp_out):
     fp_csv = join(opt_fp_in, f'{dataset_name}.csv')
     _df = pd.read_csv(fp_csv)
     _df = _df[_df.lat != 0]
+    _df.drop('id', axis=1, inplace=True)
     print(dataset_name, len(_df))
     df = df.append(_df, ignore_index=True)
 
-  df.to_csv(opt_fp_out, index=False)
+  df.to_csv(fp_out, index=False)
 
   # create country summary
   fp_out = opt_fp_out.replace('.csv', '_countries.csv')
diff --git a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
index a35c3b24..8d3b4251 100644
--- a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb
@@ -29,41 +29,145 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Create CSV for API"
+    "## Cleanup filepaths CSV"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_in_photo_ids = '/data_store_hdd/datasets/people/adience/research/adience_photo_ids.csv'\n",
+    "fp_in_flickr_api_dump = '/data_store_hdd/datasets/people/adience/research/adience_flickr_api_dump.csv'\n",
+    "fp_in_flickr_api_dump_photo_ids = '/data_store_hdd/datasets/people/adience/research/flickr_api_dump_photo_id.csv'\n",
+    "fp_out_filepaths = '/data_store_hdd/datasets/people/adience/research/adience_filepaths.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "| filepath | query | count |\n",
-    "|:---|:---|:---|\n",
-    "| 12234 | 12234@123| 10 |"
+    "# photo id list\n",
+    "df = pd.read_csv(fp_in_photo_ids)\n",
+    "records = df.to_dict('records')\n",
+    "\n",
+    "# photo id --> url list\n",
+    "df_api_urls = pd.read_csv(fp_in_flickr_api_dump_photo_ids)\n",
+    "api_urls = df_api_urls.to_dict('records')\n",
+    "\n",
+    "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api_dump)\n",
+    "flickr_api_dump = df_flickr_api_dump.to_dict('records')\n",
+    "\n",
+    "# create lookup table for user info?\n",
+    "flickr_api_lookup = {}\n",
+    "for api_item in flickr_api_dump:\n",
+    "  nsid = api_item['nsid']\n",
+    "  flickr_api_lookup[nsid] = api_item\n",
+    "  \n",
+    "# create lookup table: photo_id --> API url record\n",
+    "api_url_lookup = {}\n",
+    "for api_url_item in api_urls:\n",
+    "  photo_id = api_url_item['photo_id']\n",
+    "  api_url_lookup[photo_id] = api_url_item\n",
+    "  \n",
+    "results = []\n",
+    "for record in records:\n",
+    "  photo_id = record['photo_id']\n",
+    "  if photo_id in api_url_lookup.keys():\n",
+    "    api_item = api_url_lookup.get(photo_id)\n",
+    "    url = api_item.get('url')\n",
+    "    nsid = api_item.get('nsid')\n",
+    "    obj = {\n",
+    "      'filepath': f'{photo_id}.jpg',\n",
+    "      'nsid': nsid,\n",
+    "      'photo_id': photo_id,\n",
+    "      'url': url\n",
+    "    }\n",
+    "    results.append(obj)\n",
+    "    \n",
+    "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the file meta csv\n",
+    "results = []\n",
+    "results_download = []\n",
+    "for flickr_meta_record in flickr_meta_records:\n",
+    "  # farm, server, photo id, secret\n",
+    "  photo_id = str(flickr_meta_record['photo_id'])\n",
+    "  nsid = flickr_meta_record.get('nsid')\n",
+    "  fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
+    "  json_data = file_utils.load_json(fp_json)\n",
+    "  photo_meta = json_data.get('photo')\n",
+    "  farm = photo_meta.get('farm')\n",
+    "  server = photo_meta.get('server')\n",
+    "  secret = photo_meta.get('secret')\n",
+    "  # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
+    "  url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
+    "  obj = {\n",
+    "    'nsid': nsid,\n",
+    "    'photo_id': photo_id,\n",
+    "    'url': url,\n",
+    "    'filepath': f'{photo_id}.jpg'\n",
+    "  }\n",
+    "  results.append(obj)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
    "source": [
-    "fp_in_dir = '/data_store/datasets/people/adience/dataset/'\n",
-    "fp_out_queries = '/data_store/datasets/people/adience/research/adience_flickr_api_queries.csv'"
+    "## Create Photo ID list"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "9\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "fp_files = glob(join(fp_in_dir, '*.txt'))\n",
     "print(len(fp_files))"
@@ -71,7 +175,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -94,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -103,7 +207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,7 +218,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -123,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -132,24 +236,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "10804\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(len(df_images))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb
new file mode 100644
index 00000000..3d571aff
--- /dev/null
+++ b/megapixels/notebooks/datasets/ffhq/prepare_flickr_meta.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download FFHQ Images\n",
+    "\n",
+    "- https://github.com/NVlabs/ffhq-dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "import requests\n",
+    "import json\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels')\n",
+    "from app.utils import file_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load JSON\n",
+    "fp_in = '/data_store/datasets/people/ffhq/ffhq-dataset-v1.json'\n",
+    "fp_out = '/data_store/datasets/people/ffhq/research/flickr_api_urls.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(fp_in, 'r') as fp:\n",
+    "  ffhq_items = json.load(fp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "217c694742e8408d871c3b41183676fb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = []\n",
+    "# get photos urls\n",
+    "for idx, ffhq_item in tqdm(ffhq_items.items()):\n",
+    "  url = ffhq_item.get('metadata').get('photo_url')\n",
+    "  photo_id = Path(url).stem\n",
+    "  obj = {'photo_id': photo_id}\n",
+    "  results.append(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'photo_id': '1133484654'}"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame.from_dict(results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop_duplicates(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(fp_out, index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
index 311d3462..140b6361 100644
--- a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
@@ -40,6 +40,110 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create filepaths CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_flickr_meta = '/data_store_hdd/datasets/people/helen/research/helen_flickr_api_dump.csv'\n",
+    "fp_photo_ids = '/data_store_hdd/datasets/people/helen/research/helen_flickr_photo_ids.csv'\n",
+    "fp_filepaths = '/data_store_hdd/datasets/people/helen/research/helen_file_meta.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_photo_ids = pd.read_csv(fp_photo_ids)\n",
+    "photo_ids = df_photo_ids.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_flickr_meta = pd.read_csv(fp_flickr_meta, dtype={'photo_id': str})\n",
+    "flickr_meta_records = df_flickr_meta.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1854\n",
+      "2122\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(flickr_meta_records))\n",
+    "print(len(df_photo_ids))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the file meta csv\n",
+    "results = []\n",
+    "results_download = []\n",
+    "for flickr_meta_record in flickr_meta_records:\n",
+    "  # farm, server, photo id, secret\n",
+    "  photo_id = str(flickr_meta_record['photo_id'])\n",
+    "  nsid = flickr_meta_record.get('nsid')\n",
+    "  fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
+    "  json_data = file_utils.load_json(fp_json)\n",
+    "  photo_meta = json_data.get('photo')\n",
+    "  farm = photo_meta.get('farm')\n",
+    "  server = photo_meta.get('server')\n",
+    "  secret = photo_meta.get('secret')\n",
+    "  # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
+    "  url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
+    "  obj = {\n",
+    "    'nsid': nsid,\n",
+    "    'photo_id': photo_id,\n",
+    "    'url': url,\n",
+    "    'filepath': f'{photo_id}.jpg'\n",
+    "  }\n",
+    "  results.append(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_out = pd.DataFrame.from_dict(results)\n",
+    "df_out.to_csv(fp_filepaths, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
    "cell_type": "code",
    "execution_count": 16,
    "metadata": {},
diff --git a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
index ff41e799..6d2b768a 100644
--- a/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/ibm_dif/prepare_flickr_api.ipynb
@@ -29,70 +29,353 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Create CSV for API"
+    "## IBM DiF clean CSVs\n",
+    "\n",
+    "- 2283 files could not be downloaded or accessed in the API\n",
+    "- these images were downloaded, but possibly no longer exist"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 60,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "# flickr api data\n",
+    "fp_in_meta_flickr = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_flickr.csv'\n",
+    "\n",
+    "# api query dump\n",
+    "fp_in_flickr_api = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
+    "\n",
+    "# ibm count data\n",
+    "fp_in_meta_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths.csv'\n",
+    "fp_meta_filepaths_adj = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_meta_filepaths_adj.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  interactivity=interactivity, compiler=compiler, result=result)\n"
+     ]
+    }
+   ],
    "source": [
-    "| photo_id |\n",
-    "|:---|\n",
-    "| 12234 |"
+    "df_meta_filepaths = pd.read_csv(fp_in_meta_filepaths)\n",
+    "meta_filepaths = df_meta_filepaths.to_dict('records')\n",
+    "df_meta_flickr = pd.read_csv(fp_in_meta_flickr)\n",
+    "meta_flickr = df_meta_flickr.to_dict('records')\n",
+    "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "98155\n",
+      "98155\n",
+      "98153\n",
+      "100438\n",
+      "98154\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(df_flickr_api_dump))\n",
+    "print(len(df_flickr_api_dump.drop_duplicates(subset='nsid')))\n",
+    "print(len(df_meta_flickr))\n",
+    "print(len(df_meta_filepaths))\n",
+    "print(len(df_meta_filepaths.drop_duplicates(subset='nsid')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# flickr api data\n",
-    "fp_in_flickr_meta = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
-    "# ibm count data\n",
-    "fp_in_ibm_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n",
-    "# output\n",
-    "fp_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'"
+    "# drop empty NSIDs\n",
+    "df_meta_filepaths.drop_duplicates(subset='nsid', inplace=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# load ibm data and create count lookup with photoid\n",
-    "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
-    "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
-    "count_lookup = {}\n",
-    "for ibm_meta_record in ibm_meta_records:\n",
-    "  photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
-    "  count_lookup[photo_id] = ibm_meta_record['count']"
+    "df_meta_filepaths.to_csv(fp_meta_filepaths_adj, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nsid_filepaths = {}\n",
+    "dupes = []\n",
+    "for meta_filepath in meta_filepaths:\n",
+    "  nsid = meta_filepath['nsid']\n",
+    "  if nsid not in nsid_filepaths.keys():\n",
+    "    nsid_filepaths[nsid] = meta_filepath\n",
+    "  else:\n",
+    "    dupes.append(meta_filepath)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "98154\n",
+      "2284\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(nsid_filepaths))\n",
+    "print(len(dupes))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'filepath': '12537662393_247b2187ee.jpg', 'nsid': nan, 'photo_id': 12537662393, 'url': 'http://farm6.staticflickr.com/5476/12537662393_247b2187ee.jpg'}\n",
+      "{'filepath': '5837222502_29aaf5bb53.jpg', 'nsid': nan, 'photo_id': 5837222502, 'url': 'http://farm4.staticflickr.com/3089/5837222502_29aaf5bb53.jpg'}\n",
+      "{'filepath': '10859466623_4ceb1564dc.jpg', 'nsid': nan, 'photo_id': 10859466623, 'url': 'http://farm6.staticflickr.com/5530/10859466623_4ceb1564dc.jpg'}\n",
+      "{'filepath': '13719567455_fb96dc7ac6.jpg', 'nsid': nan, 'photo_id': 13719567455, 'url': 'http://farm4.staticflickr.com/3718/13719567455_fb96dc7ac6.jpg'}\n",
+      "{'filepath': '3486554266_ca1fc7d99c.jpg', 'nsid': nan, 'photo_id': 3486554266, 'url': 'http://farm4.staticflickr.com/3327/3486554266_ca1fc7d99c.jpg'}\n",
+      "{'filepath': '6168324261_d2fb7bbb60.jpg', 'nsid': nan, 'photo_id': 6168324261, 'url': 'http://farm7.staticflickr.com/6166/6168324261_d2fb7bbb60.jpg'}\n",
+      "{'filepath': '13938295982_0d950feba5.jpg', 'nsid': nan, 'photo_id': 13938295982, 'url': 'http://farm8.staticflickr.com/7162/13938295982_0d950feba5.jpg'}\n",
+      "{'filepath': '8881073633_546b6dbfe5.jpg', 'nsid': nan, 'photo_id': 8881073633, 'url': 'http://farm6.staticflickr.com/5459/8881073633_546b6dbfe5.jpg'}\n",
+      "{'filepath': '10918515734_404eb29879.jpg', 'nsid': nan, 'photo_id': 10918515734, 'url': 'http://farm6.staticflickr.com/5502/10918515734_404eb29879.jpg'}\n",
+      "{'filepath': '3236533532_05cacef8e9.jpg', 'nsid': nan, 'photo_id': 3236533532, 'url': 'http://farm4.staticflickr.com/3425/3236533532_05cacef8e9.jpg'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "for dupe in dupes[:10]:\n",
+    "  print(dupe)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "100438\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(dupes))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "98153\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(nsid_groups))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "100436\n"
+     ]
+    }
+   ],
+   "source": [
+    "fp_ims = glob('/data_store_hdd/datasets/people/ibm_dif/downloads/images/*.jpg')\n",
+    "print(len(fp_ims))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "9314013316\n"
+     ]
+    }
+   ],
+   "source": [
+    "photo_ids = [Path(x).stem.split('_')[0] for x in fp_ims]\n",
+    "print(photo_ids[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "'photo_id'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                           Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-45-fd2de6074950>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfilepath_photo_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'photo_id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmeta_flickr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m: 'photo_id'"
+     ]
+    }
+   ],
+   "source": [
+    "filepath_photo_ids = [int(x['nsid']) for x in meta_flickr]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7a9a78bf0e442a5b8445906bc85da99",
+       "version_major": 2,
+       "version_minor": 0
+      },
       "text/plain": [
-       "100438"
+       "HBox(children=(IntProgress(value=0, max=100436), HTML(value='')))"
       ]
      },
-     "execution_count": 69,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# find which photo IDs are no longer accessible\n",
+    "missing_photo_ids = []\n",
+    "for photo_id in tqdm(photo_ids):\n",
+    "  photo_id = int(photo_id)\n",
+    "  if photo_id not in filepath_photo_ids:\n",
+    "    missing_photo_ids.append(photo_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "[]\n"
+     ]
     }
    ],
    "source": [
+    "print(len(missing_photo_ids))\n",
+    "print(missing_photo_ids[0:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'df_flickr_meta' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                          Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-30-75e9fdbbbfbb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtotal\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_flickr_meta\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'count'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'df_flickr_meta' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "total = df_flickr_meta['count'].sum()\n",
+    "print(total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load ibm data and create count lookup with photoid\n",
+    "df_ibm_meta = pd.read_csv(fp_in_ibm_meta)\n",
+    "ibm_meta_records = df_ibm_meta.to_dict('records')\n",
+    "count_lookup = {}\n",
+    "for ibm_meta_record in ibm_meta_records:\n",
+    "  photo_id = int(Path(ibm_meta_record['url']).stem.split('_')[0])\n",
+    "  count_lookup[photo_id] = ibm_meta_record['count']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "len(count_lookup)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,7 +384,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -111,18 +394,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Error: invalid literal for int() with base 10: '', {'country': '', 'description': 'Haircut Next...', 'lat': '', 'lon': '', 'nsid': '', 'owner_location': '', 'path_alias': '', 'photo_id': '', 'place': '', 'place_id': '', 'posted': '', 'realname': '', 'taken': '', 'username': '', 'woeid': ''}\n",
-      "Error: invalid literal for int() with base 10: '', {'country': '', 'description': '', 'lat': '86085317@N00', 'lon': 'New York', 'nsid': 'anonymousthomas', 'owner_location': '4975598', 'path_alias': '', 'photo_id': '', 'place': '1108685469', 'place_id': 'Thomas', 'posted': '2005-02-18 00:11:09', 'realname': 'anonymousthomas', 'taken': '', 'username': '', 'woeid': ''}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# load flickr data\n",
     "for flickr_meta_record in flickr_meta_records:\n",
@@ -143,7 +417,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,55 +434,99 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# photo ids and nsids\n",
     "fp_flickr_api_dump = '/data_store_hdd/datasets/people/ibm_dif/research/flickr_api_query_dump.csv'\n",
-    "fp_out_meta = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_flickr_meta.csv'"
+    "\n",
+    "# file urls\n",
+    "fp_ibm_urls = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'\n",
+    "\n",
+    "# flickr meta\n",
+    "fp_out_filepaths = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_filepaths.csv'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (2,3,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  interactivity=interactivity, compiler=compiler, result=result)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "df = pd.read_csv(fp_flickr_api_dump)\n",
-    "groups = df.groupby('nsid')"
+    "df_flickr_meta = pd.read_csv(fp_flickr_api_dump)\n",
+    "df_flickr_meta.fillna('', inplace=True)\n",
+    "flickr_metas = df_flickr_meta.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "|filepath|nsid|photo_id|url|\n",
+    "```"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = []\n",
-    "for nsid, group in groups:\n",
-    "  obj = {\n",
-    "    'nsid': nsid,\n",
-    "    'count': len(group)\n",
-    "  }\n",
-    "  results.append(obj)"
+    "photo_id_to_nsid = {}\n",
+    "for flickr_meta in flickr_metas:\n",
+    "  photo_id = flickr_meta.get('photo_id')\n",
+    "  if photo_id:\n",
+    "    photo_id = str(int(photo_id))\n",
+    "    photo_id_to_nsid[photo_id] = flickr_meta['nsid']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(list(photo_id_to_nsid.keys())[0:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_ibm_urls = pd.read_csv(fp_ibm_urls)\n",
+    "ibm_urls = df_ibm_urls.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "photo_id_to_url = {}\n",
+    "missed = []\n",
+    "for ibm_url in ibm_urls:\n",
+    "  photo_id = str(ibm_url['filepath'].split('_')[0])\n",
+    "  try:\n",
+    "    ibm_url['photo_id'] = photo_id\n",
+    "    ibm_url['nsid'] = photo_id_to_nsid[photo_id]\n",
+    "  except Exception as e:\n",
+    "#     print(e, photo_id)\n",
+    "    missed.append(photo_id)\n",
+    "print(f'missed: {len(missed)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "pd.DataFrame.from_dict(results).to_csv(fp_out_meta, index=False)"
+    "pd.DataFrame.from_dict(ibm_urls).to_csv(fp_out_filepaths, index=False)"
    ]
   },
   {
diff --git a/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
new file mode 100644
index 00000000..b4a29243
--- /dev/null
+++ b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Count IJB sources"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "import requests\n",
+    "import json\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels')\n",
+    "from app.utils import file_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# input and output CSV paths\n",
+    "fp_in_cs3 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs3_media.csv'\n",
+    "fp_in_cs4 = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/cs4_media.csv'\n",
+    "fp_in_ijb_b = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-B/ijbb_licenses_and_sources.csv'\n",
+    "fp_in_ijb_a = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-A/SOURCES.csv'\n",
+    "fp_out = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB/IJB-C/license/summary.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_cs3 = pd.read_csv(fp_in_cs3)\n",
+    "df_cs4 = pd.read_csv(fp_in_cs4)\n",
+    "df_sources = df_cs3.append(df_cs4)\n",
+    "df_sources.fillna('', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# IJB-B sources (currently disabled)\n",
+    "#df_sources = pd.read_csv(fp_in_ijb_b).fillna('')\n",
+    "# IJB-A sources (replaces the df_sources built from the IJB-C cs3/cs4 CSVs)\n",
+    "df_sources = pd.read_csv(fp_in_ijb_a).fillna('')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sources = df_sources.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {}\n",
+    "others = []\n",
+    "keys = ['flickr.com', 'youtube.com', 'wikipedia.org', 'wikimedia.org']\n",
+    "for k in keys:\n",
+    "  results[k] = []\n",
+    "for source in sources:\n",
+    "  url = str(source['Media URL'])\n",
+    "  media_id = source['Media ID']\n",
+    "  if 'nonfaces' in media_id:\n",
+    "    continue\n",
+    "  found = False\n",
+    "  for k in keys:\n",
+    "    if k in url:\n",
+    "      results[k].append(url)\n",
+    "      found = True\n",
+    "  if not found:\n",
+    "    if url:\n",
+    "      others.append(url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "flickr.com 0\n",
+      "youtube.com 1388\n",
+      "wikipedia.org 0\n",
+      "wikimedia.org 4298\n"
+     ]
+    }
+   ],
+   "source": [
+    "for k,v in results.items():\n",
+    "  print(k, len(set(v)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "siliconangle.com/files/2011/06/kaz-hirai.jpg\n",
+      "etnosi.files.wordpress.com/2012/05/sofi-marinova-baku.jpg\n",
+      "images.coveralia.com/audio/p/Pia_Zadora-When_The_Lights_Go_Out-Interior_Frontal.jpg\n",
+      "4.bp.blogspot.com/-TFHOJVIW3a8/T_1mD6MdOxI/AAAAAAAADAg/PhKDPx0Aqu0/s1600/ivan_pavlov.jpg\n",
+      "863793661388437597-a-1802744773732722657-s-sites.googlegroups.com/site/virginmarysite/Home/jackneosex.jpg\n",
+      "amckiereads.files.wordpress.com/2010/12/darwish.jpg?w=600\n",
+      "img.interia.pl/komputery/nimg/5/7/Kazuo_Hirai_plan_odbudowe_5726348.jpg\n",
+      "2.bp.blogspot.com/-JAYvKsHcQPI/T4f3wbCIMDI/AAAAAAAAFDM/lTs3uKlb3A0/s1600/deeksha_seth_launches_chandana_brothers_showroom_Yellow+Saree+smiling+pics+%25285%2529.jpg\n",
+      "1.bp.blogspot.com/-D3SI27GS7-g/U-iD5fPcFDI/AAAAAAAABOs/VaB_BRRa6OU/s320/news8.jpg\n",
+      "1.bp.blogspot.com/_ilOjS7A_kk4/SVGCtcyAAmI/AAAAAAAAAH4/9-KKBqYeDBA/s400/playstation-3-grill_12.jpg\n"
+     ]
+    }
+   ],
+   "source": [
+    "for other in others[:10]:\n",
+    "  print(other)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "21319"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(results['flickr.com']) +len(results['wikimedia.org']) + len(others)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
index 48133228..3c0dd631 100644
--- a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
@@ -4,12 +4,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Prepare Flickr API Batch CSV"
+    "# MegaFace: Prepare Flickr API Batch CSV"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,6 +29,115 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Create the file meta CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_in_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file.csv'\n",
+    "fp_out_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file_ext.csv'\n",
+    "fp_out_meta_flickr = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_flickr_02.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_files = pd.read_csv(fp_in_meta_files)\n",
+    "df_files.rename(columns={'subdir': 'filepath'}, inplace=True)\n",
+    "file_records = df_files.to_dict('records')\n",
+    "photo_ids = [x['photo_id'] for x in file_records]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d91329c27b8b4fc4ae68eb817ea82e19",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=4753520), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "for file_record in tqdm(file_records):\n",
+    "  photo_id = Path(file_record['url']).stem.split('_')[0]\n",
+    "  filepath = f'{photo_id}.jpg'\n",
+    "  file_record['filepath'] = filepath\n",
+    "\n",
+    "df_meta_file = pd.DataFrame.from_dict(file_records)\n",
+    "df_meta_file.drop_duplicates(inplace=True)\n",
+    "df_meta_file.to_csv(fp_out_meta_files, index=False)\n",
+    "print(f'Wrote {len(df_meta_file)} lines')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create the NSID/count CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total users: 48,382\n",
+      "Total images: 3,311,471\n"
+     ]
+    }
+   ],
+   "source": [
+    "nsid_groups = df_meta_file.groupby('nsid')\n",
+    "results = []\n",
+    "for nsid, group in nsid_groups:\n",
+    "  results.append({'nsid': nsid, 'count': len(group)})\n",
+    "df_meta_flickr = pd.DataFrame.from_dict(results)\n",
+    "df_meta_flickr.to_csv(fp_out_meta_flickr, index=False)\n",
+    "\n",
+    "print(f'Total users: {len(results):,}')\n",
+    "print(f'Total images: {df_meta_flickr[\"count\"].sum():,}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## Create CSV for API"
    ]
   },
diff --git a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb
index 8746a740..57c32bec 100644
--- a/megapixels/notebooks/datasets/pipa/pipa_flickr_metadata_cleanup.ipynb
+++ b/megapixels/notebooks/datasets/pipa/flickr_cleanup.ipynb
@@ -38,12 +38,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "fp_in = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_ext.csv'\n",
-    "fp_out = '/data_store/datasets/people/pipa/research/pipa_flickr_metadata_test.csv'"
+    "fp_in_api_photo_id = '/data_store_hdd/datasets/people/pipa/research/flickr_api_photo_id.csv'\n",
+    "fp_out_filepaths = '/data_store_hdd/datasets/people/pipa/research/pipa_filepaths.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(fp_in_api_photo_id)\n",
+    "records = df.to_dict('records')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = []\n",
+    "for record in records:\n",
+    "  obj = {\n",
+    "    'photo_id': record.get('photo_id'),\n",
+    "    'nsid': record.get('nsid'),\n",
+    "    'url': record.get('url'),\n",
+    "    'secret': record.get('secret'),\n",
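+    "    'filepath': f'{photo_id}_{secret}.jpg'  # NOTE(review): photo_id/secret are not assigned in this cell; confirm record's own values were intended\n",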
+    "  }\n",
+    "  results.append(obj)\n",
+    "  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)"
    ]
   },
   {
diff --git a/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
new file mode 100644
index 00000000..99bbe32e
--- /dev/null
+++ b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# VGG Face (V1) Prepare Flickr API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "from os.path import join\n",
+    "from glob import glob, iglob\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "\n",
+    "import pandas as pd\n",
+    "import sys\n",
+    "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+    "from app.utils import file_utils"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Convert annotation files to list of photo IDs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_dir_annos = '/data_store/datasets/people/vgg_face/downloads/vgg_face_dataset/files/'\n",
+    "fp_photo_ids = '/data_store/datasets/people/vgg_face/research/photo_ids.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b92b24eac4c84f2f96e32f6eba8d2dc0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=2622), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "photo_ids = []\n",
+    "all_photos = []\n",
+    "fp_annos = glob(join(fp_dir_annos, '*.txt'))\n",
+    "for fp_anno in tqdm(fp_annos):\n",
+    "  df_annos = pd.read_csv(fp_anno, delimiter=' ', names=['url', 'a', 'b', 'c', 'd', 'e', 'f', 'g'])\n",
+    "  records = df_annos.to_dict('records')\n",
+    "  for record in records:\n",
+    "    url = record['url']\n",
+    "    all_photos.append(url)\n",
+    "    if 'flickr.com' in url:\n",
+    "      photo_id = Path(url).stem.split('_')[0]\n",
+    "      photo_ids.append({'photo_id': photo_id})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2604849\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(all_photos))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PosixPath('/data_store/datasets/people/vgg_face/research/photo_ids.csv')"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file_utils.ensure_posixpath(fp_photo_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame.from_dict(photo_ids).to_csv(fp_photo_ids, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Convert Flickr API data to filepaths and counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_in_flickr_api = '/data_store_hdd/datasets/people/vgg_face/research/vgg_flickr_api_photo_ids.csv'\n",
+    "fp_out_filepaths = '/data_store_hdd/datasets/people/vgg_face/research/vgg_filepaths.csv'\n",
+    "fp_out_counts = '/data_store_hdd/datasets/people/vgg_face/research/vgg_counts.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(fp_in_flickr_api)\n",
+    "records = df.to_dict('records')\n",
+    "\n",
+    "# write filepaths\n",
+    "results = []\n",
+    "for record in records:\n",
+    "  photo_id = record['photo_id']\n",
+    "  obj = {\n",
+    "    'filepath': f'{photo_id}.jpg',\n",
+    "    'nsid': record['nsid'],\n",
+    "    'photo_id': photo_id,\n",
+    "    'url': record['url']\n",
+    "  }\n",
+    "  results.append(obj)\n",
+    "\n",
+    "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)\n",
+    "\n",
+    "# write counts\n",
+    "results = []\n",
+    "nsid_groups = df.groupby('nsid')\n",
+    "for nsid, group in nsid_groups:\n",
+    "  results.append({'nsid': nsid, 'count': len(group)})\n",
+    "\n",
+    "pd.DataFrame.from_dict(results).to_csv(fp_out_counts, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
+    "df = pd.read_csv(fp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_match = df[df['nsid'] == '50747072@N03']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  bureau country          nsid     path_alias       type  \\\n",
+      "0    EUR  Russia  50747072@N03  otkroyameriku  Consulate   \n",
+      "\n",
+      "                                          url                  username  \\\n",
+      "0  http://www.flickr.com/photos/otkroyameriku  Генконсульство США в СПб   \n",
+      "\n",
+      "  verified notes  \n",
+      "0      NaN   NaN   1\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df_match, len(df_match))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'50747072@N03'"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "match.nsid"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
index c2ec5c84..66f803a4 100644
--- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
@@ -37,6 +37,318 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create filepaths CSV for individual lookup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n",
+    "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_flickr_meta = pd.read_csv(fp_flickr_meta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Help on function drop in module pandas.core.frame:\n",
+      "\n",
+      "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n",
+      "    Drop specified labels from rows or columns.\n",
+      "    \n",
+      "    Remove rows or columns by specifying label names and corresponding\n",
+      "    axis, or by specifying directly index or column names. When using a\n",
+      "    multi-index, labels on different levels can be removed by specifying\n",
+      "    the level.\n",
+      "    \n",
+      "    Parameters\n",
+      "    ----------\n",
+      "    labels : single label or list-like\n",
+      "        Index or column labels to drop.\n",
+      "    axis : {0 or 'index', 1 or 'columns'}, default 0\n",
+      "        Whether to drop labels from the index (0 or 'index') or\n",
+      "        columns (1 or 'columns').\n",
+      "    index, columns : single label or list-like\n",
+      "        Alternative to specifying axis (``labels, axis=1``\n",
+      "        is equivalent to ``columns=labels``).\n",
+      "    \n",
+      "        .. versionadded:: 0.21.0\n",
+      "    level : int or level name, optional\n",
+      "        For MultiIndex, level from which the labels will be removed.\n",
+      "    inplace : bool, default False\n",
+      "        If True, do operation inplace and return None.\n",
+      "    errors : {'ignore', 'raise'}, default 'raise'\n",
+      "        If 'ignore', suppress error and only existing labels are\n",
+      "        dropped.\n",
+      "    \n",
+      "    Returns\n",
+      "    -------\n",
+      "    dropped : pandas.DataFrame\n",
+      "    \n",
+      "    Raises\n",
+      "    ------\n",
+      "    KeyError\n",
+      "        If none of the labels are found in the selected axis\n",
+      "    \n",
+      "    See Also\n",
+      "    --------\n",
+      "    DataFrame.loc : Label-location based indexer for selection by label.\n",
+      "    DataFrame.dropna : Return DataFrame with labels on given axis omitted\n",
+      "        where (all or any) data are missing.\n",
+      "    DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n",
+      "        removed, optionally only considering certain columns.\n",
+      "    Series.drop : Return Series with specified index labels removed.\n",
+      "    \n",
+      "    Examples\n",
+      "    --------\n",
+      "    >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n",
+      "    ...                   columns=['A', 'B', 'C', 'D'])\n",
+      "    >>> df\n",
+      "       A  B   C   D\n",
+      "    0  0  1   2   3\n",
+      "    1  4  5   6   7\n",
+      "    2  8  9  10  11\n",
+      "    \n",
+      "    Drop columns\n",
+      "    \n",
+      "    >>> df.drop(['B', 'C'], axis=1)\n",
+      "       A   D\n",
+      "    0  0   3\n",
+      "    1  4   7\n",
+      "    2  8  11\n",
+      "    \n",
+      "    >>> df.drop(columns=['B', 'C'])\n",
+      "       A   D\n",
+      "    0  0   3\n",
+      "    1  4   7\n",
+      "    2  8  11\n",
+      "    \n",
+      "    Drop a row by index\n",
+      "    \n",
+      "    >>> df.drop([0, 1])\n",
+      "       A  B   C   D\n",
+      "    2  8  9  10  11\n",
+      "    \n",
+      "    Drop columns and/or rows of MultiIndex DataFrame\n",
+      "    \n",
+      "    >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n",
+      "    ...                              ['speed', 'weight', 'length']],\n",
+      "    ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n",
+      "    ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n",
+      "    >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n",
+      "    ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n",
+      "    ...                         [250, 150], [1.5, 0.8], [320, 250],\n",
+      "    ...                         [1, 0.8], [0.3,0.2]])\n",
+      "    >>> df\n",
+      "                    big     small\n",
+      "    lama    speed   45.0    30.0\n",
+      "            weight  200.0   100.0\n",
+      "            length  1.5     1.0\n",
+      "    cow     speed   30.0    20.0\n",
+      "            weight  250.0   150.0\n",
+      "            length  1.5     0.8\n",
+      "    falcon  speed   320.0   250.0\n",
+      "            weight  1.0     0.8\n",
+      "            length  0.3     0.2\n",
+      "    \n",
+      "    >>> df.drop(index='cow', columns='small')\n",
+      "                    big\n",
+      "    lama    speed   45.0\n",
+      "            weight  200.0\n",
+      "            length  1.5\n",
+      "    falcon  speed   320.0\n",
+      "            weight  1.0\n",
+      "            length  0.3\n",
+      "    \n",
+      "    >>> df.drop(index='length', level=1)\n",
+      "                    big     small\n",
+      "    lama    speed   45.0    30.0\n",
+      "            weight  200.0   100.0\n",
+      "    cow     speed   30.0    20.0\n",
+      "            weight  250.0   150.0\n",
+      "    falcon  speed   320.0   250.0\n",
+      "            weight  1.0     0.8\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "help(pd.DataFrame.drop)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['nsid', 'photo_id', 'url'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_flickr_meta.drop(labels=['subdir'],axis=1, inplace=True)\n",
+    "print(df_flickr_meta.keys())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#df_flickr_meta['subdir'] = ''\n",
+    "df_flickr_meta['filepath'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame.from_dict(df_flickr_meta).to_csv(fp_filepaths, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nsid</th>\n",
+       "      <th>photo_id</th>\n",
+       "      <th>url</th>\n",
+       "      <th>filepath</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>51576145@N02</td>\n",
+       "      <td>4762068863</td>\n",
+       "      <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>29689383@N02</td>\n",
+       "      <td>5711730606</td>\n",
+       "      <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>29689383@N02</td>\n",
+       "      <td>5711730606</td>\n",
+       "      <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>27982139@N00</td>\n",
+       "      <td>2439203939</td>\n",
+       "      <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>27982139@N00</td>\n",
+       "      <td>2464402099</td>\n",
+       "      <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           nsid    photo_id  \\\n",
+       "0  51576145@N02  4762068863   \n",
+       "1  29689383@N02  5711730606   \n",
+       "2  29689383@N02  5711730606   \n",
+       "3  27982139@N00  2439203939   \n",
+       "4  27982139@N00  2464402099   \n",
+       "\n",
+       "                                                 url filepath  \n",
+       "0  http://farm5.staticflickr.com/4117/4762068863_...           \n",
+       "1  http://farm3.staticflickr.com/2800/5711730606_...           \n",
+       "2  http://farm3.staticflickr.com/2800/5711730606_...           \n",
+       "3  http://farm3.staticflickr.com/2105/2439203939_...           \n",
+       "4  http://farm4.staticflickr.com/3030/2464402099_...           "
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_flickr_meta.head()"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 31,
    "metadata": {},
author	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
committer	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
commit	1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree	86c37309ff5bcb62716638562489ddb747c16159
parent	e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)