# MegaFace: Prepare Flickr API Batch CSV

In [1]:
%reload_ext autoreload
%autoreload 2

import json
import os
from glob import glob, iglob
from os.path import join
from pathlib import Path

import pandas as pd
from tqdm import tqdm_notebook as tqdm

## Create the file meta CSV

In [29]:
# Input: per-file metadata CSV, read with pd.read_csv in the next cell.
fp_in_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file.csv'
# Output: the same records with a recomputed 'filepath' column, de-duplicated.
fp_out_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file_ext.csv'
# Output: one row per NSID with its record count.
fp_out_meta_flickr = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_flickr_02.csv'

In [10]:
# Load the per-file metadata, renaming the 'subdir' column to 'filepath',
# and keep the rows as a list of dicts for the per-record pass below.
df_files = pd.read_csv(fp_in_meta_files).rename(columns={'subdir': 'filepath'})
file_records = df_files.to_dict(orient='records')
photo_ids = [record['photo_id'] for record in file_records]

In [19]:
# Overwrite each record's 'filepath' with '<photo id>.jpg', where the photo id
# is the part of the URL's basename that precedes the first underscore.
for record in tqdm(file_records):
  url_stem = Path(record['url']).stem
  flickr_photo_id = url_stem.split('_')[0]
  record['filepath'] = f'{flickr_photo_id}.jpg'

# Rebuild the table from the updated records, drop exact duplicate rows, and save it.
df_meta_file = pd.DataFrame(file_records).drop_duplicates()
df_meta_file.to_csv(fp_out_meta_files, index=False)
print(f'Wrote {len(df_meta_file)} lines')

HBox(children=(IntProgress(value=0, max=4753520), HTML(value='')))




## Create the NSID/count CSV

In [36]:
# Count records per NSID with a vectorised groupby instead of a Python loop
# over every group. Rows with a missing NSID are excluded by groupby, exactly
# as in a manual iteration over the groups. The result always has the columns
# 'nsid' and 'count', even when df_meta_file is empty.
df_meta_flickr = df_meta_file.groupby('nsid').size().reset_index(name='count')
df_meta_flickr.to_csv(fp_out_meta_flickr, index=False)

print(f'Total users: {len(df_meta_flickr):,}')
print(f'Total images: {df_meta_flickr["count"].sum():,}')

Total users: 48,382
Total images: 3,311,471


## Create CSV for API

| filepath | query |
|:---|:---|
| 12234 | 12234@123|

In [72]:
# Input: directory whose entries are globbed below; each entry is searched for *.json files.
fp_in_dir_ids = '/data_store_ssd_perrier/datasets/people/megaface/downloads/MegafaceIdentities_VGG_META/'
# Output path for the API query CSV.
# NOTE(review): no cell visible in this notebook writes to this path — confirm it is still needed.
fp_out_queries = '/data_store_hdd/datasets/people/megaface/research/megaface_flickr_api_queries.csv'
# Output: one row per identity JSON file (nsid, nsid_full, full_image_url).
fp_out_queries_full = '/data_store_hdd/datasets/people/megaface/research/megaface_flickr_api_queries_full.csv'

In [59]:
# List every non-hidden entry directly under the identities directory.
identity_pattern = join(fp_in_dir_ids, '*')
nsid_paths = glob(identity_pattern)

In [74]:
# Collect one row per identity JSON file: the NSID, the full directory name,
# and the 'full_img_url' value stored in the JSON (None when the key is absent).
items = []
for nsid_path in tqdm(nsid_paths):
  nsid_full = Path(nsid_path).name
  # The NSID is the part of the directory name before the first underscore
  # (presumably directories are named '<nsid>_<identity>' — verify against the dataset).
  nsid = nsid_full.split('_')[0]
  # nsid_path comes from the glob over fp_in_dir_ids and already contains that
  # directory, so it is searched directly instead of being re-joined onto it.
  for json_file in glob(join(nsid_path, '*.json')):
    # Standard-library json is used because file_utils is never imported in
    # this notebook. Binary mode lets json detect the UTF-8/16/32 encoding itself.
    with open(json_file, 'rb') as fp:
      nsid_id_json = json.load(fp)
    full_image_url = nsid_id_json.get('full_img_url')
    obj = {'nsid': nsid, 'nsid_full': nsid_full, 'full_image_url': full_image_url}
    items.append(obj)

HBox(children=(IntProgress(value=0, max=672057), HTML(value='')))

In [75]:
# Save every collected identity row to the full queries CSV.
df = pd.DataFrame(items)
df.to_csv(path_or_buf=fp_out_queries_full, index=False)

In [76]:
# Tally how many collected items belong to each NSID.
count_lookup = {}
for entry in items:
  owner_nsid = entry['nsid']
  count_lookup[owner_nsid] = count_lookup.get(owner_nsid, 0) + 1

## Combine MegaFace Flickr API Meta for User/NSID

In [77]:
fp_in_dir = '/media/adam/ah8tb/data_store/datasets/people/megaface/research/flickr_api_dump'
# Split the *.json dump files in one pass: any path containing '.txt' goes to
# fp_files_err (presumably error dumps — confirm), everything else to fp_files.
fp_files, fp_files_err = [], []
for fp_candidate in glob(join(fp_in_dir, '*.json')):
  if '.txt' in fp_candidate:
    fp_files_err.append(fp_candidate)
  else:
    fp_files.append(fp_candidate)

In [78]:
# Report the number of regular dump files, then the number set aside as errors.
for dump_files in (fp_files, fp_files_err):
  print(len(dump_files))

46906
0


In [44]:
# Combine the per-user Flickr API JSON files into a single CSV

```
{
  "stat": "ok",
  "user": {
    "nsid": "7122726@N03",
    "url": "https://www.flickr.com/people/tdbsca/"
  }
}
```

In [99]:
# Build one record per Flickr API response file: the profile path alias, the
# NSID, and the number of collected items for that NSID (from count_lookup).
meta_records = []
n_skipped = 0
for fp_file in tqdm(fp_files):
  # Standard-library json is used because file_utils is never imported in
  # this notebook. Binary mode lets json detect the UTF-8/16/32 encoding itself.
  with open(fp_file, 'rb') as fp:
    data = json.load(fp)
  user = data.get('user')
  if not user or not user.get('url'):
    # No usable "user" object (presumably a failed lookup): skip the file
    # instead of raising on None, and report the total after the loop.
    n_skipped += 1
    continue
  nsid = user.get('nsid')
  # The alias is the last URL segment. .name keeps it whole, whereas .stem
  # would cut off everything after the last '.' in the alias.
  path_alias = Path(user.get('url')).name
  # None when the NSID is not in count_lookup; it becomes NaN in the DataFrame.
  count = count_lookup.get(nsid)
  obj = {'path_alias': path_alias, 'nsid': nsid, 'count': count}
  meta_records.append(obj)

if n_skipped:
  print(f'Skipped {n_skipped} response files without user data')

HBox(children=(IntProgress(value=0, max=46906), HTML(value='')))

In [100]:
# Turn the collected records into a table and show its row count.
df_meta_records = pd.DataFrame(meta_records)
print(df_meta_records.shape[0])

46906


In [101]:
# Keep a single row per NSID; when an NSID repeats, the last occurrence wins.
df_meta_records = df_meta_records.drop_duplicates(subset=['nsid'], keep='last')

In [102]:
# Row count after de-duplication, for comparison with the count printed before it.
print(len(df_meta_records.index))

46906


In [103]:
# Write the de-duplicated per-user records (path_alias, nsid, count) to CSV.
fp_out_combined = '/data_store_hdd/datasets/people/megaface/research/megaface_flickr_meta_count.csv'
df_meta_records.to_csv(path_or_buf=fp_out_combined, index=False)

In [104]:
# Sum of the per-user counts across all remaining records.
total_count = df_meta_records['count'].sum()
print(total_count)

4627187
