Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r--  megapixels/commands/datasets/download_ibmdif.py    98
-rw-r--r--  megapixels/commands/datasets/download_images.py    82
-rw-r--r--  megapixels/commands/datasets/ijb_skin_color.py     32
-rw-r--r--  megapixels/commands/datasets/pull_spreadsheet.py  124
4 files changed, 336 insertions(+), 0 deletions(-)
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..48aca5f0
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,98 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+    help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+    """Threaded image/file downloader.
+
+    CSV should be formatted as:
+
+    |url|filepath|
+    |---|---|
+    |https://site.com/photo.jpg|myfolder/myname.jpg|
+
+    Failed downloads get a `<filepath>_error.txt` marker file so they are
+    skipped on subsequent runs.
+    """
+
+    from os.path import join
+    from pathlib import Path
+    from multiprocessing.dummy import Pool as ThreadPool
+    import urllib.request
+    from random import randint
+
+    import pandas as pd
+    from tqdm import tqdm
+    from app.utils.logger_utils import Logger
+
+    log = Logger.getLogger()
+
+    url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
+    # load the user-agent pool from the --agents file
+    with open(opt_fp_agents, 'r') as fp:
+        user_agents = [x.strip() for x in fp.readlines()]
+
+    # threaded worker: download one file, mark failures with a sidecar file
+    def pool_process(item):
+        fp_out = item['filepath']
+        try:
+            opener = urllib.request.build_opener()
+            opener.addheaders = [('User-agent', item['user_agent'])]
+            urllib.request.install_opener(opener)
+            urllib.request.urlretrieve(item['url'], fp_out)
+            item['status'] = True
+        except Exception as e:
+            if str(e) != 'HTTP Error 403: Forbidden':
+                log.debug(f'Error: {e}')
+            fp_error = f'{fp_out}_error.txt'
+            with open(fp_error, 'w') as fp:
+                fp.write('')
+            item['status'] = False
+        pbar.update(1)
+        return item
+
+    # build the work list, skipping files already downloaded or errored
+    log.debug(f'loading {opt_fp_in}')
+    records = pd.read_csv(opt_fp_in).to_dict('records')
+
+    pool_items = []
+    for x in tqdm(records):
+        fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+        fp_dst_is_file = Path(fp_dst).is_file()
+        fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+        if not fp_dst_is_file and not fp_dst_is_err:
+            url = url_prefix + x['sha256'] + '.json'
+            user_agent = user_agents[randint(0, len(user_agents) - 1)]
+            pool_items.append({'url': url, 'filepath': fp_dst, 'user_agent': user_agent})
+
+    num_items = len(pool_items)
+    log.info(f'processing {num_items:,} items')
+
+    # the records list can be too large for RAM; free it before downloading
+    del records
+
+    # run the multithreaded download with a progress bar
+    pool = ThreadPool(opt_threads)
+    with tqdm(total=num_items) as pbar:
+        pool_results = pool.map(pool_process, pool_items)
+    pool.close()
+    pool.join()
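A note on the user-agent handling above: `urllib.request.install_opener()` replaces a process-global opener, so concurrent threads can race on which User-Agent is active. A minimal per-request alternative (a sketch, not part of this commit; names are illustrative):

    # Sketch: set the User-Agent on each request instead of installing a
    # process-global opener; safe to call from multiple threads.
    import urllib.request
    from random import choice

    def fetch(url, fp_out, user_agents):
        req = urllib.request.Request(url, headers={'User-Agent': choice(user_agents)})
        with urllib.request.urlopen(req, timeout=30) as resp, open(fp_out, 'wb') as f:
            f.write(resp.read())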
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..f1519c61
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,82 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+    help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+    """Threaded image downloader.
+
+    CSV should be formatted as:
+
+    |url|filepath|
+    |---|---|
+    |https://site.com/photo.jpg|myfolder/myname.jpg|
+
+    Failed downloads get a `<filepath>_error.txt` marker file so they are
+    skipped on subsequent runs.
+    """
+
+    from os.path import join
+    from pathlib import Path
+    from multiprocessing.dummy import Pool as ThreadPool
+    import urllib.request
+
+    import pandas as pd
+    from tqdm import tqdm
+    from app.utils import file_utils
+    from app.utils.logger_utils import Logger
+
+    log = Logger.getLogger()
+
+    # threaded worker: download one image, mark failures with a sidecar file
+    def pool_process(item):
+        fp_out = item['filepath']
+        try:
+            file_utils.mkdirs(item['filepath'])
+            urllib.request.urlretrieve(item['url'], fp_out)
+            item['status'] = True
+        except Exception as e:
+            log.debug(f'Error: {e}')
+            fp_error = f'{fp_out}_error.txt'
+            with open(fp_error, 'w') as fp:
+                fp.write('')
+            item['status'] = False
+        pbar.update(1)
+        return item
+
+    # build the work list, skipping files already downloaded or errored
+    log.debug(f'loading {opt_fp_in}')
+    records = pd.read_csv(opt_fp_in).to_dict('records')
+
+    pool_items = []
+    for x in tqdm(records):
+        fp_dst = join(opt_fp_out, x['filepath'])
+        fp_dst_is_file = Path(fp_dst).is_file()
+        fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+        if not fp_dst_is_file and not fp_dst_is_err:
+            pool_items.append({'url': x['url'], 'filepath': fp_dst})
+
+    num_items = len(pool_items)
+    log.info(f'processing {num_items:,} items')
+
+    # run the multithreaded download with a progress bar
+    pool = ThreadPool(opt_threads)
+    with tqdm(total=num_items) as pbar:
+        pool_results = pool.map(pool_process, pool_items)
+    pool.close()
+    pool.join()
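For reference, a minimal script that produces an input CSV in the shape both downloaders expect (the URLs and filenames here are illustrative):

    # Build the two-column CSV (url, filepath) the downloader reads.
    import pandas as pd

    rows = [
        {'url': 'https://site.com/photo.jpg', 'filepath': 'myfolder/myname.jpg'},
        {'url': 'https://site.com/photo2.jpg', 'filepath': 'myfolder/myname2.jpg'},
    ]
    pd.DataFrame(rows).to_csv('downloads.csv', index=False)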
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+    """Measure skin color in IJB-C (stub)"""
+
+    import sys
+    from glob import glob
+    from os.path import join
+    from pathlib import Path
+    import time
+
+    import pandas as pd
+    import cv2 as cv
+    from tqdm import tqdm
+
+    from app.utils import file_utils, im_utils
+    from app.models.data_store import DataStore
+
+    log.info('IJBC Skin Color')
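The command body is still a stub. One plausible direction for the measurement, not the author's method, is a mean value over a face crop in a perceptual color space:

    # Sketch: mean CIELAB color of an image region (OpenCV loads BGR).
    import cv2 as cv

    def mean_lab(fp_im):
        im = cv.imread(fp_im)
        if im is None:
            raise FileNotFoundError(fp_im)
        lab = cv.cvtColor(im, cv.COLOR_BGR2LAB)
        return lab.reshape(-1, 3).mean(axis=0)  # (L, a, b) channel means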
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import click
+import gspread
+from os.path import join
+from pathlib import Path
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.utils import file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal']
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+    type=click.Choice(opt_sheets),
+    default=['datasets'],
+    help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+    help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+    help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+    """Fetch Google spreadsheet"""
+
+    import pandas as pd
+
+    if opt_all:
+        opt_spreadsheets = opt_sheets
+
+    for sheet_name in opt_spreadsheets:
+        log.info(f'Get spreadsheet: {sheet_name}')
+        sheet_data = fetch_google_sheet_objects(name=sheet_name)
+        df_sheet = pd.DataFrame.from_dict(sheet_data)
+        if sheet_name == 'datasets':
+            df_sheet = clean_datasets_sheet_ft(df_sheet)
+
+        # resolve the output path: a directory gets one CSV per sheet
+        fpp_out = Path(opt_fp_out)
+        if opt_all and fpp_out.suffix == '.csv':
+            # --all writes multiple files; fall back to the parent directory
+            fpp_out = fpp_out.parent
+        file_utils.mkdirs(fpp_out)
+        if fpp_out.suffix != '.csv':
+            fpp_out = Path(join(str(fpp_out), f'{sheet_name}.csv'))
+
+        if fpp_out.is_file() and not opt_force:
+            log.warning(f'{fpp_out} exists. Use -f/--force to overwrite')
+            continue
+        df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+    # clean data for FT
+    df = df[df['ft_share'] == 'Y']
+    keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+    keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+    return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+    # clean data for NYT (currently mirrors the FT version)
+    df = df[df['ft_share'] == 'Y']
+    keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+    keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+    return df[keys]
+
+def fetch_spreadsheet():
+    """Open the Google Spreadsheet, which contains the individual worksheets"""
+    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+    fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+    credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+    docid = '1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc'
+    client = gspread.authorize(credentials)
+    spreadsheet = client.open_by_key(docid)
+    return spreadsheet
+
+def fetch_worksheet(name='institutions'):
+    """Get a reference to a particular worksheet from the Google Spreadsheet"""
+    spreadsheet = fetch_spreadsheet()
+    return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name='institutions'):
+    """Get all the values from a particular worksheet as a list of lists.
+
+    Returns:
+        keys: the first row of the document
+        lines: a list of lists with the rest of the rows
+    """
+    rows = fetch_worksheet(name).get_all_values()
+    keys = rows[0]
+    lines = rows[1:]
+    return keys, lines
+
+def fetch_google_sheet_objects(name):
+    """Get all the values from a worksheet as a list of dictionaries"""
+    keys, rows = fetch_google_sheet(name)
+    recs = []
+    for row in rows:
+        rec = {key: row[index] for index, key in enumerate(keys)}
+        recs.append(rec)
+    return recs
+
+def fetch_google_lookup(name, item_key='key'):
+    """Get all the values from a worksheet as a dictionary of dictionaries.
+
+    Specify which field you want to use as the dictionary key.
+    """
+    keys, rows = fetch_google_sheet(name)
+    lookup = {}
+    for row in rows:
+        rec = {key: row[index] for index, key in enumerate(keys)}
+        lookup[rec[item_key]] = rec
+    return lookup
\ No newline at end of file
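Usage of these helpers outside the CLI, assuming the service-account JSON hard-coded in `fetch_spreadsheet()` is present (the sheet name and key are illustrative):

    # Fetch one worksheet as a list of records, then again keyed by 'key'.
    recs = fetch_google_sheet_objects(name='datasets')
    lookup = fetch_google_lookup('datasets', item_key='key')
    print(len(recs), list(lookup)[:3])  # row count and first few keys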
