From b2dcfb3ef79557b7fccfdd94aa8ac4991552d7e1 Mon Sep 17 00:00:00 2001
From: adamhrv
Date: Wed, 13 Mar 2019 17:47:58 +0100
Subject: add downloader
---
megapixels/app/settings/app_cfg.py | 5 +
megapixels/commands/datasets/download_ibmdif.py | 98 ++++++++++++++++++
megapixels/commands/datasets/download_images.py | 82 +++++++++++++++
megapixels/commands/datasets/ijb_skin_color.py | 32 ++++++
megapixels/commands/datasets/pull_spreadsheet.py | 124 +++++++++++++++++++++++
5 files changed, 341 insertions(+)
create mode 100644 megapixels/commands/datasets/download_ibmdif.py
create mode 100644 megapixels/commands/datasets/download_images.py
create mode 100644 megapixels/commands/datasets/ijb_skin_color.py
create mode 100644 megapixels/commands/datasets/pull_spreadsheet.py
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 891ab503..f6d0a7df 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -175,3 +175,8 @@ DIR_SITE_FINAL_CITATIONS = "../site/datasets/final/"
# -----------------------------------------------------------------------------
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+
+# -----------------------------------------------------------------------------
+# Build settings
+# -----------------------------------------------------------------------------
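+# Gates research pages/commands in the site build (assumption: inferred
+# from the flag name; the setting is not otherwise documented in this patch)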
+BUILD_RESEARCH = False
\ No newline at end of file
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..48aca5f0
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,98 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+              help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents,
+              help='Path to a newline-delimited list of User-Agent strings')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+ """Threaded image/file downloader"""
+
+ """
+ CSV should be formatted as
+
+ |url|filepath|
+ |---|---|
+ |https:/site.com/photo.jpg|myfolder/myname.jpg|
+
+ Saves logfile.csv output and uses for errors
+ """
+
+  from os.path import join
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+  from random import choice
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
+  # read User-Agent strings from the --agents option
+  # (the module-level default path was previously hardcoded here)
+  with open(opt_fp_agents, 'r') as fp:
+    user_agents = [line.strip() for line in fp if line.strip()]
+
+
+  # worker: download one metadata file with a per-request User-Agent
+  def pool_process(item):
+    fp_out = item['filepath']
+    try:
+      # build the request directly instead of installing a global opener,
+      # which is not thread-safe
+      req = urllib.request.Request(item['url'],
+        headers={'User-Agent': item['user_agent']})
+      with urllib.request.urlopen(req) as resp, open(fp_out, 'wb') as f:
+        f.write(resp.read())
+      item['status'] = True
+    except Exception as e:
+      if str(e) != 'HTTP Error 403: Forbidden':
+        log.debug(f'Error: {e}')
+      # leave an empty marker file so this item is skipped on re-run
+      with open(f'{fp_out}_error.txt', 'w') as f:
+        f.write('')
+      item['status'] = False
+    pbar.update(1)
+    return item
+
+  # load records and make sure the output directory exists
+  log.debug(f'loading {opt_fp_in}')
+  Path(opt_fp_out).mkdir(parents=True, exist_ok=True)
+  records = pd.read_csv(opt_fp_in).to_dict('records')
+
+ pool_items = []
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ url = url_prefix + x['sha256'] + '.json'
+      user_agent = choice(user_agents)  # random UA per request
+ pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # free the record list; it can be too large to keep in RAM
+ del records
+
+  # run the thread pool with a shared progress bar (workers update it)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..f1519c61
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,82 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+              help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+              help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+ """Threaded image downloader"""
+
+ """
+ CSV should be formatted as
+
+ |url|filepath|
+ |---|---|
+ |https:/site.com/photo.jpg|myfolder/myname.jpg|
+
+ Saves logfile.csv output and uses for errors
+ """
+
+  from os.path import join
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils import file_utils
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ # setup multithreading function
+ def pool_process(item):
+ # threaded function
+ fp_out = item['filepath']
+ try:
+ # download image
+ file_utils.mkdirs(item['filepath'])
+ urllib.request.urlretrieve(item['url'], fp_out)
+ item['status'] = True
+ except Exception as e:
+ log.debug(f'Error: {e}')
+ fp_error = f'{fp_out}_error.txt'
+ with open(fp_error, 'w') as fp:
+ fp.write('')
+ item['status'] = False
+ pbar.update(1)
+ return item
+
+  # setup multithreading data holders
+ log.debug(f'loading {opt_fp_in}')
+ records = pd.read_csv(opt_fp_in).to_dict('records')
+
+
+ pool_items = []
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['filepath'])
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ pool_items.append({'url':x['url'], 'filepath': fp_dst})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # run the thread pool with a shared progress bar (workers update it)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Measure skin color IJB-C"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+ log.info('IJBC Skin Color')
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import click
+import gspread
+from os.path import join
+from pathlib import Path
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.utils import file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+ type=click.Choice(opt_sheets),
+ default=['datasets'],
+ help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+ help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+ """Fetch Google spreadsheet"""
+
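+  # Usage sketch (the entry point is assumed from the repo layout):
+  #   python -m megapixels datasets pull_spreadsheet --all -o data/sheets/
+  #   python -m megapixels datasets pull_spreadsheet -n datasets -n tags -o data/sheets/
+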
+  import pandas as pd
+
+ log = Logger.getLogger()
+ if opt_all:
+ opt_spreadsheets = opt_sheets
+
+ for sheet_name in opt_spreadsheets:
+ log.info(f'Get spreadsheet: {sheet_name}')
+ sheet_data = fetch_google_sheet_objects(name=sheet_name)
+ df_sheet = pd.DataFrame.from_dict(sheet_data)
+ if sheet_name == 'datasets':
+ df_sheet = clean_datasets_sheet_ft(df_sheet)
+    fpp_out = Path(opt_fp_out)
+    file_utils.mkdirs(fpp_out)
+
+    if opt_all and fpp_out.is_file():
+      # a filename was passed with --all: write each sheet beside it
+      fpp_out = join(str(fpp_out.parent), f'{sheet_name}.csv')
+    else:
+      fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
+    if Path(fpp_out).is_file() and not opt_force:
+      log.info(f'{fpp_out} exists, use --force to overwrite')
+      continue
+    df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+ # clean data for FT
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+  # clean data for NYT (currently identical to the FT variant, including
+  # the 'ft_share' filter)
+  df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def fetch_spreadsheet():
+ """Open the Google Spreadsheet, which contains the individual worksheets"""
+ scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+ fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+ credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+ docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+ client = gspread.authorize(credentials)
+ spreadsheet = client.open_by_key(docid)
+ return spreadsheet
+
+def fetch_worksheet(name="institutions"):
+ """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
+ spreadsheet = fetch_spreadsheet()
+ return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name="institutions"):
+ """Get all the values from a particular worksheet as a list of lists.
+ Returns:
+ :keys - the first row of the document
+ :lines - a list of lists with the rest of the rows"""
+ rows = fetch_worksheet(name).get_all_values()
+ keys = rows[0]
+ lines = rows[1:]
+ return keys, lines
+
+def fetch_google_sheet_objects(name):
+  """Get all the values from a worksheet as a list of dictionaries"""
+  keys, rows = fetch_google_sheet(name)
+  return [dict(zip(keys, row)) for row in rows]
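+
+# Example shape, with field names taken from the sheet's header row:
+#   fetch_google_sheet_objects('datasets')
+#     -> [{'key': ..., 'name_short': ..., ...}, ...]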
+
+def fetch_google_lookup(name, item_key='key'):
+  """Get all the values from a worksheet as a dictionary of dictionaries.
+  Specify which field you want to use as the dictionary key."""
+  recs = fetch_google_sheet_objects(name)
+  return {rec[item_key]: rec for rec in recs}
\ No newline at end of file
--
cgit v1.2.3-70-g09d2
From 26646e6adf3833f6282e9515c14ad61e485440c0 Mon Sep 17 00:00:00 2001
From: adamhrv
Date: Wed, 13 Mar 2019 17:48:28 +0100
Subject: css, txt tweaks
---
site/assets/css/css.css | 7 +++++++
site/content/pages/about/credits.md | 8 ++++----
site/content/pages/about/index.md | 8 ++++++--
site/public/about/credits/index.html | 8 ++++----
site/public/about/index.html | 5 +++--
todo.md | 4 +++-
6 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/site/assets/css/css.css b/site/assets/css/css.css
index 69302409..32c7dad6 100644
--- a/site/assets/css/css.css
+++ b/site/assets/css/css.css
@@ -485,6 +485,7 @@ section.fullwidth .image {
/* about page */
+
.flex-container {
padding: 0;
margin: 0;
@@ -500,6 +501,9 @@ section.fullwidth .image {
justify-content: space-around;
}
+.team-photos-container{
+ margin-top:40px;
+}
.team-member {
height: auto;
margin-top: 10px;
@@ -508,6 +512,9 @@ section.fullwidth .image {
font-weight: bold;
flex-grow: 1;
margin:0 40px 0 0;
+ padding:20px;
+ border-radius:6px;
+ background: #202020;
}
.team-member&:last-child{
margin:0 0 0 40px;
diff --git a/site/content/pages/about/credits.md b/site/content/pages/about/credits.md
index 3ad962df..d4021d44 100644
--- a/site/content/pages/about/credits.md
+++ b/site/content/pages/about/credits.md
@@ -27,14 +27,14 @@ authors: Adam Harvey
#### Team
-- Research, concept: Adam Harvey
-- Site development, visualizations: Jules LaPlace
-- Assistant researcher: Berit Gilma
+- Research and image analysis: Adam Harvey
+- Development and visualizations: Jules LaPlace
- Produced in Partnership with Mozilla
+- Contributing researchers: Berit Gilma, Mathana Stender
#### Code
-- This site uses D3 and C2 for visuzations.
+- This site uses D3 and C2 for visualizations
- Add more here
#### Data
diff --git a/site/content/pages/about/index.md b/site/content/pages/about/index.md
index b1b7a80f..cf7d782b 100644
--- a/site/content/pages/about/index.md
+++ b/site/content/pages/about/index.md
@@ -26,11 +26,15 @@ authors: Adam Harvey
(PAGE UNDER DEVELOPMENT)
-MegaPixels is art and research by
Adam Harvey about facial recognition datasets that aims to unravel their histories, futures, geographies, and meanings. Throughout 2019 this site, coded by Jules LaPlace, will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.
+Ever since government agencies began researching face recognition in the early 1960s, datasets of face images have been central to technological advances. Today, these datasets no longer originate in labs, but instead come from family photo albums posted on photo sharing sites, surveillance cameras on college campuses, search engine queries for celebrities, cafe livestreams, or personal videos posted on YouTube. Collectively, facial recognition datasets are now gathered "in the wild".
+
+MegaPixels is an art and research project by Adam Harvey that unravels the histories, futures, geographies, and meanings of facial recognition datasets. Throughout 2019 this site will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.
+
+Over the last year, hundreds of these facial analysis datasets created "in the wild" have been collected and analyzed to understand how each one feeds that supply chain.
The MegaPixels website is produced in partnership with [Mozilla](https://mozilla.org).
-
+
Adam Harvey
diff --git a/site/public/about/credits/index.html b/site/public/about/credits/index.html
index b7ab8085..b73ad927 100644
--- a/site/public/about/credits/index.html
+++ b/site/public/about/credits/index.html
@@ -38,14 +38,14 @@
Team
-- Research, concept: Adam Harvey
-- Site development, visualizations: Jules LaPlace
-- Assistant researcher: Berit Gilma
+- Research and image analysis: Adam Harvey
+- Development and visualizations: Jules LaPlace
- Produced in Partnership with Mozilla
+- Contributing researchers: Berit Gilma, Mathana Stender
Code
-- This site uses D3 and C2 for visuzations.
+- This site uses D3 and C2 for visualizations
- Add more here
Data
diff --git a/site/public/about/index.html b/site/public/about/index.html
index 0ddc1a4b..07455fdc 100644
--- a/site/public/about/index.html
+++ b/site/public/about/index.html
@@ -37,8 +37,9 @@
Privacy Policy
(PAGE UNDER DEVELOPMENT)
-
MegaPixels is art and research by
Adam Harvey about facial recognition datasets that aims to unravel their histories, futures, geographies, and meanings. Throughout 2019 this site, coded by Jules LaPlace, will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.
The MegaPixels website is produced in partnership with Mozilla.
-
+
Ever since government agencies began researching face recognition in the early 1960's, datasets of face images have always been central to technological advancements. Today, these datasets no longer originate in labs, but instead from family photo albums posted on photo sharing sites, surveillance cameras on college campuses, search engine queries for celebrities, cafe livestreams, or
personal videos posted on YouTube. Collectively, facial recognition datasets are now gathered "in the wild".
MegaPixels is art and research by Adam Harvey about facial recognition datasets that unravels their histories, futures, geographies, and meanings. Throughout 2019 this site this site will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.
During the last year, hundreds of these facial analysis datasets created "in the wild" have been collected to understand how they contribute to a global supply chain of biometric data that is powering the global facial recognition industry.
+
The MegaPixels website is produced in partnership with Mozilla.
+
Adam Harvey
diff --git a/todo.md b/todo.md
index 899552de..420945ef 100644
--- a/todo.md
+++ b/todo.md
@@ -25,9 +25,11 @@
## About
- AH: update bio images
-- JL: add underline/active state to the subnavigation
- awaiting Mozilla response for their text
+## about/press
+
+
## Research
- on hold until closer to FT launch date
--
cgit v1.2.3-70-g09d2