author    Jules Laplace <julescarbon@gmail.com>  2019-03-14 02:50:24 +0100
committer Jules Laplace <julescarbon@gmail.com>  2019-03-14 02:50:24 +0100
commit    c826165945096a90902bb7a31db72eb0670ab388 (patch)
tree      7d4fd76f427468b8be3148c6b8fe4c70ec1d718a
parent    8df493f6f9d18acfe5919cf257c2da0d2b30ab7a (diff)
parent    26646e6adf3833f6282e9515c14ad61e485440c0 (diff)
rebuild and add compare-csv-counts.py script
-rw-r--r--  megapixels/app/settings/app_cfg.py                 5
-rw-r--r--  megapixels/app/site/parser.py                      3
-rw-r--r--  megapixels/commands/datasets/download_ibmdif.py   98
-rw-r--r--  megapixels/commands/datasets/download_images.py   82
-rw-r--r--  megapixels/commands/datasets/ijb_skin_color.py    32
-rw-r--r--  megapixels/commands/datasets/pull_spreadsheet.py  124
-rw-r--r--  scraper/compare-csv-counts.py                      27
-rw-r--r--  site/assets/css/css.css                             7
-rw-r--r--  site/content/pages/about/credits.md                11
-rw-r--r--  site/content/pages/about/index.md                   8
-rw-r--r--  site/public/about/credits/index.html               20
-rw-r--r--  site/public/about/index.html                        5
-rw-r--r--  todo.md                                             4
13 files changed, 414 insertions, 12 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 891ab503..f6d0a7df 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -175,3 +175,8 @@ DIR_SITE_FINAL_CITATIONS = "../site/datasets/final/"
# -----------------------------------------------------------------------------
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+
+# -----------------------------------------------------------------------------
+# Build settings
+# -----------------------------------------------------------------------------
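+# when False, the site build skips the research section (assumed usage; the flag is not referenced elsewhere yet)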
+BUILD_RESEARCH = False
\ No newline at end of file
diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py
index 79093bc7..00470e4b 100644
--- a/megapixels/app/site/parser.py
+++ b/megapixels/app/site/parser.py
@@ -292,6 +292,9 @@ def parse_research_index(research_posts):
content = "<div class='research_index'>"
for post in research_posts:
print(post)
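+    # a post without a path aborts the whole research index build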
+ if 'path' not in post:
+ print("No path attribute for post")
+ return ""
s3_path = s3.make_s3_path(cfg.S3_SITE_PATH, post['path'])
if 'image' in post:
post_image = s3_path + post['image']
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..48aca5f0
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,98 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+ """Threaded image/file downloader"""
+
+ """
+ CSV should be formatted as
+
+ |url|filepath|
+ |---|---|
+ |https:/site.com/photo.jpg|myfolder/myname.jpg|
+
+ Saves logfile.csv output and uses for errors
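+
+  Example invocation (assuming a `megapixels` click CLI exposes these commands;
+  the CSV filename is hypothetical):
+
+    megapixels datasets download_ibmdif -i dif_sha256.csv -o /data/ibm_dif/json -t 16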
+ """
+
+ from os.path import join
+ from functools import partial
+ from pathlib import Path
+ from multiprocessing.dummy import Pool as ThreadPool
+ import urllib
+ from random import randint
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
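+  # per-image annotation JSON is mirrored at dataviz.nbcnews.com, keyed by image sha256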
+ url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
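+  # pool of user-agent strings; each request picks one at random to reduce 403 blocking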
+  with open(opt_fp_agents, 'r') as fp:
+ user_agents = fp.readlines()
+ user_agents = [x.strip() for x in user_agents]
+
+
+ # setup multithreading function
+ def pool_process(item):
+ # threaded function
+ fp_out = item['filepath']
+ try:
+ # download image
+ opener = urllib.request.build_opener()
+ opener.addheaders = [('User-agent', item['user_agent'])]
+ urllib.request.install_opener(opener)
+ urllib.request.urlretrieve(item['url'], fp_out)
+ item['status'] = True
+ except Exception as e:
+ if str(e) != 'HTTP Error 403: Forbidden':
+ log.debug(f'Error: {e}')
+ fp_error = f'{fp_out}_error.txt'
+ with open(fp_error, 'w') as fp:
+ fp.write('')
+ item['status'] = False
+ pbar.update(1)
+ return item
+
+ # setup multithreading data holders
+ log.debug(f'loading {opt_fp_in}')
+ records = pd.read_csv(opt_fp_in).to_dict('records')
+
+ pool_items = []
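+  # build the work list, skipping files already downloaded or previously marked failed (<path>_error.txt)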
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ url = url_prefix + x['sha256'] + '.json'
+      user_agent = user_agents[randint(0, len(user_agents) - 1)]
+ pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # release the full record list to free RAM
+  del records
+
+  # run the multithreaded downloads with a shared progress bar (updated inside pool_process)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+
+
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..f1519c61
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,82 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+ """Threaded image downloader"""
+
+ """
+ CSV should be formatted as
+
+ |url|filepath|
+ |---|---|
+  |https://site.com/photo.jpg|myfolder/myname.jpg|
+
+  Failed downloads leave an empty <filepath>_error.txt marker so they are skipped on re-runs.
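+
+  Example invocation (assuming a `megapixels` click CLI exposes these commands;
+  urls.csv is a hypothetical input):
+
+    megapixels datasets download_images -i urls.csv -o /data/images -t 16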
+ """
+
+ from os.path import join
+ from functools import partial
+ from pathlib import Path
+ from multiprocessing.dummy import Pool as ThreadPool
+ import urllib
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils import file_utils
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ # setup multithreading function
+ def pool_process(item):
+ # threaded function
+ fp_out = item['filepath']
+ try:
+ # download image
+ file_utils.mkdirs(item['filepath'])
+ urllib.request.urlretrieve(item['url'], fp_out)
+ item['status'] = True
+ except Exception as e:
+ log.debug(f'Error: {e}')
+ fp_error = f'{fp_out}_error.txt'
+ with open(fp_error, 'w') as fp:
+ fp.write('')
+ item['status'] = False
+ pbar.update(1)
+ return item
+
+  # setup multithreading data holders
+ log.debug(f'loading {opt_fp_in}')
+ records = pd.read_csv(opt_fp_in).to_dict('records')
+
+
+ pool_items = []
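+  # build the work list, skipping files already downloaded or previously marked failed (<path>_error.txt)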
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['filepath'])
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ pool_items.append({'url':x['url'], 'filepath': fp_dst})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # run the multithreaded downloads with a shared progress bar (updated inside pool_process)
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+
+
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Measure skin color IJB-C"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+ log.info('IJBC Skin Color')
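+
+  # TODO: skin color measurement for IJB-C is not implemented yet; this command is a stub entry point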
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import os
+import click
+import re
+import csv
+import string
+import codecs
+import gspread
+from os.path import join
+from pathlib import Path
+from multiprocessing import Pool
+import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils, file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+ type=click.Choice(opt_sheets),
+ default=['datasets'],
+ help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+ help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+ """Fetch Google spreadsheet"""
+
+ import sys
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ if opt_all:
+ opt_spreadsheets = opt_sheets
+
+ for sheet_name in opt_spreadsheets:
+ log.info(f'Get spreadsheet: {sheet_name}')
+ sheet_data = fetch_google_sheet_objects(name=sheet_name)
+ df_sheet = pd.DataFrame.from_dict(sheet_data)
+ if sheet_name == 'datasets':
+ df_sheet = clean_datasets_sheet_ft(df_sheet)
+ fpp_out = Path(opt_fp_out)
+ file_utils.mkdirs(fpp_out)
+
+    if opt_all and fpp_out.is_file():
+      # with --all, an explicit file path falls back to one CSV per sheet in its parent directory
+      fpp_out = join(fpp_out.parent, f'{sheet_name}.csv')
+    else:
+      fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
+ df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+ # clean data for FT
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+  # clean data for NYT (currently mirrors the FT column selection)
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def fetch_spreadsheet():
+ """Open the Google Spreadsheet, which contains the individual worksheets"""
+ scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+ fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+ credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+ docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+ client = gspread.authorize(credentials)
+ spreadsheet = client.open_by_key(docid)
+ return spreadsheet
+
+def fetch_worksheet(name="institutions"):
+ """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
+ spreadsheet = fetch_spreadsheet()
+ return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name="institutions"):
+ """Get all the values from a particular worksheet as a list of lists.
+ Returns:
+ :keys - the first row of the document
+ :lines - a list of lists with the rest of the rows"""
+ rows = fetch_worksheet(name).get_all_values()
+ keys = rows[0]
+ lines = rows[1:]
+ return keys, lines
+
+def fetch_google_sheet_objects(name):
+ """Get all the values from a worksheet as a list of dictionaries"""
+ keys, rows = fetch_google_sheet(name)
+ recs = []
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ recs.append(rec)
+ return recs
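+
+# Example (assumed worksheet name):
+#   rows = fetch_google_sheet_objects('datasets')  # -> [{'key': ..., 'name_short': ..., ...}, ...]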
+
+def fetch_google_lookup(name, item_key='key'):
+ """Get all the values from a worksheet as a dictionary of dictionaries.
+ Specify which field you want to use as the dictionary key."""
+ keys, rows = fetch_google_sheet(name)
+ lookup = {}
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ lookup[rec[item_key]] = rec
+  return lookup
\ No newline at end of file
diff --git a/scraper/compare-csv-counts.py b/scraper/compare-csv-counts.py
new file mode 100644
index 00000000..fcbfe32f
--- /dev/null
+++ b/scraper/compare-csv-counts.py
@@ -0,0 +1,27 @@
+#!python
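+# Compare per-file line counts between two listings ("<count> <filename>" per line,
+# e.g. collected with `wc -l`) and print the per-file difference, smallest first.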
+
+import operator
+
+def read_text(fn):
+ with open(fn, 'r') as f:
+ lines = f.readlines()
+ lookup = {}
+ for line in lines:
+ line = line.strip()
+        # split on whitespace so padded count columns still parse
+        total, name = line.split(None, 1)
+        lookup[name] = int(total)
+ return lookup
+
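+# hardcoded inputs: new.txt holds the previous counts, new2.txt the current counts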
+new = read_text('new2.txt')
+old = read_text('new.txt')
+
+compare = {}
+for key in sorted(new.keys()):
+ if key in old:
+ diff = new[key] - old[key]
+ compare[key] = diff
+
+sorted_compare = sorted(compare.items(), key=operator.itemgetter(1))
+for key, diff in sorted_compare:
+ print("{}\t{}".format(diff, key))
+
diff --git a/site/assets/css/css.css b/site/assets/css/css.css
index 69302409..32c7dad6 100644
--- a/site/assets/css/css.css
+++ b/site/assets/css/css.css
@@ -485,6 +485,7 @@ section.fullwidth .image {
/* about page */
+
.flex-container {
padding: 0;
margin: 0;
@@ -500,6 +501,9 @@ section.fullwidth .image {
justify-content: space-around;
}
+.team-photos-container{
+ margin-top:40px;
+}
.team-member {
height: auto;
margin-top: 10px;
@@ -508,6 +512,9 @@ section.fullwidth .image {
font-weight: bold;
flex-grow: 1;
margin:0 40px 0 0;
+ padding:20px;
+ border-radius:6px;
+ background: #202020;
}
.team-member&:last-child{
margin:0 0 0 40px;
diff --git a/site/content/pages/about/credits.md b/site/content/pages/about/credits.md
index 04ecdf9c..bc2283fd 100644
--- a/site/content/pages/about/credits.md
+++ b/site/content/pages/about/credits.md
@@ -27,15 +27,20 @@ authors: Adam Harvey
#### Team
-- Research, concept: Adam Harvey
-- Site development, visualizations: Jules LaPlace
-- Assistant researcher: Berit Gilma
+- Research and image analysis: Adam Harvey
+- Development and visualizations: Jules LaPlace
- Produced in Partnership with Mozilla
+- Contributing researchers: Berit Gilma, Mathana Stender
#### Code
- This site uses D3.js, C3.js, and ThreeJS for visualizations.
- Data aggregation uses Pandas and PDFMiner.Six.
#### Data
diff --git a/site/content/pages/about/index.md b/site/content/pages/about/index.md
index ffbd4541..3dd14bfe 100644
--- a/site/content/pages/about/index.md
+++ b/site/content/pages/about/index.md
@@ -26,11 +26,15 @@ authors: Adam Harvey
(PAGE UNDER DEVELOPMENT)
-<p><div style="font-size:20px;line-height:36px">MegaPixels is art and research by <a href="https://ahprojects.com">Adam Harvey</a> about facial recognition datasets that aims to unravel their histories, futures, geographies, and meanings. Throughout 2019 this site, coded by Jules LaPlace, will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.</div></p>
+<p><div style="font-size:20px;line-height:36px">Ever since government agencies began researching face recognition in the early 1960s, datasets of face images have been central to technological advancements. Today, these datasets no longer originate in labs, but instead come from family photo albums posted on photo sharing sites, surveillance cameras on college campuses, search engine queries for celebrities, cafe livestreams, or <a href="https://www.theverge.com/2017/8/22/16180080/transgender-youtubers-ai-facial-recognition-dataset">personal videos</a> posted on YouTube. Collectively, facial recognition datasets are now gathered "in the wild".</div></p>
+
+<p>MegaPixels is art and research by <a href="https://ahprojects.com">Adam Harvey</a> about facial recognition datasets that unravels their histories, futures, geographies, and meanings. Throughout 2019 this site will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.</p>
+
+Over the last year, hundreds of these facial analysis datasets created "in the wild" have been collected and analyzed to understand how they contribute to the global supply chain of biometric data powering the facial recognition industry.
The MegaPixels website is produced in partnership with [Mozilla](https://mozilla.org).
-<div class="flex-container">
+<div class="flex-container team-photos-container">
<div class="team-member">
<img src="https://nyc3.digitaloceanspaces.com/megapixels/v1/site/about/assets/adam-harvey-3d.jpg" />
<h3>Adam Harvey</h3>
diff --git a/site/public/about/credits/index.html b/site/public/about/credits/index.html
index 8a079a66..b4c17c4d 100644
--- a/site/public/about/credits/index.html
+++ b/site/public/about/credits/index.html
@@ -38,15 +38,27 @@
</ul>
</section><h4>Team</h4>
<ul>
-<li>Research, concept: Adam Harvey</li>
-<li>Site development, visualizations: Jules LaPlace</li>
-<li>Assistant researcher: Berit Gilma</li>
+<li>Research and image analysis: Adam Harvey</li>
+<li>Development and visualizations: Jules LaPlace</li>
<li>Produced in Partnership with Mozilla</li>
+<li>Contributing researchers: Berit Gilma, Mathana Stender</li>
</ul>
<h4>Code</h4>
<ul>
<li>This site uses D3.js, C3.js, and ThreeJS for visualizations.</li>
<li>Data aggregation uses Pandas and PDFMiner.Six.</li>
</ul>
<h4>Data</h4>
<ul>
diff --git a/site/public/about/index.html b/site/public/about/index.html
index 19d91e3f..694f7ec9 100644
--- a/site/public/about/index.html
+++ b/site/public/about/index.html
@@ -37,8 +37,9 @@
<li><a href="/about/privacy/">Privacy Policy</a></li>
</ul>
</section><p>(PAGE UNDER DEVELOPMENT)</p>
-<p><div style="font-size:20px;line-height:36px">MegaPixels is art and research by <a href="https://ahprojects.com">Adam Harvey</a> about facial recognition datasets that aims to unravel their histories, futures, geographies, and meanings. Throughout 2019 this site, coded by Jules LaPlace, will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.</div></p><p>The MegaPixels website is produced in partnership with <a href="https://mozilla.org">Mozilla</a>.</p>
-<div class="flex-container">
+<p><div style="font-size:20px;line-height:36px">Ever since government agencies began researching face recognition in the early 1960s, datasets of face images have been central to technological advancements. Today, these datasets no longer originate in labs, but instead come from family photo albums posted on photo sharing sites, surveillance cameras on college campuses, search engine queries for celebrities, cafe livestreams, or <a href="https://www.theverge.com/2017/8/22/16180080/transgender-youtubers-ai-facial-recognition-dataset">personal videos</a> posted on YouTube. Collectively, facial recognition datasets are now gathered "in the wild".</div></p><p>MegaPixels is art and research by <a href="https://ahprojects.com">Adam Harvey</a> about facial recognition datasets that unravels their histories, futures, geographies, and meanings. Throughout 2019 this site will publish research reports, visualizations, raw data, and interactive tools to explore how publicly available facial recognition datasets contribute to a global supply chain of biometric data that powers the global facial recognition industry.</p><p>Over the last year, hundreds of these facial analysis datasets created "in the wild" have been collected and analyzed to understand how they contribute to the global supply chain of biometric data powering the facial recognition industry.</p>
+<p>The MegaPixels website is produced in partnership with <a href="https://mozilla.org">Mozilla</a>.</p>
+<div class="flex-container team-photos-container">
<div class="team-member">
<img src="https://nyc3.digitaloceanspaces.com/megapixels/v1/site/about/assets/adam-harvey-3d.jpg" />
<h3>Adam Harvey</h3>
diff --git a/todo.md b/todo.md
index 899552de..420945ef 100644
--- a/todo.md
+++ b/todo.md
@@ -25,9 +25,11 @@
## About
- AH: update bio images
-- JL: add underline/active state to the subnavigation
- awaiting Mozilla response for their text
+## About/Press
+
+
## Research
- on hold until closer to FT launch date