author     Jules Laplace <julescarbon@gmail.com>  2019-03-14 02:50:24 +0100
committer  Jules Laplace <julescarbon@gmail.com>  2019-03-14 02:50:24 +0100
commit     c826165945096a90902bb7a31db72eb0670ab388 (patch)
tree       7d4fd76f427468b8be3148c6b8fe4c70ec1d718a /megapixels
parent     8df493f6f9d18acfe5919cf257c2da0d2b30ab7a (diff)
parent     26646e6adf3833f6282e9515c14ad61e485440c0 (diff)
rebuild and add compare-csv-counts.py script
Diffstat (limited to 'megapixels')
-rw-r--r--  megapixels/app/settings/app_cfg.py                 5
-rw-r--r--  megapixels/app/site/parser.py                      3
-rw-r--r--  megapixels/commands/datasets/download_ibmdif.py   98
-rw-r--r--  megapixels/commands/datasets/download_images.py   82
-rw-r--r--  megapixels/commands/datasets/ijb_skin_color.py    32
-rw-r--r--  megapixels/commands/datasets/pull_spreadsheet.py  124
6 files changed, 344 insertions, 0 deletions
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 891ab503..f6d0a7df 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -175,3 +175,8 @@ DIR_SITE_FINAL_CITATIONS = "../site/datasets/final/"
# -----------------------------------------------------------------------------
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+
+# -----------------------------------------------------------------------------
+# Build settings
+# -----------------------------------------------------------------------------
+BUILD_RESEARCH = False
\ No newline at end of file
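Note: BUILD_RESEARCH is not read anywhere in this commit. A minimal sketch of how a build step might consume it (build_site and write_output are hypothetical names; only cfg.BUILD_RESEARCH and parse_research_index come from this repo):

    from app.settings import app_cfg as cfg
    from app.site import parser

    def build_site(research_posts):
        # skip rendering the research index unless the flag is enabled
        if cfg.BUILD_RESEARCH:
            html = parser.parse_research_index(research_posts)
            write_output('research/index.html', html)  # hypothetical writer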
diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py
index 79093bc7..00470e4b 100644
--- a/megapixels/app/site/parser.py
+++ b/megapixels/app/site/parser.py
@@ -292,6 +292,9 @@ def parse_research_index(research_posts):
content = "<div class='research_index'>"
for post in research_posts:
print(post)
+ if 'path' not in post:
+ print("No path attribute for post")
+ return ""
s3_path = s3.make_s3_path(cfg.S3_SITE_PATH, post['path'])
if 'image' in post:
post_image = s3_path + post['image']
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..48aca5f0
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,98 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+  """Threaded image/file downloader"""
+
+  # The input CSV must contain a `sha256` column; the download URL and
+  # output filepath are derived as <url_prefix><sha256>.json below.
+  # Failed downloads get an empty <filepath>_error.txt marker so
+  # re-runs skip them.
+
+  from os.path import join
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+  from random import randint
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
+  with open(opt_fp_agents, 'r') as fp:
+    user_agents = [line.strip() for line in fp.readlines()]
+
+
+ # setup multithreading function
+ def pool_process(item):
+ # threaded function
+ fp_out = item['filepath']
+ try:
+ # download image
+ opener = urllib.request.build_opener()
+ opener.addheaders = [('User-agent', item['user_agent'])]
+ urllib.request.install_opener(opener)
+ urllib.request.urlretrieve(item['url'], fp_out)
+ item['status'] = True
+ except Exception as e:
+ if str(e) != 'HTTP Error 403: Forbidden':
+ log.debug(f'Error: {e}')
+ fp_error = f'{fp_out}_error.txt'
+ with open(fp_error, 'w') as fp:
+ fp.write('')
+ item['status'] = False
+ pbar.update(1)
+ return item
+
+ # setup multithreading data holders
+ log.debug(f'loading {opt_fp_in}')
+ records = pd.read_csv(opt_fp_in).to_dict('records')
+
+ pool_items = []
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ url = url_prefix + x['sha256'] + '.json'
+      user_agent = user_agents[randint(0, len(user_agents) - 1)]
+ pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # free memory; the record list can be large
+  del records
+
+  # run the thread pool with a progress bar
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+
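The input CSV for this command only needs the `sha256` column (the url/filepath pair is derived internally). A minimal sketch of preparing one, with placeholder hashes:

    import pandas as pd

    # one row per metadata file; the command builds
    # <url_prefix><sha256>.json and <output>/<sha256>.json from this
    df = pd.DataFrame({'sha256': ['0' * 64, '1' * 64]})  # placeholder hashes
    df.to_csv('ibm_dif_sha256.csv', index=False)  # filename is an example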
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..f1519c61
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,82 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+ help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+  """Threaded image downloader"""
+
+  # CSV should be formatted as
+  #
+  #   |url|filepath|
+  #   |---|---|
+  #   |https://site.com/photo.jpg|myfolder/myname.jpg|
+  #
+  # Failed downloads get an empty <filepath>_error.txt marker so
+  # re-runs skip them.
+
+  from os.path import join
+  from pathlib import Path
+  from multiprocessing.dummy import Pool as ThreadPool
+  import urllib.request
+
+ import pandas as pd
+ from tqdm import tqdm
+ from app.utils import file_utils
+ from app.utils.logger_utils import Logger
+
+ log = Logger.getLogger()
+
+ # setup multithreading function
+ def pool_process(item):
+ # threaded function
+ fp_out = item['filepath']
+ try:
+ # download image
+ file_utils.mkdirs(item['filepath'])
+ urllib.request.urlretrieve(item['url'], fp_out)
+ item['status'] = True
+ except Exception as e:
+ log.debug(f'Error: {e}')
+ fp_error = f'{fp_out}_error.txt'
+ with open(fp_error, 'w') as fp:
+ fp.write('')
+ item['status'] = False
+ pbar.update(1)
+ return item
+
+  # setup multithreading data holders
+ log.debug(f'loading {opt_fp_in}')
+ records = pd.read_csv(opt_fp_in).to_dict('records')
+
+
+ pool_items = []
+ for x in tqdm(records):
+ fp_dst = join(opt_fp_out, x['filepath'])
+ fp_dst_is_file = Path(fp_dst).is_file()
+ fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+ if not fp_dst_is_file and not fp_dst_is_err:
+ pool_items.append({'url':x['url'], 'filepath': fp_dst})
+
+ num_items = len(pool_items)
+ log.info(f'processing {num_items:,} items')
+ pool_results = []
+
+  # run the thread pool with a progress bar
+  pool = ThreadPool(opt_threads)
+  with tqdm(total=num_items) as pbar:
+    pool_results = pool.map(pool_process, pool_items)
+  pool.close()
+  pool.join()
+
+
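Both downloaders use the same resume convention: an empty <filepath>_error.txt marker means the URL already failed once, and the row is skipped on the next run. A small sketch for retrying failures by clearing the markers (the output directory name is an example):

    from pathlib import Path

    # delete error markers so the next run retries those downloads
    for marker in Path('downloads').rglob('*_error.txt'):
        marker.unlink()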
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Measure skin color IJB-C"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+ log.info('IJBC Skin Color')
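The command body is still a stub. A minimal sketch of one way the measurement could work, converting a face crop to CIE LAB and averaging the channels; this is an assumption for illustration, not the project's confirmed method:

    import cv2 as cv

    def mean_face_lab(fp_image, bbox):
        # bbox = (x, y, w, h) in pixels; hypothetical input format
        im = cv.imread(fp_image)
        x, y, w, h = bbox
        crop = im[y:y + h, x:x + w]
        lab = cv.cvtColor(crop, cv.COLOR_BGR2LAB)  # 8-bit LAB, L scaled to 0..255
        return lab.reshape(-1, 3).mean(axis=0)     # mean (L, a, b)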
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import os
+import click
+import re
+import csv
+import string
+import codecs
+import gspread
+from os.path import join
+from pathlib import Path
+from multiprocessing import Pool
+import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils, file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+ type=click.Choice(opt_sheets),
+ default=['datasets'],
+ help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+ help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+ """Fetch Google spreadsheet"""
+
+ import sys
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ if opt_all:
+ opt_spreadsheets = opt_sheets
+
+ for sheet_name in opt_spreadsheets:
+ log.info(f'Get spreadsheet: {sheet_name}')
+ sheet_data = fetch_google_sheet_objects(name=sheet_name)
+ df_sheet = pd.DataFrame.from_dict(sheet_data)
+ if sheet_name == 'datasets':
+ df_sheet = clean_datasets_sheet_ft(df_sheet)
+    fpp_out = Path(opt_fp_out)
+    file_utils.mkdirs(fpp_out)
+
+    if opt_all and fpp_out.is_file():
+      # --all with a filename: write each sheet next to that file
+      fpp_out = join(str(fpp_out.parent), f'{sheet_name}.csv')
+    else:
+      fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
+    df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+ # clean data for FT
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+  # clean data for NYT (currently identical to the FT selection)
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def fetch_spreadsheet():
+ """Open the Google Spreadsheet, which contains the individual worksheets"""
+ scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+ fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+ credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+ docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+ client = gspread.authorize(credentials)
+ spreadsheet = client.open_by_key(docid)
+ return spreadsheet
+
+def fetch_worksheet(name="institutions"):
+ """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
+ spreadsheet = fetch_spreadsheet()
+ return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name="institutions"):
+ """Get all the values from a particular worksheet as a list of lists.
+ Returns:
+ :keys - the first row of the document
+ :lines - a list of lists with the rest of the rows"""
+ rows = fetch_worksheet(name).get_all_values()
+ keys = rows[0]
+ lines = rows[1:]
+ return keys, lines
+
+def fetch_google_sheet_objects(name):
+ """Get all the values from a worksheet as a list of dictionaries"""
+ keys, rows = fetch_google_sheet(name)
+ recs = []
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ recs.append(rec)
+ return recs
+
+def fetch_google_lookup(name, item_key='key'):
+ """Get all the values from a worksheet as a dictionary of dictionaries.
+ Specify which field you want to use as the dictionary key."""
+ keys, rows = fetch_google_sheet(name)
+ lookup = {}
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ lookup[rec[item_key]] = rec
+  return lookup
\ No newline at end of file
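A short usage sketch for the fetch helpers above (the import path depends on how the commands package is wired, and the row key shown is a placeholder):

    from commands.datasets.pull_spreadsheet import (
        fetch_google_sheet_objects, fetch_google_lookup)

    # list of dicts, one per spreadsheet row
    rows = fetch_google_sheet_objects('datasets')

    # dict of dicts keyed on the 'key' column
    lookup = fetch_google_lookup('datasets', item_key='key')
    rec = lookup['some_dataset_key']  # placeholder key
    print(rec['name_short'], rec['year_published'])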