Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r--  megapixels/commands/datasets/download_ibmdif.py    98
-rw-r--r--  megapixels/commands/datasets/download_images.py    82
-rw-r--r--  megapixels/commands/datasets/ijb_skin_color.py     32
-rw-r--r--  megapixels/commands/datasets/pull_spreadsheet.py  124
4 files changed, 336 insertions(+), 0 deletions(-)
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
new file mode 100644
index 00000000..48aca5f0
--- /dev/null
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -0,0 +1,98 @@
+import click
+
+fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.txt'
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output path')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+    help='Number of threads')
+@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+    """Threaded image/file downloader.
+
+    CSV should be formatted as:
+
+    |url|filepath|
+    |---|---|
+    |https://site.com/photo.jpg|myfolder/myname.jpg|
+
+    Failed downloads get a `<filepath>_error.txt` marker file so they are
+    skipped on subsequent runs.
+    """
+
+    from os.path import join
+    from pathlib import Path
+    from multiprocessing.dummy import Pool as ThreadPool
+    import urllib.request
+    from random import randint
+
+    import pandas as pd
+    from tqdm import tqdm
+    from app.utils.logger_utils import Logger
+
+    log = Logger.getLogger()
+
+    url_prefix = 'https://dataviz.nbcnews.com/projects/20190306-ibm-flickr-usernames/data/'
+
+    # load the user-agent pool from the --agents file
+    with open(opt_fp_agents, 'r') as fp:
+        user_agents = [x.strip() for x in fp.readlines()]
+
+    # threaded worker: download one file, mark failures with a sidecar file
+    def pool_process(item):
+        fp_out = item['filepath']
+        try:
+            opener = urllib.request.build_opener()
+            opener.addheaders = [('User-agent', item['user_agent'])]
+            urllib.request.install_opener(opener)
+            urllib.request.urlretrieve(item['url'], fp_out)
+            item['status'] = True
+        except Exception as e:
+            if str(e) != 'HTTP Error 403: Forbidden':
+                log.debug(f'Error: {e}')
+            fp_error = f'{fp_out}_error.txt'
+            with open(fp_error, 'w') as fp:
+                fp.write('')
+            item['status'] = False
+        pbar.update(1)
+        return item
+
+    # build the work list, skipping files already downloaded or errored
+    log.debug(f'loading {opt_fp_in}')
+    records = pd.read_csv(opt_fp_in).to_dict('records')
+
+    pool_items = []
+    for x in tqdm(records):
+        fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+        fp_dst_is_file = Path(fp_dst).is_file()
+        fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+        if not fp_dst_is_file and not fp_dst_is_err:
+            url = url_prefix + x['sha256'] + '.json'
+            user_agent = user_agents[randint(0, len(user_agents) - 1)]
+            pool_items.append({'url': url, 'filepath': fp_dst, 'user_agent': user_agent})
+
+    num_items = len(pool_items)
+    log.info(f'processing {num_items:,} items')
+
+    # the records list can be too large for RAM; free it before downloading
+    del records
+
+    # run the multithreaded download with a progress bar
+    pool = ThreadPool(opt_threads)
+    with tqdm(total=num_items) as pbar:
+        pool_results = pool.map(pool_process, pool_items)
+    pool.close()
+    pool.join()
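A note on the user-agent handling above: `urllib.request.install_opener()` replaces a process-global opener, so concurrent threads can race on which User-Agent is active. A minimal per-request alternative (a sketch, not part of this commit; names are illustrative):

    # Sketch: set the User-Agent on each request instead of installing a
    # process-global opener; safe to call from multiple threads.
    import urllib.request
    from random import choice

    def fetch(url, fp_out, user_agents):
        req = urllib.request.Request(url, headers={'User-Agent': choice(user_agents)})
        with urllib.request.urlopen(req, timeout=30) as resp, open(fp_out, 'wb') as f:
            f.write(resp.read())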
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
new file mode 100644
index 00000000..f1519c61
--- /dev/null
+++ b/megapixels/commands/datasets/download_images.py
@@ -0,0 +1,82 @@
+import click
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+    help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Output directory')
+@click.option('-t', '--threads', 'opt_threads', default=8,
+    help='Number of threads')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+    """Threaded image downloader.
+
+    CSV should be formatted as:
+
+    |url|filepath|
+    |---|---|
+    |https://site.com/photo.jpg|myfolder/myname.jpg|
+
+    Failed downloads get a `<filepath>_error.txt` marker file so they are
+    skipped on subsequent runs.
+    """
+
+    from os.path import join
+    from pathlib import Path
+    from multiprocessing.dummy import Pool as ThreadPool
+    import urllib.request
+
+    import pandas as pd
+    from tqdm import tqdm
+    from app.utils import file_utils
+    from app.utils.logger_utils import Logger
+
+    log = Logger.getLogger()
+
+    # threaded worker: download one image, mark failures with a sidecar file
+    def pool_process(item):
+        fp_out = item['filepath']
+        try:
+            file_utils.mkdirs(item['filepath'])
+            urllib.request.urlretrieve(item['url'], fp_out)
+            item['status'] = True
+        except Exception as e:
+            log.debug(f'Error: {e}')
+            fp_error = f'{fp_out}_error.txt'
+            with open(fp_error, 'w') as fp:
+                fp.write('')
+            item['status'] = False
+        pbar.update(1)
+        return item
+
+    # build the work list, skipping files already downloaded or errored
+    log.debug(f'loading {opt_fp_in}')
+    records = pd.read_csv(opt_fp_in).to_dict('records')
+
+    pool_items = []
+    for x in tqdm(records):
+        fp_dst = join(opt_fp_out, x['filepath'])
+        fp_dst_is_file = Path(fp_dst).is_file()
+        fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
+        if not fp_dst_is_file and not fp_dst_is_err:
+            pool_items.append({'url': x['url'], 'filepath': fp_dst})
+
+    num_items = len(pool_items)
+    log.info(f'processing {num_items:,} items')
+
+    # run the multithreaded download with a progress bar
+    pool = ThreadPool(opt_threads)
+    with tqdm(total=num_items) as pbar:
+        pool_results = pool.map(pool_process, pool_items)
+    pool.close()
+    pool.join()
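For reference, a minimal script that produces an input CSV in the shape both downloaders expect (the URLs and filenames here are illustrative):

    # Build the two-column CSV (url, filepath) the downloader reads.
    import pandas as pd

    rows = [
        {'url': 'https://site.com/photo.jpg', 'filepath': 'myfolder/myname.jpg'},
        {'url': 'https://site.com/photo2.jpg', 'filepath': 'myfolder/myname2.jpg'},
    ]
    pd.DataFrame(rows).to_csv('downloads.csv', index=False)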
diff --git a/megapixels/commands/datasets/ijb_skin_color.py b/megapixels/commands/datasets/ijb_skin_color.py
new file mode 100644
index 00000000..bf3a6d5d
--- /dev/null
+++ b/megapixels/commands/datasets/ijb_skin_color.py
@@ -0,0 +1,32 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in')
+@click.option('-o', '--output', 'opt_fp_out')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+    """Measure skin color in IJB-C (stub)"""
+
+    import sys
+    from glob import glob
+    from os.path import join
+    from pathlib import Path
+    import time
+
+    import pandas as pd
+    import cv2 as cv
+    from tqdm import tqdm
+
+    from app.utils import file_utils, im_utils
+    from app.models.data_store import DataStore
+
+    log.info('IJBC Skin Color')
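The command body is still a stub. One plausible direction for the measurement, not the author's method, is a mean value over a face crop in a perceptual color space:

    # Sketch: mean CIELAB color of an image region (OpenCV loads BGR).
    import cv2 as cv

    def mean_lab(fp_im):
        im = cv.imread(fp_im)
        if im is None:
            raise FileNotFoundError(fp_im)
        lab = cv.cvtColor(im, cv.COLOR_BGR2LAB)
        return lab.reshape(-1, 3).mean(axis=0)  # (L, a, b) channel means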
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import click
+import gspread
+from os.path import join
+from pathlib import Path
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.utils import file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal']
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+    type=click.Choice(opt_sheets),
+    default=['datasets'],
+    help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+    help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+    help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+    help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+    """Fetch Google spreadsheet"""
+
+    import pandas as pd
+
+    if opt_all:
+        opt_spreadsheets = opt_sheets
+
+    for sheet_name in opt_spreadsheets:
+        log.info(f'Get spreadsheet: {sheet_name}')
+        sheet_data = fetch_google_sheet_objects(name=sheet_name)
+        df_sheet = pd.DataFrame.from_dict(sheet_data)
+        if sheet_name == 'datasets':
+            df_sheet = clean_datasets_sheet_ft(df_sheet)
+
+        # resolve the output path: a directory gets one CSV per sheet
+        fpp_out = Path(opt_fp_out)
+        if opt_all and fpp_out.suffix == '.csv':
+            # --all writes multiple files; fall back to the parent directory
+            fpp_out = fpp_out.parent
+        file_utils.mkdirs(fpp_out)
+        if fpp_out.suffix != '.csv':
+            fpp_out = Path(join(str(fpp_out), f'{sheet_name}.csv'))
+
+        if fpp_out.is_file() and not opt_force:
+            log.warning(f'{fpp_out} exists. Use -f/--force to overwrite')
+            continue
+        df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+    # clean data for FT
+    df = df[df['ft_share'] == 'Y']
+    keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+    keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+    return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+    # clean data for NYT (currently mirrors the FT version)
+    df = df[df['ft_share'] == 'Y']
+    keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+    keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+    return df[keys]
+
+def fetch_spreadsheet():
+    """Open the Google Spreadsheet, which contains the individual worksheets"""
+    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
+    fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+    credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+    docid = '1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc'
+    client = gspread.authorize(credentials)
+    spreadsheet = client.open_by_key(docid)
+    return spreadsheet
+
+def fetch_worksheet(name='institutions'):
+    """Get a reference to a particular worksheet from the Google Spreadsheet"""
+    spreadsheet = fetch_spreadsheet()
+    return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name='institutions'):
+    """Get all the values from a particular worksheet as a list of lists.
+
+    Returns:
+        keys: the first row of the document
+        lines: a list of lists with the rest of the rows
+    """
+    rows = fetch_worksheet(name).get_all_values()
+    keys = rows[0]
+    lines = rows[1:]
+    return keys, lines
+
+def fetch_google_sheet_objects(name):
+    """Get all the values from a worksheet as a list of dictionaries"""
+    keys, rows = fetch_google_sheet(name)
+    recs = []
+    for row in rows:
+        rec = {key: row[index] for index, key in enumerate(keys)}
+        recs.append(rec)
+    return recs
+
+def fetch_google_lookup(name, item_key='key'):
+    """Get all the values from a worksheet as a dictionary of dictionaries.
+
+    Specify which field you want to use as the dictionary key.
+    """
+    keys, rows = fetch_google_sheet(name)
+    lookup = {}
+    for row in rows:
+        rec = {key: row[index] for index, key in enumerate(keys)}
+        lookup[rec[item_key]] = rec
+    return lookup
\ No newline at end of file
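Usage of these helpers outside the CLI, assuming the service-account JSON hard-coded in `fetch_spreadsheet()` is present (the sheet name and key are illustrative):

    # Fetch one worksheet as a list of records, then again keyed by 'key'.
    recs = fetch_google_sheet_objects(name='datasets')
    lookup = fetch_google_lookup('datasets', item_key='key')
    print(len(recs), list(lookup)[:3])  # row count and first few keys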
