summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/pull_spreadsheet.py
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-03-13 17:47:58 +0100
committeradamhrv <adam@ahprojects.com>2019-03-13 17:47:58 +0100
commitb2dcfb3ef79557b7fccfdd94aa8ac4991552d7e1 (patch)
tree1571e1d9d47ffe63b93670ae5870faca29b1bd3f /megapixels/commands/datasets/pull_spreadsheet.py
parent1fe0e5c79c3cbc4b13083116980e62b449866100 (diff)
add downloader
Diffstat (limited to 'megapixels/commands/datasets/pull_spreadsheet.py')
-rw-r--r--megapixels/commands/datasets/pull_spreadsheet.py124
1 files changed, 124 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/pull_spreadsheet.py b/megapixels/commands/datasets/pull_spreadsheet.py
new file mode 100644
index 00000000..0094ea59
--- /dev/null
+++ b/megapixels/commands/datasets/pull_spreadsheet.py
@@ -0,0 +1,124 @@
+import os
+import click
+import re
+import os
+import csv
+import string
+import codecs
+import gspread
+from os.path import join
+from pathlib import Path
+from multiprocessing import Pool
+import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils, file_utils
+from app.settings import app_cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]
+
+@click.command()
+@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
+ type=click.Choice(opt_sheets),
+ default=['datasets'],
+ help='Spreadsheet name')
+@click.option('--all', 'opt_all', is_flag=True,
+ help='Get all sheets')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Path to directory or filename')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.pass_context
+def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
+ """Fetch Google spreadsheet"""
+
+ import sys
+ import pandas as pd
+ from tqdm import tqdm
+
+ log = Logger.getLogger()
+ if opt_all:
+ opt_spreadsheets = opt_sheets
+
+ for sheet_name in opt_spreadsheets:
+ log.info(f'Get spreadsheet: {sheet_name}')
+ sheet_data = fetch_google_sheet_objects(name=sheet_name)
+ df_sheet = pd.DataFrame.from_dict(sheet_data)
+ if sheet_name == 'datasets':
+ df_sheet = clean_datasets_sheet_ft(df_sheet)
+ fpp_out = Path(opt_fp_out)
+ file_utils.mkdirs(fpp_out)
+
+ if opt_all and fpp_out.is_file():
+ fpp_out = fpp_out.parent
+ else:
+ fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
+ df_sheet.to_csv(fpp_out)
+
+
+def clean_datasets_sheet_ft(df):
+ # clean data for FT
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def clean_datasets_sheet_nyt(df):
+ # clean data for FT
+ df = df[df['ft_share'] == 'Y']
+ keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
+ keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
+ return df[keys]
+
+def fetch_spreadsheet():
+ """Open the Google Spreadsheet, which contains the individual worksheets"""
+ scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+ fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
+ credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
+ docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
+ client = gspread.authorize(credentials)
+ spreadsheet = client.open_by_key(docid)
+ return spreadsheet
+
+def fetch_worksheet(name="institutions"):
+ """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
+ spreadsheet = fetch_spreadsheet()
+ return spreadsheet.worksheet(name)
+
+def fetch_google_sheet(name="institutions"):
+ """Get all the values from a particular worksheet as a list of lists.
+ Returns:
+ :keys - the first row of the document
+ :lines - a list of lists with the rest of the rows"""
+ rows = fetch_worksheet(name).get_all_values()
+ keys = rows[0]
+ lines = rows[1:]
+ return keys, lines
+
+def fetch_google_sheet_objects(name):
+ """Get all the values from a worksheet as a list of dictionaries"""
+ keys, rows = fetch_google_sheet(name)
+ recs = []
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ recs.append(rec)
+ return recs
+
+def fetch_google_lookup(name, item_key='key'):
+ """Get all the values from a worksheet as a dictionary of dictionaries.
+ Specify which field you want to use as the dictionary key."""
+ keys, rows = fetch_google_sheet(name)
+ lookup = {}
+ for row in rows:
+ rec = {}
+ for index, key in enumerate(keys):
+ rec[key] = row[index]
+ lookup[rec[item_key]] = rec
+ return lookup \ No newline at end of file