import os import click import re import os import csv import string import codecs import gspread from os.path import join from pathlib import Path from multiprocessing import Pool import simplejson as json from oauth2client.service_account import ServiceAccountCredentials from app.settings import types from app.models.dataset import Dataset from app.utils import click_utils, file_utils from app.settings import app_cfg from app.utils.logger_utils import Logger log = Logger.getLogger() opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal'] dataset_sheet_keys = ['key', 'name_short', 'name_full', 'url', 'dl_im', 'purpose', 'funded_by', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'campus', 'youtube', 'flickr', 'google', 'bing', 'comment'] @click.command() @click.option('-n', '--name', 'opt_spreadsheets', multiple=True, type=click.Choice(opt_sheets), default=['datasets'], help='Spreadsheet name') @click.option('--all', 'opt_all', is_flag=True, help='Get all sheets') @click.option('-o', '--output', 'opt_fp_out', required=True, type=click.Path(file_okay=False, dir_okay=True), help='Path to directory or filename') @click.option('--share', 'opt_share', required=True, type=click.Choice(['nyt', 'ft']), help='Share filter') @click.option('-f', '--force', 'opt_force', is_flag=True, help='Force overwrite') @click.pass_context def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_share, opt_force): """Fetch Google spreadsheet""" import sys import pandas as pd from tqdm import tqdm log = Logger.getLogger() if opt_all: opt_spreadsheets = opt_sheets for sheet_name in opt_spreadsheets: log.info(f'Get spreadsheet: {sheet_name}') fp_out = join(opt_fp_out, f'{sheet_name}.csv') fpp_out = Path(fp_out) if fpp_out.exists() and not opt_force: log.error(f'File "{fpp_out} exists. Use "-f" to overwrite') return sheet_data = fetch_google_sheet_objects(name=sheet_name) df_sheet = pd.DataFrame.from_dict(sheet_data) if sheet_name == 'datasets': df_sheet = clean_datasets_sheet_ft(df_sheet) fpp_out = Path(opt_fp_out) file_utils.mkdirs(fpp_out) if opt_all and fpp_out.is_file(): fpp_out = fpp_out.parent else: fpp_out = join(opt_fp_out, f'{sheet_name}.csv') log.info(f'Writing file: {fpp_out}') df_sheet.to_csv(fpp_out) def clean_datasets_sheet_ft(df): # clean data for FT df = df[df['ft_share'] == 'Y'] return df[dataset_sheet_keys] def clean_datasets_sheet_nyt(df): # clean data for FT df = df[df['ft_share'] == 'Y'] return df[dataset_sheet_keys] def fetch_spreadsheet(): """Open the Google Spreadsheet, which contains the individual worksheets""" scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive'] fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json') credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope) docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc" client = gspread.authorize(credentials) spreadsheet = client.open_by_key(docid) return spreadsheet def fetch_worksheet(name="institutions"): """Get a reference to a particular "worksheet" from the Google Spreadsheet""" spreadsheet = fetch_spreadsheet() return spreadsheet.worksheet(name) def fetch_google_sheet(name="institutions"): """Get all the values from a particular worksheet as a list of lists. Returns: :keys - the first row of the document :lines - a list of lists with the rest of the rows""" rows = fetch_worksheet(name).get_all_values() keys = rows[0] lines = rows[1:] return keys, lines def fetch_google_sheet_objects(name): """Get all the values from a worksheet as a list of dictionaries""" keys, rows = fetch_google_sheet(name) recs = [] for row in rows: rec = {} for index, key in enumerate(keys): rec[key] = row[index] recs.append(rec) return recs def fetch_google_lookup(name, item_key='key'): """Get all the values from a worksheet as a dictionary of dictionaries. Specify which field you want to use as the dictionary key.""" keys, rows = fetch_google_sheet(name) lookup = {} for row in rows: rec = {} for index, key in enumerate(keys): rec[key] = row[index] lookup[rec[item_key]] = rec return lookup