megapixels/commands/datasets/pull_spreadsheet.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

import os
import click
import re
import os
import csv
import string
import codecs
import gspread
from os.path import join
from pathlib import Path
from multiprocessing import Pool
import simplejson as json
from oauth2client.service_account import ServiceAccountCredentials

from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils, file_utils
from app.settings import app_cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal', ]

@click.command()
@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
  type=click.Choice(opt_sheets), 
  default=['datasets'],
  help='Spreadsheet name')
@click.option('--all', 'opt_all', is_flag=True,
  help='Get all sheets')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Path to directory or filename')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite')
@click.pass_context
def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
  """Fetch Google spreadsheet"""
  
  import sys
  import pandas as pd
  from tqdm import tqdm

  log = Logger.getLogger()
  if opt_all:
    opt_spreadsheets = opt_sheets
  
  for sheet_name in opt_spreadsheets:
    log.info(f'Get spreadsheet: {sheet_name}')
    sheet_data = fetch_google_sheet_objects(name=sheet_name)
    df_sheet = pd.DataFrame.from_dict(sheet_data)
    if sheet_name == 'datasets':
      df_sheet = clean_datasets_sheet_ft(df_sheet)
    fpp_out = Path(opt_fp_out)
    file_utils.mkdirs(fpp_out)

    if opt_all and fpp_out.is_file():
      fpp_out = fpp_out.parent
    else:
      fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
    df_sheet.to_csv(fpp_out)


def clean_datasets_sheet_ft(df):
  # clean data for FT
  df = df[df['ft_share'] == 'Y']
  keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
  return df[keys]

def clean_datasets_sheet_nyt(df):
  # clean data for FT
  df = df[df['ft_share'] == 'Y']
  keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
  keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces', 'youtube', 'flickr', 'google', 'bing', 'comment']
  return df[keys]

def fetch_spreadsheet():
  """Open the Google Spreadsheet, which contains the individual worksheets"""
  scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
  fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
  credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
  docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
  client = gspread.authorize(credentials)
  spreadsheet = client.open_by_key(docid)
  return spreadsheet

def fetch_worksheet(name="institutions"):
  """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
  spreadsheet = fetch_spreadsheet()
  return spreadsheet.worksheet(name)

def fetch_google_sheet(name="institutions"):
  """Get all the values from a particular worksheet as a list of lists.
  Returns:
  :keys - the first row of the document
  :lines - a list of lists with the rest of the rows"""
  rows = fetch_worksheet(name).get_all_values()
  keys = rows[0]
  lines = rows[1:]
  return keys, lines

def fetch_google_sheet_objects(name):
  """Get all the values from a worksheet as a list of dictionaries"""
  keys, rows = fetch_google_sheet(name)
  recs = []
  for row in rows:
    rec = {}
    for index, key in enumerate(keys):
      rec[key] = row[index]
    recs.append(rec)
  return recs

def fetch_google_lookup(name, item_key='key'):
  """Get all the values from a worksheet as a dictionary of dictionaries.
  Specify which field you want to use as the dictionary key."""
  keys, rows = fetch_google_sheet(name)
  lookup = {}
  for row in rows:
    rec = {}
    for index, key in enumerate(keys):
      rec[key] = row[index]
    lookup[rec[item_key]] = rec
  return lookup