1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
import os
import click
import re
import os
import csv
import string
import codecs
import gspread
from os.path import join
from pathlib import Path
from multiprocessing import Pool
import simplejson as json
from oauth2client.service_account import ServiceAccountCredentials
from app.settings import types
from app.models.dataset import Dataset
from app.utils import click_utils, file_utils
from app.settings import app_cfg
from app.utils.logger_utils import Logger
log = Logger.getLogger()
opt_sheets = ['datasets', 'relationships', 'funding', 'references', 'sources', 'tags', 'citations', 'legal']
@click.command()
@click.option('-n', '--name', 'opt_spreadsheets', multiple=True,
type=click.Choice(opt_sheets),
default=['datasets'],
help='Spreadsheet name')
@click.option('--all', 'opt_all', is_flag=True,
help='Get all sheets')
@click.option('-o', '--output', 'opt_fp_out', required=True,
help='Path to directory or filename')
@click.option('-f', '--force', 'opt_force', is_flag=True,
help='Force overwrite')
@click.pass_context
def cli(ctx, opt_spreadsheets, opt_fp_out, opt_all, opt_force):
"""Fetch Google spreadsheet"""
import sys
import pandas as pd
from tqdm import tqdm
log = Logger.getLogger()
if opt_all:
opt_spreadsheets = opt_sheets
for sheet_name in opt_spreadsheets:
log.info(f'Get spreadsheet: {sheet_name}')
sheet_data = fetch_google_sheet_objects(name=sheet_name)
df_sheet = pd.DataFrame.from_dict(sheet_data)
if sheet_name == 'datasets':
df_sheet = clean_datasets_sheet_ft(df_sheet)
fpp_out = Path(opt_fp_out)
file_utils.mkdirs(fpp_out)
if opt_all and fpp_out.is_file():
fpp_out = fpp_out.parent
else:
fpp_out = join(opt_fp_out, f'{sheet_name}.csv')
df_sheet.to_csv(fpp_out)
def clean_datasets_sheet_ft(df):
# clean data for FT
df = df[df['ft_share'] == 'Y']
keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
return df[keys]
def clean_datasets_sheet_nyt(df):
# clean data for FT
df = df[df['ft_share'] == 'Y']
keys = ['key', 'name_short', 'name_full', 'url', 'downloaded', 'purpose', 'wild']
keys += ['campus', 'year_start', 'year_end', 'year_published', 'images', 'videos', 'identities', 'faces_or_persons', 'youtube', 'flickr', 'google', 'bing', 'comment']
return df[keys]
def fetch_spreadsheet():
"""Open the Google Spreadsheet, which contains the individual worksheets"""
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
client = gspread.authorize(credentials)
spreadsheet = client.open_by_key(docid)
return spreadsheet
def fetch_worksheet(name="institutions"):
"""Get a reference to a particular "worksheet" from the Google Spreadsheet"""
spreadsheet = fetch_spreadsheet()
return spreadsheet.worksheet(name)
def fetch_google_sheet(name="institutions"):
"""Get all the values from a particular worksheet as a list of lists.
Returns:
:keys - the first row of the document
:lines - a list of lists with the rest of the rows"""
rows = fetch_worksheet(name).get_all_values()
keys = rows[0]
lines = rows[1:]
return keys, lines
def fetch_google_sheet_objects(name):
"""Get all the values from a worksheet as a list of dictionaries"""
keys, rows = fetch_google_sheet(name)
recs = []
for row in rows:
rec = {}
for index, key in enumerate(keys):
rec[key] = row[index]
recs.append(rec)
return recs
def fetch_google_lookup(name, item_key='key'):
"""Get all the values from a worksheet as a dictionary of dictionaries.
Specify which field you want to use as the dictionary key."""
keys, rows = fetch_google_sheet(name)
lookup = {}
for row in rows:
rec = {}
for index, key in enumerate(keys):
rec[key] = row[index]
lookup[rec[item_key]] = rec
return lookup
|