summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-03-19 12:21:21 +0100
committeradamhrv <adam@ahprojects.com>2019-03-19 12:21:21 +0100
commita16b3cc7f796a5abe6c8c79f22b178785e6971f5 (patch)
treeec0a62f66479d7e7c7f0f03a5c2de79d87c51c3e /megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb
parent70f79c37278d7c47bee29cdf091bde448aae9240 (diff)
nbs for data collection
Diffstat (limited to 'megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb')
-rw-r--r--megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb624
1 files changed, 624 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb b/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb
new file mode 100644
index 00000000..183c063b
--- /dev/null
+++ b/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb
@@ -0,0 +1,624 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pull Google Spreadsheet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "import difflib\n",
+ "import unidecode\n",
+ "\n",
+ "import slugify\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import api_utils, identity_utils\n",
+ "from app.settings import app_cfg\n",
+ "from app.settings import types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## To CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add CASIA Webface\n",
+ "\n",
+ "import os\n",
+ "import click\n",
+ "import re\n",
+ "import os\n",
+ "import csv\n",
+ "import string\n",
+ "import codecs\n",
+ "import gspread\n",
+ "from os.path import join\n",
+ "from pathlib import Path\n",
+ "import simplejson as json\n",
+ "from oauth2client.service_account import ServiceAccountCredentials\n",
+ "\n",
+ "from app.settings import types\n",
+ "from app.settings import app_cfg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fetch_spreadsheet(docid=\"1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc\", fp_creds=None):\n",
+ "  \"\"\"Open the Google Spreadsheet, which contains the individual worksheets.\n",
+ "\n",
+ "  Args:\n",
+ "    docid: key of the spreadsheet to open; the default is the id this notebook has always used.\n",
+ "    fp_creds: path to the service-account JSON key file; when None, the key file under\n",
+ "      app_cfg.DIR_ROOT/scraper/.creds is used.\n",
+ "  Returns:\n",
+ "    the object returned by `client.open_by_key(docid)` for the authorized gspread client.\n",
+ "  \"\"\"\n",
+ "  scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']\n",
+ "  if fp_creds is None:\n",
+ "    # resolved at call time so app_cfg is only touched when the default path is wanted\n",
+ "    fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')\n",
+ "  credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)\n",
+ "  client = gspread.authorize(credentials)\n",
+ "  return client.open_by_key(docid)\n",
+ "\n",
+ "def fetch_worksheet(name=\"institutions\"):\n",
+ "  \"\"\"Return the worksheet called `name` from the spreadsheet opened by fetch_spreadsheet()\"\"\"\n",
+ "  return fetch_spreadsheet().worksheet(name)\n",
+ "\n",
+ "def fetch_google_sheet(name=\"institutions\"):\n",
+ "  \"\"\"Get all the values from a particular worksheet as a list of lists.\n",
+ "\n",
+ "  Returns:\n",
+ "    :keys - the first row of the document ([] when the worksheet returns no rows)\n",
+ "    :lines - a list of lists with the rest of the rows\n",
+ "  \"\"\"\n",
+ "  rows = fetch_worksheet(name).get_all_values()\n",
+ "  if not rows:\n",
+ "    # no header row to split off; rows[0] would raise IndexError here\n",
+ "    return [], []\n",
+ "  return rows[0], rows[1:]\n",
+ "\n",
+ "def fetch_google_sheet_objects(name):\n",
+ "  \"\"\"Return the rows of worksheet `name` as a list of dicts keyed by the header row\"\"\"\n",
+ "  header, body = fetch_google_sheet(name)\n",
+ "  # one dict per data row; each header cell is paired with the cell at the same position\n",
+ "  return [\n",
+ "    {column: cells[position] for position, column in enumerate(header)}\n",
+ "    for cells in body\n",
+ "  ]\n",
+ "\n",
+ "def fetch_google_lookup(name, item_key='key'):\n",
+ "  \"\"\"Return the rows of worksheet `name` as a dictionary of dictionaries.\n",
+ "  Each row dict is stored under its own value for the `item_key` column.\"\"\"\n",
+ "  header, body = fetch_google_sheet(name)\n",
+ "  by_key = {}\n",
+ "  for cells in body:\n",
+ "    record = {column: cells[position] for position, column in enumerate(header)}\n",
+ "    # a later row with the same item_key value replaces the earlier one\n",
+ "    by_key[record[item_key]] = record\n",
+ "  return by_key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sheet_datasets = fetch_google_sheet_objects(name='datasets')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sheet_stats = fetch_google_sheet_objects(name='statistics')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'key': '10k_US_adult_faces',\n",
+ " 'name': '10K US Adult Faces',\n",
+ " 'berit': 'Y',\n",
+ " 'charlie': '',\n",
+ " 'adam': '',\n",
+ " 'priority': '',\n",
+ " 'wild': 'Y',\n",
+ " 'indoor': '',\n",
+ " 'outdoor': '',\n",
+ " 'cyberspace': 'Y',\n",
+ " 'names': '',\n",
+ " 'downloaded': '',\n",
+ " 'year_start': '',\n",
+ " 'year_end': '',\n",
+ " 'year_published': '2013',\n",
+ " 'ongoing': '',\n",
+ " 'images': '10,168 ',\n",
+ " 'videos': '',\n",
+ " 'faces_unique': '10,168 ',\n",
+ " 'total_faces': '',\n",
+ " 'img_per_person': '',\n",
+ " 'num_cameras': '',\n",
+ " 'faces_persons': '',\n",
+ " 'female': '4362',\n",
+ " 'male': '5806',\n",
+ " 'landmarks': '77 ',\n",
+ " 'width': '',\n",
+ " 'height': '256',\n",
+ " 'color': '',\n",
+ " 'gray': '',\n",
+ " 'derivative_of': '',\n",
+ " 'tags': 'fr',\n",
+ " 'source': 'google',\n",
+ " 'purpose_short': 'US adult faces database',\n",
+ " 'size_gb': '',\n",
+ " 'agreement': '',\n",
+ " 'agree_requied': '',\n",
+ " 'agreement_signed': '',\n",
+ " 'comment': 'Using an online random name generator based on the 1990 U.S. Census name distribution we randomly sampled 25,000 first and last names and automatically downloaded from Google Image Search',\n",
+ " 'comment 2': '',\n",
+ " 'comment 3': '',\n",
+ " '': ''}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sheet_stats[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['', 'added_on', 'comments', 'faces', 'ft_share', 'key', 'name_full',\n",
+ " 'name_short', 'pdf_paper', 'relevance', 'subset_of', 'superset_of',\n",
+ " 'url', 'using'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build the frame in this cell: no other cell in the notebook defines df_datasets,\n",
+ "# so a fresh kernel would otherwise raise NameError here\n",
+ "df_datasets = pd.DataFrame(sheet_datasets)\n",
+ "df_datasets.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# opt_spreadsheet is not set by any other cell, so define it here.\n",
+ "# NOTE(review): 'datasets' is presumed because the 'lfw' record printed below has that\n",
+ "# worksheet's columns - confirm\n",
+ "opt_spreadsheet = 'datasets'\n",
+ "sheet = fetch_google_lookup(name=opt_spreadsheet)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'key': 'lfw', 'name_short': 'LFW', 'using': 'Y', 'ft_share': '1', 'subset_of': '', 'superset_of': '', 'name_full': 'Labeled Faces in-the-Wild', 'url': 'http://vis-www.cs.umass.edu/lfw/', 'added_on': '', 'faces': '', 'pdf_paper': 'Y', 'comments': 'many references, only included first on from 2007', '': '', 'relevance': '10'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(sheet['lfw'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "10k_US_adult_faces\n",
+ "3d_rma\n",
+ "3dddb_unconstrained\n",
+ "3dpes\n",
+ "4dfab\n",
+ "50_people_one_question\n",
+ "a_pascal_yahoo\n",
+ "aberdeen \n",
+ "adience\n",
+ "afad\n",
+ "afew_va\n",
+ "affectnet\n",
+ "aflw\n",
+ "afw\n",
+ "agedb\n",
+ "alert_airport\n",
+ "am_fed\n",
+ "apis\n",
+ "ar_facedb\n",
+ "awe_ears\n",
+ "b3d_ac\n",
+ "bbc_pose\n",
+ "berkeley_pose\n",
+ "bfm\n",
+ "bio_id\n",
+ "bjut_3d\n",
+ "bosphorus\n",
+ "bp4d_plus\n",
+ "bp4d_spontanous\n",
+ "brainwash\n",
+ "bu_3dfe\n",
+ "buhmap_db\n",
+ "cafe\n",
+ "caltech_10k_web_faces\n",
+ "caltech_faces\n",
+ "caltech_pedestrians\n",
+ "camel\n",
+ "cas_peal\n",
+ "casablanca\n",
+ "casia_webface\n",
+ "caviar4reid\n",
+ "celeba\n",
+ "celeba_plus\n",
+ "cfd\n",
+ "chalearn\n",
+ "chokepoint\n",
+ "cityscapes\n",
+ "clothing_co_parsing\n",
+ "cmdp\n",
+ "cmu_pie\n",
+ "coco\n",
+ "coco_action\n",
+ "coco_qa\n",
+ "cofw\n",
+ "cohn_kanade\n",
+ "cohn_kanade_plus\n",
+ "columbia_gaze\n",
+ "complex_activities\n",
+ "cuhk01\n",
+ "cuhk02\n",
+ "cuhk03\n",
+ "cvc_01_barcelona\n",
+ "czech_news_agency\n",
+ "d3dfacs\n",
+ "dartmouth_children\n",
+ "data_61\n",
+ "deep_fashion\n",
+ "disfa\n",
+ "distance_nighttime\n",
+ "duke_mtmc\n",
+ "emotio_net\n",
+ "eth_andreas_ess\n",
+ "europersons\n",
+ "expw\n",
+ "face_research_lab\n",
+ "face_scrub\n",
+ "face_tracer\n",
+ "facebook\n",
+ "facebook_100\n",
+ "faceplace\n",
+ "faces94\n",
+ "faces95\n",
+ "faces96\n",
+ "families_in_the_wild\n",
+ "fddb\n",
+ "fei\n",
+ "feret\n",
+ "ferplus\n",
+ "fia\n",
+ "fiw_300\n",
+ "florida_inmates\n",
+ "frav2d\n",
+ "frav3d\n",
+ "grimace\n",
+ "frgc\n",
+ "gallagher\n",
+ "gavab_db\n",
+ "geofaces\n",
+ "georgia_tech_face_database\n",
+ "gmu\n",
+ "google\n",
+ "graz\n",
+ "h3d\n",
+ "hda_plus\n",
+ "helen\n",
+ "hi4d_adsip\n",
+ "hid_equinox_infrared\n",
+ "hipsterwars\n",
+ "hollywood_headset\n",
+ "hrt_transgender\n",
+ "ifad\n",
+ "ifdb\n",
+ "iit_dehli_ear\n",
+ "ijb_a\n",
+ "ijb_b\n",
+ "ijb_c\n",
+ "ijb_s\n",
+ "ilids_mcts\n",
+ "ilids_vid_reid\n",
+ "images_of_groups\n",
+ "imdb_wiki\n",
+ "imdb_face\n",
+ "imfdb\n",
+ "imm_face\n",
+ "immediacy\n",
+ "imsitu\n",
+ "inria_person\n",
+ "iqiyi\n",
+ "jaffe\n",
+ "jiku_mobile\n",
+ "jpl_pose\n",
+ "karpathy_instagram\n",
+ "kdef\n",
+ "kin_face\n",
+ "kinectface\n",
+ "kitti\n",
+ "lag\n",
+ "large_scale_person_search\n",
+ "leeds_sports_pose\n",
+ "leeds_sports_pose_extended\n",
+ "lfw\n",
+ "lfw_a\n",
+ "lfw_p\n",
+ "m2vts\n",
+ "m2vtsdb_extended\n",
+ "mafl\n",
+ "malf\n",
+ "mapillary\n",
+ "market_1501\n",
+ "market1203\n",
+ "mars\n",
+ "mcgill\n",
+ "meds\n",
+ "megaage\n",
+ "megaface\n",
+ "mifs\n",
+ "mikki\n",
+ "mit_cbcl\n",
+ "mit_cbcl_ped\n",
+ "mit_cbclss\n",
+ "miw\n",
+ "mmi_facial_expression\n",
+ "moments_in_time\n",
+ "morph\n",
+ "morph_nc\n",
+ "mot\n",
+ "mpi_large\n",
+ "mpi_small\n",
+ "mpii_gaze\n",
+ "mpii_human_pose\n",
+ "mr2\n",
+ "mrp_drone\n",
+ "msceleb\n",
+ "msmt_17\n",
+ "muct\n",
+ "mug_faces\n",
+ "multi_pie\n",
+ "mtfl\n",
+ "names_and_faces_news\n",
+ "nd_2006\n",
+ "nist_mid_mugshot\n",
+ "nova_emotions\n",
+ "nudedetection\n",
+ "orl\n",
+ "penn_fudan\n",
+ "peta\n",
+ "pets\n",
+ "pilot_parliament\n",
+ "pipa\n",
+ "pku\n",
+ "pku_reid\n",
+ "pornodb\n",
+ "precarious\n",
+ "prid\n",
+ "prw\n",
+ "psu\n",
+ "pubfig\n",
+ "pubfig_83\n",
+ "put_face\n",
+ "qmul_grid\n",
+ "qmul_ilids\n",
+ "qmul_surv_face\n",
+ "rafd\n",
+ "raid\n",
+ "rap_pedestrian\n",
+ "reseed\n",
+ "saivt\n",
+ "samm\n",
+ "sarc3d\n",
+ "scface\n",
+ "scut_fbp\n",
+ "scut_head\n",
+ "sdu_vid\n",
+ "urban_sed\n",
+ "sheffield\n",
+ "shinpuhkan_2014\n",
+ "social_relation\n",
+ "soton\n",
+ "sports_videos_in_the_wild\n",
+ "stair_actions\n",
+ "stanford_drone\n",
+ "stickmen_buffy\n",
+ "stickmen_family\n",
+ "stickmen_pascal\n",
+ "stirling_esrc_3s\n",
+ "sun_attributes\n",
+ "svs\n",
+ "texas_3dfrd\n",
+ "tiny_faces\n",
+ "tiny_images\n",
+ "towncenter\n",
+ "tud_brussels\n",
+ "tud_campus\n",
+ "tud_crossing\n",
+ "tud_motionpairs\n",
+ "tud_multiview\n",
+ "tud_pedestrian\n",
+ "tud_stadtmitte\n",
+ "tvhi\n",
+ "twinsburg_twins\n",
+ "uccs\n",
+ "ucf_101\n",
+ "ucf_crowd\n",
+ "ucf_selfie\n",
+ "ufdd\n",
+ "umb\n",
+ "umd_faces\n",
+ "unbc_shoulder_pain\n",
+ "urban_tribes\n",
+ "used\n",
+ "utk_face\n",
+ "v47\n",
+ "vadana\n",
+ "vgg_celebs_in_places\n",
+ "vgg_faces\n",
+ "vgg_faces2\n",
+ "violent_flows\n",
+ "viper\n",
+ "visual_phrases\n",
+ "vmu\n",
+ "voc\n",
+ "vqa\n",
+ "ward\n",
+ "who_goes_there\n",
+ "wider\n",
+ "wider_face\n",
+ "wider_attribute\n",
+ "wildtrack\n",
+ "yale_faces\n",
+ "yale_faces_b\n",
+ "yale_faces_b_ext\n",
+ "yawdd\n",
+ "yfcc_100m\n",
+ "york_3d\n",
+ "youtube_faces\n",
+ "youtube_makeup\n",
+ "youtube_poses\n",
+ "wlfdb\n",
+ "sal\n",
+ "semaine\n",
+ "belfast_naturalistic\n",
+ "belfast_induced\n",
+ "vam_faces\n",
+ "manhob_hci\n",
+ "deap\n",
+ "amfed\n",
+ "recola\n",
+ "avec_13\n",
+ "avec_14\n",
+ "mimicry\n",
+ "meissner\n",
+ "nottingham_scans\n",
+ "nottingham_orig\n",
+ "stirling_pain\n",
+ "utrecht_ecvp\n",
+ "mooney\n",
+ "\n",
+ "vcr\n",
+ "hufrd_pilgrims\n",
+ "vidtimit\n",
+ "casme\n",
+ "face_place\n",
+ "flickr_faces\n",
+ "hku_face\n",
+ "ibm_dif\n",
+ "gfw\n",
+ "kasparov_rgbd\n",
+ "vap_rgbd\n",
+ "aau_thermal_sports\n",
+ "aau_thermal_soccer\n",
+ "wdref\n",
+ "pic\n",
+ "ucfi\n",
+ "ldhf\n",
+ "nfrad\n",
+ "specface\n"
+ ]
+ }
+ ],
+ "source": [
+ "# List every lookup key, one per line\n",
+ "for dataset_key in sheet:\n",
+ "  print(dataset_key)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}