diff options
| author | adamhrv <adam@ahprojects.com> | 2019-03-19 12:21:21 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-03-19 12:21:21 +0100 |
| commit | a16b3cc7f796a5abe6c8c79f22b178785e6971f5 (patch) | |
| tree | ec0a62f66479d7e7c7f0f03a5c2de79d87c51c3e /megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb | |
| parent | 70f79c37278d7c47bee29cdf091bde448aae9240 (diff) | |
nbs for data collection
Diffstat (limited to 'megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb | 624 |
1 files changed, 624 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb b/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb new file mode 100644 index 00000000..183c063b --- /dev/null +++ b/megapixels/notebooks/datasets/spreadsheets/pull_spreadsheet.ipynb @@ -0,0 +1,624 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pull Google Spreadsheet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import requests\n", + "import json\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "import difflib\n", + "import unidecode\n", + "\n", + "import slugify\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import api_utils, identity_utils\n", + "from app.settings import app_cfg\n", + "from app.settings import types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## To CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# add CASIA Webface\n", + "\n", + "import os\n", + "import click\n", + "import re\n", + "import os\n", + "import csv\n", + "import string\n", + "import codecs\n", + "import gspread\n", + "from os.path import join\n", + "from pathlib import Path\n", + "import simplejson as json\n", + "from oauth2client.service_account import ServiceAccountCredentials\n", + "\n", + "from app.settings import types\n", + "from app.settings import app_cfg" + ] + }, 
def fetch_spreadsheet(docid="1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc", fp_creds=None):
    """Open the Google Spreadsheet, which contains the individual worksheets.

    Args:
        docid: key of the Google Spreadsheet document to open. Defaults to
            the document this notebook was written against.
        fp_creds: path to the service-account JSON key file. When None, the
            key file under ``app_cfg.DIR_ROOT`` is used.

    Returns:
        The spreadsheet object returned by ``client.open_by_key(docid)``.
    """
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    if fp_creds is None:
        # Resolved at call time so the module can be imported without app_cfg.
        fp_creds = join(app_cfg.DIR_ROOT, 'scraper/.creds/Megapixels-ef28f91112a9.json')
    credentials = ServiceAccountCredentials.from_json_keyfile_name(fp_creds, scope)
    client = gspread.authorize(credentials)
    return client.open_by_key(docid)


def fetch_worksheet(name="institutions"):
    """Get a reference to a particular "worksheet" from the Google Spreadsheet.

    Args:
        name: title of the worksheet to look up.

    Returns:
        The result of ``spreadsheet.worksheet(name)``.
    """
    return fetch_spreadsheet().worksheet(name)


def fetch_google_sheet(name="institutions"):
    """Get all the values from a particular worksheet as a list of lists.

    Args:
        name: title of the worksheet to read.

    Returns:
        A ``(keys, lines)`` tuple:
        :keys - the first row of the document
        :lines - a list of lists with the rest of the rows
        An empty worksheet yields ``([], [])`` instead of raising IndexError.
    """
    rows = fetch_worksheet(name).get_all_values()
    if not rows:
        # No header row at all: nothing to split.
        return [], []
    return rows[0], rows[1:]


def _row_to_record(keys, row):
    """Map each header in ``keys`` to the cell at the same position in ``row``.

    Cells missing from a short row are filled with '' so that every record
    carries the full set of columns. When a header name repeats, the last
    column with that name wins.
    """
    n_cells = len(row)
    return {key: (row[index] if index < n_cells else '') for index, key in enumerate(keys)}


def fetch_google_sheet_objects(name):
    """Get all the values from a worksheet as a list of dictionaries.

    Each dictionary maps the header row's column names to that row's cells.

    Example (run in separate cells; requires network access and credentials):
        sheet_datasets = fetch_google_sheet_objects(name='datasets')
        sheet_stats = fetch_google_sheet_objects(name='statistics')

    Args:
        name: title of the worksheet to read.

    Returns:
        A list with one dict per data row, in sheet order.
    """
    keys, rows = fetch_google_sheet(name)
    return [_row_to_record(keys, row) for row in rows]


def fetch_google_lookup(name, item_key='key'):
    """Get all the values from a worksheet as a dictionary of dictionaries.

    Specify which field you want to use as the dictionary key.

    Args:
        name: title of the worksheet to read.
        item_key: column whose value becomes the key of each record.

    Returns:
        A dict mapping each row's ``item_key`` value to that row's record.
        When two rows share a value, the later row replaces the earlier one.

    Raises:
        KeyError: if the worksheet has data rows but no ``item_key`` column.
    """
    keys, rows = fetch_google_sheet(name)
    if rows and item_key not in keys:
        # Fail once with context instead of a bare KeyError from the first row.
        raise KeyError("column %r not found in worksheet %r" % (item_key, name))
    lookup = {}
    for row in rows:
        rec = _row_to_record(keys, row)
        lookup[rec[item_key]] = rec
    return lookup
Census name distribution we randomly sampled 25,000 first and last names and automatically downloaded from Google Image Search',\n", + " 'comment 2': '',\n", + " 'comment 3': '',\n", + " '': ''}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sheet_stats[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['', 'added_on', 'comments', 'faces', 'ft_share', 'key', 'name_full',\n", + " 'name_short', 'pdf_paper', 'relevance', 'subset_of', 'superset_of',\n", + " 'url', 'using'],\n", + " dtype='object')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_datasets.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "sheet = fetch_google_lookup(name=opt_spreadsheet)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key': 'lfw', 'name_short': 'LFW', 'using': 'Y', 'ft_share': '1', 'subset_of': '', 'superset_of': '', 'name_full': 'Labeled Faces in-the-Wild', 'url': 'http://vis-www.cs.umass.edu/lfw/', 'added_on': '', 'faces': '', 'pdf_paper': 'Y', 'comments': 'many references, only included first on from 2007', '': '', 'relevance': '10'}\n" + ] + } + ], + "source": [ + "print(sheet['lfw'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10k_US_adult_faces\n", + "3d_rma\n", + "3dddb_unconstrained\n", + "3dpes\n", + "4dfab\n", + "50_people_one_question\n", + "a_pascal_yahoo\n", + "aberdeen \n", + "adience\n", + "afad\n", + "afew_va\n", + "affectnet\n", + "aflw\n", + "afw\n", + "agedb\n", + "alert_airport\n", + "am_fed\n", + "apis\n", + "ar_facedb\n", + "awe_ears\n", + "b3d_ac\n", 
+ "bbc_pose\n", + "berkeley_pose\n", + "bfm\n", + "bio_id\n", + "bjut_3d\n", + "bosphorus\n", + "bp4d_plus\n", + "bp4d_spontanous\n", + "brainwash\n", + "bu_3dfe\n", + "buhmap_db\n", + "cafe\n", + "caltech_10k_web_faces\n", + "caltech_faces\n", + "caltech_pedestrians\n", + "camel\n", + "cas_peal\n", + "casablanca\n", + "casia_webface\n", + "caviar4reid\n", + "celeba\n", + "celeba_plus\n", + "cfd\n", + "chalearn\n", + "chokepoint\n", + "cityscapes\n", + "clothing_co_parsing\n", + "cmdp\n", + "cmu_pie\n", + "coco\n", + "coco_action\n", + "coco_qa\n", + "cofw\n", + "cohn_kanade\n", + "cohn_kanade_plus\n", + "columbia_gaze\n", + "complex_activities\n", + "cuhk01\n", + "cuhk02\n", + "cuhk03\n", + "cvc_01_barcelona\n", + "czech_news_agency\n", + "d3dfacs\n", + "dartmouth_children\n", + "data_61\n", + "deep_fashion\n", + "disfa\n", + "distance_nighttime\n", + "duke_mtmc\n", + "emotio_net\n", + "eth_andreas_ess\n", + "europersons\n", + "expw\n", + "face_research_lab\n", + "face_scrub\n", + "face_tracer\n", + "facebook\n", + "facebook_100\n", + "faceplace\n", + "faces94\n", + "faces95\n", + "faces96\n", + "families_in_the_wild\n", + "fddb\n", + "fei\n", + "feret\n", + "ferplus\n", + "fia\n", + "fiw_300\n", + "florida_inmates\n", + "frav2d\n", + "frav3d\n", + "grimace\n", + "frgc\n", + "gallagher\n", + "gavab_db\n", + "geofaces\n", + "georgia_tech_face_database\n", + "gmu\n", + "google\n", + "graz\n", + "h3d\n", + "hda_plus\n", + "helen\n", + "hi4d_adsip\n", + "hid_equinox_infrared\n", + "hipsterwars\n", + "hollywood_headset\n", + "hrt_transgender\n", + "ifad\n", + "ifdb\n", + "iit_dehli_ear\n", + "ijb_a\n", + "ijb_b\n", + "ijb_c\n", + "ijb_s\n", + "ilids_mcts\n", + "ilids_vid_reid\n", + "images_of_groups\n", + "imdb_wiki\n", + "imdb_face\n", + "imfdb\n", + "imm_face\n", + "immediacy\n", + "imsitu\n", + "inria_person\n", + "iqiyi\n", + "jaffe\n", + "jiku_mobile\n", + "jpl_pose\n", + "karpathy_instagram\n", + "kdef\n", + "kin_face\n", + "kinectface\n", + "kitti\n", + "lag\n", 
+ "large_scale_person_search\n", + "leeds_sports_pose\n", + "leeds_sports_pose_extended\n", + "lfw\n", + "lfw_a\n", + "lfw_p\n", + "m2vts\n", + "m2vtsdb_extended\n", + "mafl\n", + "malf\n", + "mapillary\n", + "market_1501\n", + "market1203\n", + "mars\n", + "mcgill\n", + "meds\n", + "megaage\n", + "megaface\n", + "mifs\n", + "mikki\n", + "mit_cbcl\n", + "mit_cbcl_ped\n", + "mit_cbclss\n", + "miw\n", + "mmi_facial_expression\n", + "moments_in_time\n", + "morph\n", + "morph_nc\n", + "mot\n", + "mpi_large\n", + "mpi_small\n", + "mpii_gaze\n", + "mpii_human_pose\n", + "mr2\n", + "mrp_drone\n", + "msceleb\n", + "msmt_17\n", + "muct\n", + "mug_faces\n", + "multi_pie\n", + "mtfl\n", + "names_and_faces_news\n", + "nd_2006\n", + "nist_mid_mugshot\n", + "nova_emotions\n", + "nudedetection\n", + "orl\n", + "penn_fudan\n", + "peta\n", + "pets\n", + "pilot_parliament\n", + "pipa\n", + "pku\n", + "pku_reid\n", + "pornodb\n", + "precarious\n", + "prid\n", + "prw\n", + "psu\n", + "pubfig\n", + "pubfig_83\n", + "put_face\n", + "qmul_grid\n", + "qmul_ilids\n", + "qmul_surv_face\n", + "rafd\n", + "raid\n", + "rap_pedestrian\n", + "reseed\n", + "saivt\n", + "samm\n", + "sarc3d\n", + "scface\n", + "scut_fbp\n", + "scut_head\n", + "sdu_vid\n", + "urban_sed\n", + "sheffield\n", + "shinpuhkan_2014\n", + "social_relation\n", + "soton\n", + "sports_videos_in_the_wild\n", + "stair_actions\n", + "stanford_drone\n", + "stickmen_buffy\n", + "stickmen_family\n", + "stickmen_pascal\n", + "stirling_esrc_3s\n", + "sun_attributes\n", + "svs\n", + "texas_3dfrd\n", + "tiny_faces\n", + "tiny_images\n", + "towncenter\n", + "tud_brussels\n", + "tud_campus\n", + "tud_crossing\n", + "tud_motionpairs\n", + "tud_multiview\n", + "tud_pedestrian\n", + "tud_stadtmitte\n", + "tvhi\n", + "twinsburg_twins\n", + "uccs\n", + "ucf_101\n", + "ucf_crowd\n", + "ucf_selfie\n", + "ufdd\n", + "umb\n", + "umd_faces\n", + "unbc_shoulder_pain\n", + "urban_tribes\n", + "used\n", + "utk_face\n", + "v47\n", + "vadana\n", + 
"vgg_celebs_in_places\n", + "vgg_faces\n", + "vgg_faces2\n", + "violent_flows\n", + "viper\n", + "visual_phrases\n", + "vmu\n", + "voc\n", + "vqa\n", + "ward\n", + "who_goes_there\n", + "wider\n", + "wider_face\n", + "wider_attribute\n", + "wildtrack\n", + "yale_faces\n", + "yale_faces_b\n", + "yale_faces_b_ext\n", + "yawdd\n", + "yfcc_100m\n", + "york_3d\n", + "youtube_faces\n", + "youtube_makeup\n", + "youtube_poses\n", + "wlfdb\n", + "sal\n", + "semaine\n", + "belfast_naturalistic\n", + "belfast_induced\n", + "vam_faces\n", + "manhob_hci\n", + "deap\n", + "amfed\n", + "recola\n", + "avec_13\n", + "avec_14\n", + "mimicry\n", + "meissner\n", + "nottingham_scans\n", + "nottingham_orig\n", + "stirling_pain\n", + "utrecht_ecvp\n", + "mooney\n", + "\n", + "vcr\n", + "hufrd_pilgrims\n", + "vidtimit\n", + "casme\n", + "face_place\n", + "flickr_faces\n", + "hku_face\n", + "ibm_dif\n", + "gfw\n", + "kasparov_rgbd\n", + "vap_rgbd\n", + "aau_thermal_sports\n", + "aau_thermal_soccer\n", + "wdref\n", + "pic\n", + "ucfi\n", + "ldhf\n", + "nfrad\n", + "specface\n" + ] + } + ], + "source": [ + "for k, v in sheet.items():\n", + " print(k)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "megapixels", + "language": "python", + "name": "megapixels" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
