{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# VGG Face (V1): Prepare Flickr API data\n",
    "\n",
    "Extracts Flickr photo IDs from the VGG Face annotation files, then converts the Flickr API results into a per-photo filepath list and per-`nsid` photo counts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob, iglob\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "import pandas as pd\n",
    "import sys\n",
    "# make the project-local `app` package importable\n",
    "sys.path.append('/work/megapixels_dev/megapixels/')\n",
    "from app.utils import file_utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert annotation files to list of photo IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_dir_annos = '/data_store/datasets/people/vgg_face/downloads/vgg_face_dataset/files/'\n",
    "fp_photo_ids = '/data_store/datasets/people/vgg_face/research/photo_ids.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b92b24eac4c84f2f96e32f6eba8d2dc0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=2622), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# collect every image URL, plus the photo ID of each URL containing 'flickr.com'\n",
    "photo_ids = []\n",
    "all_photos = []\n",
    "fp_annos = glob(join(fp_dir_annos, '*.txt'))\n",
    "for fp_anno in tqdm(fp_annos):\n",
    "    df_annos = pd.read_csv(fp_anno, delimiter=' ', names=['url', 'a', 'b', 'c', 'd', 'e', 'f', 'g'])\n",
    "    # only the url column is needed, so iterate it directly\n",
    "    for url in df_annos['url']:\n",
    "        all_photos.append(url)\n",
    "        if 'flickr.com' in url:\n",
    "            # photo ID is the filename stem up to the first underscore\n",
    "            photo_id = Path(url).stem.split('_')[0]\n",
    "            photo_ids.append({'photo_id': photo_id})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2604849\n"
     ]
    }
   ],
   "source": [
    "print(len(all_photos))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PosixPath('/data_store/datasets/people/vgg_face/research/photo_ids.csv')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_utils.ensure_posixpath(fp_photo_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame.from_dict(photo_ids).to_csv(fp_photo_ids, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert Flickr API data to filepaths and counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_in_flickr_api = '/data_store_hdd/datasets/people/vgg_face/research/vgg_flickr_api_photo_ids.csv'\n",
    "fp_out_filepaths = '/data_store_hdd/datasets/people/vgg_face/research/vgg_filepaths.csv'\n",
    "fp_out_counts = '/data_store_hdd/datasets/people/vgg_face/research/vgg_counts.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_flickr = pd.read_csv(fp_in_flickr_api)\n",
    "\n",
    "# write filepaths: one row per API record, filepath is '<photo_id>.jpg'\n",
    "filepaths = []\n",
    "for record in df_flickr.to_dict('records'):\n",
    "    photo_id = record['photo_id']\n",
    "    filepaths.append({\n",
    "        'filepath': f'{photo_id}.jpg',\n",
    "        'nsid': record['nsid'],\n",
    "        'photo_id': photo_id,\n",
    "        'url': record['url']\n",
    "    })\n",
    "\n",
    "pd.DataFrame.from_dict(filepaths).to_csv(fp_out_filepaths, index=False)\n",
    "\n",
    "# write counts: number of API records per nsid\n",
    "counts = []\n",
    "for nsid, group in df_flickr.groupby('nsid'):\n",
    "    counts.append({'nsid': nsid, 'count': len(group)})\n",
    "\n",
    "pd.DataFrame.from_dict(counts).to_csv(fp_out_counts, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_embassies = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
    "df_embassies = pd.read_csv(fp_embassies)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_match = df_embassies[df_embassies['nsid'] == '50747072@N03']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " bureau country nsid path_alias type \\\n",
      "0 EUR Russia 50747072@N03 otkroyameriku Consulate \n",
      "\n",
      " url username \\\n",
      "0 http://www.flickr.com/photos/otkroyameriku Генконсульство США в СПб \n",
      "\n",
      " verified notes \n",
      "0 NaN NaN 1\n"
     ]
    }
   ],
   "source": [
    "print(df_match, len(df_match))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# take the first matching row so the next cell can read its nsid\n",
    "match = df_match.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'50747072@N03'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match.nsid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}