summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
committeradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
commit1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
parente5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
add msc working utils
Diffstat (limited to 'megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb')
-rw-r--r--megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb287
1 files changed, 287 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
new file mode 100644
index 00000000..99bbe32e
--- /dev/null
+++ b/megapixels/notebooks/datasets/vgg_face1/flickr_meta.ipynb
@@ -0,0 +1,287 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# VGG Face (V1) Prepare Flickr API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob, iglob\n",
+ "from pathlib import Path\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "\n",
+ "import pandas as pd\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Convert annotation files to list of photo IDs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_dir_annos = '/data_store/datasets/people/vgg_face/downloads/vgg_face_dataset/files/'\n",
+ "fp_photo_ids = '/data_store/datasets/people/vgg_face/research/photo_ids.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b92b24eac4c84f2f96e32f6eba8d2dc0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=2622), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "photo_ids = []\n",
+ "all_photos = []\n",
+ "fp_annos = glob(join(fp_dir_annos, '*.txt'))\n",
+ "for fp_anno in tqdm(fp_annos):\n",
+ "    # space-delimited annotation file; only the 'url' column is used below\n",
+ "    df_annos = pd.read_csv(fp_anno, delimiter=' ', names=['url', 'a', 'b', 'c', 'd', 'e', 'f', 'g'])\n",
+ "    for url in df_annos['url']:\n",
+ "        all_photos.append(url)  # every URL is kept so the total can be counted later\n",
+ "        if 'flickr.com' in url:\n",
+ "            # photo id = filename stem up to the first underscore\n",
+ "            photo_id = Path(url).stem.split('_')[0]\n",
+ "            photo_ids.append({'photo_id': photo_id})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2604849\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(all_photos))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/data_store/datasets/people/vgg_face/research/photo_ids.csv')"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "file_utils.ensure_posixpath(fp_photo_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.DataFrame.from_dict(photo_ids).to_csv(fp_photo_ids, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Convert Flickr API data to filepaths and counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_in_flickr_api = '/data_store_hdd/datasets/people/vgg_face/research/vgg_flickr_api_photo_ids.csv'\n",
+ "fp_out_filepaths = '/data_store_hdd/datasets/people/vgg_face/research/vgg_filepaths.csv'\n",
+ "fp_out_counts = '/data_store_hdd/datasets/people/vgg_face/research/vgg_counts.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fp_in_flickr_api)\n",
+ "records = df.to_dict('records')\n",
+ "\n",
+ "# Filepaths table: one row per input record; the image file is named after its photo id\n",
+ "filepath_rows = []\n",
+ "for row in records:\n",
+ "    pid = row['photo_id']\n",
+ "    filepath_rows.append({\n",
+ "        'filepath': f'{pid}.jpg',\n",
+ "        'nsid': row['nsid'],\n",
+ "        'photo_id': pid,\n",
+ "        'url': row['url']\n",
+ "    })\n",
+ "\n",
+ "df_filepaths = pd.DataFrame.from_dict(filepath_rows)\n",
+ "df_filepaths.to_csv(fp_out_filepaths, index=False)\n",
+ "\n",
+ "# Counts table: number of input rows per nsid\n",
+ "count_rows = [\n",
+ "    {'nsid': nsid, 'count': len(group)}\n",
+ "    for nsid, group in df.groupby('nsid')\n",
+ "]\n",
+ "df_counts = pd.DataFrame.from_dict(count_rows)\n",
+ "df_counts.to_csv(fp_out_counts, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'\n",
+ "df = pd.read_csv(fp)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_match = df[df['nsid'] == '50747072@N03']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " bureau country nsid path_alias type \\\n",
+ "0 EUR Russia 50747072@N03 otkroyameriku Consulate \n",
+ "\n",
+ " url username \\\n",
+ "0 http://www.flickr.com/photos/otkroyameriku Генконсульство США в СПб \n",
+ "\n",
+ " verified notes \n",
+ "0 NaN NaN 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df_match, len(df_match))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'50747072@N03'"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_match.iloc[0].nsid"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}