summary refs log tree commit diff
path: root/megapixels/notebooks/datasets/megaface
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/megaface')
-rw-r--r-- megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb (renamed from megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb) 113
1 files changed, 111 insertions, 2 deletions
diff --git a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
index 48133228..3c0dd631 100644
--- a/megapixels/notebooks/datasets/megaface/megaface_prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/megaface/prepare_flickr_api.ipynb
@@ -4,12 +4,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Prepare Flickr API Batch CSV"
+ "# MegaFace: Prepare Flickr API Batch CSV"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -29,6 +29,115 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "## Create the file meta CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_in_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file.csv'\n",
+ "fp_out_meta_files = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_file_ext.csv'\n",
+ "fp_out_meta_flickr = '/data_store_hdd/datasets/people/megaface/research/megaface_meta_flickr_02.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the per-file metadata; the 'subdir' column is exposed under the name 'filepath'\n",
+ "df_files = pd.read_csv(fp_in_meta_files).rename(columns={'subdir': 'filepath'})\n",
+ "file_records = df_files.to_dict(orient='records')\n",
+ "photo_ids = [record['photo_id'] for record in file_records]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d91329c27b8b4fc4ae68eb817ea82e19",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=4753520), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Set each record's filepath to '<id>.jpg', where <id> is the URL stem up to its first '_'\n",
+ "for file_record in tqdm(file_records):\n",
+ "  url_stem = Path(file_record['url']).stem\n",
+ "  photo_id = url_stem.partition('_')[0]\n",
+ "  filepath = file_record['filepath'] = f'{photo_id}.jpg'\n",
+ "\n",
+ "df_meta_file = pd.DataFrame(file_records).drop_duplicates()\n",
+ "df_meta_file.to_csv(fp_out_meta_files, index=False)\n",
+ "print(f'Wrote {len(df_meta_file)} lines')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create the NSID/count CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total users: 48,382\n",
+ "Total images: 3,311,471\n"
+ ]
+ }
+ ],
+ "source": [
+ "# One row per NSID with its image count; size() avoids a Python loop over every group\n",
+ "nsid_groups = df_meta_file.groupby('nsid')\n",
+ "df_meta_flickr = nsid_groups.size().reset_index(name='count')\n",
+ "results = df_meta_flickr.to_dict('records')  # list of {'nsid', 'count'} dicts, one per user\n",
+ "df_meta_flickr.to_csv(fp_out_meta_flickr, index=False)\n",
+ "\n",
+ "# The frame always carries a 'count' column, so the totals also work when there are no rows\n",
+ "print(f'Total users: {len(df_meta_flickr):,}')\n",
+ "print(f'Total images: {df_meta_flickr[\"count\"].sum():,}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Create CSV for API"
]
},