summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/helen
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
committeradamhrv <adam@ahprojects.com>2019-06-03 03:33:06 +0200
commit1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/helen
parente5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
add msc working utils
Diffstat (limited to 'megapixels/notebooks/datasets/helen')
-rw-r--r--megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb104
1 files changed, 104 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
index 311d3462..140b6361 100644
--- a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb
@@ -40,6 +40,110 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create filepaths CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_flickr_meta = '/data_store_hdd/datasets/people/helen/research/helen_flickr_api_dump.csv'\n",
+ "fp_photo_ids = '/data_store_hdd/datasets/people/helen/research/helen_flickr_photo_ids.csv'\n",
+ "fp_filepaths = '/data_store_hdd/datasets/people/helen/research/helen_file_meta.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load the photo-id CSV and keep both the frame and a list of per-row dicts\n",
+ "df_photo_ids = pd.read_csv(fp_photo_ids)\n",
+ "photo_ids = df_photo_ids.to_dict(orient='records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# photo_id is read as text rather than parsed as a number\n",
+ "df_flickr_meta = pd.read_csv(fp_flickr_meta, dtype={'photo_id': str})\n",
+ "flickr_meta_records = df_flickr_meta.to_dict(orient='records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1854\n",
+ "2122\n"
+ ]
+ }
+ ],
+ "source": [
+ "# number of API-dump records, then number of photo-id rows (one per line)\n",
+ "print(len(flickr_meta_records), len(df_photo_ids), sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the file meta csv\n",
+ "# One row per Flickr API record: the image URL is assembled from the\n",
+ "# farm, server, photo id and secret stored in that photo's JSON file.\n",
+ "results = []\n",
+ "results_download = []  # not populated in this cell\n",
+ "for flickr_meta_record in flickr_meta_records:\n",
+ "    photo_id = str(flickr_meta_record['photo_id'])\n",
+ "    nsid = flickr_meta_record.get('nsid')\n",
+ "    fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n",
+ "    json_data = file_utils.load_json(fp_json)\n",
+ "    # fall back to an empty dict so a JSON file without a 'photo' entry\n",
+ "    # is handled by the completeness check below instead of raising\n",
+ "    photo_meta = json_data.get('photo') or {}\n",
+ "    farm = photo_meta.get('farm')\n",
+ "    server = photo_meta.get('server')\n",
+ "    secret = photo_meta.get('secret')\n",
+ "    if farm is None or server is None or secret is None:\n",
+ "        # a URL built from a missing field would contain the text 'None'\n",
+ "        print(f'skipping {photo_id}: incomplete photo metadata in {fp_json}')\n",
+ "        continue\n",
+ "    # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n",
+ "    url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n",
+ "    results.append({\n",
+ "        'nsid': nsid,\n",
+ "        'photo_id': photo_id,\n",
+ "        'url': url,\n",
+ "        'filepath': f'{photo_id}.jpg'\n",
+ "    })"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# turn the collected row dicts into a frame and write it without the index column\n",
+ "df_out = pd.DataFrame(results)\n",
+ "df_out.to_csv(fp_filepaths, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
"cell_type": "code",
"execution_count": 16,
"metadata": {},