diff options
Diffstat (limited to 'megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb | 166 |
1 files changed, 131 insertions, 35 deletions
diff --git a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb index a35c3b24..8d3b4251 100644 --- a/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/adience/prepare_flickr_api.ipynb @@ -29,41 +29,145 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create CSV for API" + "## Cleanup filepaths CSV" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in_photo_ids = '/data_store_hdd/datasets/people/adience/research/adience_photo_ids.csv'\n", + "fp_in_flickr_api_dump = '/data_store_hdd/datasets/people/adience/research/adience_flickr_api_dump.csv'\n", + "fp_in_flickr_api_dump_photo_ids = '/data_store_hdd/datasets/people/adience/research/flickr_api_dump_photo_id.csv'\n", + "fp_out_filepaths = '/data_store_hdd/datasets/people/adience/research/adience_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 44, "metadata": {}, + "outputs": [], "source": [ - "| filepath | query | count |\n", - "|:---|:---|:---|\n", - "| 12234 | 12234@123| 10 |" + "# photo id list\n", + "df = pd.read_csv(fp_in_photo_ids)\n", + "records = df.to_dict('records')\n", + "\n", + "# photo id --> url list\n", + "df_api_urls = pd.read_csv(fp_in_flickr_api_dump_photo_ids)\n", + "api_urls = df_api_urls.to_dict('records')\n", + "\n", + "df_flickr_api_dump = pd.read_csv(fp_in_flickr_api_dump)\n", + "flickr_api_dump = df_flickr_api_dump.to_dict('records')\n", + "\n", + "# create lookup table for user info?\n", + "flickr_api_lookup = {}\n", + "for api_item in flickr_api_dump:\n", + " nsid = api_item['nsid']\n", + " flickr_api_lookup[nsid] = api_item\n", + " \n", + "# create lookup table for user info?\n", + "api_url_lookup = {}\n", + "for api_url_item in api_urls:\n", + " photo_id = api_url_item['photo_id']\n", + " api_url_lookup[photo_id] = api_url_item\n", + " \n", + "results = []\n", + "for record in records:\n", + " photo_id = record['photo_id']\n", + " if photo_id in api_url_lookup.keys():\n", + " api_item = api_url_lookup.get(photo_id)\n", + " url = api_item.get('url')\n", + " nsid = api_item.get('nsid')\n", + " obj = {\n", + " 'filepath': f'{photo_id}.jpg',\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url\n", + " }\n", + " results.append(obj)\n", + " \n", + "pd.DataFrame.from_dict(results).to_csv(fp_out_filepaths, index=False)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create the file meta csv\n", + "results = []\n", + "results_download = []\n", + "for flickr_meta_record in flickr_meta_records:\n", + " # farm, server, photo id, secret\n", + " photo_id = str(flickr_meta_record['photo_id'])\n", + " nsid = flickr_meta_record.get('nsid')\n", + " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n", + " json_data = file_utils.load_json(fp_json)\n", + " photo_meta = json_data.get('photo')\n", + " farm = photo_meta.get('farm')\n", + " server = photo_meta.get('server')\n", + " secret = photo_meta.get('secret')\n", + " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n", + " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n", + " obj = {\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url,\n", + " 'filepath': f'{photo_id}.jpg'\n", + " }\n", + " results.append(obj)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "fp_in_dir = '/data_store/datasets/people/adience/dataset/'\n", - "fp_out_queries = '/data_store/datasets/people/adience/research/adience_flickr_api_queries.csv'" + "## Create Photo ID list" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9\n" - ] - } - ], + "outputs": [], "source": [ "fp_files = glob(join(fp_in_dir, '*.txt'))\n", "print(len(fp_files))" @@ -71,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -123,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -132,24 +236,16 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10804\n" - ] - } - ], + "outputs": [], "source": [ "print(len(df_images))" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ |
