diff options
Diffstat (limited to 'megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb index 311d3462..140b6361 100644 --- a/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/helen/prepare_flickr_api.ipynb @@ -40,6 +40,110 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create filepaths CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "fp_flickr_meta = '/data_store_hdd/datasets/people/helen/research/helen_flickr_api_dump.csv'\n", + "fp_photo_ids = '/data_store_hdd/datasets/people/helen/research/helen_flickr_photo_ids.csv'\n", + "fp_filepaths = '/data_store_hdd/datasets/people/helen/research/helen_file_meta.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "df_photo_ids = pd.read_csv(fp_photo_ids)\n", + "photo_ids = df_photo_ids.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_flickr_meta, dtype={'photo_id': str})\n", + "flickr_meta_records = df_flickr_meta.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1854\n", + "2122\n" + ] + } + ], + "source": [ + "print(len(flickr_meta_records))\n", + "print(len(df_photo_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "# create the file meta csv\n", + "results = []\n", + "results_download = []\n", + "for flickr_meta_record in flickr_meta_records:\n", + " # farm, server, photo id, secret\n", + " photo_id = str(flickr_meta_record['photo_id'])\n", + " nsid = flickr_meta_record.get('nsid')\n", + " fp_json = join(fp_dir_flickr_meta, f'{photo_id}.json')\n", + " json_data = file_utils.load_json(fp_json)\n", + " photo_meta = json_data.get('photo')\n", + " farm = photo_meta.get('farm')\n", + " server = photo_meta.get('server')\n", + " secret = photo_meta.get('secret')\n", + " # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg\n", + " url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'\n", + " obj = {\n", + " 'nsid': nsid,\n", + " 'photo_id': photo_id,\n", + " 'url': url,\n", + " 'filepath': f'{photo_id}.jpg'\n", + " }\n", + " results.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = pd.DataFrame.from_dict(results)\n", + "df_out.to_csv(fp_filepaths, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { "cell_type": "code", "execution_count": 16, "metadata": {}, |
