summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb')
-rw-r--r--megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb312
1 files changed, 312 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
index c2ec5c84..66f803a4 100644
--- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
@@ -37,6 +37,318 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create filepaths CSV for individual lookup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Shared research directory for the who_goes_there dataset\n",
+ "dir_wgt_research = '/data_store/datasets/people/who_goes_there/research'\n",
+ "# Input: Flickr query metadata CSV; output: filepaths lookup CSV\n",
+ "fp_flickr_meta = dir_wgt_research + '/wgt_flickr_queries.csv'\n",
+ "fp_filepaths = dir_wgt_research + '/who_goes_there_filepaths.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Flickr query metadata into a DataFrame\n",
+ "df_flickr_meta = pd.read_csv(filepath_or_buffer=fp_flickr_meta)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The metadata CSV has more columns than the lookup file needs; the cells below drop the extras with `DataFrame.drop`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_flickr_meta.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['nsid', 'photo_id', 'url'], dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Keep only the columns needed for the lookup file. errors='ignore' makes this cell\n",
+ "# safe to re-run and safe on a fresh kernel, where 'subdir' is not present in the frame.\n",
+ "df_flickr_meta = df_flickr_meta.drop(columns=['nickname', 'subdir'], errors='ignore')\n",
+ "print(df_flickr_meta.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add an empty 'filepath' placeholder column\n",
+ "df_flickr_meta = df_flickr_meta.assign(filepath='')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df_flickr_meta is already a DataFrame, so write it directly (without the index column)\n",
+ "df_flickr_meta.to_csv(fp_filepaths, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>nsid</th>\n",
+ " <th>photo_id</th>\n",
+ " <th>url</th>\n",
+ " <th>filepath</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>51576145@N02</td>\n",
+ " <td>4762068863</td>\n",
+ " <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>29689383@N02</td>\n",
+ " <td>5711730606</td>\n",
+ " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>29689383@N02</td>\n",
+ " <td>5711730606</td>\n",
+ " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>27982139@N00</td>\n",
+ " <td>2439203939</td>\n",
+ " <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>27982139@N00</td>\n",
+ " <td>2464402099</td>\n",
+ " <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " nsid photo_id \\\n",
+ "0 51576145@N02 4762068863 \n",
+ "1 29689383@N02 5711730606 \n",
+ "2 29689383@N02 5711730606 \n",
+ "3 27982139@N00 2439203939 \n",
+ "4 27982139@N00 2464402099 \n",
+ "\n",
+ " url filepath \n",
+ "0 http://farm5.staticflickr.com/4117/4762068863_... \n",
+ "1 http://farm3.staticflickr.com/2800/5711730606_... \n",
+ "2 http://farm3.staticflickr.com/2800/5711730606_... \n",
+ "3 http://farm3.staticflickr.com/2105/2439203939_... \n",
+ "4 http://farm4.staticflickr.com/3030/2464402099_... "
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five rows, including the 'filepath' column\n",
+ "df_flickr_meta.head(n=5)"
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": 31,
"metadata": {},