diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/who_goes_there | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
Diffstat (limited to 'megapixels/notebooks/datasets/who_goes_there')
| -rw-r--r-- | megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb | 312 |
1 files changed, 312 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb index c2ec5c84..66f803a4 100644 --- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb +++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb @@ -37,6 +37,318 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create filepaths CSV for individual lookup" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'\n", + "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "df_flickr_meta = pd.read_csv(fp_flickr_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function drop in module pandas.core.frame:\n", + "\n", + "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n", + " Drop specified labels from rows or columns.\n", + " \n", + " Remove rows or columns by specifying label names and corresponding\n", + " axis, or by specifying directly index or column names. When using a\n", + " multi-index, labels on different levels can be removed by specifying\n", + " the level.\n", + " \n", + " Parameters\n", + " ----------\n", + " labels : single label or list-like\n", + " Index or column labels to drop.\n", + " axis : {0 or 'index', 1 or 'columns'}, default 0\n", + " Whether to drop labels from the index (0 or 'index') or\n", + " columns (1 or 'columns').\n", + " index, columns : single label or list-like\n", + " Alternative to specifying axis (``labels, axis=1``\n", + " is equivalent to ``columns=labels``).\n", + " \n", + " .. versionadded:: 0.21.0\n", + " level : int or level name, optional\n", + " For MultiIndex, level from which the labels will be removed.\n", + " inplace : bool, default False\n", + " If True, do operation inplace and return None.\n", + " errors : {'ignore', 'raise'}, default 'raise'\n", + " If 'ignore', suppress error and only existing labels are\n", + " dropped.\n", + " \n", + " Returns\n", + " -------\n", + " dropped : pandas.DataFrame\n", + " \n", + " Raises\n", + " ------\n", + " KeyError\n", + " If none of the labels are found in the selected axis\n", + " \n", + " See Also\n", + " --------\n", + " DataFrame.loc : Label-location based indexer for selection by label.\n", + " DataFrame.dropna : Return DataFrame with labels on given axis omitted\n", + " where (all or any) data are missing.\n", + " DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n", + " removed, optionally only considering certain columns.\n", + " Series.drop : Return Series with specified index labels removed.\n", + " \n", + " Examples\n", + " --------\n", + " >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n", + " ... columns=['A', 'B', 'C', 'D'])\n", + " >>> df\n", + " A B C D\n", + " 0 0 1 2 3\n", + " 1 4 5 6 7\n", + " 2 8 9 10 11\n", + " \n", + " Drop columns\n", + " \n", + " >>> df.drop(['B', 'C'], axis=1)\n", + " A D\n", + " 0 0 3\n", + " 1 4 7\n", + " 2 8 11\n", + " \n", + " >>> df.drop(columns=['B', 'C'])\n", + " A D\n", + " 0 0 3\n", + " 1 4 7\n", + " 2 8 11\n", + " \n", + " Drop a row by index\n", + " \n", + " >>> df.drop([0, 1])\n", + " A B C D\n", + " 2 8 9 10 11\n", + " \n", + " Drop columns and/or rows of MultiIndex DataFrame\n", + " \n", + " >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n", + " ... ['speed', 'weight', 'length']],\n", + " ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n", + " ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n", + " >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n", + " ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n", + " ... [250, 150], [1.5, 0.8], [320, 250],\n", + " ... [1, 0.8], [0.3,0.2]])\n", + " >>> df\n", + " big small\n", + " lama speed 45.0 30.0\n", + " weight 200.0 100.0\n", + " length 1.5 1.0\n", + " cow speed 30.0 20.0\n", + " weight 250.0 150.0\n", + " length 1.5 0.8\n", + " falcon speed 320.0 250.0\n", + " weight 1.0 0.8\n", + " length 0.3 0.2\n", + " \n", + " >>> df.drop(index='cow', columns='small')\n", + " big\n", + " lama speed 45.0\n", + " weight 200.0\n", + " length 1.5\n", + " falcon speed 320.0\n", + " weight 1.0\n", + " length 0.3\n", + " \n", + " >>> df.drop(index='length', level=1)\n", + " big small\n", + " lama speed 45.0 30.0\n", + " weight 200.0 100.0\n", + " cow speed 30.0 20.0\n", + " weight 250.0 150.0\n", + " falcon speed 320.0 250.0\n", + " weight 1.0 0.8\n", + "\n" + ] + } + ], + "source": [ + "help(pd.DataFrame.drop)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['nsid', 'photo_id', 'url'], dtype='object')\n" + ] + } + ], + "source": [ + "df_flickr_meta.drop(labels=['subdir'],axis=1, inplace=True)\n", + "print(df_flickr_meta.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "#df_flickr_meta['subdir'] = ''\n", + "df_flickr_meta['filepath'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame.from_dict(df_flickr_meta).to_csv(fp_filepaths, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>nsid</th>\n", + " <th>photo_id</th>\n", + " <th>url</th>\n", + " <th>filepath</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>51576145@N02</td>\n", + " <td>4762068863</td>\n", + " <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29689383@N02</td>\n", + " <td>5711730606</td>\n", + " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29689383@N02</td>\n", + " <td>5711730606</td>\n", + " <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>27982139@N00</td>\n", + " <td>2439203939</td>\n", + " <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n", + " <td></td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>27982139@N00</td>\n", + " <td>2464402099</td>\n", + " <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n", + " <td></td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " nsid photo_id \\\n", + "0 51576145@N02 4762068863 \n", + "1 29689383@N02 5711730606 \n", + "2 29689383@N02 5711730606 \n", + "3 27982139@N00 2439203939 \n", + "4 27982139@N00 2464402099 \n", + "\n", + " url filepath \n", + "0 http://farm5.staticflickr.com/4117/4762068863_... \n", + "1 http://farm3.staticflickr.com/2800/5711730606_... \n", + "2 http://farm3.staticflickr.com/2800/5711730606_... \n", + "3 http://farm3.staticflickr.com/2105/2439203939_... \n", + "4 http://farm4.staticflickr.com/3030/2464402099_... " + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_flickr_meta.head()" + ] + }, + { "cell_type": "code", "execution_count": 31, "metadata": {}, |
