add msc working utils

author: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
committer: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
commit: 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree: 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/who_goes_there
parent: e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
1 files changed, 312 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
index c2ec5c84..66f803a4 100644
--- a/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
+++ b/megapixels/notebooks/datasets/who_goes_there/prepare_flickr_api.ipynb
@@ -37,6 +37,318 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create filepaths CSV for individual lookup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_flickr_meta = '/data_store/datasets/people/who_goes_there/research/wgt_flickr_queries.csv'  # input: loaded with pd.read_csv\n",
+    "fp_filepaths = '/data_store/datasets/people/who_goes_there/research/who_goes_there_filepaths.csv'  # output: written with DataFrame.to_csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_flickr_meta = pd.read_csv(fp_flickr_meta)  # read the CSV at fp_flickr_meta into a DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Help on function drop in module pandas.core.frame:\n",
+      "\n",
+      "drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')\n",
+      "    Drop specified labels from rows or columns.\n",
+      "    \n",
+      "    Remove rows or columns by specifying label names and corresponding\n",
+      "    axis, or by specifying directly index or column names. When using a\n",
+      "    multi-index, labels on different levels can be removed by specifying\n",
+      "    the level.\n",
+      "    \n",
+      "    Parameters\n",
+      "    ----------\n",
+      "    labels : single label or list-like\n",
+      "        Index or column labels to drop.\n",
+      "    axis : {0 or 'index', 1 or 'columns'}, default 0\n",
+      "        Whether to drop labels from the index (0 or 'index') or\n",
+      "        columns (1 or 'columns').\n",
+      "    index, columns : single label or list-like\n",
+      "        Alternative to specifying axis (``labels, axis=1``\n",
+      "        is equivalent to ``columns=labels``).\n",
+      "    \n",
+      "        .. versionadded:: 0.21.0\n",
+      "    level : int or level name, optional\n",
+      "        For MultiIndex, level from which the labels will be removed.\n",
+      "    inplace : bool, default False\n",
+      "        If True, do operation inplace and return None.\n",
+      "    errors : {'ignore', 'raise'}, default 'raise'\n",
+      "        If 'ignore', suppress error and only existing labels are\n",
+      "        dropped.\n",
+      "    \n",
+      "    Returns\n",
+      "    -------\n",
+      "    dropped : pandas.DataFrame\n",
+      "    \n",
+      "    Raises\n",
+      "    ------\n",
+      "    KeyError\n",
+      "        If none of the labels are found in the selected axis\n",
+      "    \n",
+      "    See Also\n",
+      "    --------\n",
+      "    DataFrame.loc : Label-location based indexer for selection by label.\n",
+      "    DataFrame.dropna : Return DataFrame with labels on given axis omitted\n",
+      "        where (all or any) data are missing.\n",
+      "    DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n",
+      "        removed, optionally only considering certain columns.\n",
+      "    Series.drop : Return Series with specified index labels removed.\n",
+      "    \n",
+      "    Examples\n",
+      "    --------\n",
+      "    >>> df = pd.DataFrame(np.arange(12).reshape(3,4),\n",
+      "    ...                   columns=['A', 'B', 'C', 'D'])\n",
+      "    >>> df\n",
+      "       A  B   C   D\n",
+      "    0  0  1   2   3\n",
+      "    1  4  5   6   7\n",
+      "    2  8  9  10  11\n",
+      "    \n",
+      "    Drop columns\n",
+      "    \n",
+      "    >>> df.drop(['B', 'C'], axis=1)\n",
+      "       A   D\n",
+      "    0  0   3\n",
+      "    1  4   7\n",
+      "    2  8  11\n",
+      "    \n",
+      "    >>> df.drop(columns=['B', 'C'])\n",
+      "       A   D\n",
+      "    0  0   3\n",
+      "    1  4   7\n",
+      "    2  8  11\n",
+      "    \n",
+      "    Drop a row by index\n",
+      "    \n",
+      "    >>> df.drop([0, 1])\n",
+      "       A  B   C   D\n",
+      "    2  8  9  10  11\n",
+      "    \n",
+      "    Drop columns and/or rows of MultiIndex DataFrame\n",
+      "    \n",
+      "    >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n",
+      "    ...                              ['speed', 'weight', 'length']],\n",
+      "    ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n",
+      "    ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n",
+      "    >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n",
+      "    ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n",
+      "    ...                         [250, 150], [1.5, 0.8], [320, 250],\n",
+      "    ...                         [1, 0.8], [0.3,0.2]])\n",
+      "    >>> df\n",
+      "                    big     small\n",
+      "    lama    speed   45.0    30.0\n",
+      "            weight  200.0   100.0\n",
+      "            length  1.5     1.0\n",
+      "    cow     speed   30.0    20.0\n",
+      "            weight  250.0   150.0\n",
+      "            length  1.5     0.8\n",
+      "    falcon  speed   320.0   250.0\n",
+      "            weight  1.0     0.8\n",
+      "            length  0.3     0.2\n",
+      "    \n",
+      "    >>> df.drop(index='cow', columns='small')\n",
+      "                    big\n",
+      "    lama    speed   45.0\n",
+      "            weight  200.0\n",
+      "            length  1.5\n",
+      "    falcon  speed   320.0\n",
+      "            weight  1.0\n",
+      "            length  0.3\n",
+      "    \n",
+      "    >>> df.drop(index='length', level=1)\n",
+      "                    big     small\n",
+      "    lama    speed   45.0    30.0\n",
+      "            weight  200.0   100.0\n",
+      "    cow     speed   30.0    20.0\n",
+      "            weight  250.0   150.0\n",
+      "    falcon  speed   320.0   250.0\n",
+      "            weight  1.0     0.8\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "help(pd.DataFrame.drop)  # NOTE(review): interactive docs lookup, produces no data - consider removing this cell and its output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['nickname', 'nsid', 'photo_id', 'url'], dtype='object')"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['nsid', 'photo_id', 'url'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_flickr_meta = df_flickr_meta.drop(columns=['nickname', 'subdir'], errors='ignore')  # errors='ignore' skips absent columns, so the cell is re-runnable\n",
+    "print(df_flickr_meta.keys())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add a 'filepath' column initialised to empty strings\n",
+    "df_flickr_meta['filepath'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_flickr_meta.to_csv(fp_filepaths, index=False)  # already a DataFrame, no from_dict wrap needed; index=False omits the row index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>nsid</th>\n",
+       "      <th>photo_id</th>\n",
+       "      <th>url</th>\n",
+       "      <th>filepath</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>51576145@N02</td>\n",
+       "      <td>4762068863</td>\n",
+       "      <td>http://farm5.staticflickr.com/4117/4762068863_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>29689383@N02</td>\n",
+       "      <td>5711730606</td>\n",
+       "      <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>29689383@N02</td>\n",
+       "      <td>5711730606</td>\n",
+       "      <td>http://farm3.staticflickr.com/2800/5711730606_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>27982139@N00</td>\n",
+       "      <td>2439203939</td>\n",
+       "      <td>http://farm3.staticflickr.com/2105/2439203939_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>27982139@N00</td>\n",
+       "      <td>2464402099</td>\n",
+       "      <td>http://farm4.staticflickr.com/3030/2464402099_...</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           nsid    photo_id  \\\n",
+       "0  51576145@N02  4762068863   \n",
+       "1  29689383@N02  5711730606   \n",
+       "2  29689383@N02  5711730606   \n",
+       "3  27982139@N00  2439203939   \n",
+       "4  27982139@N00  2464402099   \n",
+       "\n",
+       "                                                 url filepath  \n",
+       "0  http://farm5.staticflickr.com/4117/4762068863_...           \n",
+       "1  http://farm3.staticflickr.com/2800/5711730606_...           \n",
+       "2  http://farm3.staticflickr.com/2800/5711730606_...           \n",
+       "3  http://farm3.staticflickr.com/2105/2439203939_...           \n",
+       "4  http://farm4.staticflickr.com/3030/2464402099_...           "
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_flickr_meta.head()  # preview the first five rows"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 31,
    "metadata": {},
author	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
committer	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
commit	1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree	86c37309ff5bcb62716638562489ddb747c16159 /megapixels/notebooks/datasets/who_goes_there
parent	e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)