{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Name List"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "import requests\n",
    "import json\n",
    "from urllib.parse import unquote\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels')\n",
    "from app.utils import api_utils, identity_utils\n",
    "from app.settings import app_cfg\n",
    "from app.settings import types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File locations used by this notebook.\n",
    "# fp_yfcc       : raw username list that is read below\n",
    "# fp_yfcc_clean : cleaned username list that is written below\n",
    "# fp_usernames  : defined here but not referenced by any later cell in this notebook\n",
    "fp_yfcc = '/data_store_hdd/datasets/people/yfcc100m/downloads/usernames.txt'\n",
    "fp_yfcc_clean = '/data_store_hdd/datasets/people/yfcc100m/downloads/usernames_clean.txt'\n",
    "fp_dir_research = '/data_store_hdd/datasets/people/ibm_dif/research/'\n",
    "fp_usernames = join(fp_dir_research, 'usernames_decoded.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert YFCC to Unique Name List"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every raw line into memory; each line is treated as one username below.\n",
    "# The encoding is explicit so the result does not depend on the machine's locale.\n",
    "# Assumes the raw file is ASCII/UTF-8 text -- TODO confirm.\n",
    "with open(fp_yfcc, 'r', encoding='utf-8') as fp:\n",
    "    usernames_raw = fp.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ea6b4e8d2a8145f2b7efb559575862b8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=100000000), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Normalise each raw line: trim surrounding whitespace, lowercase, replace\n",
    "# spaces with '+', then percent-decode. unquote() leaves '+' untouched, so\n",
    "# any '+' stays in the cleaned name.\n",
    "# NOTE(review): despite the section title, nothing is de-duplicated here --\n",
    "# `usernames` ends up with exactly one entry per input line.\n",
    "usernames = []\n",
    "for username in tqdm(usernames_raw):\n",
    "    un_clean = unquote(username.strip().lower().replace(' ', '+'))\n",
    "    usernames.append(un_clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# unquote() decodes percent-escapes as UTF-8 by default, so the cleaned names\n",
    "# may contain non-ASCII characters. Write explicitly as UTF-8 instead of relying\n",
    "# on the locale's default encoding, which can raise UnicodeEncodeError.\n",
    "with open(fp_yfcc_clean, 'w', encoding='utf-8') as fp:\n",
    "    for username in usernames:\n",
    "        fp.write(username + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}