{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate sha256,username list\n",
    "\n",
    "Two independent stages:\n",
    "\n",
    "1. Hash every username in `usernames_clean.txt` with SHA-256 and write the de-duplicated `sha256,username` table to `username_sha.csv`.\n",
    "2. Read the `url` column of `ibm_dif_metadata.csv` and write each URL next to its file name to `ibm_dif_urls.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "import requests\n",
    "import json\n",
    "from pprint import pprint\n",
    "from multiprocessing.pool import ThreadPool\n",
    "import threading\n",
    "import urllib.request\n",
    "import hashlib\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from random import randint\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels')\n",
    "from app.utils import api_utils, identity_utils\n",
    "\n",
    "from app.settings import app_cfg\n",
    "from app.settings import types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every path gets its own name. Re-using fp_in / fp_out for both stages meant that\n",
    "# re-running a stage-1 cell after stage 2 would read or overwrite the wrong file.\n",
    "\n",
    "# Stage 1: usernames -> SHA-256 table\n",
    "fp_usernames_in = '/data_store_hdd/datasets/people/yfcc100m/downloads/usernames_clean.txt'\n",
    "# Not read anywhere in this notebook; presumably a smaller sample for quick runs (TODO confirm)\n",
    "fp_usernames_10k_in = '/data_store_hdd/datasets/people/yfcc100m/downloads/usernames_clean_10k.txt'\n",
    "fp_username_sha_out = '/data_store_hdd/datasets/people/ibm_dif/research/username_sha.csv'\n",
    "\n",
    "# Stage 2: download list\n",
    "fp_metadata_in = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_metadata.csv'\n",
    "fp_urls_out = '/data_store_hdd/datasets/people/ibm_dif/research/ibm_dif_urls.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert usernames to SHA256\n",
    "\n",
    "The username file is large: the progress bar stored in the previous version of this notebook ran to 100,000,000 items. The table is therefore built from two parallel columns rather than one dict per username, and the intermediate lists live only inside the functions below, so the cells can be re-run without `del` bookkeeping."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_usernames(fp_usernames):\n",
    "    \"\"\"Read one username per line from `fp_usernames`.\n",
    "\n",
    "    Surrounding whitespace (including the newline) is stripped from every line.\n",
    "    Lines that are empty after stripping are skipped so that an empty string is\n",
    "    never hashed as if it were a username.\n",
    "\n",
    "    Returns a list of str in file order.\n",
    "    \"\"\"\n",
    "    with open(fp_usernames, 'r') as fp:\n",
    "        stripped_lines = (line.strip() for line in fp)\n",
    "        return [name for name in stripped_lines if name]\n",
    "\n",
    "\n",
    "def sha256_hex(text):\n",
    "    \"\"\"Return the hexadecimal SHA-256 digest of `text` encoded as UTF-8.\"\"\"\n",
    "    return hashlib.sha256(text.encode('utf-8')).hexdigest()\n",
    "\n",
    "\n",
    "def build_username_sha_table(usernames):\n",
    "    \"\"\"Build the de-duplicated sha256/username table.\n",
    "\n",
    "    Parameters: usernames - list of str.\n",
    "    Returns: DataFrame with columns ['sha256', 'username'], one row per distinct\n",
    "    sha256 value; when a hash occurs more than once the last occurrence is kept.\n",
    "    \"\"\"\n",
    "    sha256_hashes = [sha256_hex(name) for name in tqdm(usernames)]\n",
    "    # Column order is pinned explicitly so the CSV always matches the\n",
    "    # sha256,username layout in the notebook title, instead of depending on how\n",
    "    # the installed pandas version orders dict keys.\n",
    "    df = pd.DataFrame(\n",
    "        {'sha256': sha256_hashes, 'username': usernames},\n",
    "        columns=['sha256', 'username'],\n",
    "    )\n",
    "    return df.drop_duplicates(subset='sha256', keep='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_users = build_username_sha_table(read_usernames(fp_usernames_in))\n",
    "\n",
    "assert df_users['sha256'].is_unique\n",
    "assert list(df_users.columns) == ['sha256', 'username']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_users.to_csv(fp_username_sha_out, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create download CSV\n",
    "\n",
    "Example line noted in the original notebook (where it comes from is not recorded here):\n",
    "\n",
    "`http://farm5.staticflickr.com/4086/4993389409_f1140639d5.jpg,old+guard+museum`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_url_table(urls):\n",
    "    \"\"\"Pair every URL with the final component of its path.\n",
    "\n",
    "    Parameters: urls - iterable of str.\n",
    "    Returns: DataFrame with one row per input URL and the keys 'url' and\n",
    "    'filepath', where 'filepath' is `Path(url).name`.\n",
    "\n",
    "    Column order is left to pandas, which orders the keys of a list of dicts\n",
    "    differently across versions; read the resulting CSV by header name.\n",
    "    \"\"\"\n",
    "    url_maps = [{'url': url, 'filepath': Path(url).name} for url in urls]\n",
    "    return pd.DataFrame.from_dict(url_maps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "urls = pd.read_csv(fp_metadata_in)['url']\n",
    "df_urls = build_url_table(urls)\n",
    "\n",
    "assert len(df_urls) == len(urls)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_urls.to_csv(fp_urls_out, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "megapixels",
   "language": "python",
   "name": "megapixels"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}