summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb')
-rw-r--r--megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb206
1 files changed, 206 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
new file mode 100644
index 00000000..b4a29243
--- /dev/null
+++ b/megapixels/notebooks/datasets/ijb_count/ibj_count.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Count IJB sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import file_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Paths to the IJB-A, IJB-B and IJB-C (cs3/cs4) CSV inputs, plus the summary output path.\n",
+ "# Every file sits under the same extracted IJB tree, so the root is spelled out once.\n",
+ "DIR_IJB = '/data_store_hdd/datasets/people/ijb_c/downloads/tars/IJB'\n",
+ "fp_in_cs3 = f'{DIR_IJB}/IJB-C/license/cs3_media.csv'\n",
+ "fp_in_cs4 = f'{DIR_IJB}/IJB-C/license/cs4_media.csv'\n",
+ "fp_in_ijb_b = f'{DIR_IJB}/IJB-B/ijbb_licenses_and_sources.csv'\n",
+ "fp_in_ijb_a = f'{DIR_IJB}/IJB-A/SOURCES.csv'\n",
+ "fp_out = f'{DIR_IJB}/IJB-C/license/summary.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stack the cs3 and cs4 media lists into a single table.\n",
+ "# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the\n",
+ "# equivalent call and, like append's default, keeps each input's own row labels.\n",
+ "df_cs3 = pd.read_csv(fp_in_cs3)\n",
+ "df_cs4 = pd.read_csv(fp_in_cs4)\n",
+ "# Replace missing cells with '' (returns a new frame instead of mutating in place).\n",
+ "df_sources = pd.concat([df_cs3, df_cs4]).fillna('')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the IJB-A source list and rebind df_sources to it, so the cells below count\n",
+ "# IJB-A rather than any table df_sources held before this cell ran.\n",
+ "# To count IJB-B instead, read fp_in_ijb_b here.\n",
+ "df_ijb_a = pd.read_csv(fp_in_ijb_a)\n",
+ "df_sources = df_ijb_a.fillna('')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One dict per table row, keyed by column name.\n",
+ "sources = df_sources.to_dict(orient='records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bucket every media URL by the hosting site(s) it mentions.\n",
+ "#   results: site key -> list of matching URLs (duplicates are kept)\n",
+ "#   others:  non-empty URLs that matched none of the site keys\n",
+ "keys = ['flickr.com', 'youtube.com', 'wikipedia.org', 'wikimedia.org']\n",
+ "results = {k: [] for k in keys}\n",
+ "others = []\n",
+ "for source in sources:\n",
+ "    # str() on both fields: a column that parses as numeric would otherwise make the\n",
+ "    # substring tests below raise TypeError.\n",
+ "    url = str(source['Media URL'])\n",
+ "    media_id = str(source['Media ID'])\n",
+ "    if 'nonfaces' in media_id:\n",
+ "        continue  # rows whose ID contains 'nonfaces' are left out of every bucket\n",
+ "    matched = [k for k in keys if k in url]\n",
+ "    for k in matched:\n",
+ "        # a URL that mentions several site keys is recorded under each of them\n",
+ "        results[k].append(url)\n",
+ "    if not matched and url:\n",
+ "        others.append(url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "flickr.com 0\n",
+ "youtube.com 1388\n",
+ "wikipedia.org 0\n",
+ "wikimedia.org 4298\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of distinct URLs collected per site key.\n",
+ "for site, urls in results.items():\n",
+ "    print(site, len(set(urls)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "siliconangle.com/files/2011/06/kaz-hirai.jpg\n",
+ "etnosi.files.wordpress.com/2012/05/sofi-marinova-baku.jpg\n",
+ "images.coveralia.com/audio/p/Pia_Zadora-When_The_Lights_Go_Out-Interior_Frontal.jpg\n",
+ "4.bp.blogspot.com/-TFHOJVIW3a8/T_1mD6MdOxI/AAAAAAAADAg/PhKDPx0Aqu0/s1600/ivan_pavlov.jpg\n",
+ "863793661388437597-a-1802744773732722657-s-sites.googlegroups.com/site/virginmarysite/Home/jackneosex.jpg\n",
+ "amckiereads.files.wordpress.com/2010/12/darwish.jpg?w=600\n",
+ "img.interia.pl/komputery/nimg/5/7/Kazuo_Hirai_plan_odbudowe_5726348.jpg\n",
+ "2.bp.blogspot.com/-JAYvKsHcQPI/T4f3wbCIMDI/AAAAAAAAFDM/lTs3uKlb3A0/s1600/deeksha_seth_launches_chandana_brothers_showroom_Yellow+Saree+smiling+pics+%25285%2529.jpg\n",
+ "1.bp.blogspot.com/-D3SI27GS7-g/U-iD5fPcFDI/AAAAAAAABOs/VaB_BRRa6OU/s320/news8.jpg\n",
+ "1.bp.blogspot.com/_ilOjS7A_kk4/SVGCtcyAAmI/AAAAAAAAAH4/9-KKBqYeDBA/s400/playstation-3-grill_12.jpg\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Preview the first ten URLs that matched none of the site keys.\n",
+ "for unmatched_url in others[:10]:\n",
+ "    print(unmatched_url)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "21319"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Raw (not de-duplicated) total of flickr + wikimedia + unmatched URLs.\n",
+ "# NOTE(review): youtube.com and wikipedia.org are not part of this sum - confirm that is intended.\n",
+ "n_flickr = len(results['flickr.com'])\n",
+ "n_wikimedia = len(results['wikimedia.org'])\n",
+ "n_flickr + n_wikimedia + len(others)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}