From cba3a913fb63a8b97d25f8c5c40274897d290d4e Mon Sep 17 00:00:00 2001 From: adamhrv Date: Thu, 18 Apr 2019 23:41:55 +0200 Subject: update msceleb --- .../datasets/50_people_one_question/index.html | 24 ++-- site/public/datasets/afad/index.html | 24 ++-- site/public/datasets/brainwash/index.html | 24 ++-- site/public/datasets/caltech_10k/index.html | 24 ++-- site/public/datasets/celeba/index.html | 24 ++-- site/public/datasets/cofw/index.html | 26 ++-- site/public/datasets/duke_mtmc/index.html | 24 ++-- site/public/datasets/feret/index.html | 24 ++-- site/public/datasets/hrt_transgender/index.html | 22 +-- site/public/datasets/index.html | 147 +++++++++++++++++++++ site/public/datasets/lfpw/index.html | 24 ++-- site/public/datasets/lfw/index.html | 24 ++-- site/public/datasets/market_1501/index.html | 24 ++-- site/public/datasets/msceleb/index.html | 28 ++-- site/public/datasets/oxford_town_centre/index.html | 24 ++-- site/public/datasets/pipa/index.html | 24 ++-- site/public/datasets/pubfig/index.html | 24 ++-- site/public/datasets/uccs/index.html | 24 ++-- site/public/datasets/vgg_face2/index.html | 24 ++-- site/public/datasets/viper/index.html | 24 ++-- .../public/datasets/youtube_celebrities/index.html | 24 ++-- 21 files changed, 389 insertions(+), 242 deletions(-) create mode 100644 site/public/datasets/index.html (limited to 'site/public/datasets') diff --git a/site/public/datasets/50_people_one_question/index.html b/site/public/datasets/50_people_one_question/index.html index dc7919f7..76d5b92f 100644 --- a/site/public/datasets/50_people_one_question/index.html +++ b/site/public/datasets/50_people_one_question/index.html @@ -88,7 +88,7 @@

Dataset Citations

- The dataset citations used in the visualizations were collected from Semantic Scholar, a website which aggregates and indexes research papers. Each citation was geocoded using names of institutions found in the PDF front matter, or as listed on other resources. These papers have been manually verified to show that researchers downloaded and used the dataset to train or test machine learning algorithms. + The dataset citations used in the visualizations were collected from Semantic Scholar, a website which aggregates and indexes research papers. Each citation was geocoded using names of institutions found in the PDF front matter, or as listed on other resources. These papers have been manually verified to show that researchers downloaded and used the dataset to train or test machine learning algorithms. If you use our data, please cite our work.

@@ -96,17 +96,17 @@ diff --git a/site/public/datasets/afad/index.html b/site/public/datasets/afad/index.html index f2b0a5ba..a3ff00cf 100644 --- a/site/public/datasets/afad/index.html +++ b/site/public/datasets/afad/index.html @@ -90,7 +90,7 @@

Dataset Citations

@@ -109,17 +109,17 @@ Motivation

diff --git a/site/public/datasets/brainwash/index.html b/site/public/datasets/brainwash/index.html index b17617a6..cf1f5e5e 100644 --- a/site/public/datasets/brainwash/index.html +++ b/site/public/datasets/brainwash/index.html @@ -99,7 +99,7 @@

Dataset Citations

@@ -137,17 +137,17 @@ diff --git a/site/public/datasets/caltech_10k/index.html b/site/public/datasets/caltech_10k/index.html index 04d63ee3..e86c5ca3 100644 --- a/site/public/datasets/caltech_10k/index.html +++ b/site/public/datasets/caltech_10k/index.html @@ -96,7 +96,7 @@

Dataset Citations

@@ -106,17 +106,17 @@ diff --git a/site/public/datasets/celeba/index.html b/site/public/datasets/celeba/index.html index c72f3798..0236b91c 100644 --- a/site/public/datasets/celeba/index.html +++ b/site/public/datasets/celeba/index.html @@ -94,7 +94,7 @@

Dataset Citations

@@ -108,17 +108,17 @@ diff --git a/site/public/datasets/cofw/index.html b/site/public/datasets/cofw/index.html index eef8cf5e..b0e73dac 100644 --- a/site/public/datasets/cofw/index.html +++ b/site/public/datasets/cofw/index.html @@ -87,7 +87,7 @@

Dataset Citations

@@ -138,7 +138,7 @@ To increase the number of training images, and since COFW has the exact same la

Dataset Citations

@@ -161,17 +161,17 @@ To increase the number of training images, and since COFW has the exact same la diff --git a/site/public/datasets/duke_mtmc/index.html b/site/public/datasets/duke_mtmc/index.html index 14e6bee0..90c131b8 100644 --- a/site/public/datasets/duke_mtmc/index.html +++ b/site/public/datasets/duke_mtmc/index.html @@ -246,7 +246,7 @@

Dataset Citations

@@ -369,17 +369,17 @@ diff --git a/site/public/datasets/feret/index.html b/site/public/datasets/feret/index.html index 387826b0..09abaee2 100644 --- a/site/public/datasets/feret/index.html +++ b/site/public/datasets/feret/index.html @@ -90,7 +90,7 @@

Dataset Citations

@@ -119,17 +119,17 @@ diff --git a/site/public/datasets/hrt_transgender/index.html b/site/public/datasets/hrt_transgender/index.html index 6b9ae7be..4e566a4a 100644 --- a/site/public/datasets/hrt_transgender/index.html +++ b/site/public/datasets/hrt_transgender/index.html @@ -49,17 +49,17 @@ diff --git a/site/public/datasets/index.html b/site/public/datasets/index.html new file mode 100644 index 00000000..6e43e73f --- /dev/null +++ b/site/public/datasets/index.html @@ -0,0 +1,147 @@ + + + + MegaPixels + + + + + + + + + + + +

+ +

MegaPixels

+ + +

+ Datasets + About +

+ + +

Face Recognition Datasets

Explore publicly available facial recognition datasets feeding into research and development of biometric surveillance technologies at the largest technology companies and defense contractors in the world.

+ +

+ + +

+ + + +

Person re-identification, multi-camera tracking

2,000,000 images

2,700

+ + + +

+ HRT Transgender Dataset +

2013

Face recognition, gender transition biometrics

10,564 images

+ + + +

+ IJB-C +

2017

face recognition challenge by NIST in full motion videos

+ + + +

Large-scale face recognition

1,000,000 images

100,000

+ + + +

+ Oxford Town Centre +

2009

Person detection, gaze estimation

images

2,200

+ + + +

+ UnConstrained College Students +

2016

Face recognition, face detection

16,149 images

1,732

+ + +

+ + + + + \ No newline at end of file diff --git a/site/public/datasets/lfpw/index.html b/site/public/datasets/lfpw/index.html index 45de2599..1238c8d3 100644 --- a/site/public/datasets/lfpw/index.html +++ b/site/public/datasets/lfpw/index.html @@ -83,7 +83,7 @@

Dataset Citations

@@ -98,17 +98,17 @@ diff --git a/site/public/datasets/lfw/index.html b/site/public/datasets/lfw/index.html index ca17b1cd..68021e93 100644 --- a/site/public/datasets/lfw/index.html +++ b/site/public/datasets/lfw/index.html @@ -97,7 +97,7 @@

Dataset Citations

@@ -148,17 +148,17 @@ diff --git a/site/public/datasets/market_1501/index.html b/site/public/datasets/market_1501/index.html index 7c545335..a72cb6cf 100644 --- a/site/public/datasets/market_1501/index.html +++ b/site/public/datasets/market_1501/index.html @@ -91,7 +91,7 @@

Dataset Citations

@@ -114,17 +114,17 @@ organization={Springer} diff --git a/site/public/datasets/msceleb/index.html b/site/public/datasets/msceleb/index.html index 345592d3..a00b3527 100644 --- a/site/public/datasets/msceleb/index.html +++ b/site/public/datasets/msceleb/index.html @@ -50,7 +50,7 @@

Website

msceleb.org

Microsoft Celeb (MS Celeb) is a dataset of 10 million face images scraped from the Internet and used for research and development of large-scale biometric recognition systems. According to Microsoft Research, who created and published the dataset in 2016, MS Celeb is the largest publicly available face recognition dataset in the world, containing over 10 million images of nearly 100,000 individuals. Microsoft's goal in building this dataset was to distribute an initial training dataset of 100,000 individuals' images and use this to accelerate research into recognizing a target list of one million individuals from their face images "using all the possibly collected face images of this individual on the web as training data". 1

These one million people, defined by Micrsoft Research as "celebrities", are often merely people who must maintain an online presence for their professional lives. Microsoft's list of 1 million people is an expansive exploitation of the current reality that for many people including academics, policy makers, writers, artists, and especially journalists maintaining an online presence is mandatory and should not allow Microsoft or anyone else to use their biometrics for reserach and development of surveillance technology. Many of names in target list even include people critical of the very technology Microsoft is using their name and biometric information to build. The list includes digital rights activists like Jillian York and [add more]; artists critical of surveillance including Trevor Paglen, Hito Steryl, Jill Magid, and Aram Bartholl; Intercept founders Laura Poitras, Jeremy Scahill, and Glen Greenwald; Data and Society founder danah boyd; and even Julie Brill the former FTC commissioner responsible for protecting consumer’s privacy to name a few.

These one million people, defined by Microsoft Research as "celebrities", are often merely people who must maintain an online presence for their professional lives. Microsoft's list of 1 million people is an expansive exploitation of the current reality that for many people, including academics, policy makers, writers, artists, and especially journalists, maintaining an online presence is mandatory and should not allow Microsoft or anyone else to use their biometrics for research and development of surveillance technology. Many of the names in the target list even include people critical of the very technology Microsoft is using their name and biometric information to build. The list includes digital rights activists like Jillian York and [add more]; artists critical of surveillance including Trevor Paglen, Hito Steyerl, Jill Magid, and Aram Bartholl; Intercept founders Laura Poitras, Jeremy Scahill, and Glenn Greenwald; Data and Society founder danah boyd; and even Julie Brill the former FTC commissioner responsible for protecting consumer’s privacy to name a few.

Microsoft's 1 Million Target List

Below is a list of names that were included in the list of 1 million individuals, curated to illustrate Microsoft's expansive and exploitative practice of scraping the Internet for biometric training data. The entire name file can be downloaded from msceleb.org. Email msceleb@microsoft.com to have your name removed. Names appearing with * indicate that Microsoft also distributed images.

@@ -166,7 +166,7 @@

Earlier in 2019, Microsoft President Brad Smith called for the governmental regulation of face recognition, citing the potential for misuse, a rare admission that Microsoft's surveillance-driven business model had lost its bearings. More recently Smith also announced that Microsoft would seemingly take a stand against potential misuse and decided to not sell face recognition to an unnamed United States law enforcement agency, citing that their technology was not accurate enough to be used on minorities because it was trained mostly on white male faces.

What the decision to block the sale announces is not so much that Microsoft had upgraded their ethics, but that Microsoft publicly acknowledged it can't sell a data-driven product without data. In other words, Microsoft can't sell face recognition for faces they can't train on.

Until now, that data has been freely harvested from the Internet and packaged in training sets like MS Celeb, which are overwhelmingly white and male. Without balanced data, facial recognition contains blind spots. And without datasets like MS Celeb, the powerful yet inaccurate facial recognition services like Microsoft's Azure Cognitive Service also would not be able to see at all.

Microsoft didn't only create MS Celeb for other researchers to use, they also used it internally. In a publicly available 2017 Microsoft Research project called "(One-shot Face Recognition by Promoting Underrepresented Classes)", Microsoft leveraged the MS Celeb dataset to analyze their algorithms and advertise the results. Interestingly, Microsoft's corporate version of the paper does not mention they used the MS Celeb datset, but the open-access version published on arxiv.org explicitly mentions that Microsoft Research tested their algorithms "on the MS-Celeb-1M low-shot learning benchmark task."

Microsoft didn't only create MS Celeb for other researchers to use, they also used it internally. In a publicly available 2017 Microsoft Research project called One-shot Face Recognition by Promoting Underrepresented Classes, Microsoft leveraged the MS Celeb dataset to analyze their algorithms and advertise the results. Interestingly, Microsoft's corporate version of the paper does not mention they used the MS Celeb dataset, but the open-access version published on arxiv.org explicitly mentions that Microsoft Research tested their algorithms "on the MS-Celeb-1M low-shot learning benchmark task."

We suggest that if Microsoft Research wants to make biometric data publicly available for surveillance research and development, they should start with releasing their researchers' own biometric data instead of scraping the Internet for journalists, artists, writers, actors, athletes, musicians, and academics.

Who used Microsoft Celeb?

@@ -215,7 +215,7 @@

Dataset Citations

@@ -234,17 +234,17 @@ diff --git a/site/public/datasets/oxford_town_centre/index.html b/site/public/datasets/oxford_town_centre/index.html index 4fbcaccb..fabcae6b 100644 --- a/site/public/datasets/oxford_town_centre/index.html +++ b/site/public/datasets/oxford_town_centre/index.html @@ -98,7 +98,7 @@

Dataset Citations

@@ -138,17 +138,17 @@ diff --git a/site/public/datasets/pipa/index.html b/site/public/datasets/pipa/index.html index 6c920b46..297f4d45 100644 --- a/site/public/datasets/pipa/index.html +++ b/site/public/datasets/pipa/index.html @@ -94,7 +94,7 @@

Dataset Citations

@@ -102,17 +102,17 @@ diff --git a/site/public/datasets/pubfig/index.html b/site/public/datasets/pubfig/index.html index e81e12bc..5feed748 100644 --- a/site/public/datasets/pubfig/index.html +++ b/site/public/datasets/pubfig/index.html @@ -91,7 +91,7 @@

Dataset Citations

@@ -99,17 +99,17 @@ diff --git a/site/public/datasets/uccs/index.html b/site/public/datasets/uccs/index.html index 23aeeff1..3296cabc 100644 --- a/site/public/datasets/uccs/index.html +++ b/site/public/datasets/uccs/index.html @@ -104,7 +104,7 @@ Their setup made it impossible for students to know they were being photographed

Dataset Citations

@@ -258,17 +258,17 @@ Their setup made it impossible for students to know they were being photographed diff --git a/site/public/datasets/vgg_face2/index.html b/site/public/datasets/vgg_face2/index.html index a9d318f1..5f314d9e 100644 --- a/site/public/datasets/vgg_face2/index.html +++ b/site/public/datasets/vgg_face2/index.html @@ -96,7 +96,7 @@

Dataset Citations

@@ -124,17 +124,17 @@ diff --git a/site/public/datasets/viper/index.html b/site/public/datasets/viper/index.html index bc4ddd3d..4d2abbe1 100644 --- a/site/public/datasets/viper/index.html +++ b/site/public/datasets/viper/index.html @@ -96,7 +96,7 @@

Dataset Citations

@@ -104,17 +104,17 @@ diff --git a/site/public/datasets/youtube_celebrities/index.html b/site/public/datasets/youtube_celebrities/index.html index 69b3a02e..d0a7a172 100644 --- a/site/public/datasets/youtube_celebrities/index.html +++ b/site/public/datasets/youtube_celebrities/index.html @@ -75,7 +75,7 @@

Dataset Citations

@@ -95,17 +95,17 @@ the views of our sponsors. -- cgit v1.2.3-70-g09d2