summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md2
-rw-r--r--megapixels/app/site/builder.py22
-rw-r--r--megapixels/app/site/loader.py123
-rw-r--r--megapixels/app/site/parser.py204
-rw-r--r--megapixels/commands/site/watch.py44
-rw-r--r--site/assets/css/css.css1
-rw-r--r--site/content/pages/datasets/lfw/index.md55
-rw-r--r--site/public/datasets/lfw/index.html43
8 files changed, 266 insertions, 228 deletions
diff --git a/README.md b/README.md
index e1a2c1d0..e46a6289 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ pip install numpy Pillow
pip install dlib
pip install requests simplejson click pdfminer.six
pip install urllib3 flask flask_sqlalchemy mysql-connector
-pip install pymediainfo tqdm opencv-python imutils
+pip install pymediainfo tqdm opencv-python imutils watchdog
pip install scikit-image python-dotenv imagehash scikit-learn colorlog
pip install celery keras tensorflow
pip install python.app # OSX only! needed for matplotlib
diff --git a/megapixels/app/site/builder.py b/megapixels/app/site/builder.py
index 188fbc25..15055110 100644
--- a/megapixels/app/site/builder.py
+++ b/megapixels/app/site/builder.py
@@ -7,6 +7,7 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape
import app.settings.app_cfg as cfg
import app.site.s3 as s3
+import app.site.loader as loader
import app.site.parser as parser
env = Environment(
@@ -21,7 +22,7 @@ def build_page(fn, research_posts, datasets):
- syncs any assets with s3
- handles certain index pages...
"""
- metadata, sections = parser.read_metadata(fn)
+ metadata, sections = loader.read_metadata(fn)
if metadata is None:
print("{} has no metadata".format(fn))
@@ -55,7 +56,7 @@ def build_page(fn, research_posts, datasets):
if 'index.md' in fn:
s3.sync_directory(dirname, s3_dir, metadata)
- content = parser.parse_markdown(sections, s3_path, skip_h1=skip_h1)
+ content = parser.parse_markdown(metadata, sections, s3_path, skip_h1=skip_h1)
html = template.render(
metadata=metadata,
@@ -73,11 +74,11 @@ def build_index(key, research_posts, datasets):
"""
build the index of research (blog) posts
"""
- metadata, sections = parser.read_metadata(os.path.join(cfg.DIR_SITE_CONTENT, key, 'index.md'))
+ metadata, sections = loader.read_metadata(os.path.join(cfg.DIR_SITE_CONTENT, key, 'index.md'))
template = env.get_template("page.html")
s3_path = s3.make_s3_path(cfg.S3_SITE_PATH, metadata['path'])
- content = parser.parse_markdown(sections, s3_path, skip_h1=False)
- content += parser.parse_research_index(research_posts)
+ content = parser.parse_markdown(metadata, sections, s3_path, skip_h1=False)
+ content += loader.parse_research_index(research_posts)
html = template.render(
metadata=metadata,
content=content,
@@ -93,8 +94,8 @@ def build_site():
"""
build the site! =^)
"""
- research_posts = parser.read_research_post_index()
- datasets = parser.read_datasets_index()
+ research_posts = loader.read_research_post_index()
+ datasets = loader.read_datasets_index()
for fn in glob.iglob(os.path.join(cfg.DIR_SITE_CONTENT, "**/*.md"), recursive=True):
build_page(fn, research_posts, datasets)
build_index('research', research_posts, datasets)
@@ -103,7 +104,8 @@ def build_file(fn):
"""
build just one page from a filename! =^)
"""
- research_posts = parser.read_research_post_index()
- datasets = parser.read_datasets_index()
- fn = os.path.join(cfg.DIR_SITE_CONTENT, fn)
+ research_posts = loader.read_research_post_index()
+ datasets = loader.read_datasets_index()
+ if cfg.DIR_SITE_CONTENT not in fn:
+ fn = os.path.join(cfg.DIR_SITE_CONTENT, fn)
build_page(fn, research_posts, datasets)
diff --git a/megapixels/app/site/loader.py b/megapixels/app/site/loader.py
new file mode 100644
index 00000000..691efb25
--- /dev/null
+++ b/megapixels/app/site/loader.py
@@ -0,0 +1,123 @@
+import os
+import re
+import glob
+import simplejson as json
+
+import app.settings.app_cfg as cfg
+
+def read_metadata(fn):
+ """
+ Read in a markdown file and extract the metadata
+ """
+ with open(fn, "r") as file:
+ data = file.read()
+ data = data.replace("\n ", "\n")
+ if "\n" in data:
+ data = data.replace("\r", "")
+ else:
+ data = data.replace("\r", "\n")
+ sections = data.split("\n\n")
+ return parse_metadata(fn, sections)
+
+
+default_metadata = {
+ 'status': 'published',
+ 'title': 'Untitled Page',
+ 'desc': '',
+ 'slug': '',
+ 'published': '2018-12-31',
+ 'updated': '2018-12-31',
+ 'authors': 'Adam Harvey',
+ 'sync': 'true',
+ 'tagline': '',
+}
+
+def parse_metadata(fn, sections):
+ """
+ parse the metadata headers in a markdown file
+ (everything before the second ---------)
+ also generates appropriate urls for this page :)
+ """
+ found_meta = False
+ metadata = {}
+ valid_sections = []
+ for section in sections:
+ if not found_meta and ': ' in section:
+ found_meta = True
+ parse_metadata_section(metadata, section)
+ continue
+ if '-----' in section:
+ continue
+ if found_meta:
+ valid_sections.append(section)
+
+ if 'title' not in metadata:
+ print('warning: {} has no title'.format(fn))
+ for key in default_metadata:
+ if key not in metadata:
+ metadata[key] = default_metadata[key]
+
+ basedir = os.path.dirname(fn.replace(cfg.DIR_SITE_CONTENT, ''))
+ basename = os.path.basename(fn)
+ if basedir == '/':
+ metadata['path'] = '/'
+ metadata['url'] = '/'
+ elif basename == 'index.md':
+ metadata['path'] = basedir + '/'
+ metadata['url'] = metadata['path']
+ else:
+ metadata['path'] = basedir + '/'
+ metadata['url'] = metadata['path'] + basename.replace('.md', '') + '/'
+
+ if metadata['status'] == 'published|draft|private':
+ metadata['status'] = 'published'
+
+ metadata['sync'] = metadata['sync'] != 'false'
+
+ metadata['author_html'] = '<br>'.join(metadata['authors'].split(','))
+
+ return metadata, valid_sections
+
+def parse_metadata_section(metadata, section):
+ """
+ parse a metadata key: value pair
+ """
+ for line in section.split("\n"):
+ if ': ' not in line:
+ continue
+ key, value = line.split(': ', 1)
+ metadata[key.lower()] = value
+
+
+def read_research_post_index():
+ """
+ Generate an index of the research (blog) posts
+ """
+ return read_post_index('research')
+
+
+def read_datasets_index():
+ """
+ Generate an index of the datasets
+ """
+ return read_post_index('datasets')
+
+
+def read_post_index(basedir):
+ """
+ Generate an index of posts
+ """
+ posts = []
+ for fn in sorted(glob.glob(os.path.join(cfg.DIR_SITE_CONTENT, basedir, '*/index.md'))):
+ metadata, valid_sections = read_metadata(fn)
+ if metadata is None or metadata['status'] == 'private' or metadata['status'] == 'draft':
+ continue
+ posts.append(metadata)
+ if not len(posts):
+ posts.append({
+ 'title': 'Placeholder',
+ 'slug': 'placeholder',
+ 'date': 'Placeholder',
+ 'url': '/',
+ })
+ return posts
diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py
index d6705214..3792e6f1 100644
--- a/megapixels/app/site/parser.py
+++ b/megapixels/app/site/parser.py
@@ -10,6 +10,49 @@ import app.site.s3 as s3
renderer = mistune.Renderer(escape=False)
markdown = mistune.Markdown(renderer=renderer)
+def parse_markdown(metadata, sections, s3_path, skip_h1=False):
+ """
+ parse page into sections, preprocess the markdown to handle our modifications
+ """
+ groups = []
+ current_group = []
+ for section in sections:
+ if skip_h1 and section.startswith('# '):
+ continue
+ elif section.strip().startswith('```'):
+ groups.append(format_section(current_group, s3_path))
+ current_group = []
+ current_group.append(section)
+ if section.strip().endswith('```'):
+ groups.append(format_applet("\n\n".join(current_group), s3_path))
+ current_group = []
+ elif section.strip().endswith('```'):
+ current_group.append(section)
+ groups.append(format_applet("\n\n".join(current_group), s3_path))
+ current_group = []
+ elif section.startswith('+ '):
+ groups.append(format_section(current_group, s3_path))
+ groups.append(format_metadata(section))
+ current_group = []
+ elif '![fullwidth:' in section:
+ groups.append(format_section(current_group, s3_path))
+ groups.append(format_section([section], s3_path, type='fullwidth'))
+ current_group = []
+ elif '![wide:' in section:
+ groups.append(format_section(current_group, s3_path))
+ groups.append(format_section([section], s3_path, type='wide'))
+ current_group = []
+ elif '![' in section:
+ groups.append(format_section(current_group, s3_path))
+ groups.append(format_section([section], s3_path, type='images'))
+ current_group = []
+ else:
+ current_group.append(section)
+ groups.append(format_section(current_group, s3_path))
+ content = "".join(groups)
+ return content
+
+
def fix_images(lines, s3_path):
"""
do our own tranformation of the markdown around images to handle wide images etc
@@ -32,6 +75,7 @@ def fix_images(lines, s3_path):
real_lines.append(line)
return "\n".join(real_lines)
+
def format_section(lines, s3_path, type=''):
"""
format a normal markdown section
@@ -44,6 +88,7 @@ def format_section(lines, s3_path, type=''):
return "<section>" + markdown(lines) + "</section>"
return ""
+
def format_metadata(section):
"""
format a metadata section (+ key: value pairs)
@@ -54,7 +99,11 @@ def format_metadata(section):
meta.append("<div><div class='gray'>{}</div><div>{}</div></div>".format(key, value))
return "<section><div class='meta'>{}</div></section>".format(''.join(meta))
+
def format_applet(section, s3_path):
+ """
+ Format the applets, which load javascript modules like the map and CSVs
+ """
# print(section)
payload = section.strip('```').strip().strip('```').strip().split('\n')
applet = {}
@@ -79,47 +128,6 @@ def format_applet(section, s3_path):
applet['fields'] = payload[1:]
return "<section class='applet_container'><div class='applet' data-payload='{}'></div></section>".format(json.dumps(applet))
-def parse_markdown(sections, s3_path, skip_h1=False):
- """
- parse page into sections, preprocess the markdown to handle our modifications
- """
- groups = []
- current_group = []
- for section in sections:
- if skip_h1 and section.startswith('# '):
- continue
- elif section.strip().startswith('```'):
- groups.append(format_section(current_group, s3_path))
- current_group = []
- current_group.append(section)
- if section.strip().endswith('```'):
- groups.append(format_applet("\n\n".join(current_group), s3_path))
- current_group = []
- elif section.strip().endswith('```'):
- current_group.append(section)
- groups.append(format_applet("\n\n".join(current_group), s3_path))
- current_group = []
- elif section.startswith('+ '):
- groups.append(format_section(current_group, s3_path))
- groups.append(format_metadata(section))
- current_group = []
- elif '![fullwidth:' in section:
- groups.append(format_section(current_group, s3_path))
- groups.append(format_section([section], s3_path, type='fullwidth'))
- current_group = []
- elif '![wide:' in section:
- groups.append(format_section(current_group, s3_path))
- groups.append(format_section([section], s3_path, type='wide'))
- current_group = []
- elif '![' in section:
- groups.append(format_section(current_group, s3_path))
- groups.append(format_section([section], s3_path, type='images'))
- current_group = []
- else:
- current_group.append(section)
- groups.append(format_section(current_group, s3_path))
- content = "".join(groups)
- return content
def parse_research_index(research_posts):
"""
@@ -141,117 +149,3 @@ def parse_research_index(research_posts):
content += row
content += '</div>'
return content
-
-def read_metadata(fn):
- """
- Read in read a markdown file and extract the metadata
- """
- with open(fn, "r") as file:
- data = file.read()
- data = data.replace("\n ", "\n")
- if "\n" in data:
- data = data.replace("\r", "")
- else:
- data = data.replace("\r", "\n")
- sections = data.split("\n\n")
- return parse_metadata(fn, sections)
-
-default_metadata = {
- 'status': 'published',
- 'title': 'Untitled Page',
- 'desc': '',
- 'slug': '',
- 'published': '2018-12-31',
- 'updated': '2018-12-31',
- 'authors': 'Adam Harvey',
- 'sync': 'true',
- 'tagline': '',
-}
-
-def parse_metadata_section(metadata, section):
- """
- parse a metadata key: value pair
- """
- for line in section.split("\n"):
- if ': ' not in line:
- continue
- key, value = line.split(': ', 1)
- metadata[key.lower()] = value
-
-def parse_metadata(fn, sections):
- """
- parse the metadata headers in a markdown file
- (everything before the second ---------)
- also generates appropriate urls for this page :)
- """
- found_meta = False
- metadata = {}
- valid_sections = []
- for section in sections:
- if not found_meta and ': ' in section:
- found_meta = True
- parse_metadata_section(metadata, section)
- continue
- if '-----' in section:
- continue
- if found_meta:
- valid_sections.append(section)
-
- if 'title' not in metadata:
- print('warning: {} has no title'.format(fn))
- for key in default_metadata:
- if key not in metadata:
- metadata[key] = default_metadata[key]
-
- basedir = os.path.dirname(fn.replace(cfg.DIR_SITE_CONTENT, ''))
- basename = os.path.basename(fn)
- if basedir == '/':
- metadata['path'] = '/'
- metadata['url'] = '/'
- elif basename == 'index.md':
- metadata['path'] = basedir + '/'
- metadata['url'] = metadata['path']
- else:
- metadata['path'] = basedir + '/'
- metadata['url'] = metadata['path'] + basename.replace('.md', '') + '/'
-
- if metadata['status'] == 'published|draft|private':
- metadata['status'] = 'published'
-
- metadata['sync'] = metadata['sync'] != 'false'
-
- metadata['author_html'] = '<br>'.join(metadata['authors'].split(','))
-
- return metadata, valid_sections
-
-def read_research_post_index():
- """
- Generate an index of the research (blog) posts
- """
- return read_post_index('research')
-
-def read_datasets_index():
- """
- Generate an index of the datasets
- """
- return read_post_index('datasets')
-
-def read_post_index(basedir):
- """
- Generate an index of posts
- """
- posts = []
- for fn in sorted(glob.glob(os.path.join(cfg.DIR_SITE_CONTENT, basedir, '*/index.md'))):
- metadata, valid_sections = read_metadata(fn)
- if metadata is None or metadata['status'] == 'private' or metadata['status'] == 'draft':
- continue
- posts.append(metadata)
- if not len(posts):
- posts.append({
- 'title': 'Placeholder',
- 'slug': 'placeholder',
- 'date': 'Placeholder',
- 'url': '/',
- })
- return posts
-
diff --git a/megapixels/commands/site/watch.py b/megapixels/commands/site/watch.py
new file mode 100644
index 00000000..7fd3ba7c
--- /dev/null
+++ b/megapixels/commands/site/watch.py
@@ -0,0 +1,44 @@
+"""
+Watch for changes in the static site and build them
+"""
+
+import click
+import time
+from watchdog.observers import Observer
+from watchdog.events import PatternMatchingEventHandler
+
+import app.settings.app_cfg as cfg
+from app.site.builder import build_site, build_file
+
+class SiteBuilder(PatternMatchingEventHandler):
+ """
+ Handler for filesystem changes to the content path
+ """
+ patterns = ["*.md"]
+
+ def on_modified(self, event):
+ print(event.src_path, event.event_type)
+ build_file(event.src_path)
+
+ def on_created(self, event):
+ print(event.src_path, event.event_type)
+ build_file(event.src_path)
+
+@click.command()
+@click.pass_context
+def cli(ctx):
+ """
+ Run the observer and start watching for changes
+ """
+ print("{} is now being watched for changes.".format(cfg.DIR_SITE_CONTENT))
+ observer = Observer()
+ observer.schedule(SiteBuilder(), path=cfg.DIR_SITE_CONTENT, recursive=True)
+ observer.start()
+
+ try:
+ while True:
+ time.sleep(1)
+ except KeyboardInterrupt:
+ observer.stop()
+
+ observer.join()
diff --git a/site/assets/css/css.css b/site/assets/css/css.css
index 858d98eb..7b2e19fc 100644
--- a/site/assets/css/css.css
+++ b/site/assets/css/css.css
@@ -346,6 +346,7 @@ section.wide .image {
}
section.fullwidth {
width: 100%;
+ background-size: contain;
}
section.fullwidth .image {
max-width: 100%;
diff --git a/site/content/pages/datasets/lfw/index.md b/site/content/pages/datasets/lfw/index.md
index 8b37f035..48d86e1f 100644
--- a/site/content/pages/datasets/lfw/index.md
+++ b/site/content/pages/datasets/lfw/index.md
@@ -4,6 +4,8 @@ status: published
title: Labeled Faces in The Wild
desc: Labeled Faces in The Wild (LFW) is a database of face photographs designed for studying the problem of unconstrained face recognition
subdesc: It includes 13,456 images of 4,432 people’s images copied from the Internet during 2002-2004.
+image: lfw_index.gif
+caption: Eighteen of the 5,749 people in the Labeled Faces in the Wild Dataset. The most widely used face dataset for benchmarking commercial face recognition algorithms.
slug: lfw
published: 2019-2-23
updated: 2019-2-23
@@ -12,22 +14,13 @@ authors: Adam Harvey
------------
-# LFW
+### Statistics
+ Years: 2002-2004
+ Images: 13,233
+ Identities: 5,749
+ Origin: Yahoo News Images
-+ Funding: (Possibly, partially CIA*)
-
-![fullwidth:Eighteen of the 5,749 people in the Labeled Faces in the Wild Dataset. The most widely used face dataset for benchmarking commercial face recognition algorithms.](assets/lfw_index.gif)
-
-*Labeled Faces in The Wild* (LFW) is "a database of face photographs designed for studying the problem of unconstrained face recognition[^lfw_www]. It is used to evaluate and improve the performance of facial recognition algorithms in academic, commercial, and government research. According to BiometricUpdate.com[^lfw_pingan], LFW is "the most widely used evaluation set in the field of facial recognition, LFW attracts a few dozen teams from around the globe including Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong."
-
-The LFW dataset includes 13,233 images of 5,749 people that were collected between 2002-2004. LFW is a subset of *Names of Faces* and is part of the first facial recognition training dataset created entirely from images appearing on the Internet. The people appearing in LFW are...
-
-The *Names and Faces* dataset was the first face recognition dataset created entire from online photos. However, *Names and Faces* and *LFW* are not the first face recognition dataset created entirely "in the wild". That title belongs to the [UCD dataset](/datasets/ucd_faces/). Images obtained "in the wild" means using an image without explicit consent or awareness from the subject or photographer.
-
++ Funding: (Possibly, partially CIA)
### Analysis
@@ -39,25 +32,35 @@ The *Names and Faces* dataset was the first face recognition dataset created ent
- In all 3 of the LFW publications [^lfw_original_paper], [^lfw_survey], [^lfw_tech_report] the words "ethics", "consent", and "privacy" appear 0 times
- The word "future" appears 71 times
+## Labeled Faces in the Wild
+
+*Labeled Faces in The Wild* (LFW) is "a database of face photographs designed for studying the problem of unconstrained face recognition"[^lfw_www]. It is used to evaluate and improve the performance of facial recognition algorithms in academic, commercial, and government research. According to BiometricUpdate.com[^lfw_pingan], LFW is "the most widely used evaluation set in the field of facial recognition, LFW attracts a few dozen teams from around the globe including Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong."
+
+The LFW dataset includes 13,233 images of 5,749 people that were collected between 2002-2004. LFW is a subset of *Names of Faces* and is part of the first facial recognition training dataset created entirely from images appearing on the Internet. The people appearing in LFW are...
+
+The *Names and Faces* dataset was the first face recognition dataset created entirely from online photos. However, *Names and Faces* and *LFW* are not the first face recognition dataset created entirely "in the wild". That title belongs to the [UCD dataset](/datasets/ucd_faces/). Images obtained "in the wild" means using an image without explicit consent or awareness from the subject or photographer.
+
### Synthetic Faces
To visualize the types of photos in the dataset without explicitly publishing individual's identities a generative adversarial network (GAN) was trained on the entire dataset. The images in this video show a neural network learning the visual latent space and then interpolating between archetypical identities within the LFW dataset.
![fullwidth:](assets/lfw_synthetic.jpg)
-
### Biometric Trade Routes
To understand how this dataset has been used, its citations have been geocoded to show an approximate geographic digital trade route of the biometric data. Lines indicate an organization (education, commercial, or governmental) that has cited the LFW dataset in their research. Data is compiled from [SemanticScholar](https://www.semanticscholar.org).
-[add map here]
+```
+map
+```
### Citations
Browse or download the geocoded citation data collected for the LFW dataset.
-[add citations table here]
-
+```
+citations
+```
### Additional Information
@@ -69,27 +72,14 @@ Browse or download the geocoded citation data collected for the LFW dataset.
- The faces in the LFW dataset were detected using the Viola-Jones haarcascade face detector [^lfw_website] [^lfw-survey]
- The LFW dataset is used by several of the largest tech companies in the world including "Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong." [^lfw_pingan]
- All images in the LFW dataset were copied from Yahoo News between 2002 - 2004
-<<<<<<< HEAD
-- In 2014, two of the four original authors of the LFW dataset received funding from IARPA and ODNI for their follow up paper [Labeled Faces in the Wild: Updates and New Reporting Procedures](https://www.semanticscholar.org/paper/Labeled-Faces-in-the-Wild-%3A-Updates-and-New-Huang-Learned-Miller/2d3482dcff69c7417c7b933f22de606a0e8e42d4) via IARPA contract number 2014-14071600010
+- In 2014, two of the four original authors of the LFW dataset received funding from IARPA and ODNI for their followup paper [Labeled Faces in the Wild: Updates and New Reporting Procedures](https://www.semanticscholar.org/paper/Labeled-Faces-in-the-Wild-%3A-Updates-and-New-Huang-Learned-Miller/2d3482dcff69c7417c7b933f22de606a0e8e42d4) via IARPA contract number 2014-14071600010
- The dataset includes 2 images of [George Tenet](http://vis-www.cs.umass.edu/lfw/person/George_Tenet.html), the former Director of Central Intelligence (DCI) for the Central Intelligence Agency whose facial biometrics were eventually used to help train facial recognition software in China and Russia
-=======
-- In 2014, 2/4 of the original authors of the LFW dataset received funding from IARPA and ODNI for their follow up paper "Labeled Faces in the Wild: Updates and New Reporting Procedures" via IARPA contract number 2014-14071600010
-- The LFW dataset was used Center for Intelligent Information Retrieval, the Central Intelligence Agency, the National Security Agency and National
-
-TODO (need citations for the following)
-
-- SenseTime, who has relied on LFW for benchmarking their facial recognition performance, is one the leading provider of surveillance to the Chinese Government [need citation for this fact. is it the most? or is that Tencent?]
-- Two out of 4 of the original authors received funding from the Office of Director of National Intelligence and IARPA for their 2016 LFW survey follow up report
-
->>>>>>> 13d7a450affe8ea4f368a97ea2014faa17702a4c
![Person with the most face images in LFW: former President George W. Bush](assets/lfw_montage_top1_640.jpg)
![Persons with the next most face images in LFW: Colin Powell (236), Tony Blair (144), and Donald Rumsfeld (121)](assets/lfw_montage_top2_4_640.jpg)
![All 5,379 faces in the Labeled Faces in The Wild Dataset](assets/lfw_montage_all_crop.jpg)
-
-
## Code
The LFW dataset is so widely used that a popular code library called Sci-Kit Learn includes a function called `fetch_lfw_people` to download the faces in the LFW dataset.
@@ -133,7 +123,6 @@ imageio.imwrite('lfw_montage_960.jpg', montage)
### Supplementary Material
-
```
load_file assets/lfw_commercial_use.csv
name_display, company_url, example_url, country, description
@@ -141,14 +130,13 @@ name_display, company_url, example_url, country, description
Text and graphics ©Adam Harvey / megapixels.cc
-
-------
Ignore text below these lines
-------
-Research
+### Research
- "In our experiments, we used 10000 images and associated captions from the Faces in the wilddata set [3]."
- "This work was supported in part by the Center for Intelligent Information Retrieval, the Central Intelligence Agency, the National Security Agency and National Science Foundation under CAREER award IIS-0546666 and grant IIS-0326249."
@@ -159,6 +147,9 @@ Research
- This research is based upon work supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via contract number 2014-14071600010.
- From "Labeled Faces in the Wild: Updates and New Reporting Procedures"
+### Footnotes
+
[^lfw_www]: <http://vis-www.cs.umass.edu/lfw/results.html>
[^lfw_baidu]: Jingtuo Liu, Yafeng Deng, Tao Bai, Zhengping Wei, Chang Huang. Targeting Ultimate Accuracy: Face Recognition via Deep Embedding. <https://arxiv.org/abs/1506.07310>
[^lfw_pingan]: Lee, Justin. "PING AN Tech facial recognition receives high score in latest LFW test results". BiometricUpdate.com. Feb 13, 2017. <https://www.biometricupdate.com/201702/ping-an-tech-facial-recognition-receives-high-score-in-latest-lfw-test-results>
+
diff --git a/site/public/datasets/lfw/index.html b/site/public/datasets/lfw/index.html
index f83d8a66..86f49c52 100644
--- a/site/public/datasets/lfw/index.html
+++ b/site/public/datasets/lfw/index.html
@@ -27,11 +27,8 @@
</header>
<div class="content">
- <section><h1>LFW</h1>
-</section><section><div class='meta'><div><div class='gray'>Years</div><div>2002-2004</div></div><div><div class='gray'>Images</div><div>13,233</div></div><div><div class='gray'>Identities</div><div>5,749</div></div><div><div class='gray'>Origin</div><div>Yahoo News Images</div></div><div><div class='gray'>Funding</div><div>(Possibly, partially CIA*)</div></div></div></section><section class='fullwidth'><div class='image'><img src='https://nyc3.digitaloceanspaces.com/megapixels/v1/datasets/lfw/assets/lfw_index.gif' alt='Eighteen of the 5,749 people in the Labeled Faces in the Wild Dataset. The most widely used face dataset for benchmarking commercial face recognition algorithms.'><div class='caption'>Eighteen of the 5,749 people in the Labeled Faces in the Wild Dataset. The most widely used face dataset for benchmarking commercial face recognition algorithms.</div></div></section><section><p><em>Labeled Faces in The Wild</em> (LFW) is "a database of face photographs designed for studying the problem of unconstrained face recognition[^lfw_www]. It is used to evaluate and improve the performance of facial recognition algorithms in academic, commercial, and government research. According to BiometricUpdate.com[^lfw_pingan], LFW is "the most widely used evaluation set in the field of facial recognition, LFW attracts a few dozen teams from around the globe including Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong."</p>
-<p>The LFW dataset includes 13,233 images of 5,749 people that were collected between 2002-2004. LFW is a subset of <em>Names of Faces</em> and is part of the first facial recognition training dataset created entirely from images appearing on the Internet. The people appearing in LFW are...</p>
-<p>The <em>Names and Faces</em> dataset was the first face recognition dataset created entire from online photos. However, <em>Names and Faces</em> and <em>LFW</em> are not the first face recognition dataset created entirely "in the wild". That title belongs to the <a href="/datasets/ucd_faces/">UCD dataset</a>. Images obtained "in the wild" means using an image without explicit consent or awareness from the subject or photographer.</p>
-<h3>Analysis</h3>
+ <section><h3>Statistics</h3>
+</section><section><div class='meta'><div><div class='gray'>Years</div><div>2002-2004</div></div><div><div class='gray'>Images</div><div>13,233</div></div><div><div class='gray'>Identities</div><div>5,749</div></div><div><div class='gray'>Origin</div><div>Yahoo News Images</div></div><div><div class='gray'>Funding</div><div>(Possibly, partially CIA)</div></div></div></section><section><h3>Analysis</h3>
<ul>
<li>There are about 3 men for every 1 woman (4,277 men and 1,472 women) in the LFW dataset[^lfw_www]</li>
<li>The person with the most images is <a href="http://vis-www.cs.umass.edu/lfw/person/George_W_Bush_comp.html">George W. Bush</a> with 530</li>
@@ -41,15 +38,17 @@
<li>In all 3 of the LFW publications [^lfw_original_paper], [^lfw_survey], [^lfw_tech_report] the words "ethics", "consent", and "privacy" appear 0 times</li>
<li>The word "future" appears 71 times</li>
</ul>
+<h2>Labeled Faces in the Wild</h2>
+<p><em>Labeled Faces in The Wild</em> (LFW) is "a database of face photographs designed for studying the problem of unconstrained face recognition[^lfw_www]. It is used to evaluate and improve the performance of facial recognition algorithms in academic, commercial, and government research. According to BiometricUpdate.com[^lfw_pingan], LFW is "the most widely used evaluation set in the field of facial recognition, LFW attracts a few dozen teams from around the globe including Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong."</p>
+<p>The LFW dataset includes 13,233 images of 5,749 people that were collected between 2002-2004. LFW is a subset of <em>Names of Faces</em> and is part of the first facial recognition training dataset created entirely from images appearing on the Internet. The people appearing in LFW are...</p>
+<p>The <em>Names and Faces</em> dataset was the first face recognition dataset created entire from online photos. However, <em>Names and Faces</em> and <em>LFW</em> are not the first face recognition dataset created entirely "in the wild". That title belongs to the <a href="/datasets/ucd_faces/">UCD dataset</a>. Images obtained "in the wild" means using an image without explicit consent or awareness from the subject or photographer.</p>
<h3>Synthetic Faces</h3>
<p>To visualize the types of photos in the dataset without explicitly publishing individual's identities a generative adversarial network (GAN) was trained on the entire dataset. The images in this video show a neural network learning the visual latent space and then interpolating between archetypical identities within the LFW dataset.</p>
</section><section class='fullwidth'><div class='image'><img src='https://nyc3.digitaloceanspaces.com/megapixels/v1/datasets/lfw/assets/lfw_synthetic.jpg' alt=''></div></section><section><h3>Biometric Trade Routes</h3>
<p>To understand how this dataset has been used, its citations have been geocoded to show an approximate geographic digital trade route of the biometric data. Lines indicate an organization (education, commercial, or governmental) that has cited the LFW dataset in their research. Data is compiled from <a href="https://www.semanticscholar.org">SemanticScholar</a>.</p>
-<p>[add map here]</p>
-<h3>Citations</h3>
+</section><section class='applet_container'><div class='applet' data-payload='{"command": "map"}'></div></section><section><h3>Citations</h3>
<p>Browse or download the geocoded citation data collected for the LFW dataset.</p>
-<p>[add citations table here]</p>
-<h3>Additional Information</h3>
+</section><section class='applet_container'><div class='applet' data-payload='{"command": "citations"}'></div></section><section><h3>Additional Information</h3>
<p>(tweet-sized snippets go here)</p>
<ul>
<li>The LFW dataset is considered the "most popular benchmark for face recognition" [^lfw_baidu]</li>
@@ -57,27 +56,10 @@
<li>All images in LFW dataset were obtained "in the wild" meaning without any consent from the subject or from the photographer</li>
<li>The faces in the LFW dataset were detected using the Viola-Jones haarcascade face detector [^lfw_website] [^lfw-survey]</li>
<li>The LFW dataset is used by several of the largest tech companies in the world including "Google, Facebook, Microsoft Research Asia, Baidu, Tencent, SenseTime, Face++ and Chinese University of Hong Kong." [^lfw_pingan]</li>
-<li>All images in the LFW dataset were copied from Yahoo News between 2002 - 2004
-&lt;&lt;&lt;&lt;&lt;&lt;&lt; HEAD</li>
-<li>In 2014, two of the four original authors of the LFW dataset received funding from IARPA and ODNI for their follow up paper <a href="https://www.semanticscholar.org/paper/Labeled-Faces-in-the-Wild-%3A-Updates-and-New-Huang-Learned-Miller/2d3482dcff69c7417c7b933f22de606a0e8e42d4">Labeled Faces in the Wild: Updates and New Reporting Procedures</a> via IARPA contract number 2014-14071600010</li>
-<li><h1>The dataset includes 2 images of <a href="http://vis-www.cs.umass.edu/lfw/person/George_Tenet.html">George Tenet</a>, the former Director of Central Intelligence (DCI) for the Central Intelligence Agency whose facial biometrics were eventually used to help train facial recognition software in China and Russia</h1>
-</li>
-<li>In 2014, 2/4 of the original authors of the LFW dataset received funding from IARPA and ODNI for their follow up paper "Labeled Faces in the Wild: Updates and New Reporting Procedures" via IARPA contract number 2014-14071600010</li>
-<li>The LFW dataset was used Center for Intelligent Information Retrieval, the Central Intelligence Agency, the National Security Agency and National</li>
-</ul>
-<p>TODO (need citations for the following)</p>
-<ul>
-<li>SenseTime, who has relied on LFW for benchmarking their facial recognition performance, is one the leading provider of surveillance to the Chinese Government [need citation for this fact. is it the most? or is that Tencent?]</li>
-<li>Two out of 4 of the original authors received funding from the Office of Director of National Intelligence and IARPA for their 2016 LFW survey follow up report</li>
+<li>All images in the LFW dataset were copied from Yahoo News between 2002 - 2004</li>
+<li>In 2014, two of the four original authors of the LFW dataset received funding from IARPA and ODNI for their follow-up paper <a href="https://www.semanticscholar.org/paper/Labeled-Faces-in-the-Wild-%3A-Updates-and-New-Huang-Learned-Miller/2d3482dcff69c7417c7b933f22de606a0e8e42d4">Labeled Faces in the Wild: Updates and New Reporting Procedures</a> via IARPA contract number 2014-14071600010</li>
+<li>The dataset includes 2 images of <a href="http://vis-www.cs.umass.edu/lfw/person/George_Tenet.html">George Tenet</a>, the former Director of Central Intelligence (DCI) for the Central Intelligence Agency whose facial biometrics were eventually used to help train facial recognition software in China and Russia</li>
</ul>
-<blockquote><blockquote><blockquote><blockquote><blockquote><blockquote><blockquote><p>&gt; 13d7a450affe8ea4f368a97ea2014faa17702a4c</p>
-</blockquote>
-</blockquote>
-</blockquote>
-</blockquote>
-</blockquote>
-</blockquote>
-</blockquote>
</section><section class='images'><div class='image'><img src='https://nyc3.digitaloceanspaces.com/megapixels/v1/datasets/lfw/assets/lfw_montage_top1_640.jpg' alt=' former President George W. Bush'><div class='caption'> former President George W. Bush</div></div>
<div class='image'><img src='https://nyc3.digitaloceanspaces.com/megapixels/v1/datasets/lfw/assets/lfw_montage_top2_4_640.jpg' alt=' Colin Powell (236), Tony Blair (144), and Donald Rumsfeld (121)'><div class='caption'> Colin Powell (236), Tony Blair (144), and Donald Rumsfeld (121)</div></div></section><section class='images'><div class='image'><img src='https://nyc3.digitaloceanspaces.com/megapixels/v1/datasets/lfw/assets/lfw_montage_all_crop.jpg' alt='All 5,379 faces in the Labeled Faces in The Wild Dataset'><div class='caption'>All 5,379 faces in the Labeled Faces in The Wild Dataset</div></div></section><section><h2>Code</h2>
<p>The LFW dataset is so widely used that a popular code library called scikit-learn includes a function called <code>fetch_lfw_people</code> to download the faces in the LFW dataset.</p>
@@ -113,7 +95,7 @@ imageio.imwrite(&#39;lfw_montage_960.jpg&#39;, montage)
</section><section><h3>Supplementary Material</h3>
</section><section class='applet_container'><div class='applet' data-payload='{"command": "load_file assets/lfw_commercial_use.csv", "fields": ["name_display, company_url, example_url, country, description"]}'></div></section><section><p>Text and graphics ©Adam Harvey / megapixels.cc</p>
<p>Ignore text below these lines</p>
-<p>Research</p>
+<h3>Research</h3>
<ul>
<li>"In our experiments, we used 10000 images and associated captions from the Faces in the wilddata set [3]."</li>
<li>"This work was supported in part by the Center for Intelligent Information Retrieval, the Central Intelligence Agency, the National Security Agency and National Science Foundation under CAREER award IIS-0546666 and grant IIS-0326249."</li>
@@ -125,6 +107,7 @@ imageio.imwrite(&#39;lfw_montage_960.jpg&#39;, montage)
</li>
<li>From "Labeled Faces in the Wild: Updates and New Reporting Procedures"</li>
</ul>
+<h3>Footnotes</h3>
<div class="footnotes">
<hr>
<ol></ol>