diff options
Diffstat (limited to 'megapixels/app/site/parser.py')
| -rw-r--r-- | megapixels/app/site/parser.py | 347 |
1 files changed, 183 insertions, 164 deletions
diff --git a/megapixels/app/site/parser.py b/megapixels/app/site/parser.py index f739315a..ad4256ad 100644 --- a/megapixels/app/site/parser.py +++ b/megapixels/app/site/parser.py @@ -10,9 +10,141 @@ import app.site.s3 as s3 renderer = mistune.Renderer(escape=False) markdown = mistune.Markdown(renderer=renderer) +footnote_count = 0 + +def parse_markdown(metadata, sections, s3_path, skip_h1=False): + """ + parse page into sections, preprocess the markdown to handle our modifications + """ + groups = [] + current_group = [] + footnotes = [] + in_stats = False + in_footnotes = False + ignoring = False + + if 'desc' in metadata and 'subdesc' in metadata: + groups.append(intro_section(metadata, s3_path)) + + for section in sections: + if skip_h1 and section.startswith('# '): + continue + elif section.strip().startswith('---'): + continue + elif section.lower().strip().startswith('ignore text'): + ignoring = True + continue + elif section.strip().startswith('### Footnotes'): + groups.append(format_section(current_group, s3_path)) + current_group = [] + footnotes = [] + in_footnotes = True + elif in_footnotes: + footnotes.append(section) + elif ignoring: + continue + elif '### statistics' in section.lower() or '### sidebar' in section.lower(): + if len(current_group): + groups.append(format_section(current_group, s3_path)) + current_group = [] + if 'sidebar' not in section.lower(): + current_group.append(section) + in_stats = True + elif in_stats and not section.strip().startswith('## ') and 'end sidebar' not in section.lower(): + current_group.append(section) + elif in_stats and section.strip().startswith('## ') or 'end sidebar' in section.lower(): + current_group = [format_section(current_group, s3_path, 'right-sidebar', tag='div')] + if 'end sidebar' not in section.lower(): + current_group.append(section) + in_stats = False + elif section.strip().startswith('```'): + groups.append(format_section(current_group, s3_path)) + current_group = [] + current_group.append(section) + if section.strip().endswith('```'): + groups.append(format_applet("\n\n".join(current_group), s3_path)) + current_group = [] + elif section.strip().endswith('```'): + current_group.append(section) + groups.append(format_applet("\n\n".join(current_group), s3_path)) + current_group = [] + elif section.startswith('+ '): + groups.append(format_section(current_group, s3_path)) + groups.append('<section>' + format_metadata(section) + '<section>') + current_group = [] + elif '![fullwidth:' in section: + groups.append(format_section(current_group, s3_path)) + groups.append(format_section([section], s3_path, type='fullwidth')) + current_group = [] + elif '![wide:' in section: + groups.append(format_section(current_group, s3_path)) + groups.append(format_section([section], s3_path, type='wide')) + current_group = [] + elif '![' in section: + groups.append(format_section(current_group, s3_path)) + groups.append(format_section([section], s3_path, type='images')) + current_group = [] + else: + current_group.append(section) + groups.append(format_section(current_group, s3_path)) + + footnote_txt = '' + footnote_lookup = {} + + if len(footnotes): + footnote_txt, footnote_lookup = format_footnotes(footnotes, s3_path) + + content = "".join(groups) + + if footnote_lookup: + for key, index in footnote_lookup.items(): + global footnote_count + footnote_count = 0 + letters = "abcdefghijklmnopqrstuvwxyz" + footnote_backlinks = [] + def footnote_tag(match): + global footnote_count + footnote_count += 1 + footnote_backlinks.append('<a href="#{}_{}">{}</a>'.format(key, footnote_count, letters[footnote_count-1])) + return '<a class="footnote_shim" name="{}_{}"> </a><a href="#{}" class="footnote" title="Footnote {}">{}</a>'.format(key, footnote_count, key, index, index) + key_regex = re.compile(key.replace('[', '\\[').replace('^', '\\^').replace(']', '\\]')) + content = key_regex.sub(footnote_tag, content) + footnote_txt = footnote_txt.replace("{}_BACKLINKS".format(index), "".join(footnote_backlinks)) + content += footnote_txt + return content + + +def intro_section(metadata, s3_path): + """ + Build the intro section for datasets + """ + + section = "<section class='intro_section' style='background-image: url({})'>".format(s3_path + metadata['image']) + section += "<div class='inner'>" + + parts = [] + if 'desc' in metadata: + desc = metadata['desc'] + if 'color' in metadata and metadata['title'] in desc: + desc = desc.replace(metadata['title'], "<span style='color: {}'>{}</span>".format(metadata['color'], metadata['title'])) + section += "<div class='hero_desc'><span>{}</span></div>".format(desc, desc) + + if 'subdesc' in metadata: + subdesc = markdown(metadata['subdesc']).replace('<p>', '').replace('</p>', '') + section += "<div class='hero_subdesc'><span>{}</span></div>".format(subdesc, subdesc) + + section += "</div>" + section += "</section>" + + if 'caption' in metadata: + section += "<section><div class='image'><div class='caption'>{}</div></div></section>".format(metadata['caption']) + + return section + + def fix_images(lines, s3_path): """ - do our own tranformation of the markdown around images to handle wide images etc + do our own transformation of the markdown around images to handle wide images etc lines: markdown lines """ real_lines = [] @@ -22,48 +154,89 @@ def fix_images(lines, s3_path): line = line.replace(' url, tail = tail.split(')', 1) + tag = '' if ':' in alt_text: - tail, alt_text = alt_text.split(':', 1) + tag, alt_text = alt_text.split(':', 1) img_tag = "<img src='{}' alt='{}'>".format(s3_path + url, alt_text.replace("'", "")) - if len(alt_text): + if 'sideimage' in tag: + line = "<div class='sideimage'>{}<div>{}</div></div>".format(img_tag, markdown(tail)) + elif len(alt_text): line = "<div class='image'>{}<div class='caption'>{}</div></div>".format(img_tag, alt_text) else: line = "<div class='image'>{}</div>".format(img_tag, alt_text) real_lines.append(line) return "\n".join(real_lines) -def format_section(lines, s3_path, type=''): + +def format_section(lines, s3_path, type='', tag='section'): """ format a normal markdown section """ if len(lines): + lines = fix_meta(lines) lines = fix_images(lines, s3_path) if type: - return "<section class='{}'>{}</section>".format(type, markdown(lines)) + return "<{} class='{}'>{}</{}>".format(tag, type, markdown(lines), tag) else: - return "<section>" + markdown(lines) + "</section>" + return "<{}>{}</{}>".format(tag, markdown(lines), tag) return "" +def fix_meta(lines): + """ + Format metadata sections before passing to markdown + """ + new_lines = [] + for line in lines: + if line.startswith('+ '): + line = format_metadata(line) + new_lines.append(line) + return new_lines + def format_metadata(section): """ format a metadata section (+ key: value pairs) """ meta = [] for line in section.split('\n'): + if ': ' not in line: + continue key, value = line[2:].split(': ', 1) meta.append("<div><div class='gray'>{}</div><div>{}</div></div>".format(key, value)) - return "<section><div class='meta'>{}</div></section>".format(''.join(meta)) + return "<div class='meta'>{}</div>".format(''.join(meta)) + +def format_footnotes(footnotes, s3_path): + """ + Format the footnotes section separately and produce a lookup we can use to update the main site + """ + footnotes = '\n'.join(footnotes).split('\n') + index = 1 + footnote_index_lookup = {} + footnote_list = [] + for footnote in footnotes: + if not len(footnote) or '[^' not in footnote: + continue + key, note = footnote.split(': ', 1) + footnote_index_lookup[key] = index + footnote_list.append('<a name="{}" class="footnote_shim"></a><span class="backlinks">{}_BACKLINKS</span>'.format(key, index) + markdown(note)) + index += 1 + + footnote_txt = '<section><ul class="footnotes"><li>' + '</li><li>'.join(footnote_list) + '</li></ul></section>' + return footnote_txt, footnote_index_lookup def format_applet(section, s3_path): + """ + Format the applets, which load javascript modules like the map and CSVs + """ # print(section) payload = section.strip('```').strip().strip('```').strip().split('\n') applet = {} - print(payload) + # print(payload) if ': ' in payload[0]: - command, opt = payload[0].split(': ') + command, opt = payload[0].split(': ', 1) else: command = payload[0] opt = None + print(command) if command == 'python' or command == 'javascript' or command == 'code': return format_section([ section ], s3_path) if command == '': @@ -79,47 +252,6 @@ def format_applet(section, s3_path): applet['fields'] = payload[1:] return "<section class='applet_container'><div class='applet' data-payload='{}'></div></section>".format(json.dumps(applet)) -def parse_markdown(sections, s3_path, skip_h1=False): - """ - parse page into sections, preprocess the markdown to handle our modifications - """ - groups = [] - current_group = [] - for section in sections: - if skip_h1 and section.startswith('# '): - continue - elif section.strip().startswith('```'): - groups.append(format_section(current_group, s3_path)) - current_group = [] - current_group.append(section) - if section.strip().endswith('```'): - groups.append(format_applet("\n\n".join(current_group), s3_path)) - current_group = [] - elif section.strip().endswith('```'): - current_group.append(section) - groups.append(format_applet("\n\n".join(current_group), s3_path)) - current_group = [] - elif section.startswith('+ '): - groups.append(format_section(current_group, s3_path)) - groups.append(format_metadata(section)) - current_group = [] - elif '![fullwidth:' in section: - groups.append(format_section(current_group, s3_path)) - groups.append(format_section([section], s3_path, type='fullwidth')) - current_group = [] - elif '![wide:' in section: - groups.append(format_section(current_group, s3_path)) - groups.append(format_section([section], s3_path, type='wide')) - current_group = [] - elif '![' in section: - groups.append(format_section(current_group, s3_path)) - groups.append(format_section([section], s3_path, type='images')) - current_group = [] - else: - current_group.append(section) - groups.append(format_section(current_group, s3_path)) - content = "".join(groups) - return content def parse_research_index(research_posts): """ @@ -127,6 +259,7 @@ def parse_research_index(research_posts): """ content = "<div class='research_index'>" for post in research_posts: + print(post) s3_path = s3.make_s3_path(cfg.S3_SITE_PATH, post['path']) if 'image' in post: post_image = s3_path + post['image'] @@ -140,117 +273,3 @@ def parse_research_index(research_posts): content += row content += '</div>' return content - -def read_metadata(fn): - """ - Read in read a markdown file and extract the metadata - """ - with open(fn, "r") as file: - data = file.read() - data = data.replace("\n ", "\n") - if "\n" in data: - data = data.replace("\r", "") - else: - data = data.replace("\r", "\n") - sections = data.split("\n\n") - return parse_metadata(fn, sections) - -default_metadata = { - 'status': 'published', - 'title': 'Untitled Page', - 'desc': '', - 'slug': '', - 'published': '2018-12-31', - 'updated': '2018-12-31', - 'authors': 'Adam Harvey', - 'sync': 'true', - 'tagline': '', -} - -def parse_metadata_section(metadata, section): - """ - parse a metadata key: value pair - """ - for line in section.split("\n"): - if ': ' not in line: - continue - key, value = line.split(': ', 1) - metadata[key.lower()] = value - -def parse_metadata(fn, sections): - """ - parse the metadata headers in a markdown file - (everything before the second ---------) - also generates appropriate urls for this page :) - """ - found_meta = False - metadata = {} - valid_sections = [] - for section in sections: - if not found_meta and ': ' in section: - found_meta = True - parse_metadata_section(metadata, section) - continue - if '-----' in section: - continue - if found_meta: - valid_sections.append(section) - - if 'title' not in metadata: - print('warning: {} has no title'.format(fn)) - for key in default_metadata: - if key not in metadata: - metadata[key] = default_metadata[key] - - basedir = os.path.dirname(fn.replace(cfg.DIR_SITE_CONTENT, '')) - basename = os.path.basename(fn) - if basedir == '/': - metadata['path'] = '/' - metadata['url'] = '/' - elif basename == 'index.md': - metadata['path'] = basedir + '/' - metadata['url'] = metadata['path'] - else: - metadata['path'] = basedir + '/' - metadata['url'] = metadata['path'] + basename.replace('.md', '') + '/' - - if metadata['status'] == 'published|draft|private': - metadata['status'] = 'published' - - metadata['sync'] = metadata['sync'] != 'false' - - metadata['author_html'] = '<br>'.join(metadata['authors'].split(',')) - - return metadata, valid_sections - -def read_research_post_index(): - """ - Generate an index of the research (blog) posts - """ - return read_post_index('research') - -def read_datasets_index(): - """ - Generate an index of the datasets - """ - return read_post_index('datasets') - -def read_post_index(basedir): - """ - Generate an index of posts - """ - posts = [] - for fn in sorted(glob.glob('../site/content/{}/*/index.md'.format(basedir))): - metadata, valid_sections = read_metadata(fn) - if metadata is None or metadata['status'] == 'private' or metadata['status'] == 'draft': - continue - posts.append(metadata) - if not len(posts): - posts.append({ - 'title': 'Placeholder', - 'slug': 'placeholder', - 'date': 'Placeholder', - 'url': '/', - }) - return posts - |
