import os
from os.path import join
import re
import glob
import simplejson as json
import mistune

import app.settings.app_cfg as cfg
import app.site.s3 as s3

# NOTE(review): in the copy of this file that was reviewed, many string
# literals contain nothing but line breaks and several ``format()`` calls pass
# more arguments than the literal has placeholders.  The markup those literals
# originally carried appears to be missing.  They are kept exactly as seen
# (line breaks written as ``\n``) and flagged individually below -- restore
# them from the canonical source before relying on the generated output.

renderer = mistune.Renderer(escape=False)
markdown = mistune.Markdown(renderer=renderer)

# Number of in-text references found for the footnote currently being linked.
# parse_markdown() resets and increments it; kept at module level so existing
# readers of this name keep working.
footnote_count = 0

_BACKLINK_ALPHABET = "abcdefghijklmnopqrstuvwxyz"


def _backlink_label(n):
    """Return a letter label for the n-th (1-based) backlink: a..z, aa, ab, ..."""
    label = ''
    while n > 0:
        n, remainder = divmod(n - 1, len(_BACKLINK_ALPHABET))
        label = _BACKLINK_ALPHABET[remainder] + label
    return label


def parse_markdown(metadata, sections, s3_path, skip_h1=False):
    """
    Parse a page into sections, preprocessing the markdown to handle our
    modifications.

    metadata: mapping of page metadata; an intro section is emitted when it
        contains both 'desc' and 'subdesc'
    sections: iterable of markdown chunks (strings), processed in order
    s3_path: prefix prepended to image / file URLs
    skip_h1: when true, chunks starting with '# ' are dropped

    Returns the rendered page as a single string.  Side effect: resets and
    updates the module-level ``footnote_count`` while linking footnotes.
    """
    global footnote_count

    groups = []
    current_group = []
    footnotes = []
    in_stats = False
    in_footnotes = False
    ignoring = False

    if 'desc' in metadata and 'subdesc' in metadata:
        groups.append(intro_section(metadata, s3_path))

    for section in sections:
        stripped = section.strip()
        lowered = section.lower()

        if skip_h1 and section.startswith('# '):
            continue
        elif stripped.startswith('---'):
            continue
        elif lowered.strip().startswith('ignore text'):
            ignoring = True
            continue
        elif stripped.startswith('### Footnotes'):
            # everything after this heading is collected as footnotes
            groups.append(format_section(current_group, s3_path))
            current_group = []
            footnotes = []
            in_footnotes = True
        elif in_footnotes:
            footnotes.append(section)
        elif ignoring:
            continue
        elif '### statistics' in lowered or '### sidebar' in lowered:
            if current_group:
                groups.append(format_section(current_group, s3_path))
            current_group = []
            # the "### sidebar" marker itself is not rendered
            if 'sidebar' not in lowered:
                current_group.append(section)
            # NOTE(review): the reviewed copy had lost its indentation; this
            # assumes both statistics and sidebar blocks switch in_stats on.
            in_stats = True
        elif in_stats and not stripped.startswith('## ') and 'end sidebar' not in lowered:
            current_group.append(section)
        elif (in_stats and stripped.startswith('## ')) or 'end sidebar' in lowered:
            # NOTE(review): parentheses make the original `A and B or C`
            # precedence explicit -- an "end sidebar" marker closes the
            # sidebar even when in_stats is False.  Confirm that is intended.
            current_group = [format_section(current_group, s3_path, 'left-sidebar', tag='div')]
            if 'end sidebar' not in lowered:
                current_group.append(section)
            in_stats = False
        elif stripped.startswith('{% include'):
            groups.append(format_section(current_group, s3_path))
            current_group = [section]
            if stripped.endswith(' %}'):
                groups.append(format_include("\n\n".join(current_group)))
                current_group = []
        elif stripped.startswith('```'):
            groups.append(format_section(current_group, s3_path))
            current_group = [section]
            # a fence opened and closed in the same chunk is a complete applet
            if stripped.endswith('```'):
                groups.append(format_applet("\n\n".join(current_group), s3_path))
                current_group = []
        elif stripped.endswith('```'):
            # closing fence of an applet that spans several chunks
            current_group.append(section)
            groups.append(format_applet("\n\n".join(current_group), s3_path))
            current_group = []
        elif section.startswith('+ '):
            groups.append(format_section(current_group, s3_path))
            # NOTE(review): wrapper literals contain only line breaks in the
            # reviewed copy; original markup appears to be missing.
            groups.append('\n' + format_metadata(section) + '\n')
            current_group = []
        elif '![fullwidth:' in section:
            groups.append(format_section(current_group, s3_path))
            groups.append(format_section([section], s3_path, type='fullwidth'))
            current_group = []
        elif '![wide:' in section:
            groups.append(format_section(current_group, s3_path))
            groups.append(format_section([section], s3_path, type='wide'))
            current_group = []
        elif '![' in section:
            groups.append(format_section(current_group, s3_path))
            groups.append(format_section([section], s3_path, type='images'))
            current_group = []
        else:
            current_group.append(section)

    # flush whatever is still pending
    groups.append(format_section(current_group, s3_path))

    footnote_txt = ''
    footnote_lookup = {}
    if footnotes:
        footnote_txt, footnote_lookup = format_footnotes(footnotes, s3_path)

    content = "".join(groups)

    for key, index in footnote_lookup.items():
        footnote_count = 0
        footnote_backlinks = []

        def footnote_tag(match):
            global footnote_count
            footnote_count += 1
            # NOTE(review): both literals below take more arguments than they
            # have placeholders; original markup appears to be missing.
            footnote_backlinks.append(
                '{}'.format(key, footnote_count, _backlink_label(footnote_count)))
            return ' {}'.format(key, footnote_count, key, index, index)

        # re.escape covers every regex metacharacter, not only '[', '^', ']'
        key_regex = re.compile(re.escape(key))
        content = key_regex.sub(footnote_tag, content)
        footnote_txt = footnote_txt.replace(
            "{}_BACKLINKS".format(index), "".join(footnote_backlinks))

    content += footnote_txt
    return content


def intro_section(metadata, s3_path):
    """
    Build the intro section for datasets.

    Reads metadata['image'] unconditionally and 'desc', 'title', 'color',
    'subdesc' and 'caption' when present.  Returns the section as a string.
    """
    # NOTE(review): every literal in this function holds only line breaks
    # and/or has fewer placeholders than arguments in the reviewed copy;
    # original markup appears to be missing.
    section = "\n".format(s3_path + metadata['image'])
    section += "\n"
    if 'desc' in metadata:
        desc = metadata['desc']
        # colorize the first instance of the database name in the header
        if 'color' in metadata and metadata['title'] in desc:
            desc = desc.replace(
                metadata['title'],
                "{}".format(metadata['color'], metadata['title']),
                1)
        section += "\n{}\n".format(desc, desc)
    if 'subdesc' in metadata:
        subdesc = markdown(metadata['subdesc']).replace('\n\n', '').replace('\n\n', '')
        section += "\n{}\n".format(subdesc, subdesc)
    section += "\n"
    section += "\n"
    if 'caption' in metadata:
        section += "\n{}\n".format(metadata['caption'])
    return section


def fix_images(lines, s3_path):
    """
    Do our own transformation of the markdown around images to handle wide
    images etc.

    lines: markdown lines
    s3_path: prefix prepended to every image URL

    Returns the transformed text as one newline-joined string.  A line that
    contains '![' but is not a complete ``![alt](url)`` is passed through
    unchanged instead of raising ValueError.
    """
    real_lines = []
    block = "\n\n".join(lines)
    for line in block.split("\n"):
        if "![" in line:
            candidate = line.replace('![', '')
            alt_text, has_open, rest = candidate.partition('](')
            url, has_close, tail = rest.partition(')')
            if has_open and has_close:
                tag = ''
                # optional "tag:" prefix inside the alt text, e.g. "wide:..."
                if ':' in alt_text:
                    tag, alt_text = alt_text.split(':', 1)
                # NOTE(review): literals below have fewer placeholders than
                # arguments in the reviewed copy; markup appears to be missing.
                img_tag = "{}".format(s3_path + url, alt_text.replace("'", ""))
                if 'sideimage' in tag:
                    line = "\n{}\n{}\n".format(img_tag, markdown(tail))
                elif alt_text:
                    line = "\n{}\n{}\n".format(img_tag, alt_text)
                else:
                    line = "\n{}\n".format(img_tag, alt_text)
        real_lines.append(line)
    return "\n".join(real_lines)


def format_section(lines, s3_path, type='', tag='section'):
    """
    Format a normal markdown section.

    lines: list of markdown chunks; an empty list yields ''
    type: optional CSS class for the wrapper element
    tag: wrapper element name

    Returns the rendered markdown wrapped in ``<tag>...</tag>``.
    """
    if not lines:
        return ""
    lines = fix_meta(lines)
    lines = fix_images(lines, s3_path)
    # `tag` is passed twice: once for the opening and once for the closing tag
    if type:
        return "<{} class='{}'>{}</{}>".format(tag, type, markdown(lines), tag)
    return "<{}>{}</{}>".format(tag, markdown(lines), tag)


def fix_meta(lines):
    """
    Format metadata sections before passing to markdown.

    Chunks starting with '+ ' are replaced by format_metadata(chunk); all
    other chunks are returned untouched.
    """
    return [
        format_metadata(line) if line.startswith('+ ') else line
        for line in lines
    ]


def format_metadata(section):
    """
    Format a metadata section (+ key: value pairs).

    Each line has its first two characters (the '+ ' marker) removed and is
    split on the first ': '.  Lines without ': ' after the marker are skipped.
    """
    meta = []
    for line in section.split('\n'):
        entry = line[2:]
        # test the same text that is split, so unpacking can never fail
        if ': ' not in entry:
            continue
        key, value = entry.split(': ', 1)
        # NOTE(review): literal holds only line breaks around the
        # placeholders in the reviewed copy; markup appears to be missing.
        meta.append("\n{}\n{}\n".format(key, value))
    return "\n{}\n".format(''.join(meta))


def format_footnotes(footnotes, s3_path):
    """
    Format the footnotes section separately and produce a lookup we can use
    to update the main site.

    footnotes: list of markdown chunks following the footnotes heading
    s3_path: unused here; kept for a uniform signature

    Returns (footnote_txt, lookup) where lookup maps each footnote key (the
    text before the first ': ') to its 1-based index.  Lines that are empty,
    lack '[^', or lack ': ' are skipped.
    """
    footnote_lines = '\n'.join(footnotes).split('\n')
    index = 1
    footnote_index_lookup = {}
    footnote_list = []
    for footnote in footnote_lines:
        if not footnote or '[^' not in footnote:
            continue
        key, separator, note = footnote.partition(': ')
        if not separator:
            # malformed footnote (no "key: note" separator) -- skip it
            continue
        footnote_index_lookup[key] = index
        # NOTE(review): literal has fewer placeholders than arguments in the
        # reviewed copy; markup appears to be missing.
        footnote_list.append('{}_BACKLINKS'.format(key, index) + markdown(note))
        index += 1

    # NOTE(review): list wrapper/separator literals appear to have lost their
    # markup in the reviewed copy.
    footnote_txt = '\n  • ' + '\n  • '.join(footnote_list) + '\n'
    return footnote_txt, footnote_index_lookup


def format_include(section):
    """
    Include html template.

    section: an ``{% include "file" %}`` directive.  The file is read from
    cfg.DIR_SITE_INCLUDES and returned with newlines removed; on any error a
    message is printed and '' is returned.
    """
    include_dir = cfg.DIR_SITE_INCLUDES
    fp_html = section.strip().strip('\n').strip().strip('{%').strip().strip('%}').strip()
    # Drop the directive keyword as a prefix.  str.strip('include') would
    # treat it as a character set and also eat matching letters from the end
    # of an unquoted filename.
    if fp_html.startswith('include'):
        fp_html = fp_html[len('include'):]
    fp_html = fp_html.strip().strip('"').strip().strip("'").strip()
    try:
        with open(join(include_dir, fp_html), 'r') as fp:
            html = fp.read().replace('\n', '')
        return html
    except Exception as e:
        # best effort: a missing or unreadable include must not abort the build
        print(f'Error parsing include: {e}')
        return ''


def format_applet(section, s3_path):
    """
    Format the applets, which load javascript modules like the map and CSVs.

    section: a fenced block whose first line is ``command`` or
        ``command: option``; remaining lines become the applet's 'fields'
    s3_path: prefix for relative 'load_file' options

    'python', 'javascript' and 'code' fences are rendered as normal sections;
    an empty command yields ''.
    """
    payload = section.strip('```').strip().strip('```').strip().split('\n')
    applet = {}
    if ': ' in payload[0]:
        command, opt = payload[0].split(': ', 1)
    else:
        command = payload[0]
        opt = None
    print(command)
    if command in ('python', 'javascript', 'code'):
        return format_section([section], s3_path)
    if command == '':
        return ''

    applet['command'] = command
    if opt:
        applet['opt'] = opt
    # only rewrite relative paths; guard against a bare "load_file" with no
    # option, which previously raised TypeError on None[0:4]
    if command == 'load_file' and opt is not None and not opt.startswith('http'):
        applet['opt'] = s3_path + opt
    if len(payload) > 1:
        applet['fields'] = payload[1:]
    # NOTE(review): literal has no placeholder for the JSON payload in the
    # reviewed copy; markup appears to be missing.
    return "\n".format(json.dumps(applet))


def parse_research_index(research_posts):
    """
    Generate an index file for the research pages.

    research_posts: iterable of mappings with 'path', 'title', 'tagline' and
        optionally 'image'.  Posts without an image get an inline GIF data URI.

    Returns the index as a single string.
    """
    # NOTE(review): wrapper and row literals appear to have lost their markup
    # in the reviewed copy (row takes four arguments, has two placeholders).
    parts = ["\n"]
    for post in research_posts:
        print(post)
        s3_path = s3.make_s3_path(cfg.S3_SITE_PATH, post['path'])
        if 'image' in post:
            post_image = s3_path + post['image']
        else:
            post_image = 'data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=='
        row = "\nResearch post\n\n{}\n\n{}\n\n".format(
            post['path'],
            post_image,
            post['title'],
            post['tagline'])
        parts.append(row)
    parts.append('\n')
    return "".join(parts)