# Convert Munich Security Conference (MSC) HTML to CSV

- Create per-year and combined participant name lists

In [1]:
%reload_ext autoreload
%autoreload 2

from os.path import join
from pathlib import Path
from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

import lxml
from bs4 import BeautifulSoup
import urllib.request
from tqdm import tqdm_notebook as tqdm
import pandas as pd

import sys
sys.path.append('/work/megapixels_dev/megapixels/')
from app.settings import app_cfg as cfg
from app.utils import file_utils, im_utils

In [147]:
# Directory holding the downloaded per-year participant pages ({year}.html).
fp_dir_in = '/data_store/datasets/munich_security_conference/participants/'
# Directory receiving the per-year CSVs and the combined CSV.
fp_dir_out = '/data_store/datasets/munich_security_conference/participants/'
fp_out_all_csv = join(fp_dir_out, 'participants.csv')  # total list
# Conference years to convert; kept as strings because they are used both
# as file-name stems and as the layout selector in parse_year.
years = ['2009', '2010', '2011', '2014']

In [143]:
# Names that appear without a "Last, First" comma, mapped to the
# comma-separated form that parse_name splits on.
# NOTE(review): for 'Ban Ki-moon' and 'Fu Ying' the first token ends up in
# name_first, so "first last" reproduces the name as written -- confirm
# this is the intended convention.
_NAMES_WITHOUT_COMMA = {
  'Ban Ki-moon': 'Ki-moon, Ban',
  'Fu Ying': 'Ying, Fu',
  # unclear: https://en.wikipedia.org/wiki/Ng_Eng_Hen
  'Dr. Ng Eng Hen': 'Ng, Dr. Eng Hen',
  'Seok-soo Lee': 'Lee, Seok-soo',
}


def parse_name(name):
  """Split a "Last, First" name string into its first and last parts.

  The split happens at the LAST comma, so anything before it (including
  earlier commas) is treated as the last name. Names without a comma are
  looked up in _NAMES_WITHOUT_COMMA.

  Args:
    name: name string, normally formatted as "Last, First".

  Returns:
    Tuple (name_first, name_last), both stripped of surrounding whitespace.

  Raises:
    ValueError: if the name has no comma and no known override. The name
      is also printed so it can be added to the override table.
  """
  if ',' not in name:
    override = _NAMES_WITHOUT_COMMA.get(name)
    if override is None:
      print(f'Could not handle: "{name}"')
      raise ValueError(f'No comma and no known override for name: "{name}"')
    name = override

  # rpartition splits on the last comma, matching the original rindex logic
  name_last, _, name_first = name.rpartition(',')
  return name_first.strip(), name_last.strip()
  
def parse_year(fp_in_html, year):
  """Parse one year's participant HTML page into a list of records.

  The table layout differs per year, so `year` selects how the cells of
  each row are read:
    - '2009', '2014': cell 0 is the full name, cell 1 the description.
    - '2010': cell 0 is the first name, cell 1 the last name, cell 2 the
      description.
    - '2011': like 2009/2014, but the text is taken from the first <p>
      inside each cell.

  Args:
    fp_in_html: path to the HTML file to read.
    year: year as a string; one of '2009', '2010', '2011', '2014'.

  Returns:
    List of dicts with keys 'name_first', 'name_last', 'description'
    and 'year', one per table row after the first (header) row.

  Raises:
    ValueError: if `year` is not a supported layout, or the page has no
      <table class="contenttable"> with a <tbody>.
  """
  # Fail before any I/O: an unknown year would otherwise leave the row
  # variables unassigned and crash inside the loop.
  supported_years = ('2009', '2010', '2011', '2014')
  if year not in supported_years:
    raise ValueError(
      f'Unsupported year: {year!r} (expected one of {supported_years})')

  # create soup
  # NOTE(review): the file is opened with the platform default encoding --
  # confirm it matches how the pages were saved.
  with open(fp_in_html, 'r') as fp:
    data = fp.read()
  soup = BeautifulSoup(data, 'lxml')

  # get rows
  table = soup.find('table', attrs={'class': 'contenttable'})
  tbody = table.find('tbody') if table is not None else None
  if tbody is None:
    raise ValueError(f'No "contenttable" table body found in: {fp_in_html}')
  trows = tbody.find_all('tr')

  # parse by year (the first row is skipped as the header)
  participants = []
  for trow in trows[1:]:
    tds = trow.find_all('td')
    if year == '2010':
      name_first = tds[0].text.strip()
      name_last = tds[1].text.strip()
      desc = tds[2].text.strip()
    elif year == '2011':
      name = tds[0].find_all('p')[0].text.strip()
      name_first, name_last = parse_name(name)
      desc = tds[1].find_all('p')[0].text.strip()
    else:  # '2009' or '2014'
      name = tds[0].text.strip()
      name_first, name_last = parse_name(name)
      desc = tds[1].text.strip()

    participants.append({
      'name_first': name_first,
      'name_last': name_last,
      'description': desc,
      'year': year,
    })

  return participants

In [148]:
# Convert each year's HTML page to its own CSV and collect every record
# for the combined list.
participants_all = []
for year in years:
  # read from the input directory, write to the output directory
  fp_in_html = join(fp_dir_in, f'{year}.html')
  fp_out_csv = join(fp_dir_out, f'{year}.csv')
  participants = parse_year(fp_in_html, year)
  participants_all.extend(participants)
  # a list of dicts maps to one row per dict, one column per key
  df = pd.DataFrame(participants)
  df.to_csv(fp_out_csv, index=False)
  print(f'Wrote: {fp_out_csv} with {len(participants)} items')

# write total list

df = pd.DataFrame(participants_all)
df.to_csv(fp_out_all_csv, index=False)
print(f'Wrote: {fp_out_all_csv} with {len(participants_all)} items')

2009
Wrote: /data_store/datasets/munich_security_conference/participants/2009.csv with 346 items
2010
Wrote: /data_store/datasets/munich_security_conference/participants/2010.csv with 317 items
2011
Wrote: /data_store/datasets/munich_security_conference/participants/2011.csv with 341 items
2014
Wrote: /data_store/datasets/munich_security_conference/participants/2014.csv with 467 items
Wrote: /data_store/datasets/munich_security_conference/participants/participants.csv with 1471 items
