# Input/output locations and the conference years to convert.
fp_dir_in = '/data_store/datasets/munich_security_conference/participants/'
fp_dir_out = '/data_store/datasets/munich_security_conference/participants/'
fp_out_all_csv = join(fp_dir_out, 'participants.csv')  # total list
years = ['2009', '2010', '2011', '2014']

# Names that appear without a comma in the source pages, mapped to the
# "Last, First" form used by every other entry.
_NAME_OVERRIDES = {
    'Ban Ki-moon': 'Ki-moon, Ban',
    'Fu Ying': 'Ying, Fu',
    # unclear: https://en.wikipedia.org/wiki/Ng_Eng_Hen
    'Dr. Ng Eng Hen': 'Ng, Dr. Eng Hen',
    'Seok-soo Lee': 'Lee, Seok-soo',
}

# Years whose table layout parse_year knows how to read.
_SUPPORTED_YEARS = ('2009', '2010', '2011', '2014')


def parse_name(name):
    """Split a "Last, First" name into its first and last parts.

    The split happens at the last comma in ``name``. Names without a comma
    are looked up in ``_NAME_OVERRIDES`` first.

    Args:
        name: participant name as it appears in the HTML table.

    Returns:
        Tuple ``(name_first, name_last)``, both stripped of whitespace.

    Raises:
        ValueError: if ``name`` has no comma and is not a known override.
    """
    if ',' not in name:
        try:
            name = _NAME_OVERRIDES[name]
        except KeyError:
            # Fail with the offending name instead of "substring not found".
            raise ValueError(f'Could not handle: "{name}"') from None

    # rpartition splits on the last comma, like the rindex-based slicing did.
    name_last, _, name_first = name.rpartition(',')
    return name_first.strip(), name_last.strip()


def _row_fields(tds, year):
    """Return (name_first, name_last, description) from one row's <td> cells."""
    if year in ('2009', '2014'):
        # cell 0: "Last, First", cell 1: description
        name_first, name_last = parse_name(tds[0].text.strip())
        desc = tds[1].text.strip()
    elif year == '2010':
        # cell 0: first name, cell 1: last name, cell 2: description
        name_first = tds[0].text.strip()
        name_last = tds[1].text.strip()
        desc = tds[2].text.strip()
    else:
        # 2011: values are wrapped in <p>; only the first <p> of each cell is used
        name = tds[0].find_all('p')[0].text.strip()
        name_first, name_last = parse_name(name)
        desc = tds[1].find_all('p')[0].text.strip()
    return name_first, name_last, desc


def parse_year(fp_in_html, year, encoding='utf-8'):
    """Parse one year's participant HTML page into a list of records.

    Args:
        fp_in_html: path to the saved HTML page.
        year: conference year as a string; selects the table layout.
        encoding: text encoding used to read the file. Explicit so the
            result does not depend on the machine's locale.

    Returns:
        List of dicts with keys ``name_first``, ``name_last``,
        ``description`` and ``year``, one per table row after the first.

    Raises:
        ValueError: if ``year`` is not supported, the page has no
            ``contenttable`` table, or a name cannot be split.
    """
    if year not in _SUPPORTED_YEARS:
        # Previously this surfaced as an unbound-variable error inside the loop.
        raise ValueError(f'Unsupported year: "{year}"')

    # create soup
    with open(fp_in_html, 'r', encoding=encoding) as fp:
        soup = BeautifulSoup(fp.read(), 'lxml')

    # get rows
    table = soup.find('table', attrs={'class': 'contenttable'})
    if table is None:
        raise ValueError(f'No table with class "contenttable" in: {fp_in_html}')
    tbody = table.find('tbody')
    if tbody is None:
        # The markup may omit <tbody>; read the rows from the table directly.
        tbody = table
    trows = tbody.find_all('tr')

    # parse by year; the first row is skipped (presumably the header row)
    participants = []
    for trow in trows[1:]:
        name_first, name_last, desc = _row_fields(trow.find_all('td'), year)
        participants.append({
            'name_first': name_first,
            'name_last': name_last,
            'description': desc,
            'year': year,
        })

    return participants
# Convert each year's HTML page to its own CSV and collect all records.
participants_all = []
for year in years:
    # Read from the input directory; outputs go to the output directory.
    fp_in_html = join(fp_dir_in, f'{year}.html')
    fp_out_csv = join(fp_dir_out, f'{year}.csv')
    participants = parse_year(fp_in_html, year)
    participants_all.extend(participants)
    # A list of record dicts goes straight to the DataFrame constructor.
    pd.DataFrame(participants).to_csv(fp_out_csv, index=False)
    print(f'Wrote: {fp_out_csv} with {len(participants)} items')

# write total list
pd.DataFrame(participants_all).to_csv(fp_out_all_csv, index=False)
print(f'Wrote: {fp_out_all_csv} with {len(participants_all)} items')