summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-05-29 15:24:30 +0200
committeradamhrv <adam@ahprojects.com>2019-05-29 15:24:30 +0200
commit5b916111ee1a012650a586ec07bc9150d66020bc (patch)
tree128092857e6a9b6d67877e55e05da4f99ea2f5eb /megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb
parentf5141a7b48ee569089b07428bc75cb84a55c4834 (diff)
add MSC nbs and cli cmds
Diffstat (limited to 'megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb')
-rw-r--r--megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb189
1 files changed, 0 insertions, 189 deletions
diff --git a/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb b/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb
deleted file mode 100644
index aa819214..00000000
--- a/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb
+++ /dev/null
@@ -1,189 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Convert MSC HTML to CSV\n",
- "\n",
- "- create name lists"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext autoreload\n",
- "%autoreload 2\n",
- "\n",
- "from os.path import join\n",
- "from pathlib import Path\n",
- "from functools import partial\n",
- "from multiprocessing.dummy import Pool as ThreadPool\n",
- "\n",
- "import lxml\n",
- "from bs4 import BeautifulSoup\n",
- "import urllib.request\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "import pandas as pd\n",
- "\n",
- "import sys\n",
- "sys.path.append('/work/megapixels_dev/megapixels/')\n",
- "from app.settings import app_cfg as cfg\n",
- "from app.utils import file_utils, im_utils"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "metadata": {},
- "outputs": [],
- "source": [
- "# input HTML pages and output CSVs currently share the same directory\n",
- "fp_dir_in = '/data_store/datasets/munich_security_conference/participants/'\n",
- "fp_dir_out = '/data_store/datasets/munich_security_conference/participants/'\n",
- "fp_out_all_csv = join(fp_dir_out, 'participants.csv') # total list\n",
- "years = ['2009', '2010', '2011', '2014']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 143,
- "metadata": {},
- "outputs": [],
- "source": [
- "def parse_name(name):\n",
- "    \"\"\"Split a participant name into (first, last).\n",
- "\n",
- "    Names are expected as \"Last, First\" and are split at the last comma.\n",
- "    A few participants are listed without a comma; those are rewritten\n",
- "    to the \"Last, First\" form before splitting.\n",
- "\n",
- "    Args:\n",
- "        name: participant name as read from the HTML table.\n",
- "\n",
- "    Returns:\n",
- "        Tuple (name_first, name_last), each stripped of surrounding whitespace.\n",
- "\n",
- "    Raises:\n",
- "        ValueError: if the name has no comma and no known rewrite.\n",
- "    \"\"\"\n",
- "    # names like \"Ban Ki-moon\" have no comma\n",
- "    names_without_comma = {\n",
- "        'Ban Ki-moon': 'Ki-moon, Ban',\n",
- "        'Fu Ying': 'Ying, Fu',\n",
- "        # unclear: https://en.wikipedia.org/wiki/Ng_Eng_Hen\n",
- "        'Dr. Ng Eng Hen': 'Ng, Dr. Eng Hen',\n",
- "        'Seok-soo Lee': 'Lee, Seok-soo',\n",
- "    }\n",
- "    if ',' not in name:\n",
- "        if name not in names_without_comma:\n",
- "            # fail with the offending value instead of an opaque 'substring not found'\n",
- "            raise ValueError(f'Could not handle: \"{name}\"')\n",
- "        name = names_without_comma[name]\n",
- "\n",
- "    # split at the LAST comma: everything before it is the last name\n",
- "    name_last, _, name_first = name.rpartition(',')\n",
- "    return name_first.strip(), name_last.strip()\n",
- " \n",
- "def parse_year(fp_in_html, year):\n",
- "    \"\"\"Parse one year's participant HTML page into a list of records.\n",
- "\n",
- "    Args:\n",
- "        fp_in_html: path to the saved HTML page for that year.\n",
- "        year: year as a string, one of '2009', '2010', '2011', '2014'.\n",
- "            It selects which table layout is used to read each row.\n",
- "\n",
- "    Returns:\n",
- "        List of dicts with keys 'name_first', 'name_last', 'description'\n",
- "        and 'year', one per table row after the first.\n",
- "\n",
- "    Raises:\n",
- "        ValueError: if `year` has no known table layout, or the page has no\n",
- "            table of class 'contenttable' with a tbody.\n",
- "    \"\"\"\n",
- "    if year not in ('2009', '2010', '2011', '2014'):\n",
- "        # an unknown year would otherwise match no branch and fail on unbound locals\n",
- "        raise ValueError(f'Unsupported year: \"{year}\"')\n",
- "\n",
- "    # create soup\n",
- "    # NOTE(review): no explicit encoding is passed, so the platform default is used;\n",
- "    # confirm the encoding of the saved HTML files\n",
- "    with open(fp_in_html, 'r') as fp:\n",
- "        soup = BeautifulSoup(fp.read(), 'lxml')\n",
- "\n",
- "    # get rows\n",
- "    table = soup.find('table', attrs={'class': 'contenttable'})\n",
- "    tbody = table.find('tbody') if table is not None else None\n",
- "    if tbody is None:\n",
- "        raise ValueError(f'No contenttable body found in: {fp_in_html}')\n",
- "\n",
- "    # parse by year; the first row is skipped (presumably the header row)\n",
- "    participants = []\n",
- "    for trow in tbody.find_all('tr')[1:]:\n",
- "        tds = trow.find_all('td')\n",
- "        if year == '2010':\n",
- "            # first name, last name and description are separate columns\n",
- "            name_first = tds[0].text.strip()\n",
- "            name_last = tds[1].text.strip()\n",
- "            desc = tds[2].text.strip()\n",
- "        elif year == '2011':\n",
- "            # only the first <p> of each cell is used\n",
- "            name = tds[0].find_all('p')[0].text.strip()\n",
- "            name_first, name_last = parse_name(name)\n",
- "            desc = tds[1].find_all('p')[0].text.strip()\n",
- "        else:\n",
- "            # 2009 and 2014: one combined name column, then the description\n",
- "            name = tds[0].text.strip()\n",
- "            name_first, name_last = parse_name(name)\n",
- "            desc = tds[1].text.strip()\n",
- "\n",
- "        participants.append({\n",
- "            'name_first': name_first,\n",
- "            'name_last': name_last,\n",
- "            'description': desc,\n",
- "            'year': year,\n",
- "        })\n",
- "\n",
- "    return participants"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 148,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2009\n",
- "Wrote: /data_store/datasets/munich_security_conference/participants/2009.csv with 346 items\n",
- "2010\n",
- "Wrote: /data_store/datasets/munich_security_conference/participants/2010.csv with 317 items\n",
- "2011\n",
- "Wrote: /data_store/datasets/munich_security_conference/participants/2011.csv with 341 items\n",
- "2014\n",
- "Wrote: /data_store/datasets/munich_security_conference/participants/2014.csv with 467 items\n",
- "Wrote: /data_store/datasets/munich_security_conference/participants/participants.csv with 1471 items\n"
- ]
- }
- ],
- "source": [
- "participants_all = []\n",
- "for year in years:\n",
- "    # read each year's page from the input directory, write its CSV to the output directory\n",
- "    fp_in_html = join(fp_dir_in, f'{year}.html')\n",
- "    fp_out_csv = join(fp_dir_out, f'{year}.csv')\n",
- "    participants = parse_year(fp_in_html, year)\n",
- "    participants_all += participants\n",
- "    df = pd.DataFrame(participants)\n",
- "    df.to_csv(fp_out_csv, index=False)\n",
- "    print(f'Wrote: {fp_out_csv} with {len(participants)} items')\n",
- "\n",
- "# write total list\n",
- "\n",
- "df = pd.DataFrame(participants_all)\n",
- "df.to_csv(fp_out_all_csv, index=False)\n",
- "print(f'Wrote: {fp_out_all_csv} with {len(participants_all)} items')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 94,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 95,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "megapixels",
- "language": "python",
- "name": "megapixels"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}