diff options
| author | adamhrv <adam@ahprojects.com> | 2019-05-29 15:24:30 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-05-29 15:24:30 +0200 |
| commit | 5b916111ee1a012650a586ec07bc9150d66020bc (patch) | |
| tree | 128092857e6a9b6d67877e55e05da4f99ea2f5eb /megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb | |
| parent | f5141a7b48ee569089b07428bc75cb84a55c4834 (diff) | |
add MSC nbs and cli cmds
Diffstat (limited to 'megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb | 189 |
1 files changed, 0 insertions, 189 deletions
diff --git a/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb b/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb deleted file mode 100644 index aa819214..00000000 --- a/megapixels/notebooks/datasets/munich_security_conference/html2csv.ipynb +++ /dev/null @@ -1,189 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Convert MSC HTML to CSV\n", - "\n", - "- create name lists" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "\n", - "from os.path import join\n", - "from pathlib import Path\n", - "from functools import partial\n", - "from multiprocessing.dummy import Pool as ThreadPool\n", - "\n", - "import lxml\n", - "from bs4 import BeautifulSoup\n", - "import urllib.request\n", - "from tqdm import tqdm_notebook as tqdm\n", - "import pandas as pd\n", - "\n", - "import sys\n", - "sys.path.append('/work/megapixels_dev/megapixels/')\n", - "from app.settings import app_cfg as cfg\n", - "from app.utils import file_utils, im_utils" - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [], - "source": [ - "fp_dir_in = '/data_store/datasets/munich_security_conference/participants/'\n", - "fp_dir_out = '/data_store/datasets/munich_security_conference/participants/'\n", - "fp_out_all_csv = join(fp_dir_ou, 'participants.csv') # total list\n", - "years = ['2009', '2010', '2011', '2014']" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_name(name):\n", - " try:\n", - " ridx = name.rindex(',')\n", - " except Exception as e:\n", - " # names like \"Ban Ki-moon\" have no comman\n", - " if name == 'Ban Ki-moon':\n", - " name = 'Ki-moon, Ban'\n", - " elif name == 'Fu Ying':\n", - " name = 'Ying, Fu'\n", - " elif name == 'Dr. Ng Eng Hen':\n", - " # unclear: https://en.wikipedia.org/wiki/Ng_Eng_Hen\n", - " name = 'Ng, Dr. Eng Hen' \n", - " elif name == 'Seok-soo Lee':\n", - " name = 'Lee, Seok-soo'\n", - " else:\n", - " print(f'Could not handle: \"{name}\"')\n", - " ridx = name.rindex(',')\n", - " \n", - " name_last = name[:ridx].strip()\n", - " name_first = name[(ridx + 1):].strip()\n", - " return name_first, name_last\n", - " \n", - "def parse_year(fp_in_html, year):\n", - " # create soup\n", - " with open(fp_in_html, 'r') as fp:\n", - " data = fp.read()\n", - " soup = BeautifulSoup(data, 'lxml')\n", - " \n", - " # get rows\n", - " table = soup.find('table', attrs={'class':'contenttable'})\n", - " tbody = table.find('tbody')\n", - " trows = tbody.find_all('tr')\n", - " \n", - " # parse by year\n", - " participants = []\n", - " for trow in trows[1:]:\n", - " if year == '2009' or year == '2014':\n", - " tds = trow.find_all('td')\n", - " name = tds[0].text.strip()\n", - " name_first, name_last = parse_name(name)\n", - " desc = tds[1].text.strip()\n", - " elif year == '2010':\n", - " tds = trow.find_all('td')\n", - " name_first = tds[0].text.strip()\n", - " name_last = tds[1].text.strip()\n", - " desc = tds[2].text.strip()\n", - " elif year == '2011':\n", - " tds = trow.find_all('td')\n", - " name = tds[0].find_all('p')[0].text.strip()\n", - " name_first, name_last = parse_name(name)\n", - " desc = tds[1].find_all('p')[0].text.strip()\n", - " \n", - " obj = {'name_first': name_first, 'name_last': name_last, 'description': desc, 'year': year}\n", - " participants.append(obj)\n", - " \n", - " return participants" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2009\n", - "Wrote: /data_store/datasets/munich_security_conference/participants/2009.csv with 346 items\n", - "2010\n", - "Wrote: /data_store/datasets/munich_security_conference/participants/2010.csv with 317 items\n", - "2011\n", - "Wrote: /data_store/datasets/munich_security_conference/participants/2011.csv with 341 items\n", - "2014\n", - "Wrote: /data_store/datasets/munich_security_conference/participants/2014.csv with 467 items\n", - "Wrote: /data_store/datasets/munich_security_conference/participants/participants.csv with 1471 items\n" - ] - } - ], - "source": [ - "participants_all = []\n", - "for year in years:\n", - " fp_in_html = join(fp_dir_out, f'{year}.html')\n", - " fp_out_csv = join(fp_dir_out, f'{year}.csv')\n", - " participants = parse_year(fp_in_html, year)\n", - " participants_all += participants\n", - " df = pd.DataFrame.from_dict(participants)\n", - " df.to_csv(fp_out_csv, index=False)\n", - " print(f'Wrote: {fp_out_csv} with {len(participants)} items')\n", - "\n", - "# write total list\n", - "\n", - "df = pd.DataFrame.from_dict(participants_all)\n", - "df.to_csv(fp_out_all_csv, index=False)\n", - "print(f'Wrote: {fp_out_all_csv} with {len(participants_all)} items')" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "megapixels", - "language": "python", - "name": "megapixels" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} |
