from pathlib import Path

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Default locations of the embassy list (input) and the generated CSV (output).
fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'

# Column order of the generated CSV; also used so that an empty input still
# produces a header row.
_CSV_COLUMNS = ['title', 'url', 'username']


def _parse_embassy_line(line):
  """Split one line of the embassy list into a CSV record.

  A line is expected to look like ``<url> <title words...>``: the first
  whitespace-delimited token is the URL and the remainder is the title.

  Args:
    line: A single line of text from the embassy list.

  Returns:
    A dict with the keys ``title``, ``url`` and ``username``, or ``None``
    when the line is empty or contains only whitespace.
  """
  # split(None, 1) ignores leading whitespace and accepts any whitespace
  # (spaces or tabs) as the separator between the URL and the title.
  parts = line.split(None, 1)
  if not parts:
    return None
  url = parts[0]
  title = parts[1].strip() if len(parts) > 1 else ''
  # The username is taken from the last path segment of the URL.
  # NOTE(review): .stem drops everything after the last dot in that segment;
  # confirm usernames never contain dots, otherwise .name would be needed.
  username = Path(url).stem
  return {'title': title, 'url': url, 'username': username}


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  default=fp_in,
  help='Input text file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  default=fp_out,
  help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Convert embassy list to CSV"""
  # The docstring above is shown by click as the command's --help text, so
  # it is intentionally kept short.
  #
  # opt_fp_in:  path of the text file with one "<url> <title>" entry per line
  # opt_fp_out: path of the CSV file to write (columns: title, url, username)

  # Heavy imports stay inside the command so importing this module is cheap.
  import pandas as pd
  from tqdm import tqdm

  from app.utils import file_utils

  # Fetched again at call time, as in the original.
  # NOTE(review): presumably the logger may be configured after import - confirm.
  log = Logger.getLogger()
  log.info('converting flickr list to CSV')

  embassies = file_utils.load_text(opt_fp_in)
  items = []
  for embassy in tqdm(embassies):
    item = _parse_embassy_line(embassy)
    # Blank lines carry no record; skip them instead of writing empty rows.
    if item is not None:
      items.append(item)

  # Explicit columns keep the header stable even when no records were parsed.
  df = pd.DataFrame(items, columns=_CSV_COLUMNS)
  df.to_csv(opt_fp_out, index=False)
  log.debug(f'Wrote {len(df)} lines')