diff options
Diffstat (limited to 'megapixels/commands/msc/flickr_list_to_csv.py')
| -rw-r--r-- | megapixels/commands/msc/flickr_list_to_csv.py | 48 |
1 files changed, 48 insertions, 0 deletions
"""Convert a plain-text list of embassy Flickr pages into a CSV file."""

import click

from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Default locations used when -i / -o are not given on the command line.
fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
              help='Input text file')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert embassy list to CSV"""
    # NOTE: the docstring above is what click prints as the command help, so
    # it is kept short; the details live in these comments instead.
    #
    # Each non-blank input entry is read as "<url> <title ...>": the first
    # whitespace-delimited token is the URL, the remainder is the title, and
    # the username is derived from the last path component of the URL.
    # The result is written to ``opt_fp_out`` with the columns
    # title, url, username. ``ctx`` is supplied by click and is not used.

    # Heavy imports stay inside the command so that merely importing this
    # module (e.g. for CLI discovery) remains cheap.
    from pathlib import Path

    import pandas as pd
    from tqdm import tqdm

    from app.utils import file_utils

    log = Logger.getLogger()
    log.info('converting flickr list to CSV')

    # presumably load_text returns the file content as a list of lines;
    # verify against app.utils.file_utils.
    embassies = file_utils.load_text(opt_fp_in)

    columns = ['title', 'url', 'username']
    items = []
    for embassy in tqdm(embassies):
        # Split once on any run of whitespace so tab-separated or indented
        # entries are parsed the same way as single-space-separated ones.
        fields = embassy.split(None, 1)
        if not fields:
            # Blank line: skip it instead of emitting an all-empty CSV row.
            continue
        url = fields[0]
        title = fields[1].strip() if len(fields) > 1 else ''
        # NOTE(review): .stem drops everything after the last '.' in the final
        # path component; if usernames may contain dots, .name is likely what
        # is wanted here - confirm against the real list before changing.
        username = Path(url).stem
        items.append({'title': title, 'url': url, 'username': username})

    # Passing the columns explicitly keeps the CSV header present even when
    # the input contains no usable entries.
    df = pd.DataFrame(items, columns=columns)
    df.to_csv(opt_fp_out, index=False)
    log.debug(f'Wrote {len(df)} lines')
