1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
import click
from app.utils.logger_utils import Logger
# Module-level logger obtained from the project's Logger helper.
log = Logger.getLogger()
# Default for the -i/--input option: path of the embassy list text file.
fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
# Default for the -o/--output option: path of the CSV file the command writes.
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'
def _parse_embassy_line(line):
    """Parse one ``<url> <title>`` line into a CSV row dict.

    Args:
        line: A raw line from the embassy list.

    Returns:
        A dict with ``title``, ``url`` and ``username`` keys, or ``None``
        when the line is blank.
    """
    # Split on the first run of any whitespace so that tabs, repeated spaces
    # or leading indentation cannot shift the URL into the title column.
    fields = line.split(None, 1)
    if not fields:
        return None
    url = fields[0]
    title = fields[1].strip() if len(fields) > 1 else ''
    # The username is the last path segment of the URL, taken verbatim.
    # (A filename-style "stem" would truncate names that contain a dot.)
    username = url.rstrip('/').rsplit('/', 1)[-1]
    return {'title': title, 'url': url, 'username': username}


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
              help='Input file')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
              help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
    """Convert embassy list to CSV"""
    # NOTE: the docstring above is kept to one line on purpose; click renders
    # it as the command's --help text.
    #
    # Parameters:
    #   ctx        -- click context injected by @click.pass_context (unused).
    #   opt_fp_in  -- path of the text file holding one "<url> <title>" entry
    #                 per line.
    #   opt_fp_out -- path of the CSV file to write, with the columns
    #                 title, url, username.
    #
    # Heavy dependencies are imported lazily so that importing this module
    # (e.g. to build the CLI command list) stays cheap.
    import pandas as pd
    from tqdm import tqdm

    from app.utils import file_utils

    log = Logger.getLogger()
    log.info('converting flickr list to CSV')

    # presumably load_text returns an iterable of lines -- verify against
    # app.utils.file_utils
    embassies = file_utils.load_text(opt_fp_in)

    items = []
    for embassy in tqdm(embassies):
        row = _parse_embassy_line(embassy)
        # Blank lines carry no entry; skip them instead of emitting empty rows.
        if row is not None:
            items.append(row)

    # Fix the column set explicitly so the header row is written even when
    # the input contains no entries.
    df = pd.DataFrame(items, columns=['title', 'url', 'username'])
    df.to_csv(opt_fp_out, index=False)
    log.debug(f'Wrote {len(df)} lines')
|