summaryrefslogtreecommitdiff
path: root/megapixels/commands/msc/flickr_list_to_csv.py
blob: f107db60bb67d980f959bc380994d4b464e432f4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import click

from app.utils.logger_utils import Logger

# Module-level logger obtained from the project's Logger helper.
log = Logger.getLogger()

# Default value for the --input option of the command below.
fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'
# Default value for the --output option of the command below.
fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'

def _parse_embassy_line(line):
  """Turn one "<url> <title>" text line into a CSV row dict.

  The first whitespace-delimited token is the URL; the rest of the line
  (if any) is the title. Returns None for blank/whitespace-only lines so
  the caller can skip them instead of emitting empty rows.
  """
  from pathlib import Path

  # split(None, 1) ignores leading whitespace and accepts tabs as well as
  # spaces, while leaving any spacing inside the title untouched.
  parts = line.split(None, 1)
  if not parts:
    return None

  url = parts[0]
  title = parts[1].strip() if len(parts) > 1 else ''
  # The username is taken to be the last URL path component. `.name`
  # (unlike `.stem`) keeps everything after a dot, so a component such as
  # "us.embassy" is not truncated to "us".
  username = Path(url).name
  return {'title': title, 'url': url, 'username': username}


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
  help='Input text file')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
  help='Output file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out):
  """Convert embassy list to CSV"""
  # ctx:        click context (unused here, supplied by @click.pass_context)
  # opt_fp_in:  path of the text list to read; each entry is expected to be
  #             "<url> <title>" (presumably one entry per line, as returned
  #             by file_utils.load_text - confirm against that helper)
  # opt_fp_out: path of the CSV file to write (title, url, username columns)

  import pandas as pd
  from tqdm import tqdm

  from app.utils import file_utils

  log = Logger.getLogger()
  log.info('converting flickr list to CSV')

  embassies = file_utils.load_text(opt_fp_in)

  items = []
  for embassy in tqdm(embassies):
    row = _parse_embassy_line(embassy)
    if row is not None:
      items.append(row)

  # Explicit columns keep the header stable even when no rows were parsed.
  df = pd.DataFrame(items, columns=['title', 'url', 'username'])
  df.to_csv(opt_fp_out, index=False)
  log.debug(f'Wrote {len(df)} lines')