summary refs log tree commit diff
path: root/megapixels/commands/msc/flickr_list_to_csv.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/msc/flickr_list_to_csv.py')
-rw-r--r-- megapixels/commands/msc/flickr_list_to_csv.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/megapixels/commands/msc/flickr_list_to_csv.py b/megapixels/commands/msc/flickr_list_to_csv.py
new file mode 100644
index 00000000..f107db60
--- /dev/null
+++ b/megapixels/commands/msc/flickr_list_to_csv.py
@@ -0,0 +1,48 @@
+import click
+
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()  # module-level logger; cli() rebinds its own local `log`
+
+fp_in = '/data_store/datasets/msc/embassies/embassy-list.txt'  # default for -i/--input
+fp_out = '/data_store/datasets/msc/embassies/embassies_on_flickr.csv'  # default for -o/--output
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
+    help='Input text file, one "<url> <title>" entry per line')
+@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
+    help='Output file')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+    """Convert embassy list to CSV
+
+    Each non-blank entry loaded from opt_fp_in is read as "<url> <title>":
+    the text before the first space is the URL, the remainder is the title.
+    The CSV written to opt_fp_out has the columns title, url and username,
+    where username is the stem of the URL's last path component.
+    """
+    from pathlib import Path
+
+    import pandas as pd
+    from tqdm import tqdm
+
+    from app.utils import file_utils
+
+    log = Logger.getLogger()
+    log.info('converting flickr list to CSV')
+
+    items = []
+    for embassy in tqdm(file_utils.load_text(opt_fp_in)):
+        # split on the first space only; the title itself may contain spaces
+        url, _, title = embassy.strip().partition(' ')
+        if not url:
+            continue  # blank entry: would otherwise become an all-empty row
+        username = Path(url).stem
+        items.append({'title': title.strip(), 'url': url, 'username': username})
+
+    # explicit columns keep the CSV header even when no entries were parsed
+    df = pd.DataFrame(items, columns=['title', 'url', 'username'])
+    df.to_csv(opt_fp_out, index=False)
+    log.debug(f'Wrote {len(df)} lines')
+
+