diff options
Diffstat (limited to 'megapixels/commands/msc/embassy_flickr_api_data_to_csv.py')
| -rw-r--r-- | megapixels/commands/msc/embassy_flickr_api_data_to_csv.py | 120 |
1 files changed, 120 insertions, 0 deletions
"""
Converts directory of JSON API output files to CSV format
"""

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils

# Supported values for the "--query" option; each one selects a different
# Flickr API response layout (see the samples above `_to_row`).
data_types = ['nsid_url', 'nsid_profile']

log = logger_utils.Logger.getLogger()


# Sample input for "nsid_url":
#
#   {
#     "stat": "ok",
#     "user": {
#       "id": "95216244@N04",
#       "username": {
#         "_content": "AfghanistanWien"
#       }
#     }
#   }
#
# Sample input for "nsid_profile":
#
#   {
#     "profile": {
#       "city": "Oslo",
#       "country": "Norway",
#       "facebook": "",
#       "first_name": "US Embassy",
#       "hometown": "Oslo",
#       "id": "133886098@N05",
#       "instagram": "",
#       "join_date": "1436521589",
#       "last_name": "Oslo",
#       "nsid": "133886098@N05",
#       "occupation": "",
#       "pinterest": "",
#       "profile_description": "This is the official Flickr profile of the U.S. Embassy in Oslo, Norway. Contact us: osloirc@state.gov.",
#       "showcase_set": "72157677372281094",
#       "showcase_set_title": "Profile Showcase",
#       "tumblr": "",
#       "twitter": "",
#       "website": "http://norway.usembassy.gov/index.html"
#     },
#     "stat": "ok"
#   }
def _to_row(opt_query, fp_file, metadata):
    """Build one CSV row from a parsed API response.

    :param opt_query: one of `data_types`
    :param fp_file: path of the JSON file the response was loaded from; its
        stem is used as the Flickr path alias for "nsid_url"
    :param metadata: the decoded JSON document
    :returns: dict of column name -> value, or None when the document does
        not contain the record the query type needs
    """
    if not isinstance(metadata, dict):
        return None

    if opt_query == 'nsid_url':
        user = metadata.get('user')
        if not isinstance(user, dict):
            # Response carries no "user" object, so there is nothing to export
            return None
        path_alias = Path(fp_file).stem
        username = user.get('username')
        return {
            'nsid': user.get('id'),
            # A missing username becomes an empty cell instead of a crash
            'username': username.get('_content') if isinstance(username, dict) else None,
            'url': f'https://www.flickr.com/photos/{path_alias}',
            'path_alias': path_alias,
            'filename': f'{path_alias}.json',
        }

    if opt_query == 'nsid_profile':
        profile = metadata.get('profile')
        return profile if isinstance(profile, dict) else None

    return None


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-q', '--query', 'opt_query', required=True, type=click.Choice(data_types),
              help='Flickr API data type')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query):
    """Converts a directory of Flickr API JSON files to one CSV file"""

    # Imported lazily so the progress-bar dependency is only needed when the
    # command actually runs.
    from tqdm import tqdm

    # -------------------------------------------------
    # process
    if Path(opt_fp_out).is_file() and not opt_force:
        log.error('File exists. Use "--force" to overwrite it')
        return

    # Test the file name only: matching against the full path would discard
    # every file whenever the input directory path contains "error".
    # Sort because glob() returns files in arbitrary order, which would make
    # "--slice" select a different subset from run to run.
    fp_files = sorted(
        f for f in glob(join(opt_fp_in, '*.json'))
        if 'error' not in Path(f).name
    )
    if opt_slice:
        # The default (None, None) slices the full list
        fp_files = fp_files[opt_slice[0]:opt_slice[1]]

    log.debug(f'Found {len(fp_files)} files')

    items = []
    for fp_file in tqdm(fp_files):
        metadata = file_utils.load_json(fp_file)
        obj = _to_row(opt_query, fp_file, metadata)
        if obj is None:
            # Keep going: one unusable response should not abort the export
            log.error(f'Skipping {fp_file}: no usable "{opt_query}" record')
            continue
        items.append(obj)

    # Convert to DataFrame and write out
    df = pd.DataFrame(items)
    df.to_csv(opt_fp_out, index=False)
    log.info(f'Wrote {len(df)} to {opt_fp_out}')
\ No newline at end of file |
