diff options
Diffstat (limited to 'megapixels/commands/datasets/flickr_api_to_csv.py')
| -rw-r--r-- | megapixels/commands/datasets/flickr_api_to_csv.py | 382 |
1 files changed, 382 insertions, 0 deletions
"""
Converts directory of JSON API output files to CSV format
"""

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


query_types = ['nsid_profile', 'nsid_url', 'photo_id']

log = logger_utils.Logger.getLogger()

# Profile fields that are left out of the CSV output
_PROFILE_DROP_KEYS = ('showcase_set', 'showcase_set_title', 'pinterest', 'tumblr')


def _photo_to_row(photo, photo_id):
    """Flattens one "photo" payload into a CSV row.

    Args:
        photo: dict stored under the top-level "photo" key of the JSON file.
        photo_id: stem of the JSON filename, used as the photo ID.

    Returns:
        dict mapping CSV column name to value. Missing sub-objects produce
        empty/None values instead of raising.
    """
    dates = photo.get('dates') or {}
    description = photo.get('description') or {}
    location = photo.get('location') or {}
    country = location.get('country') or {}
    owner = photo.get('owner') or {}
    server = photo.get('server')
    farm = photo.get('farm')
    secret = photo.get('secret')
    # e.g. https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg
    image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'
    return {
        'posted': dates.get('posted'),
        'taken': dates.get('taken'),
        'description': description.get('_content'),
        # the country name is stored under "_content" (see the sample payload
        # at the bottom of this file); "_country" never exists
        'country': country.get('_content', ''),
        'place': country.get('place_id', ''),
        'woeid': country.get('woeid', ''),
        'lat': location.get('latitude', ''),
        'lon': location.get('longitude', ''),
        'place_id': location.get('place_id', ''),
        'nsid': owner.get('nsid'),
        'path_alias': owner.get('path_alias'),
        'realname': owner.get('realname'),
        'username': owner.get('username'),
        'owner_location': owner.get('location'),
        'photo_id': photo_id,
        'secret': secret,
        'url': image_url,
    }


def _profile_to_row(profile, _photo_id):
    """Returns a copy of a "profile" payload without the dropped fields."""
    return {k: v for k, v in profile.items() if k not in _PROFILE_DROP_KEYS}


def _user_to_row(user, _photo_id):
    """Returns a "user" payload (nsid + url) unchanged, as a CSV row."""
    return dict(user)


def _user_profile_to_row(photo, photo_id):
    """Builds the reduced owner/title row from a "photo" payload.

    Args:
        photo: dict stored under the top-level "photo" key of the JSON file.
        photo_id: stem of the JSON filename, used as the photo ID.

    Returns:
        dict mapping CSV column name to value.
    """
    owner = photo.get('owner') or {}
    dates = photo.get('dates') or {}
    description = photo.get('description') or {}
    title = photo.get('title') or {}
    return {
        'photo_id': photo_id,
        'nsid': owner.get('nsid'),
        'path_alias': owner.get('path_alias'),
        'username': owner.get('username'),
        'realname': owner.get('realname'),
        'title': title.get('_content'),
        'description': description.get('_content'),
        'location': photo.get('location'),
        'date_taken': dates.get('taken'),
        'date_posted': dates.get('posted'),
    }


# query type -> (top-level JSON key holding the payload, row builder)
# NOTE: 'user_profile' is not listed in `query_types`, so click rejects it
# before `cli` runs; the builder is kept so that re-enabling it only needs a
# `query_types` entry.
_ROW_BUILDERS = {
    'photo_id': ('photo', _photo_to_row),
    'nsid_profile': ('profile', _profile_to_row),
    'nsid_url': ('user', _user_to_row),
    'user_profile': ('photo', _user_profile_to_row),
}


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
    help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
    help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
    help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
    help='Force overwrite')
@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True,
    help='Type of Flickr API query the JSON files came from')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type):
    """Converts a directory of Flickr API JSON files to one CSV file

    \f
    Text below the form feed is hidden from `--help` by click.

    Args:
        ctx: click context (unused).
        opt_fp_in: directory containing the `*.json` API responses.
        opt_fp_out: path of the CSV file to write.
        opt_slice: (start, end) pair applied to the sorted file list;
            (None, None) keeps every file.
        opt_force: overwrite `opt_fp_out` if it already exists.
        opt_query_type: which kind of API response the files contain; selects
            the payload key and the row layout.

    Files that cannot be loaded, or that lack the expected payload key, are
    logged and skipped rather than aborting the whole run.
    """

    from tqdm import tqdm

    # -------------------------------------------------
    # process
    if Path(opt_fp_out).is_file() and not opt_force:
        log.error('File exists. Use "--force" to overwrite it')
        return

    builder = _ROW_BUILDERS.get(opt_query_type)
    if builder is None:
        # guards against appending a stale/undefined row for unknown types
        log.error(f'Unsupported query type: {opt_query_type}')
        return
    payload_key, build_row = builder

    # glob order depends on the file system; sort so --slice is reproducible
    fp_files = sorted(glob(join(opt_fp_in, '*.json')))
    # test the filename only, so an input path containing "error" does not
    # silently exclude every file
    fp_files = [f for f in fp_files if 'error' not in Path(f).name]
    slice_start, slice_end = opt_slice or (None, None)
    fp_files = fp_files[slice_start:slice_end]

    log.debug(f'Found {len(fp_files)} files')
    items = []

    for fp_file in tqdm(fp_files):
        try:
            payload = file_utils.load_json(fp_file).get(payload_key)
        except Exception as e:
            log.error(f'{e}, skipping: {fp_file}')
            continue
        if not payload:
            # e.g. a response without the expected top-level object
            log.error(f'No "{payload_key}" in file, skipping: {fp_file}')
            continue
        items.append(build_row(payload, Path(fp_file).stem))

    # convert to DataFrame
    df = pd.DataFrame(items)
    df.to_csv(opt_fp_out, index=False)
    log.info(f'Wrote {len(df)} to {opt_fp_out}')


# ---------------------------------------------------------------------------
# Sample API payloads, kept for reference
# ---------------------------------------------------------------------------

"""
nsid_url
{
  "stat": "ok",
  "user": {
    "nsid": "7153718@N04",
    "url": "https://www.flickr.com/people/babyfish4/"
  }
}
"""
"""
  location: of the owner
  dateuploaded
  license
  "dates":
    "lastupdate": "1416447096"
    "posted": "1112900873"
    "taken": "2005-04-06 18:37:38"
  description:
    _content: playing in a field
  title:
    _content: jessica
  location: cornwall, uk
"""

"""
nsid_profile
{
  "profile": {
    "city": null,
    "country": null,
    "facebook": "",
    "first_name": null,
    "hometown": "",
    "id": "7153718@N04",
    "instagram": "",
    "join_date": "1172669959",
    "last_name": null,
    "nsid": "7153718@N04",
    "occupation": "",
    "pinterest": "",
    "profile_description": "",
    "showcase_set": "72157680616398790",
    "showcase_set_title": "Profile Showcase",
    "tumblr": "",
    "twitter": ""
  },
  "stat": "ok"
}
"""

"""
photo_id


{
  "photo": {
    "comments": {
      "_content": "0"
    },
    "dates": {
      "lastupdate": "0",
      "posted": "1094612969",
      "taken": "2004-09-04 22:41:18",
      "takengranularity": "0",
      "takenunknown": 0
    },
    "dateuploaded": "1094612969",
    "description": {
      "_content": ""
    },
    "editability": {
      "canaddmeta": 0,
      "cancomment": 0
    },
    "farm": 1,
    "geoperms": {
      "iscontact": 0,
      "isfamily": 0,
      "isfriend": 0,
      "ispublic": 1
    },
    "id": "371498",
    "isfavorite": 0,
    "license": "1",
    "location": {
      "accuracy": "15",
      "context": "0",
      "country": {
        "_content": "United States",
        "place_id": "nz.gsghTUb4c2WAecA",
        "woeid": "23424977"
      },
      "county": {
        "_content": "Tompkins",
        "place_id": "1uCJJtBQUL80G6hbPw",
        "woeid": "12589366"
      },
      "latitude": "42.399028",
      "longitude": "-76.652519",
      "place_id": "1uCJJtBQUL80G6hbPw",
      "region": {
        "_content": "New York",
        "place_id": "ODHTuIhTUb75gdBu",
        "woeid": "2347591"
      },
      "woeid": "12589366"
    },
    "media": "photo",
    "notes": {
      "note": []
    },
    "originalformat": "jpg",
    "originalsecret": "704f392686",
    "owner": {
      "iconfarm": 1,
      "iconserver": "1",
      "location": "Los Angeles, CA, USA",
      "nsid": "48600072071@N01",
      "path_alias": "barb",
      "realname": "Barb Dybwad",
      "username": "doctor paradox"
    },
    "people": {
      "haspeople": 0
    },
    "publiceditability": {
      "canaddmeta": 0,
      "cancomment": 1
    },
    "rotation": 0,
    "safety_level": "0",
    "secret": "704f392686",
    "server": "1",
    "tags": {
      "tag": [
        {
          "_content": "unfound",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-9017",
          "machine_tag": 0,
          "raw": "unfound"
        },
        {
          "_content": "digicam",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-40406",
          "machine_tag": 0,
          "raw": "digicam"
        },
        {
          "_content": "upstateny",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-9655",
          "machine_tag": 0,
          "raw": "upstateny"
        },
        {
          "_content": "musefest",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-72456",
          "machine_tag": 0,
          "raw": "musefest"
        },
        {
          "_content": "musicfestival",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-72628",
          "machine_tag": 0,
          "raw": "musicfestival"
        },
        {
          "_content": "people",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-290",
          "machine_tag": 0,
          "raw": "people"
        },
        {
          "_content": "portrait",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-278",
          "machine_tag": 0,
          "raw": "portrait"
        },
        {
          "_content": "maco",
          "author": "48600072071@N01",
          "authorname": "doctor paradox",
          "id": "28255-371498-19439",
          "machine_tag": 0,
          "raw": "maco"
        }
      ]
    },
    "title": {
      "_content": "maco2"
    },
    "urls": {
      "url": [
        {
          "_content": "https://www.flickr.com/photos/barb/371498/",
          "type": "photopage"
        }
      ]
    },
    "usage": {
      "canblog": 0,
      "candownload": 1,
      "canprint": 0,
      "canshare": 1
    },
    "views": "290",
    "visibility": {
      "isfamily": 0,
      "isfriend": 0,
      "ispublic": 1
    }
  },
  "stat": "ok"
}
"""
\ No newline at end of file |
