summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/flickr_api_to_csv.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/flickr_api_to_csv.py')
-rw-r--r--megapixels/commands/datasets/flickr_api_to_csv.py382
1 files changed, 382 insertions, 0 deletions
diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py
new file mode 100644
index 00000000..5b5f0ce3
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api_to_csv.py
@@ -0,0 +1,382 @@
+"""
+Converts directory of JSON API output files to CSV format
+"""
+
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+# Values accepted by the -q/--query option of the command below (passed to
+# click.Choice); the selected value decides how each JSON file is flattened.
+query_types = ['nsid_profile', 'nsid_url', 'photo_id']
+
+# Logger used by this command for debug/info/error messages.
+log = logger_utils.Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+ help='Force overwrite')
+@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type):
+ """Fetches Flickr API for user info. Saves to JSON"""
+
+ from tqdm import tqdm
+ from glob import glob
+ import json
+
+
+ # -------------------------------------------------
+ # process
+ if Path(opt_fp_out).is_file() and not opt_force:
+ log.error('File exists. Use "--force" to overwrite it')
+ return
+
+ fp_files = glob(join(opt_fp_in, '*.json'))
+ fp_files = [f for f in fp_files if 'error' not in f]
+ if opt_slice:
+ fp_files = fp_files[opt_slice[0]:opt_slice[1]]
+
+ log.debug(f'Found {len(fp_files)} files')
+ items = []
+
+ for fp_file in tqdm(fp_files):
+
+ if opt_query_type == 'photo_id':
+ try:
+ photo = file_utils.load_json(fp_file).get('photo')
+ except Exception as e:
+ log.error(f'{e}, skipping: {fp_file}')
+ continue
+ dates = photo.get('dates')
+ posted = dates.get('posted')
+ taken = dates.get('taken')
+ description = photo.get('description').get('_content')
+ location = photo.get('location', {})
+ country = location.get('country', {})
+ location_country = country.get('_country', '')
+ location_place = country.get('place_id', '')
+ location_woeid = country.get('woeid', '')
+ location_lat = location.get('latitude', '')
+ location_lon = location.get('longitude', '')
+ location_place_id = location.get('place_id', '')
+ owner = photo.get('owner')
+ nsid = owner.get('nsid')
+ path_alias = owner.get('path_alias')
+ owner_realname = owner.get('realname')
+ owner_username = owner.get('username')
+ owner_location = owner.get('location')
+ photo_id = Path(fp_file).stem
+ server = photo.get('server')
+ farm = photo.get('farm')
+ secret = photo.get('secret')
+ # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg
+ image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'
+
+ obj = {
+ 'posted': posted,
+ 'taken': taken,
+ 'description': description,
+ 'country': location_country,
+ 'place': location_place,
+ 'woeid': location_woeid,
+ 'lat': location_lat,
+ 'lon': location_lon,
+ 'place_id': location_place_id,
+ 'nsid': nsid,
+ 'path_alias': path_alias,
+ 'realname': owner_realname,
+ 'username': owner_username,
+ 'owner_location': owner_location,
+ 'photo_id': photo_id,
+ 'secret': secret,
+ 'url': image_url
+ }
+
+
+ elif opt_query_type == 'nsid_profile':
+ obj = file_utils.load_json(fp_file).get('profile')
+ obj.pop('showcase_set')
+ obj.pop('showcase_set_title')
+ obj.pop('pinterest')
+ obj.pop('tumblr')
+ elif opt_query_type == 'nsid_url':
+ obj = file_utils.load_json(fp_file).get('user')
+ elif opt_query_type == 'user_profile':
+ metadata = file_utils.load_json(fp_file).get('photo')
+ owner = metadata.get('owner')
+ path_alias = owner.get('path_alias')
+ nsid = owner.get('nsid')
+ username = owner.get('username')
+ realname = owner.get('realname')
+ description = metadata.get('description').get('_content')
+ title = metadata.get('title').get('_content')
+ location = metadata.get('location')
+ dates = metadata.get('dates')
+ date_taken = dates.get('taken')
+ date_posted = dates.get('posted')
+ fname = Path(fp_file).stem
+ obj = {
+ 'photo_id': fname,
+ 'nsid': nsid,
+ 'path_alias': path_alias,
+ 'username': username,
+ 'realname': realname,
+ 'title': title,
+ 'description': description,
+ 'location': location,
+ 'date_taken': date_taken,
+ 'date_posted': date_posted
+ }
+
+ items.append(obj)
+
+ # conver to DataFrame
+ df = pd.DataFrame.from_dict(items)
+ df.to_csv(opt_fp_out, index=False)
+ log.info(f'Wrote {len(df)} to {opt_fp_out}')
+
+"""
+nsid_url
+ {
+ "stat": "ok",
+ "user": {
+ "nsid": "7153718@N04",
+ "url": "https://www.flickr.com/people/babyfish4/"
+ }
+}
+"""
+"""
+ location: of the owner
+ dateuploaded
+ license
+ "dates":
+ "lastupdate": "1416447096"
+ "posted": "1112900873"
+ "taken": "2005-04-06 18:37:38"
+ description:
+ _content: playing in a field
+ title:
+ _content: jessica
+ location: cornwall, uk
+"""
+
+"""
+ {
+ "profile": {
+ "city": null,
+ "country": null,
+ "facebook": "",
+ "first_name": null,
+ "hometown": "",
+ "id": "7153718@N04",
+ "instagram": "",
+ "join_date": "1172669959",
+ "last_name": null,
+ "nsid": "7153718@N04",
+ "occupation": "",
+ "pinterest": "",
+ "profile_description": "",
+ "showcase_set": "72157680616398790",
+ "showcase_set_title": "Profile Showcase",
+ "tumblr": "",
+ "twitter": ""
+ },
+ "stat": "ok"
+}
+"""
+
+"""
+photo_id
+
+
+ {
+ "photo": {
+ "comments": {
+ "_content": "0"
+ },
+ "dates": {
+ "lastupdate": "0",
+ "posted": "1094612969",
+ "taken": "2004-09-04 22:41:18",
+ "takengranularity": "0",
+ "takenunknown": 0
+ },
+ "dateuploaded": "1094612969",
+ "description": {
+ "_content": ""
+ },
+ "editability": {
+ "canaddmeta": 0,
+ "cancomment": 0
+ },
+ "farm": 1,
+ "geoperms": {
+ "iscontact": 0,
+ "isfamily": 0,
+ "isfriend": 0,
+ "ispublic": 1
+ },
+ "id": "371498",
+ "isfavorite": 0,
+ "license": "1",
+ "location": {
+ "accuracy": "15",
+ "context": "0",
+ "country": {
+ "_content": "United States",
+ "place_id": "nz.gsghTUb4c2WAecA",
+ "woeid": "23424977"
+ },
+ "county": {
+ "_content": "Tompkins",
+ "place_id": "1uCJJtBQUL80G6hbPw",
+ "woeid": "12589366"
+ },
+ "latitude": "42.399028",
+ "longitude": "-76.652519",
+ "place_id": "1uCJJtBQUL80G6hbPw",
+ "region": {
+ "_content": "New York",
+ "place_id": "ODHTuIhTUb75gdBu",
+ "woeid": "2347591"
+ },
+ "woeid": "12589366"
+ },
+ "media": "photo",
+ "notes": {
+ "note": []
+ },
+ "originalformat": "jpg",
+ "originalsecret": "704f392686",
+ "owner": {
+ "iconfarm": 1,
+ "iconserver": "1",
+ "location": "Los Angeles, CA, USA",
+ "nsid": "48600072071@N01",
+ "path_alias": "barb",
+ "realname": "Barb Dybwad",
+ "username": "doctor paradox"
+ },
+ "people": {
+ "haspeople": 0
+ },
+ "publiceditability": {
+ "canaddmeta": 0,
+ "cancomment": 1
+ },
+ "rotation": 0,
+ "safety_level": "0",
+ "secret": "704f392686",
+ "server": "1",
+ "tags": {
+ "tag": [
+ {
+ "_content": "unfound",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-9017",
+ "machine_tag": 0,
+ "raw": "unfound"
+ },
+ {
+ "_content": "digicam",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-40406",
+ "machine_tag": 0,
+ "raw": "digicam"
+ },
+ {
+ "_content": "upstateny",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-9655",
+ "machine_tag": 0,
+ "raw": "upstateny"
+ },
+ {
+ "_content": "musefest",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-72456",
+ "machine_tag": 0,
+ "raw": "musefest"
+ },
+ {
+ "_content": "musicfestival",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-72628",
+ "machine_tag": 0,
+ "raw": "musicfestival"
+ },
+ {
+ "_content": "people",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-290",
+ "machine_tag": 0,
+ "raw": "people"
+ },
+ {
+ "_content": "portrait",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-278",
+ "machine_tag": 0,
+ "raw": "portrait"
+ },
+ {
+ "_content": "maco",
+ "author": "48600072071@N01",
+ "authorname": "doctor paradox",
+ "id": "28255-371498-19439",
+ "machine_tag": 0,
+ "raw": "maco"
+ }
+ ]
+ },
+ "title": {
+ "_content": "maco2"
+ },
+ "urls": {
+ "url": [
+ {
+ "_content": "https://www.flickr.com/photos/barb/371498/",
+ "type": "photopage"
+ }
+ ]
+ },
+ "usage": {
+ "canblog": 0,
+ "candownload": 1,
+ "canprint": 0,
+ "canshare": 1
+ },
+ "views": "290",
+ "visibility": {
+ "isfamily": 0,
+ "isfriend": 0,
+ "ispublic": 1
+ }
+ },
+ "stat": "ok"
+}
+""" \ No newline at end of file