diff options
Diffstat (limited to 'megapixels/commands/datasets/flickr_api.py')
| -rw-r--r-- | megapixels/commands/datasets/flickr_api.py | 202 |
1 files changed, 202 insertions, 0 deletions
# Recovered from: diff --git a/megapixels/commands/datasets/flickr_api.py
#                              b/megapixels/commands/datasets/flickr_api.py
# (new file mode 100644, index 00000000..780ede49, @@ -0,0 +1,202 @@)

from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


query_types = ['photo_id', 'album_id', 'flickr_id']
# flickr_id: 123456789@N01
# photo_id:

log = logger_utils.Logger.getLogger()


def _collect_api_credentials(opt_api_key, opt_api_secret, max_pairs=9):
    """Return the list of (api_key, api_secret) pairs to rotate between.

    The pair given on the command line (or resolved by click from
    FLICKR_API_KEY_1 / FLICKR_API_SECRET_1) always comes first, so the
    returned list is never empty. Additional pairs are read from the
    environment variables FLICKR_API_KEY_<i> / FLICKR_API_SECRET_<i>
    for i in 1..max_pairs; incomplete or duplicate pairs are skipped.
    """
    credentials = [(opt_api_key, opt_api_secret)]
    for i in range(1, max_pairs + 1):
        api_key = os.environ.get(f'FLICKR_API_KEY_{i}')
        api_secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
        pair = (api_key, api_secret)
        if api_key and api_secret and pair not in credentials:
            credentials.append(pair)
    return credentials


def _photo_info_url(api_key, photo_id):
    """Build the flickr.photos.getInfo REST URL for one photo id."""
    from urllib.parse import urlencode

    # Example (key redacted):
    # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=<API_KEY>&photo_id=6796778203&format=json&nojsoncallback=1
    params = urlencode({
        'method': 'flickr.photos.getInfo',
        'api_key': api_key,
        'photo_id': photo_id,
        'format': 'json',
        'nojsoncallback': 1,
    })
    return f'https://api.flickr.com/services/rest/?{params}'


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input CSV file with columns: query, filepath')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('--query-type', 'opt_query_type', default='photo_id',
              type=click.Choice(query_types),
              help='API query type')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
              help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
        opt_delay, opt_query_type):
    """Fetches Flickr photo info (flickr.photos.getInfo) for each CSV record.
    Saves one JSON file per record.

    For every row of the input CSV, the `query` column is used as the
    Flickr photo id and the `filepath` column as the output path relative
    to the output directory. Rows whose output file, or whose
    `<output>_error.txt` marker, already exists are skipped, so an
    interrupted run can be resumed.

    NOTE: `opt_query_type` is accepted but only photo-id lookups are
    implemented here; the value is currently not used.
    """

    # Imports stay function-local, following the original layout of this command.
    from tqdm import tqdm
    import time
    import json
    from random import choice
    import urllib.request

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source .env vars for Flickr API and try again')
        return

    # check how many flickr keys
    credentials = _collect_api_credentials(opt_api_key, opt_api_secret)
    log.info(f'Shuffling between: {len(credentials)} api keys')

    # read in CSV
    # | query, filepath |
    records = pd.read_csv(opt_fp_in).to_dict('records')
    if opt_slice:
        # the default (None, None) selects every record
        records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    for record in tqdm(records):
        fp_out = join(opt_fp_out, record['filepath'])
        fp_out_err = fp_out + '_error.txt'
        if Path(fp_out).is_file() or Path(fp_out_err).is_file():
            continue

        photo_id = record['query']
        # shuffle the api keys to avoid rate limiting; the secret is not
        # needed for this unauthenticated call
        api_key, _ = choice(credentials)

        try:
            with urllib.request.urlopen(_photo_info_url(api_key, photo_id)) as response:
                data = json.loads(response.read().decode())

            if data['stat'] == 'fail':
                raise Exception('failed')
            elif data['stat'] == 'ok':
                # `filepath` may contain sub-directories that do not exist yet
                Path(fp_out).parent.mkdir(parents=True, exist_ok=True)
                with open(fp_out, 'w', encoding='utf-8') as fp:
                    json.dump(data, fp, sort_keys=True, indent=2)

        except Exception as e:
            # Never log the API secret.
            log.error(f'{e}, {photo_id}, api_key: {api_key}')
            if "not found" in str(e) or 'failed' in str(e):
                # Empty marker file: this record is skipped on the next run.
                Path(fp_out_err).parent.mkdir(parents=True, exist_ok=True)
                with open(fp_out_err, 'w', encoding='utf-8') as fp:
                    fp.write('')

        if opt_delay:
            time.sleep(opt_delay)


"""
df_id_dict.update( {
    'user_name': info.get('username', ''),
    'location': info.get('location', ''),
    'real_name': info.get('realname', ''),
    'time_zone': info.get('timezone', {}).get('timezone_id', ''),
    'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
    'photos_count': info.get('photos_info', {}).get('count'),
    'description': info.get('description', ''),
    'id': info.get('id'),
    'path_alias': info.get('path_alias', ''),
    'is_pro': info.get('ispro', ''),
    'url_photos': info.get('photosurl', ''),
    'url_profile': info.get('photosurl', ''),
    'url_mobile': info.get('mobileurl', ''),
    })
"""

"""
info = photo.getInfo()

# serialize tags
info['tag_names'] = []
info['tag_ids'] = []
tags = info['tags']
for t in tags:
    info['tag_names'].append(str(t.text))
    info['tag_ids'].append(str(t.id))

owner = info['owner']
info['owner_id'] = str(owner.id)
info['owner_username'] = str(owner.username)

info.pop('tags')
info.pop('owner')

"""

"""
Example API data:
{'id': '7124086@N07',
 'nsid': '7124086@N07',
 'ispro': 1,
 'can_buy_pro': 0,
 'iconserver': '2325',
 'iconfarm': 3,
 'path_alias': 'shirleylin',
 'has_stats': '1',
 'pro_badge': 'standard',
 'expire': '0',
 'username': 'ShirleyLin',
 'realname': 'Shirley Lin',
 'location': 'Fremont, California, US',
 'timezone': {'label': 'Pacific Time (US & Canada); Tijuana',
              'offset': '-08:00',
              'timezone_id': 'PST8PDT'},
 'description': '',
 'photosurl': 'https://www.flickr.com/photos/shirleylin/',
 'profileurl': 'https://www.flickr.com/people/shirleylin/',
 'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
 'photos_info': {'firstdatetaken': '2004-05-24 12:12:15',
                 'firstdate': '1172556588',
                 'count': 9665}}
"""

"""
https://www.flickr.com/services/api/explore/flickr.photosets.getPhotos
https://www.flickr.com/services/api/explore/flickr.photos.getInfo
"""
\ No newline at end of file |
