from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter

from app.utils import file_utils, im_utils

# Values accepted by --query.
# NOTE(review): 'album_id' is listed but has no request builder; cli() rejects it.
query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
# ???
# photo_id: 123456789
# flickr_id: 123456789@N01

log = logger_utils.Logger.getLogger()

# Highest N scanned for FLICKR_API_KEY_<N> / FLICKR_API_SECRET_<N> env vars.
_MAX_ENV_KEY_INDEX = 19

# query type -> (Flickr REST method, request parameter name, CSV column).
# Example requests (https://www.flickr.com/services/rest/):
#   ?method=flickr.urls.getUserProfile&api_key=<api_key>&user_id=97556162%40N00
#   ?method=flickr.profile.getProfile&api_key=<api_key>&user_id=53991912%40N00&format=json&nojsoncallback=1
#   ?method=flickr.photos.getInfo&api_key=<api_key>&photo_id=6796778203&format=json&nojsoncallback=1
_FLICKR_QUERIES = {
    'nsid_url': ('flickr.urls.getUserProfile', 'user_id', 'nsid'),
    'nsid_profile': ('flickr.profile.getProfile', 'user_id', 'nsid'),
    'photo_id': ('flickr.photos.getInfo', 'photo_id', 'photo_id'),
}


def _load_api_credentials(max_index=_MAX_ENV_KEY_INDEX):
    """Return the (key, secret) pairs found in the numbered FLICKR_API_* env vars."""
    credentials = []
    for i in range(1, max_index + 1):
        api_key = os.environ.get(f'FLICKR_API_KEY_{i}')
        api_secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
        # a pair only counts when both halves are set and non-empty
        if api_key and api_secret:
            credentials.append((api_key, api_secret))
    return credentials


def _build_flickr_url(query_type, api_key, record):
    """Build the Flickr REST request URL for one CSV record."""
    from urllib.parse import quote_plus

    method, param_name, column = _FLICKR_QUERIES[query_type]
    # NSIDs contain '@', which must be percent-encoded in the query string
    value = quote_plus(str(record[column]))
    flickr_url = f'https://flickr.com/services/rest/?method={method}'
    flickr_url += f'&api_key={api_key}'
    flickr_url += f'&{param_name}={value}'
    flickr_url += '&format=json'
    flickr_url += '&nojsoncallback=1'
    return flickr_url


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input CSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output directory')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('-q', '--query', 'opt_query_type', required=True,
              type=click.Choice(query_types),
              help='API query type')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
              help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
        opt_delay, opt_query_type):
    """Fetches Flickr API for user info. Saves to JSON"""

    # Imports are kept local so that loading the command stays cheap.
    from tqdm import tqdm
    import time
    import json
    from random import choice
    import urllib.request

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source env/flickr.env vars for Flickr API and try again')
        return

    if opt_query_type not in _FLICKR_QUERIES:
        # Without this guard every record would fail on an unbound request URL.
        log.error(f'Query type not implemented: {opt_query_type}')
        return

    # check how many flickr keys
    credentials = _load_api_credentials()
    if (opt_api_key, opt_api_secret) not in credentials:
        # Keys passed on the command line are not in the numbered env vars;
        # include them so the pool is never empty.
        credentials.insert(0, (opt_api_key, opt_api_secret))
    log.info(f'Shuffling between: {len(credentials)} api keys')

    # Column that identifies a record: used for deduplication and file names.
    id_column = _FLICKR_QUERIES[opt_query_type][2]

    # read in CSV
    # | query, filename, count |
    df_records = pd.read_csv(opt_fp_in)
    log.info(f'Dedpuplicating {len(df_records)}')
    df_records = df_records.drop_duplicates(subset=id_column, keep="last")
    log.info(f'After deduplication: {len(df_records)}')
    records = df_records.to_dict('records')
    if opt_slice:
        records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    # Make sure the output directory exists before any result is written.
    Path(opt_fp_out).mkdir(parents=True, exist_ok=True)

    for record in tqdm(records):

        fp_out = join(opt_fp_out, f'{record[id_column]}.json')
        fp_out_err = fp_out + '_error.txt'
        # Skip records already fetched, or already marked as permanently failed.
        if Path(fp_out).is_file() or Path(fp_out_err).is_file():
            continue

        try:
            # shuffle the api keys to avoid rate limiting
            api_key, _ = choice(credentials)
            flickr_url = _build_flickr_url(opt_query_type, api_key, record)

            with urllib.request.urlopen(flickr_url) as response:
                data = json.loads(response.read().decode())

            if data['stat'] == 'fail':
                error_msg = data["message"]
                log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
                if error_msg == 'Service currently unavailable':
                    # back off before the next request
                    time.sleep(10)
                raise RuntimeError(error_msg)
            elif data['stat'] == 'ok':
                with open(fp_out, 'w') as fp:
                    json.dump(data, fp, sort_keys=True, indent=2)

        except Exception as e:
            # Best effort: log the failure and move on to the next record.
            log.error(f'{e}')
            if "not found" in str(e) or 'Invalid NSID provided' in str(e):
                # Empty marker file so a permanent failure is not retried.
                with open(fp_out_err, 'w') as fp:
                    fp.write('')

        if opt_delay:
            time.sleep(opt_delay)


# The string blocks below are reference notes only; they are never executed.

"""
df_id_dict.update( {
  'user_name': info.get('username', ''),
  'location': info.get('location', ''),
  'real_name': info.get('realname', ''),
  'time_zone': info.get('timezone', {}).get('timezone_id', ''),
  'time_first_photo': info.get('photos_info', {}).get('firstdatetaken'),
  'photos_count': info.get('photos_info', {}).get('count'),
  'description': info.get('description', ''),
  'id': info.get('id'),
  'path_alias': info.get('path_alias', ''),
  'is_pro': info.get('ispro', ''),
  'url_photos': info.get('photosurl', ''),
  'url_profile': info.get('photosurl', ''),
  'url_mobile': info.get('mobileurl', ''),
})
"""

"""
info = photo.getInfo()

# serialize tags
info['tag_names'] = []
info['tag_ids'] = []
tags = info['tags']
for t in tags:
  info['tag_names'].append(str(t.text))
  info['tag_ids'].append(str(t.id))

owner = info['owner']
info['owner_id'] = str(owner.id)
info['owner_username'] = str(owner.username)

info.pop('tags')
info.pop('owner')
"""

"""
Example API data:
{'id': '7124086@N07',
'nsid': '7124086@N07',
'ispro': 1,
'can_buy_pro': 0,
'iconserver': '2325',
'iconfarm': 3,
'path_alias': 'shirleylin',
'has_stats': '1',
'pro_badge': 'standard',
'expire': '0',
'username': 'ShirleyLin',
'realname': 'Shirley Lin',
'location': 'Fremont, California, US',
'timezone': {'label': 'Pacific Time (US & Canada); Tijuana', 'offset': '-08:00', 'timezone_id': 'PST8PDT'},
'description': '',
'photosurl': 'https://www.flickr.com/photos/shirleylin/',
'profileurl': 'https://www.flickr.com/people/shirleylin/',
'mobileurl': 'https://m.flickr.com/photostream.gne?id=7102756',
'photos_info': {'firstdatetaken': '2004-05-24 12:12:15', 'firstdate': '1172556588', 'count': 9665}}
"""

"""
https://www.flickr.com/services/api/explore/flickr.photosets.getPhotos
https://www.flickr.com/services/api/explore/flickr.photos.getInfo
"""