from glob import glob
import os
from os.path import join
from pathlib import Path

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter

from app.utils import file_utils, im_utils

log = logger_utils.Logger.getLogger()


def _collect_api_credentials(fallback_key, fallback_secret):
    """Return a list of (api_key, api_secret) pairs to rotate between.

    Reads FLICKR_API_KEY_1..9 / FLICKR_API_SECRET_1..9 from the environment
    and keeps every index where both values are set and non-empty. When no
    complete pair is found in the environment, the pair given on the command
    line is used so the pool is never empty.
    """
    credentials = []
    for i in range(1, 10):
        api_key = os.environ.get(f'FLICKR_API_KEY_{i}')
        api_secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
        if api_key and api_secret:
            credentials.append((api_key, api_secret))
    if not credentials and fallback_key and fallback_secret:
        credentials.append((fallback_key, fallback_secret))
    return credentials


def _build_lookup_url(profile_url, api_key):
    """Return the flickr.urls.lookupUser REST URL for a Flickr profile URL."""
    from urllib.parse import quote_plus

    # https://www.flickr.com/services/rest/
    # ?method=flickr.urls.lookupUser&api_key=xxx&url=[encoded url]&format=json&nojsoncallback=1
    flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.urls.lookupUser'
    flickr_url += f'&api_key={api_key}'
    flickr_url += f'&url={quote_plus(profile_url)}'
    flickr_url += '&format=json'
    flickr_url += '&nojsoncallback=1'
    return flickr_url


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
  help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, opt_delay):
    """Looks up the Flickr NSID and username for each URL in a CSV. Saves to CSV"""
    # NOTE: the docstring above doubles as the click --help text, so the
    # parameter documentation lives in this comment instead.
    #
    # opt_fp_in:      path passed to pd.read_csv; must contain a 'url' column.
    #                 (The option help text says "directory", but the value is
    #                 read as a CSV file.)
    # opt_fp_out:     path the resulting records are written to as CSV.
    # opt_slice:      (start, end) applied as records[start:end].
    # opt_api_key /
    # opt_api_secret: must both be set, otherwise the command logs an error
    #                 and returns; used as the credential pool only when no
    #                 numbered FLICKR_API_KEY_n / FLICKR_API_SECRET_n pair
    #                 is set in the environment.
    # opt_delay:      seconds to sleep after each API request.

    # imports are kept function-local, as in the rest of this command
    from tqdm import tqdm
    import time
    import json
    import random
    import urllib.request

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source ../env/flickr.env vars for Flickr API and try again')
        return

    # check how many flickr keys
    credentials = _collect_api_credentials(opt_api_key, opt_api_secret)
    log.info(f'Shuffling between: {len(credentials)} api keys')

    # read in CSV
    # | username, ... |
    # only the 'url' column is required here; an already-filled 'nsid'
    # column marks a row as resolved
    df_records = pd.read_csv(opt_fp_in)
    log.info(f'Dedpuplicating {len(df_records)}')
    df_records = df_records.drop_duplicates(subset='url', keep="last")
    log.info(f'Dedpuplicated {len(df_records)}')
    records = df_records.to_dict('records')
    if opt_slice:
        records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    for record in tqdm(records):
        # skip rows that already have an NSID; blank CSV cells are read as
        # NaN, which is truthy, so it has to be excluded explicitly
        nsid = record.get('nsid', None)
        if nsid and pd.notna(nsid):
            continue

        # bound before the try so the except handler can always log it
        flickr_url = None
        try:
            # shuffle the api keys to avoid rate limiting
            # (only the key is sent with this request; the secret is unused)
            api_key, _api_secret = random.choice(credentials)

            # Example response:
            # {
            #   "user": {
            #     "id": "46768316@N07",
            #     "username": {
            #       "_content": "U.S. Embassy Tirana Art Contest"
            #     }
            #   },
            #   "stat": "ok"
            # }
            flickr_url = _build_lookup_url(record['url'], api_key)
            # NOTE(review): the logged URL contains the API key
            log.debug(f'{flickr_url}')
            with urllib.request.urlopen(flickr_url) as response:
                data = json.loads(response.read().decode())

            if data['stat'] == 'fail':
                error_msg = data["message"]
                log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
                raise Exception(error_msg)
            elif data['stat'] == 'ok':
                user_data = data.get('user')
                record['nsid'] = user_data.get('id')
                record['username'] = user_data.get('username').get('_content')
        except Exception as e:
            # best-effort: log the failure and move on to the next record
            log.error(f'Exception: {e}, url: {flickr_url}')

        # pause between API calls when a delay was requested
        if opt_delay:
            time.sleep(opt_delay)

    # write data
    df_records = pd.DataFrame.from_dict(records)
    df_records.to_csv(opt_fp_out)