diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/datasets/flickr_api.py | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
Diffstat (limited to 'megapixels/commands/datasets/flickr_api.py')
| -rw-r--r-- | megapixels/commands/datasets/flickr_api.py | 84 |
1 files changed, 59 insertions, 25 deletions
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
index 780ede49..f09f3089 100644
--- a/megapixels/commands/datasets/flickr_api.py
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter
 from app.utils import file_utils, im_utils
 
 
-query_types = ['photo_id', 'album_id', 'flickr_id']
+query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
+# ???
+# photo_id: 123456789
 # flickr_id: 123456789@N01
-# photo_id:
 
 log = logger_utils.Logger.getLogger()
 
@@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger()
   help='Output directory')
 @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
   help='Slice list of files')
-@click.option('--query-type', 'opt_query_type', default='photo_id',
+@click.option('-q', '--query', 'opt_query_type', required=True,
   type=click.Choice(query_types),
   help='API query type')
 @click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
 
   # process
   if not opt_api_key or not opt_api_secret:
-    log.error('source .env vars for Flickr API and try again')
+    log.error('source env/flickr.env vars for Flickr API and try again')
     return
 
   # check how many flickr keys
   api_keys = []
   api_secrets = []
-  for i in range(1,10):
+  for i in range(1,20):
     try:
       var_name_key = f'FLICKR_API_KEY_{i}'
       var_name_secret = f'FLICKR_API_SECRET_{i}'
@@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   log.info(f'Shuffling between: {len(api_keys)} api keys')
 
   # read in CSV
-  # | query, filepath |
-
-  records = pd.read_csv(opt_fp_in).to_dict('records')
+  # | query, filename, count |
+  df_records = pd.read_csv(opt_fp_in)
+  log.info(f'Dedpuplicating {len(df_records)}')
+  if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile':
+    df_records = df_records.drop_duplicates(subset='nsid', keep="last")
+  else:
+    df_records = df_records.drop_duplicates(subset='photo_id', keep="last")
+  log.info(f'After deduplication: {len(df_records)}')
+  records = df_records.to_dict('records')
+
 
   if opt_slice:
     records = records[opt_slice[0]:opt_slice[1]]
@@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
 
   for record in tqdm(records):
 
-    fp_out = join(opt_fp_out, record['filepath'])
+    if 'nsid' in opt_query_type:
+      fp_out = join(opt_fp_out, f"{record['nsid']}.json")
+    else:
+      fp_out = join(opt_fp_out, f'{record["photo_id"]}.json')
+
     fp_out_err = fp_out + '_error.txt'
     if Path(fp_out).is_file() or Path(fp_out_err).is_file():
       continue
 
-    # append relevant data
     try:
       # shuffle the api keys to avoid rate limiting
       rand_int = randint(0,len(api_keys)-1)
       api_key = api_keys[rand_int]
       api_secret = api_secrets[rand_int]
-
-      #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
-      #photo = flickr_api.Photo(id=record['query'])
-      # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
-      photo_id = record['query']
-      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
-      flickr_url += f'&api_key={api_key}'
-      flickr_url += f'&photo_id={photo_id}'
-      flickr_url += '&format=json'
-      flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_url':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_profile':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      elif opt_query_type == 'photo_id':
+        # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+        fp_out
+        photo_id = record['photo_id']
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&photo_id={photo_id}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
 
       with urllib.request.urlopen(flickr_url) as url:
         data = json.loads(url.read().decode())
 
       if data['stat'] =='fail':
-        raise Exception('failed')
+        error_msg = data["message"]
+        log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
+        if error_msg == 'Service currently unavailable':
+          time.sleep(10)
+        raise Exception(error_msg)
       elif data['stat'] =='ok':
         with open(fp_out, 'w') as fp:
           json.dump(data, fp, sort_keys=True, indent=2)
 
-    #except FlickrAPIError as e:
     except Exception as e:
-      # if "HTTP Server Error 500" in str(e):
-      log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
-      if "not found" in str(e) or 'failed' in str(e):
+      log.error(f'{e}')
+      if "not found" in str(e) or 'Invalid NSID provided' in str(e):
         with open(fp_out_err, 'w') as fp:
           fp.write('')
 
|
