summary | refs | log | tree | commit | diff
path: root/megapixels/commands/datasets/flickr_api.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/flickr_api.py')
-rw-r--r-- megapixels/commands/datasets/flickr_api.py 84
1 files changed, 59 insertions, 25 deletions
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
index 780ede49..f09f3089 100644
--- a/megapixels/commands/datasets/flickr_api.py
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils
-query_types = ['photo_id', 'album_id', 'flickr_id']
+query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
+# Example ID formats (TODO: confirm which query type each one applies to)
+# photo_id: 123456789
# flickr_id: 123456789@N01
-# photo_id:
log = logger_utils.Logger.getLogger()
@@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger()
help='Output directory')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
help='Slice list of files')
-@click.option('--query-type', 'opt_query_type', default='photo_id',
+@click.option('-q', '--query', 'opt_query_type', required=True,
type=click.Choice(query_types),
help='API query type')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
# process
if not opt_api_key or not opt_api_secret:
- log.error('source .env vars for Flickr API and try again')
+ log.error('source env/flickr.env vars for Flickr API and try again')
return
# check how many flickr keys
api_keys = []
api_secrets = []
- for i in range(1,10):
+ for i in range(1,20):
try:
var_name_key = f'FLICKR_API_KEY_{i}'
var_name_secret = f'FLICKR_API_SECRET_{i}'
@@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
log.info(f'Shuffling between: {len(api_keys)} api keys')
# read in CSV
- # | query, filepath |
-
- records = pd.read_csv(opt_fp_in).to_dict('records')
+ # | query, filename, count |
+ df_records = pd.read_csv(opt_fp_in)
+ log.info(f'Dedpuplicating {len(df_records)}')
+ if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile':
+ df_records = df_records.drop_duplicates(subset='nsid', keep="last")
+ else:
+ df_records = df_records.drop_duplicates(subset='photo_id', keep="last")
+ log.info(f'After deduplication: {len(df_records)}')
+ records = df_records.to_dict('records')
+
if opt_slice:
records = records[opt_slice[0]:opt_slice[1]]
@@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
for record in tqdm(records):
- fp_out = join(opt_fp_out, record['filepath'])
+ if 'nsid' in opt_query_type:
+ fp_out = join(opt_fp_out, f"{record['nsid']}.json")
+ else:
+ fp_out = join(opt_fp_out, f'{record["photo_id"]}.json')
+
fp_out_err = fp_out + '_error.txt'
if Path(fp_out).is_file() or Path(fp_out_err).is_file():
continue
- # append relevant data
try:
# shuffle the api keys to avoid rate limiting
rand_int = randint(0,len(api_keys)-1)
api_key = api_keys[rand_int]
api_secret = api_secrets[rand_int]
-
- #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
- #photo = flickr_api.Photo(id=record['query'])
- # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
- photo_id = record['query']
- flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
- flickr_url += f'&api_key={api_key}'
- flickr_url += f'&photo_id={photo_id}'
- flickr_url += '&format=json'
- flickr_url += '&nojsoncallback=1'
+ # https://www.flickr.com/services/rest/
+ if opt_query_type == 'nsid_url':
+ # ?method=flickr.urls.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+ # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+ nsid = record['nsid']
+ nsid_encoded = urllib.parse.quote_plus(nsid)
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&user_id={nsid_encoded}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
+ # https://www.flickr.com/services/rest/
+ if opt_query_type == 'nsid_profile':
+ # ?method=flickr.profile.getProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+ # ?method=flickr.profile.getProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+ nsid = record['nsid']
+ nsid_encoded = urllib.parse.quote_plus(nsid)
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&user_id={nsid_encoded}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
+ elif opt_query_type == 'photo_id':
+ # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+ fp_out
+ photo_id = record['photo_id']
+ flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo'
+ flickr_url += f'&api_key={api_key}'
+ flickr_url += f'&photo_id={photo_id}'
+ flickr_url += '&format=json'
+ flickr_url += '&nojsoncallback=1'
with urllib.request.urlopen(flickr_url) as url:
data = json.loads(url.read().decode())
if data['stat'] =='fail':
- raise Exception('failed')
+ error_msg = data["message"]
+ log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
+ if error_msg == 'Service currently unavailable':
+ time.sleep(10)
+ raise Exception(error_msg)
elif data['stat'] =='ok':
with open(fp_out, 'w') as fp:
json.dump(data, fp, sort_keys=True, indent=2)
- #except FlickrAPIError as e:
except Exception as e:
- # if "HTTP Server Error 500" in str(e):
- log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
- if "not found" in str(e) or 'failed' in str(e):
+ log.error(f'{e}')
+ if "not found" in str(e) or 'Invalid NSID provided' in str(e):
with open(fp_out_err, 'w') as fp:
fp.write('')