summaryrefslogtreecommitdiff
path: root/megapixels/commands/msc/append_embassy_profile.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/msc/append_embassy_profile.py')
-rw-r--r-- megapixels/commands/msc/append_embassy_profile.py 150
1 files changed, 150 insertions, 0 deletions
diff --git a/megapixels/commands/msc/append_embassy_profile.py b/megapixels/commands/msc/append_embassy_profile.py
new file mode 100644
index 00000000..7d301f06
--- /dev/null
+++ b/megapixels/commands/msc/append_embassy_profile.py
@@ -0,0 +1,150 @@
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+log = logger_utils.Logger.getLogger()  # module-level logger used by cli() below
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input CSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
+@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
+@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
+  help='Delay between API calls to prevent rate-limiting')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
+  opt_delay):
+  """Fetches Flickr profiles for embassy accounts. Saves to CSV"""
+  # NOTE: click prints the docstring as --help text, so details live here.
+  # opt_fp_in: CSV with at least the 'url' and 'nsid' columns
+  # opt_fp_out: CSV path; receives the input rows plus the profile fields
+  # opt_slice: (start, end) applied to the de-duplicated rows
+  # opt_api_key / opt_api_secret: fallback credentials when no numbered
+  #   FLICKR_API_KEY_<i> / FLICKR_API_SECRET_<i> pair is found in the env
+  # opt_delay: seconds to sleep after each API request (None/0 disables)
+
+  from tqdm import tqdm
+  import time
+  import json
+  from random import choice
+  import urllib.parse
+  import urllib.request
+
+  # -------------------------------------------------
+  # process
+
+  if not opt_api_key or not opt_api_secret:
+    log.error('source ../env/flickr.env vars for Flickr API and try again')
+    return
+
+  # collect every numbered key whose matching secret is also set
+  api_keys = []
+  for i in range(1, 20):
+    api_key = os.environ.get(f'FLICKR_API_KEY_{i}')
+    if api_key and os.environ.get(f'FLICKR_API_SECRET_{i}'):
+      api_keys.append(api_key)
+  if not api_keys:
+    # credentials were passed on the command line instead of the env
+    api_keys.append(opt_api_key)
+
+  log.info(f'Shuffling between: {len(api_keys)} api keys')
+
+  # read in CSV
+  # | username, ... |
+  df_records = pd.read_csv(opt_fp_in)
+  log.info(f'Dedpuplicating {len(df_records)}')
+  df_records = df_records.drop_duplicates(subset='url', keep="last")
+  log.info(f'Dedpuplicated {len(df_records)}')
+  records = df_records.to_dict('records')
+
+  if opt_slice:
+    records = records[opt_slice[0]:opt_slice[1]]
+
+  log.info('Processing: {:,} items'.format(len(records)))
+
+  # profile fields copied onto each record
+  tags = ['join_date', 'occupation', 'hometown', 'first_name', 'last_name']
+  tags += ['profile_description', 'city', 'country', 'twitter', 'facebook', 'instagram']
+
+  for record in tqdm(records):
+    nsid = record.get('nsid', None)
+    if not nsid or pd.isna(nsid):
+      # empty CSV cells are read back as NaN, which is truthy
+      log.warn(f'No NSID for {record["url"]}')
+      continue
+    # reset per record so the error log never shows a stale or unbound URL
+    error_msg = ''
+    flickr_url = ''
+    try:
+      # shuffle the api keys to avoid rate limiting
+      api_key = choice(api_keys)
+      nsid_encoded = urllib.parse.quote_plus(nsid)
+      flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+      flickr_url += f'&api_key={api_key}'
+      flickr_url += f'&user_id={nsid_encoded}'
+      flickr_url += '&format=json'
+      flickr_url += '&nojsoncallback=1'
+
+      with urllib.request.urlopen(flickr_url) as url:
+        data = json.loads(url.read().decode())
+
+      if data['stat'] == 'fail':
+        error_msg = data["message"]
+        raise Exception(error_msg)
+      elif data['stat'] == 'ok':
+        profile = data.get('profile')
+        # append data; setdefault keeps any value already on the record
+        for tag in tags:
+          record.setdefault(tag, profile.get(tag))
+    except Exception as e:
+      log.error(f'Exception: {e}, message: {error_msg}, url: {flickr_url}')
+
+    if opt_delay:  # honour --delay between API calls
+      time.sleep(opt_delay)
+
+  # write data
+  df_records = pd.DataFrame.from_dict(records)
+  df_records.to_csv(opt_fp_out, index=False)
+
+
+"""
+{
+ "profile": {
+ "id": "129819216@N03",
+ "nsid": "129819216@N03",
+ "join_date": "1417769829",
+ "occupation": "",
+ "hometown": "",
+ "showcase_set": "72157680742231281",
+ "showcase_set_title": "Profile Showcase",
+ "first_name": "Ambasciata",
+ "last_name": "d'Italia a Praga",
+ "profile_description": "",
+ "city": "",
+ "country": "",
+ "facebook": "",
+ "twitter": "",
+ "tumblr": "",
+ "instagram": "",
+ "pinterest": ""
+ },
+ "stat": "ok"
+}
+"""
\ No newline at end of file