diff options
Diffstat (limited to 'megapixels/commands/msc/append_embassy_profile.py')
| -rw-r--r-- | megapixels/commands/msc/append_embassy_profile.py | 150 |
1 files changed, 150 insertions, 0 deletions
"""Append Flickr profile fields to a CSV of embassy accounts.

Reads a CSV that has at least `url` and `nsid` columns, calls the Flickr REST
method `flickr.profile.getProfile` for every NSID and writes the rows, with
the profile fields appended, to a new CSV.
"""

from glob import glob
import json
import os
from os.path import join
from pathlib import Path
import random
import time
import urllib.parse
import urllib.request

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils import logger_utils

import pandas as pd
from PIL import Image, ImageOps, ImageFilter
from app.utils import file_utils, im_utils


log = logger_utils.Logger.getLogger()

_FLICKR_REST_URL = 'https://flickr.com/services/rest/'

# Profile fields copied from the API response onto each record.
_PROFILE_TAGS = (
    'join_date', 'occupation', 'hometown', 'first_name', 'last_name',
    'profile_description', 'city', 'country', 'twitter', 'facebook',
    'instagram',
)


class _FlickrResponseError(Exception):
    """Raised when the Flickr API answers with a non-"ok" status."""


def _load_api_keys(fallback_key, max_keys=19):
    """Return the API keys found in FLICKR_API_KEY_<i> for i in 1..max_keys.

    A key is only used when the matching FLICKR_API_SECRET_<i> is set as
    well. When no complete pair is present in the environment (e.g. the key
    was passed with --api_key), `fallback_key` is returned as the only entry
    so the caller always has at least one key to choose from.
    """
    keys = []
    for i in range(1, max_keys + 1):
        key = os.environ.get(f'FLICKR_API_KEY_{i}')
        secret = os.environ.get(f'FLICKR_API_SECRET_{i}')
        if key and secret:
            keys.append(key)
    return keys or [fallback_key]


def _build_profile_url(api_key, nsid):
    """Return the flickr.profile.getProfile request URL for `nsid`."""
    query = urllib.parse.urlencode({
        'method': 'flickr.profile.getProfile',
        'api_key': api_key,
        'user_id': nsid,
        'format': 'json',
        'nojsoncallback': 1,
    })
    return f'{_FLICKR_REST_URL}?{query}'


def _fetch_profile(flickr_url):
    """Request `flickr_url` and return the `profile` object of the response.

    Returns None when the response is "ok" but carries no profile.
    Raises _FlickrResponseError when the API reports a failure or an
    unexpected status; network and JSON decoding errors propagate unchanged.
    """
    with urllib.request.urlopen(flickr_url) as response:
        data = json.loads(response.read().decode())

    stat = data.get('stat')
    if stat == 'fail':
        raise _FlickrResponseError(data.get('message', 'unknown error'))
    if stat != 'ok':
        raise _FlickrResponseError(f'unexpected response status: {stat!r}')
    return data.get('profile')


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input CSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output file')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@click.option('--api_secret', 'opt_api_secret', envvar='FLICKR_API_SECRET_1')
@click.option('-d', '--delay', 'opt_delay', default=None, type=float,
              help='Delay between API calls to prevent rate-limiting')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
        opt_delay):
    """Fetches Flickr profile data for embassy accounts.

    Reads the input CSV, appends the profile fields returned by
    flickr.profile.getProfile to every row that has an NSID and saves the
    result to CSV. Rows whose lookup fails are kept without the extra fields.
    """

    from tqdm import tqdm

    # -------------------------------------------------
    # process

    if not opt_api_key or not opt_api_secret:
        log.error('source ../env/flickr.env vars for Flickr API and try again')
        return

    # check how many flickr keys
    api_keys = _load_api_keys(opt_api_key)
    log.info(f'Shuffling between: {len(api_keys)} api keys')

    # read in CSV
    # | username, ... |
    df_records = pd.read_csv(opt_fp_in)
    log.info(f'Dedpuplicating {len(df_records)}')
    df_records = df_records.drop_duplicates(subset='url', keep="last")
    log.info(f'Dedpuplicated {len(df_records)}')
    records = df_records.to_dict('records')

    if opt_slice:
        records = records[opt_slice[0]:opt_slice[1]]

    log.info('Processing: {:,} items'.format(len(records)))

    for record in tqdm(records):
        nsid = record.get('nsid')
        # Empty CSV cells arrive from pandas as NaN, which is truthy, so a
        # plain truthiness test would let them through.
        if nsid is None or pd.isna(nsid) or not str(nsid).strip():
            log.warn(f'No NSID for {record["url"]}')
            continue

        # pick a random api key per request to avoid rate limiting; the URL
        # is built before the try so the error log always has the right one
        flickr_url = _build_profile_url(random.choice(api_keys), nsid)

        try:
            profile = _fetch_profile(flickr_url)
        except Exception as e:
            # Best effort: one failed lookup must not abort the whole batch.
            error_msg = str(e) if isinstance(e, _FlickrResponseError) else ''
            log.error(f'Exception: {e}, message: {error_msg}, url: {flickr_url}')
        else:
            if profile:
                # append data without overwriting columns already present
                for tag in _PROFILE_TAGS:
                    record.setdefault(tag, profile.get(tag))

        if opt_delay:
            time.sleep(opt_delay)

    # write data
    df_records = pd.DataFrame.from_dict(records)
    df_records.to_csv(opt_fp_out, index=False)


"""
{
  "profile": {
    "id": "129819216@N03",
    "nsid": "129819216@N03",
    "join_date": "1417769829",
    "occupation": "",
    "hometown": "",
    "showcase_set": "72157680742231281",
    "showcase_set_title": "Profile Showcase",
    "first_name": "Ambasciata",
    "last_name": "d'Italia a Praga",
    "profile_description": "",
    "city": "",
    "country": "",
    "facebook": "",
    "twitter": "",
    "tumblr": "",
    "instagram": "",
    "pinterest": ""
  },
  "stat": "ok"
}
"""
\ No newline at end of file |
