diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/datasets | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
Diffstat (limited to 'megapixels/commands/datasets')
| -rw-r--r-- | megapixels/commands/datasets/download_ibmdif.py | 11 | ||||
| -rw-r--r-- | megapixels/commands/datasets/download_images.py | 9 | ||||
| -rw-r--r-- | megapixels/commands/datasets/flickr_api.py | 84 | ||||
| -rw-r--r-- | megapixels/commands/datasets/flickr_api_to_csv.py | 382 |
4 files changed, 454 insertions, 32 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py index ed717662..0b81fef6 100644 --- a/megapixels/commands/datasets/download_ibmdif.py +++ b/megapixels/commands/datasets/download_ibmdif.py @@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t help='Output path') @click.option('-t', '--threads', 'opt_threads', default=8, help='Number of threads') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') @click.option('--agents', 'opt_fp_agents', default=fp_user_agents) @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents): """Threaded image/file downloader""" """ @@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): urllib.request.urlretrieve(item['url'], fp_out) item['status'] = True except Exception as e: + log.debug(f'Failed: user: {item["username"]}, url: {url}') if str(e) != 'HTTP Error 403: Forbidden': log.debug(f'Error: {e}') fp_error = f'{fp_out}_error.txt' @@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): # setup multithreading data holders log.debug(f'loading {opt_fp_in}') df_records = pd.read_csv(opt_fp_in) + if opt_slice: + df_records = df_records[opt_slice[0]:opt_slice[1]] log.debug(f'loaded {len(df_records):,} csv records') log.debug('deduplicating') df_records = df_records.drop_duplicates(subset='sha256', keep="last") @@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): for x in tqdm(records): sha256 = x['sha256'] - + username = x['username'] fp_dst = join(opt_fp_out, f"{sha256}.json") fp_dst_is_file = Path(fp_dst).is_file() fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file() @@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): if not (fp_dst_is_file or fp_dst_is_err): url = url_prefix + sha256 + '.json' user_agent = user_agents[randint(0, len(user_agents)) - 1] - pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent}) + pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent}) else: n_skipped += 1 diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py index c64afbba..45ca8f6e 100644 --- a/megapixels/commands/datasets/download_images.py +++ b/megapixels/commands/datasets/download_images.py @@ -6,9 +6,9 @@ import click help='Input') @click.option('-o', '--output', 'opt_fp_out', required=True, help='Output') -@click.option('-t', '--threads', 'opt_threads', default=8, +@click.option('-t', '--threads', 'opt_threads', default=8, show_default=True, help='Number of threads') -@click.option('--wayback', 'opt_wayback', is_flag=True, +@click.option('--wayback', 'opt_wayback', is_flag=True, default=False, help='Check Wayback archive for URL and download cached image') @click.pass_context def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): @@ -52,7 +52,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): estr = str(e) if item['opt_wayback']: if 'HTTP Error' in estr: - # check + # TODO add/parse/handle request for wayback machine archive url_wayback = url_wayback_base + item['url'] fp_error = f'{fp_out}_error.txt' with open(fp_error, 'w') as fp: @@ -67,6 +67,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): pool_items = [] + log.debug(f'Initializing multithreaded pool...') for x in tqdm(records): fp_dst = join(opt_fp_out, x['filepath']) fp_dst_is_file = Path(fp_dst).is_file() @@ -75,7 +76,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback): pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback}) num_items = len(pool_items) - log.info(f'processing {num_items:,} items') + log.info(f'Going to download {num_items:,} files') pool_results = [] # run the multithreading with progress bar diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py index 780ede49..f09f3089 100644 --- a/megapixels/commands/datasets/flickr_api.py +++ b/megapixels/commands/datasets/flickr_api.py @@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter from app.utils import file_utils, im_utils -query_types = ['photo_id', 'album_id', 'flickr_id'] +query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile'] +# ??? +# photo_id: 123456789 # flickr_id: 123456789@N01 -# photo_id: log = logger_utils.Logger.getLogger() @@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger() help='Output directory') @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), help='Slice list of files') -@click.option('--query-type', 'opt_query_type', default='photo_id', +@click.option('-q', '--query', 'opt_query_type', required=True, type=click.Choice(query_types), help='API query type') @click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1') @@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, # process if not opt_api_key or not opt_api_secret: - log.error('source .env vars for Flickr API and try again') + log.error('source env/flickr.env vars for Flickr API and try again') return # check how many flickr keys api_keys = [] api_secrets = [] - for i in range(1,10): + for i in range(1,20): try: var_name_key = f'FLICKR_API_KEY_{i}' var_name_secret = f'FLICKR_API_SECRET_{i}' @@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, log.info(f'Shuffling between: {len(api_keys)} api keys') # read in CSV - # | query, filepath | - - records = pd.read_csv(opt_fp_in).to_dict('records') + # | query, filename, count | + df_records = pd.read_csv(opt_fp_in) + log.info(f'Dedpuplicating {len(df_records)}') + if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile': + df_records = df_records.drop_duplicates(subset='nsid', keep="last") + else: + df_records = df_records.drop_duplicates(subset='photo_id', keep="last") + log.info(f'After deduplication: {len(df_records)}') + records = df_records.to_dict('records') + if opt_slice: records = records[opt_slice[0]:opt_slice[1]] @@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret, for record in tqdm(records): - fp_out = join(opt_fp_out, record['filepath']) + if 'nsid' in opt_query_type: + fp_out = join(opt_fp_out, f"{record['nsid']}.json") + else: + fp_out = join(opt_fp_out, f'{record["photo_id"]}.json') + fp_out_err = fp_out + '_error.txt' if Path(fp_out).is_file() or Path(fp_out_err).is_file(): continue - # append relevant data try: # shuffle the api keys to avoid rate limiting rand_int = randint(0,len(api_keys)-1) api_key = api_keys[rand_int] api_secret = api_secrets[rand_int] - - #flickr_api.set_keys(api_key=api_key, api_secret=api_secret) - #photo = flickr_api.Photo(id=record['query']) - # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1 - photo_id = record['query'] - flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo' - flickr_url += f'&api_key={api_key}' - flickr_url += f'&photo_id={photo_id}' - flickr_url += '&format=json' - flickr_url += '&nojsoncallback=1' + # https://www.flickr.com/services/rest/ + if opt_query_type == 'nsid_url': + # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1 + # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00 + nsid = record['nsid'] + nsid_encoded = urllib.parse.quote_plus(nsid) + flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&user_id={nsid_encoded}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' + # https://www.flickr.com/services/rest/ + if opt_query_type == 'nsid_profile': + # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1 + # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00 + nsid = record['nsid'] + nsid_encoded = urllib.parse.quote_plus(nsid) + flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&user_id={nsid_encoded}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' + elif opt_query_type == 'photo_id': + # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1 + fp_out + photo_id = record['photo_id'] + flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo' + flickr_url += f'&api_key={api_key}' + flickr_url += f'&photo_id={photo_id}' + flickr_url += '&format=json' + flickr_url += '&nojsoncallback=1' with urllib.request.urlopen(flickr_url) as url: data = json.loads(url.read().decode()) if data['stat'] =='fail': - raise Exception('failed') + error_msg = data["message"] + log.error(f'Failed. Message: {error_msg}, url: {flickr_url}') + if error_msg == 'Service currently unavailable': + time.sleep(10) + raise Exception(error_msg) elif data['stat'] =='ok': with open(fp_out, 'w') as fp: json.dump(data, fp, sort_keys=True, indent=2) - #except FlickrAPIError as e: except Exception as e: - # if "HTTP Server Error 500" in str(e): - log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}') - if "not found" in str(e) or 'failed' in str(e): + log.error(f'{e}') + if "not found" in str(e) or 'Invalid NSID provided' in str(e): with open(fp_out_err, 'w') as fp: fp.write('') diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py new file mode 100644 index 00000000..5b5f0ce3 --- /dev/null +++ b/megapixels/commands/datasets/flickr_api_to_csv.py @@ -0,0 +1,382 @@ +""" +Converts directory of JSON API output files to CSV format +""" + +from glob import glob +import os +from os.path import join +from pathlib import Path + +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg +from app.utils import logger_utils + +import pandas as pd +from PIL import Image, ImageOps, ImageFilter +from app.utils import file_utils, im_utils + + +query_types = ['nsid_profile', 'nsid_url', 'photo_id'] + +log = logger_utils.Logger.getLogger() + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite') +@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True) +@click.pass_context +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type): + """Fetches Flickr API for user info. Saves to JSON""" + + from tqdm import tqdm + from glob import glob + import json + + + # ------------------------------------------------- + # process + if Path(opt_fp_out).is_file() and not opt_force: + log.error('File exists. Use "--force" to overwrite it') + return + + fp_files = glob(join(opt_fp_in, '*.json')) + fp_files = [f for f in fp_files if 'error' not in f] + if opt_slice: + fp_files = fp_files[opt_slice[0]:opt_slice[1]] + + log.debug(f'Found {len(fp_files)} files') + items = [] + + for fp_file in tqdm(fp_files): + + if opt_query_type == 'photo_id': + try: + photo = file_utils.load_json(fp_file).get('photo') + except Exception as e: + log.error(f'{e}, skipping: {fp_file}') + continue + dates = photo.get('dates') + posted = dates.get('posted') + taken = dates.get('taken') + description = photo.get('description').get('_content') + location = photo.get('location', {}) + country = location.get('country', {}) + location_country = country.get('_country', '') + location_place = country.get('place_id', '') + location_woeid = country.get('woeid', '') + location_lat = location.get('latitude', '') + location_lon = location.get('longitude', '') + location_place_id = location.get('place_id', '') + owner = photo.get('owner') + nsid = owner.get('nsid') + path_alias = owner.get('path_alias') + owner_realname = owner.get('realname') + owner_username = owner.get('username') + owner_location = owner.get('location') + photo_id = Path(fp_file).stem + server = photo.get('server') + farm = photo.get('farm') + secret = photo.get('secret') + # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg + image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg' + + obj = { + 'posted': posted, + 'taken': taken, + 'description': description, + 'country': location_country, + 'place': location_place, + 'woeid': location_woeid, + 'lat': location_lat, + 'lon': location_lon, + 'place_id': location_place_id, + 'nsid': nsid, + 'path_alias': path_alias, + 'realname': owner_realname, + 'username': owner_username, + 'owner_location': owner_location, + 'photo_id': photo_id, + 'secret': secret, + 'url': image_url + } + + + elif opt_query_type == 'nsid_profile': + obj = file_utils.load_json(fp_file).get('profile') + obj.pop('showcase_set') + obj.pop('showcase_set_title') + obj.pop('pinterest') + obj.pop('tumblr') + elif opt_query_type == 'nsid_url': + obj = file_utils.load_json(fp_file).get('user') + elif opt_query_type == 'user_profile': + metadata = file_utils.load_json(fp_file).get('photo') + owner = metadata.get('owner') + path_alias = owner.get('path_alias') + nsid = owner.get('nsid') + username = owner.get('username') + realname = owner.get('realname') + description = metadata.get('description').get('_content') + title = metadata.get('title').get('_content') + location = metadata.get('location') + dates = metadata.get('dates') + date_taken = dates.get('taken') + date_posted = dates.get('posted') + fname = Path(fp_file).stem + obj = { + 'photo_id': fname, + 'nsid': nsid, + 'path_alias': path_alias, + 'username': username, + 'realname': realname, + 'title': title, + 'description': description, + 'location': location, + 'date_taken': date_taken, + 'date_posted': date_posted + } + + items.append(obj) + + # conver to DataFrame + df = pd.DataFrame.from_dict(items) + df.to_csv(opt_fp_out, index=False) + log.info(f'Wrote {len(df)} to {opt_fp_out}') + +""" +nsid_url + { + "stat": "ok", + "user": { + "nsid": "7153718@N04", + "url": "https://www.flickr.com/people/babyfish4/" + } +} +""" +""" + location: of the owner + dateuploaded + license + "dates": + "lastupdate": "1416447096" + "posted": "1112900873" + "taken": "2005-04-06 18:37:38" + description: + _content: playing in a field + title: + _content: jessica + location: cornwall, uk +""" + +""" + { + "profile": { + "city": null, + "country": null, + "facebook": "", + "first_name": null, + "hometown": "", + "id": "7153718@N04", + "instagram": "", + "join_date": "1172669959", + "last_name": null, + "nsid": "7153718@N04", + "occupation": "", + "pinterest": "", + "profile_description": "", + "showcase_set": "72157680616398790", + "showcase_set_title": "Profile Showcase", + "tumblr": "", + "twitter": "" + }, + "stat": "ok" +} +""" + +""" +photo_id + + + { + "photo": { + "comments": { + "_content": "0" + }, + "dates": { + "lastupdate": "0", + "posted": "1094612969", + "taken": "2004-09-04 22:41:18", + "takengranularity": "0", + "takenunknown": 0 + }, + "dateuploaded": "1094612969", + "description": { + "_content": "" + }, + "editability": { + "canaddmeta": 0, + "cancomment": 0 + }, + "farm": 1, + "geoperms": { + "iscontact": 0, + "isfamily": 0, + "isfriend": 0, + "ispublic": 1 + }, + "id": "371498", + "isfavorite": 0, + "license": "1", + "location": { + "accuracy": "15", + "context": "0", + "country": { + "_content": "United States", + "place_id": "nz.gsghTUb4c2WAecA", + "woeid": "23424977" + }, + "county": { + "_content": "Tompkins", + "place_id": "1uCJJtBQUL80G6hbPw", + "woeid": "12589366" + }, + "latitude": "42.399028", + "longitude": "-76.652519", + "place_id": "1uCJJtBQUL80G6hbPw", + "region": { + "_content": "New York", + "place_id": "ODHTuIhTUb75gdBu", + "woeid": "2347591" + }, + "woeid": "12589366" + }, + "media": "photo", + "notes": { + "note": [] + }, + "originalformat": "jpg", + "originalsecret": "704f392686", + "owner": { + "iconfarm": 1, + "iconserver": "1", + "location": "Los Angeles, CA, USA", + "nsid": "48600072071@N01", + "path_alias": "barb", + "realname": "Barb Dybwad", + "username": "doctor paradox" + }, + "people": { + "haspeople": 0 + }, + "publiceditability": { + "canaddmeta": 0, + "cancomment": 1 + }, + "rotation": 0, + "safety_level": "0", + "secret": "704f392686", + "server": "1", + "tags": { + "tag": [ + { + "_content": "unfound", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-9017", + "machine_tag": 0, + "raw": "unfound" + }, + { + "_content": "digicam", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-40406", + "machine_tag": 0, + "raw": "digicam" + }, + { + "_content": "upstateny", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-9655", + "machine_tag": 0, + "raw": "upstateny" + }, + { + "_content": "musefest", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-72456", + "machine_tag": 0, + "raw": "musefest" + }, + { + "_content": "musicfestival", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-72628", + "machine_tag": 0, + "raw": "musicfestival" + }, + { + "_content": "people", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-290", + "machine_tag": 0, + "raw": "people" + }, + { + "_content": "portrait", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-278", + "machine_tag": 0, + "raw": "portrait" + }, + { + "_content": "maco", + "author": "48600072071@N01", + "authorname": "doctor paradox", + "id": "28255-371498-19439", + "machine_tag": 0, + "raw": "maco" + } + ] + }, + "title": { + "_content": "maco2" + }, + "urls": { + "url": [ + { + "_content": "https://www.flickr.com/photos/barb/371498/", + "type": "photopage" + } + ] + }, + "usage": { + "canblog": 0, + "candownload": 1, + "canprint": 0, + "canshare": 1 + }, + "views": "290", + "visibility": { + "isfamily": 0, + "isfriend": 0, + "ispublic": 1 + } + }, + "stat": "ok" +} +"""
\ No newline at end of file |
