add msc working utils

author: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
committer: adamhrv <adam@ahprojects.com> 2019-06-03 03:33:06 +0200
commit: 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree: 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/datasets
parent: e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)
4 files changed, 454 insertions, 32 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index ed717662..0b81fef6 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t
   help='Output path')
 @click.option('-t', '--threads', 'opt_threads', default=8,
   help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
 @click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents):
   """Threaded image/file downloader"""
   
   """
@@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
       urllib.request.urlretrieve(item['url'], fp_out)
       item['status'] = True
     except Exception as e:
+      log.debug(f'Failed: user: {item["username"]}, url: {url}')
       if str(e) != 'HTTP Error 403: Forbidden':
         log.debug(f'Error: {e}')
       fp_error = f'{fp_out}_error.txt'
@@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
   # setup multithreading data holders
   log.debug(f'loading {opt_fp_in}')
   df_records = pd.read_csv(opt_fp_in)
+  if opt_slice:
+    df_records = df_records[opt_slice[0]:opt_slice[1]]
   log.debug(f'loaded {len(df_records):,} csv records')
   log.debug('deduplicating')
   df_records = df_records.drop_duplicates(subset='sha256', keep="last")
@@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
 
   for x in tqdm(records):
     sha256 = x['sha256']
-    
+    username = x['username']
     fp_dst = join(opt_fp_out, f"{sha256}.json")
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
     if not (fp_dst_is_file or fp_dst_is_err):
       url = url_prefix + sha256 + '.json'
       user_agent = user_agents[randint(0, len(user_agents)) - 1]
-      pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+      pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent})
     else:
       n_skipped += 1
 
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index c64afbba..45ca8f6e 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -6,9 +6,9 @@ import click
   help='Input')
 @click.option('-o', '--output', 'opt_fp_out', required=True,
   help='Output')
-@click.option('-t', '--threads', 'opt_threads', default=8,
+@click.option('-t', '--threads', 'opt_threads', default=8, show_default=True,
   help='Number of threads')
-@click.option('--wayback', 'opt_wayback', is_flag=True,
+@click.option('--wayback', 'opt_wayback', is_flag=True, default=False,
   help='Check Wayback archive for URL and download cached image')
 @click.pass_context
 def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
@@ -52,7 +52,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
       estr = str(e)
       if item['opt_wayback']:
         if 'HTTP Error' in estr:
-          # check 
+          # TODO add/parse/handle request for wayback machine archive
           url_wayback = url_wayback_base + item['url']
       fp_error = f'{fp_out}_error.txt'
       with open(fp_error, 'w') as fp:
@@ -67,6 +67,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
 
 
   pool_items = []
+  log.debug(f'Initializing multithreaded pool...')
   for x in tqdm(records):
     fp_dst = join(opt_fp_out, x['filepath'])
     fp_dst_is_file = Path(fp_dst).is_file()
@@ -75,7 +76,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
       pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
-  log.info(f'processing {num_items:,} items')
+  log.info(f'Going to download {num_items:,} files')
   pool_results = []
 
   # run the multithreading with progress bar
diff --git a/megapixels/commands/datasets/flickr_api.py b/megapixels/commands/datasets/flickr_api.py
index 780ede49..f09f3089 100644
--- a/megapixels/commands/datasets/flickr_api.py
+++ b/megapixels/commands/datasets/flickr_api.py
@@ -15,9 +15,10 @@ from PIL import Image, ImageOps, ImageFilter
 from app.utils import file_utils, im_utils
 
 
-query_types = ['photo_id', 'album_id', 'flickr_id']
+query_types = ['photo_id', 'album_id', 'nsid_url', 'nsid_profile']
+# ???
+# photo_id: 123456789
 # flickr_id: 123456789@N01
-# photo_id: 
 
 log = logger_utils.Logger.getLogger()
 
@@ -28,7 +29,7 @@ log = logger_utils.Logger.getLogger()
   help='Output directory')
 @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
   help='Slice list of files')
-@click.option('--query-type', 'opt_query_type', default='photo_id',
+@click.option('-q', '--query', 'opt_query_type', required=True,
   type=click.Choice(query_types),
   help='API query type')
 @click.option('--api_key', 'opt_api_key', envvar='FLICKR_API_KEY_1')
@@ -56,13 +57,13 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   # process  
   
   if not opt_api_key or not opt_api_secret:
-    log.error('source .env vars for Flickr API and try again')
+    log.error('source env/flickr.env vars for Flickr API and try again')
     return
 
   # check how many flickr keys
   api_keys = []
   api_secrets = []
-  for i in range(1,10):
+  for i in range(1,20):
     try:
       var_name_key = f'FLICKR_API_KEY_{i}'
       var_name_secret = f'FLICKR_API_SECRET_{i}'
@@ -75,9 +76,16 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   log.info(f'Shuffling between: {len(api_keys)} api keys')
 
   # read in CSV
-  # | query, filepath |
-
-  records = pd.read_csv(opt_fp_in).to_dict('records')
+  # | query, filename, count |
+  df_records = pd.read_csv(opt_fp_in)
+  log.info(f'Dedpuplicating {len(df_records)}')
+  if opt_query_type == 'nsid_url' or opt_query_type == 'nsid_profile':
+    df_records = df_records.drop_duplicates(subset='nsid', keep="last")
+  else:
+    df_records = df_records.drop_duplicates(subset='photo_id', keep="last")
+  log.info(f'After deduplication: {len(df_records)}')
+  records = df_records.to_dict('records')
+  
   if opt_slice:
     records = records[opt_slice[0]:opt_slice[1]]
   
@@ -87,42 +95,68 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_api_key, opt_api_secret,
   
 
   for record in tqdm(records):
-    fp_out = join(opt_fp_out, record['filepath'])
+    if 'nsid' in opt_query_type:
+      fp_out = join(opt_fp_out, f"{record['nsid']}.json")
+    else:
+      fp_out = join(opt_fp_out, f'{record["photo_id"]}.json')
+
     fp_out_err = fp_out + '_error.txt'
     if Path(fp_out).is_file() or Path(fp_out_err).is_file():
       continue
-    # append relevant data
     try:
       # shuffle the api keys to avoid rate limiting
       rand_int = randint(0,len(api_keys)-1)
       api_key = api_keys[rand_int]
       api_secret = api_secrets[rand_int]
-                                      
-      #flickr_api.set_keys(api_key=api_key, api_secret=api_secret)
 
-      #photo = flickr_api.Photo(id=record['query'])
-      # https://api.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
-      photo_id = record['query']
-      flickr_url = 'https://api.flickr.com/services/rest/?method=flickr.photos.getInfo'
-      flickr_url += f'&api_key={api_key}'
-      flickr_url += f'&photo_id={photo_id}'
-      flickr_url += '&format=json'
-      flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_url':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.urls.getUserProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      # https://www.flickr.com/services/rest/
+      if opt_query_type == 'nsid_profile':
+        # ?method=flickr.photos.getUserProfile&api_key=31cae6cbba7020585a34bf5fcd772e16&user_id=53991912%40N00&format=json&nojsoncallback=1
+        # ?method=flickr.urls.getUserProfile&api_key=7c905d9a22bc505fd90a3d98078363bc&user_id=97556162%40N00
+        nsid = record['nsid']
+        nsid_encoded = urllib.parse.quote_plus(nsid)
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.profile.getProfile'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&user_id={nsid_encoded}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
+      elif opt_query_type == 'photo_id':
+        # ?method=flickr.photos.getInfo&api_key=18dacfe5a2c45b794ee4aa832eb35e83&photo_id=6796778203&format=json&nojsoncallback=1
+        fp_out
+        photo_id = record['photo_id']
+        flickr_url = 'https://flickr.com/services/rest/?method=flickr.photos.getInfo'
+        flickr_url += f'&api_key={api_key}'
+        flickr_url += f'&photo_id={photo_id}'
+        flickr_url += '&format=json'
+        flickr_url += '&nojsoncallback=1'
 
       with urllib.request.urlopen(flickr_url) as url:
         data = json.loads(url.read().decode())
 
       if data['stat'] =='fail':
-        raise Exception('failed')
+        error_msg = data["message"]
+        log.error(f'Failed. Message: {error_msg}, url: {flickr_url}')
+        if error_msg == 'Service currently unavailable':
+          time.sleep(10)
+        raise Exception(error_msg)
       elif data['stat'] =='ok':
         with open(fp_out, 'w') as fp:
           json.dump(data, fp, sort_keys=True, indent=2)
       
-    #except FlickrAPIError as e:
     except Exception as e:
-      # if "HTTP Server Error 500" in str(e):
-      log.error(f'{e}, {record["query"]}, api_key: {api_key}, api_secret: {api_secret}')
-      if "not found" in str(e) or 'failed' in str(e):
+      log.error(f'{e}')
+      if "not found" in str(e) or 'Invalid NSID provided' in str(e):
         with open(fp_out_err, 'w') as fp:
           fp.write('')
     
diff --git a/megapixels/commands/datasets/flickr_api_to_csv.py b/megapixels/commands/datasets/flickr_api_to_csv.py
new file mode 100644
index 00000000..5b5f0ce3
--- /dev/null
+++ b/megapixels/commands/datasets/flickr_api_to_csv.py
@@ -0,0 +1,382 @@
+"""
+Converts directory of JSON API output files to CSV format
+"""
+
+from glob import glob
+import os
+from os.path import join
+from pathlib import Path
+
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils import logger_utils
+
+import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
+from app.utils import file_utils, im_utils
+
+
+query_types = ['nsid_profile', 'nsid_url', 'photo_id']
+
+log = logger_utils.Logger.getLogger()
+
+def _photo_to_row(photo, photo_id):
+  """Flattens one flickr.photos.getInfo "photo" payload into a CSV row
+
+  :param photo: (dict) value of the "photo" key of the API response
+  :param photo_id: (str) Flickr photo ID, used for the row and the image URL
+  :returns: (dict) flat record; its key order sets the CSV column order
+  """
+  # sub-objects may be missing or null: fall back to empty dicts so that a
+  # sparse record yields empty cells instead of aborting the whole conversion
+  dates = photo.get('dates') or {}
+  description = photo.get('description') or {}
+  location = photo.get('location') or {}
+  country = location.get('country') or {}
+  owner = photo.get('owner') or {}
+  server = photo.get('server')
+  farm = photo.get('farm')
+  secret = photo.get('secret')
+  # https://farm4.staticflickr.com/3214/3036412907_65deee68e2.jpg
+  image_url = f'https://farm{farm}.staticflickr.com/{server}/{photo_id}_{secret}.jpg'
+
+  return {
+    'posted': dates.get('posted'),
+    'taken': dates.get('taken'),
+    'description': description.get('_content'),
+    # the API response stores the country name under "_content"
+    'country': country.get('_content', ''),
+    'place': country.get('place_id', ''),
+    'woeid': country.get('woeid', ''),
+    'lat': location.get('latitude', ''),
+    'lon': location.get('longitude', ''),
+    'place_id': location.get('place_id', ''),
+    'nsid': owner.get('nsid'),
+    'path_alias': owner.get('path_alias'),
+    'realname': owner.get('realname'),
+    'username': owner.get('username'),
+    'owner_location': owner.get('location'),
+    'photo_id': photo_id,
+    'secret': secret,
+    'url': image_url
+  }
+
+
+def _profile_to_row(profile):
+  """Strips the fields that are not exported from a "profile" payload
+
+  :param profile: (dict) value of the "profile" key of the API response
+  :returns: (dict) the same dict, modified in place
+  """
+  for key in ('showcase_set', 'showcase_set_title', 'pinterest', 'tumblr'):
+    # default of None: tolerate profiles that lack one of these fields
+    profile.pop(key, None)
+  return profile
+
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+  help='Output file')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+  help='Slice list of files')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+  help='Force overwrite')
+@click.option('-q', '--query', 'opt_query_type', type=click.Choice(query_types), required=True)
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_force, opt_query_type):
+  """Converts a directory of Flickr API JSON files to a single CSV file
+
+  Every "*.json" file in the input directory becomes one CSV row. Files that
+  cannot be read, or that lack the payload for the query type, are logged
+  and skipped.
+
+  The query type selects the part of the response that is exported: "photo"
+  (flattened) for photo_id, "profile" (minus the showcase, pinterest and
+  tumblr fields) for nsid_profile, and "user" for nsid_url. An existing
+  output file is only overwritten with --force.
+  """
+
+  from tqdm import tqdm
+
+  # top-level key of the API response holding the payload of each query type
+  payload_keys = {'photo_id': 'photo', 'nsid_profile': 'profile', 'nsid_url': 'user'}
+
+  # -------------------------------------------------
+  # process
+  if Path(opt_fp_out).is_file() and not opt_force:
+    log.error('File exists. Use "--force" to overwrite it')
+    return
+
+  # sorted: glob order is arbitrary, and --slice must select the same files each run
+  fp_files = sorted(glob(join(opt_fp_in, '*.json')))
+  # test the filename only, so "error" in a directory name does not drop every file
+  fp_files = [f for f in fp_files if 'error' not in Path(f).name]
+  # NOTE: the default (None, None) is truthy; slicing [None:None] keeps every file
+  if opt_slice:
+    fp_files = fp_files[opt_slice[0]:opt_slice[1]]
+
+  log.debug(f'Found {len(fp_files)} files')
+  # resolved once: the query type does not change between files
+  payload_key = payload_keys[opt_query_type]
+  items = []
+
+  for fp_file in tqdm(fp_files):
+    try:
+      payload = file_utils.load_json(fp_file).get(payload_key)
+    except Exception as e:
+      log.error(f'{e}, skipping: {fp_file}')
+      continue
+    if not payload:
+      # a row cannot be built from a response without the expected payload
+      log.error(f'no "{payload_key}" data, skipping: {fp_file}')
+      continue
+
+    if opt_query_type == 'photo_id':
+      # the photo ID is taken from the filename stem
+      obj = _photo_to_row(payload, Path(fp_file).stem)
+    elif opt_query_type == 'nsid_profile':
+      obj = _profile_to_row(payload)
+    else:
+      # nsid_url: the "user" payload is exported unchanged
+      obj = payload
+
+    items.append(obj)
+
+  # convert to DataFrame
+  df = pd.DataFrame.from_dict(items)
+  df.to_csv(opt_fp_out, index=False)
+  log.info(f'Wrote {len(df)} to {opt_fp_out}')
+
+"""
+nsid_url
+  {
+  "stat": "ok",
+  "user": {
+    "nsid": "7153718@N04",
+    "url": "https://www.flickr.com/people/babyfish4/"
+  }
+}
+"""
+"""
+  location: of the owner
+  dateuploaded
+  license
+  "dates":
+    "lastupdate": "1416447096"
+    "posted": "1112900873"
+    "taken": "2005-04-06 18:37:38"
+  description:
+    _content: playing in a field
+  title:
+    _content: jessica
+  location: cornwall, uk
+"""
+
+"""
+  {
+  "profile": {
+    "city": null,
+    "country": null,
+    "facebook": "",
+    "first_name": null,
+    "hometown": "",
+    "id": "7153718@N04",
+    "instagram": "",
+    "join_date": "1172669959",
+    "last_name": null,
+    "nsid": "7153718@N04",
+    "occupation": "",
+    "pinterest": "",
+    "profile_description": "",
+    "showcase_set": "72157680616398790",
+    "showcase_set_title": "Profile Showcase",
+    "tumblr": "",
+    "twitter": ""
+  },
+  "stat": "ok"
+}
+"""
+
+"""
+photo_id
+
+
+  {
+  "photo": {
+    "comments": {
+      "_content": "0"
+    },
+    "dates": {
+      "lastupdate": "0",
+      "posted": "1094612969",
+      "taken": "2004-09-04 22:41:18",
+      "takengranularity": "0",
+      "takenunknown": 0
+    },
+    "dateuploaded": "1094612969",
+    "description": {
+      "_content": ""
+    },
+    "editability": {
+      "canaddmeta": 0,
+      "cancomment": 0
+    },
+    "farm": 1,
+    "geoperms": {
+      "iscontact": 0,
+      "isfamily": 0,
+      "isfriend": 0,
+      "ispublic": 1
+    },
+    "id": "371498",
+    "isfavorite": 0,
+    "license": "1",
+    "location": {
+      "accuracy": "15",
+      "context": "0",
+      "country": {
+        "_content": "United States",
+        "place_id": "nz.gsghTUb4c2WAecA",
+        "woeid": "23424977"
+      },
+      "county": {
+        "_content": "Tompkins",
+        "place_id": "1uCJJtBQUL80G6hbPw",
+        "woeid": "12589366"
+      },
+      "latitude": "42.399028",
+      "longitude": "-76.652519",
+      "place_id": "1uCJJtBQUL80G6hbPw",
+      "region": {
+        "_content": "New York",
+        "place_id": "ODHTuIhTUb75gdBu",
+        "woeid": "2347591"
+      },
+      "woeid": "12589366"
+    },
+    "media": "photo",
+    "notes": {
+      "note": []
+    },
+    "originalformat": "jpg",
+    "originalsecret": "704f392686",
+    "owner": {
+      "iconfarm": 1,
+      "iconserver": "1",
+      "location": "Los Angeles, CA, USA",
+      "nsid": "48600072071@N01",
+      "path_alias": "barb",
+      "realname": "Barb Dybwad",
+      "username": "doctor paradox"
+    },
+    "people": {
+      "haspeople": 0
+    },
+    "publiceditability": {
+      "canaddmeta": 0,
+      "cancomment": 1
+    },
+    "rotation": 0,
+    "safety_level": "0",
+    "secret": "704f392686",
+    "server": "1",
+    "tags": {
+      "tag": [
+        {
+          "_content": "unfound",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-9017",
+          "machine_tag": 0,
+          "raw": "unfound"
+        },
+        {
+          "_content": "digicam",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-40406",
+          "machine_tag": 0,
+          "raw": "digicam"
+        },
+        {
+          "_content": "upstateny",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-9655",
+          "machine_tag": 0,
+          "raw": "upstateny"
+        },
+        {
+          "_content": "musefest",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-72456",
+          "machine_tag": 0,
+          "raw": "musefest"
+        },
+        {
+          "_content": "musicfestival",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-72628",
+          "machine_tag": 0,
+          "raw": "musicfestival"
+        },
+        {
+          "_content": "people",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-290",
+          "machine_tag": 0,
+          "raw": "people"
+        },
+        {
+          "_content": "portrait",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-278",
+          "machine_tag": 0,
+          "raw": "portrait"
+        },
+        {
+          "_content": "maco",
+          "author": "48600072071@N01",
+          "authorname": "doctor paradox",
+          "id": "28255-371498-19439",
+          "machine_tag": 0,
+          "raw": "maco"
+        }
+      ]
+    },
+    "title": {
+      "_content": "maco2"
+    },
+    "urls": {
+      "url": [
+        {
+          "_content": "https://www.flickr.com/photos/barb/371498/",
+          "type": "photopage"
+        }
+      ]
+    },
+    "usage": {
+      "canblog": 0,
+      "candownload": 1,
+      "canprint": 0,
+      "canshare": 1
+    },
+    "views": "290",
+    "visibility": {
+      "isfamily": 0,
+      "isfriend": 0,
+      "ispublic": 1
+    }
+  },
+  "stat": "ok"
+}
+"""
+\ No newline at end of file
author	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
committer	adamhrv <adam@ahprojects.com>	2019-06-03 03:33:06 +0200
commit	1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch)
tree	86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/datasets
parent	e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff)