Ability to skip words if you don't like the connection

author: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
committer: Jules Laplace <julescarbon@gmail.com> 2020-04-02 15:31:46 +0200
commit: c14810fe9f663d46b5477088d06047fea66d1524 (patch)
tree: 72eebe1589b71147f003ebb55c66a20eaf5bce6a /cli/app/thesaurus/api.py
parent: 0b55e297d5088962fe8397903041c2b1737c7cdd (diff)
1 files changed, 33 insertions, 6 deletions
diff --git a/cli/app/thesaurus/api.py b/cli/app/thesaurus/api.py
index 98e0210..467d5fb 100644
--- a/cli/app/thesaurus/api.py
+++ b/cli/app/thesaurus/api.py
@@ -1,4 +1,6 @@
 import os
+import re
+import time
 import requests
 from hashlib import sha256
 
@@ -13,10 +15,18 @@ class Thesaurus:
     sha = sha256(word)
     hash_path = os.path.join(base_path, sha[0:2])
     os.makedirs(hash_path, exist_ok=True)
-    path = os.path.join(hash_path, word + '.json')
+    clean_word = re.sub('[^0-9a-zA-Z]+', '*', word)
+    path = os.path.join(hash_path, clean_word + '.json')
     if os.path.exists(path):
       return read_json(path)
-    data = api_fn(word)
+    data = None
+    while data is None:
+      try:
+        data = api_fn(word)
+      except Exception as e:
+        print("Got HTTP error, sleeping for 5 seconds")
+        time.sleep(5)
+        pass
     write_json(path, data)
     return data
 
@@ -34,6 +44,11 @@ class ThesaurusAPI:
   }
 
   def search(self, word):
+    word = word.split('<')[0]
+    word = word.split('/')[0]
+    word = word.replace('(', '').replace(')', '')
+    if len(word) < 1:
+      return { 'word': word, 'categories': [] }
     query = {
       'qsearch': word,
     }
@@ -42,8 +57,18 @@ class ThesaurusAPI:
       return []
     data = resp.text
     data = data.split('<div id="resultsTimelineData">')[1].split('</div>')[0]
-    # print(data)
-    rows = json.loads(data)
+    data = data.replace('<span class="oesc">', '')
+    data = data.replace('</span>', '')
+    try:
+      rows = json.loads(data)
+    except Exception as e:
+      print(f"Error loading JSON for {word}")
+      print(data)
+      # raise e
+      return {
+        'word': word,
+        'categories': [],
+      }
     cats = []
     for row in rows:
       cat, years = row['popup'].split(']: ')
@@ -72,8 +97,10 @@ class ThesaurusAPI:
     raw_words = raw.split('"><b>')[1:]
     words = []
     for word in raw_words:
-      word, rest = word.split('</b>')
-      years = word.split(' <span')[0].strip()
+      word_partz = word.split('</b>')
+      word = word_partz[0]
+      years = word_partz[1].split(' <span')[0].strip()
+      years = years.replace('\u2013', '-')
       words.append({
         'word': word,
         'years': years,
author	Jules Laplace <julescarbon@gmail.com>	2020-04-02 15:31:46 +0200
committer	Jules Laplace <julescarbon@gmail.com>	2020-04-02 15:31:46 +0200
commit	c14810fe9f663d46b5477088d06047fea66d1524 (patch)
tree	72eebe1589b71147f003ebb55c66a20eaf5bce6a /cli/app/thesaurus/api.py
parent	0b55e297d5088962fe8397903041c2b1737c7cdd (diff)