diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-10-31 03:41:37 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-10-31 03:41:37 +0100 |
| commit | 2ede97c42b203c37a8a9f98784af4f31f01961f4 (patch) | |
| tree | 70bec449d9d8af3238efb1ae995dbda922469b89 | |
| parent | a16c3cf801b70670dffc7041d92f7ccec56a0e18 (diff) | |
merge
| -rw-r--r-- | datasets/scholar/entries/.csv (renamed from datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A.csv) | 0 | ||||
| -rw-r--r-- | datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A .csv | 0 | ||||
| -rw-r--r-- | fetch-entries.py | 14 | ||||
| -rwxr-xr-x | vendor/scholar.py | 11 |
4 files changed, 15 insertions, 10 deletions
diff --git a/datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A.csv b/datasets/scholar/entries/.csv index e69de29b..e69de29b 100644 --- a/datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A.csv +++ b/datasets/scholar/entries/.csv diff --git a/datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A
.csv b/datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A
.csv new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/datasets/scholar/entries/Pushing the frontiers of unconstrained face detection and recognition: IARPA Janus Benchmark A .csv diff --git a/fetch-entries.py b/fetch-entries.py index 37cd37f9..8c50d9a1 100644 --- a/fetch-entries.py +++ b/fetch-entries.py @@ -16,22 +16,24 @@ def fetch_entries(index): label = line[0] title = line[1] entries_fn = './datasets/scholar/entries/{}.csv'.format(title) - print(entries_fn) + # print(entries_fn) if not os.path.exists(entries_fn): with open(entries_fn, 'w') as f: - t = re.sub(r'\W+', '', title) + t = re.sub(r'[^-0-9a-zA-Z ]+', '', title) + print(t) subprocess.call([ './vendor/scholar.py', '-t', '-A', t, '--csv', ], stdout=f) - time.sleep(random.randint(20, 32)) + # time.sleep(random.randint(20, 32)) def read_citation_list(index): filename = './datasets/citations.csv' - fn, ext = os.path.splitext(filename) - in_fn = fn + '-' + str(index) + ext - with open(in_fn, 'r') as f: + # fn, ext = os.path.splitext(filename) + # in_fn = fn + '-' + str(index) + ext + # with open(in_fn, 'r') as f: + with open(filename, 'r') as f: reader = csv.reader(f) lines = list(reader) keys = lines[0] diff --git a/vendor/scholar.py b/vendor/scholar.py index 13ccd439..8070ff0d 100755 --- a/vendor/scholar.py +++ b/vendor/scholar.py @@ -241,11 +241,11 @@ class ScholarConf(object): VERSION = '2.10' LOG_LEVEL = 1 MAX_PAGE_RESULTS = 10 # Current default for per-page results - SCHOLAR_SITE = 'http://scholar.google.com' + SCHOLAR_SITE = 'https://scholar.google.de' # USER_AGENT = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9' # Let's update at this point (3/14): - USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0' + USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0' # If set, we will use this file to read/save cookies to enable # cookie use across sessions. @@ -514,6 +514,8 @@ class ScholarArticleParser(object): """Helper, returns full URL in case path isn't one.""" if path.startswith('http://'): return path + if path.startswith('https://'): + return path if not path.startswith('/'): path = '/' + path return self.site + path @@ -1022,6 +1024,8 @@ class ScholarQuerier(object): log_msg='dump of query response HTML', err_msg='results retrieval failed') if html is None: + print(query.get_url()) + print("html is none") return self.parse(html) @@ -1100,10 +1104,9 @@ class ScholarQuerier(object): ScholarUtils.log('debug', 'headers:\n' + str(hdl.info())) ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 ScholarUtils.log('debug', '<<<<' + '-'*68) - - return html except Exception as err: ScholarUtils.log('info', err_msg + ': %s' % err) + print(err.read()) return None |
