"""File helpers (CSV, JSON, HTML reports) and loaders for stored paper records.

Paper records are read from ``./datasets/s2/<kind>/<id[:2]>/<id>/paper.json``.
"""

import os
import csv
import codecs

try:
    import simplejson as json
except ImportError:
    # simplejson mirrors the standard-library API for load/dump, so the
    # stdlib module is a drop-in replacement when simplejson is absent.
    import json


def read_citation_list(index=0):
    """Read ``./datasets/citations.csv`` (or a numbered variant of it).

    Args:
        index: When greater than zero, ``-<index>`` is inserted before the
            extension, e.g. ``./datasets/citations-2.csv``.

    Returns:
        A ``(keys, lines)`` tuple: the header row and the remaining rows.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file contains no rows at all.
    """
    filename = './datasets/citations.csv'
    if index > 0:
        stem, ext = os.path.splitext(filename)
        filename = '{}-{}{}'.format(stem, index, ext)
    # newline='' is what the csv module expects so that embedded newlines in
    # quoted fields are parsed correctly; utf-8 matches read_csv below.
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        lines = list(csv.reader(f))
    return lines[0], lines[1:]


def unfussy_reader(reader):
    """Yield rows from ``reader``, reporting and skipping rows that fail to parse.

    Args:
        reader: An iterator of rows, typically a ``csv.reader``.

    Yields:
        Each row that could be read without raising ``csv.Error``.
    """
    while True:
        try:
            yield next(reader)
        except StopIteration:
            return
        except csv.Error as err:
            # Report the actual failure (not the exception class) and move on
            # to the next row.
            print(err)
            continue


def read_csv(fn, keys=True, create=False):
    """Read a UTF-8 CSV file, tolerating malformed rows.

    Args:
        fn: Path of the CSV file.
        keys: When true, the first row is treated as a header and returned
            separately.
        create: When true, any failure to read the file yields an empty list
            instead of an exception.

    Returns:
        ``(keys, lines)`` when ``keys`` is true, otherwise the list of rows.
        NOTE: on failure with ``create=True`` a bare ``[]`` is returned even
        when ``keys`` is true; this is kept for backward compatibility.

    Raises:
        Exception: Whatever reading raised (e.g. ``OSError``, or
            ``IndexError`` for an empty file with ``keys=True``), unless
            ``create`` is true.
    """
    try:
        with open(fn, 'r', newline='', encoding='utf-8') as f:
            lines = list(unfussy_reader(csv.reader(f)))
        if keys:
            return lines[0], lines[1:]
        return lines
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        if create:
            return []
        raise


def write_csv(fn, keys, rows):
    """Write ``rows`` to a UTF-8 CSV file, preceded by ``keys`` unless it is None.

    Args:
        fn: Destination path (overwritten if it exists).
        keys: Header row, or ``None`` to write no header.
        rows: Iterable of rows.
    """
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if keys is not None:
            writer.writerow(keys)
        writer.writerows(rows)


def read_json(fn):
    """Load and return the JSON document stored at ``fn``."""
    with open(fn, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


def write_json(fn, data):
    """Serialize ``data`` as JSON into ``fn`` (overwritten if it exists)."""
    with open(fn, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile)


def write_report(fn, title=None, keys=None, rows=()):
    """Write an HTML table report and print ``"<fn> <row count>"``.

    Args:
        fn: Destination path of the HTML file (overwritten if it exists).
        title: Optional title, used for both ``<title>`` and a heading.
        keys: Optional iterable of column headings.
        rows: Iterable of rows. Each row is an iterable of cells; a cell that
            is a list or tuple is rendered one item per line. A ``None`` row
            stops the report early.

    NOTE(review): cell, key and title values are inserted into the markup
    unescaped. Callers may rely on this to embed HTML fragments, so it is
    left as is; escape values beforehand if they come from untrusted input.
    """
    count = 0
    with open(fn, 'w', encoding='utf-8') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        if title is not None:
            f.write("<title>{}</title>".format(title))
        f.write("<meta charset='utf-8'>")
        f.write("</head>")
        f.write("<body>")
        if title is not None:
            f.write("<h2>{}</h2>".format(title))
        f.write("<table border='1'>")
        if keys is not None:
            f.write("<tr>")
            for key in keys:
                f.write("<th>{}</th>".format(key))
            f.write("</tr>")
        for row in rows:
            if row is None:
                # Sentinel: stop adding rows, but still close the document
                # and print the summary below.
                break
            count += 1
            f.write("<tr>")
            for cell in row:
                if isinstance(cell, (list, tuple)):
                    f.write("<td>{}</td>".format(
                        '<br/>'.join(str(x) for x in cell)))
                else:
                    f.write("<td>{}</td>".format(cell))
            f.write("</tr>")
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")
    print("{} {}".format(fn, count))


def paper_path(key='papers', paper_id=''):
    """Return the path of ``paper.json`` for ``paper_id`` within collection ``key``.

    Records are sharded by the first two characters of the paper id.
    """
    return '{}/{}/{}/{}/paper.json'.format(
        './datasets/s2', key, paper_id[0:2], paper_id)


class DbPaper(object):
    """A paper record loaded from the ``db_papers`` collection."""

    def __init__(self, paper_id):
        """Load the record for ``paper_id``; raises if the file is unreadable."""
        self.paper_id = paper_id
        self.data = read_json(paper_path('db_papers', paper_id))

    @property
    def title(self):
        """The paper title."""
        return self.data['title']

    @property
    def journal(self):
        """The journal name."""
        return self.data['journalName']

    @property
    def authors(self):
        """List of ``(author_id, name)`` tuples; the id is ``''`` when absent."""
        return [
            (author['ids'][0] if author['ids'] else '', author['name'])
            for author in self.data['authors']
        ]


class RawPaper(object):
    """A paper record loaded from the ``raw_papers`` collection."""

    def __init__(self, paper_id):
        """Load the record for ``paper_id``.

        If the file has no ``'paper'`` key its content is printed and
        ``self.data`` is left as ``None``.
        """
        self.paper_id = paper_id
        # Always define the attribute so callers can test for a failed load
        # instead of hitting an AttributeError later.
        self.data = None
        data = read_json(paper_path('raw_papers', paper_id))
        if 'paper' not in data:
            print(data)
            return
        self.data = data['paper']

    @property
    def title(self):
        """The paper title."""
        return self.data['title']['text']

    @property
    def journal(self):
        """The journal name."""
        return self.data['journal']['name']

    @property
    def authors(self):
        """List of ``(author_id, name)`` tuples; the id is ``''`` when absent.

        Each entry of the raw ``authors`` list is itself a sequence whose
        first element holds the ``ids`` and ``name`` fields.
        """
        return [
            (author[0]['ids'][0] if author[0]['ids'] else '',
             author[0]['name'])
            for author in self.data['authors']
        ]


def load_paper(paper_id):
    """Load a paper by id, preferring ``db_papers`` over ``raw_papers``.

    Returns:
        A ``DbPaper`` or ``RawPaper`` instance, or ``None`` (after printing
        ``'no paper'``) when neither file exists.
    """
    if os.path.exists(paper_path('db_papers', paper_id)):
        return DbPaper(paper_id)
    if os.path.exists(paper_path('raw_papers', paper_id)):
        return RawPaper(paper_id)
    print('no paper')
    return None