#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character_lines,
    spatial_indicators,

@output:
    .ann file with the same name

"""

import argparse
from sklearn.externals import joblib
import ner
import spacy


def annotate(text, model, character_lut, saywords_lut):
    """
    Function which annotates entities in text
    using the model in "model"

    Parameters:
        text: the story text to annotate.
        model: the model object loaded from the .pkl file.
        character_lut: dict of character attributes indexed by name.
        saywords_lut: list of saywords.

    NOTE: the annotation itself is not implemented yet. The function only
    builds the spaCy document; `model`, `character_lut` and `saywords_lut`
    are not used, and nothing is returned (callers receive None).
    """

    # Create document from text
    nlp = spacy.load('en')
    doc = nlp(text)

    # Parse using LUTs

    # *- Characters
    # TODO: look up characters, places and saywords in `doc` and return
    # the resulting annotations.


def _read_lut(path):
    """Return the non-blank lines of the look-up-table file at `path`."""
    with open(path) as f:
        return [line for line in f.read().split('\n') if line.strip()]


def _parse_character_lut(lines):
    """
    Build the character look-up table from `name:attr1,attr2,...` lines.

    Returns a dict indexed by character name. Each value is a dict holding
    a 'gender' key (any attribute containing "male", which also covers
    "female") and/or an 'age' key ("young" or "old") when the line
    provides them.
    """
    lut = {}
    for line in lines:
        # partition() never raises: a line without ':' simply has no
        # attributes, and only the first ':' separates name from attributes.
        name, _, attributes = line.partition(':')
        entry = {}

        for attribute in attributes.split(','):
            # Strip and lower-case for matching so that "male, old" and
            # "Male" are recognised; keep the stripped original as value.
            attribute = attribute.strip()
            lowered = attribute.lower()
            if 'male' in lowered:
                entry['gender'] = attribute
            elif lowered in ('young', 'old'):
                entry['age'] = attribute

        lut[name.strip()] = entry
    return lut


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('model_path', help='.pkl file containing model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only pass model files from a trusted source.
    model = joblib.load(args.model_path)

    # Load saywords (falls back to the default file when no path is given)
    saylut = _read_lut(args.say_lut or 'saywords.txt')

    # Load places LUT (loaded here but not passed to annotate() below)
    placelut = _read_lut(args.place_lut or 'places.txt')

    # Load character LUT
    charlist = _read_lut(args.char_lut or 'characters.txt')  # One character per line

    # Stores character attributes indexed by name
    character_lut = _parse_character_lut(charlist)

    annotation_dict = annotate(text, model, character_lut, saylut)