Mercurial > hg > from-my-pen-to-your-ears-supplementary-material
view demo/text2ann.py~ @ 0:4dad87badb0c
initial commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 17:56:10 +0100 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character_lines,
    spatial_indicators,

@output:
    .ann file with the same name

"""

import argparse
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; on newer versions this needs to become `import joblib`.
# Confirm the scikit-learn version this demo targets before changing it.
from sklearn.externals import joblib
import ner
import spacy


def annotate(text, model, character_lut, saywords_lut):
    """
    Function which annotates entities in text
    using the model in "model"

    Parameters
    ----------
    text : str
        The story text; it is run through the spaCy 'en' pipeline.
    model :
        The model object loaded from the .pkl file (not used yet).
    character_lut : dict
        Character attributes indexed by character name (not used yet).
    saywords_lut : list
        Known saywords, one string per entry (not used yet).

    Returns
    -------
    None
        The annotation logic is not implemented yet, so nothing is
        returned even though the caller stores the result.
    """

    # Create document from text
    nlp = spacy.load('en')
    doc = nlp(text)

    # Parse using LUTs

    # *- Characters
    # TODO: unfinished -- `doc`, `model` and the LUTs are not consumed yet.


def _read_nonempty_lines(path):
    """Return the lines of the file at `path`, dropping whitespace-only ones."""
    with open(path) as f:
        return [s for s in f.read().split('\n') if s.strip() != '']


def _parse_character_lut(lines):
    """
    Build the character look-up table from `name:attr1,attr2,...` lines.

    Each attribute is stripped of surrounding whitespace and matched
    case-insensitively: a token containing 'male' (so also 'female') is
    stored under 'gender', and a token equal to 'young' or 'old' is stored
    under 'age'. When several tokens match the same key the last one wins.
    A line without ':' yields a character with no attributes; only the
    first ':' separates the name from the attributes.
    """
    character_lut = {}  # Stores character attributes indexed by name

    for line in lines:
        # partition() never raises, unlike unpacking the result of split(':')
        name, _, attributes = line.partition(':')

        gender = None
        age = None

        for attribute in attributes.split(','):
            # Strip so that "female, young" is parsed the same as "female,young"
            attribute = attribute.strip()
            lowered = attribute.lower()
            if 'male' in lowered:
                gender = attribute
            elif lowered in ('young', 'old'):
                age = attribute

        entry = {}
        if gender:
            entry['gender'] = gender
        if age:
            entry['age'] = age
        character_lut[name.strip()] = entry

    return character_lut


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path',
                           help='.txt file to parse')
    argparser.add_argument('model_path',
                           help='.pkl file containing model')
    argparser.add_argument('--say-lut',
                           help='.txt file with list of saywords')
    argparser.add_argument('--char-lut',
                           help='.txt file with known characters')
    argparser.add_argument('--place-lut',
                           help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only pass model files that come from a trusted source.
    model = joblib.load(args.model_path)

    # Load saywords (falls back to 'saywords.txt' when --say-lut is not given)
    saylut = _read_nonempty_lines(args.say_lut or 'saywords.txt')

    # Load places LUT (falls back to 'places.txt')
    placelut = _read_nonempty_lines(args.place_lut or 'places.txt')

    # Load character LUT (falls back to 'characters.txt'), one character
    # per line
    charlist = _read_nonempty_lines(args.char_lut or 'characters.txt')
    character_lut = _parse_character_lut(charlist)

    annotation_dict = annotate(text, model, character_lut, saylut)