Mercurial > hg > from-my-pen-to-your-ears-supplementary-material
diff demo/text2ann.py~ @ 0:4dad87badb0c
initial commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 17:56:10 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character_lines,
    spatial_indicators,

@output:
    .ann file with the same name

NOTE: in this revision the script only loads its inputs and calls
``annotate``; nothing here writes the .ann file yet.
"""

import argparse
# NOTE(review): ``sklearn.externals.joblib`` was removed in scikit-learn 0.23;
# newer environments need the standalone ``joblib`` package instead.
from sklearn.externals import joblib
import ner
import spacy


def annotate(text, model, character_lut, saywords_lut):
    """
    Function which annotates entities in text
    using the model in "model"

    Unfinished: at present this only builds the spaCy document for ``text``
    and implicitly returns None. ``model``, ``character_lut`` and
    ``saywords_lut`` are accepted but not used yet.
    """

    # Create document from text
    nlp = spacy.load('en')
    doc = nlp(text)

    # Parse using LUTs

    # *- Characters
    # TODO: the LUT-based tagging was never written past this point.


def _read_lut(path):
    """Return the non-blank lines of the text file at ``path``, in order."""
    with open(path) as f:
        return [line for line in f.read().split('\n') if line.strip()]


def _parse_character_lut(lines):
    """
    Build the character look-up table from ``name:attr,attr,...`` lines.

    Returns a dict indexed by character name. Each value is a dict holding
    a 'gender' key (any attribute containing "male", which also covers
    "female") and/or an 'age' key ("young" or "old") when such an
    attribute is present on the line. Matching ignores case and the
    whitespace around each attribute; the stored value keeps its
    original case.

    Raises ValueError if a line has no ':' separator.
    """
    character_lut = {}  # Stores character attributes indexed by name
    for line in lines:
        # Split on the first ':' only, so a stray colon later in the line
        # cannot break the unpacking.
        name, separator, attributes = line.partition(':')
        if not separator:
            raise ValueError(
                "malformed character line %r: expected 'name:attr,attr'"
                % line)

        gender = None
        age = None

        for raw_attribute in attributes.split(','):
            # "male, young" leaves a leading space on " young"; strip it so
            # the age test below can match.
            attribute = raw_attribute.strip()
            lowered = attribute.lower()
            if 'male' in lowered:
                gender = attribute
            elif lowered in ('young', 'old'):
                age = attribute

        entry = {}
        if gender:
            entry['gender'] = gender
        if age:
            entry['age'] = age
        character_lut[name.strip()] = entry

    return character_lut


def main():
    """Parse the command line, load every input and run ``annotate``."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('model_path', help='.pkl file containing model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only pass model files from a trusted source.
    model = joblib.load(args.model_path)

    # Load saywords (falls back to the default file name when not given)
    saylut = _read_lut(args.say_lut or 'saywords.txt')

    # Load places LUT. It is read (so a missing file still fails early)
    # but nothing consumes it yet.
    placelut = _read_lut(args.place_lut or 'places.txt')

    # Load character LUT, one character per line
    charlist = _read_lut(args.char_lut or 'characters.txt')
    character_lut = _parse_character_lut(charlist)

    # ``annotate`` is still a stub, so this is None and is not written out.
    annotation_dict = annotate(text, model, character_lut, saylut)


if __name__ == "__main__":
    main()