view demo/text2ann.py~ @ 0:4dad87badb0c

initial commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 17:56:10 +0100
parents
children
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:
    
    characters, 
    places,
    saywords,
    character_lines,
    spatial_indicators,
    
@output:
    .ann file with the same name
    
"""

import argparse
from sklearn.externals import joblib
import ner
import spacy


def annotate(text, model, character_lut, saywords_lut):
    """
        Annotate the entities found in `text`.

        NOTE: this is still a stub. It only runs the raw text through
        the spaCy 'en' pipeline; `model`, `character_lut` and
        `saywords_lut` are accepted but not used yet, and nothing is
        returned (the caller receives None).
    """

    # Load the spaCy pipeline and build the parsed document
    pipeline = spacy.load('en')
    parsed_doc = pipeline(text)

    # TODO: parse using the look-up tables
    #   *- Characters
    
    
    
    
    


def read_nonblank_lines(path):
    """
        Read the text file at `path` and return its non-blank lines.

        The content is split on newline characters; lines that are
        empty or contain only whitespace are dropped. The remaining
        lines are returned unmodified (they are not stripped).
    """
    with open(path) as f:
        return [s for s in f.read().split('\n') if s.strip() != '']


def parse_character_lut(lines):
    """
        Build the character look-up table from `lines`.

        Each line has the form `name:attr1,attr2,...`. Whitespace
        around the name and around every attribute is ignored. An
        attribute containing 'male' (case-insensitive, so 'female'
        matches as well) is stored under 'gender'; an attribute equal to
        'young' or 'old' (case-insensitive) is stored under 'age'. Any
        other attribute is ignored. A line without ':' is treated as a
        character with no known attributes.

        Returns a dict mapping each character name to a dict holding
        the optional 'gender' and 'age' keys.
    """
    known_ages = ('young', 'old')

    lut = {}  # Stores character attributes indexed by name
    for line in lines:
        # partition() tolerates lines with no ':' (name only) and
        # only splits on the first ':' if there are several.
        name, _, attributes = line.partition(':')

        entry = {}
        for attribute in attributes.split(','):
            attribute = attribute.strip()
            lowered = attribute.lower()
            if 'male' in lowered:
                entry['gender'] = attribute
            elif lowered in known_ages:
                entry['age'] = attribute

        lut[name.strip()] = entry

    return lut


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('model_path', help='.pkl file containing model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only pass model files from a trusted source.
    model = joblib.load(args.model_path)

    # Load saywords (falls back to 'saywords.txt' in the working directory)
    saylut = read_nonblank_lines(args.say_lut or 'saywords.txt')

    # Load places LUT (falls back to 'places.txt' in the working directory)
    placelut = read_nonblank_lines(args.place_lut or 'places.txt')

    # Load character LUT, one character per line
    # (falls back to 'characters.txt' in the working directory)
    charlist = read_nonblank_lines(args.char_lut or 'characters.txt')
    character_lut = parse_character_lut(charlist)

    annotation_dict = annotate(text, model, character_lut, saylut)