# Mercurial > hg > from-my-pen-to-your-ears-supplementary-material

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character_lines,
    spatial_indicators

@output:
    .ann file with the same name

"""

import argparse
from sklearn.externals import joblib
import ner
import spacy


def annotate(text, model, character_lut, saywords_lut):
    """
        Annotate the entities found in "text".

        At present this only runs the text through the spaCy 'en'
        pipeline; "model", "character_lut" and "saywords_lut" are
        accepted but not used yet, and nothing is returned.
    """

    # Load the pipeline and turn the raw text into a document in one go
    doc = spacy.load('en')(text)

    # Parse using LUTs

    # *- Characters


def read_lut_lines(path):
    """
        Return the non-blank lines of the text file at "path", in order.

        Lines are returned as written (only the newline separator is
        removed); lines that are empty or whitespace-only are skipped.
    """
    with open(path) as f:
        return [s for s in f.read().split('\n') if s.strip()]


def parse_character_lut(lines):
    """
        Build the character look-up table from "name:attr,attr,..." lines.

        Each line is split at its first ':'; a line without ':' yields a
        character with no attributes instead of raising. Whitespace around
        the name and around every attribute is ignored, so
        "Alice: female, old" is read the same way as "Alice:female,old".

        An attribute containing "male" (case-insensitive, so "female"
        matches as well) is stored under 'gender'; an attribute equal to
        "young" or "old" (case-insensitive) is stored under 'age'. Values
        keep the casing used in the file. If several attributes match the
        same key, the last one wins.

        Returns a dict indexed by character name whose values are dicts
        holding the optional 'gender' and 'age' keys.
    """
    character_lut = {}  # Stores character attributes indexed by name

    for line in lines:
        name, _, attributes = line.partition(':')

        gender = None
        age = None

        for attribute in attributes.split(','):
            # Strip first: without it " old" never matched the age list
            # and genders were stored with their leading blank.
            attribute = attribute.strip()
            lowered = attribute.lower()

            if 'male' in lowered:
                gender = attribute
            elif lowered in ('young', 'old'):
                age = attribute

        entry = {}
        if gender:
            entry['gender'] = gender
        if age:
            entry['age'] = age
        character_lut[name.strip()] = entry

    return character_lut


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('model_path', help='.pkl file containing model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE(security): joblib.load unpickles the file and can therefore run
    # arbitrary code; only pass model files that come from a trusted source.
    model = joblib.load(args.model_path)

    # Load the look-up tables, falling back to the default file names when
    # the corresponding option is missing (or given as an empty string).
    saylut = read_lut_lines(args.say_lut or 'saywords.txt')

    # The places LUT is loaded but not handed to annotate() yet.
    placelut = read_lut_lines(args.place_lut or 'places.txt')

    # One character per line
    charlist = read_lut_lines(args.char_lut or 'characters.txt')
    character_lut = parse_character_lut(charlist)

    annotation_dict = annotate(text, model, character_lut, saylut)
# author: Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
# date: Wed, 16 May 2018 17:56:10 +0100
# parents:
# children: