diff demo/text2ann.py~ @ 0:4dad87badb0c

initial commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 17:56:10 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/demo/text2ann.py~	Wed May 16 17:56:10 2018 +0100
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Apr 28 14:17:15 2018
+
+@author: Emmanouil Theofanis Chourdakis
+
+Takes a .txt story and annotates it based on:
+    
+    characters, 
+    places,
+    saywords,
+    character_lines,
+    spatial_indicators,
+    
+@output:
+    .ann file with the same name
+    
+"""
+
+import argparse
+from sklearn.externals import joblib
+import ner
+import spacy
+
+
+def annotate(text, model, character_lut, saywords_lut):
+    """
+        Parse "text" with spaCy as the first step of entity annotation.
+
+        Work in progress: "model", "character_lut" and "saywords_lut" are
+        accepted but not used yet, and nothing is returned (None).
+    """
+
+    # Build the spaCy document with the 'en' pipeline (loaded on every call)
+    parser = spacy.load('en')
+    parsed = parser(text)
+
+    # TODO: parse using the LUTs
+
+    # TODO: *- Characters
+
+
+
+
+
+if __name__=="__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('input_path', help='.txt file to parse')
+    argparser.add_argument('model_path', help='.pkl file containing model')
+    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
+    argparser.add_argument('--char-lut', help='.txt file with known characters')
+    argparser.add_argument('--place-lut', help='.txt file with known places')
+
+    args = argparser.parse_args()
+
+    # Load text file
+    with open(args.input_path) as f:
+        text = f.read()
+
+    # Load model file.
+    # NOTE(review): joblib.load unpickles the file, which can execute
+    # arbitrary code; only pass model files from a trusted source.
+    model = joblib.load(args.model_path)
+
+    # Each LUT path falls back to a default file in the working directory
+    # when its option is missing (or empty).
+    saylut_path = args.say_lut or 'saywords.txt'
+    placelut_path = args.place_lut or 'places.txt'
+    charlut_path = args.char_lut or 'characters.txt'
+
+    # Load saywords, one per non-blank line
+    with open(saylut_path) as f:
+        saylut = [s for s in f.read().split('\n') if s.strip()]
+
+    # Load places LUT, one per non-blank line.
+    # NOTE: placelut is loaded but not passed on, because annotate() has no
+    # parameter for it yet.
+    with open(placelut_path) as f:
+        placelut = [s for s in f.read().split('\n') if s.strip()]
+
+    # Load character LUT: one "name:attr,attr,..." entry per non-blank line
+    with open(charlut_path) as f:
+        charlist = [s for s in f.read().split('\n') if s.strip()]
+
+    character_lut = {} # Stores character attributes indexed by name
+    for l in charlist:
+        # partition() tolerates entries with no ':' (no attributes) or with
+        # more than one ':', where unpacking split(':') raised ValueError.
+        name, _, attributes = l.partition(':')
+        name = name.strip()
+
+        gender = None
+        age = None
+
+        for a in attributes.split(','):
+            # Strip padding so "Name: female, old" is recognised; without
+            # it ' old' never matched 'old' and gender kept a leading space.
+            a = a.strip()
+            if 'male' in a.lower(): # also matches 'female'
+                gender = a
+            elif a.lower() in ['young', 'old']:
+                age = a
+
+        character_lut[name] = {}
+        if gender:
+            character_lut[name]['gender'] = gender
+        if age:
+            character_lut[name]['age'] = age
+
+    annotation_dict = annotate(text, model, character_lut, saylut)
+    
+        
+        
+        
+    
\ No newline at end of file