#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character_lines,
    spatial_indicators,

@output:
    .ann file with the same name
"""
e@0
|
20
|
e@0
|
21 import argparse
|
e@0
|
22 from sklearn.externals import joblib
|
e@0
|
23 import ner
|
e@0
|
24 import spacy
|
e@0
|
25
|
e@0
|
26
|
e@0
|
def annotate(text, model, character_lut, saywords_lut):
    """
    Annotate entities in a story text.

    As visible here this is an unfinished stub: it loads the spaCy 'en'
    pipeline and parses ``text`` with it, and nothing else. ``model``,
    ``character_lut`` and ``saywords_lut`` are accepted but not used yet,
    and the function falls off the end, so it returns ``None``.

    Parameters
    ----------
    text : str
        Raw story text to parse.
    model :
        Entity-recognition model (currently unused).
    character_lut : dict
        Character attributes indexed by name (currently unused).
    saywords_lut : list
        Known saywords (currently unused).
    """

    # Build the spaCy pipeline, then turn the raw text into a parsed document
    pipeline = spacy.load('en')
    parsed_doc = pipeline(text)

    # Parse using LUTs

    # *- Characters
|
e@0
|
40
|
e@0
|
41
|
e@0
|
42
|
e@0
|
43
|
e@0
|
44
|
e@0
|
45
|
e@0
|
46
|
e@0
|
def load_lut_lines(path):
    """
    Read a look-up-table file and return its non-blank lines.

    Lines are split on '\\n' and returned unmodified (surrounding
    whitespace is kept); lines that are empty or whitespace-only are
    dropped.
    """
    with open(path) as f:
        return [line for line in f.read().split('\n') if line.strip()]


def parse_character_lut(lines):
    """
    Build the character look-up table from ``name:attr1,attr2`` lines.

    Parameters
    ----------
    lines : iterable of str
        One character per line. The part before the first ':' is the
        name; the rest is a comma-separated attribute list. A line
        without ':' is a character with no attributes.

    Returns
    -------
    dict
        Maps each (whitespace-stripped) name to a dict that may hold a
        'gender' key (any attribute containing "male", so "female"
        matches too) and an 'age' key ("young" or "old"). Matching is
        case-insensitive; the stored value keeps its original case.
    """
    character_lut = {}  # Stores character attributes indexed by name
    for line in lines:
        # partition() never raises, unlike unpacking split(':'), which
        # fails on lines with no colon or with more than one.
        name, _, attributes = line.partition(':')
        entry = {}

        for raw_attribute in attributes.split(','):
            # Strip so that "female, young" is recognised as well as
            # "female,young".
            attribute = raw_attribute.strip()
            lowered = attribute.lower()
            if 'male' in lowered:
                entry['gender'] = attribute
            elif lowered in ('young', 'old'):
                entry['age'] = attribute

        character_lut[name.strip()] = entry

    return character_lut


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('model_path', help='.pkl file containing model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Load model file
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only pass model files from a trusted source.
    model = joblib.load(args.model_path)

    # Load the look-up tables, falling back to the default file names
    # when an option is missing or empty.
    saylut = load_lut_lines(args.say_lut or 'saywords.txt')

    # The places LUT is loaded but not passed on: annotate() has no
    # parameter for it.
    placelut = load_lut_lines(args.place_lut or 'places.txt')

    charlist = load_lut_lines(args.char_lut or 'characters.txt')  # One character per line
    character_lut = parse_character_lut(charlist)

    # No code that writes the result to an .ann file is visible here.
    annotation_dict = annotate(text, model, character_lut, saylut)
|
e@0
|
112
|
e@0
|
113
|
e@0
|
114
|
e@0
|
115
|
e@0
|
116 |