e@0
|
1 #!/usr/bin/env python3
|
e@0
|
2 # -*- coding: utf-8 -*-
|
e@0
|
3 """
|
e@0
|
4 Created on Sat Apr 28 14:17:15 2018
|
e@0
|
5
|
e@0
|
6 @author: Emmanouil Theofanis Chourdakis
|
e@0
|
7
|
e@0
|
8 Takes a .txt story and annotates it based on:
|
e@0
|
9
|
e@0
|
10 characters,
|
e@0
|
11 places,
|
e@0
|
12 saywords,
|
e@0
|
13 character_lines,
|
e@0
|
14 spatial_indicators,
|
e@0
|
15
|
e@0
|
16 @output:
|
e@0
|
17 .ann file with the same name
|
e@0
|
18
|
e@0
|
19 """
|
e@0
|
20
|
e@0
|
# Change path to the script's directory so the default LUT files bundled
# next to this script are found regardless of the caller's working directory.
import os
os.chdir(os.path.dirname(os.path.realpath(__file__)))

import argparse
import re
import logging
import json

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone joblib package and fall back to the old
# bundled location for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import ner
import spacy

logging.basicConfig(level=logging.INFO)
|
e@0
|
34
|
e@0
|
35
|
e@0
|
def quotes2dict(text):
    """Replace each double-quoted span in *text* with a numbered tag.

    Scans *text* for pairs of double quotes.  The n-th quote (contents
    without the surrounding quotes) is stored in a dictionary under the
    key ``"<cline{n}>."`` and the quoted span in the text is replaced by
    that tag.

    Returns:
        (new_text, quote_dict): the text with quotes swapped for tags and
        the mapping from tag to original quote contents.
    """
    new_text = text
    is_open = False    # True while scanning inside an open quote

    quote_no = 0
    quote = []         # characters of the quote currently being scanned
    quote_dict = {}

    for c in text:
        if c == '"' and not is_open:
            is_open = True
            continue
        elif c == '"' and is_open:
            is_open = False
            tag = "<cline{}>.".format(quote_no)
            quote_dict[tag] = ''.join(quote)
            # Replace only the FIRST remaining occurrence.  Without the
            # count argument a repeated identical quote would have every
            # occurrence collapsed onto the first tag, and later tags
            # would exist in quote_dict but never appear in the text.
            new_text = new_text.replace('"' + ''.join(quote) + '"', tag, 1)
            quote = []
            quote_no += 1
            continue

        if is_open:
            quote.append(c)

    return new_text, quote_dict
|
e@0
|
60
|
e@0
|
61
|
e@0
|
62
|
e@0
|
63
|
e@0
|
def annotate_entities(text,
                      model,
                      character_lut,
                      saywords_lut,
                      spind_lut,
                      places_lut):
    """Label entity tokens in *text* using the LUTs and the NER model.

    Tokens are labelled first from the provided look-up tables (LUTs) and
    then, for every token still unlabelled ('O'), from the trained NER
    ``model``.

    Args:
        text: raw story text.
        model: trained NER model exposing ``predict(features)``.
        character_lut: iterable of known character names.
        saywords_lut: iterable of "say" verbs, matched by lemma.
        spind_lut: iterable of known spatial indicators.
        places_lut: iterable of known place names.

    Returns:
        (mDoc, quotes): a ner.Document with labelled tokens, and the
        mapping of character-line tags to the original quoted text.
    """

    # Find and store character lines in a dictionary
    logging.info('Swapping character lines for character line tags')
    processed_text, quotes = quotes2dict(text)

    # Create the nlp engine
    logging.info("Loading 'en' spacy model")
    nlp = spacy.load('en')

    # Parse to spacy document
    logging.info("Parsing document to spacy")
    doc = nlp(processed_text)

    # Parse to our custom Document object
    logging.info("Parsing document to our object format for Named Entity Recognition")
    mDoc = ner.Document(doc)

    # Label <cline[0-9]+>. tags as character lines
    logging.info("Labeling character lines")
    for match in re.finditer(r'<cline[0-9]+>\.', mDoc.text):
        start, end = match.span()
        mDoc.assign_label_to_tokens(start, end, 'Character_Line')

    def _label_from_lut(lut, label):
        """Label every literal occurrence of each LUT entry with *label*.

        Entries are sorted by word count so that longer entries are
        labelled last and therefore override shorter, contained ones
        (e.g. `an old man' wins over `man').  ``re.escape`` keeps entries
        containing regex metacharacters from being misread as patterns,
        and IGNORECASE lets the lower-cased entries match capitalised
        occurrences in the text.
        """
        entries = [e.lower() for e in sorted(lut, key=lambda x: len(x.split()))]
        for entry in entries:
            for match in re.finditer(re.escape(entry), mDoc.text, re.IGNORECASE):
                start, end = match.span()
                mDoc.assign_label_to_tokens(start, end, label)

    # *- Characters
    logging.info("Labeling characters from LUT")
    _label_from_lut(character_lut, 'Character')

    # *- Saywords: single-token entries, matched against the lemma rather
    # than the surface form.
    logging.info("Labeling saywords from LUT")
    for lemma in [nlp(sw)[0].lemma_ for sw in saywords_lut]:
        mDoc.assign_label_to_tokens_by_matching_lemma(lemma, 'Says')

    # *- Places
    logging.info("Labeling places from LUT")
    _label_from_lut(places_lut, 'Place')

    # *- Spatial indicators
    logging.info("Labeling spatial indicators from LUT")
    _label_from_lut(spind_lut, 'Spatial_Signal')

    logging.info("Extracting token features")
    features, labels = mDoc.get_token_features_labels()

    logging.info("Predicting labels")
    new_labels = model.predict(features)

    # If a label is not already assigned by a LUT, assign it using the model
    logging.info("Assigning labels based on the NER model")
    for m, sent in enumerate(mDoc.token_sentences):
        for n, token in enumerate(sent):
            if token.label == 'O':
                token.label = new_labels[m][n]

    return mDoc, quotes
|
e@0
|
163
|
e@0
|
164
|
e@0
|
def doc2brat(mDoc):
    """Return a brat .ann file str based on *mDoc*.

    Each labelled entity becomes one line of the form:
        T<n>\t<Label> <start> <end>\t<entity surface text>
    """

    # Variable generator for entities (T in brat format)
    tvar = ner.var_generator('T')

    labels = ['Character', 'Says', 'Place', 'Spatial_Signal', 'Character_Line']

    # Collect lines in a list and join once at the end; repeated string
    # concatenation in a loop is quadratic in the number of entities.
    lines = []
    for label in labels:
        token_sentences = mDoc.get_tokens_with_label(label)
        for tlist in token_sentences:
            for tokens in tlist:
                start = tokens[0].start
                end = tokens[-1].end
                txt = mDoc.text[start:end]
                lines.append("{}\t{} {} {}\t{}\n".format(
                    next(tvar), label, start, end, txt))

    return ''.join(lines)
|
e@0
|
191
|
e@0
|
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('ner_model_path', help='.pkl file containing NER model')
    argparser.add_argument('rel_model_path', help='.pkl file containing relational model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')
    argparser.add_argument('--spatial-indicator-lut', help='.txt file with known spatial indicators')
    # store_true so the flag can be given bare (`--force`) instead of
    # requiring a dummy value.
    argparser.add_argument('--force', action='store_true',
                           help='force overwrite when there is a file to be overwritten')

    args = argparser.parse_args()

    # Load text file
    with open(args.input_path) as f:
        text = f.read()

    # Derive output paths from the input path, stripping whatever extension
    # it has rather than assuming it is exactly four characters long.
    base, _ = os.path.splitext(args.input_path)
    output_text_path = base + '_processed.txt'
    output_quotes_path = base + '_quotes.json'
    output_annotation_path = base + '_processed.ann'

    # Load NER model file
    ner_model = joblib.load(args.ner_model_path)

    # Load REL model file (loaded up-front to fail fast; not used further
    # in this script)
    rel_model = joblib.load(args.rel_model_path)

    def read_lut(path):
        """Return the non-empty lines of *path* as a list of strings."""
        with open(path) as f:
            return [s for s in f.read().split('\n') if s.strip() != '']

    # Load the LUTs, falling back to the defaults bundled next to this
    # script (cwd was changed to the script directory at import time).
    saylut = read_lut(args.say_lut or 'saywords.txt')
    placelut = read_lut(args.place_lut or 'places.txt')
    spatial_indicator_lut = read_lut(args.spatial_indicator_lut or 'spatial_indicators.txt')
    charlist = read_lut(args.char_lut or 'characters.txt')  # One character per line

    # Parse "name: attr1, attr2" character lines into a LUT of attribute
    # dicts indexed by name.
    character_lut = {}
    for line in charlist:
        # maxsplit=1 so attribute values may themselves contain ':'
        name, attributes = line.split(':', 1)

        gender = None
        age = None

        for a in attributes.split(','):
            if 'male' in a:  # matches both 'male' and 'female'
                gender = a
            elif a.lower() in ['young', 'old']:
                age = a

        character_lut[name] = {}
        if gender:
            character_lut[name]['gender'] = gender
        if age:
            character_lut[name]['age'] = age

    mDoc, quotes = annotate_entities(text, ner_model, character_lut, saylut,
                                     spatial_indicator_lut, placelut)

    annotation_text = doc2brat(mDoc)

    to_save = {
        output_text_path: mDoc.text,
        output_quotes_path: json.dumps(quotes),
        output_annotation_path: annotation_text,
    }

    for path, contents in to_save.items():
        if not os.path.exists(path) or args.force:
            with open(path, 'w') as f:
                f.write(contents)
        else:
            overwrite = input('Path {} exists, overwrite? (y/N) '.format(path))
            # strip()/startswith guards against an empty answer (bare
            # enter), which would crash the original `overwrite[0]` check.
            if overwrite.strip().lower().startswith('y'):
                with open(path, 'w') as f:
                    f.write(contents)
|
e@0
|
296
|
e@0
|
297
|
e@0
|
298
|
e@0
|
299
|