#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it with:

    characters,
    places,
    saywords,
    character lines,
    spatial indicators.

@output:
    .ann file with the same name
    .json file with the extracted character lines

"""
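
# Example invocation (a sketch; the script name "annotate.py" and the model
# file names are placeholders, not artefacts shipped with this file):
#
#   python annotate.py story.txt ner_model.pkl rel_model.pkl \
#       --say-lut saywords.txt --char-lut characters.txt \
#       --place-lut places.txt --spatial-indicator-lut spatial_indicators.txt
#
# For story.txt this writes story_processed.txt, story_processed.ann and
# story_quotes.json next to the input file (see __main__ below).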

import os
import argparse
# Note: sklearn.externals.joblib was removed in scikit-learn 0.23; this script
# targets the older API. On newer scikit-learn, use `import joblib` instead.
from sklearn.externals import joblib
import ner
import spacy
import re
import logging
import json
from difflib import SequenceMatcher
from neuralcoref import Coref
from rel import *

def pronoun2gender(word):
    """ Maps a third-person pronoun to a gender label. """
    mapping = {
        'he': 'Male',
        'him': 'Male',
        'she': 'Female',
        'her': 'Female',
        'his': 'Male',
        'hers': 'Female',
        'himself': 'Male',
        'herself': 'Female',
    }

    # Anything that is not a gendered pronoun counts as neutral.
    return mapping.get(word, 'neutral')


logging.basicConfig(level=logging.INFO)


# Given an iterable of (key, value) pairs, return the key corresponding to the greatest value
def argmax(pairs):
    # https://stackoverflow.com/questions/5098580/implementing-argmax-in-python
    return max(pairs, key=lambda x: x[1])[0]


# Given an iterable of values, return the index of the greatest value
def argmax_index(values):
    return argmax(enumerate(values))


# Given an iterable of keys and a function f, return the key with the largest f(key)
def argmax_f(keys, f):
    return max(keys, key=f)

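# For instance (illustrative values):
#   argmax([('a', 1), ('b', 3)])  ->  'b'
#   argmax_index([0.1, 0.9, 0.4]) ->  1
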
def similar(a, b):
    """ Returns the string similarity between a and b, in [0, 1]. """
    # https://stackoverflow.com/questions/17388213/find-the-similarity-metric-between-two-strings
    return SequenceMatcher(None, a, b).ratio()


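# For instance, similar('Alice', 'alice') == 0.8: SequenceMatcher scores
# 2 * (matched characters) / (total characters) = 2 * 4 / 10 here.
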
def get_resolved_clusters(coref):
    """ Takes a Coref object (from neuralcoref) and
        returns its coreference clusters as lists of mentions """

    mentions = coref.get_mentions()
    clusters = coref.get_clusters()[0]
    result = []
    for c in clusters:
        result.append([mentions[r] for r in clusters[c]])
    return result

def cluster_word(word, clusters):
    """ Takes a word and a list of clusters of mentions
        and figures out which cluster the word matches best based on
        string similarity. Returns the cluster index, or -1 if nothing
        matches well enough. """

    if not clusters:
        return -1

    similarities = []
    for rc in clusters:
        similarity = [similar(word.lower(), c.text.lower()) for c in rc]
        similarities.append(similarity)
    max_similarities = [max(s) for s in similarities]
    if max(max_similarities) > 0.75:
        return argmax_index(max_similarities)
    else:
        return -1

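# For instance, with clusters built from mentions of "Alice" and "the wolf"
# (illustrative names), cluster_word('Alice', clusters) would return the index
# of the "Alice" cluster, while cluster_word('Gretel', clusters) returns -1.
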
def quotes2dict(text):
    """ Replaces every double-quoted span in text with a <clineN>. tag and
        returns the new text together with a dict mapping tags to the
        original narrator/quote strings. """
    new_text = text
    is_open = False

    quote_no = 0
    quote = []
    narrator = []
    quote_dict = {}

    for n, c in enumerate(text):
        if c == '"' and not is_open:
            # Opening quote: flush the narrator text gathered so far.
            is_open = True
            quote_dict["<nline{}>.".format(quote_no)] = ''.join(narrator)
            narrator = []
            quote_no += 1
            continue

        elif c == '"' and is_open:
            # Closing quote: store the line and swap it for a tag in the text.
            # Replace only the first occurrence, so that repeated identical
            # quotes each get their own tag.
            is_open = False
            quote_dict["<cline{}>.".format(quote_no)] = ''.join(quote)
            new_text = new_text.replace('"' + ''.join(quote) + '"',
                                        "<cline{}>.".format(quote_no), 1)
            quote = []
            quote_no += 1
            continue

        if is_open:
            quote.append(c)
        else:
            narrator.append(c)

    return new_text, quote_dict

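# A worked example:
#
#   quotes2dict('He said "hi" and left.')
#
# returns
#
#   ('He said <cline1>. and left.',
#    {'<nline0>.': 'He said ', '<cline1>.': 'hi'})
#
# Note that narrator text after the last quote stays in the returned text
# but gets no <nline> entry of its own.
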
def figure_gender(word, clusters, character_lut):
    """ Figures out the gender of a character mention, first from the
        character LUT and then from the pronouns in its coreference cluster. """
    for c in character_lut:
        # Check whether a known character name appears in the mention string
        if c.lower() in word.lower() and character_lut[c].get('gender') in ['Male', 'Female']:
            return character_lut[c]['gender']

    cluster_idx = cluster_word(word, clusters)
    if cluster_idx == -1:
        return 'neutral'
    genders = [pronoun2gender(c.text) for c in clusters[cluster_idx]]
    if 'Male' in genders and 'Female' not in genders:
        return 'Male'
    if 'Female' in genders and 'Male' not in genders:
        return 'Female'
    return 'neutral'

def annotate(text,
             ner_model,
             rel_model,
             character_lut,
             saywords_lut,
             spind_lut,
             places_lut,
             do_coreference_resolution=True):
    """
    Annotates the entities in text using the lookup tables provided
    and the NER model in ner_model.

    returns: a ner.Document object with tokens labelled via the LUTs
             and the NER model, plus the dict of extracted quotes
    """

    # Find and store character lines in a dictionary
    logging.info('Swapping character lines for character line tags')
    processed_text, quotes = quotes2dict(text)

    # Create a spacy document object from the resulting text.
    # Note: spacy.load('en') uses the old-style 'en' shortcut link
    # (spaCy 1.x/2.x era, matching the neuralcoref Coref API used below).
    logging.info("Loading 'en' spacy model")
    nlp = spacy.load('en')

    # Load the coreference model
    coref = Coref()

    # Do coreference resolution
    if do_coreference_resolution:
        logging.info("Doing one-shot coreference resolution (this might take some time)")
        coref.one_shot_coref(processed_text)
        resolved_clusters = get_resolved_clusters(coref)
        processed_text = coref.get_resolved_utterances()[0]

    # Parse to a spacy document
    logging.info("Parsing document to spacy")
    doc = nlp(processed_text)

    # Parse to our custom Document object
    logging.info("Parsing document to our object format for Named Entity Recognition")
    mDoc = ner.Document(doc)

    # Label every <clineN>. tag as a character line
    logging.info("Labeling character lines")
    spans = [r.span() for r in re.finditer(r'<cline[0-9]+>\.', mDoc.text)]
    for span in spans:
        mDoc.assign_label_to_tokens(span[0], span[1], 'Character_Line')

    # Parse using LUTs

    # *- Characters

    # Sort by number of words so that entries with more words override
    # entries with fewer words during labelling. For example, if you have
    # `man' and `an old man' as characters, the character labelled is going
    # to be `an old man' and not the included `man'.
    logging.info("Labeling characters from LUT")
    cLUT = [c.lower() for c in sorted(character_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a character in cLUT.
    # Escape each entry (names are literals, not regexes) and ignore case,
    # since the LUT entries were lowercased above.
    for c in cLUT:
        spans = [r.span() for r in re.finditer(re.escape(c), mDoc.text, flags=re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Character')

    # *- Saywords

    # Assign labels to saywords. Here saywords contain only one token; in
    # addition, we check against each sayword's lemma and not the sayword itself.
    logging.info("Labeling saywords from LUT")
    swLUT = [nlp(sw)[0].lemma_ for sw in saywords_lut]
    for sw in swLUT:
        mDoc.assign_label_to_tokens_by_matching_lemma(sw, 'Says')

    # *- Places
    logging.info("Labeling places from LUT")
    plLUT = [pl.lower() for pl in sorted(places_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a place in plLUT
    for pl in plLUT:
        spans = [r.span() for r in re.finditer(re.escape(pl), mDoc.text, flags=re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Place')

    # *- Spatial indicators
    logging.info("Labeling spatial indicators from LUT")
    spLUT = [sp.lower() for sp in sorted(spind_lut, key=lambda x: len(x.split()))]
    for sp in spLUT:
        spans = [r.span() for r in re.finditer(re.escape(sp), mDoc.text, flags=re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Spatial_Signal')

    logging.info("Extracting token features")
    features, labels = mDoc.get_token_features_labels()

    logging.info("Predicting labels")
    new_labels = ner_model.predict(features)

    logging.info("Assigning labels based on the NER model")
    # If a label was not already assigned by a LUT, assign it using the model
    for m, sent in enumerate(mDoc.token_sentences):
        for n, token in enumerate(sent):
            if token.label == 'O':
                token.label = new_labels[m][n]

    # Assign character genders
    if do_coreference_resolution:
        logging.info('Figuring out character genders')
        character_tok_sent = mDoc.get_tokens_with_label('Character')
        for sent in character_tok_sent:
            for character in sent:
                raw_string = " ".join([c.text for c in character])
                gender = figure_gender(raw_string, resolved_clusters, character_lut)
                for tok in character:
                    if gender in ['Male', 'Female']:
                        tok.set_attribute('gender', gender)

    logging.info('Predicting the correct label for all possible relations in Document')
    mDoc.predict_relations(rel_model)

    return mDoc, quotes


def doc2brat(mDoc):
    """ Returns a brat .ann file str based on mDoc """

    # Dictionary that maps text span -> variable (to be used when
    # adding relations)
    span2var = {}

    # Variable generator for entities (T in brat format)
    tvar = ner.var_generator('T')

    # Variable generator for relations, encoded as brat events (E),
    # i.e. a trigger plus its arguments
    rvar = ner.var_generator('E')

    # Variable generator for attributes (A in brat format)
    avar = ner.var_generator('A')

    ann_str = ""
    # Extract entities in the format
    # T1	Character START END	character string

    labels = ['Character', 'Says', 'Place', 'Spatial_Signal', 'Character_Line']

    for label in labels:
        token_sentences = mDoc.get_tokens_with_label(label)
        for tlist in token_sentences:
            if len(tlist) == 0:
                continue

            for tokens in tlist:
                start = tokens[0].start
                end = tokens[-1].end
                txt = mDoc.text[start:end]
                var = next(tvar)
                ann_str += "{}\t{} {} {}\t{}\n".format(var, label, start, end, txt)
                if 'gender' in tokens[0].attributes:
                    ann_str += "{}\t{} {} {}\n".format(next(avar), 'Gender', var, tokens[0].attributes['gender'])

                span2var[(start, end)] = var

    # Map relations
    for r in mDoc.relations:
        var = next(rvar)
        trigger = r.trigger
        trigger_label = trigger[0].label[2:]  # strip the 2-char prefix (e.g. a BIO 'B-'/'I-' tag)
        trigger_start = trigger[0].start
        trigger_end = trigger[-1].end
        trigger_var = span2var[(trigger_start, trigger_end)]

        # If a trigger is a Spatial_Signal then the
        # arguments are a Trajector and a Landmark
        if trigger_label == 'Spatial_Signal':
            arg1_label = 'Trajector'
            arg2_label = 'Landmark'

        # If a trigger is Says then the
        # arguments are WHO and WHAT
        elif trigger_label == 'Says':
            arg1_label = 'WHO'
            arg2_label = 'WHAT'

        # Skip relations whose trigger type we do not know how to encode;
        # otherwise arg1_label/arg2_label below would be undefined or stale.
        else:
            continue

        # Span for the first argument
        arg1_start = r.arg1[0].start
        arg1_end = r.arg1[-1].end

        # Variable for the first argument
        arg1_var = span2var[(arg1_start, arg1_end)]

        # Span for the second argument
        arg2_start = r.arg2[0].start
        arg2_end = r.arg2[-1].end

        # Variable for the second argument
        arg2_var = span2var[(arg2_start, arg2_end)]

        annot_line = "{}\t{}:{} {}:{} {}:{}\n".format(var,
                                                      trigger_label,
                                                      trigger_var,
                                                      arg1_label,
                                                      arg1_var,
                                                      arg2_label,
                                                      arg2_var)

        ann_str += annot_line

    return ann_str

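# A sketch of the .ann output produced above (offsets and names are
# illustrative): one T line per entity, one A line per gender attribute,
# one E line per relation, tab-separated as in the format strings above.
#
#   T1	Character 0 5	Alice
#   A1	Gender T1 Female
#   T2	Says 6 10	said
#   T3	Character_Line 11 20	<cline1>.
#   E1	Says:T2 WHO:T1 WHAT:T3
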
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('ner_model_path', help='.pkl file containing the NER model')
    argparser.add_argument('rel_model_path', help='.pkl file containing the relational model')
    argparser.add_argument('--say-lut', help='.txt file with a list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')
    argparser.add_argument('--spatial-indicator-lut', help='.txt file with known spatial indicators')
    argparser.add_argument('--force', action='store_true', help='overwrite existing output files without asking')
    argparser.add_argument('--no-coreference-resolution', action='store_true', help='omit the coreference resolution step')

    args = argparser.parse_args()

    # Load the text file, collapsing all whitespace to single spaces
    with open(args.input_path) as f:
        text = " ".join(f.read().split())

    # Derive output paths from the input path (drop the extension rather
    # than assuming it is exactly four characters long)
    base, _ = os.path.splitext(args.input_path)
    output_text_path = base + '_processed.txt'
    output_quotes_path = base + '_quotes.json'
    output_annotation_path = base + '_processed.ann'

    # Load the NER model file
    ner_model = joblib.load(args.ner_model_path)

    # Load the REL model file
    rel_model = joblib.load(args.rel_model_path)

    # Load saywords (one per line)
    saylut_path = args.say_lut if args.say_lut else 'saywords.txt'

    with open(saylut_path) as f:
        saylut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the places LUT (one per line)
    placelut_path = args.place_lut if args.place_lut else 'places.txt'

    with open(placelut_path) as f:
        placelut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the spatial indicators LUT (one per line)
    spatial_indicator_lut_path = args.spatial_indicator_lut if args.spatial_indicator_lut else 'spatial_indicators.txt'

    with open(spatial_indicator_lut_path) as f:
        spatial_indicator_lut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the character LUT
    charlut_path = args.char_lut if args.char_lut else 'characters.txt'

    with open(charlut_path) as f:
        charlist = [s for s in f.read().split('\n') if s.strip() != '']  # One character per line

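    # Each line of the character LUT is assumed to look like, e.g.
    # (illustrative names):
    #
    #   Alice: Female, young
    #   Bob: Male
    #
    # i.e. "name: attribute, attribute, ..." where the attributes may include
    # a gender (Male/Female) and an age (young/old).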
    character_lut = {}  # Stores character attributes indexed by name
    for l in charlist:
        name, attributes = l.split(':', 1)  # split once; names must not contain ':'
        name = name.strip()

        gender = None
        age = None

        for a in attributes.split(','):
            a = a.strip()
            # 'male' is a substring of 'female', so compare the whole
            # (lowercased) token rather than testing for a substring
            if a.lower() in ['male', 'female']:
                gender = a.capitalize()
            elif a.lower() in ['young', 'old']:
                age = a

        character_lut[name] = {}
        if gender:
            character_lut[name]['gender'] = gender
        if age:
            character_lut[name]['age'] = age

    corefres = not args.no_coreference_resolution
    mDoc, quotes = annotate(text, ner_model, rel_model, character_lut, saylut,
                            spatial_indicator_lut, placelut, corefres)

    annotation_text = doc2brat(mDoc)

    to_save = {
        output_text_path: mDoc.text,
        output_quotes_path: json.dumps(quotes),
        output_annotation_path: annotation_text,
    }

    for path in to_save:
        if not os.path.exists(path) or args.force:
            with open(path, 'w') as f:
                f.write(to_save[path])
        else:
            # Default to "no" on empty input rather than crashing on overwrite[0]
            overwrite = input('Path {} exists, overwrite? (y/N) '.format(path))
            if overwrite.strip().lower().startswith('y'):
                with open(path, 'w') as f:
                    f.write(to_save[path])