Mercurial > hg > chourdakisreiss2018smc
comparison demo/ner.py @ 0:90155bdd5dd6
first commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 18:27:05 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:90155bdd5dd6 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 """ | |
4 Created on Sun Apr 1 14:05:17 2018 | |
5 | |
6 @author: Emmanouil Theofanis Chourdakis | |
7 """ | |
8 | |
9 from pypeg2 import * | |
10 import re | |
11 | |
def var_generator(T):
    """Yield an endless sequence of variable names: T1, T2, T3, ..."""
    n = 1
    while True:
        yield "{}{}".format(T, n)
        n += 1
17 | |
18 | |
19 | |
def l_label_generator(T):
    """Yield an endless sequence of line labels: <TLINE1>, <TLINE2>, ..."""
    n = 1
    while True:
        yield "<{}LINE{}>".format(T, n)
        n += 1
26 | |
# Pattern pieces for parsing annotation files.  All patterns are raw
# strings so regex escapes such as "\;" are not mangled by Python string
# escaping (the originals raised invalid-escape-sequence warnings).
annot_var = re.compile(r"[A-Z][0-9]+")                      # e.g. "T1", "R12"
annot_pos = re.compile(r"[0-9]+ [0-9]+(\;[0-9]+ [0-9]+)*")  # "start end[;start end]*"
annot_label = re.compile(r'[A-Za-z0-9_]+')                  # bare label name
label_var_tuple = re.compile(r'[A-Za-z0-9_]+\:[A-Z][0-9]+') # "Label:T1"
31 | |
class AnnotationType(Keyword):
    """pypeg2 keyword: the closed set of entity/trigger annotation types."""
    grammar = Enum(K("Place"),
                   K("Character"),
                   K("Character_Line"),
                   K("Motion"),
                   K("Motion_Signal"),
                   K("Says"),
                   K("Spatial_Signal"))
40 | |
class AttributeType(Keyword):
    """pypeg2 keyword: attribute types attachable to an annotation."""
    grammar = Enum(K("Age"), K("Gender"))
43 | |
class AnnotationTuple:
    """One entity annotation line: VAR TYPE OFFSETS TEXT.

    Looks like brat-style stand-off format (e.g. "T1 Character 0 5 Alice")
    — TODO confirm against the files actually parsed.
    """
    grammar = attr('variable',annot_var),\
              attr('type',AnnotationType),\
              attr('idx',annot_pos),\
              attr('annotation',restline)
49 | |
50 | |
class AttributeTuple:
    """One attribute line: VAR TYPE TARGET_VAR rest-of-line."""
    grammar = attr('variable', annot_var),\
              attr('type',AttributeType), \
              attr('target', annot_var), \
              attr('annotation', restline)
56 | |
class VarArg:
    """A single LABEL:VAR argument of a relation line."""
    grammar = attr('label', annot_label), ':', attr('target', annot_var)
59 | |
class VarArgs(List):
    """One or more LABEL:VAR relation arguments."""
    grammar = some(VarArg)
62 | |
class RelationTuple:
    """One relation line: VAR followed by its LABEL:VAR arguments."""
    grammar = attr('variable', annot_var),\
              attr('args', VarArgs)
66 | |
class AnnotLine(List):
    """A single annotation-file line: entity, attribute, or relation."""
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]
69 | |
class AnnotationFile(List):
    """A whole annotation file: one or more annotation lines."""
    grammar = some(AnnotLine)
72 | |
def get_tokens_by_label(label, sent_tokens, sent_labels):
    """Group BIO-labelled tokens into contiguous chunks.

    Parameters
    ----------
    label : str
        Bare label name, e.g. 'Character' (matched as 'B-label'/'I-label').
    sent_tokens, sent_labels : equally long sequences
        Tokens and their BIO tags, in sentence order.

    Returns
    -------
    list of lists — one inner list of tokens per contiguous B-/I- run.
    """
    blabel = "B-{}".format(label)
    ilabel = 'I-{}'.format(label)

    tokens = []
    tok_ = []
    for tok, lab in zip(sent_tokens, sent_labels):
        if lab == blabel:
            # A new chunk starts; flush any chunk in progress.
            if tok_:
                tokens.append(tok_)
            tok_ = [tok]
        elif lab == ilabel:
            tok_.append(tok)
        else:
            # Outside any chunk; flush whatever was collected.
            if tok_:
                tokens.append(tok_)
            tok_ = []
    # Bug fix: a chunk that runs to the very end of the sentence was
    # silently dropped by the original (never flushed after the loop).
    if tok_:
        tokens.append(tok_)
    return tokens
92 | |
def get_token_head(span):
    """Return the syntactic head of *span*.

    Starting from the span's first token, follow dependency arcs upward
    until the next head falls outside the span (or the root, which points
    to itself, is reached).
    """
    inside = {t.i for t in span}
    node = span[0]
    while node.head.i in inside:
        if node == node.head:
            # Dependency root points at itself: nothing further up.
            return node
        node = node.head
    return node
103 | |
104 | |
def get_min_dep_path(a, b, doc, LCA):
    """Return the dependency path between tokens at indices *a* and *b*.

    Uses the lowest-common-ancestor matrix *LCA* (indexable as LCA[a, b]);
    returns "<UND>" when no common ancestor exists (LCA entry is -1).
    The path is encoded as "dir|dep" steps joined by "::", going up from
    *a* to the LCA and then down to *b*.
    """
    lca_idx = LCA[a, b]
    if lca_idx == -1:
        return "<UND>"

    ancestor = doc[LCA[a, b]]

    # Climb from token a up to the common ancestor.
    up_steps = []
    node = doc[a]
    while node != ancestor:
        if node.head is not None:
            up_steps.append(('up', node.dep_))
        node = node.head

    # Climb from token b, then reverse so the steps read top-down.
    down_steps = []
    node = doc[b]
    while node != ancestor:
        if node.head is not None:
            down_steps.append(('down', node.dep_))
        node = node.head
    down_steps.reverse()

    return "::".join("{}|{}".format(d, dep) for d, dep in up_steps + down_steps)
def get_dep_with_head(tok):
    """Walk *tok* up to its dependency root.

    Returns ``(dep, root_lemma)`` when the token is exactly one arc below
    the root, otherwise ``(None, root_lemma)``.
    """
    arcs = []
    node = tok
    while node.head != node:
        arcs.append(node.dep_)
        node = node.head

    if len(arcs) == 1:
        return arcs[0], node.lemma_
    return None, node.lemma_
145 | |
def var_generator(T):
    """Yield an endless sequence of variable names: T1, T2, T3, ..."""
    # NOTE(review): duplicate of the var_generator defined earlier in
    # this module; this later definition shadows the first at import time.
    counter = 0
    while True:
        counter += 1
        yield "{}{}".format(T, counter)
151 | |
def get_dep_with_head(tok):
    """Walk *tok* up to its dependency root.

    Returns ``(dep, root_lemma)`` when the token is exactly one arc below
    the root, otherwise ``(None, root_lemma)``.
    """
    # NOTE(review): duplicate of the get_dep_with_head defined earlier in
    # this module; this later definition shadows the first at import time.
    collected = []
    current = tok
    while current.head != current:
        collected.append(current.dep_)
        current = current.head

    if len(collected) != 1:
        return None, current.lemma_
    return collected[0], current.lemma_
162 | |
class Document:
    """Feature-extraction wrapper around a parsed spaCy ``Doc``.

    Keeps per-sentence lists of :class:`Token` wrappers (for sequence
    labelling), a list of :class:`Relation` objects (gold and later
    predicted), and the doc's lowest-common-ancestor matrix used for
    dependency-path features.
    """
    def __init__(self, doc):
        # ``doc`` is a spaCy Doc: provides .get_lca_matrix(), .text, .sents.
        self.doc = doc
        self.LCA = doc.get_lca_matrix()
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]

        self.tokens = []            # flat, document-ordered Token wrappers
        self.token_sentences = []   # one list of Token wrappers per sentence

        self.relations = []         # gold (and later predicted) relations

        # Wrap every spaCy token, remembering both its document index
        # (tok.i) and its position inside the sentence (n).
        for m, sent in enumerate(doc.sents):
            tlist = []
            for n, tok in enumerate(sent):
                token = Token(tok, doc, tok.i, sent, n)
                tlist.append(token)
            self.token_sentences.append(tlist)
            self.tokens += tlist

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """Wrap ``token`` in a :class:`Token` and append it to the flat list."""
        token = Token(token, doc, doc_idx, sent, sent_idx, label)
        self.tokens.append(token)

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a gold relation between token spans (lists of Token)."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return the tokens whose character span lies within [start, end]."""
        tokens = []
        for tok in self.tokens:
            if tok.start >= start and tok.end <= end:
                tokens.append(tok)

        return tokens

    def assign_label_to_tokens(self, start, end, label):
        """BIO-label every token inside the character span [start, end]."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            # First token of the span gets B-, the rest I-.
            if n == 0:
                IOB = 'B'
            else:
                IOB = 'I'

            token.set_label('{}-{}'.format(IOB, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Give a B- label to every token whose lemma equals ``lemma``."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Attach attribute ``label`` -> ``attribute`` to tokens in the span."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """Return per-sentence (feature dicts, BIO labels) for training."""
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                sentence_labels.append(token.label)

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_token_features_attributes(self, label):
        """Like get_token_features_labels, but the targets are the values
        of attribute ``label`` ('O' for tokens lacking that attribute)."""
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                if label in token.attributes:
                    sentence_labels.append(token.attributes[label])
                else:
                    sentence_labels.append('O')

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_gold_relation_feature_labels(self):
        """Return (feature dicts, labels) for the gold relations only."""
        features = []
        labels = []
        for r in self.relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels

    def get_candidate_relation_feature_labels(self):
        """Return (feature dicts, labels) over all candidate relations;
        candidates matching a gold relation carry its label, others 'NONE'."""
        features = []
        labels = []

        candidate_relations = self.get_candidate_relations()
        for r in candidate_relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels


    def get_tokens_with_label(self, label):
        """Group BIO-labelled tokens into chunks, one list per sentence.

        Each chunk is a list of Token wrappers covering one contiguous
        B-/I- run of ``label``.
        NOTE(review): a chunk that runs to the very end of a sentence is
        never flushed into ``tokens_`` (same pattern as the module-level
        get_tokens_by_label) — confirm whether that is intended.
        """

        blabel = "B-{}".format(label)
        ilabel = 'I-{}'.format(label)

        tokens = []

        for I in range(len(self.token_sentences)):
            tokens_ = []
            sent_tokens = self.token_sentences[I]
            sent_labels = [t.label for t in sent_tokens]

            tok_ = []
            for n,l in enumerate(sent_labels):
                if l == blabel:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = [sent_tokens[n]]
                elif l == ilabel:
                    tok_.append(sent_tokens[n])
                else:
                    if len(tok_)>0:
                        tokens_.append(tok_)
                    tok_ = []
            tokens.append(tokens_)

        return tokens

    def get_candidate_relations(self):
        """Enumerate candidate relations sentence by sentence.

        Pairs every Spatial_Signal chunk with each (Character, Place)
        pair in the same sentence, and every Says chunk with each
        (Character, Character_Line) pair.  Candidates equal to a gold
        relation inherit its label; the rest keep 'NONE'.
        """
        candidate_relations = []

        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        for I in range(len(spatial_signals)):
            for sp in spatial_signals[I]:
                for ch in characters[I]:
                    for pl in places[I]:
                        rel = Relation(ch, pl, sp, self.LCA)
                        candidate_relations.append(rel)

        for I in range(len(say_words)):
            for sw in say_words[I]:
                for ch in characters[I]:
                    for cl in character_lines[I]:
                        rel = Relation(ch, cl, sw, self.LCA)
                        candidate_relations.append(rel)

        # Copy gold labels onto matching candidates.
        for cr in candidate_relations:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidate_relations

    def predict_relations(self, model):
        """Classify every candidate relation with ``model`` (an object
        with a scikit-learn-style .predict) and append the non-'NONE'
        predictions to ``self.relations``."""
        relations = self.get_candidate_relations()

        for n, r in enumerate(relations):
            f = r.get_feature_vector()
            label = model.predict([f])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text
353 | |
class Relation:
    """A labelled (trigger, arg1, arg2) relation between token spans.

    ``arg1``/``arg2``/``trigger`` are lists of :class:`Token` wrappers;
    ``lca`` is the document's lowest-common-ancestor matrix (as produced
    by spaCy's ``Doc.get_lca_matrix``).
    """
    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        self.doc = trigger[0].doc   # doc shared by all three spans
        self.LCA = lca
        self.label = label

    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(self.label, self.trigger, self.arg1, self.arg2)

    def __eq__(self, other):
        # Two relations are equal when their spans agree token-for-token,
        # compared only over the length of the shorter of each span pair.
        return all([self.arg1[n].text == other.arg1[n].text for n in range(min(len(self.arg1), len(other.arg1)))]) \
            and all([self.arg2[n].text == other.arg2[n].text for n in range(min(len(self.arg2), len(other.arg2)))]) \
            and all([self.trigger[n].text == other.trigger[n].text for n in range(min(len(self.trigger), len(other.trigger)))])

    def get_feature_vector(self):
        """Build the relation-classifier feature dict.

        Keys are the feature numbers used throughout this module.
        """
        rf = {}

        # Syntactic heads of the three spans (underlying spaCy tokens).
        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')

        rf['10'] = arg1_type+ '::'+ arg2_type

        # Direction of each argument relative to the trigger.  Bug fix:
        # default both to 'same' so they are always bound — the original
        # raised UnboundLocalError when trigger.i == arg.i.
        arg1_direction = 'same'
        arg2_direction = 'same'
        if trigger.i < arg1.i:
            arg1_direction = 'right'
        if trigger.i > arg1.i:
            arg1_direction = 'left'

        if trigger.i < arg2.i:
            arg2_direction = 'right'
        if trigger.i > arg2.i:
            arg2_direction = 'left'

        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction+ '::'+ arg2_direction

        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']
        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']

        # RF15

        for i, token in enumerate([arg1, arg2]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # NOTE(review): arg1_type is stored for both i=0 and i=1 —
            # possibly meant to be the current argument's type; confirm.
            rf['9.{}'.format(i)] = arg1_type
            rf['17.{}'.format(i)] = get_min_dep_path(token.i, trigger.i, self.doc, self.LCA)
            # NOTE(review): '20' is overwritten each pass, so only the
            # arg2 path length survives; confirm that is intended.
            rf['20'] = len(rf['17.{}'.format(i)].split('::'))

            # NOTE(review): distance computed from arg1 on both passes —
            # possibly meant to use ``token``; confirm.
            rf['22.{}'.format(i)] = max(arg1.i, trigger.i) - min(arg1.i, trigger.i)

        rf['18'] = rf['17.0'] + '::' + rf['17.1']

        deppath = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['19'] = deppath
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf
428 | |
class Token:
    """Wrapper around a spaCy token.

    Carries the owning doc and sentence, both document- and
    sentence-level indices, a BIO label, and any classifier attributes
    (e.g. Age/Gender).
    """
    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx      # token index within the document
        self.sent = sent
        self.sent_idx = sent_idx    # token index within its sentence
        self.attributes = {}        # attribute name -> value

        self.label = label          # BIO label, e.g. 'B-Character'
        self.start = self.token.idx                     # char offset of first char
        self.end = self.token.idx + len(self.token)     # char offset one past last

    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)

    def set_label(self, label):
        """Replace this token's BIO label."""
        # print("Token {} label changed to {}".format(self.text, label))
        self.label = label

    def set_attribute(self, label, value):
        """Attach or overwrite attribute ``label`` with ``value``."""
        self.attributes[label] = value

    def get_feature_vector(self):
        """Build the sequence-labelling feature dict for this token."""

        def find_ngrams(input_list, n):
            # Sliding n-grams over a list (n=2 -> consecutive pairs).
            return zip(*[input_list[i:] for i in range(n)])

        # Stores feature dictionary
        feat_dict = {}

        #1. Create token spans

        # 5 token span (two tokens of context on either side)
        large_span = self.sent[max(0, self.sent_idx - 2):min(len(self.sent), self.sent_idx + 3)]

        # 3 token span (one token of context on either side)
        short_span = self.sent[max(0, self.sent_idx - 1):min(len(self.sent), self.sent_idx + 2)]

        # F.1-F.4: surface form, lemma, POS, entity type over the 5-span.
        for i, t in enumerate(large_span):
            feat_dict['F.1_{}'.format(i)] = t.text
            feat_dict['F.2_{}'.format(i)] = t.lemma_
            feat_dict['F.3_{}'.format(i)] = t.pos_
            feat_dict['F.4_{}'.format(i)] = t.ent_type_

        # F.5-F.6: lemma::POS and ent::POS conjunctions over the 3-span.
        for i, t in enumerate(short_span):
            feat_dict['F.5_{}'.format(i)] = "::".join([t.lemma_, t.pos_])
            feat_dict['F.6_{}'.format(i)] = "::".join([t.ent_type_, t.pos_])

        ngrams = find_ngrams([t.pos_ for t in large_span], 2) # POS bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.10_{}'.format(i)] = " ".join(ng)

        ngrams = find_ngrams([t.text for t in short_span], 2) # Raw-string bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.11_{}'.format(i)] = " ".join(ng)

        # Get dependency with head if it exists
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])

        # Get glove vector (one feature per embedding dimension)
        vector = self.token.vector
        for i in range(len(vector)):
            feat_dict['F.9_{}'.format(i)] = vector[i]


        return feat_dict
501 | |
class Character:
    """Named character entity with optional age and gender attributes."""

    def __init__(self, name, age='none', gender='none'):
        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<CHARACTER name='{}' age='{}' gender='{}'>".format(self.name, self.age, self.gender)

    def __eq__(self, other):
        # Names compare case-insensitively.
        mine = self.name.lower()
        theirs = other.name.lower()
        return mine == theirs
516 | |
class Place:
    """Named entity for a location, built from one or more tokens."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<PLACE name='{}'>".format(self.name)

    def __eq__(self, other):
        # Names compare case-insensitively.
        return self.name.lower() == other.name.lower()
527 | |
class Sayword:
    """Named entity for a speech-trigger word (e.g. 'said')."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SAYWORD name='{}'>".format(self.name)

    def __eq__(self, other):
        # Names compare case-insensitively.
        return self.name.lower() == other.name.lower()
538 | |
class CharacterLine:
    """Named entity for a line of character dialogue."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<CHARACTER_LINE name='{}'>".format(self.name)

    def __eq__(self, other):
        # Names compare case-insensitively.
        return self.name.lower() == other.name.lower()
549 | |
class SpatialSignal:
    """Named entity for a spatial-relation trigger (e.g. 'inside')."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SPATIAL_SIGNAL name='{}'>".format(self.name)

    def __eq__(self, other):
        # Names compare case-insensitively.
        return self.name.lower() == other.name.lower()
560 | |
561 | |
562 | |
563 |