comparison demo/ner.py @ 0:90155bdd5dd6

first commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 18:27:05 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:90155bdd5dd6
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 """
4 Created on Sun Apr 1 14:05:17 2018
5
6 @author: Emmanouil Theofanis Chourdakis
7 """
8
9 from pypeg2 import *
10 import re
11
def var_generator(T):
    """Yield an endless sequence of variable names: T1, T2, T3, ..."""
    counter = 1
    while True:
        yield "{}{}".format(T, counter)
        counter += 1
17
18
19
def l_label_generator(T):
    """Yield an endless sequence of line labels of the form <TLINEn>."""
    count = 1
    while True:
        yield "<" + T + "LINE" + str(count) + ">"
        count += 1
26
# Regexes for parsing brat-style annotation lines.
# All patterns are raw strings: the original "\;" was an invalid string
# escape sequence (DeprecationWarning, and a SyntaxWarning/SyntaxError on
# newer Python); raw strings keep the regex bytes identical.
annot_var = re.compile(r"[A-Z][0-9]+")                       # variable id, e.g. "T1"
annot_pos = re.compile(r"[0-9]+ [0-9]+(\;[0-9]+ [0-9]+)*")   # char offsets, ';'-separated spans
annot_label = re.compile(r'[A-Za-z0-9_]+')                   # bare label name
label_var_tuple = re.compile(r'[A-Za-z0-9_]+\:[A-Z][0-9]+')  # "label:VAR" pair
31
class AnnotationType(Keyword):
    """pyPEG keyword set: the allowed entity/event annotation type names."""
    grammar = Enum(K("Place"),
                   K("Character"),
                   K("Character_Line"),
                   K("Motion"),
                   K("Motion_Signal"),
                   K("Says"),
                   K("Spatial_Signal"))
40
class AttributeType(Keyword):
    """pyPEG keyword set: the allowed attribute type names."""
    grammar = Enum(K("Age"), K("Gender"))
43
class AnnotationTuple:
    """Grammar for an entity annotation line: variable, type, offsets, text."""
    grammar = attr('variable',annot_var),\
              attr('type',AnnotationType),\
              attr('idx',annot_pos),\
              attr('annotation',restline)
49
50
class AttributeTuple:
    """Grammar for an attribute line: variable, type, target variable, value."""
    grammar = attr('variable', annot_var),\
              attr('type',AttributeType), \
              attr('target', annot_var), \
              attr('annotation', restline)
56
class VarArg:
    """Grammar for one relation argument of the form 'label:VAR'."""
    grammar = attr('label', annot_label), ':', attr('target', annot_var)
59
class VarArgs(List):
    """One or more relation arguments."""
    grammar = some(VarArg)
62
class RelationTuple:
    """Grammar for a relation annotation line: variable plus its arguments."""
    grammar = attr('variable', annot_var),\
              attr('args', VarArgs)
66
class AnnotLine(List):
    """One annotation-file line: an entity, attribute, or relation tuple."""
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]
69
class AnnotationFile(List):
    """A whole annotation file: one or more annotation lines."""
    grammar = some(AnnotLine)
72
def get_tokens_by_label(label, sent_tokens, sent_labels):
    """Group a sentence's tokens into contiguous spans carrying *label*.

    sent_labels uses IOB tagging: 'B-<label>' starts a span,
    'I-<label>' continues it, and any other tag ends it.  Returns a
    list of spans, each span being a list of items from sent_tokens.
    """
    tokens = []
    blabel = "B-{}".format(label)
    ilabel = 'I-{}'.format(label)

    tok_ = []
    for n, l in enumerate(sent_labels):
        if l == blabel:
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = [sent_tokens[n]]
        elif l == ilabel:
            tok_.append(sent_tokens[n])
        else:
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = []
    # Bug fix: flush a span that runs to the end of the sentence; the
    # original dropped a labelled span occupying the final tokens.
    if len(tok_) > 0:
        tokens.append(tok_)
    return tokens
92
def get_token_head(span):
    """Return the head token of *span*.

    Starting at span[0], climb the dependency arcs while the current
    token's head still lies inside the span; stop at the sentence root
    (a token that is its own head) or at the first token whose head is
    outside the span.
    """
    indices = {tok.i for tok in span}
    current = span[0]
    while current.head.i in indices:
        if current == current.head:
            # Reached the root of the tree.
            return current
        current = current.head
    return current
103
104
def get_min_dep_path(a, b, doc, LCA):
    """Render the dependency path between doc[a] and doc[b].

    *LCA* is the lowest-common-ancestor matrix (as produced by spaCy's
    Doc.get_lca_matrix()); an entry of -1 means no common ancestor and
    is encoded as "<UND>".  The result is a '::'-joined sequence of
    'direction|dep' steps: upward from a to the ancestor, then downward
    to b.
    """
    ancestor_idx = LCA[a, b]
    if ancestor_idx == -1:
        return "<UND>"
    ancestor = doc[ancestor_idx]

    def climb(idx, direction):
        # Collect (direction, dep) steps from doc[idx] up to the ancestor.
        steps = []
        node = doc[idx]
        while node != ancestor:
            if node.head is not None:
                steps.append((direction, node.dep_))
            node = node.head
        return steps

    upward = climb(a, 'up')
    downward = climb(b, 'down')
    downward.reverse()

    return "::".join("|".join(step) for step in upward + downward)
def get_dep_with_head(tok):
    """Walk from *tok* to the sentence root.

    Returns (dep, root_lemma): *dep* is tok's dependency relation when
    tok hangs directly off the root (exactly one hop), otherwise None.
    """
    relations = []
    current = tok
    while current.head != current:
        relations.append(current.dep_)
        current = current.head

    dep = relations[0] if len(relations) == 1 else None
    return dep, current.lemma_
145
def var_generator(T):
    """Yield T1, T2, T3, ... without end.

    NOTE(review): duplicate of the var_generator defined earlier in
    this file; this later, identical definition is the one in effect.
    """
    n = 0
    while True:
        n += 1
        yield "%s%d" % (T, n)
151
def get_dep_with_head(tok):
    """Return (dep, root_lemma) for *tok*.

    *dep* is tok's dependency relation if tok is a direct child of the
    sentence root; otherwise None.  *root_lemma* is the root's lemma.

    NOTE(review): duplicate definition — shadows the identical function
    defined earlier in this file.
    """
    hops = []
    while tok.head != tok:
        hops.append(tok.dep_)
        tok = tok.head

    if len(hops) == 1:
        return hops[0], tok.lemma_
    return None, tok.lemma_
162
class Document:
    """Wraps a spaCy Doc with per-token IOB labels and relations.

    Tokens are wrapped in this module's Token class and grouped by
    sentence; gold relations are registered with add_relation and
    candidate relations enumerated from labelled spans.
    """

    def __init__(self, doc):
        self.doc = doc
        self.LCA = doc.get_lca_matrix()  # lowest-common-ancestor matrix
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]

        self.tokens = []            # all Token wrappers, document order
        self.token_sentences = []   # Token wrappers grouped per sentence

        self.relations = []         # gold (and later predicted) Relation objects

        for m, sent in enumerate(doc.sents):
            tlist = []
            for n, tok in enumerate(sent):
                token = Token(tok, doc, tok.i, sent, n)
                tlist.append(token)
            self.token_sentences.append(tlist)
            self.tokens += tlist

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """Wrap a raw token and append it to self.tokens."""
        token = Token(token, doc, doc_idx, sent, sent_idx, label)
        self.tokens.append(token)

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a gold relation between token spans."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return wrapped tokens fully inside the [start, end) character span."""
        tokens = []
        for tok in self.tokens:
            if tok.start >= start and tok.end <= end:
                tokens.append(tok)

        return tokens

    def assign_label_to_tokens(self, start, end, label):
        """IOB-label all tokens in the span: first gets 'B-', the rest 'I-'."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            if n == 0:
                IOB = 'B'
            else:
                IOB = 'I'

            token.set_label('{}-{}'.format(IOB, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Give a 'B-' label to every token whose lemma equals *lemma*."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Attach attribute *label* -> *attribute* to all tokens in the span."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """Return per-sentence (feature dicts, IOB labels) for sequence training."""
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                sentence_labels.append(token.label)

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_token_features_attributes(self, label):
        """Like get_token_features_labels, but the target class is the
        token attribute *label*; tokens lacking it get class 'O'."""
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                if label in token.attributes:
                    sentence_labels.append(token.attributes[label])
                else:
                    sentence_labels.append('O')

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_gold_relation_feature_labels(self):
        """Feature vectors and labels for the recorded gold relations."""
        features = []
        labels = []
        for r in self.relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels

    def get_candidate_relation_feature_labels(self):
        """Feature vectors and labels for all candidate relations."""
        features = []
        labels = []

        candidate_relations = self.get_candidate_relations()
        for r in candidate_relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels

    def get_tokens_with_label(self, label):
        """Per sentence, group tokens into contiguous 'B-'/'I-' spans of *label*.

        Returns one list of spans per sentence; each span is a list of
        Token wrappers.
        """
        blabel = "B-{}".format(label)
        ilabel = 'I-{}'.format(label)

        tokens = []

        for I in range(len(self.token_sentences)):
            tokens_ = []
            sent_tokens = self.token_sentences[I]
            sent_labels = [t.label for t in sent_tokens]

            tok_ = []
            for n, l in enumerate(sent_labels):
                if l == blabel:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = [sent_tokens[n]]
                elif l == ilabel:
                    tok_.append(sent_tokens[n])
                else:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = []
            # Bug fix: flush a span that reaches the end of the sentence;
            # previously such a trailing span was silently dropped.
            if len(tok_) > 0:
                tokens_.append(tok_)
            tokens.append(tokens_)

        return tokens

    def get_candidate_relations(self):
        """Enumerate candidate relations within each sentence.

        Candidates are Spatial_Signal x Character x Place and
        Says x Character x Character_Line triples.  A candidate matching
        a recorded gold relation inherits its label; others stay 'NONE'.
        """
        candidate_relations = []

        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        for I in range(len(spatial_signals)):
            for sp in spatial_signals[I]:
                for ch in characters[I]:
                    for pl in places[I]:
                        rel = Relation(ch, pl, sp, self.LCA)
                        candidate_relations.append(rel)

        for I in range(len(say_words)):
            for sw in say_words[I]:
                for ch in characters[I]:
                    for cl in character_lines[I]:
                        rel = Relation(ch, cl, sw, self.LCA)
                        candidate_relations.append(rel)

        for cr in candidate_relations:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidate_relations

    def predict_relations(self, model):
        """Classify candidate relations with *model*; keep non-'NONE' ones."""
        relations = self.get_candidate_relations()

        for n, r in enumerate(relations):
            f = r.get_feature_vector()
            label = model.predict([f])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text
353
class Relation:
    """ relation, has arg1, arg2, trigger as tokens, also label

    arg1/arg2/trigger are lists of this module's Token wrappers; *lca*
    is the document's lowest-common-ancestor matrix.
    """

    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        self.doc = trigger[0].doc
        self.LCA = lca
        self.label = label

    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(self.label, self.trigger, self.arg1, self.arg2)

    def __eq__(self, other):
        # NOTE(review): compares only the overlapping prefix of each span,
        # so spans of different lengths sharing a prefix compare equal.
        return all([self.arg1[n].text == other.arg1[n].text for n in range(min(len(self.arg1), len(other.arg1)))]) \
            and all([self.arg2[n].text == other.arg2[n].text for n in range(min(len(self.arg2), len(other.arg2)))]) \
            and all([self.trigger[n].text == other.trigger[n].text for n in range(min(len(self.trigger), len(other.trigger)))])

    def get_feature_vector(self):
        """Build the feature dict consumed by the relation classifier."""
        rf = {}

        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')

        rf['10'] = arg1_type + '::' + arg2_type

        # Bug fix: the directions were left unassigned (NameError) when the
        # trigger shared a token index with an argument; default to 'same'.
        arg1_direction = 'same'
        arg2_direction = 'same'
        if trigger.i < arg1.i:
            arg1_direction = 'right'
        if trigger.i > arg1.i:
            arg1_direction = 'left'

        if trigger.i < arg2.i:
            arg2_direction = 'right'
        if trigger.i > arg2.i:
            arg2_direction = 'left'

        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction + '::' + arg2_direction

        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']
        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']

        # RF15

        for i, (token, token_type) in enumerate([(arg1, arg1_type), (arg2, arg2_type)]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # Bug fix: feature 9 previously used arg1's type for both args.
            rf['9.{}'.format(i)] = token_type
            rf['17.{}'.format(i)] = get_min_dep_path(token.i, trigger.i, self.doc, self.LCA)
            # NOTE(review): '20' is overwritten on each pass, so it ends up
            # holding arg2's path length only — kept as-is for compatibility.
            rf['20'] = len(rf['17.{}'.format(i)].split('::'))
            # Bug fix: the distance feature previously measured arg1 on
            # both passes; use the current argument's index.
            rf['22.{}'.format(i)] = max(token.i, trigger.i) - min(token.i, trigger.i)

        rf['18'] = rf['17.0'] + '::' + rf['17.1']

        deppath = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['19'] = deppath
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf
428
class Token:
    """ Named entity, has doc, sent, doc_idx, sent_idx, and label """

    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        # token: the underlying spaCy token; doc/sent: containing document
        # and sentence; doc_idx/sent_idx: position within each.
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.attributes = {}  # attribute label -> value (e.g. 'Age', 'Gender')

        self.label = label  # IOB entity label; 'O' means outside any entity
        self.start = self.token.idx  # character offset of token start in doc text
        self.end = self.token.idx + len(self.token)  # offset just past token end

    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)

    def set_label(self, label):
        # Replace this token's IOB label.
        # print("Token {} label changed to {}".format(self.text, label))
        self.label = label

    def set_attribute(self, label, value):
        # Set attribute *label* to *value* for this token.
        self.attributes[label] = value

    def get_feature_vector(self):
        # Build the per-token feature dict used by the sequence labeller.

        def find_ngrams(input_list, n):
            # Sliding n-grams over a list, as an iterator of tuples.
            return zip(*[input_list[i:] for i in range(n)])

        # Stores feature dictionary
        feat_dict = {}

        #1. Create token spans

        # 5 token span
        large_span = self.sent[max(0, self.sent_idx - 2):min(len(self.sent), self.sent_idx + 3)]

        # 3 token span
        short_span = self.sent[max(0, self.sent_idx - 1):min(len(self.sent), self.sent_idx + 2)]

        # Surface form, lemma, POS and entity type of each token in the window
        for i, t in enumerate(large_span):
            feat_dict['F.1_{}'.format(i)] = t.text
            feat_dict['F.2_{}'.format(i)] = t.lemma_
            feat_dict['F.3_{}'.format(i)] = t.pos_
            feat_dict['F.4_{}'.format(i)] = t.ent_type_

        # Combined lemma::POS and ent_type::POS features on the short window
        for i, t in enumerate(short_span):
            feat_dict['F.5_{}'.format(i)] = "::".join([t.lemma_, t.pos_])
            feat_dict['F.6_{}'.format(i)] = "::".join([t.ent_type_, t.pos_])

        ngrams = find_ngrams([t.pos_ for t in large_span], 2) # POS bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.10_{}'.format(i)] = " ".join(ng)

        ngrams = find_ngrams([t.text for t in short_span], 2) # Raw-string bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.11_{}'.format(i)] = " ".join(ng)

        # Get dependency with head if it exists
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])

        # Get glove vector
        # NOTE(review): presumably the pretrained word vector of the spaCy
        # model in use — one feature per dimension.
        vector = self.token.vector
        for i in range(len(vector)):
            feat_dict['F.9_{}'.format(i)] = vector[i]

        return feat_dict
501
class Character:
    """A character entity (one or more tokens) with optional age and gender."""

    def __init__(self, name, age='none', gender='none'):
        self.name, self.age, self.gender = name, age, gender

    def __repr__(self):
        return "<CHARACTER name='{0.name}' age='{0.age}' gender='{0.gender}'>".format(self)

    def __eq__(self, other):
        # Case-insensitive comparison by name only.
        return self.name.lower() == other.name.lower()
516
class Place:
    """A place entity (one or more tokens), compared case-insensitively by name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<PLACE name='%s'>" % self.name

    def __eq__(self, other):
        lhs, rhs = self.name.lower(), other.name.lower()
        return lhs == rhs
527
class Sayword:
    """A speech-verb entity, compared case-insensitively by name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SAYWORD name='" + self.name + "'>"

    def __eq__(self, other):
        return other.name.lower() == self.name.lower()
538
class CharacterLine:
    """A spoken-line entity, compared case-insensitively by name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<CHARACTER_LINE name='{}'>".format(self.name)

    def __eq__(self, other):
        mine, theirs = self.name, other.name
        return mine.lower() == theirs.lower()
549
class SpatialSignal:
    """A spatial-signal entity, compared case-insensitively by name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SPATIAL_SIGNAL name='{}'>".format(self.name)

    def __eq__(self, other):
        folded = self.name.lower()
        return folded == other.name.lower()
560
561
562
563