annotate demo/ner.py @ 13:16066f0a7127 tip

fixed the problem with brat
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 08 Dec 2018 11:02:40 +0000
parents 90155bdd5dd6
children
rev   line source
e@0 1 #!/usr/bin/env python3
e@0 2 # -*- coding: utf-8 -*-
e@0 3 """
e@0 4 Created on Sun Apr 1 14:05:17 2018
e@0 5
e@0 6 @author: Emmanouil Theofanis Chourdakis
e@0 7 """
e@0 8
e@0 9 from pypeg2 import *
e@0 10 import re
e@0 11
def var_generator(T):
    """Yield an endless stream of brat-style variable names: T1, T2, T3, ...."""
    counter = 1
    while True:
        yield "{}{}".format(T, counter)
        counter += 1
e@0 17
e@0 18
e@0 19
def l_label_generator(T):
    """Yield an endless stream of line labels: <TLINE1>, <TLINE2>, ...."""
    counter = 1
    while True:
        yield "<{}LINE{}>".format(T, counter)
        counter += 1
e@0 26
# Regexes for parsing brat-style .ann annotation lines.
# FIX: use raw strings throughout — "\;" in a plain string is an invalid
# escape sequence (SyntaxWarning on modern Python); the regexes themselves
# are unchanged (";" needs no escaping in a pattern).
annot_var = re.compile(r"[A-Z][0-9]+")                       # e.g. "T1", "R12"
annot_pos = re.compile(r"[0-9]+ [0-9]+(;[0-9]+ [0-9]+)*")    # e.g. "0 5;7 12"
annot_label = re.compile(r"[A-Za-z0-9_]+")                   # bare label name
label_var_tuple = re.compile(r"[A-Za-z0-9_]+\:[A-Z][0-9]+")  # e.g. "Arg1:T3"
e@0 31
class AnnotationType(Keyword):
    """pypeg2 keyword: the entity/annotation types recognised in .ann files."""
    grammar = Enum(K("Place"),
                   K("Character"),
                   K("Character_Line"),
                   K("Motion"),
                   K("Motion_Signal"),
                   K("Says"),
                   K("Spatial_Signal"))
e@0 40
class AttributeType(Keyword):
    """pypeg2 keyword: attribute types that can be attached to an entity."""
    grammar = Enum(K("Age"), K("Gender"))
e@0 43
class AnnotationTuple:
    # Grammar for one brat entity line: variable (e.g. "T1"), its type,
    # the character offsets, then the covered text (rest of the line).
    grammar = attr('variable',annot_var),\
              attr('type',AnnotationType),\
              attr('idx',annot_pos),\
              attr('annotation',restline)
e@0 49
e@0 50
class AttributeTuple:
    # Grammar for one brat attribute line: variable (e.g. "A1"), the
    # attribute type, the entity variable it applies to, then its value.
    grammar = attr('variable', annot_var),\
              attr('type',AttributeType), \
              attr('target', annot_var), \
              attr('annotation', restline)
e@0 56
class VarArg:
    # One relation argument written as "label:var", e.g. "Arg1:T3".
    grammar = attr('label', annot_label), ':', attr('target', annot_var)
e@0 59
class VarArgs(List):
    # One or more VarArg items (a relation's argument list).
    grammar = some(VarArg)
e@0 62
class RelationTuple:
    # Grammar for one brat relation line: variable (e.g. "R1") followed by
    # its "label:var" argument list.
    grammar = attr('variable', annot_var),\
              attr('args', VarArgs)
e@0 66
class AnnotLine(List):
    # A single .ann line: an entity, an attribute, or a relation.
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]
e@0 69
class AnnotationFile(List):
    # A whole .ann file: one or more annotation lines.
    grammar = some(AnnotLine)
e@0 72
def get_tokens_by_label(label, sent_tokens, sent_labels):
    """Group one sentence's tokens into entity spans for *label*.

    Scans the parallel lists of tokens and IOB tags and returns one token
    group per contiguous "B-label" / "I-label" run.

    Parameters:
        label: bare entity label, e.g. "Character" (no IOB prefix).
        sent_tokens: the sentence's tokens.
        sent_labels: IOB tag strings aligned with sent_tokens.

    Returns:
        list of lists of tokens, one inner list per entity occurrence.
    """
    tokens = []
    blabel = "B-{}".format(label)
    ilabel = 'I-{}'.format(label)

    tok_ = []
    for n, l in enumerate(sent_labels):
        if l == blabel:
            # A new entity starts; flush any entity still in progress.
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = [sent_tokens[n]]
        elif l == ilabel:
            tok_.append(sent_tokens[n])
        else:
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = []

    # BUG FIX: an entity that runs to the end of the sentence was previously
    # dropped because the trailing group was never flushed.
    if len(tok_) > 0:
        tokens.append(tok_)

    return tokens
e@0 92
def get_token_head(span):
    """Return the token inside *span* that dominates the rest of the span.

    Starting from the span's first token, follow dependency heads upward
    until the next head would fall outside the span (or the sentence root,
    which is its own head, is reached).
    """
    indices = {tok.i for tok in span}
    current = span[0]
    while current.head.i in indices:
        # The root is its own head; stop here to avoid looping forever.
        if current == current.head:
            return current
        current = current.head
    return current
e@0 103
e@0 104
def get_min_dep_path(a, b, doc, LCA):
    """Return the shortest dependency path between tokens *a* and *b*.

    Parameters:
        a, b: integer token indices into *doc*.
        doc: the parsed document (indexable by token index).
        LCA: lowest-common-ancestor matrix, indexable as LCA[a, b];
             -1 means the tokens share no ancestor.

    Returns:
        a string such as "up|nsubj::down|dobj", or "<UND>" when the tokens
        have no common ancestor.
    """
    lca_idx = LCA[a, b]

    if lca_idx == -1:
        return "<UND>"

    # Reuse the index we already looked up instead of indexing LCA twice.
    lca = doc[lca_idx]

    m_a = []
    m_b = []

    # Climb from token a up to the LCA, recording each arc as an "up" step.
    tok = doc[a]
    while tok != lca:
        if tok.head is not None:  # FIX: compare to None with "is" (PEP 8)
            m_a.append(('up', tok.dep_))
            tok = tok.head

    # Climb from token b up to the LCA; these arcs are traversed downward
    # when walking a -> lca -> b, so reverse them afterwards.
    tok = doc[b]
    while tok != lca:
        if tok.head is not None:
            m_b.append(('down', tok.dep_))
            tok = tok.head
    m_b.reverse()

    path = m_a + m_b

    return "::".join("{}|{}".format(tup[0], tup[1]) for tup in path)
def get_dep_with_head(tok):
    """Climb from *tok* to the sentence root.

    Returns (dep, lemma): *dep* is tok's dependency label when tok hangs
    directly off the root (path of length one), otherwise None; *lemma* is
    always the root token's lemma.
    """
    path = []
    current = tok
    while current.head != current:
        path.append(current.dep_)
        current = current.head

    dep = path[0] if len(path) == 1 else None
    return dep, current.lemma_
e@0 145
# NOTE(review): exact duplicate of var_generator defined earlier in this
# file; this second definition silently shadows the first. Consider removing.
def var_generator(T):
    """Yield an endless stream of variable names: T1, T2, T3, ...."""
    I = 0
    while True:
        I+=1
        yield "{}{}".format(T, I)
e@0 151
# NOTE(review): exact duplicate of get_dep_with_head defined earlier in this
# file; this second definition silently shadows the first. Consider removing.
def get_dep_with_head(tok):
    """Climb to the root from *tok*; return (dep-if-depth-one, root lemma)."""
    dep_ = []
    while tok.head != tok:
        dep_.append(tok.dep_)
        tok = tok.head

    if len(dep_) == 1:
        return dep_[0], tok.lemma_
    else:
        return None, tok.lemma_
e@0 162
class Document:
    """Wraps a spaCy doc with the token/relation bookkeeping used for NER.

    Holds a Token wrapper for every spaCy token (grouped per sentence),
    gold/predicted Relations, and the doc's lowest-common-ancestor matrix
    used for dependency-path features.
    """

    def __init__(self, doc):
        self.doc = doc
        self.LCA = doc.get_lca_matrix()
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]

        self.tokens = []           # flat list of Token wrappers
        self.token_sentences = []  # Token wrappers grouped by sentence

        self.relations = []

        for m, sent in enumerate(doc.sents):
            tlist = []
            for n, tok in enumerate(sent):
                token = Token(tok, doc, tok.i, sent, n)
                tlist.append(token)
            self.token_sentences.append(tlist)
            self.tokens += tlist

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """Wrap *token* in a Token and append it to the flat token list."""
        token = Token(token, doc, doc_idx, sent, sent_idx, label)
        self.tokens.append(token)

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a gold relation between already-extracted token spans."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return all tokens whose character span lies within [start, end]."""
        tokens = []
        for tok in self.tokens:
            if tok.start >= start and tok.end <= end:
                tokens.append(tok)

        return tokens

    def assign_label_to_tokens(self, start, end, label):
        """IOB-label every token inside the [start, end] character span."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            if n == 0:
                IOB = 'B'
            else:
                IOB = 'I'

            token.set_label('{}-{}'.format(IOB, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Give a B- label to every token whose lemma equals *lemma*."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Attach attribute *label* -> *attribute* to tokens in the span."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """Return per-sentence feature dicts and IOB labels (CRF training)."""
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                sentence_labels.append(token.label)

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_token_features_attributes(self, label):
        """Like get_token_features_labels, but targets attribute *label*.

        Tokens without that attribute get the 'O' (outside) label.
        """
        features = []
        labels = []

        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []

            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                if label in token.attributes:
                    sentence_labels.append(token.attributes[label])
                else:
                    sentence_labels.append('O')

            features.append(sentence_features)
            labels.append(sentence_labels)

        return features, labels

    def get_gold_relation_feature_labels(self):
        """Return (features, labels) for the gold relations only."""
        features = []
        labels = []
        for r in self.relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels

    def get_candidate_relation_feature_labels(self):
        """Return (features, labels) for every candidate relation."""
        features = []
        labels = []

        candidate_relations = self.get_candidate_relations()
        for r in candidate_relations:
            feat = r.get_feature_vector()
            label = r.label

            features.append(feat)
            labels.append(label)

        return features, labels

    def get_tokens_with_label(self, label):
        """Group IOB-labelled tokens into entity spans, per sentence.

        Returns a list with one element per sentence; each element is a
        list of token groups (one group per entity occurrence).
        """
        blabel = "B-{}".format(label)
        ilabel = 'I-{}'.format(label)

        tokens = []

        for I in range(len(self.token_sentences)):
            tokens_ = []
            sent_tokens = self.token_sentences[I]
            sent_labels = [t.label for t in sent_tokens]

            tok_ = []
            for n, l in enumerate(sent_labels):
                if l == blabel:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = [sent_tokens[n]]
                elif l == ilabel:
                    tok_.append(sent_tokens[n])
                else:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = []

            # BUG FIX: flush an entity that runs to the end of the sentence;
            # it was previously dropped (same defect as get_tokens_by_label).
            if len(tok_) > 0:
                tokens_.append(tok_)

            tokens.append(tokens_)

        return tokens

    def get_candidate_relations(self):
        """Enumerate candidate relations sentence by sentence.

        Pairs every Spatial_Signal with each (Character, Place) pair and
        every Says trigger with each (Character, Character_Line) pair in
        the same sentence; candidates that match a gold relation inherit
        its label.
        """
        candidate_relations = []

        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        for I in range(len(spatial_signals)):
            for sp in spatial_signals[I]:
                for ch in characters[I]:
                    for pl in places[I]:
                        rel = Relation(ch, pl, sp, self.LCA)
                        candidate_relations.append(rel)

        for I in range(len(say_words)):
            for sw in say_words[I]:
                for ch in characters[I]:
                    for cl in character_lines[I]:
                        rel = Relation(ch, cl, sw, self.LCA)
                        candidate_relations.append(rel)

        for cr in candidate_relations:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidate_relations

    def predict_relations(self, model):
        """Classify candidate relations with *model*; keep non-NONE ones."""
        relations = self.get_candidate_relations()

        for n, r in enumerate(relations):
            f = r.get_feature_vector()
            label = model.predict([f])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text
e@0 353
class Relation:
    """A (trigger, arg1, arg2) relation between token spans, plus a label.

    arg1, arg2 and trigger are lists of Token wrappers; *lca* is the doc's
    lowest-common-ancestor matrix used for dependency-path features.
    """

    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        self.doc = trigger[0].doc
        self.LCA = lca
        self.label = label

    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(self.label, self.trigger, self.arg1, self.arg2)

    def __eq__(self, other):
        # NOTE: compares only up to the shorter of each span, so a span that
        # is a prefix of another still compares equal.
        return all([self.arg1[n].text == other.arg1[n].text for n in range(min(len(self.arg1), len(other.arg1)))]) \
            and all([self.arg2[n].text == other.arg2[n].text for n in range(min(len(self.arg2), len(other.arg2)))]) \
            and all([self.trigger[n].text == other.trigger[n].text for n in range(min(len(self.trigger), len(other.trigger)))])

    def get_feature_vector(self):
        """Build the feature dict consumed by the relation classifier.

        Keys follow the numbering of the feature set this model was built
        from: trigger/argument lemmas, POS, entity types, left/right
        directions, dependency paths and token distances.
        """
        rf = {}

        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')

        rf['10'] = arg1_type + '::' + arg2_type

        # BUG FIX: these were previously left unbound (NameError) when the
        # trigger head coincided with an argument head; equality now maps
        # to 'left'.
        arg1_direction = 'right' if trigger.i < arg1.i else 'left'
        arg2_direction = 'right' if trigger.i < arg2.i else 'left'

        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction + '::' + arg2_direction

        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']
        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']

        # RF15

        for i, (token, token_type) in enumerate([(arg1, arg1_type), (arg2, arg2_type)]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # BUG FIX: '9.1' used to repeat arg1's type instead of arg2's.
            rf['9.{}'.format(i)] = token_type
            rf['17.{}'.format(i)] = get_min_dep_path(token.i, trigger.i, self.doc, self.LCA)
            rf['20'] = len(rf['17.{}'.format(i)].split('::'))

            # BUG FIX: the distance feature always measured arg1; use the
            # current argument so '22.1' reflects arg2's distance.
            rf['22.{}'.format(i)] = max(token.i, trigger.i) - min(token.i, trigger.i)

        rf['18'] = rf['17.0'] + '::' + rf['17.1']

        deppath = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['19'] = deppath
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf
e@0 428
class Token:
    """A single spaCy token plus its IOB label and attribute annotations."""

    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.attributes = {}

        self.label = label
        self.start = self.token.idx
        self.end = self.token.idx + len(self.token)

    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)

    def set_label(self, label):
        """Overwrite this token's IOB label."""
        self.label = label

    def set_attribute(self, label, value):
        """Attach (or replace) attribute *label* with *value*."""
        self.attributes[label] = value

    def get_feature_vector(self):
        """Build the per-token feature dict for the sequence labeller."""

        def find_ngrams(input_list, n):
            return zip(*[input_list[i:] for i in range(n)])

        feat_dict = {}

        # Context windows centred on this token: five tokens and three wide.
        wide_lo = max(0, self.sent_idx - 2)
        wide_hi = min(len(self.sent), self.sent_idx + 3)
        large_span = self.sent[wide_lo:wide_hi]

        narrow_lo = max(0, self.sent_idx - 1)
        narrow_hi = min(len(self.sent), self.sent_idx + 2)
        short_span = self.sent[narrow_lo:narrow_hi]

        # Surface form, lemma, POS and entity type over the wide window.
        for pos, ctx in enumerate(large_span):
            feat_dict['F.1_{}'.format(pos)] = ctx.text
            feat_dict['F.2_{}'.format(pos)] = ctx.lemma_
            feat_dict['F.3_{}'.format(pos)] = ctx.pos_
            feat_dict['F.4_{}'.format(pos)] = ctx.ent_type_

        # Combined lemma::POS and entity::POS features, narrow window.
        for pos, ctx in enumerate(short_span):
            feat_dict['F.5_{}'.format(pos)] = "::".join([ctx.lemma_, ctx.pos_])
            feat_dict['F.6_{}'.format(pos)] = "::".join([ctx.ent_type_, ctx.pos_])

        # POS bigrams over the wide window.
        for pos, ng in enumerate(find_ngrams([ctx.pos_ for ctx in large_span], 2)):
            feat_dict['F.10_{}'.format(pos)] = " ".join(ng)

        # Raw-string bigrams over the narrow window.
        for pos, ng in enumerate(find_ngrams([ctx.text for ctx in short_span], 2)):
            feat_dict['F.11_{}'.format(pos)] = " ".join(ng)

        # Dependency relation to the head, when the token hangs off the root.
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])

        # Word-embedding (GloVe) components, one feature per dimension.
        vector = self.token.vector
        for pos in range(len(vector)):
            feat_dict['F.9_{}'.format(pos)] = vector[pos]

        return feat_dict
e@0 501
class Character:
    """A character entity, optionally carrying age and gender attributes."""

    def __init__(self, name, age='none', gender='none'):
        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<CHARACTER name='{}' age='{}' gender='{}'>".format(
            self.name, self.age, self.gender)

    def __eq__(self, other):
        # Two characters are the same entity if their names match,
        # ignoring case.
        mine, theirs = self.name.lower(), other.name.lower()
        return mine == theirs
e@0 516
class Place:
    """A place entity identified by a case-insensitive name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<PLACE name='{}'>".format(self.name)

    def __eq__(self, other):
        # Same place iff the names match, ignoring case.
        mine, theirs = self.name.lower(), other.name.lower()
        return mine == theirs
e@0 527
class Sayword:
    """A speech-trigger word (e.g. "said"), identified case-insensitively."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SAYWORD name='{}'>".format(self.name)

    def __eq__(self, other):
        # Same say-word iff the names match, ignoring case.
        mine, theirs = self.name.lower(), other.name.lower()
        return mine == theirs
e@0 538
e@0 539 class CharacterLine:
e@0 540 """ Named Entity consisting of one or more tokens """
e@0 541 def __init__(self, name):
e@0 542 self.name = name
e@0 543
e@0 544 def __repr__(self):
e@0 545 return "<CHARACTER_LINE name='{}'>".format(self.name)
e@0 546
e@0 547 def __eq__(self, other):
e@0 548 return self.name.lower() == other.name.lower()
e@0 549
e@0 550 class SpatialSignal:
e@0 551 """ Named Entity consisting of one or more tokens """
e@0 552 def __init__(self, name):
e@0 553 self.name = name
e@0 554
e@0 555 def __repr__(self):
e@0 556 return "<SPATIAL_SIGNAL name='{}'>".format(self.name)
e@0 557
e@0 558 def __eq__(self, other):
e@0 559 return self.name.lower() == other.name.lower()
e@0 560
e@0 561
e@0 562
e@0 563