Mercurial > hg > chourdakisreiss2018smc
view demo/ner.py @ 13:16066f0a7127 tip
fixed the problem with brat
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Sat, 08 Dec 2018 11:02:40 +0000 |
parents | 90155bdd5dd6 |
children |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 1 14:05:17 2018

@author: Emmanouil Theofanis Chourdakis

Annotation-line grammar (pypeg2) together with token / relation feature
extraction helpers built on top of a parsed document.

The parsed document and its tokens are accessed through attributes such as
``.i``, ``.idx``, ``.head``, ``.dep_``, ``.lemma_``, ``.pos_``, ``.ent_type_``,
``.vector``, ``.sents`` and ``.get_lca_matrix()`` -- presumably spaCy objects;
confirm against the callers.
"""

from pypeg2 import *
import re


def var_generator(T):
    """Yield "<T>1", "<T>2", ... forever (prefix ``T`` followed by a counter)."""
    counter = 0
    while True:
        counter += 1
        yield "{}{}".format(T, counter)


def l_label_generator(T):
    """Yield "<{T}LINE1>", "<{T}LINE2>", ... forever."""
    counter = 0
    while True:
        counter += 1
        yield "<{}LINE{}>".format(T, counter)


# Building blocks of one annotation line.
# NOTE(review): the layout looks like brat standoff annotations -- confirm.
annot_var = re.compile("[A-Z][0-9]+")
# Raw string: "\;" is an invalid escape sequence in a normal string literal.
annot_pos = re.compile(r"[0-9]+ [0-9]+(\;[0-9]+ [0-9]+)*")
annot_label = re.compile('[A-Za-z0-9_]+')
label_var_tuple = re.compile(r'[A-Za-z0-9_]+\:[A-Z][0-9]+')


class AnnotationType(Keyword):
    """Keyword naming the type of a text-bound annotation."""
    grammar = Enum(K("Place"), K("Character"), K("Character_Line"),
                   K("Motion"), K("Motion_Signal"), K("Says"),
                   K("Spatial_Signal"))


class AttributeType(Keyword):
    """Keyword naming the type of an attribute annotation."""
    grammar = Enum(K("Age"), K("Gender"))


class AnnotationTuple:
    """Line of the form: variable, annotation type, position(s), rest of line."""
    grammar = attr('variable', annot_var),\
              attr('type', AnnotationType),\
              attr('idx', annot_pos),\
              attr('annotation', restline)


class AttributeTuple:
    """Line of the form: variable, attribute type, target variable, rest of line."""
    grammar = attr('variable', annot_var),\
              attr('type', AttributeType), \
              attr('target', annot_var), \
              attr('annotation', restline)


class VarArg:
    """A single ``label:target`` argument of a relation line."""
    grammar = attr('label', annot_label), ':', attr('target', annot_var)


class VarArgs(List):
    """One or more ``VarArg`` items."""
    grammar = some(VarArg)


class RelationTuple:
    """Line of the form: variable followed by its ``label:target`` arguments."""
    grammar = attr('variable', annot_var),\
              attr('args', VarArgs)


class AnnotLine(List):
    """Any one of the three supported annotation line kinds."""
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]


class AnnotationFile(List):
    """One or more annotation lines."""
    grammar = some(AnnotLine)


def get_tokens_by_label(label, sent_tokens, sent_labels):
    """
    Group the tokens of one sentence into spans tagged with `label`.

    A span starts at a ``B-<label>`` tag and is extended by every following
    ``I-<label>`` tag; any other tag closes the current span. An
    ``I-<label>`` tag with no open span starts a new one.

    :param label: label name without the ``B-`` / ``I-`` prefix
    :param sent_tokens: tokens of the sentence, indexable in step with
                        `sent_labels`
    :param sent_labels: one tag per token
    :return: list of spans, each span being a list of tokens
    """
    begin_tag = "B-{}".format(label)
    inside_tag = 'I-{}'.format(label)

    spans = []
    current = []
    for n, tag in enumerate(sent_labels):
        if tag == begin_tag:
            if current:
                spans.append(current)
            current = [sent_tokens[n]]
        elif tag == inside_tag:
            current.append(sent_tokens[n])
        else:
            if current:
                spans.append(current)
            current = []

    # A span that runs up to the last token is still open here; without this
    # flush an entity at the very end of a sentence was silently dropped.
    if current:
        spans.append(current)
    return spans


def get_token_head(span):
    """
    Return the head token of `span`.

    Starting from the first token, follow ``.head`` links for as long as the
    head is itself part of the span (membership tested on ``.i``); a token
    that is its own head stops the walk.

    :param span: non-empty sequence of tokens exposing ``.i`` and ``.head``
    """
    span_idx = {tok.i for tok in span}
    head = span[0]
    while head.head.i in span_idx:
        if head == head.head:
            return head
        head = head.head
    return head


def get_min_dep_path(a, b, doc, LCA):
    """
    Describe the dependency path from token `a` to token `b`.

    The path climbs from `a` to the lowest common ancestor given by
    ``LCA[a, b]`` (steps tagged ``up``) and then descends to `b` (steps
    tagged ``down``).

    :param a: index of the first token in `doc`
    :param b: index of the second token in `doc`
    :param doc: indexable document whose tokens expose ``.head`` and ``.dep_``
    :param LCA: matrix indexed as ``LCA[a, b]``; -1 means no common ancestor
    :return: ``"<UND>"`` when there is no common ancestor, otherwise the steps
             rendered as ``direction|dep`` and joined with ``::``
    """
    lca_idx = LCA[a, b]
    if lca_idx == -1:
        return "<UND>"

    lca = doc[lca_idx]

    # From a go up to the LCA.
    up_steps = []
    tok = doc[a]
    while tok != lca:
        if tok.head is not None:
            up_steps.append(('up', tok.dep_))
        tok = tok.head

    # From b go up to the LCA as well, then flip so the steps read downwards.
    down_steps = []
    tok = doc[b]
    while tok != lca:
        if tok.head is not None:
            down_steps.append(('down', tok.dep_))
        tok = tok.head
    down_steps.reverse()

    return "::".join("{}|{}".format(direction, dep)
                     for direction, dep in up_steps + down_steps)


def get_dep_with_head(tok):
    """
    Walk from `tok` to the token that is its own head and report on it.

    :return: ``(dep, lemma)`` where ``lemma`` is the ``lemma_`` of the token
             reached and ``dep`` is `tok`'s ``dep_`` when exactly one step was
             needed to get there, else ``None``
    """
    deps = []
    while tok.head != tok:
        deps.append(tok.dep_)
        tok = tok.head

    if len(deps) == 1:
        return deps[0], tok.lemma_
    return None, tok.lemma_


def _spans_match(left, right):
    """True when the two token spans have equal ``.text`` over their common prefix."""
    return all(x.text == y.text for x, y in zip(left, right))


def _direction(trigger_idx, arg_idx):
    """Side of the trigger on which an argument lies: 'right', 'left' or 'same'."""
    if trigger_idx < arg_idx:
        return 'right'
    if trigger_idx > arg_idx:
        return 'left'
    return 'same'


class Document:
    """
    Wraps a parsed document and keeps per-token labels, attributes and the
    relations between labelled token spans.
    """

    def __init__(self, doc):
        """
        :param doc: parsed document providing ``get_lca_matrix()``, ``text``
                    and ``sents`` (presumably a spaCy ``Doc`` -- confirm)
        """
        self.doc = doc
        self.LCA = doc.get_lca_matrix()
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]
        self.tokens = []            # every Token, in document order
        self.token_sentences = []   # the same Tokens grouped per sentence
        self.relations = []

        for sent in doc.sents:
            sent_tokens = [Token(tok, doc, tok.i, sent, n)
                           for n, tok in enumerate(sent)]
            self.token_sentences.append(sent_tokens)
            self.tokens.extend(sent_tokens)

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """
        Wrap `token` in a ``Token`` and append it to ``self.tokens``.

        NOTE(review): the token is not added to ``self.token_sentences`` and
        the default label here ('NONE') differs from ``Token``'s own ('O').
        """
        self.tokens.append(Token(token, doc, doc_idx, sent, sent_idx, label))

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a labelled relation between the three token spans."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return the tokens whose ``[start, end]`` offsets lie inside the given range."""
        return [tok for tok in self.tokens
                if tok.start >= start and tok.end <= end]

    def assign_label_to_tokens(self, start, end, label):
        """Tag the tokens inside ``[start, end]``: ``B-<label>`` first, ``I-<label>`` after."""
        for n, token in enumerate(self.find_tokens(start, end)):
            prefix = 'B' if n == 0 else 'I'
            token.set_label('{}-{}'.format(prefix, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Tag every token whose lemma equals `lemma` with ``B-<label>``."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Set attribute `label` to `attribute` on every token inside ``[start, end]``."""
        for token in self.find_tokens(start, end):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """
        :return: ``(features, labels)``; both are lists with one entry per
                 sentence, holding the feature dict and the label of each token
        """
        features = [[token.get_feature_vector() for token in sentence]
                    for sentence in self.token_sentences]
        labels = [[token.label for token in sentence]
                  for sentence in self.token_sentences]
        return features, labels

    def get_token_features_attributes(self, label):
        """
        Like ``get_token_features_labels`` but the targets are the values of
        attribute `label`, with 'O' for tokens that do not carry it.
        """
        features = [[token.get_feature_vector() for token in sentence]
                    for sentence in self.token_sentences]
        labels = [[token.attributes.get(label, 'O') for token in sentence]
                  for sentence in self.token_sentences]
        return features, labels

    def get_gold_relation_feature_labels(self):
        """:return: ``(features, labels)`` for the relations stored on the document"""
        features = [r.get_feature_vector() for r in self.relations]
        labels = [r.label for r in self.relations]
        return features, labels

    def get_candidate_relation_feature_labels(self):
        """:return: ``(features, labels)`` for every candidate relation"""
        candidates = self.get_candidate_relations()
        features = [r.get_feature_vector() for r in candidates]
        labels = [r.label for r in candidates]
        return features, labels

    def get_tokens_with_label(self, label):
        """
        :return: one list per sentence, each holding the token spans tagged
                 with `label` in that sentence (see ``get_tokens_by_label``)
        """
        return [
            get_tokens_by_label(label, sentence, [t.label for t in sentence])
            for sentence in self.token_sentences
        ]

    def get_candidate_relations(self):
        """
        Build every within-sentence candidate relation.

        Candidates are (Character, Place, Spatial_Signal) and
        (Character, Character_Line, Says) combinations. A candidate that
        compares equal to a stored relation takes over that relation's label;
        all others keep the default label.
        """
        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        candidate_relations = []

        for signals, chars, locs in zip(spatial_signals, characters, places):
            for sp in signals:
                for ch in chars:
                    for pl in locs:
                        candidate_relations.append(
                            Relation(ch, pl, sp, self.LCA))

        for says, chars, lines in zip(say_words, characters, character_lines):
            for sw in says:
                for ch in chars:
                    for cl in lines:
                        candidate_relations.append(
                            Relation(ch, cl, sw, self.LCA))

        # When several stored relations match, the last one decides the label.
        for cr in candidate_relations:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidate_relations

    def predict_relations(self, model):
        """
        Label the candidate relations with `model` and store the positives.

        :param model: object whose ``predict([feature_dict])`` returns a
                      sequence with one label; every candidate predicted as
                      something other than 'NONE' is appended to
                      ``self.relations``
        """
        for r in self.get_candidate_relations():
            label = model.predict([r.get_feature_vector()])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text


class Relation:
    """ relation, has arg1, arg2, trigger as tokens, also label

    ``arg1``, ``arg2`` and ``trigger`` are non-empty sequences of ``Token``
    wrappers; ``lca`` is the matrix handed to ``get_min_dep_path``.
    """

    # __eq__ is defined without a matching hash, so instances are unhashable.
    __hash__ = None

    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        self.doc = trigger[0].doc
        self.LCA = lca
        self.label = label

    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(
            self.label, self.trigger, self.arg1, self.arg2)

    def __eq__(self, other):
        # Spans are compared by token text over their common prefix only, so
        # spans of different length match when the shorter one is a prefix of
        # the longer one. Labels are not compared.
        return (_spans_match(self.arg1, other.arg1)
                and _spans_match(self.arg2, other.arg2)
                and _spans_match(self.trigger, other.trigger))

    def get_feature_vector(self):
        """
        Build the feature dictionary of this relation.

        Keys are the feature numbers as strings ('1', '2', ..., '5.0', '5.1',
        ...); the ``.0`` / ``.1`` suffix (``.1`` / ``.2`` for feature 12)
        distinguishes the two arguments.

        :return: dict mapping feature name to value
        """
        rf = {}

        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')
        rf['10'] = arg1_type + '::' + arg2_type

        # 'same' covers a trigger head that coincides with an argument head;
        # previously the direction stayed unassigned and raised
        # UnboundLocalError.
        arg1_direction = _direction(trigger.i, arg1.i)
        arg2_direction = _direction(trigger.i, arg2.i)

        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction + '::' + arg2_direction

        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']

        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']

        # RF15

        for i, token in enumerate([arg1, arg2]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # NOTE(review): arg1's type is stored for both arguments, which
            # looks like a copy-paste slip. Kept as is because changing it
            # alters the features seen by already-trained models.
            rf['9.{}'.format(i)] = arg1_type
            dep_path = get_min_dep_path(token.i, trigger.i,
                                        self.doc, self.LCA)
            rf['17.{}'.format(i)] = dep_path
            # Overwritten on every pass, so the final value is the path
            # length for arg2.
            rf['20'] = len(dep_path.split('::'))
            # NOTE(review): the distance always uses arg1, so '22.1' equals
            # '22.0'; kept for the same compatibility reason as '9.*'.
            rf['22.{}'.format(i)] = abs(arg1.i - trigger.i)

        rf['18'] = rf['17.0'] + '::' + rf['17.1']

        rf['19'] = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf


class Token:
    """ Named entity, has doc, sent, doc_idx, sent_idx, and label

    ``start`` / ``end`` are ``token.idx`` and ``token.idx + len(token)``
    (presumably character offsets into the document text -- confirm).
    """

    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.attributes = {}
        self.label = label
        self.start = self.token.idx
        self.end = self.token.idx + len(self.token)

    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)

    def set_label(self, label):
        """Replace the label of this token."""
        self.label = label

    def set_attribute(self, label, value):
        """Store `value` under attribute name `label`."""
        self.attributes[label] = value

    def get_feature_vector(self):
        """
        Build the feature dictionary of this token from its sentence context.

        Window positions in the feature names count from the start of the
        (possibly clipped) window, not from the token itself.

        :return: dict mapping feature name to value
        """
        # Stores feature dictionary
        feat_dict = {}

        # 1. Create token spans, clipped at the sentence boundaries.
        sent_len = len(self.sent)

        # 5 token span
        large_span = self.sent[max(0, self.sent_idx - 2):
                               min(sent_len, self.sent_idx + 3)]

        # 3 token span
        short_span = self.sent[max(0, self.sent_idx - 1):
                               min(sent_len, self.sent_idx + 2)]

        for i, t in enumerate(large_span):
            feat_dict['F.1_{}'.format(i)] = t.text
            feat_dict['F.2_{}'.format(i)] = t.lemma_
            feat_dict['F.3_{}'.format(i)] = t.pos_
            feat_dict['F.4_{}'.format(i)] = t.ent_type_

        for i, t in enumerate(short_span):
            feat_dict['F.5_{}'.format(i)] = "::".join([t.lemma_, t.pos_])
            feat_dict['F.6_{}'.format(i)] = "::".join([t.ent_type_, t.pos_])

        # POS bigrams
        pos_tags = [t.pos_ for t in large_span]
        for i, bigram in enumerate(zip(pos_tags, pos_tags[1:])):
            feat_dict['F.10_{}'.format(i)] = " ".join(bigram)

        # Raw-string bigrams
        words = [t.text for t in short_span]
        for i, bigram in enumerate(zip(words, words[1:])):
            feat_dict['F.11_{}'.format(i)] = " ".join(bigram)

        # Get dependency with head if it exists
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])

        # One feature per component of the token's vector (the original
        # comment calls it the GloVe vector).
        for i, component in enumerate(self.token.vector):
            feat_dict['F.9_{}'.format(i)] = component

        return feat_dict


class _NamedEntity:
    """Shared behaviour of the named-entity records: a name compared case-insensitively."""

    _tag = 'ENTITY'

    # __eq__ is defined without a matching hash, so instances are unhashable.
    __hash__ = None

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<{} name='{}'>".format(self._tag, self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()


class Character(_NamedEntity):
    """ Named Entity consisting of one or more tokens """

    _tag = 'CHARACTER'

    def __init__(self, name, age='none', gender='none'):
        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<CHARACTER name='{}' age='{}' gender='{}'>".format(
            self.name, self.age, self.gender)


class Place(_NamedEntity):
    """ Named Entity consisting of one or more tokens """

    _tag = 'PLACE'


class Sayword(_NamedEntity):
    """ Named Entity consisting of one or more tokens """

    _tag = 'SAYWORD'


class CharacterLine(_NamedEntity):
    """ Named Entity consisting of one or more tokens """

    _tag = 'CHARACTER_LINE'


class SpatialSignal(_NamedEntity):
    """ Named Entity consisting of one or more tokens """

    _tag = 'SPATIAL_SIGNAL'