Mercurial > hg > chourdakisreiss2018smc
diff demo/ner.py @ 0:90155bdd5dd6
first commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 18:27:05 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 1 14:05:17 2018

@author: Emmanouil Theofanis Chourdakis

Parsing of brat-style annotation files (via pypeg2 grammars) and feature
extraction over spaCy documents for named-entity / relation classification.
"""

from pypeg2 import *
import re


def var_generator(T):
    """Yield an endless sequence of variable names: T1, T2, T3, ..."""
    I = 0
    while True:
        I += 1
        yield "{}{}".format(T, I)


def l_label_generator(T):
    """Yield an endless sequence of line labels: <TLINE1>, <TLINE2>, ..."""
    I = 0
    while True:
        I += 1
        yield "<{}LINE{}>".format(T, I)


# Regular expressions for the brat annotation file format.
# (raw strings; byte-identical patterns to the originals)
annot_var = re.compile(r"[A-Z][0-9]+")
annot_pos = re.compile(r"[0-9]+ [0-9]+(\;[0-9]+ [0-9]+)*")
annot_label = re.compile(r'[A-Za-z0-9_]+')
label_var_tuple = re.compile(r'[A-Za-z0-9_]+\:[A-Z][0-9]+')


class AnnotationType(Keyword):
    """Closed set of entity annotation types used in the corpus."""
    grammar = Enum(K("Place"),
                   K("Character"),
                   K("Character_Line"),
                   K("Motion"),
                   K("Motion_Signal"),
                   K("Says"),
                   K("Spatial_Signal"))


class AttributeType(Keyword):
    """Closed set of attribute types attachable to an annotation."""
    grammar = Enum(K("Age"), K("Gender"))


class AnnotationTuple:
    """One entity line: variable, type, character offsets, surface text."""
    grammar = attr('variable', annot_var),\
        attr('type', AnnotationType),\
        attr('idx', annot_pos),\
        attr('annotation', restline)


class AttributeTuple:
    """One attribute line: variable, type, target variable, value text."""
    grammar = attr('variable', annot_var),\
        attr('type', AttributeType), \
        attr('target', annot_var), \
        attr('annotation', restline)


class VarArg:
    """A single `label:VAR` argument of a relation."""
    grammar = attr('label', annot_label), ':', attr('target', annot_var)


class VarArgs(List):
    """One or more relation arguments."""
    grammar = some(VarArg)


class RelationTuple:
    """One relation line: variable followed by its labelled arguments."""
    grammar = attr('variable', annot_var),\
        attr('args', VarArgs)


class AnnotLine(List):
    """Any one of the three kinds of annotation-file lines."""
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]


class AnnotationFile(List):
    """A whole annotation file: one or more annotation lines."""
    grammar = some(AnnotLine)


def get_tokens_by_label(label, sent_tokens, sent_labels):
    """Group `sent_tokens` into spans tagged B-label / I-label.

    Returns a list of token groups, each group being the consecutive
    tokens of one entity mention.
    """
    tokens = []
    blabel = "B-{}".format(label)
    ilabel = 'I-{}'.format(label)

    tok_ = []
    for n, l in enumerate(sent_labels):
        if l == blabel:
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = [sent_tokens[n]]
        elif l == ilabel:
            tok_.append(sent_tokens[n])
        else:
            if len(tok_) > 0:
                tokens.append(tok_)
            tok_ = []

    # BUGFIX: flush a mention that runs to the end of the sentence;
    # the original silently dropped it.
    if len(tok_) > 0:
        tokens.append(tok_)

    return tokens


def get_token_head(span):
    """Return the syntactic head of `span`: the first token whose head
    lies outside the span (or the sentence root if reached)."""
    span_idx = [tok.i for tok in span]
    head = span[0]
    while head.head.i in span_idx:
        if head == head.head:
            # Sentence root: its head is itself, so stop here.
            return head
        head = head.head
    return head


def get_min_dep_path(a, b, doc, LCA):
    """Return the dependency path between token indices `a` and `b`
    through their lowest common ancestor, encoded as
    "dir|dep::dir|dep::...", or "<UND>" if no common ancestor exists.

    LCA is the matrix produced by spaCy's Doc.get_lca_matrix().
    """
    lca_idx = LCA[a, b]
    if lca_idx == -1:
        return "<UND>"

    lca = doc[lca_idx]

    m_a = []  # steps from `a` up to the LCA
    m_b = []  # steps from `b` up to the LCA (reversed to read downwards)

    tok = doc[a]
    while tok != lca:
        if tok.head != None:
            m_a.append(('up', tok.dep_))
        tok = tok.head

    tok = doc[b]
    while tok != lca:
        if tok.head != None:
            m_b.append(('down', tok.dep_))
        tok = tok.head
    m_b.reverse()

    path = m_a + m_b
    return "::".join("{}|{}".format(tup[0], tup[1]) for tup in path)


def get_dep_with_head(tok):
    """If `tok` is a direct dependent of the sentence root, return
    (dependency label, root lemma); otherwise (None, root lemma).

    NOTE: the original file defined this function (and var_generator)
    twice with identical bodies; the duplicates have been removed.
    """
    dep_ = []
    while tok.head != tok:
        dep_.append(tok.dep_)
        tok = tok.head

    if len(dep_) == 1:
        return dep_[0], tok.lemma_
    else:
        return None, tok.lemma_


class Document:
    """Wraps a spaCy Doc with per-token labels/attributes and gold or
    candidate relations for feature extraction."""

    def __init__(self, doc):
        self.doc = doc
        self.LCA = doc.get_lca_matrix()
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]

        self.tokens = []           # all Token wrappers, document order
        self.token_sentences = []  # Token wrappers grouped by sentence
        self.relations = []        # gold Relation objects

        for m, sent in enumerate(doc.sents):
            tlist = []
            for n, tok in enumerate(sent):
                token = Token(tok, doc, tok.i, sent, n)
                tlist.append(token)
            self.token_sentences.append(tlist)
            self.tokens += tlist

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """Append a new wrapped token to the flat token list."""
        token = Token(token, doc, doc_idx, sent, sent_idx, label)
        self.tokens.append(token)

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a gold relation between two token groups and a trigger."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return all wrapped tokens fully inside character span [start, end]."""
        tokens = []
        for tok in self.tokens:
            if tok.start >= start and tok.end <= end:
                tokens.append(tok)
        return tokens

    def assign_label_to_tokens(self, start, end, label):
        """IOB-tag the tokens in character span [start, end] with `label`."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            if n == 0:
                IOB = 'B'
            else:
                IOB = 'I'
            token.set_label('{}-{}'.format(IOB, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Tag every token whose lemma equals `lemma` as B-`label`."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Attach attribute `label`=`attribute` to tokens in the span."""
        tokens = self.find_tokens(start, end)
        for n, token in enumerate(tokens):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """Return per-sentence lists of (feature dicts, IOB labels)."""
        features = []
        labels = []
        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []
            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                sentence_labels.append(token.label)
            features.append(sentence_features)
            labels.append(sentence_labels)
        return features, labels

    def get_token_features_attributes(self, label):
        """Return per-sentence (feature dicts, attribute values for `label`),
        with 'O' for tokens lacking the attribute."""
        features = []
        labels = []
        for sentence in self.token_sentences:
            sentence_features = []
            sentence_labels = []
            for token in sentence:
                sentence_features.append(token.get_feature_vector())
                if label in token.attributes:
                    sentence_labels.append(token.attributes[label])
                else:
                    sentence_labels.append('O')
            features.append(sentence_features)
            labels.append(sentence_labels)
        return features, labels

    def get_gold_relation_feature_labels(self):
        """Return (feature dicts, labels) for the gold relations."""
        features = []
        labels = []
        for r in self.relations:
            feat = r.get_feature_vector()
            label = r.label
            features.append(feat)
            labels.append(label)
        return features, labels

    def get_candidate_relation_feature_labels(self):
        """Return (feature dicts, labels) for all candidate relations."""
        features = []
        labels = []
        candidate_relations = self.get_candidate_relations()
        for r in candidate_relations:
            feat = r.get_feature_vector()
            label = r.label
            features.append(feat)
            labels.append(label)
        return features, labels

    def get_tokens_with_label(self, label):
        """Return, per sentence, the B-/I- grouped token mentions of `label`."""
        blabel = "B-{}".format(label)
        ilabel = 'I-{}'.format(label)

        tokens = []
        for I in range(len(self.token_sentences)):
            tokens_ = []
            sent_tokens = self.token_sentences[I]
            sent_labels = [t.label for t in sent_tokens]

            tok_ = []
            for n, l in enumerate(sent_labels):
                if l == blabel:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = [sent_tokens[n]]
                elif l == ilabel:
                    tok_.append(sent_tokens[n])
                else:
                    if len(tok_) > 0:
                        tokens_.append(tok_)
                    tok_ = []

            # BUGFIX: flush a mention ending exactly at the sentence end;
            # the original lost such mentions.
            if len(tok_) > 0:
                tokens_.append(tok_)

            tokens.append(tokens_)

        return tokens

    def get_candidate_relations(self):
        """Enumerate all candidate (Character, Place, Spatial_Signal) and
        (Character, Character_Line, Says) relations per sentence; copy the
        label from a matching gold relation when one exists."""
        candidate_relations = []

        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        for I in range(len(spatial_signals)):
            for sp in spatial_signals[I]:
                for ch in characters[I]:
                    for pl in places[I]:
                        rel = Relation(ch, pl, sp, self.LCA)
                        candidate_relations.append(rel)

        for I in range(len(say_words)):
            for sw in say_words[I]:
                for ch in characters[I]:
                    for cl in character_lines[I]:
                        rel = Relation(ch, cl, sw, self.LCA)
                        candidate_relations.append(rel)

        for cr in candidate_relations:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidate_relations

    def predict_relations(self, model):
        """Classify candidate relations with `model` (sklearn-style
        .predict); keep those not labelled 'NONE'."""
        relations = self.get_candidate_relations()
        for n, r in enumerate(relations):
            f = r.get_feature_vector()
            label = model.predict([f])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text


class Relation:
    """Relation with arg1, arg2, trigger as token groups, plus a label."""

    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        self.doc = trigger[0].doc
        self.LCA = lca
        self.label = label

    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(self.label, self.trigger, self.arg1, self.arg2)

    def __eq__(self, other):
        # Relations match when their argument/trigger texts agree over the
        # common prefix of each token group.
        return all([self.arg1[n].text == other.arg1[n].text for n in range(min(len(self.arg1), len(other.arg1)))]) \
            and all([self.arg2[n].text == other.arg2[n].text for n in range(min(len(self.arg2), len(other.arg2)))]) \
            and all([self.trigger[n].text == other.trigger[n].text for n in range(min(len(self.trigger), len(other.trigger)))])

    def get_feature_vector(self):
        """Build the relation feature dictionary (keys follow the feature
        numbering used by the classifier)."""
        rf = {}

        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')

        rf['10'] = arg1_type + '::' + arg2_type

        # BUGFIX: the originals left these unbound (NameError) when the
        # trigger and argument heads share a token index; equality now
        # falls through to 'left'.
        arg1_direction = 'right' if trigger.i < arg1.i else 'left'
        arg2_direction = 'right' if trigger.i < arg2.i else 'left'

        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction + '::' + arg2_direction

        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']
        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']

        # RF15

        arg_types = [arg1_type, arg2_type]
        for i, token in enumerate([arg1, arg2]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # BUGFIX: was arg1_type for both arguments; use the type that
            # matches this argument.
            rf['9.{}'.format(i)] = arg_types[i]
            rf['17.{}'.format(i)] = get_min_dep_path(token.i, trigger.i, self.doc, self.LCA)
            # NOTE: overwritten each iteration, so '20' ends up being the
            # path length for arg2 — kept for feature-key compatibility.
            rf['20'] = len(rf['17.{}'.format(i)].split('::'))
            # BUGFIX: was computed from arg1 on both iterations; use the
            # current argument's head token.
            rf['22.{}'.format(i)] = max(token.i, trigger.i) - min(token.i, trigger.i)

        rf['18'] = rf['17.0'] + '::' + rf['17.1']

        deppath = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['19'] = deppath
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf


class Token:
    """Named entity token: wraps a spaCy token with doc/sentence indices,
    an IOB label and an attribute dict."""

    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.attributes = {}

        self.label = label
        self.start = self.token.idx                    # char offset (incl.)
        self.end = self.token.idx + len(self.token)    # char offset (excl.)

    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)

    def set_label(self, label):
        self.label = label

    def set_attribute(self, label, value):
        self.attributes[label] = value

    def get_feature_vector(self):
        """Build the per-token feature dictionary (context windows, POS
        n-grams, head dependency, and the token's word vector)."""

        def find_ngrams(input_list, n):
            return zip(*[input_list[i:] for i in range(n)])

        # Stores feature dictionary
        feat_dict = {}

        # 1. Create token spans

        # 5 token span
        large_span = self.sent[max(0, self.sent_idx - 2):min(len(self.sent), self.sent_idx + 3)]

        # 3 token span
        short_span = self.sent[max(0, self.sent_idx - 1):min(len(self.sent), self.sent_idx + 2)]

        for i, t in enumerate(large_span):
            feat_dict['F.1_{}'.format(i)] = t.text
            feat_dict['F.2_{}'.format(i)] = t.lemma_
            feat_dict['F.3_{}'.format(i)] = t.pos_
            feat_dict['F.4_{}'.format(i)] = t.ent_type_

        for i, t in enumerate(short_span):
            feat_dict['F.5_{}'.format(i)] = "::".join([t.lemma_, t.pos_])
            feat_dict['F.6_{}'.format(i)] = "::".join([t.ent_type_, t.pos_])

        ngrams = find_ngrams([t.pos_ for t in large_span], 2)  # POS bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.10_{}'.format(i)] = " ".join(ng)

        ngrams = find_ngrams([t.text for t in short_span], 2)  # Raw-string bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.11_{}'.format(i)] = " ".join(ng)

        # Get dependency with head if it exists
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])

        # Get glove vector
        vector = self.token.vector
        for i in range(len(vector)):
            feat_dict['F.9_{}'.format(i)] = vector[i]

        return feat_dict


class Character:
    """Named Entity consisting of one or more tokens."""

    def __init__(self, name, age='none', gender='none'):
        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<CHARACTER name='{}' age='{}' gender='{}'>".format(self.name,
                                                                   self.age,
                                                                   self.gender)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()


class Place:
    """Named Entity consisting of one or more tokens."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<PLACE name='{}'>".format(self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()


class Sayword:
    """Named Entity consisting of one or more tokens."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SAYWORD name='{}'>".format(self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()


class CharacterLine:
    """Named Entity consisting of one or more tokens."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<CHARACTER_LINE name='{}'>".format(self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()


class SpatialSignal:
    """Named Entity consisting of one or more tokens."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SPATIAL_SIGNAL name='{}'>".format(self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()