view demo/ner.py @ 0:90155bdd5dd6

first commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 18:27:05 +0100
parents
children
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr  1 14:05:17 2018

@author: Emmanouil Theofanis Chourdakis
"""

from pypeg2 import *
import re 

def var_generator(T):
    """Yield an endless stream of numbered variable names: T1, T2, T3, ...

    NOTE(review): an identical ``var_generator`` is defined again further
    down this module; at import time the later definition wins.
    """
    count = 0
    while True:
        count += 1
        yield f"{T}{count}"
        


def l_label_generator(T):
    """Yield an endless stream of line labels: <TLINE1>, <TLINE2>, ..."""
    n = 1
    while True:
        yield "<{0}LINE{1}>".format(T, n)
        n += 1

# Regexes for parsing brat-style standoff annotation lines.
# Fix: the position pattern previously used "\;" inside a NON-raw string,
# which is an invalid string escape (DeprecationWarning, later a
# SyntaxWarning/SyntaxError); ';' needs no escaping in a regex, and all
# patterns are now raw strings.
annot_var = re.compile(r"[A-Z][0-9]+")                      # annotation id, e.g. "T12"
annot_pos = re.compile(r"[0-9]+ [0-9]+(;[0-9]+ [0-9]+)*")   # "start end[;start end ...]"
annot_label = re.compile(r'[A-Za-z0-9_]+')                  # bare label token
label_var_tuple = re.compile(r'[A-Za-z0-9_]+:[A-Z][0-9]+')  # "Label:T3" argument pair

class AnnotationType(Keyword):
    """pypeg2 keyword set of the entity/event types an annotation may carry."""
    grammar =  Enum(K("Place"), 
                    K("Character"),
                    K("Character_Line"),
                    K("Motion"),
                    K("Motion_Signal"),
                    K("Says"),
                    K("Spatial_Signal"))

class AttributeType(Keyword):
    """pypeg2 keyword set of supported attribute names."""
    grammar = Enum(K("Age"), K("Gender"))

class AnnotationTuple:
    """Grammar for one text-bound annotation line: a variable id (e.g. "T1"),
    an AnnotationType, character offsets, then the annotated text to the end
    of the line.  Presumably brat-style standoff format -- TODO confirm.
    """
    grammar = attr('variable',annot_var),\
              attr('type',AnnotationType),\
              attr('idx',annot_pos),\
              attr('annotation',restline)
                

class AttributeTuple:
    """Grammar for one attribute line: a variable id, an AttributeType, the
    id of the annotation it targets, then the attribute value to the end of
    the line."""
    grammar = attr('variable', annot_var),\
              attr('type',AttributeType), \
              attr('target', annot_var), \
              attr('annotation', restline)
              
class VarArg:
    """Grammar for a single "Label:Var" relation-argument pair."""
    grammar = attr('label', annot_label), ':', attr('target', annot_var)
              
class VarArgs(List):
    """One or more VarArg pairs."""
    grammar = some(VarArg)
    
class RelationTuple:
    """Grammar for one relation line: a variable id followed by its
    "Label:Var" argument pairs."""
    grammar = attr('variable', annot_var),\
              attr('args', VarArgs)
                          
class AnnotLine(List):
    """One annotation-file line: an annotation, attribute, or relation tuple."""
    grammar = [AnnotationTuple, AttributeTuple, RelationTuple]
              
class AnnotationFile(List):
    """A whole annotation file: one or more AnnotLine entries."""
    grammar = some(AnnotLine)

def get_tokens_by_label(label, sent_tokens, sent_labels):
    """Group a sentence's tokens into contiguous spans tagged with `label`.

    Parameters
    ----------
    label : str
        Bare label name, without the "B-"/"I-" IOB prefix.
    sent_tokens : sequence
        Tokens of one sentence.
    sent_labels : sequence of str
        IOB labels aligned one-to-one with sent_tokens.

    Returns
    -------
    list of lists
        Each inner list is one maximal "B-label (I-label)*" run of tokens.
    """
    blabel = "B-{}".format(label)
    ilabel = 'I-{}'.format(label)

    tokens = []
    current = []
    for tok, lab in zip(sent_tokens, sent_labels):
        if lab == blabel:
            if current:
                tokens.append(current)
            current = [tok]
        elif lab == ilabel:
            current.append(tok)
        else:
            if current:
                tokens.append(current)
            current = []
    # Bug fix: flush the final span -- the original dropped any span that
    # ran to the end of the sentence.
    if current:
        tokens.append(current)
    return tokens

def get_token_head(span):
    """Return the token in `span` that syntactically dominates the rest.

    Starting from the first token, follow the dependency-head chain upward
    as long as the next head still lies inside the span; stop (and return)
    when the chain leaves the span or reaches a self-headed root.
    """
    indices = {tok.i for tok in span}
    current = span[0]
    while current.head.i in indices:
        if current == current.head:
            # Self-headed root inside the span: cannot climb further.
            return current
        current = current.head
    return current
    

def get_min_dep_path(a, b, doc, LCA):
    """Dependency path between tokens `a` and `b` through their lowest
    common ancestor.

    Parameters
    ----------
    a, b : int
        Token indices into `doc`.
    doc : indexable token sequence (e.g. a spaCy Doc)
    LCA : matrix-like
        LCA[a, b] is the index of the lowest common ancestor, or -1 if none.

    Returns
    -------
    str
        "dir|dep" steps joined with "::" ('up' steps from `a` to the LCA,
        then 'down' steps from the LCA to `b`), or "<UND>" when no LCA
        exists.
    """
    lca_idx = LCA[a, b]
    if lca_idx == -1:
        return "<UND>"
    lca = doc[lca_idx]  # reuse lca_idx instead of re-reading the matrix

    # Climb from `a` up to the LCA, recording 'up' arcs.
    # Bug fix: the original tested `tok.head != None` but never advanced
    # `tok` when the test failed, and a self-headed root reached before the
    # LCA also spun forever; both cases now terminate the climb.
    path_up = []
    tok = doc[a]
    while tok != lca and tok.head != tok:
        path_up.append(('up', tok.dep_))
        tok = tok.head

    # Climb from `b` up to the LCA, then reverse so the arcs read downward.
    path_down = []
    tok = doc[b]
    while tok != lca and tok.head != tok:
        path_down.append(('down', tok.dep_))
        tok = tok.head
    path_down.reverse()

    steps = path_up + path_down
    return "::".join("{}|{}".format(direction, dep) for direction, dep in steps)
def get_dep_with_head(tok):
    """Climb from `tok` to the parse root (a self-headed token).

    Returns a pair (dep, root_lemma): `dep` is tok's dependency label when
    tok hangs directly off the root, otherwise None; `root_lemma` is always
    the root's lemma.
    """
    arcs = []
    node = tok
    while node.head != node:
        arcs.append(node.dep_)
        node = node.head

    dep = arcs[0] if len(arcs) == 1 else None
    return dep, node.lemma_

def var_generator(T):
    """Endless generator of numbered variable names ("T1", "T2", ...).

    NOTE(review): duplicate of the identical ``var_generator`` defined
    earlier in this module; consider removing one copy.
    """
    n = 1
    while True:
        yield "{}{}".format(T, n)
        n += 1

def get_dep_with_head(tok):
    """Return (dep, root_lemma): tok's dependency label if it is a direct
    child of the parse root, else None; always the root's lemma.

    NOTE(review): duplicate of the identical ``get_dep_with_head`` defined
    earlier in this module; consider removing one copy.
    """
    hops = 0
    first_dep = None
    node = tok
    while node.head != node:
        if hops == 0:
            first_dep = node.dep_
        hops += 1
        node = node.head

    if hops == 1:
        return first_dep, node.lemma_
    return None, node.lemma_
    
class Document:
    """Wraps a parsed document (spaCy-style Doc) with the token and relation
    bookkeeping used for NER and relation extraction.

    Attributes:
        doc: the underlying parsed document.
        LCA: lowest-common-ancestor matrix from doc.get_lca_matrix().
        text: the raw document text.
        sentences: the sentences as strings.
        tokens: flat list of wrapped Token objects.
        token_sentences: the same Token objects grouped per sentence.
        relations: gold (and later predicted) Relation objects.
    """

    def __init__(self, doc):
        self.doc = doc
        self.LCA = doc.get_lca_matrix()
        self.text = doc.text
        self.sentences = [str(s) for s in doc.sents]

        self.tokens = []
        self.token_sentences = []

        self.relations = []

        # Wrap every token, keeping both a per-sentence grouping and a flat
        # list (the flat list is what find_tokens() searches).
        for sent in doc.sents:
            tlist = [Token(tok, doc, tok.i, sent, n)
                     for n, tok in enumerate(sent)]
            self.token_sentences.append(tlist)
            self.tokens += tlist

    def add_token(self, token, doc, doc_idx, sent, sent_idx, label='NONE'):
        """Wrap a raw token and append it to the flat token list."""
        self.tokens.append(Token(token, doc, doc_idx, sent, sent_idx, label))

    def add_relation(self, trigger, arg1, arg2, label):
        """Record a gold relation between token spans."""
        self.relations.append(Relation(arg1, arg2, trigger, self.LCA, label))

    def find_tokens(self, start, end):
        """Return all tokens lying fully inside character range [start, end]."""
        return [tok for tok in self.tokens
                if tok.start >= start and tok.end <= end]

    def assign_label_to_tokens(self, start, end, label):
        """IOB-label every token inside [start, end]: B- first, I- after."""
        for n, token in enumerate(self.find_tokens(start, end)):
            iob = 'B' if n == 0 else 'I'
            token.set_label('{}-{}'.format(iob, label))

    def assign_label_to_tokens_by_matching_lemma(self, lemma, label):
        """Give a B- label to every token whose lemma equals `lemma`."""
        for t in self.tokens:
            if t.token.lemma_ == lemma:
                t.label = 'B-{}'.format(label)

    def assign_attribute_to_tokens(self, start, end, label, attribute):
        """Attach an attribute value to every token inside [start, end]."""
        for token in self.find_tokens(start, end):
            token.set_attribute(label, attribute)

    def get_token_features_labels(self):
        """Per-sentence feature dicts and IOB labels for sequence training."""
        features = []
        labels = []
        for sentence in self.token_sentences:
            features.append([tok.get_feature_vector() for tok in sentence])
            labels.append([tok.label for tok in sentence])
        return features, labels

    def get_token_features_attributes(self, label):
        """Like get_token_features_labels, but the target is each token's
        attribute value under `label` ('O' when the token lacks it)."""
        features = []
        labels = []
        for sentence in self.token_sentences:
            features.append([tok.get_feature_vector() for tok in sentence])
            labels.append([tok.attributes.get(label, 'O')
                           for tok in sentence])
        return features, labels

    def get_gold_relation_feature_labels(self):
        """Feature dicts and labels for the stored (gold) relations."""
        features = []
        labels = []
        for r in self.relations:
            features.append(r.get_feature_vector())
            labels.append(r.label)
        return features, labels

    def get_candidate_relation_feature_labels(self):
        """Feature dicts and labels for all candidate relations."""
        features = []
        labels = []
        for r in self.get_candidate_relations():
            features.append(r.get_feature_vector())
            labels.append(r.label)
        return features, labels

    def get_tokens_with_label(self, label):
        """Group each sentence's tokens into contiguous B-/I- spans of
        `label`.

        Returns one list per sentence, each containing the token spans
        found in that sentence.
        """
        blabel = "B-{}".format(label)
        ilabel = 'I-{}'.format(label)

        tokens = []
        for sent_tokens in self.token_sentences:
            spans = []
            current = []
            for tok in sent_tokens:
                if tok.label == blabel:
                    if current:
                        spans.append(current)
                    current = [tok]
                elif tok.label == ilabel:
                    current.append(tok)
                else:
                    if current:
                        spans.append(current)
                    current = []
            # Bug fix: flush a span that runs to the end of the sentence --
            # the original silently dropped it.
            if current:
                spans.append(current)
            tokens.append(spans)

        return tokens

    def get_candidate_relations(self):
        """Build every candidate relation per sentence:
        Spatial_Signal x Character x Place and Says x Character x
        Character_Line.  Candidates matching a stored gold relation inherit
        its label; the rest keep the default 'NONE'.
        """
        candidates = []

        characters = self.get_tokens_with_label('Character')
        places = self.get_tokens_with_label('Place')
        spatial_signals = self.get_tokens_with_label('Spatial_Signal')
        say_words = self.get_tokens_with_label('Says')
        character_lines = self.get_tokens_with_label('Character_Line')

        for i in range(len(spatial_signals)):
            for sp in spatial_signals[i]:
                for ch in characters[i]:
                    for pl in places[i]:
                        candidates.append(Relation(ch, pl, sp, self.LCA))

        for i in range(len(say_words)):
            for sw in say_words[i]:
                for ch in characters[i]:
                    for cl in character_lines[i]:
                        candidates.append(Relation(ch, cl, sw, self.LCA))

        # Copy gold labels onto the matching candidates.
        for cr in candidates:
            for r in self.relations:
                if cr == r:
                    cr.label = r.label

        return candidates

    def predict_relations(self, model):
        """Classify every candidate relation with `model` and keep those
        predicted as anything other than 'NONE'."""
        for r in self.get_candidate_relations():
            feature_vector = r.get_feature_vector()
            label = model.predict([feature_vector])[0]
            if label != 'NONE':
                r.label = label
                self.relations.append(r)

    def __str__(self):
        return self.text

class Relation:
    """ relation, has arg1, arg2, trigger as tokens, also label """
    def __init__(self, arg1, arg2, trigger, lca, label='NONE'):
        # arg1, arg2 and trigger are token spans (lists of this module's
        # Token wrappers); lca is the document's lowest-common-ancestor
        # matrix.  label defaults to 'NONE', i.e. an unlabelled candidate.
        self.arg1 = arg1
        self.arg2 = arg2
        self.trigger = trigger
        # The underlying document is taken from the trigger's first token.
        self.doc = trigger[0].doc
        self.LCA = lca
        self.label = label
        
    def __repr__(self):
        return "<{}| trigger: {}, arg1: {}, arg2: {}>".format(self.label, self.trigger, self.arg1, self.arg2)
    
    def __eq__(self, other):
        # NOTE(review): each span pair is compared only up to the length of
        # the shorter span, so a span and its prefix compare equal --
        # confirm this is intended.
        return  all([self.arg1[n].text == other.arg1[n].text for n in range(min(len(self.arg1), len(other.arg1)))]) \
                and all([self.arg2[n].text == other.arg2[n].text for n in range(min(len(self.arg2), len(other.arg2)))]) \
                and all([self.trigger[n].text == other.trigger[n].text for n in range(min(len(self.trigger), len(other.trigger)))])
            
    def get_feature_vector(self):
        """Build the relation-classifier feature dictionary (keys '1'..'23')."""
        rf = {}

        # Syntactic head token of each span.
        arg1 = get_token_head([t.token for t in self.arg1])
        arg2 = get_token_head([t.token for t in self.arg2])
        trigger = get_token_head([t.token for t in self.trigger])

        # Entity types, read off the B- IOB label of each span's first token.
        arg1_type = self.arg1[0].label.replace('B-', '')
        arg2_type = self.arg2[0].label.replace('B-', '')
    
        rf['10'] = arg1_type+ '::'+ arg2_type

        # Linear direction of each argument relative to the trigger.
        # NOTE(review): if trigger.i ever equals arg1.i (or arg2.i) the
        # corresponding *_direction variable is never assigned and the
        # rf['12.*'] lines below raise UnboundLocalError -- presumably the
        # head indices never coincide; confirm.
        if trigger.i < arg1.i:
            arg1_direction = 'right'
        if trigger.i > arg1.i:
            arg1_direction = 'left'
        
        if trigger.i < arg2.i:
            arg2_direction = 'right'
        if trigger.i > arg2.i:
            arg2_direction = 'left'
                
        rf['12.1'] = arg1_direction
        rf['12.2'] = arg2_direction
        rf['13'] = arg1_direction+ '::'+ arg2_direction

        # Trigger surface form / lemma / POS and their combinations.
        rf['1'] = trigger.text.lower()
        rf['2'] = trigger.lemma_
        rf['3'] = trigger.pos_
        rf['4'] = rf['2'] + '::' + rf['3']
        rf['11'] = rf['10'] + '::' + rf['2']
        rf['14'] = rf['13'] + '::' + rf['2']
        
        # RF15
            
        # Per-argument features: i == 0 -> arg1, i == 1 -> arg2.
        for i, token in enumerate([arg1, arg2]):
            rf['5.{}'.format(i)] = token.text.lower()
            rf['6.{}'.format(i)] = token.lemma_
            rf['7.{}'.format(i)] = token.pos_
            rf['8.{}'.format(i)] = token.lemma_ + '::' + token.pos_
            # NOTE(review): uses arg1_type on both iterations; arg2_type was
            # possibly intended for i == 1.
            rf['9.{}'.format(i)] = arg1_type
            rf['17.{}'.format(i)] = get_min_dep_path(token.i, trigger.i, self.doc, self.LCA)
            # NOTE(review): '20' is overwritten each iteration, so only the
            # second (arg2) path length survives the loop.
            rf['20'] = len(rf['17.{}'.format(i)].split('::'))

            # NOTE(review): distance uses arg1 on both iterations, so '22.0'
            # and '22.1' are identical; `token` was possibly intended.
            rf['22.{}'.format(i)] = max(arg1.i, trigger.i) - min(arg1.i, trigger.i)
            
            
    
            
        rf['18'] = rf['17.0'] + '::' + rf['17.1']
        
        # Dependency path directly between the two argument heads.
        deppath = get_min_dep_path(arg1.i, arg2.i, self.doc, self.LCA)
        rf['19'] = deppath
        rf['23'] = rf['22.0'] + rf['22.1']

        return rf
    
class Token:
    """ Named entity, has doc, sent, doc_idx, sent_idx, and label """
    def __init__(self, token, doc, doc_idx, sent, sent_idx, label='O'):
        # `token` is the underlying parser token (spaCy-style) and `sent`
        # its sentence span; doc_idx / sent_idx are the token's positions in
        # the document and in the sentence respectively.
        self.token = token
        self.text = token.text
        self.doc = doc
        self.doc_idx = doc_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.attributes = {}  # attribute-name -> value (e.g. Age / Gender)
        
        self.label = label  # IOB label; 'O' means outside any entity
        # Character offsets of the token within the document text
        # (end = start + token length).
        self.start = self.token.idx
        self.end = self.token.idx + len(self.token)
            
    def __repr__(self):
        return "[{} -> {}]".format(repr(self.token), self.label)
    
    def set_label(self, label):
        """Overwrite this token's IOB label."""
#        print("Token {} label changed to {}".format(self.text, label))
        self.label = label
    
    def set_attribute(self, label, value):
        """Store an attribute value under `label`."""
        self.attributes[label] = value
    
    def get_feature_vector(self):
        """Build the per-token feature dictionary (keys 'F.1'..'F.11') used
        by the sequence labeller."""
        
        def find_ngrams(input_list, n):
            # Sliding n-grams over input_list, as an iterator of tuples.
            return zip(*[input_list[i:] for i in range(n)])

        # Stores feature dictionary
        feat_dict = {}
        
        #1. Create token spans
        
        # 5 token span (two tokens of context on each side, clipped at the
        # sentence boundaries)
        large_span = self.sent[max(0, self.sent_idx - 2):min(len(self.sent), self.sent_idx + 3)]
        
        # 3 token span
        short_span = self.sent[max(0, self.sent_idx - 1):min(len(self.sent), self.sent_idx + 2)]
        
        # Surface form, lemma, POS and entity type of each context token.
        for i, t in enumerate(large_span):
            feat_dict['F.1_{}'.format(i)] = t.text
            feat_dict['F.2_{}'.format(i)] = t.lemma_
            feat_dict['F.3_{}'.format(i)] = t.pos_
            feat_dict['F.4_{}'.format(i)] = t.ent_type_
            
        # Combined lemma::POS and entity::POS features on the short span.
        for i, t in enumerate(short_span):
            feat_dict['F.5_{}'.format(i)] = "::".join([t.lemma_, t.pos_])
            feat_dict['F.6_{}'.format(i)] = "::".join([t.ent_type_, t.pos_])
            
        ngrams = find_ngrams([t.pos_ for t in large_span], 2) # POS bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.10_{}'.format(i)] = " ".join(ng)
                
        ngrams = find_ngrams([t.text for t in short_span], 2)  # Raw-string bigrams
        for i, ng in enumerate(ngrams):
            feat_dict['F.11_{}'.format(i)] = " ".join(ng)
        
        # Get dependency with head if it exists (only set when this token is
        # a direct child of the parse root).
        dirdep, headlemma = get_dep_with_head(self.token)
        if dirdep is not None:
            feat_dict['F.7'] = dirdep
            feat_dict['F.8'] = "::".join([dirdep, headlemma])
            
        # Get glove vector
        # (presumably a GloVe-style embedding supplied by the parser model;
        # one feature per dimension -- TODO confirm)
        vector = self.token.vector
        for i in range(len(vector)):
            feat_dict['F.9_{}'.format(i)] = vector[i]
            
            
        return feat_dict
    
class Character:
    """Named-entity wrapper for a character, with age/gender attributes.

    Equality is case-insensitive and considers the name only.
    """

    def __init__(self, name, age='none', gender='none'):
        self.name = name
        self.age = age
        self.gender = gender

    def __repr__(self):
        return "<CHARACTER name='{}' age='{}' gender='{}'>".format(
            self.name, self.age, self.gender)

    def __eq__(self, other):
        # Age and gender are deliberately ignored in comparisons.
        return self.name.lower() == other.name.lower()
    
class Place:
    """Named-entity wrapper for a place; equality is case-insensitive."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<PLACE name='{}'>".format(self.name)

    def __eq__(self, other):
        mine, theirs = self.name.lower(), other.name.lower()
        return mine == theirs
    
class Sayword:
    """Named-entity wrapper for a speech-introducing word; equality is
    case-insensitive on the name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SAYWORD name='{}'>".format(self.name)

    def __eq__(self, other):
        return self.name.lower() == other.name.lower()

class CharacterLine:
    """Named-entity wrapper for a character's spoken line; equality is
    case-insensitive on the name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<CHARACTER_LINE name='{}'>".format(self.name)

    def __eq__(self, other):
        mine = self.name.lower()
        return mine == other.name.lower()
        
class SpatialSignal:
    """Named-entity wrapper for a spatial-relation trigger word; equality is
    case-insensitive on the name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "<SPATIAL_SIGNAL name='{}'>".format(self.name)

    def __eq__(self, other):
        theirs = other.name.lower()
        return self.name.lower() == theirs