from-my-pen-to-your-ears-supplementary-material
diff demo/text2annotation.py~ @ 0:4dad87badb0c
initial commit
author      Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date        Wed, 16 May 2018 17:56:10 +0100
parents     (none)
children    (none)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/demo/text2annotation.py~	Wed May 16 17:56:10 2018 +0100
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Apr 28 14:17:15 2018
+
+@author: Emmanouil Theofanis Chourdakis
+
+Takes a .txt story and annotates it based on:
+
+    characters,
+    places,
+    saywords,
+    character_lines,
+    spatial_indicators,
+
+@output:
+    .ann file with the same name
+
+"""
+
+# Change path to current directory
+import os
+os.chdir(os.path.dirname(os.path.realpath(__file__)))
+
+import argparse
+from sklearn.externals import joblib
+import ner
+import spacy
+import re
+import logging
+import json
+
+logging.basicConfig(level=logging.INFO)
+
+
+def quotes2dict(text):
+    """Replace each double-quoted span with a <clineN>. tag and return
+    the new text together with a tag -> quoted-string dictionary."""
+    new_text = text
+    is_open = False
+
+    quote_no = 0
+    quote = []
+    quote_dict = {}
+
+    for c in text:
+        if c == '"' and not is_open:
+            is_open = True
+            continue
+        elif c == '"' and is_open:
+            is_open = False
+            quote_dict["<cline{}>.".format(quote_no)] = ''.join(quote)
+            new_text = new_text.replace('"' + ''.join(quote) + '"',
+                                        "<cline{}>.".format(quote_no))
+            quote = []
+            quote_no += 1
+            continue
+
+        if is_open:
+            quote.append(c)
+
+    return new_text, quote_dict
+
+
+def annotate_entities(text,
+                      model,
+                      character_lut,
+                      saywords_lut,
+                      spind_lut,
+                      places_lut):
+    """
+    Annotates entities in text using the NER model in `model`.
+
+    returns: a ner.Document object with tokens labelled via the
+    LUTs provided and also via the NER model in `model`.
+    """
+
+    # Find and store character lines in a dictionary
+    logging.info('Swapping character lines for character line tags')
+    processed_text, quotes = quotes2dict(text)
+
+    # Create spacy document object from the resulting text
+    # Create the nlp engine
+    logging.info("Loading 'en' spacy model")
+    nlp = spacy.load('en')
+
+    # Parse to spacy document
+    logging.info("Parsing document to spacy")
+    doc = nlp(processed_text)
+
+    # Parse to our custom Document object
+    logging.info("Parsing document to our object format for Named Entity Recognition")
+    mDoc = ner.Document(doc)
+
+    # Label <cline[0-9]+>. as character line
+    logging.info("Labeling character lines")
+    spans = [r.span() for r in re.finditer(r'<cline[0-9]+>\.', mDoc.text)]
+    for span in spans:
+        mDoc.assign_label_to_tokens(span[0], span[1], 'Character_Line')
+
+    # Parse using LUTs
+
+    # *- Characters
+
+    # Sort by number of words so that tokens with more words override
+    # tokens with fewer words in labelling. For example, if you have
+    # `man' and `an old man' as characters, the character labelled is
+    # going to be `an old man' and not the included `man'.
+    logging.info("Labeling characters from LUT")
+    cLUT = [c.lower() for c in sorted(character_lut, key=lambda x: len(x.split()))]
+
+    # Find literals in the document that match a character in cLUT.
+    # re.escape guards against regex metacharacters in LUT entries and
+    # re.IGNORECASE matches the lowercased entries against the text.
+    for c in cLUT:
+        spans = [r.span() for r in re.finditer(re.escape(c), mDoc.text, flags=re.IGNORECASE)]
+        for span in spans:
+            mDoc.assign_label_to_tokens(span[0], span[1], 'Character')
+
+    # *- Saywords
+
+    # Assign labels to saywords. Here saywords contain only one token.
+    # In addition, we check against the sayword's lemma and not the
+    # sayword itself.
+    logging.info("Labeling saywords from LUT")
+    swLUT = [nlp(sw)[0].lemma_ for sw in saywords_lut]
+    for sw in swLUT:
+        mDoc.assign_label_to_tokens_by_matching_lemma(sw, 'Says')
+
+    # *- Places
+    logging.info("Labeling places from LUT")
+    plLUT = [pl.lower() for pl in sorted(places_lut, key=lambda x: len(x.split()))]
+
+    # Find literals in the document that match a place in plLUT
+    for pl in plLUT:
+        spans = [r.span() for r in re.finditer(re.escape(pl), mDoc.text, flags=re.IGNORECASE)]
+        for span in spans:
+            mDoc.assign_label_to_tokens(span[0], span[1], 'Place')
+
+    # *- Spatial indicators
+    logging.info("Labeling spatial indicators from LUT")
+    spLUT = [sp.lower() for sp in sorted(spind_lut, key=lambda x: len(x.split()))]
+    for sp in spLUT:
+        spans = [r.span() for r in re.finditer(re.escape(sp), mDoc.text, flags=re.IGNORECASE)]
+        for span in spans:
+            mDoc.assign_label_to_tokens(span[0], span[1], 'Spatial_Signal')
+
+    logging.info("Extracting token features")
+    features, labels = mDoc.get_token_features_labels()
+
+    logging.info("Predicting labels")
+    new_labels = model.predict(features)
+
+    logging.info("Assigning labels based on the NER model")
+    # If a label is not already assigned by a LUT, assign it using the model
+    #logging.info("{} {}".format(len(mDoc.tokens), len(new_labels)))
+    for m, sent in enumerate(mDoc.token_sentences):
+        for n, token in enumerate(sent):
+            if token.label == 'O':
+                token.label = new_labels[m][n]
+
+    return mDoc, quotes
+
+
+def doc2brat(mDoc):
+    """ Returns a brat .ann file str based on mDoc """
+
+    # Variable generator for entities (T in brat format)
+    tvar = ner.var_generator('T')
+
+    ann_str = ""
+
+    # Extract entities in the brat format:
+    # T1	Character START END	entity string
+    labels = ['Character', 'Says', 'Place', 'Spatial_Signal', 'Character_Line']
+
+    for label in labels:
+        token_sentences = mDoc.get_tokens_with_label(label)
+        for tlist in token_sentences:
+            if len(tlist) == 0:
+                continue
+
+            for tokens in tlist:
+                start = tokens[0].start
+                end = tokens[-1].end
+                txt = mDoc.text[start:end]
+                ann_str += "{}\t{} {} {}\t{}\n".format(next(tvar), label, start, end, txt)
+
+    return ann_str
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('input_path', help='.txt file to parse')
+    argparser.add_argument('ner_model_path', help='.pkl file containing NER model')
+    argparser.add_argument('rel_model_path', help='.pkl file containing relational model')
+    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
+    argparser.add_argument('--char-lut', help='.txt file with known characters')
+    argparser.add_argument('--place-lut', help='.txt file with known places')
+    argparser.add_argument('--spatial-indicator-lut', help='.txt file with known spatial indicators')
+    argparser.add_argument('--force', action='store_true',
+                           help='force overwrite when there is a file to be overwritten')
+
+    args = argparser.parse_args()
+
+    # Load text file
+    with open(args.input_path) as f:
+        text = f.read()
+
+    output_dir = os.path.dirname(args.input_path)
+    output_text_path = args.input_path[:-4] + '_processed.txt'
+    output_quotes_path = args.input_path[:-4] + '_quotes.json'
+    output_annotation_path = args.input_path[:-4] + '_processed.ann'
+
+    # Load NER model file
+    ner_model = joblib.load(args.ner_model_path)
+
+    # Load REL model file
+    rel_model = joblib.load(args.rel_model_path)
+
+    # Load saywords
+    if args.say_lut:
+        saylut_path = args.say_lut
+    else:
+        saylut_path = 'saywords.txt'
+
+    with open(saylut_path) as f:
+        saylut = [s for s in f.read().split('\n') if s.strip() != '']
+
+    # Load places LUT
+    if args.place_lut:
+        placelut_path = args.place_lut
+    else:
+        placelut_path = 'places.txt'
+
+    with open(placelut_path) as f:
+        placelut = [s for s in f.read().split('\n') if s.strip() != '']
+
+    # Load spatial indicators LUT
+    if args.spatial_indicator_lut:
+        spatial_indicator_lut_path = args.spatial_indicator_lut
+    else:
+        spatial_indicator_lut_path = 'spatial_indicators.txt'
+
+    with open(spatial_indicator_lut_path) as f:
+        spatial_indicator_lut = [s for s in f.read().split('\n') if s.strip() != '']
+
+    # Load character LUT
+    if args.char_lut:
+        charlut_path = args.char_lut
+    else:
+        charlut_path = 'characters.txt'
+
+    with open(charlut_path) as f:
+        charlist = [s for s in f.read().split('\n') if s.strip() != '']  # One character per line
+
+    character_lut = {}  # Stores character attributes indexed by name
+    for l in charlist:
+        name, attributes = l.split(':')
+
+        gender = None
+        age = None
+
+        for a in attributes.split(','):
+            if 'male' in a:
+                gender = a
+            elif a.lower() in ['young', 'old']:
+                age = a
+
+        character_lut[name] = {}
+        if gender:
+            character_lut[name]['gender'] = gender
+        if age:
+            character_lut[name]['age'] = age
+
+    mDoc, quotes = annotate_entities(text, ner_model, character_lut, saylut,
+                                     spatial_indicator_lut, placelut)
+
+    annotation_text = doc2brat(mDoc)
+
+    to_save = {
+        output_text_path: mDoc.text,
+        output_quotes_path: json.dumps(quotes),
+        output_annotation_path: annotation_text
+    }
+
+    for path in to_save:
+        if not os.path.exists(path) or args.force:
+            with open(path, 'w') as f:
+                f.write(to_save[path])
+        else:
+            overwrite = input('Path {} exists, overwrite? (y/N) '.format(path))
+            # overwrite[:1] avoids an IndexError when the reply is empty
+            if overwrite[:1] in ['Y', 'y']:
+                with open(path, 'w') as f:
+                    f.write(to_save[path])
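For reference, a hypothetical invocation of the script above (file names are illustrative; the two .pkl files are the pickled NER and relational models the script expects):

    python3 text2annotation.py story.txt ner_model.pkl rel_model.pkl --char-lut characters.txt

Each line of characters.txt follows the `name: attributes' format parsed in the main block, with gender and age picked out of the attribute list, e.g. (made-up entries):

    Red Riding Hood: female, young
    the wolf: male, old

This writes story_processed.txt (the text with character lines swapped for <clineN>. tags), story_quotes.json (the tag-to-quote dictionary) and story_processed.ann (the brat annotations) next to the input file, prompting before overwriting unless --force is given.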