annotate demo/text2annotation.py~ @ 1:eb3b846ae0ef tip

second commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 18:13:41 +0100
parents 4dad87badb0c
children
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character lines,
    spatial indicators.

@output:
    a .ann file with the same base name

"""

# Change the working directory to this script's directory so that the
# default LUT files (e.g. 'saywords.txt') resolve next to the script
import os
os.chdir(os.path.dirname(os.path.realpath(__file__)))

import argparse
from sklearn.externals import joblib  # on newer scikit-learn use: import joblib
import ner
import spacy
import re
import logging
import json

logging.basicConfig(level=logging.INFO)


def quotes2dict(text):
    """Replace each double-quoted span in `text` with a <clineN>. tag and
    return the rewritten text together with a {tag: quote} dictionary."""
    new_text = text
    is_open = False

    quote_no = 0
    quote = []
    quote_dict = {}

    for c in text:
        if c == '"' and not is_open:
            is_open = True
            continue
        elif c == '"' and is_open:
            is_open = False
            quote_dict["<cline{}>.".format(quote_no)] = ''.join(quote)
            new_text = new_text.replace('"' + ''.join(quote) + '"',
                                        "<cline{}>.".format(quote_no))
            quote = []
            quote_no += 1
            continue

        if is_open:
            quote.append(c)

    return new_text, quote_dict
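
# Example (hypothetical input), illustrating the rewrite quotes2dict performs:
#   >>> quotes2dict('He said "hello there" and left.')
#   ('He said <cline0>. and left.', {'<cline0>.': 'hello there'})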


def annotate_entities(text,
                      model,
                      character_lut,
                      saywords_lut,
                      spind_lut,
                      places_lut):
    """
    Annotates the entities in `text` using the NER model in `model`.

    returns: a ner.Document object with tokens labelled via the provided
    LUTs and via the NER model in `model`, together with the dictionary
    of extracted character lines.
    """

    # Find and store character lines in a dictionary
    logging.info('Swapping character lines for character line tags')
    processed_text, quotes = quotes2dict(text)

    # Create the nlp engine
    logging.info("Loading 'en' spacy model")
    nlp = spacy.load('en')

    # Create a spacy document object from the resulting text
    logging.info("Parsing document to spacy")
    doc = nlp(processed_text)

    # Parse to our custom Document object
    logging.info("Parsing document to our object format for Named Entity Recognition")
    mDoc = ner.Document(doc)

    # Label each <clineN>. tag as a character line
    logging.info("Labeling character lines")
    spans = [r.span() for r in re.finditer(r'<cline[0-9]+>\.', mDoc.text)]
    for span in spans:
        mDoc.assign_label_to_tokens(span[0], span[1], 'Character_Line')

    # Parse using LUTs

    # *- Characters

    # Sort by number of words so that entries with more words override
    # entries with fewer words during labelling. For example, if you have
    # `man' and `an old man' as characters, the character labelled is
    # going to be `an old man' and not the included `man'.
    logging.info("Labeling characters from LUT")
    cLUT = [c.lower() for c in sorted(character_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a character in cLUT.
    # re.escape guards against regex metacharacters in names; IGNORECASE is
    # needed because the LUT entries were lowercased above.
    for c in cLUT:
        spans = [r.span() for r in re.finditer(re.escape(c), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Character')

    # *- Saywords

    # Assign labels to saywords. Here each sayword contains only one token.
    # In addition, we check against the sayword's lemma and not its surface form.
    logging.info("Labeling saywords from LUT")
    swLUT = [nlp(sw)[0].lemma_ for sw in saywords_lut]
    for sw in swLUT:
        mDoc.assign_label_to_tokens_by_matching_lemma(sw, 'Says')
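
    # For instance (hypothetical LUT entry): 'whispered' lemmatizes to
    # 'whisper', so 'whispers', 'whispering' and 'whispered' in the text
    # would all be labelled 'Says'.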

    # *- Places
    logging.info("Labeling places from LUT")
    plLUT = [pl.lower() for pl in sorted(places_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a place in plLUT
    for pl in plLUT:
        spans = [r.span() for r in re.finditer(re.escape(pl), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Place')

    # *- Spatial indicators
    logging.info("Labeling spatial indicators from LUT")
    spLUT = [sp.lower() for sp in sorted(spind_lut, key=lambda x: len(x.split()))]
    for sp in spLUT:
        spans = [r.span() for r in re.finditer(re.escape(sp), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Spatial_Signal')

    logging.info("Extracting token features")
    features, labels = mDoc.get_token_features_labels()

    logging.info("Predicting labels")
    new_labels = model.predict(features)

    logging.info("Assigning labels based on the NER model")
    # If a label was not already assigned from a LUT, assign it using the
    # model's prediction; LUT labels take precedence.
    # logging.info("{} {}".format(len(mDoc.tokens), len(new_labels)))
    for m, sent in enumerate(mDoc.token_sentences):
        for n, token in enumerate(sent):
            if token.label == 'O':
                token.label = new_labels[m][n]

    return mDoc, quotes


def doc2brat(mDoc):
    """Returns a brat .ann file string based on mDoc."""

    # Variable name generator for entities (T variables in brat format)
    tvar = ner.var_generator('T')

    ann_str = ""
    # Extract entities in the brat standoff format:
    # T1<TAB>Label START END<TAB>entity text

    labels = ['Character', 'Says', 'Place', 'Spatial_Signal', 'Character_Line']

    for label in labels:
        token_sentences = mDoc.get_tokens_with_label(label)
        for tlist in token_sentences:
            if len(tlist) == 0:
                continue

            for tokens in tlist:
                start = tokens[0].start
                end = tokens[-1].end
                txt = mDoc.text[start:end]
                ann_str += "{}\t{} {} {}\t{}\n".format(next(tvar), label, start, end, txt)

    return ann_str
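
# A produced .ann line might look like (hypothetical offsets):
#   T1 <TAB> Character 42 54 <TAB> an old man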

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('ner_model_path', help='.pkl file containing NER model')
    argparser.add_argument('rel_model_path', help='.pkl file containing relational model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')
    argparser.add_argument('--spatial-indicator-lut', help='.txt file with known spatial indicators')
    argparser.add_argument('--force', action='store_true',
                           help='force overwrite when there is a file to be overwritten')

    args = argparser.parse_args()
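
    # Example invocation (hypothetical file names):
    #   python3 text2annotation.py story.txt ner_model.pkl rel_model.pkl \
    #       --char-lut characters.txt --force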

    # Load the text file
    with open(args.input_path) as f:
        text = f.read()

    output_dir = os.path.dirname(args.input_path)
    base_path = os.path.splitext(args.input_path)[0]
    output_text_path = base_path + '_processed.txt'
    output_quotes_path = base_path + '_quotes.json'
    output_annotation_path = base_path + '_processed.ann'

    # Load the NER model file
    ner_model = joblib.load(args.ner_model_path)

    # Load the REL model file (loaded here, but not used further in this script)
    rel_model = joblib.load(args.rel_model_path)

    # Load the saywords LUT
    if args.say_lut:
        saylut_path = args.say_lut
    else:
        saylut_path = 'saywords.txt'

    with open(saylut_path) as f:
        saylut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the places LUT
    if args.place_lut:
        placelut_path = args.place_lut
    else:
        placelut_path = 'places.txt'

    with open(placelut_path) as f:
        placelut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the spatial indicators LUT
    if args.spatial_indicator_lut:
        spatial_indicator_lut_path = args.spatial_indicator_lut
    else:
        spatial_indicator_lut_path = 'spatial_indicators.txt'

    with open(spatial_indicator_lut_path) as f:
        spatial_indicator_lut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the characters LUT
    if args.char_lut:
        charlut_path = args.char_lut
    else:
        charlut_path = 'characters.txt'

    with open(charlut_path) as f:
        charlist = [s for s in f.read().split('\n') if s.strip() != '']  # One character per line
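
    # Each line of the characters LUT is expected to look like (hypothetical
    # entry):
    #   Little Red Riding Hood: female, young
    # i.e. a name, a colon, then a comma-separated list of attributes.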

    character_lut = {}  # Stores character attributes indexed by name
    for l in charlist:
        name, attributes = l.split(':')

        gender = None
        age = None

        for a in attributes.split(','):
            if 'male' in a:  # 'male' is a substring of 'female', so this catches both
                gender = a
            elif a.lower() in ['young', 'old']:
                age = a

        character_lut[name] = {}
        if gender:
            character_lut[name]['gender'] = gender
        if age:
            character_lut[name]['age'] = age

    mDoc, quotes = annotate_entities(text, ner_model, character_lut, saylut,
                                     spatial_indicator_lut, placelut)

    annotation_text = doc2brat(mDoc)

    to_save = {
        output_text_path: mDoc.text,
        output_quotes_path: json.dumps(quotes),
        output_annotation_path: annotation_text,
    }

    for path in to_save:
        if not os.path.exists(path) or args.force:
            with open(path, 'w') as f:
                f.write(to_save[path])
        else:
            overwrite = input('Path {} exists, overwrite? (y/N) '.format(path))
            # Slicing avoids an IndexError on an empty answer (defaults to "no")
            if overwrite[:1] in ['Y', 'y']:
                with open(path, 'w') as f:
                    f.write(to_save[path])