annotate demo/text2annotation.py~ @ 1:eb3b846ae0ef tip

second commit
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 16 May 2018 18:13:41 +0100
parents 4dad87badb0c
children
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 14:17:15 2018

@author: Emmanouil Theofanis Chourdakis

Takes a .txt story and annotates it based on:

    characters,
    places,
    saywords,
    character lines,
    spatial indicators.

@output:
    a .ann file with the same base name

"""

# Change the working directory to this script's directory so that the
# default LUT files (e.g. 'saywords.txt') resolve next to the script
import os
os.chdir(os.path.dirname(os.path.realpath(__file__)))

import argparse
from sklearn.externals import joblib  # on newer scikit-learn use: import joblib
import ner
import spacy
import re
import logging
import json

logging.basicConfig(level=logging.INFO)


def quotes2dict(text):
    """Replace each double-quoted span in `text` with a <clineN>. tag and
    return the rewritten text together with a {tag: quote} dictionary."""
    new_text = text
    is_open = False

    quote_no = 0
    quote = []
    quote_dict = {}

    for c in text:
        if c == '"' and not is_open:
            is_open = True
            continue
        elif c == '"' and is_open:
            is_open = False
            quote_dict["<cline{}>.".format(quote_no)] = ''.join(quote)
            new_text = new_text.replace('"' + ''.join(quote) + '"',
                                        "<cline{}>.".format(quote_no))
            quote = []
            quote_no += 1
            continue

        if is_open:
            quote.append(c)

    return new_text, quote_dict
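
# Example (hypothetical input), illustrating the rewrite quotes2dict performs:
#   >>> quotes2dict('He said "hello there" and left.')
#   ('He said <cline0>. and left.', {'<cline0>.': 'hello there'})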


def annotate_entities(text,
                      model,
                      character_lut,
                      saywords_lut,
                      spind_lut,
                      places_lut):
    """
    Annotates the entities in `text` using the NER model in `model`.

    returns: a ner.Document object with tokens labelled via the provided
    LUTs and via the NER model in `model`, together with the dictionary
    of extracted character lines.
    """

    # Find and store character lines in a dictionary
    logging.info('Swapping character lines for character line tags')
    processed_text, quotes = quotes2dict(text)

    # Create the nlp engine
    logging.info("Loading 'en' spacy model")
    nlp = spacy.load('en')

    # Create a spacy document object from the resulting text
    logging.info("Parsing document to spacy")
    doc = nlp(processed_text)

    # Parse to our custom Document object
    logging.info("Parsing document to our object format for Named Entity Recognition")
    mDoc = ner.Document(doc)

    # Label each <clineN>. tag as a character line
    logging.info("Labeling character lines")
    spans = [r.span() for r in re.finditer(r'<cline[0-9]+>\.', mDoc.text)]
    for span in spans:
        mDoc.assign_label_to_tokens(span[0], span[1], 'Character_Line')

    # Parse using LUTs

    # *- Characters

    # Sort by number of words so that entries with more words override
    # entries with fewer words during labelling. For example, if you have
    # `man' and `an old man' as characters, the character labelled is
    # going to be `an old man' and not the included `man'.
    logging.info("Labeling characters from LUT")
    cLUT = [c.lower() for c in sorted(character_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a character in cLUT.
    # re.escape guards against regex metacharacters in names; IGNORECASE is
    # needed because the LUT entries were lowercased above.
    for c in cLUT:
        spans = [r.span() for r in re.finditer(re.escape(c), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Character')

    # *- Saywords

    # Assign labels to saywords. Here each sayword contains only one token.
    # In addition, we check against the sayword's lemma and not its surface form.
    logging.info("Labeling saywords from LUT")
    swLUT = [nlp(sw)[0].lemma_ for sw in saywords_lut]
    for sw in swLUT:
        mDoc.assign_label_to_tokens_by_matching_lemma(sw, 'Says')
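
    # For instance (hypothetical LUT entry): 'whispered' lemmatizes to
    # 'whisper', so 'whispers', 'whispering' and 'whispered' in the text
    # would all be labelled 'Says'.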

    # *- Places
    logging.info("Labeling places from LUT")
    plLUT = [pl.lower() for pl in sorted(places_lut, key=lambda x: len(x.split()))]

    # Find literals in the document that match a place in plLUT
    for pl in plLUT:
        spans = [r.span() for r in re.finditer(re.escape(pl), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Place')

    # *- Spatial indicators
    logging.info("Labeling spatial indicators from LUT")
    spLUT = [sp.lower() for sp in sorted(spind_lut, key=lambda x: len(x.split()))]
    for sp in spLUT:
        spans = [r.span() for r in re.finditer(re.escape(sp), mDoc.text, re.IGNORECASE)]
        for span in spans:
            mDoc.assign_label_to_tokens(span[0], span[1], 'Spatial_Signal')

    logging.info("Extracting token features")
    features, labels = mDoc.get_token_features_labels()

    logging.info("Predicting labels")
    new_labels = model.predict(features)

    logging.info("Assigning labels based on the NER model")
    # If a label was not already assigned from a LUT, assign it using the
    # model's prediction; LUT labels take precedence.
    # logging.info("{} {}".format(len(mDoc.tokens), len(new_labels)))
    for m, sent in enumerate(mDoc.token_sentences):
        for n, token in enumerate(sent):
            if token.label == 'O':
                token.label = new_labels[m][n]

    return mDoc, quotes


def doc2brat(mDoc):
    """Returns a brat .ann file string based on mDoc."""

    # Variable name generator for entities (T variables in brat format)
    tvar = ner.var_generator('T')

    ann_str = ""
    # Extract entities in the brat standoff format:
    # T1<TAB>Label START END<TAB>entity text

    labels = ['Character', 'Says', 'Place', 'Spatial_Signal', 'Character_Line']

    for label in labels:
        token_sentences = mDoc.get_tokens_with_label(label)
        for tlist in token_sentences:
            if len(tlist) == 0:
                continue

            for tokens in tlist:
                start = tokens[0].start
                end = tokens[-1].end
                txt = mDoc.text[start:end]
                ann_str += "{}\t{} {} {}\t{}\n".format(next(tvar), label, start, end, txt)

    return ann_str
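
# A produced .ann line might look like (hypothetical offsets):
#   T1 <TAB> Character 42 54 <TAB> an old man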

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_path', help='.txt file to parse')
    argparser.add_argument('ner_model_path', help='.pkl file containing NER model')
    argparser.add_argument('rel_model_path', help='.pkl file containing relational model')
    argparser.add_argument('--say-lut', help='.txt file with list of saywords')
    argparser.add_argument('--char-lut', help='.txt file with known characters')
    argparser.add_argument('--place-lut', help='.txt file with known places')
    argparser.add_argument('--spatial-indicator-lut', help='.txt file with known spatial indicators')
    argparser.add_argument('--force', action='store_true',
                           help='force overwrite when there is a file to be overwritten')

    args = argparser.parse_args()
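
    # Example invocation (hypothetical file names):
    #   python3 text2annotation.py story.txt ner_model.pkl rel_model.pkl \
    #       --char-lut characters.txt --force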

    # Load the text file
    with open(args.input_path) as f:
        text = f.read()

    output_dir = os.path.dirname(args.input_path)
    base_path = os.path.splitext(args.input_path)[0]
    output_text_path = base_path + '_processed.txt'
    output_quotes_path = base_path + '_quotes.json'
    output_annotation_path = base_path + '_processed.ann'

    # Load the NER model file
    ner_model = joblib.load(args.ner_model_path)

    # Load the REL model file (loaded here, but not used further in this script)
    rel_model = joblib.load(args.rel_model_path)

    # Load the saywords LUT
    if args.say_lut:
        saylut_path = args.say_lut
    else:
        saylut_path = 'saywords.txt'

    with open(saylut_path) as f:
        saylut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the places LUT
    if args.place_lut:
        placelut_path = args.place_lut
    else:
        placelut_path = 'places.txt'

    with open(placelut_path) as f:
        placelut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the spatial indicators LUT
    if args.spatial_indicator_lut:
        spatial_indicator_lut_path = args.spatial_indicator_lut
    else:
        spatial_indicator_lut_path = 'spatial_indicators.txt'

    with open(spatial_indicator_lut_path) as f:
        spatial_indicator_lut = [s for s in f.read().split('\n') if s.strip() != '']

    # Load the characters LUT
    if args.char_lut:
        charlut_path = args.char_lut
    else:
        charlut_path = 'characters.txt'

    with open(charlut_path) as f:
        charlist = [s for s in f.read().split('\n') if s.strip() != '']  # One character per line
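
    # Each line of the characters LUT is expected to look like (hypothetical
    # entry):
    #   Little Red Riding Hood: female, young
    # i.e. a name, a colon, then a comma-separated list of attributes.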

    character_lut = {}  # Stores character attributes indexed by name
    for l in charlist:
        name, attributes = l.split(':')

        gender = None
        age = None

        for a in attributes.split(','):
            if 'male' in a:  # 'male' is a substring of 'female', so this catches both
                gender = a
            elif a.lower() in ['young', 'old']:
                age = a

        character_lut[name] = {}
        if gender:
            character_lut[name]['gender'] = gender
        if age:
            character_lut[name]['age'] = age

    mDoc, quotes = annotate_entities(text, ner_model, character_lut, saylut,
                                     spatial_indicator_lut, placelut)

    annotation_text = doc2brat(mDoc)

    to_save = {
        output_text_path: mDoc.text,
        output_quotes_path: json.dumps(quotes),
        output_annotation_path: annotation_text,
    }

    for path in to_save:
        if not os.path.exists(path) or args.force:
            with open(path, 'w') as f:
                f.write(to_save[path])
        else:
            overwrite = input('Path {} exists, overwrite? (y/N) '.format(path))
            # Slicing avoids an IndexError on an empty answer (defaults to "no")
            if overwrite[:1] in ['Y', 'y']:
                with open(path, 'w') as f:
                    f.write(to_save[path])