chourdakisreiss2018smc: demo/annotation2script.py annotate

annotate demo/annotation2script.py @ 13:16066f0a7127 tip

fixed the problem with brat

author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Sat, 08 Dec 2018 11:02:40 +0000
parents	90155bdd5dd6
children

rev	line source
e@0	1 #!/usr/bin/env python3
e@0	2 # -*- coding: utf-8 -*-
e@0	3 """
e@0	4 Created on Tue May 1 17:00:26 2018
e@0	5
e@0	6 @author: Emmanouil Theofanis Chourdakis
e@0	7
e@0	8 Takes an .ann annotation and a .json character line
e@0	9 file and creates a _script.txt script file.
e@0	10
e@0	11 """
e@0	12
e@0	13 import argparse
e@0	14 import logging
e@0	15 import ner
e@0	16 from rel import *
e@0	17 import pypeg2 as pg
e@0	18 import pandas as pd
e@0	19 import json
e@0	20 import os
e@0	21
e@0	22 logging.basicConfig(level=logging.INFO)
e@0	23
e@0	24
def _quote_line_number(key):
    """Return the integer line number embedded in a quotes-dictionary key.

    Keys are expected to look like ``<cline12>.`` or ``<nline3>.``: the
    number is the run of digits immediately before the closing ``>``
    (an optional trailing dot is ignored).

    Raises:
        ValueError: if no digits precede the closing ``>``.
    """
    stem = key.rstrip('.').rstrip('>')
    digits = stem[len(stem.rstrip('0123456789')):]
    return int(digits)


def annotation2script(annot, quotesdict):
    """Build a script, a transcript table and an sfx table from an annotation.

    Args:
        annot: text of an ``.ann`` annotation file; it is parsed with
            ``pg.parse(annot, ner.AnnotationFile)``.
        quotesdict: mapping from line keys (e.g. ``<cline0>.`` for a
            character line, ``<nline0>.`` for a narrator line) to the text
            of that line.

    Returns:
        A tuple ``(script, transcript_df, sfx_df)`` where ``script`` is the
        script text (cast list, scenes, lines), ``transcript_df`` is a
        DataFrame with one record per entry of ``quotesdict`` (``cast``,
        ``filename``, ``line``) and ``sfx_df`` is a DataFrame with one
        record per scene (``sfx``, ``url``).

    Raises:
        KeyError: if an attribute or relation refers to an entity that was
            not declared, or a character line has no speaker assigned.
        ValueError: if a key of ``quotesdict`` carries no line number.
    """
    logging.info('Parsing annotation')
    parsed = pg.parse(annot, ner.AnnotationFile)

    characters = {}
    places = {}  # collected for completeness; not used in the output yet
    character_lines = {}
    scenes = []

    # Maps an annotation variable to its (normalised) text, since
    # attributes and relations point to entities through that variable.
    dictionary = {}

    # The parsed lines are visited in separate passes: entities first, then
    # attributes, then relations.  A relation may refer to an entity that
    # is only defined later in the file.

    # Pass 1: entities.  Every annotation line holds a single object.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AnnotationTuple):
            continue

        annotation = obj.annotation.lower()

        # Drop a leading article and keep the word that follows it.  The
        # length check avoids an IndexError on an empty annotation or on an
        # annotation consisting of the article alone.
        words = annotation.split()
        if len(words) > 1 and words[0] in ('a', 'the'):
            annotation = words[1]

        dictionary[obj.variable] = annotation

        if obj.type == 'Character':
            characters[annotation] = {}
        elif obj.type == 'Character_Line':
            character_lines[annotation] = {}
        elif obj.type == 'Place':
            places[annotation] = {}

    # Pass 2: attributes (gender / age) attached to characters.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AttributeTuple):
            continue

        target = dictionary[obj.target]
        value = obj.annotation

        if obj.type == 'Gender':
            characters[target]['gender'] = value
        elif obj.type == 'Age':
            characters[target]['age'] = value

    # Pass 3: relations.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.RelationTuple):
            continue

        # Relations have a trigger, a first argument `arg1' and a second
        # argument `arg2'.  |arg1| * |arg2| relations are constructed for
        # each trigger, where |arg1| is the number of candidates for
        # argument 1 and |arg2| the number of candidates for argument 2.
        arg1_candidates = []
        arg2_candidates = []

        # Reset for every relation so that a relation without a known
        # trigger neither raises a NameError nor inherits the label of the
        # previous relation.
        label = None

        for arg in obj.args:
            if arg.label == 'Says':
                label = 'Quote'
            elif arg.label == 'Spatial_Signal':
                label = 'Spatial_Relation'

            if arg.label in ('Trajector', 'WHO'):
                arg1_candidates.append(dictionary[arg.target])
            if arg.label in ('Landmark', 'WHAT'):
                arg2_candidates.append(dictionary[arg.target])

        for arg1 in arg1_candidates:
            for arg2 in arg2_candidates:
                if label == 'Quote':
                    character_lines[arg2]['who'] = arg1
                elif label == 'Spatial_Relation':
                    scenes.append(arg2)

    # Generate cast list, alternating ("ping-pong") the panning of the
    # characters between right and left.
    cast_list_section = 'Cast List:\nNarrator - male or female - panned center\n'
    panned = 'right'
    for name, attributes in characters.items():
        if 'gender' in attributes:
            gender = attributes['gender'].lower()
        else:
            gender = 'male or female'

        cast_list_section += '{} - {} - panned {}\n'.format(
            name.capitalize(), gender, panned)
        panned = 'left' if panned == 'right' else 'right'

    # One scene per line; joining with newlines keeps the single-scene
    # output unchanged while no longer gluing several scenes together.
    scenes_definition = 'Scenes:\n' + '\n'.join(
        "{} - {} - fxive:{} - none".format(n + 1, scene, scene)
        for n, scene in enumerate(scenes))

    # Scene introduction
    # TODO: Do it so that scenes follow the text

    # Keep the lines in order of the full number preceding the closing `>'
    # (not just its last digit, which breaks from line 10 onwards).
    lines_order = sorted(quotesdict, key=_quote_line_number)

    lines_section = 'Script:\n--- Scene 1 ---\n'
    for key in lines_order:
        # The second character of the key tells narrator ('n') and
        # character ('c') lines apart.
        if key[1] == 'n':
            lines_section += "[Narrator] {}\n".format(quotesdict[key])
        elif key[1] == 'c':
            # Character lines are registered without the trailing dot.
            speaker = character_lines[key[:-1]]['who']
            lines_section += "[{}] {}\n".format(
                speaker.capitalize(), quotesdict[key])

    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section

    # Create transcript: one record per quote, in the order of quotesdict.
    lines = []
    for key, text in quotesdict.items():
        cline = key[:-1]  # Remove the trailing dot

        if cline[1] == 'c':
            cast = character_lines[cline]['who'].capitalize()
        else:
            cast = 'Narrator'

        lineno = cline.replace('<', '').replace('>', '')
        lines.append({
            'cast': cast,
            'filename': '{}.wav'.format(lineno),
            'line': text,
        })

    # Create sfx dataframe; the url is a placeholder to be edited by hand.
    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]

    sfx_df = pd.DataFrame.from_records(sfx)
    transcript_df = pd.DataFrame.from_records(lines)
    return script, transcript_df, sfx_df
e@0	198
e@0	199
if __name__ == "__main__":
    # Command line entry point: read an .ann annotation and a .json quotes
    # file, then write <annotation>_script.txt next to the annotation,
    # together with transcript.xls and sfx.xls in the same directory.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_annotation_path',
                           help='.ann file with annotation')

    argparser.add_argument('input_json_path',
                           help='.json file containing the character quotes')

    args = argparser.parse_args()

    # Load annotation and quotes dictionary
    with open(args.input_annotation_path) as f:
        annot = f.read()

    with open(args.input_json_path) as f:
        quotesdict = json.load(f)

    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)

    # Strip whatever extension the annotation has instead of assuming it is
    # exactly four characters long (".ann").
    base_path, _ = os.path.splitext(args.input_annotation_path)
    output_path = base_path + '_script.txt'

    with open(output_path, 'w') as f:
        f.write(script)

    output_dir = os.path.dirname(args.input_annotation_path)
    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))
e@0	226

Mercurial > hg > chourdakisreiss2018smc

annotate demo/annotation2script.py @ 13:16066f0a7127 tip