Mercurial > hg > chourdakisreiss2018smc
comparison demo/annotation2script.py @ 0:90155bdd5dd6
first commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 18:27:05 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:90155bdd5dd6 |
---|---|
1 #!/usr/bin/env python3 | |
2 # -*- coding: utf-8 -*- | |
3 """ | |
4 Created on Tue May 1 17:00:26 2018 | |
5 | |
6 @author: Emmanouil Theofanis Chourdakis | |
7 | |
8 Takes an .ann annotation and a .json character line | |
9 file and creates a _script.txt script file. | |
10 | |
11 """ | |
12 | |
13 import argparse | |
14 import logging | |
15 import ner | |
16 from rel import * | |
17 import pypeg2 as pg | |
18 import pandas as pd | |
19 import json | |
20 import os | |
21 | |
22 logging.basicConfig(level=logging.INFO) | |
23 | |
24 | |
def _strip_leading_article(text):
    """Drop a leading 'a'/'the' from an already lower-cased annotation.

    Only the word that directly follows the article is kept, which is
    what the original rule did.
    NOTE(review): for multi-word names ("the ugly duckling") this keeps
    just "ugly" -- confirm that this truncation is intended.
    """
    tokens = text.split()
    # Require a second token so a bare "a"/"the" (or an empty annotation)
    # is returned unchanged instead of raising IndexError.
    if len(tokens) > 1 and tokens[0] in ('a', 'the'):
        return tokens[1]
    return text


def _line_number(key):
    """Return the integer found just before the closing '>' of a quote key.

    Keys look like ``<cline12>.``; the whole run of digits is used, not
    only the last one, so line 10 sorts after line 9.
    """
    stem = key.rstrip('.').rstrip('>')
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('No line number found in quote key {!r}'.format(key))
    return int(digits)


def annotation2script(annot, quotesdict):
    """Build a script, a transcript table and an sfx table from an annotation.

    Parameters
    ----------
    annot : str
        Contents of the ``.ann`` annotation file; parsed with
        ``pg.parse(annot, ner.AnnotationFile)``.
    quotesdict : dict
        Maps line keys of the form ``<nlineN>.`` / ``<clineN>.`` (second
        character ``n`` for narrator lines, ``c`` for character lines,
        trailing dot included) to the text of that line.

    Returns
    -------
    tuple
        ``(script, transcript_df, sfx_df)`` where ``script`` is the text of
        the script (cast list, scenes and lines), ``transcript_df`` is a
        DataFrame with one record per entry of ``quotesdict`` (``cast``,
        ``filename``, ``line``) and ``sfx_df`` is a DataFrame with one
        record per scene (``sfx``, ``url``).

    Raises
    ------
    KeyError
        If an attribute or relation argument points to a variable that no
        entity defined, if a Gender/Age attribute targets something that is
        not a character, or if a character line has no speaker assigned.
    ValueError
        If a key of ``quotesdict`` carries no line number.
    """
    logging.info('Parsing annotation')
    parsed = pg.parse(annot, ner.AnnotationFile)

    characters = {}
    character_lines = {}
    scenes = []

    # Maps annotation variables to their (normalised) text, since
    # attributes and relations refer to entities through those variables.
    dictionary = {}

    # The parsed lines are visited in several passes: entities first, then
    # attributes and relations. The reason is that sometimes a relation
    # refers to an entity that is only defined further down the file.

    # Pass 1: entities.
    for line in parsed:
        # Every annotation line has a single object
        obj = line[0]
        if not isinstance(obj, ner.AnnotationTuple):
            continue

        annotation = _strip_leading_article(obj.annotation.lower())
        dictionary[obj.variable] = annotation

        if obj.type == 'Character':
            characters[annotation] = {}
        elif obj.type == 'Character_Line':
            character_lines[annotation] = {}

    # Pass 2: attributes (gender / age) attached to characters.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AttributeTuple):
            continue

        target = dictionary[obj.target]
        if obj.type == 'Gender':
            characters[target]['gender'] = obj.annotation
        elif obj.type == 'Age':
            characters[target]['age'] = obj.annotation

    # Pass 3: relations.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.RelationTuple):
            continue

        # Relations have a trigger, a first argument `arg1' and a second
        # argument `arg2'. |arg1| * |arg2| pairs are visited per trigger,
        # where |argN| is the number of candidates for that argument.
        #
        # The label is reset for every relation so that a relation without
        # a recognised trigger cannot inherit the previous relation's label.
        label = None
        arg1_candidates = []
        arg2_candidates = []

        for arg in obj.args:
            if arg.label == 'Says':
                label = 'Quote'
            elif arg.label == 'Spatial_Signal':
                label = 'Spatial_Relation'

            if arg.label in ('Trajector', 'WHO'):
                arg1_candidates.append(dictionary[arg.target])
            if arg.label in ('Landmark', 'WHAT'):
                arg2_candidates.append(dictionary[arg.target])

        for arg1 in arg1_candidates:
            for arg2 in arg2_candidates:
                if label == 'Quote':
                    # arg2 is the quoted line, arg1 the one who says it.
                    character_lines[arg2]['who'] = arg1
                elif label == 'Spatial_Relation':
                    # arg2 is the landmark, which becomes a scene.
                    scenes.append(arg2)

    # Generate cast list. The narrator is always centred; the characters
    # are ping-ponged between right and left, starting on the right.
    cast_list_section = 'Cast List:\nNarrator - male or female - panned center\n'
    for position, name in enumerate(characters):
        if 'gender' in characters[name]:
            gender = characters[name]['gender'].lower()
        else:
            gender = 'male or female'
        panned = 'right' if position % 2 == 0 else 'left'
        cast_list_section += '{} - {} - panned {}\n'.format(
            name.capitalize(), gender, panned)

    # One scene per line; joining keeps several scenes from running
    # together while leaving the zero/one scene output unchanged.
    scenes_definition = 'Scenes:\n' + '\n'.join(
        '{} - {} - fxive:{} - none'.format(number, scene, scene)
        for number, scene in enumerate(scenes, start=1))

    # Scene introduction
    ## TODO: Do it so that scenes follow the text

    # The lines are of the format <*line0> <*line1> etc, sort them based
    # on the number just before the closing >
    lines_order = sorted(quotesdict, key=_line_number)

    lines_section = 'Script:\n--- Scene 1 ---\n'
    for key in lines_order:
        if key[1] == 'n':
            lines_section += '[Narrator] {}\n'.format(quotesdict[key])
        elif key[1] == 'c':
            # key[:-1] removes the trailing dot to match the annotation text.
            speaker = character_lines[key[:-1]]['who'].capitalize()
            lines_section += '[{}] {}\n'.format(speaker, quotesdict[key])

    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section

    # Create transcript: one record per quote, in the order of quotesdict.
    lines = []
    for key in quotesdict:
        cline = key[:-1]  # Remove the trailing dot

        if cline[1] == 'c':
            cast = character_lines[cline]['who'].capitalize()
        else:
            cast = 'Narrator'

        lineno = cline.replace('<', '').replace('>', '')
        lines.append({
            'cast': cast,
            'filename': '{}.wav'.format(lineno),
            'line': quotesdict['{}.'.format(cline)],
        })

    # Create sfx dataframe: the url is a placeholder to be edited by hand.
    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]

    sfx_df = pd.DataFrame.from_records(sfx)
    transcript_df = pd.DataFrame.from_records(lines)
    return script, transcript_df, sfx_df
198 | |
199 | |
if __name__ == "__main__":
    # Command line entry point: read the annotation and the quotes file,
    # then write the script next to the annotation together with the
    # transcript and sfx spreadsheets.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_annotation_path',
                           help='.ann file with annotation')

    argparser.add_argument('input_json_path',
                           help='.json file containing the character quotes')

    args = argparser.parse_args()

    # Load annotation and quotes dictionary
    with open(args.input_annotation_path) as f:
        annot = f.read()

    with open(args.input_json_path) as f:
        quotesdict = json.load(f)

    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)

    # Strip the real extension instead of assuming it is four characters
    # long, so paths that do not end in '.ann' are handled as well.
    output_path = os.path.splitext(args.input_annotation_path)[0] + '_script.txt'

    with open(output_path, 'w') as f:
        f.write(script)

    # The spreadsheets go in the same directory as the annotation file.
    output_dir = os.path.dirname(args.input_annotation_path)
    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))
226 |