Mercurial > hg > chourdakisreiss2018smc
view demo/annotation2script.py @ 0:90155bdd5dd6
first commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 18:27:05 +0100 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 17:00:26 2018

@author: Emmanouil Theofanis Chourdakis

Takes an .ann annotation and a .json character line file and
creates a _script.txt script file.
"""

import argparse
import logging
import ner
from rel import *
import pypeg2 as pg
import pandas as pd
import json
import os
import re

logging.basicConfig(level=logging.INFO)

# Leading articles that are dropped from an entity's text.
_ARTICLES = ('a', 'the')

# Captures the (possibly multi-digit) number just before the closing '>'
# of a line key.
_LINE_NUMBER_RE = re.compile(r'(\d+)>')


def _strip_leading_article(text):
    """Return the word following a leading 'a'/'the', else `text` unchanged.

    Only the single word right after the article is kept, which is what
    the script has always done.
    NOTE(review): multi-word names ("the old man" -> "old") lose their
    tail; confirm whether the full remainder should be kept instead.
    """
    words = text.split()
    # The length guard avoids an IndexError on empty text or a bare article.
    if len(words) > 1 and words[0] in _ARTICLES:
        return words[1]
    return text


def _line_number(key):
    """Return the integer found just before the closing '>' of `key`.

    Raises:
        ValueError: if `key` contains no such number.
    """
    match = _LINE_NUMBER_RE.search(key)
    if match is None:
        raise ValueError('No line number found in key {!r}'.format(key))
    return int(match.group(1))


def annotation2script(annot, quotesdict):
    """Build a script and its companion tables from an annotation.

    Args:
        annot: Text of the annotation file; it is parsed with
            `pg.parse(annot, ner.AnnotationFile)`.
        quotesdict: Mapping from line keys to the text of each line.
            A key ends with a dot, its second character is 'n' for a
            narrator line or 'c' for a character line, and a number
            precedes its closing '>'.

    Returns:
        A tuple `(script, transcript_df, sfx_df)`: the script text, a
        DataFrame built from records with the keys 'cast', 'filename'
        and 'line', and a DataFrame built from records with the keys
        'sfx' and 'url'.

    Raises:
        KeyError: if an attribute or relation refers to an entity that
            was not declared, or a character line has no speaker.
        ValueError: if a key of `quotesdict` carries no line number.
    """
    logging.info('Parsing annotation')
    parsed = pg.parse(annot, ner.AnnotationFile)

    characters = {}
    places = {}  # Collected, but not used in any of the outputs.
    character_lines = {}
    scenes = []

    # Store an entity and relations dictionary since relations
    # point to such entities
    dictionary = {}

    # Visit all the parsed lines in separate passes: entities first, then
    # attributes, then relations. The reason is that a relation sometimes
    # refers to an entity that is defined later in the file.
    for line in parsed:
        # Every annotation line has a single object
        obj = line[0]
        if isinstance(obj, ner.AnnotationTuple):
            annotation = _strip_leading_article(obj.annotation.lower())

            # Store to dictionary the string relating to the annotation
            dictionary[obj.variable] = annotation

            if obj.type == 'Character':
                characters[annotation] = {}
            elif obj.type == 'Character_Line':
                character_lines[annotation] = {}
            elif obj.type == 'Place':
                places[annotation] = {}

    for line in parsed:
        obj = line[0]
        if isinstance(obj, ner.AttributeTuple):
            # Find the character the attribute refers to and record the
            # gender or age on it.
            target = dictionary[obj.target]
            value = obj.annotation

            if obj.type == 'Gender':
                characters[target]['gender'] = value
            elif obj.type == 'Age':
                characters[target]['age'] = value

    for line in parsed:
        # Every annotation line has a single object
        obj = line[0]
        if isinstance(obj, ner.RelationTuple):
            # Relations have a trigger, a first argument `arg1' and a
            # second argument `arg2'. There are going to be
            # |arg1| * |arg2| relations constructed for each trigger
            # where |arg1| is the number of candidates for argument 1
            # and |arg2| the number of candidates for argument 2
            arg1_candidates = []
            arg2_candidates = []

            # Reset for every relation so that one without a trigger
            # neither fails nor inherits the previous relation's label.
            label = None

            # Check relation's arguments:
            for arg in obj.args:
                if arg.label == 'Says':
                    label = 'Quote'
                elif arg.label == 'Spatial_Signal':
                    label = 'Spatial_Relation'

                if arg.label in ['Trajector', 'WHO']:
                    arg1_candidates.append(dictionary[arg.target])
                if arg.label in ['Landmark', 'WHAT']:
                    arg2_candidates.append(dictionary[arg.target])

            for arg1 in arg1_candidates:
                for arg2 in arg2_candidates:
                    if label == 'Quote':
                        character_lines[arg2]['who'] = arg1
                    if label == 'Spatial_Relation':
                        scenes.append(arg2)

    # Generate cast list
    cast_list_section = r"""Cast List:
Narrator - male or female - panned center
"""

    # Ping - pong the characters
    panned = 'right'
    for c in characters:
        if 'gender' not in characters[c]:
            gender = 'male or female'
        else:
            gender = characters[c]['gender'].lower()

        cast_list_section += '{} - {} - panned {}\n'.format(
            c.capitalize(), gender, panned)

        panned = 'left' if panned == 'right' else 'right'

    scenes_definition = r"""Scenes:
"""
    for n, scene in enumerate(scenes):
        # One scene per line; without the newline they all run together.
        scenes_definition += "{} - {} - fxive:{} - none\n".format(
            n + 1, scene, scene)

    # Scene introduction
    ## TODO: Do it so that scenes follow the text

    # Keep the correct order in lines. The keys are of the format
    # <*line0> <*line1> etc; sort them on the whole number before the
    # closing > so that line 10 comes after line 9.
    lines_order = sorted(quotesdict, key=_line_number)

    lines_section = r"""Script:
--- Scene 1 ---
"""
    for l in lines_order:
        if l[1] == 'n':
            lines_section += "[Narrator] {}\n".format(quotesdict[l])
        elif l[1] == 'c':
            # Character lines are registered without the trailing dot.
            lines_section += "[{}] {}\n".format(
                character_lines[l[:-1]]['who'].capitalize(), quotesdict[l])

    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section

    # Create transcript
    lines = []
    for cline in quotesdict:
        ldict = {}
        cline = cline[:-1]  # Remove the trailing dot

        if cline[1] == 'c':
            ldict['cast'] = character_lines[cline]['who'].capitalize()
        else:
            ldict['cast'] = 'Narrator'

        lineno = cline.replace('<', '').replace('>', '')
        ldict['filename'] = '{}.wav'.format(lineno)
        ldict['line'] = quotesdict['{}.'.format(cline)]
        lines.append(ldict)

    # Create sfx dataframe
    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]

    sfx_df = pd.DataFrame.from_records(sfx)
    transcript_df = pd.DataFrame.from_records(lines)

    return script, transcript_df, sfx_df


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_annotation_path',
                           help='.ann file with annotation')
    argparser.add_argument('input_json_path',
                           help='.json file containing the character quotes')
    args = argparser.parse_args()

    # Load annotation and quotes dictionary
    with open(args.input_annotation_path, encoding='utf-8') as f:
        annot = f.read()

    with open(args.input_json_path, encoding='utf-8') as f:
        quotesdict = json.load(f)

    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)

    # splitext drops the extension whatever its length.
    output_path = os.path.splitext(args.input_annotation_path)[0] + '_script.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(script)

    # The tables are written next to the annotation file.
    # NOTE(review): writing '.xls' needs an Excel writer engine that
    # pandas can use for that format; confirm one is installed.
    output_dir = os.path.dirname(args.input_annotation_path)
    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))