Mercurial > hg > chourdakisreiss2018smc
diff demo/annotation2script.py @ 0:90155bdd5dd6
first commit
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 16 May 2018 18:27:05 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 17:00:26 2018

@author: Emmanouil Theofanis Chourdakis

Takes an .ann annotation and a .json character line
file and creates a _script.txt script file.

"""

import argparse
import logging
import ner
# NOTE(review): star import kept as-is because other code may rely on the
# names it brings in; the explicit imports below intentionally come after it
# so they cannot be shadowed by anything `rel` exports.
from rel import *
import pypeg2 as pg
import pandas as pd
import json
import os
import re

logging.basicConfig(level=logging.INFO)

# Leading articles dropped from entity names.
_ARTICLES = ('a', 'the')

# Last run of digits in a line key such as '<cline12>.'.
_TRAILING_NUMBER = re.compile(r'(\d+)\D*$')


def _strip_leading_article(text):
    """Return `text` without a leading 'a'/'the', keeping all other words."""
    words = text.split()
    # Require at least one word after the article so that an annotation
    # consisting only of an article (or an empty one) is left untouched
    # instead of raising IndexError.
    if len(words) > 1 and words[0] in _ARTICLES:
        return ' '.join(words[1:])
    return text


def _line_number(key):
    """Return the integer line number embedded at the end of a line key."""
    match = _TRAILING_NUMBER.search(key)
    if match is None:
        raise ValueError('No line number found in key {!r}'.format(key))
    return int(match.group(1))


def annotation2script(annot, quotesdict):
    """Build a script, a transcript table and an sfx table from an annotation.

    Parameters
    ----------
    annot : str
        Contents of the .ann annotation file; parsed with
        ``pg.parse(annot, ner.AnnotationFile)``.
    quotesdict : dict
        Mapping from line keys to the text of each line. Keys are expected
        to look like ``<nline0>.`` / ``<cline1>.``: the second character
        selects narrator ('n') or character ('c') lines, the key ends with
        a dot, and the number before the closing ``>`` gives the order.

    Returns
    -------
    tuple
        ``(script, transcript_df, sfx_df)`` where ``script`` is the script
        text (cast list, scenes and lines), ``transcript_df`` is a pandas
        DataFrame with one record per line ('cast', 'filename', 'line')
        and ``sfx_df`` is a pandas DataFrame with one record per scene
        ('sfx', 'url').

    Raises
    ------
    KeyError
        If an attribute or relation refers to an entity that was never
        defined, or a character line has no speaker assigned.
    ValueError
        If a key of `quotesdict` contains no line number.
    """
    logging.info('Parsing annotation')
    parsed = pg.parse(annot, ner.AnnotationFile)

    characters = {}
    character_lines = {}
    scenes = []

    # Maps annotation variables to entity names, since attributes and
    # relations point to entities through those variables.
    dictionary = {}

    # The parsed lines are visited in separate passes (entities, then
    # attributes, then relations) because a relation may refer to an
    # entity that is defined later in the file.

    # Pass 1: entities. Every annotation line holds a single object.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AnnotationTuple):
            continue

        annotation = _strip_leading_article(obj.annotation.lower())
        dictionary[obj.variable] = annotation

        if obj.type == 'Character':
            characters[annotation] = {}
        elif obj.type == 'Character_Line':
            character_lines[annotation] = {}

    # Pass 2: attributes (gender / age) attached to characters.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AttributeTuple):
            continue

        target = dictionary[obj.target]
        if obj.type == 'Gender':
            characters[target]['gender'] = obj.annotation
        elif obj.type == 'Age':
            characters[target]['age'] = obj.annotation

    # Pass 3: relations. Each relation has a trigger, a first argument and
    # a second argument; |arg1| * |arg2| relations are built per trigger.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.RelationTuple):
            continue

        # Reset per relation so that a relation without a trigger cannot
        # reuse the label of the previous one.
        label = None
        arg1_candidates = []
        arg2_candidates = []

        for arg in obj.args:
            if arg.label == 'Says':
                label = 'Quote'
            elif arg.label == 'Spatial_Signal':
                label = 'Spatial_Relation'

            if arg.label in ('Trajector', 'WHO'):
                arg1_candidates.append(dictionary[arg.target])
            if arg.label in ('Landmark', 'WHAT'):
                arg2_candidates.append(dictionary[arg.target])

        if label is None:
            # No recognised trigger: nothing to record for this relation.
            continue

        for arg1 in arg1_candidates:
            for arg2 in arg2_candidates:
                if label == 'Quote':
                    character_lines[arg2]['who'] = arg1
                else:
                    scenes.append(arg2)

    # Generate cast list
    cast_list_section = "Cast List:\nNarrator - male or female - panned center\n"

    # Ping - pong the characters between right and left
    panned = 'right'
    for name, attributes in characters.items():
        if 'gender' in attributes:
            gender = attributes['gender'].lower()
        else:
            gender = 'male or female'

        cast_list_section += '{} - {} - panned {}\n'.format(
            name.capitalize(), gender, panned)
        panned = 'left' if panned == 'right' else 'right'

    # One scene definition per line (joined, so a single scene produces
    # exactly the same text as before).
    scenes_definition = "Scenes:\n" + '\n'.join(
        "{} - {} - fxive:{} - none".format(n + 1, scene, scene)
        for n, scene in enumerate(scenes))

    # Scene introduction
    ## TODO: Do it so that scenes follow the text

    # The lines are of the format <*line0> <*line1> etc; order them by the
    # full number just before the closing >, not only its last digit.
    lines_order = sorted(quotesdict, key=_line_number)

    lines_section = "Script:\n--- Scene 1 ---\n"
    for key in lines_order:
        if key[1] == 'n':
            lines_section += "[Narrator] {}\n".format(quotesdict[key])
        elif key[1] == 'c':
            # Character lines are registered without the trailing dot.
            speaker = character_lines[key[:-1]]['who'].capitalize()
            lines_section += "[{}] {}\n".format(speaker, quotesdict[key])

    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section

    # Create transcript
    lines = []
    for key in quotesdict:
        cline = key[:-1]  # Remove the trailing dot

        if cline[1] == 'c':
            cast = character_lines[cline]['who'].capitalize()
        else:
            cast = 'Narrator'

        lineno = cline.replace('<', '').replace('>', '')
        lines.append({
            'cast': cast,
            'filename': '{}.wav'.format(lineno),
            'line': quotesdict['{}.'.format(cline)],
        })

    # Create sfx dataframe
    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]

    sfx_df = pd.DataFrame.from_records(sfx)
    transcript_df = pd.DataFrame.from_records(lines)
    return script, transcript_df, sfx_df


def main():
    """Read the annotation and quotes files and write the script and tables."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_annotation_path',
                           help='.ann file with annotation')

    argparser.add_argument('input_json_path',
                           help='.json file containing the character quotes')

    args = argparser.parse_args()

    # Load annotation and quotes dictionary
    with open(args.input_annotation_path) as f:
        annot = f.read()

    with open(args.input_json_path) as f:
        quotesdict = json.load(f)

    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)

    # Strip whatever extension the annotation file has rather than assuming
    # it is exactly four characters long.
    output_path = os.path.splitext(args.input_annotation_path)[0] + '_script.txt'

    with open(output_path, 'w') as f:
        f.write(script)

    output_dir = os.path.dirname(args.input_annotation_path)
    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))


if __name__ == "__main__":
    main()