Mercurial > hg > chourdakisreiss2018smc

diff demo/annotation2script.py @ 0:90155bdd5dd6
first commit
author: Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date: Wed, 16 May 2018 18:27:05 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/demo/annotation2script.py	Wed May 16 18:27:05 2018 +0100
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May  1 17:00:26 2018
+
+@author: Emmanouil Theofanis Chourdakis
+
+Takes an .ann annotation and a .json character line
+file and creates a _script.txt script file.
+
+"""
+
+import argparse
+import logging
+import ner
+from rel import *
+import pypeg2 as pg
+import pandas as pd
+import json
+import os
+
+# Configure the root logger so the logging.info() progress messages
+# emitted by this script are shown (INFO level and above).
+logging.basicConfig(level=logging.INFO)
+
+
+def annotation2script(annot, quotesdict):
+    """Build a script, a transcript table and an sfx table.
+
+    Args:
+        annot: Text of the .ann annotation file. It is parsed with
+            ``pg.parse(annot, ner.AnnotationFile)``.
+        quotesdict: Mapping from line keys to the text of each line.
+            Keys look like ``<nline0>.`` / ``<cline1>.``: the second
+            character selects narrator ('n') or character ('c') lines,
+            the digits before ``>`` give the line order, and the last
+            character (a trailing dot) is not part of the annotated
+            placeholder.
+
+    Returns:
+        A tuple ``(script, transcript_df, sfx_df)`` where ``script`` is
+        the script text (cast list, scenes, lines), ``transcript_df`` is
+        a DataFrame with ``cast``, ``filename`` and ``line`` columns, and
+        ``sfx_df`` is a DataFrame with ``sfx`` and ``url`` columns (one
+        row per scene).
+
+    Raises:
+        KeyError: If an attribute or relation refers to an entity that
+            was never defined, or a character line has no speaker.
+        ValueError: If a key of ``quotesdict`` carries no line number.
+    """
+    logging.info('Parsing annotation')
+    parsed = pg.parse(annot, ner.AnnotationFile)
+
+    characters = {}
+    places = {}  # collected for completeness; not used in the output
+    character_lines = {}
+    scenes = []
+
+    # Maps annotation variables to the (normalised) annotated string,
+    # since attributes and relations refer to entities by variable.
+    dictionary = {}
+
+    # The parsed lines are visited in separate passes (entities, then
+    # attributes, then relations) because a relation sometimes refers
+    # to an entity that is defined later in the file.
+
+    # Pass 1: entities. Every annotation line holds a single object.
+    for line in parsed:
+        obj = line[0]
+        if not isinstance(obj, ner.AnnotationTuple):
+            continue
+
+        annotation = obj.annotation.lower()
+
+        # Drop a leading article and keep the word that follows it.
+        # An annotation made only of an article is left untouched
+        # instead of failing with an IndexError.
+        words = annotation.split()
+        if len(words) > 1 and words[0] in ('a', 'the'):
+            annotation = words[1]
+
+        dictionary[obj.variable] = annotation
+
+        if obj.type == 'Character':
+            characters[annotation] = {}
+        elif obj.type == 'Character_Line':
+            character_lines[annotation] = {}
+        elif obj.type == 'Place':
+            places[annotation] = {}
+
+    # Pass 2: attributes. Gender and age are attached to the character
+    # the attribute points to.
+    for line in parsed:
+        obj = line[0]
+        if not isinstance(obj, ner.AttributeTuple):
+            continue
+
+        target = dictionary[obj.target]
+        if obj.type == 'Gender':
+            characters[target]['gender'] = obj.annotation
+        elif obj.type == 'Age':
+            characters[target]['age'] = obj.annotation
+
+    # Pass 3: relations. Each relation has a trigger and two groups of
+    # argument candidates; |arg1| * |arg2| pairs are produced for it.
+    for line in parsed:
+        obj = line[0]
+        if not isinstance(obj, ner.RelationTuple):
+            continue
+
+        # Reset per relation so a relation without a known trigger
+        # neither fails nor inherits the previous relation's label.
+        label = None
+        arg1_candidates = []
+        arg2_candidates = []
+
+        for arg in obj.args:
+            if arg.label == 'Says':
+                label = 'Quote'
+            elif arg.label == 'Spatial_Signal':
+                label = 'Spatial_Relation'
+
+            if arg.label in ('Trajector', 'WHO'):
+                arg1_candidates.append(dictionary[arg.target])
+            if arg.label in ('Landmark', 'WHAT'):
+                arg2_candidates.append(dictionary[arg.target])
+
+        for arg1 in arg1_candidates:
+            for arg2 in arg2_candidates:
+                if label == 'Quote':
+                    # arg1 says the character line arg2
+                    character_lines[arg2]['who'] = arg1
+                elif label == 'Spatial_Relation':
+                    # the landmark of a spatial relation is a scene
+                    scenes.append(arg2)
+
+    # Cast list: the narrator is centred, characters alternate between
+    # the right and the left channel, starting on the right.
+    cast_list_section = ('Cast List:\n'
+                         'Narrator - male or female - panned center\n')
+    for position, name in enumerate(characters):
+        if 'gender' in characters[name]:
+            gender = characters[name]['gender'].lower()
+        else:
+            gender = 'male or female'
+        panned = 'right' if position % 2 == 0 else 'left'
+        cast_list_section += '{} - {} - panned {}\n'.format(
+            name.capitalize(), gender, panned)
+
+    # Scene definitions, one per line.
+    scenes_definition = 'Scenes:\n'
+    for number, scene in enumerate(scenes, start=1):
+        scenes_definition += '{} - {} - fxive:{} - none\n'.format(
+            number, scene, scene)
+
+    # Scene introduction
+    # TODO: Do it so that scenes follow the text
+
+    # Order the lines by the whole number in their key, so that line 10
+    # comes after line 9 rather than next to line 0.
+    lines_order = sorted(quotesdict, key=_line_number)
+
+    lines_section = 'Script:\n--- Scene 1 ---\n'
+    for key in lines_order:
+        if key[1] == 'n':
+            lines_section += '[Narrator] {}\n'.format(quotesdict[key])
+        elif key[1] == 'c':
+            speaker = character_lines[key[:-1]]['who'].capitalize()
+            lines_section += '[{}] {}\n'.format(speaker, quotesdict[key])
+
+    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section
+
+    # Transcript: one record per line, in the order of quotesdict.
+    lines = []
+    for key in quotesdict:
+        placeholder = key[:-1]  # Remove the trailing dot
+
+        if placeholder[1] == 'c':
+            cast = character_lines[placeholder]['who'].capitalize()
+        else:
+            cast = 'Narrator'
+
+        lineno = placeholder.replace('<', '').replace('>', '')
+        lines.append({
+            'cast': cast,
+            'filename': '{}.wav'.format(lineno),
+            'line': quotesdict[key],
+        })
+
+    # Sound effects: one editable placeholder url per scene.
+    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]
+
+    sfx_df = pd.DataFrame.from_records(sfx)
+    transcript_df = pd.DataFrame.from_records(lines)
+    return script, transcript_df, sfx_df
+
+
+def _line_number(key):
+    """Return the integer written just before the closing ``>`` of key."""
+    stem = key.rstrip('.>')
+    digits = stem[len(stem.rstrip('0123456789')):]
+    return int(digits)
+
+
+if __name__ == "__main__":
+    # Command line entry point: reads the annotation and the quotes
+    # file, then writes <annotation>_script.txt next to the annotation
+    # together with transcript.xls and sfx.xls in the same directory.
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('input_annotation_path',
+                           help='.ann file with annotation')
+
+    argparser.add_argument('input_json_path',
+                           help='.json file containing the character quotes')
+
+    args = argparser.parse_args()
+
+    # Load annotation and quotes dictionary. The encoding is explicit
+    # so the result does not depend on the platform's locale.
+    with open(args.input_annotation_path, encoding='utf-8') as f:
+        annot = f.read()
+
+    with open(args.input_json_path, encoding='utf-8') as f:
+        quotesdict = json.load(f)
+
+    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)
+
+    # Strip the real extension, whatever its length, rather than
+    # assuming it is exactly four characters long.
+    output_stem = os.path.splitext(args.input_annotation_path)[0]
+    output_path = output_stem + '_script.txt'
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(script)
+
+    output_dir = os.path.dirname(args.input_annotation_path)
+    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
+    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))
+
author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Wed, 16 May 2018 18:27:05 +0100
parents
children