view demo/annotation2script.py @ 13:16066f0a7127 tip

fixed the problem with brat
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 08 Dec 2018 11:02:40 +0000
parents 90155bdd5dd6
children
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 17:00:26 2018

@author: Emmanouil Theofanis Chourdakis

Takes an .ann annotation file and a .json file of character lines and
creates a _script.txt script file, plus transcript.xls and sfx.xls
spreadsheets written next to the annotation file.

"""

import argparse
import logging
import ner
from rel import *
import pypeg2 as pg
import pandas as pd
import json
import os

# Configure the root logger at INFO level so that the progress messages
# logged by this script are not filtered out.
logging.basicConfig(level=logging.INFO)


def _line_number(key):
    """Return the integer written just before the closing ``>`` of *key*.

    For a key such as ``<cline12>.`` this returns ``12``. A ``ValueError``
    is raised (by ``int``) when no digits precede the ``>``.
    """
    digits = ''
    # Walk backwards from the closing '>' and collect the contiguous digits,
    # so that multi-digit line numbers are read in full.
    for ch in reversed(key[:key.rfind('>')]):
        if ch not in '0123456789':
            break
        digits = ch + digits
    return int(digits)


def annotation2script(annot, quotesdict):
    """Build a script, a transcript table and a sound-effects table.

    Args:
        annot: Text of the annotation file; it is parsed with
            ``pg.parse(annot, ner.AnnotationFile)``.
        quotesdict: Mapping from line keys to the text of each line. The
            code expects keys shaped like ``<cline0>.`` or ``<nline0>.``:
            the second character selects a character line (``'c'``) or a
            narrator line (``'n'``), the number before the closing ``>``
            gives the order, and the last character is a trailing dot.

    Returns:
        A tuple ``(script, transcript_df, sfx_df)`` where ``script`` is the
        script text (cast list, scenes and lines), ``transcript_df`` is a
        DataFrame with one record per entry of ``quotesdict`` (``cast``,
        ``filename``, ``line``) and ``sfx_df`` is a DataFrame with one
        record per collected scene (``sfx``, ``url``).

    Raises:
        KeyError: If an attribute or relation refers to an entity that was
            not defined, or a character line has no speaker assigned.
        ValueError: If a key of ``quotesdict`` carries no line number.
    """
    logging.info('Parsing annotation')
    parsed = pg.parse(annot, ner.AnnotationFile)

    characters = {}
    places = {}  # collected for completeness; not used further below
    character_lines = {}
    scenes = []

    # Store an entity dictionary (variable -> annotation text) since
    # attributes and relations point to such entities.
    dictionary = {}

    # Visit all the parsed lines in several passes: entities first, then
    # attributes, then relations. The reason is that sometimes a relation
    # refers to an entity that is defined later in the file.

    # Pass 1: entities.
    for line in parsed:
        # Every annotation line has a single object
        obj = line[0]
        if not isinstance(obj, ner.AnnotationTuple):
            continue

        annotation = obj.annotation.lower()

        # Drop a leading article and keep the word that follows it. The
        # length check avoids an IndexError on an empty annotation or on
        # an annotation that consists of the article alone.
        # NOTE(review): only the first word after the article is kept
        # ("the old man" -> "old"); confirm this is intended.
        words = annotation.split()
        if len(words) > 1 and words[0] in ('a', 'the'):
            annotation = words[1]

        dictionary[obj.variable] = annotation

        if obj.type == 'Character':
            characters[annotation] = {}
        elif obj.type == 'Character_Line':
            character_lines[annotation] = {}
        elif obj.type == 'Place':
            places[annotation] = {}

    # Pass 2: attributes (gender / age) attached to characters.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.AttributeTuple):
            continue

        target = dictionary[obj.target]
        value = obj.annotation

        if obj.type == 'Gender':
            characters[target]['gender'] = value
        elif obj.type == 'Age':
            characters[target]['age'] = value

    # Pass 3: relations.
    for line in parsed:
        obj = line[0]
        if not isinstance(obj, ner.RelationTuple):
            continue

        # Relations have a trigger, a first argument `arg1' and a second
        # argument `arg2'. Every (arg1, arg2) combination is processed,
        # i.e. |arg1| * |arg2| pairs per relation.
        arg1_candidates = []
        arg2_candidates = []

        # Reset for every relation so that a relation without a known
        # trigger neither fails on an unbound name nor inherits the label
        # of the previous relation.
        label = None

        for arg in obj.args:
            if arg.label == 'Says':
                label = 'Quote'
            elif arg.label == 'Spatial_Signal':
                label = 'Spatial_Relation'
            if arg.label in ['Trajector', 'WHO']:
                arg1_candidates.append(dictionary[arg.target])
            if arg.label in ['Landmark', 'WHAT']:
                arg2_candidates.append(dictionary[arg.target])

        for arg1 in arg1_candidates:
            for arg2 in arg2_candidates:
                if label == 'Quote':
                    # arg2 is the character line, arg1 the speaker
                    character_lines[arg2]['who'] = arg1
                elif label == 'Spatial_Relation':
                    scenes.append(arg2)

    # Generate cast list
    cast_list_section = "Cast List:\nNarrator - male or female - panned center\n"

    # Ping - pong the characters between right and left
    panned = 'right'
    for c in characters:
        gender = characters[c].get('gender', 'male or female').lower()
        cast_list_section += '{} - {} - panned {}\n'.format(
            c.capitalize(), gender, panned)
        panned = 'left' if panned == 'right' else 'right'

    # One scene per line. Joining with '\n' keeps the output for zero or
    # one scene exactly as before while no longer gluing several scenes
    # together on a single line.
    scenes_definition = "Scenes:\n" + "\n".join(
        "{} - {} - fxive:{} - none".format(n, scene, scene)
        for n, scene in enumerate(scenes, 1))

    # Scene introduction
    ## TODO: Do it so that scenes follow the text

    # Keep the correct order in lines: the keys are of the format
    # <*line0>. <*line1>. etc, so sort them on the full number that
    # precedes the closing > (not just on its last digit).
    lines_order = sorted(quotesdict, key=_line_number)

    lines_section = "Script:\n--- Scene 1 ---\n"

    for l in lines_order:
        if l[1] == 'n':
            lines_section += "[Narrator] {}\n".format(quotesdict[l])
        elif l[1] == 'c':
            # l[:-1] strips the trailing dot to obtain the entity text
            speaker = character_lines[l[:-1]]['who'].capitalize()
            lines_section += "[{}] {}\n".format(speaker, quotesdict[l])

    script = cast_list_section + '\n' + scenes_definition + '\n' + lines_section

    # Create transcript
    lines = []

    for cline in quotesdict:
        ldict = {}

        cline = cline[:-1]  # Remove the trailing dot

        if cline[1] == 'c':
            ldict['cast'] = character_lines[cline]['who'].capitalize()
        else:
            ldict['cast'] = 'Narrator'

        lineno = cline.replace('<', '').replace('>', '')
        ldict['filename'] = '{}.wav'.format(lineno)
        ldict['line'] = quotesdict['{}.'.format(cline)]
        lines.append(ldict)

    # Create sfx dataframe: one editable placeholder url per scene
    sfx = [{'sfx': scene, 'url': 'http://edit_me'} for scene in scenes]

    sfx_df = pd.DataFrame.from_records(sfx)
    transcript_df = pd.DataFrame.from_records(lines)
    return script, transcript_df, sfx_df


if __name__ == "__main__":
    # Command-line entry point: read the annotation and the quotes file,
    # build the script and write the script text plus the two spreadsheets
    # next to the annotation file.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('input_annotation_path',
                           help='.ann file with annotation')

    argparser.add_argument('input_json_path',
                           help='.json file containing the character quotes')

    args = argparser.parse_args()

    # Load annotation and quotes dictionary. The encoding is given
    # explicitly so the result does not depend on the platform default.
    with open(args.input_annotation_path, encoding='utf-8') as f:
        annot = f.read()

    with open(args.input_json_path, encoding='utf-8') as f:
        quotesdict = json.load(f)

    script, transcript_df, sfx_df = annotation2script(annot, quotesdict)

    # Strip whatever extension the annotation file has instead of assuming
    # it is exactly four characters long.
    output_path = os.path.splitext(args.input_annotation_path)[0] + '_script.txt'

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(script)

    # The spreadsheets go in the same directory as the annotation file.
    output_dir = os.path.dirname(args.input_annotation_path)
    transcript_df.to_excel(os.path.join(output_dir, 'transcript.xls'))
    sfx_df.to_excel(os.path.join(output_dir, 'sfx.xls'))