view python/score_parser.py @ 2376:c41caaa96633

Some fixes for #90. Also a failsafe loop if the server never responds with meaningful information from saves (for instance, when running only on Apache or basic HTTP servers). More changes to pythonServer for Python 3.5. Please check whether it is still valid on 2.7.
author Nicholas Jillings <nicholas.jillings@mail.bcu.ac.uk>
date Thu, 19 May 2016 10:44:19 +0100
parents 4843377d9976
children dde81c372fdc
line wrap: on
line source
#!/usr/bin/python

import xml.etree.ElementTree as ET
import os
import sys
import csv

# COMMAND LINE ARGUMENTS

# Validate explicitly rather than with `assert`: assertions are stripped when
# Python runs with -O, which would leave folder_name undefined further down.
if len(sys.argv) > 2:
    # sys.exit with a string prints it to stderr and exits with status 1
    sys.exit("score_parser takes at most 1 command line argument\n"
             "Use: python score_parser.py [rating_folder_location]")

# XML results files location
if len(sys.argv) == 2:
    folder_name = sys.argv[1]   # First command line argument is folder
else:
    folder_name = "../saves"    # Looks in 'saves/' folder from 'scripts/' folder
    print("Use: python score_parser.py [rating_folder_location]")
    print("Using default path: " + folder_name)

# check if folder_name exists
if not os.path.isdir(folder_name):
    # the path is missing, or it is not a directory that can be listed
    print("Folder '"+folder_name+"' does not exist.")
    sys.exit() # terminate script execution
elif not os.access(folder_name, os.W_OK):
    # The folder exists but is not writable. Test the folder itself: its
    # dirname is the *parent* directory (or '' for a bare relative name).
    # This is only a warning; execution continues.
    print("No write privileges in folder '"+folder_name+"'.")

    
# CODE

def _open_csv(path, mode):
    """Open `path` for use with the csv module.

    `mode` is 'r', 'w' or 'a'. Python 3 needs text mode with newline='' and
    Python 2 needs binary mode; otherwise the csv module's own line endings
    get translated again and blank rows appear on Windows.
    """
    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, mode + 'b')


def _read_header(ratings_path):
    """Return the first row of an existing ratings file.

    An empty file (or an empty first line) is treated as a header holding
    only the blank subject-ID column, instead of raising StopIteration.
    """
    with _open_csv(ratings_path, 'r') as readfile:
        return next(csv.reader(readfile, delimiter=','), None) or ['']


def _append_columns(ratings_path, headerrow, newfragments):
    """Rewrite `ratings_path` with one extra empty column per new fragment.

    The copy is written next to the original (not in the current working
    directory) so that the final move stays on one filesystem.
    """
    temp_path = ratings_path + '.tmp'
    with _open_csv(ratings_path, 'r') as readfile, \
            _open_csv(temp_path, 'w') as writefile:
        filereader = csv.reader(readfile, delimiter=',')
        filewriter = csv.writer(writefile, delimiter=',')
        next(filereader, None) # skip old header
        filewriter.writerow(headerrow + newfragments) # write new header
        for row in filereader: # old row plus empty cells for the new fragments
            filewriter.writerow(row + [''] * len(newfragments))
    # os.replace overwrites the target on every platform; it does not exist
    # on Python 2, where os.rename is used as before.
    getattr(os, 'replace', os.rename)(temp_path, ratings_path)


def _page_scores(page):
    """Map each fragment ID on `page` to the text of its first <value> child.

    Audio elements without a <value> child are left out of the mapping.
    """
    scores = {}
    for audioelement in page.findall("./audioelement"):
        value = audioelement.find("./value")
        ref = audioelement.get('ref')
        if value is not None and ref not in scores:
            scores[ref] = value.text
    return scores


# remember which ratings files have been opened this time
file_history = []

# all ratings CSV files live in this subfolder of the results folder
ratings_folder = folder_name + '/ratings'

# get every XML file in folder
for file_name in os.listdir(folder_name):
    if not file_name.endswith(".xml"):
        continue

    root = ET.parse(folder_name + '/' + file_name).getroot()

    # file name (without extension) as subject ID
    subject_id = file_name[:-4]

    # iterate over all pages this subject evaluated
    for page in root.findall("./page"):
        page_name = page.get('ref') # get page reference ID

        # an empty or incomplete page ends processing of this subject's file
        if page_name is None:
            print("WARNING: " + file_name + " contains empty audio holder. (score_parser.py)")
            break

        if page.get('state') != "complete":
            print("WARNING: " + file_name + " contains incomplete page " +page_name+ ". (score_parser.py)")
            break

        # Score file for this page. Kept in its own variable so that
        # file_name still names the XML file in the warnings above.
        ratings_path = ratings_folder + '/' + page_name + '-ratings.csv'

        # create folder 'ratings' if not yet created
        if not os.path.exists(ratings_folder):
            os.makedirs(ratings_folder)

        # fragment IDs in this subject's XML, and the scores given to them
        fragmentnamelist = [audioelement.get('ref')
                            for audioelement in page.findall("./audioelement")]
        scores = _page_scores(page)

        if os.path.isfile(ratings_path):
            # file exists: get header and add any fragments not yet in it
            headerrow = _read_header(ratings_path)

            # If file hasn't been opened yet this time, remove all rows
            # except the (alphabetically sorted) header
            if ratings_path not in file_history:
                headerrow = sorted(headerrow)
                with _open_csv(ratings_path, 'w') as writefile:
                    csv.writer(writefile, delimiter=',').writerow(headerrow)
                file_history.append(ratings_path)

            # fragments in this XML but not in the header, alphabetically
            newfragments = sorted(set(fragmentnamelist) - set(headerrow))
            if newfragments:
                _append_columns(ratings_path, headerrow, newfragments)
                headerrow = headerrow + newfragments
        else:
            # file does not exist yet: create it with a header made of an
            # empty subject-ID cell followed by the sorted fragment IDs
            headerrow = [''] + sorted(fragmentnamelist)
            with _open_csv(ratings_path, 'w') as writefile:
                csv.writer(writefile, delimiter=',').writerow(headerrow)
            file_history.append(ratings_path)

        # row: [subject ID, rating fragment ID 1, ..., rating fragment ID M]
        # with an empty cell for every fragment this subject has not rated
        ratingrow = [subject_id] + [scores.get(fragmentname, '')
                                    for fragmentname in headerrow[1:]]

        # append to file if row non-empty (except subject name)
        if any(ratingrow[1:]):
            with _open_csv(ratings_path, 'a') as writefile:
                csv.writer(writefile, delimiter=',').writerow(ratingrow)