annotate scripts/score_parser.py @ 1598:7da3e8c7039c

Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix').
author Brecht De Man <b.deman@qmul.ac.uk>
date Sun, 31 May 2015 14:45:30 +0100
parents
children 97ebdb6b266a
rev   line source
# Score parsing: build one CSV file per page ID from all result XML files.
#
# Every CSV (ratings/<page id>-ratings.csv) has one column per fragment ID and
# one row per subject; the first column holds the subject ID (the XML file
# name). Subjects may have evaluated different selections of pages and
# fragments: missing ratings are left as empty cells and unseen fragments get
# a new column. Runs on Python 2.7 and Python 3.
import xml.etree.ElementTree as ET
import os
import csv

#TODO Remove DEBUG statements

RATINGS_DIR = 'ratings'  # default output folder, relative to the working directory

# os.replace (Python 3.3+) overwrites an existing target on every platform;
# fall back to os.rename on interpreters that do not provide it.
_replace_file = getattr(os, 'replace', os.rename)


def _page_ratings(audioholder):
    """Collect the ratings stored in one <audioholder> (page) element.

    Returns a tuple (fragment_ids, ratings): the set of fragment IDs found on
    the page, and a dict mapping fragment ID to the text of its <value> child.
    Fragments without a <value> child are absent from the dict; audio elements
    without an 'id' attribute are ignored.
    """
    fragment_ids = set()
    ratings = {}
    for audioelement in audioholder.findall('audioelement'):
        fragment_id = audioelement.get('id')
        if fragment_id is None:
            continue  # cannot be assigned to a column
        fragment_ids.add(fragment_id)
        value = audioelement.find('value')
        if value is not None:
            # first rated occurrence wins if an ID appears more than once
            ratings.setdefault(fragment_id, value.text or '')
    return fragment_ids, ratings


def _read_table(csv_path):
    """Return (header, rows) of an existing CSV, or (None, []) if the file is missing or empty."""
    if not os.path.isfile(csv_path):
        return None, []
    with open(csv_path, 'r') as readfile:
        table = list(csv.reader(readfile, delimiter=','))
    if not table:
        return None, []
    return table[0], table[1:]


def _write_table(csv_path, header, rows):
    """Write header and rows to a temporary file, then swap it in for csv_path."""
    temp_path = csv_path + '.tmp'  # same folder, so the replace stays on one filesystem
    with open(temp_path, 'w') as writefile:
        filewriter = csv.writer(writefile, delimiter=',')
        filewriter.writerow(header)
        filewriter.writerows(rows)
    # only replace once the temporary file is complete and closed
    _replace_file(temp_path, csv_path)


def _add_subject_row(csv_path, subject_id, fragment_ids, ratings):
    """Append one subject's ratings to a page CSV, adding columns for new fragments."""
    header, rows = _read_table(csv_path)
    if header is None:
        # new file: empty corner cell, then fragment IDs in alphabetical order
        header = [''] + sorted(fragment_ids)
        _write_table(csv_path, header, [])
    else:
        # fragments this subject rated that have no column yet
        newfragments = sorted(fragment_ids - set(header))
        if newfragments:
            print(["DEBUG New fragments found: " + str(newfragments)])
            header = header + newfragments
            # pad every existing row with empty cells up to the new width
            padded = [row + [''] * (len(header) - len(row)) for row in rows]
            _write_table(csv_path, header, padded)
            print(["DEBUG New header row: " + str(header)])

    # row: [subject ID, rating fragment ID 1, ..., rating fragment ID M];
    # empty cell where this subject did not rate the fragment
    ratingrow = [subject_id] + [ratings.get(name, '') for name in header[1:]]
    with open(csv_path, 'a') as writefile:
        csv.writer(writefile, delimiter=',').writerow(ratingrow)


def parse_scores(folder='.', ratings_dir=RATINGS_DIR):
    """Parse every XML file in `folder` and update the per-page rating CSVs.

    folder      -- directory containing the output XML files (one per subject)
    ratings_dir -- directory receiving the '<page id>-ratings.csv' files;
                   created on demand

    Files are processed in sorted name order so the row order is reproducible.
    Raises xml.etree.ElementTree.ParseError if an XML file is malformed.
    """
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(folder, file_name)).getroot()
        subject_id = file_name  # file name as subject ID

        for audioholder in root.findall('./audioholder'):  # iterate over pages
            page_name = audioholder.get('id')
            if page_name is None:
                continue  # no page ID means no target file name

            if not os.path.isdir(ratings_dir):
                os.makedirs(ratings_dir)

            fragment_ids, ratings = _page_ratings(audioholder)
            csv_path = os.path.join(ratings_dir, page_name + '-ratings.csv')
            _add_subject_row(csv_path, subject_id, fragment_ids, ratings)


if __name__ == '__main__':
    # You have to run this in the folder where the output XML files are.
    parse_scores('.')