annotate scripts/score_parser.py @ 147:927d05a43a70

Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix').
author Brecht De Man <b.deman@qmul.ac.uk>
date Sun, 31 May 2015 14:45:30 +0100
parents
children 97ebdb6b266a
rev   line source
# Score parsing: build one CSV per page id from every result XML file in a
# folder.  Each CSV has one column per fragment id and one row per subject
# (the XML file name is used as the subject id).  Subjects may have rated
# different selections of pages and fragments; missing ratings are left as
# empty cells and newly seen fragments are appended as extra columns.
#
# The code runs unchanged on Python 2.6+ and Python 3.  The former DEBUG
# print statements were removed, as the original TODO requested.
import xml.etree.ElementTree as ET
import os
import csv
import sys

# os.replace (Python 3.3+) overwrites an existing target on every platform;
# os.rename is the fallback for interpreters that do not provide it.
_replace_file = getattr(os, 'replace', os.rename)


def _open_csv(path, mode):
    """Open *path* for use with the csv module on the running interpreter.

    Python 3's csv module wants text files opened with ``newline=''``;
    Python 2's wants binary mode.  Either way the csv module, not the file
    object, decides the line terminator.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def _collect_ratings(audioholder):
    """Return ``(fragment_ids, ratings)`` for one page element.

    ``fragment_ids`` lists the ``id`` of every ``audioelement`` child in
    document order; ``ratings`` maps a fragment id to the text of its first
    ``value`` child.  Elements without an ``id`` attribute are skipped
    because they cannot be assigned to a column.
    """
    fragment_ids = []
    ratings = {}
    for audioelement in audioholder.findall('audioelement'):
        fragment_id = audioelement.get('id')
        if fragment_id is None:
            continue
        fragment_ids.append(fragment_id)
        value = audioelement.find('value')
        if value is not None and fragment_id not in ratings:
            ratings[fragment_id] = '' if value.text is None else value.text
    return fragment_ids, ratings


def _sync_header(file_name, fragment_ids):
    """Make sure *file_name* exists with a column for every fragment id.

    A missing (or empty) file is created with the header
    ``['', <fragment ids in alphabetical order>]``.  For an existing file,
    fragment ids not yet in its header are appended in alphabetical order and
    every existing row is padded with empty cells for the new columns.

    Returns the header row that is in the file afterwards.
    """
    rows = []
    if os.path.isfile(file_name):
        # Read everything while the file is open: the rows are needed again
        # below if the file has to be rewritten with extra columns.
        with _open_csv(file_name, 'r') as readfile:
            rows = list(csv.reader(readfile, delimiter=','))

    if not rows:
        # First column is the (unnamed) subject-id column.
        header = [''] + sorted(set(fragment_ids))
        with _open_csv(file_name, 'w') as writefile:
            csv.writer(writefile, delimiter=',').writerow(header)
        return header

    header = rows[0]
    new_fragments = sorted(set(fragment_ids) - set(header))
    if not new_fragments:
        return header

    header = header + new_fragments
    padding = [''] * len(new_fragments)
    # Write next to the target so the final replace never crosses file systems.
    temp_name = os.path.join(os.path.dirname(file_name), 'temp.csv')
    with _open_csv(temp_name, 'w') as writefile:
        writer = csv.writer(writefile, delimiter=',')
        writer.writerow(header)
        for row in rows[1:]:
            writer.writerow(row + padding)
    _replace_file(temp_name, file_name)
    return header


def parse_scores(xml_dir='.', ratings_dir='ratings'):
    """Append every subject's ratings to the per-page CSV files.

    xml_dir     -- folder containing the output XML files (one per subject).
    ratings_dir -- folder receiving ``<page id>-ratings.csv``; it is created
                   when the first page is written.

    Files are processed in sorted name order so that row order and the order
    of appended columns are reproducible.  Each call appends rows; running it
    twice over the same XML files therefore yields duplicate rows.
    """
    for file_name in sorted(os.listdir(xml_dir)):
        if not file_name.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(xml_dir, file_name)).getroot()
        subject_id = file_name  # file name doubles as subject ID

        for audioholder in root.findall('./audioholder'):  # one per page
            page_name = audioholder.get('id')
            if page_name is None:
                continue  # no id, so no file name can be derived

            if not os.path.exists(ratings_dir):
                os.makedirs(ratings_dir)
            ratings_file = os.path.join(ratings_dir,
                                        page_name + '-ratings.csv')

            fragment_ids, ratings = _collect_ratings(audioholder)
            header = _sync_header(ratings_file, fragment_ids)

            # Row layout: [subject ID, rating of fragment 1, ..., fragment M],
            # with an empty cell for every fragment this subject did not rate.
            rating_row = [subject_id]
            rating_row.extend(ratings.get(name, '') for name in header[1:])

            # Closing the file right away guarantees the row is on disk
            # before the next subject reads (and possibly rewrites) the file.
            with _open_csv(ratings_file, 'a') as writefile:
                csv.writer(writefile, delimiter=',').writerow(rating_row)


if __name__ == '__main__':
    # Run from the folder that holds the output XML files.
    parse_scores()