annotate scripts/score_parser.py @ 928:ba58cf8d0dbc

Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix').
author Brecht De Man <BrechtDeMan@users.noreply.github.com>
date Sun, 31 May 2015 14:45:30 +0100
parents
children 97ebdb6b266a
rev   line source
"""Collect listening-test ratings from result XML files into per-page CSV files.

Every ``*.xml`` file in the input folder is treated as one subject.  Each
``audioholder`` element (a page) in that file contributes one row to
``<ratings_dir>/<page id>-ratings.csv``: the first cell is the subject ID
(the XML file name) and the remaining cells hold the ``value`` text of each
``audioelement`` (fragment), in the column order given by the CSV header.
Subjects may have evaluated different pages and fragments: a fragment a
subject did not rate is left as an empty cell, and a fragment not yet in the
header gets a new column at the end.

Run the script from the folder that contains the output XML files.
"""
import xml.etree.ElementTree as ET
import os
import csv

# TODO: remove DEBUG statements


def _read_page(audioholder):
    """Return ``(fragment IDs, {fragment ID: rating text})`` for one page.

    The set contains every ``audioelement`` ID on the page; the dict only has
    entries for fragments that carry a ``value`` child.
    """
    fragment_ids = set()
    ratings = {}
    for audioelement in audioholder.findall("audioelement"):
        fragment_id = audioelement.get('id')
        if fragment_id is None:
            continue  # without an ID the fragment cannot be mapped to a column
        fragment_ids.add(fragment_id)
        value = audioelement.find("value")
        if value is not None:
            # If an ID occurs more than once, the first rating found wins.
            ratings.setdefault(fragment_id, value.text)
    return fragment_ids, ratings


def _add_subject_row(csv_path, subject_id, fragment_ids, ratings):
    """Append one subject's ratings to a page CSV, growing the header if needed.

    csv_path     -- ratings file of the page (created if missing)
    subject_id   -- value written in the first column
    fragment_ids -- set of fragment IDs this subject was presented with
    ratings      -- dict mapping fragment ID to rating text
    """
    rows = []
    if os.path.isfile(csv_path):
        # Load everything while the file is open: the data rows are needed
        # again below when the header has to be extended.
        with open(csv_path, 'r') as readfile:
            rows = list(csv.reader(readfile, delimiter=','))

    # First header cell is empty: that column holds the subject IDs.
    header = rows[0] if rows else ['']
    new_fragments = sorted(fragment_ids - set(header))  # alphabetical order
    header = header + new_fragments
    if rows and new_fragments:
        print(["DEBUG New fragments found: " + str(new_fragments)])
        print(["DEBUG New header row: " + str(header)])

    # One cell per header column; empty if this subject has no rating for it.
    rating_row = [subject_id] + [ratings.get(name, '') for name in header[1:]]

    if rows and not new_fragments:
        # Header is unchanged, so the new row can simply be appended.
        with open(csv_path, 'a') as writefile:
            csv.writer(writefile, delimiter=',').writerow(rating_row)
        return

    # New file or new columns: write the complete table to a temporary file
    # next to the target, then swap it in.
    temp_path = csv_path + '.tmp'
    with open(temp_path, 'w') as writefile:
        filewriter = csv.writer(writefile, delimiter=',')
        filewriter.writerow(header)
        for row in rows[1:]:
            # Pad old rows up to the new header width.
            filewriter.writerow(row + [''] * (len(header) - len(row)))
        filewriter.writerow(rating_row)
    if os.path.exists(csv_path):
        os.remove(csv_path)  # os.rename cannot overwrite an existing file on Windows
    os.rename(temp_path, csv_path)


def parse_scores(xml_dir=".", ratings_dir="ratings"):
    """Write one ratings CSV per page from all XML files in ``xml_dir``.

    xml_dir     -- folder containing the result XML files (one per subject)
    ratings_dir -- folder receiving the ``<page id>-ratings.csv`` files;
                   created when the first page is written

    Files are processed in sorted name order so the row order is
    deterministic.  NOTE: rows are always appended, so running this twice on
    the same input adds every subject a second time.
    """
    for file_name in sorted(os.listdir(xml_dir)):
        if not file_name.endswith(".xml"):
            continue
        root = ET.parse(os.path.join(xml_dir, file_name)).getroot()
        subject_id = file_name  # file name as subject ID

        for audioholder in root.findall("./audioholder"):  # iterate over pages
            page_name = audioholder.get('id')
            csv_path = os.path.join(ratings_dir, page_name + '-ratings.csv')

            if not os.path.exists(ratings_dir):
                os.makedirs(ratings_dir)

            fragment_ids, ratings = _read_page(audioholder)
            _add_subject_row(csv_path, subject_id, fragment_ids, ratings)


if __name__ == "__main__":
    parse_scores()