comparison scripts/score_parser.py @ 928:ba58cf8d0dbc

Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix').
author Brecht De Man <BrechtDeMan@users.noreply.github.com>
date Sun, 31 May 2015 14:45:30 +0100
parents
children 97ebdb6b266a
comparison
equal deleted inserted replaced
927:5db0069046d5 928:ba58cf8d0dbc
import xml.etree.ElementTree as ET
import os
import csv

#TODO Remove DEBUG statements


def _collect_ratings(audioholder):
    """Return ``(fragment_ids, ratings)`` for one page element.

    ``fragment_ids`` lists the ``id`` of every ``audioelement`` child in
    document order.  ``ratings`` maps a fragment ID to the text of the first
    ``value`` element found for it; fragments without a ``value`` child are
    absent from the mapping.
    """
    fragment_ids = []
    ratings = {}
    for audioelement in audioholder.findall("audioelement"):
        fragment_id = audioelement.get('id')
        fragment_ids.append(fragment_id)
        value = audioelement.find("value")
        if value is not None and fragment_id not in ratings:
            ratings[fragment_id] = value.text
    return fragment_ids, ratings


def _sync_header(file_name, fragment_ids, temp_name):
    """Make sure the page's CSV has a column per fragment; return its header.

    A missing file is created with an empty first cell (the subject column)
    followed by the fragment IDs in alphabetical order.  An existing file
    keeps its columns; fragments it does not know yet are appended in
    alphabetical order and every existing row is padded with empty cells.
    The rewrite goes through ``temp_name``, which then replaces the file.
    """
    if not os.path.isfile(file_name):
        headerrow = [''] + sorted(fragment_ids)
        with open(file_name, 'w') as writefile:
            csv.writer(writefile, delimiter=',').writerow(headerrow)
        return headerrow

    with open(file_name, 'r') as readfile:
        filereader = csv.reader(readfile, delimiter=',')
        # An empty file is treated as having only the empty subject column.
        headerrow = next(filereader, [''])
        # Which of the fragments are in fragment_ids but not in headerrow?
        newfragments = sorted(set(fragment_ids) - set(headerrow))
        # Old rows are only needed for a rewrite; read them while the
        # file is still open.
        oldrows = list(filereader) if newfragments else []

    if newfragments:
        print(["DEBUG New fragments found: " + str(newfragments)])
        headerrow = headerrow + newfragments
        padding = [''] * len(newfragments)
        with open(temp_name, 'w') as writefile:
            filewriter = csv.writer(writefile, delimiter=',')
            filewriter.writerow(headerrow)
            for row in oldrows:
                filewriter.writerow(row + padding)
        os.rename(temp_name, file_name)  # replace old file with temp file
        print(["DEBUG New header row: " + str(headerrow)])
    return headerrow


def _append_subject_row(file_name, subject_id, headerrow, ratings):
    """Append ``[subject ID, rating 1, ..., rating M]`` to the page's CSV.

    Ratings follow the column order of ``headerrow`` (its first, empty cell
    is the subject column); fragments this subject did not rate get an
    empty cell.
    """
    ratingrow = [subject_id]
    for fragmentname in headerrow[1:]:
        ratingrow.append(ratings.get(fragmentname, ''))
    # 'with' closes the handle after each row instead of leaking it.
    with open(file_name, 'a') as writefile:
        csv.writer(writefile, delimiter=',').writerow(ratingrow)


def parse_scores(folder="."):
    """Collect the ratings of every XML file in ``folder`` into CSV files.

    Each ``*.xml`` file is one subject (the file name is the subject ID).
    For every ``audioholder`` (page) in it, one row is appended to
    ``<folder>/ratings/<page id>-ratings.csv``, with one column per
    fragment ID.  Subjects may have seen different pages and fragments:
    missing ratings are left empty and new fragments add new columns.

    Files are processed in sorted name order so the row order is
    reproducible.  Running the function again appends the rows again.
    """
    ratings_dir = os.path.join(folder, 'ratings')
    temp_name = os.path.join(folder, 'temp.csv')

    for xml_name in sorted(os.listdir(folder)):
        if not xml_name.endswith(".xml"):
            continue
        root = ET.parse(os.path.join(folder, xml_name)).getroot()
        subject_id = xml_name  # file name as subject ID

        for audioholder in root.findall("./audioholder"):  # iterate over pages
            page_name = audioholder.get('id')
            file_name = os.path.join(ratings_dir, page_name + '-ratings.csv')

            # create folder 'ratings' if not yet created
            if not os.path.exists(ratings_dir):
                os.makedirs(ratings_dir)

            fragment_ids, ratings = _collect_ratings(audioholder)
            headerrow = _sync_header(file_name, fragment_ids, temp_name)
            _append_subject_row(file_name, subject_id, headerrow, ratings)


# You have to run this in the folder where the output XML files are.
parse_scores(".")