# HG changeset patch # User Brecht De Man # Date 1433079930 -3600 # Node ID 7da3e8c7039c8a5e14a18ec0f845c1c416c41d36 # Parent d8d018172b7a42e2a0db37d23651e74edf9466f6 Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix'). diff -r d8d018172b7a -r 7da3e8c7039c scripts/comment_parser.py --- a/scripts/comment_parser.py Sat May 30 18:12:32 2015 +0100 +++ b/scripts/comment_parser.py Sun May 31 14:45:30 2015 +0100 @@ -8,21 +8,21 @@ tree = ET.parse(file) root = tree.getroot() - # get list of all songs - for audioholder in root.findall("./audioholder"): # iterate over songs - song_name = audioholder.get('id') # get song name + # get list of all page names + for audioholder in root.findall("./audioholder"): # iterate over pages + page_name = audioholder.get('id') # get page name - # create folder [song_name] if not yet created - if not os.path.exists(song_name): - os.makedirs(song_name) + # create folder [page_name] if not yet created + if not os.path.exists(page_name): + os.makedirs(page_name) - # for song [song_name], print comments related to mix [id] - for audioelement in root.findall("*/[@id='"+song_name+"']/audioelement"): + # for page [page_name], print comments related to fragment [id] + for audioelement in root.findall("*/[@id='"+page_name+"']/audioelement"): audio_id = str(audioelement.get('id')) - # append to file [song_name]/[song_name]-comments-[id].csv - with open(song_name+'/'+song_name+'-comments-'+audio_id+'.csv', 'a') as csvfile: + # append to file [page_name]/[page_name]-comments-[id].csv + with open(page_name+'/'+page_name+'-comments-'+audio_id+'.csv', 'a') as csvfile: commentstr = root.find("*/[@id='" - + song_name + + page_name + "']/audioelement/[@id='" + audio_id + "']/comment/response").text diff -r d8d018172b7a -r 7da3e8c7039c scripts/score_parser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/score_parser.py Sun May 31 14:45:30 2015 +0100 @@ -0,0 +1,96 @@ +import xml.etree.ElementTree as ET +import os +import csv + +#TODO Remove DEBUG statements + +# get every XML file in folder +for file in os.listdir("."): # You have to put this in folder where output XML files are. + if file.endswith(".xml"): + tree = ET.parse(file) + root = tree.getroot() + #print ["DEBUG Reading " + file + "..."] + + # get subject ID from XML file + subject_id = file # file name as subject ID + + # get list of all pages this subject evaluated + for audioholder in root.findall("./audioholder"): # iterate over pages + page_name = audioholder.get('id') # get page name + #print ["DEBUG page " + page_name] + + file_name = 'ratings/'+page_name+'-ratings.csv' # score file name + + # create folder 'ratings if not yet created + if not os.path.exists('ratings'): + os.makedirs('ratings') + + # header: fragment IDs in 'alphabetical' order + # go to fragment column, or create new column if it doesn't exist yet + + # get array of audio elements and number of audio elements + audiolist = root.findall("*/[@id='"+page_name+"']/audioelement") + n_fragments = len(audiolist) + + # get alphabetical array of fragment IDs from this subject's XML + fragmentnamelist = [] # make empty list + for audioelement in audiolist: # iterate over all audioelements + fragmentnamelist.append(audioelement.get('id')) # add to list + + + # if file exists, get header and add 'new' fragments + if os.path.isfile(file_name): + #print ["DEBUG file " + file_name + " already exists - reading header"] + with open(file_name, 'r') as readfile: + filereader = csv.reader(readfile, delimiter=',') + headerrow = filereader.next() + #headerrow = headerrow[1:] # remove first column (empty) + + # Which of the fragmentes are in fragmentnamelist but not in headerrow? + newfragments = list(set(fragmentnamelist)-set(headerrow)) + newfragments = sorted(newfragments) # new fragments in alphabetical order + # If not empty, read file and rewrite adding extra columns + if newfragments: # if not empty + print ["DEBUG New fragments found: " + str(newfragments)] + with open('temp.csv', 'w') as writefile: + filewriter = csv.writer(writefile, delimiter=',') + filewriter.writerow(headerrow + newfragments) # write new header + for row in filereader: # rewrite row plus empty cells for every new fragment name + #print ["DEBUG Old row: " + str(row)] + filewriter.writerow(row + ['']*len(newfragments)) + #print ["DEBUG New row: " + str(row + ['']*len(newfragments))] + os.rename('temp.csv', file_name) # replace old file with temp file + headerrow = headerrow + newfragments + print ["DEBUG New header row: " + str(headerrow)] + + # if not, create file and make header + else: + #print ["DEBUG file " + file_name + " doesn't exist yet - making new one"] + headerrow = sorted(fragmentnamelist) # sort alphabetically + headerrow.insert(0,'') + fragmentnamelist = fragmentnamelist[1:] #HACKY FIX inserting in firstrow also affects fragmentnamelist + with open(file_name, 'w') as writefile: + filewriter = csv.writer(writefile, delimiter=',') + filewriter.writerow(headerrow) + + # open file to write for this page + writefile = open(file_name, 'a') + filewriter = csv.writer(writefile, delimiter=',') + + # prepare row to be written for this subject for this page + ratingrow = [subject_id] + + # get scores related to fragment [id] + for fragmentname in headerrow[1:]: # iterate over fragments in header (skip first empty column) + elementvalue = root.find("*/[@id='" + + page_name + + "']/audioelement/[@id='" + + fragmentname + + "']/value") + if hasattr(elementvalue, 'text'): # if rating for this fragment exists + ratingrow.append(elementvalue.text) # add to rating row + else: # if this subject has not rated this fragment + ratingrow.append('') # append empty cell + + # write row: [subject ID, rating fragment ID 1, ..., rating fragment ID M] + filewriter.writerow(ratingrow) \ No newline at end of file