annotate scripts/comment_parser.py @ 928:ba58cf8d0dbc

Score parsing: make csv from all XML files (one file per page ID, one column per fragment ID, one row per subject). Supports varying selections of pages and fragments across subjects. Generalised to 'page' (instead of 'song') and 'fragment' (instead of 'mix').
author Brecht De Man <BrechtDeMan@users.noreply.github.com>
date Sun, 31 May 2015 14:45:30 +0100
parents 5db0069046d5
children 97ebdb6b266a
rev   line source
"""Collect listener comments from result XML files into per-fragment CSV files.

Every ``*.xml`` file in the working folder is treated as the result file of
one subject.  For each page (``<audioholder id=...>``) and each fragment
(``<audioelement id=...>``) on that page, the text found under
``comment/response`` is appended as one row to

    [page_name]/[page_name]-comments-[fragment_id].csv

Put this script in the folder that holds the output XML files and run it.
"""
import xml.etree.ElementTree as ET
import os
import csv

# True on Python 2, where ``str`` is the byte-string type.  The csv module
# expects byte strings and binary files there, text and ``newline=''`` on 3.
_PY2 = str is bytes


def _open_csv_for_append(path):
    """Open *path* for appending in the mode the csv module expects."""
    if _PY2:
        return open(path, 'ab')
    return open(path, 'a', newline='', encoding='utf-8')


def _comment_text(audioelement):
    """Return the comment text of one fragment, or '' if there is none.

    An empty ``<response/>`` has ``text is None`` and a fragment may lack the
    ``comment/response`` element altogether; both yield an empty string so
    that every subject still contributes exactly one row per fragment.
    """
    response = audioelement.find("./comment/response")
    if response is None or response.text is None:
        return ""
    return response.text


def write_comments(xml_path, output_dir="."):
    """Append the comments of one result file to the per-fragment CSV files.

    Args:
        xml_path: path of the XML result file of one subject.
        output_dir: folder in which the ``[page_name]`` folders are created.

    Returns:
        The number of rows written.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_path* is not well-formed XML.
    """
    root = ET.parse(xml_path).getroot()
    rows_written = 0

    for audioholder in root.findall("./audioholder"):  # iterate over pages
        page_name = audioholder.get('id')
        if page_name is None:
            continue  # a page without an id cannot be mapped to a folder

        # create folder [page_name] if not yet created
        page_dir = os.path.join(output_dir, page_name)
        if not os.path.isdir(page_dir):
            os.makedirs(page_dir)

        # Walk the fragments of this very page element instead of looking
        # them up again by id: ids containing quotes would break the XPath
        # expression, and repeated ids would match the wrong elements.
        for audioelement in audioholder.findall("./audioelement"):
            audio_id = audioelement.get('id')
            if audio_id is None:
                continue  # no id, so no file name to write to

            comment = _comment_text(audioelement)
            if _PY2:
                comment = comment.encode("utf-8")

            # append to file [page_name]/[page_name]-comments-[id].csv
            csv_path = os.path.join(
                page_dir, page_name + '-comments-' + audio_id + '.csv')
            with _open_csv_for_append(csv_path) as csvfile:
                writer = csv.writer(csvfile, delimiter=',')
                # NOTE: to add more columns (e.g. the subject's file name),
                # pass them as separate list items.  Joining them with ','
                # into one string yields a single quoted field, which is why
                # the comma did not act as a delimiter before.
                writer.writerow([comment])
            rows_written += 1

    return rows_written


def parse_comments(folder="."):
    """Process every ``*.xml`` file in *folder*; return the rows written.

    Files are handled in sorted order so that the order of the rows in the
    CSV files does not depend on the directory listing order.
    """
    total = 0
    for file_name in sorted(os.listdir(folder)):
        if file_name.endswith(".xml"):
            total += write_comments(os.path.join(folder, file_name), folder)
    return total


# TODO Replace 'new line' inside comments with something else?

# TODO 'Append' means duplicate entries if run several times...

if __name__ == "__main__":
    parse_comments(".")