changeset 290:2c7dc66c102e

Scripts: comment_parser and score_parser start new CSV files each time; various plots added to generated PDF report
author Brecht De Man <b.deman@qmul.ac.uk>
date Thu, 20 Aug 2015 11:29:29 +0200
parents 8a8e768bcbff
children a76081548018
files .hgignore README.txt scripts/comment_parser.py scripts/generate_report.py scripts/score_parser.py scripts/score_plot.py
diffstat 6 files changed, 265 insertions(+), 88 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Tue Aug 18 23:56:05 2015 +0200
+++ b/.hgignore	Thu Aug 20 11:29:29 2015 +0200
@@ -32,4 +32,5 @@
 saves/*/*.png
 saves/*/*.xml
 saves/ratings/*.pdf
-saves/timelines_movement/*.pdf
\ No newline at end of file
+saves/timelines_movement/*.pdf
+saves
\ No newline at end of file
--- a/README.txt	Tue Aug 18 23:56:05 2015 +0200
+++ b/README.txt	Thu Aug 20 11:29:29 2015 +0200
@@ -91,7 +91,8 @@
 SCRIPTS
 
 The tool comes with a few handy Python (2.7) scripts for easy extraction of ratings or comments, and visualisation of ratings and timelines. See below for a quick guide on how to use them. All scripts written for Python 2.7. Visualisation requires the free matplotlib toolbox (http://matplotlib.org), numpy and scipy. 
-By default, the scripts can be run from the ‘scripts’ folder, with the result files in the ‘saves’ folder (the default location where result XMLs are stored). Each script takes the XML file folder as an argument, along with other arguments in some cases. 
+By default, the scripts can be run from the ‘scripts’ folder, with the result files in the ‘saves’ folder (the default location where result XMLs are stored). Each script takes the XML file folder as an argument, along with other arguments in some cases.
+Note: please do not use spaces in file and folder names: they may work on some systems, but cause problems on others.
 
 	comment_parser.py
 		Extracts comments from the output XML files corresponding with the different subjects found in ‘saves/’. It creates a folder per ‘audioholder’/page it finds, and stores a CSV file with comments for every ‘audioelement’/fragment within these respective ‘audioholders’/pages. In this CSV file, every line corresponds with a subject/output XML file. Depending on the settings, the first column containing the name of the corresponding XML file can be omitted (for anonymisation). 
@@ -100,6 +101,9 @@
 	evaluation_stats.py
 		Shows a few statistics of tests in the ‘saves/‘ folder so far, mainly for checking for errors. Shows the number of files that are there, the audioholder IDs that were tested (and how many of each separate ID), the duration of each page, the duration of each complete test, the average duration per page, and the average duration in function of the page number. 
 
+	generate_report.py
+		Similar to ‘evaluation_stats.py’, but generates a PDF report based on the output files in the ‘saves/’ folder - or any folder specified as command line argument. Writes a LaTeX document, then uses pdflatex to convert it to a PDF.
+
 	score_parser.py
 		Extracts rating values from the XML to CSV - necessary for running visualisation of ratings. Creates the folder ‘saves/ratings/‘ if not yet created, to which it writes a separate file for every ‘audioholder’/page in any of the output XMLs it finds in ‘saves/‘. Within each file, rows represent different subjects (output XML file names) and columns represent different ‘audioelements’/fragments. 
 
--- a/scripts/comment_parser.py	Tue Aug 18 23:56:05 2015 +0200
+++ b/scripts/comment_parser.py	Thu Aug 20 11:29:29 2015 +0200
@@ -4,7 +4,7 @@
 import xml.etree.ElementTree as ET
 import os
 import csv
-
+import sys
 
 # COMMAND LINE ARGUMENTS
 
@@ -31,6 +31,9 @@
 
 # CODE
 
+# remember which files have been opened this time
+file_history = []
+
 # get every XML file in folder
 for file in os.listdir(folder_name): 
     if file.endswith(".xml"):
@@ -56,23 +59,28 @@
                     
                     csv_name = folder_name +'/' + page_name+'/'+page_name+'-comments-'+audio_id+'.csv'
 
-                    # append (!) to file [page_name]/[page_name]-comments-[id].csv
-                    with open(csv_name, 'a') as csvfile:
-                        writer = csv.writer(csvfile, 
-                                            delimiter=',', 
-                                            dialect="excel",
-                                            quoting=csv.QUOTE_ALL)
-                        commentstr = audioelement.find("./comment/response").text
+                    # If file hasn't been opened yet during this run, open it in write mode to empty it
+                    if csv_name not in file_history:
+                        csvfile = open(csv_name, 'w')
+                        file_history.append(csv_name) # remember this file has been written to this time around
+                    else: 
+                        # append (!) to file [page_name]/[page_name]-comments-[id].csv
+                        csvfile = open(csv_name, 'a')
+                    writer = csv.writer(csvfile, 
+                                        delimiter=',', 
+                                        dialect="excel",
+                                        quoting=csv.QUOTE_ALL)
+                    commentstr = audioelement.find("./comment/response").text
                         
-                        if commentstr is None:
-                           commentstr = '';
-                            
-                        # anonymous comments:
-                        #writer.writerow([commentstr.encode("utf-8")]) 
-                        # comments with (file) name:
-                        writer.writerow([file[:-4]] + [commentstr.encode("utf-8")]) 
+                    if commentstr is None:
+                       commentstr = ''
+                        
+                    # anonymous comments:
+                    #writer.writerow([commentstr.encode("utf-8")]) 
+                    # comments with (file) name:
+                    writer.writerow([file[:-4]] + [commentstr.encode("utf-8")]) 
 
-                        #TODO Replace 'new line' in comment with something else?
+                    #TODO Replace 'new line' in comment with something else?
                         
 # PRO TIP: Change from csv to txt by running this in bash: 
 # $ cd folder_where_csvs_are/
--- a/scripts/generate_report.py	Tue Aug 18 23:56:05 2015 +0200
+++ b/scripts/generate_report.py	Thu Aug 20 11:29:29 2015 +0200
@@ -11,16 +11,27 @@
 import numpy as np # numbers
 
 # Command line arguments
-assert len(sys.argv)<3, "evaluation_stats takes at most 1 command line argument\n"+\
-                        "Use: python evaluation_stats.py [results_folder]"
+assert len(sys.argv)<4, "evaluation_stats takes at most 2 command line argument\n"+\
+                        "Use: python generate_report.py [results_folder] [no_render | -nr]"
+
+render_figures = True
 
 # XML results files location
 if len(sys.argv) == 1:
     folder_name = "../saves"    # Looks in 'saves/' folder from 'scripts/' folder
-    print "Use: python evaluation_stats.py [results_folder]"
+    print "Use: python generate_report.py [results_folder] [no_render | -nr]"
     print "Using default path: " + folder_name
 elif len(sys.argv) == 2:
     folder_name = sys.argv[1]   # First command line argument is folder
+elif len(sys.argv) == 3:
+    folder_name = sys.argv[1]   # First command line argument is folder
+    assert sys.argv[2] in ('no_render','-nr'), "Second argument not recognised. \n" +\
+           "Use: python generate_report.py [results_folder] [no_render | -nr]"
+    # Second command line argument is [no_render | -nr]
+    render_figures = False
+    
+    
+#TODO add 'skip regenerating figures'
 
 # Turn number of seconds (int) to '[minutes] min [seconds] s' (string)
 def seconds2timestr(time_in_seconds):
@@ -39,11 +50,17 @@
 
 # arrays initialisation
 page_names       = []
+real_page_names  = [] # regardless of differing numbers of fragments
+subject_count    = [] # subjects per audioholder name
 page_count       = []
 duration_page    = []      # duration of experiment in function of page content
 duration_order   = []      # duration of experiment in function of page number
 fragments_per_page = []    # number of fragments for corresponding page
 
+# survey stats
+gender = []
+age    = []
+
 # get username if available
 for name in ('LOGNAME', 'USER', 'LNAME', 'USERNAME'):
     user = os.environ.get(name)
@@ -56,9 +73,11 @@
 # begin LaTeX document
 header = r'''\documentclass[11pt, oneside]{article} 
           \usepackage{geometry}
-          \geometry{letterpaper}
-          \usepackage[parfill]{parskip}
-          \usepackage{graphicx}
+          \geometry{a4paper}
+          \usepackage[parfill]{parskip} % empty line instead of indent
+          \usepackage{graphicx}    % figures
+          \usepackage{hyperref}
+          \usepackage{tikz}		   % pie charts
           \title{Report}
           \author{'''+\
           user+\
@@ -66,17 +85,31 @@
           \graphicspath{{'''+\
           folder_name+\
           r'''/}}
+          %\setcounter{section}{-1} % Summary section 0 so number of sections equals number of files
           \begin{document}
           \maketitle
+          This is an automatically generated report using the `generate\_report.py' Python script 
+          included with the Web Audio Evaluation Tool \cite{WAET} distribution which can be found 
+          at \texttt{code.soundsoftware.ac.uk/projects/webaudioevaluationtool}.
           \tableofcontents
+          
           '''
           
-footer = '\end{document}'
+footer = r'''\begin{thebibliography}{9}
+         \bibitem{WAET} % reference to accompanying publication
+        Nicholas Jillings, Brecht De Man, David Moffat and Joshua D. Reiss, 
+        ``Web Audio Evaluation Tool: A browser-based listening test environment,'' 
+        presented at the 12th Sound and Music Computing Conference, July 2015.
+        \end{thebibliography}
+        \end{document}'''
 
 body = ''
 
 # generate images for later use
-subprocess.call("timeline_view_movement.py", shell=True)
+if render_figures:
+    subprocess.call("python timeline_view_movement.py", shell=True)
+    subprocess.call("python score_parser.py", shell=True)
+    subprocess.call("python score_plot.py", shell=True)
 
 # get every XML file in folder
 files_list = os.listdir(folder_name)
@@ -94,6 +127,19 @@
         page_number = 0
         
         individual_table = '' # table with stats for this individual test file
+        timeline_plots = '' # plots of timeline (movements and plays)
+        
+        # DEMO survey stats
+        # get gender
+        this_subjects_gender = root.find("./posttest/radio/[@id='gender']")
+        if this_subjects_gender is not None:
+            gender.append(this_subjects_gender.get("name"))
+        else:
+            gender.append('UNAVAILABLE')
+        # get age
+        this_subjects_age = root.find("./posttest/number/[@id='age']")
+        if this_subjects_age is not None:
+            age.append(this_subjects_age.text)
         
         # get list of all page names
         for audioholder in root.findall("./audioholder"):   # iterate over pages
@@ -149,7 +195,6 @@
                 body += '\\emph{\\textbf{ATTENTION: '+str(not_moved)+' markers were not moved in '+page_name+'!}} \\\\ \n'
             if not_moved == 1: 
                 body += '\\emph{\\textbf{ATTENTION: one marker was not moved in '+page_name+'!}} \\\\ \n'
-                
             #TODO which one not moved/listened to? 
             
             # PRINT song-specific statistic
@@ -164,27 +209,18 @@
             # check if available
             if os.path.isfile(folder_name+'/'+img_path):
                 # SHOW timeline image
-                body += r'''\begin{figure}[htbp]
-                         \begin{center}
-                         \includegraphics[width=\textwidth]{'''+\
-                         folder_name+'/'+img_path+\
-                        r'''}
-                        \caption{Timeline of '''+\
-                         page_name+' by '+ file[:-4].capitalize() +\
-                        r'''.}
-                         \end{center}
-                         \end{figure}
-                         '''
+                timeline_plots += '\\includegraphics[width=\\textwidth]{'+\
+                         folder_name+'/'+img_path+'}\n\n'
             
             # keep track of duration in function of page index
             if len(duration_order)>page_number:
                 duration_order[page_number].append(duration)
             else:
                 duration_order.append([duration])
-                
+            
             # keep list of audioholder ids and count how many times each audioholder id
-            # was tested, how long it took, and how many fragments there were (if number of 
-            # fragments is different, store as different audioholder id)
+            # was tested, how long it took, and how many fragments there were 
+            # (if number of fragments is different, store as different audioholder id)
             if page_name in page_names: 
                 page_index = page_names.index(page_name) # get index
                 # check if number of audioelements the same
@@ -207,7 +243,16 @@
                 page_count.append(1)
                 duration_page.append([duration])
                 fragments_per_page.append(len(audioelements))
-                
+            
+            # number of subjects per audioholder regardless of differing numbers of 
+            # fragments (for inclusion in box plots)
+            if page_name in real_page_names:
+                page_index = real_page_names.index(page_name) # get index
+                subject_count[page_index] += 1
+            else: 
+                real_page_names.append(page_name)
+                subject_count.append(1)
+            
             # bookkeeping
             page_number += 1 # increase page count for this specific test
             number_of_pages += 1 # increase total number of pages
@@ -223,7 +268,11 @@
                   seconds2timestr(total_duration)+\
                  r'''}\\
                   \hline 
-                  \end{tabular}'''
+                  \end{tabular}
+                  
+                  '''
+        # PRINT timeline plots
+        body += timeline_plots
 
 # join to footer
 footer = body + footer
@@ -231,8 +280,9 @@
 # empty body again
 body = ''
 
-# PRINT summary of everything (at start)
-body += '\section{Summary}\n'
+# PRINT summary of everything (at start) 
+#       unnumbered so that number of sections equals number of files
+body += '\section*{Summary}\n\\addcontentsline{toc}{section}{Summary}'
 
 # PRINT table with statistics
 body += '\\begin{tabular}{ll}'
@@ -246,15 +296,6 @@
 body += r'Number of unmoved markers: &' + str(total_not_moved) +\
       " (" + str(round(100.0*total_not_moved/number_of_fragments,2)) + r"\%)\\"
 body += r'Average time per page: &' + seconds2timestr(time_per_page_accum/number_of_pages) + r"\\"
-
-
-# Pages and number of times tested
-page_count_strings = list(str(x) for x in page_count)
-count_list = page_names + page_count_strings
-count_list[::2] = page_names
-count_list[1::2] = page_count_strings
-#body +=  r'Pages tested: &' + str(count_list) + r"\\"
-
 body += '\\end{tabular} \\vspace{1.5cm} \\\\ \n'
 
 # Average duration for first, second, ... page
@@ -272,23 +313,13 @@
 body += '\\end{tabular} \\vspace{1.5cm} \\\\ \n'
 
 # SHOW bar plot of average time per page
-plt.bar(range(1,len(duration_order)+1), tpp_averages)
-plt.xlabel('Page')
+plt.bar(range(1,len(duration_order)+1), np.array(tpp_averages)/60)
+plt.xlabel('Page order')
 plt.xlim(.8, len(duration_order)+1)
 plt.xticks(np.arange(1,len(duration_order)+1)+.4, range(1,len(duration_order)+1))
-plt.ylabel('Time [seconds]')
+plt.ylabel('Average time [minutes]')
 plt.savefig(folder_name+"/time_per_page.pdf", bbox_inches='tight')
 plt.close()
-body += r'''\begin{figure}[htbp]
-         \begin{center}
-         \includegraphics[width=\textwidth]{'''+\
-         folder_name+"/time_per_page.pdf"+\
-        r'''}
-        \caption{Average time spent per audioholder page.}
-        \label{fig:avgtimeperpage}
-         \end{center}
-         \end{figure}
-         '''
 #TODO add error bars
 
 
@@ -306,36 +337,170 @@
 combined_list = sorted(zip(*combined_list), key=operator.itemgetter(1, 2)) # sort
 
 # Show average duration for all songs
-body += r'''\vspace{.5cm} Average duration per audioholder: \\
+body += r'''\vspace{.5cm} Average duration per audioholder (see also Figure \ref{fig:avgtimeperaudioholder}): \\
         \begin{tabular}{llll}
         \textbf{Audioholder} & \textbf{Duration} & \textbf{\# subjects} & \textbf{\# fragments} \\
         '''
+audioholder_names_ordered = []
+average_duration_audioholder_ordered = []
+number_of_subjects = []
 for page_index in range(len(page_names)):
+    audioholder_names_ordered.append(combined_list[page_index][0])
+    average_duration_audioholder_ordered.append(combined_list[page_index][1])
+    number_of_subjects.append(combined_list[page_index][3])
     body +=  combined_list[page_index][0] + "&" +\
              seconds2timestr(combined_list[page_index][1]) + "&" +\
              str(combined_list[page_index][3]) + "&" +\
              str(combined_list[page_index][2]) + r"\\"
 body += '\\end{tabular}\n'
 
+# SHOW bar plot of average time per audioholder
+plt.bar(range(1,len(audioholder_names_ordered)+1), np.array(average_duration_audioholder_ordered)/60)
+plt.xlabel('Audioholder')
+plt.xlim(.8, len(audioholder_names_ordered)+1)
+plt.xticks(np.arange(1,len(audioholder_names_ordered)+1)+.4, audioholder_names_ordered, rotation=90)
+plt.ylabel('Average time [minutes]')
+plt.savefig(folder_name+"/time_per_audioholder.pdf", bbox_inches='tight')
+plt.close()
+
+# SHOW bar plot of number of subjects per audioholder
+plt.bar(range(1,len(audioholder_names_ordered)+1), number_of_subjects)
+plt.xlabel('Audioholder')
+plt.xlim(.8, len(audioholder_names_ordered)+1)
+plt.xticks(np.arange(1,len(audioholder_names_ordered)+1)+.4, audioholder_names_ordered, rotation=90)
+plt.ylabel('Number of subjects')
+ax = plt.gca()
+ylims = ax.get_ylim()
+yint = np.arange(int(np.floor(ylims[0])), int(np.ceil(ylims[1]))+1)
+plt.yticks(yint)
+plt.savefig(folder_name+"/subjects_per_audioholder.pdf", bbox_inches='tight')
+plt.close()
+
+# SHOW all three figures
+body += r'''\begin{figure}[htbp]
+         \begin{center}
+         \includegraphics[width=.65\textwidth]{'''+\
+         folder_name+"/time_per_page.pdf"+\
+        r'''}
+        \caption{Average time spent per page.}
+        \label{fig:avgtimeperpage}
+         \end{center}
+         \end{figure}
+         '''
+body += r'''\begin{figure}[htbp]
+         \begin{center}
+         \includegraphics[width=.65\textwidth]{'''+\
+         folder_name+"/time_per_audioholder.pdf"+\
+        r'''}
+        \caption{Average time spent per audioholder.}
+        \label{fig:avgtimeperaudioholder}
+         \end{center}
+         \end{figure}
+         '''
+body += r'''\begin{figure}[htbp]
+         \begin{center}
+         \includegraphics[width=.65\textwidth]{'''+\
+         folder_name+"/subjects_per_audioholder.pdf"+\
+        r'''}
+        \caption{Number of subjects per audioholder.}
+        \label{fig:avgtimeperaudioholder}
+         \end{center}
+         \end{figure}
+         '''
+#TODO add error bars
+#TODO layout of figures
+
+# SHOW boxplot per audioholder
+#TODO order in decreasing order of participants
+for audioholder_name in page_names: # get each name
+    # plot boxplot if exists (not so for the 'alt' names)
+    if os.path.isfile(folder_name+'/ratings/'+audioholder_name+'-ratings-box.pdf'):
+        body += r'''\begin{figure}[htbp]
+             \begin{center}
+             \includegraphics[width=.65\textwidth]{'''+\
+             folder_name+"/ratings/"+audioholder_name+'-ratings-box.pdf'+\
+            r'''}
+            \caption{Box plot of ratings for audioholder '''+\
+            audioholder_name+' ('+str(subject_count[real_page_names.index(audioholder_name)])+\
+            ''' participants).}
+            \label{fig:avgtimeperpage}
+             \end{center}
+             \end{figure}
+             '''
+
+# DEMO pie chart of gender distribution among subjects
+genders = ['male', 'female', 'other', 'preferNotToSay', 'UNAVAILABLE']
+# TODO: get the above automatically
+gender_distribution = ''
+for item in genders:
+    number = gender.count(item)
+    if number>0:
+        gender_distribution += str("{:.2f}".format((100.0*number)/len(gender)))+\
+                               '/'+item.capitalize()+' ('+str(number)+'),\n'
+
+body += r'''
+        \def\angle{0}
+        \def\radius{3}
+        \def\cyclelist{{"orange","blue","red","green"}}
+        \newcount\cyclecount \cyclecount=-1
+        \newcount\ind \ind=-1
+        \begin{figure}[htbp]
+        \begin{center}\begin{tikzpicture}[nodes = {font=\sffamily}]
+        \foreach \percent/\name in {'''+\
+        gender_distribution+\
+        r'''} {\ifx\percent\empty\else               % If \percent is empty, do nothing
+        \global\advance\cyclecount by 1     % Advance cyclecount
+        \global\advance\ind by 1            % Advance list index
+        \ifnum6<\cyclecount                 % If cyclecount is larger than list
+          \global\cyclecount=0              %   reset cyclecount and
+          \global\ind=0                     %   reset list index
+        \fi
+        \pgfmathparse{\cyclelist[\the\ind]} % Get color from cycle list
+        \edef\color{\pgfmathresult}         %   and store as \color
+        % Draw angle and set labels
+        \draw[fill={\color!50},draw={\color}] (0,0) -- (\angle:\radius)
+          arc (\angle:\angle+\percent*3.6:\radius) -- cycle;
+        \node at (\angle+0.5*\percent*3.6:0.7*\radius) {\percent\,\%};
+        \node[pin=\angle+0.5*\percent*3.6:\name]
+          at (\angle+0.5*\percent*3.6:\radius) {};
+        \pgfmathparse{\angle+\percent*3.6}  % Advance angle
+        \xdef\angle{\pgfmathresult}         %   and store in \angle
+        \fi
+        };
+        \end{tikzpicture}
+        \caption{Representation of gender across subjects}
+        \label{default}
+        \end{center}
+        \end{figure}
+        '''
+# problem: some people entered twice? 
+
 #TODO
 # time per page in function of number of fragments (plot)
 # time per participant in function of number of pages
 # plot total time for each participant
-# plot total time
 # show 'count' per page (in order)
 
 # clear up page_index <> page_count <> page_number confusion
 
 
-texfile = header+body+footer
+texfile = header+body+footer # add bits together
 
 # write TeX file
-with open(folder_name + '/' + 'test.tex','w') as f:
+with open(folder_name + '/' + 'Report.tex','w') as f:
     f.write(texfile)
-proc=subprocess.Popen(shlex.split('pdflatex -output-directory='+folder_name+' '+ folder_name + '/test.tex'))
+proc=subprocess.Popen(shlex.split('pdflatex -output-directory='+folder_name+' '+ folder_name + '/Report.tex'))
 proc.communicate()
 # run again
-proc=subprocess.Popen(shlex.split('pdflatex -output-directory='+folder_name+' '+ folder_name + '/test.tex'))
+proc=subprocess.Popen(shlex.split('pdflatex -output-directory='+folder_name+' '+ folder_name + '/Report.tex'))
 proc.communicate()
 
-#TODO remove auxiliary LaTeX files
\ No newline at end of file
+# Remove auxiliary LaTeX files (best effort: stops silently at the first file that cannot be removed)
+try:
+    os.remove(folder_name + '/' + 'Report.aux')
+    os.remove(folder_name + '/' + 'Report.log')
+    os.remove(folder_name + '/' + 'Report.out')
+    os.remove(folder_name + '/' + 'Report.toc')
+except OSError:
+    pass
+    
\ No newline at end of file
--- a/scripts/score_parser.py	Tue Aug 18 23:56:05 2015 +0200
+++ b/scripts/score_parser.py	Thu Aug 20 11:29:29 2015 +0200
@@ -5,8 +5,6 @@
 import sys
 import csv
 
-#TODO Remove DEBUG statements
-
 # COMMAND LINE ARGUMENTS
 
 assert len(sys.argv)<3, "score_parser takes at most 1 command line argument\n"+\
@@ -32,12 +30,14 @@
     
 # CODE
 
+# remember which files have been opened this time
+file_history = []
+
 # get every XML file in folder
 for file in os.listdir(folder_name):
     if file.endswith(".xml"):
         tree = ET.parse(folder_name + '/' + file)
         root = tree.getroot()
-        #print "DEBUG Reading " + file + "..."
 
         # get subject ID from XML file
         subject_id = file[:-4] # file name (without extension) as subject ID
@@ -52,7 +52,7 @@
 
             file_name = folder_name+'/ratings/'+page_name+'-ratings.csv' # score file name
 
-            # create folder 'ratings if not yet created
+            # create folder 'ratings' if not yet created
             if not os.path.exists(folder_name + '/ratings'):
                 os.makedirs(folder_name + '/ratings')
 
@@ -68,38 +68,38 @@
             for audioelement in audiolist: # iterate over all audioelements
                 fragmentnamelist.append(audioelement.get('id')) # add to list
 
-
             # if file exists, get header and add 'new' fragments
             if os.path.isfile(file_name):
-                #print "DEBUG file " + file_name + " already exists - reading header"
                 with open(file_name, 'r') as readfile:
                     filereader = csv.reader(readfile, delimiter=',')
                     headerrow = filereader.next()
 
+                # If file hasn't been opened yet this time, remove all rows except header
+                if file_name not in file_history:
+                    with open(file_name, 'w') as writefile:
+                        filewriter = csv.writer(writefile, delimiter=',')
+                        headerrow = sorted(headerrow)
+                        filewriter.writerow(headerrow)
+                    file_history.append(file_name)
+
                 # Which of the fragmentes are in fragmentnamelist but not in headerrow?
                 newfragments = list(set(fragmentnamelist)-set(headerrow))
                 newfragments = sorted(newfragments) # new fragments in alphabetical order
                 # If not empty, read file and rewrite adding extra columns
                 if newfragments: # if not empty
-                    #print "DEBUG New fragments found: " + str(newfragments)
                     with open('temp.csv', 'w') as writefile:
                         filewriter = csv.writer(writefile, delimiter=',')
                         filewriter.writerow(headerrow + newfragments) # write new header
-                        #print "        "+str(headerrow + newfragments) # DEBUG
                         with open(file_name, 'r') as readfile:
                             filereader = csv.reader(readfile, delimiter=',')
                             filereader.next() # skip header
                             for row in filereader: # rewrite row plus empty cells for every new fragment name
-                                #print "            Old row: " + str(row) # DEBUG
                                 filewriter.writerow(row + ['']*len(newfragments))
-                                #print "            New row: " + str(row + ['']*len(newfragments)) # DEBUG
                     os.rename('temp.csv', file_name) # replace old file with temp file
                     headerrow = headerrow + newfragments
-                    #print "DEBUG New header row: " + str(headerrow)
 
             # if not, create file and make header
             else:
-                #print ["DEBUG file " + file_name + " doesn't exist yet - making new one"]
                 headerrow = sorted(fragmentnamelist) # sort alphabetically
                 headerrow.insert(0,'')
                 fragmentnamelist = fragmentnamelist[1:] #HACKY FIX inserting in firstrow also affects fragmentnamelist
@@ -127,4 +127,3 @@
             # write row: [subject ID, rating fragment ID 1, ..., rating fragment ID M]
             if any(ratingrow[1:]): # append to file if row non-empty (except subject name)
                 filewriter.writerow(ratingrow)
-
--- a/scripts/score_plot.py	Tue Aug 18 23:56:05 2015 +0200
+++ b/scripts/score_plot.py	Thu Aug 20 11:29:29 2015 +0200
@@ -214,7 +214,7 @@
         plt.title(page_name)
         plt.xlabel('Fragment')
         plt.xlim(0, len(headerrow)+1) # only show relevant region, leave space left & right)
-        plt.xticks(range(1, len(headerrow)+1), headerrow) # show fragment names
+        plt.xticks(range(1, len(headerrow)+1), headerrow, rotation=90) # show fragment names
         plt.ylabel('Rating')
         plt.ylim(0,1)