comparison python/survey_parser.py @ 2938:76c06bd88bbe

Adding survey parser. closes #169
author Dave Moffat <me@davemoffat.com>
date Tue, 12 Sep 2017 15:23:30 +0100
parents
children 356a09527abc c29ef0cc741f
comparison
equal deleted inserted replaced
2937:8bcba5c95656 2938:76c06bd88bbe
1 #!/usr/bin/python
2 import xml.etree.ElementTree as ET
3 import os
4 import sys
5 import csv
6 import matplotlib.pyplot as plt
7 import numpy as np
8 import scipy as sp
9 import scipy.stats
10
11
# COMMAND LINE ARGUMENTS

# Validate the argument count explicitly: an `assert` is stripped when Python
# runs with -O, which would leave `folder_name` undefined further down.
if len(sys.argv) > 2:
    sys.exit("survey_parser takes at most 1 command line argument\n"
             "Use: python survey_parser.py [rating_folder_location]")

# XML results files location
if len(sys.argv) < 2:
    folder_name = "../saves"    # Looks in 'saves/' folder from 'scripts/' folder
    print("Use: python survey_parser.py [rating_folder_location]")
    print("Using default path: " + folder_name)
else:
    folder_name = sys.argv[1]   # First command line argument is folder

# check if folder_name exists
if not os.path.exists(folder_name):
    # the folder is not there
    print("Folder '"+folder_name+"' does not exist.")
    sys.exit()  # terminate script execution
elif not os.access(folder_name, os.W_OK):
    # The folder exists but is not writable. Check the folder itself (output
    # is created inside it), not its parent: os.path.dirname() of a bare
    # folder name is "" and would always be reported as not writable.
    print("No write privileges in folder '"+folder_name+"'.")

# CODE

# Everything decoded from the result files:
#   storage["globals"][location][ref]          -> surveys attached to the session
#   storage["pages"][page_ref][location][ref]  -> surveys attached to a page
# where location is "pre" or "post".
storage = {"globals":{}, "pages": {}}
37
def decodeSurveyTree(session_id, surveyroot, store):
    """Decode every entry of one survey node into `store`.

    session_id -- identifier written to the first column of each response row
    surveyroot -- XML element whose children are the survey entries; each
                  child is read for its "ref" and "type" attributes
    store      -- dict mapping an entry's "ref" to
                  {"type": ..., "header": (...), "responses": [...]};
                  it is updated in place and also returned

    Entries whose type is not one of the handled types are recorded with
    their duration only, so that every stored entry carries a "header" for
    the CSV export to write.
    """
    for survey_entry in surveyroot:
        survey_id = survey_entry.get("ref")
        survey_type = survey_entry.get("type")
        entry = store.setdefault(survey_id, {"responses": []})
        entry["type"] = survey_type

        if survey_type in ("statement", "video"):
            entry.setdefault("header", ("ids", "duration"))
            store[survey_id] = decodeSurveyStatement(session_id, survey_entry, entry)
        elif survey_type in ("question", "number", "slider"):
            # NOTE(review): "durations" is plural only for this family of
            # types; kept unchanged so existing CSV consumers are unaffected.
            entry.setdefault("header", ("ids", "durations", "response"))
            store[survey_id] = decodeSurveyQuestion(session_id, survey_entry, entry)
        elif survey_type == "checkbox":
            if "header" not in entry:
                # One column per option, named after the option
                option_names = [option.get("name")
                                for option in survey_entry.findall("./response")]
                entry["header"] = tuple(["ids", "duration"] + option_names)
            store[survey_id] = decodeSurveyCheckbox(session_id, survey_entry, entry)
        elif survey_type == "radio":
            entry.setdefault("header", ("ids", "duration", "response"))
            store[survey_id] = decodeSurveyRadio(session_id, survey_entry, entry)
        else:
            # Unrecognised type: without a header the export step would fail
            # with a KeyError, so keep at least who saw it and for how long.
            entry.setdefault("header", ("ids", "duration"))
            entry["responses"].append((session_id, survey_entry.get("duration")))
    return store
66
def decodeSurveyStatement(session_id, survey_entry, store):
    """Record a duration-only response.

    Appends (session_id, value of the entry's "duration" attribute) to
    store["responses"] and returns the same `store` dict.
    """
    duration = survey_entry.get("duration")
    store["responses"].append((session_id, duration))
    return store
71
def decodeSurveyQuestion(session_id, survey_entry, store):
    """Record the answer given to a single-response entry.

    Appends (session_id, duration attribute, text of the first <response>
    child) to store["responses"]; the answer is None when the entry has no
    <response> child. Returns the same `store` dict.
    """
    answer_node = survey_entry.find("./response")
    answer = None if answer_node is None else answer_node.text
    store["responses"].append((session_id, survey_entry.get("duration"), answer))
    return store
80
def decodeSurveyCheckbox(session_id, survey_entry, store):
    """Record the state of every option of a checkbox entry.

    Appends one row to store["responses"]: session_id, the duration
    attribute, then the "checked" attribute of each <response> child in
    document order. Returns the same `store` dict.
    """
    checked_states = [node.get("checked") for node in survey_entry.findall("./response")]
    row = (session_id, survey_entry.get("duration")) + tuple(checked_states)
    store["responses"].append(row)
    return store
87
def decodeSurveyRadio(session_id, survey_entry, store):
    """Record which option was chosen in a radio entry.

    Appends (session_id, duration attribute, "name" attribute of the first
    <response> child) to store["responses"]; the choice is None when the
    entry has no <response> child. Returns the same `store` dict.
    """
    chosen_node = survey_entry.find("./response")
    chosen = None if chosen_node is None else chosen_node.get("name")
    store["responses"].append((session_id, survey_entry.get("duration"), chosen))
    return store
96
# Make sure the folder path ends with a separator so names can be appended
if not folder_name.endswith("/"):
    folder_name += "/"

# Create the folder 'surveys' if not yet created
if not os.path.exists(folder_name + 'surveys'):
    os.makedirs(folder_name + 'surveys')


def _collect_surveys(subject_id, parent, target):
    """Decode the 'pre' and 'post' surveys directly under `parent` into `target`.

    A survey that is missing or has no child entries is ignored, so `target`
    only gains a "pre"/"post" key once there is something to store.
    """
    for location in ("pre", "post"):
        survey = parent.find("./survey[@location='" + location + "']")
        # Compare the length by value: `len(x) is not 0` tests object
        # identity, which is implementation-dependent and triggers a
        # SyntaxWarning on Python >= 3.8.
        if survey is None or len(survey) == 0:
            continue
        target[location] = decodeSurveyTree(subject_id, survey, target.setdefault(location, {}))


def _write_survey_csv(path, survey):
    """Write one survey entry to `path`: its header row, then one row per response."""
    with open(path, "w") as f:
        filewriter = csv.writer(f, delimiter=",")
        filewriter.writerow(survey["header"])
        filewriter.writerows(survey["responses"])


#Iterate through every XML file in folder_name
for file_name in os.listdir(folder_name):
    if not file_name.endswith(".xml"):
        continue
    root = ET.parse(folder_name + file_name).getroot()
    subject_id = root.get('key')

    # Surveys attached to the session as a whole
    _collect_surveys(subject_id, root, storage["globals"])

    # Now iterate through the page specifics (completed pages only)
    for page in root.findall("./page[@state='complete']"):
        page_store = storage["pages"].setdefault(page.get("ref"), {})
        _collect_surveys(subject_id, page, page_store)

#Storage now holds entire survey structure
# Time to start exporting to files

# Store globals: one CSV per survey entry, directly in 'surveys/'
file_store_root = folder_name + 'surveys/'
for position in storage["globals"]:
    for ref in storage["globals"][position]:
        _write_survey_csv(file_store_root+ref+".csv", storage["globals"][position][ref])

# Store page surveys: one sub-folder per page, one CSV per survey entry
for page_name in storage["pages"]:
    for position in storage["pages"][page_name]:
        if not os.path.exists(file_store_root + page_name):
            os.makedirs(file_store_root + page_name)
        for ref in storage["pages"][page_name][position]:
            _write_survey_csv(file_store_root+page_name+"/"+ref+".csv",
                              storage["pages"][page_name][position][ref])
167
168 #Time to plot
169
def plotDurationHistogram(store, plot_id, saveloc):
    """Save a 10-bin histogram of the response durations as a PDF.

    store   -- survey entry dict; element 1 of every row in
               store["responses"] is read as the duration
    plot_id -- used in the plot title and in the output file name
    saveloc -- path prefix of the output; the figure is written to
               saveloc + plot_id + "-duration.pdf"

    Rows without a recorded duration (None) are counted as 0.
    """
    durations = np.asarray([0.0 if row[1] is None else float(row[1])
                            for row in store["responses"]])
    fig = plt.figure()
    plt.hist(durations, 10, facecolor='green', alpha=0.75)
    plt.xlabel("Duration")
    plt.ylabel("Count")
    plt.grid(True)
    plt.title("Histogram of durations for "+plot_id)
    plt.savefig(saveloc+plot_id+"-duration.pdf", bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed; this function is
    # called once per survey entry, so release the figure once it is saved.
    plt.close(fig)
185
def plotRadio(store, plot_id, saveloc):
    """Save a pie chart of how often each radio option was selected.

    store   -- survey entry dict; element 2 of every row in
               store["responses"] is read as the selected option
    plot_id -- used in the plot title and in the output file name
    saveloc -- path prefix of the output; the figure is written to
               saveloc + plot_id + ".pdf"
    """
    counts = {}
    for row in store["responses"]:
        counts[row[2]] = counts.get(row[2], 0) + 1
    # Pass real lists: on Python 3 keys()/values() are views, not sequences.
    labels = list(counts.keys())
    sizes = list(counts.values())
    fig = plt.figure()
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.title("Selections of "+plot_id)
    plt.savefig(saveloc+plot_id+".pdf", bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed; release it here.
    plt.close(fig)
199
def plotCheckbox(store, plot_id, saveloc):
    """Save a bar chart of how often each checkbox option was ticked.

    store   -- survey entry dict; store["header"][2:] supplies the option
               labels and the matching elements of every row in
               store["responses"] hold "true" for a ticked option
    plot_id -- used in the plot title and in the output file name
    saveloc -- path prefix of the output; the figure is written to
               saveloc + plot_id + ".pdf"
    """
    labels = list(store["header"][2:])
    counts = [0] * len(labels)
    for row in store["responses"]:
        # Slicing tolerates rows that have fewer columns than the header
        for i, value in enumerate(row[2:len(labels) + 2]):
            if value == "true":
                counts[i] += 1
    # One bar per option; the positions must match the number of counts
    # (a fixed arange(4) only works for entries with exactly four options).
    x = np.arange(len(labels))
    fig = plt.figure()
    # Centre the bars on x explicitly so the tick labels line up regardless
    # of the matplotlib version's default bar alignment.
    plt.bar(x, counts, width=0.8, align='center')
    plt.xticks(x, labels)
    plt.xlabel("Option")
    plt.ylabel("Count")
    plt.title("Selection counts of "+plot_id)
    plt.savefig(saveloc+plot_id+".pdf", bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed; release it here.
    plt.close(fig)
218
# Plot every survey entry of every page into that page's output folder:
# always a duration histogram, plus a selection chart for radio/checkbox.
for page_name, page_surveys in storage["pages"].items():
    for position, position_surveys in page_surveys.items():
        saveloc = file_store_root+page_name+"/"
        for ref, survey in position_surveys.items():
            plotDurationHistogram(survey, ref, saveloc)
            survey_type = survey["type"]
            if survey_type == "radio":
                plotRadio(survey, ref, saveloc)
            if survey_type == "checkbox":
                plotCheckbox(survey, ref, saveloc)