Mercurial > hg > chourdakisreiss2016
comparison experiment-reverb/code/vgs.py @ 0:246d5546657c
initial commit, needs cleanup
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 14 Dec 2016 13:15:48 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:246d5546657c |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Created on Fri May 15 12:37:53 2015 | |
4 | |
5 @author: mmxgn | |
6 """ | |
7 | |
8 # Trying VGS algorithm as described in the VOGUE paper | |
9 from numpy import * | |
10 | |
11 S = "ACBDAHCBADFGAIEB" | |
12 | |
13 S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA" | |
14 # Input Parameters | |
15 maxgap = 2 | |
16 minsup = 2 | |
17 k = 2 | |
18 | |
19 seq = list(S) | |
20 | |
21 # For k = 1 | |
22 | |
23 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet) | |
24 | |
25 # For k = 2 | |
26 | |
27 # Possible sequences of size 2 | |
28 | |
29 seq_2 = [(i,j) for i in seq_1 for j in seq_1] | |
30 | |
31 # Generate positions of symbols | |
32 positions = {} | |
33 for i in range(0, len(seq)): | |
34 if seq[i] not in positions: | |
35 positions[seq[i]] = [i] | |
36 else: | |
37 if i not in positions[seq[i]]: | |
38 positions[seq[i]].append(i) | |
39 | |
40 def all_subsequences (S, k, supp=1): | |
41 if k == 1: | |
42 l = [m for m in S] | |
43 | |
44 support = {} | |
45 | |
46 for m in l: | |
47 if m in support: | |
48 support[m] += 1 | |
49 else: | |
50 support[m] = 1 | |
51 | |
52 | |
53 | |
54 | |
55 s = [m for m in l if support[m] >= supp] | |
56 | |
57 | |
58 return s | |
59 | |
60 | |
61 if k == 2: | |
62 P_ = all_subsequences(S,1,supp) | |
63 | |
64 | |
65 S = [m for m in S if m in P_] | |
66 | |
67 | |
68 l = [(m,n) for m in S for n in S if m != n] | |
69 | |
70 | |
71 | |
72 return l | |
73 | |
74 | |
75 else: | |
76 l = [] | |
77 sseqs_ = all_subsequences(S, k-1, supp) | |
78 P_ = all_subsequences(S,1,supp) | |
79 | |
80 S = [m for m in S if m in P_] | |
81 | |
82 for m in S: | |
83 for n in sseqs_: | |
84 conc = (m, ) + n | |
85 if conc not in l: | |
86 l.append(conc) | |
87 | |
88 | |
89 | |
90 return l | |
91 | |
92 | |
93 def subsequence_to_indices (sseq, seq): | |
94 | |
95 positions = {} | |
96 for i in range(0, len(seq)): | |
97 if seq[i] not in positions: | |
98 positions[seq[i]] = [i] | |
99 else: | |
100 if i not in positions[seq[i]]: | |
101 positions[seq[i]].append(i) | |
102 | |
103 | |
104 for sq in sseq: | |
105 idx = [] | |
106 for s in sq: | |
107 idx.append(positions[s]) | |
108 | |
109 | |
110 def cartesian_product(*lists): | |
111 if len(lists) == 2: | |
112 return [(i,j) for i in lists[0] for j in lists[1]] | |
113 else: | |
114 return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])] | |
115 | |
116 def possible_sequences(S, minsup, maxgap, K): | |
117 singletons = set(S) | |
118 | |
119 # Show appearances | |
120 | |
121 positions_singletons = {} | |
122 for i in range(0, len(S)): | |
123 if S[i] not in positions_singletons: | |
124 positions_singletons[S[i]] = [i] | |
125 else: | |
126 if i not in positions_singletons[S[i]]: | |
127 positions_singletons[S[i]].append(i) | |
128 | |
129 | |
130 toremove = [] | |
131 for p in positions_singletons: | |
132 if len(positions_singletons[p]) < minsup: | |
133 singletons.remove(p) | |
134 toremove.append(p) | |
135 | |
136 | |
137 for i in toremove: | |
138 positions_singletons.pop(i) | |
139 | |
140 | |
141 positions = [] | |
142 | |
143 for p in positions_singletons: | |
144 positions.append(positions_singletons[p]) | |
145 subsequences = [] | |
146 | |
147 for k in range(2, K+1): | |
148 subsequences_ = [] | |
149 | |
150 sseq = all_subsequences(list(singletons), k) | |
151 | |
152 for seq in sseq: | |
153 pos = [] | |
154 for s in seq: | |
155 pos.append(positions_singletons[s]) | |
156 possible_sequences = cartesian_product(*pos) | |
157 subsequences_ = subsequences_+possible_sequences | |
158 | |
159 | |
160 | |
161 | |
162 subsequences = subsequences+subsequences_ | |
163 | |
164 | |
165 | |
166 | |
167 | |
168 | |
169 | |
170 | |
171 | |
172 | |
173 | |
174 subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences) | |
175 subsequences_alpha = [] | |
176 | |
177 support = {} | |
178 for s in subsequences: | |
179 seq = [] | |
180 for i in s: | |
181 seq.append(S[i]) | |
182 | |
183 if tuple(seq) not in subsequences_alpha: | |
184 subsequences_alpha.append(tuple(seq)) | |
185 | |
186 if tuple(seq) in support: | |
187 support[tuple(seq)] += 1 | |
188 else: | |
189 support[tuple(seq)] = 1 | |
190 | |
191 gap_sequences = [] | |
192 | |
193 # for s in subsequences: | |
194 # for i in range(0, len(s)-1): | |
195 # | |
196 | |
197 | |
198 return support | |
199 | |
200 #pseq = possible_sequences(seq, 2, 2, 3) | |
201 # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm | |
202 | |
203 #print pseq | |
204 | |
205 | |
206 def find_frequent_subsequences(seq, K, minsup, maxgap): | |
207 S = seq | |
208 | |
209 L = [] | |
210 | |
211 # Map each individual symbol to its position in the sequence | |
212 for k in range(0, len(seq)): | |
213 L.append((seq[k],k)) | |
214 | |
215 | |
216 # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]} | |
217 Lr = {} | |
218 | |
219 for k in range(0, len(L)): | |
220 if L[k][0] in Lr: | |
221 Lr[L[k][0]].append(L[k][1]) | |
222 else: | |
223 Lr[L[k][0]] = [L[k][1]] | |
224 | |
225 # Filter out the symbols with less than minsup support | |
226 Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup) | |
227 Lr = Lk | |
228 | |
229 # Recursively combine->reduce | |
230 for k in range(1, K): | |
231 L = [] | |
232 for l1 in Lk: | |
233 for l2 in Lr: | |
234 | |
235 # Combine symbol1,..,symbolN-1 and symbolN | |
236 subseq = tuple(l1)+tuple(l2) | |
237 | |
238 for v1 in Lk[l1]: | |
239 for v2 in Lr[l2]: | |
240 if k == 1: | |
241 v = [v1, v2] | |
242 else: | |
243 v = v1 + [v2] | |
244 | |
245 # Keep only those that are sorted in order with a maximum gap of maxgap | |
246 if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]): | |
247 L.append((subseq, v)) | |
248 | |
249 | |
250 # Reduce again and filter | |
251 Lk = {} | |
252 | |
253 for k in range(0, len(L)): | |
254 if L[k][0] in Lk: | |
255 Lk[L[k][0]].append(L[k][1]) | |
256 else: | |
257 Lk[L[k][0]] = [L[k][1]] | |
258 | |
259 Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) | |
260 | |
261 # Return a record with gap symbols and frequent subsequences | |
262 record = {} | |
263 | |
264 for lk in Lk: | |
265 length = len(lk) | |
266 | |
267 | |
268 gap_symbols = [] | |
269 for i in range(0, len(Lk[lk])): | |
270 gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]] | |
271 if gap: | |
272 gap_symbols += gap | |
273 | |
274 record[lk] = (len(Lk[lk]), gap_symbols) | |
275 | |
276 | |
277 | |
278 return record | |
279 | |
280 | |
281 | |
282 | |
283 | |
284 print find_frequent_subsequences(seq,5, 4, 2) | |
285 | |
286 |