comparison experiment-reverb/code/vgs.py @ 0:246d5546657c

initial commit, needs cleanup
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 14 Dec 2016 13:15:48 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:246d5546657c
1 # -*- coding: utf-8 -*-
2 """
3 Created on Fri May 15 12:37:53 2015
4
5 @author: mmxgn
6 """
7
8 # Trying VGS algorithm as described in the VOGUE paper
9 from numpy import *
10
11 S = "ACBDAHCBADFGAIEB"
12
13 S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
14 # Input Parameters
15 maxgap = 2
16 minsup = 2
17 k = 2
18
19 seq = list(S)
20
21 # For k = 1
22
23 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
24
25 # For k = 2
26
27 # Possible sequences of size 2
28
29 seq_2 = [(i,j) for i in seq_1 for j in seq_1]
30
31 # Generate positions of symbols
32 positions = {}
33 for i in range(0, len(seq)):
34 if seq[i] not in positions:
35 positions[seq[i]] = [i]
36 else:
37 if i not in positions[seq[i]]:
38 positions[seq[i]].append(i)
39
40 def all_subsequences (S, k, supp=1):
41 if k == 1:
42 l = [m for m in S]
43
44 support = {}
45
46 for m in l:
47 if m in support:
48 support[m] += 1
49 else:
50 support[m] = 1
51
52
53
54
55 s = [m for m in l if support[m] >= supp]
56
57
58 return s
59
60
61 if k == 2:
62 P_ = all_subsequences(S,1,supp)
63
64
65 S = [m for m in S if m in P_]
66
67
68 l = [(m,n) for m in S for n in S if m != n]
69
70
71
72 return l
73
74
75 else:
76 l = []
77 sseqs_ = all_subsequences(S, k-1, supp)
78 P_ = all_subsequences(S,1,supp)
79
80 S = [m for m in S if m in P_]
81
82 for m in S:
83 for n in sseqs_:
84 conc = (m, ) + n
85 if conc not in l:
86 l.append(conc)
87
88
89
90 return l
91
92
93 def subsequence_to_indices (sseq, seq):
94
95 positions = {}
96 for i in range(0, len(seq)):
97 if seq[i] not in positions:
98 positions[seq[i]] = [i]
99 else:
100 if i not in positions[seq[i]]:
101 positions[seq[i]].append(i)
102
103
104 for sq in sseq:
105 idx = []
106 for s in sq:
107 idx.append(positions[s])
108
109
110 def cartesian_product(*lists):
111 if len(lists) == 2:
112 return [(i,j) for i in lists[0] for j in lists[1]]
113 else:
114 return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
115
116 def possible_sequences(S, minsup, maxgap, K):
117 singletons = set(S)
118
119 # Show appearances
120
121 positions_singletons = {}
122 for i in range(0, len(S)):
123 if S[i] not in positions_singletons:
124 positions_singletons[S[i]] = [i]
125 else:
126 if i not in positions_singletons[S[i]]:
127 positions_singletons[S[i]].append(i)
128
129
130 toremove = []
131 for p in positions_singletons:
132 if len(positions_singletons[p]) < minsup:
133 singletons.remove(p)
134 toremove.append(p)
135
136
137 for i in toremove:
138 positions_singletons.pop(i)
139
140
141 positions = []
142
143 for p in positions_singletons:
144 positions.append(positions_singletons[p])
145 subsequences = []
146
147 for k in range(2, K+1):
148 subsequences_ = []
149
150 sseq = all_subsequences(list(singletons), k)
151
152 for seq in sseq:
153 pos = []
154 for s in seq:
155 pos.append(positions_singletons[s])
156 possible_sequences = cartesian_product(*pos)
157 subsequences_ = subsequences_+possible_sequences
158
159
160
161
162 subsequences = subsequences+subsequences_
163
164
165
166
167
168
169
170
171
172
173
174 subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
175 subsequences_alpha = []
176
177 support = {}
178 for s in subsequences:
179 seq = []
180 for i in s:
181 seq.append(S[i])
182
183 if tuple(seq) not in subsequences_alpha:
184 subsequences_alpha.append(tuple(seq))
185
186 if tuple(seq) in support:
187 support[tuple(seq)] += 1
188 else:
189 support[tuple(seq)] = 1
190
191 gap_sequences = []
192
193 # for s in subsequences:
194 # for i in range(0, len(s)-1):
195 #
196
197
198 return support
199
200 #pseq = possible_sequences(seq, 2, 2, 3)
201 # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
202
203 #print pseq
204
205
206 def find_frequent_subsequences(seq, K, minsup, maxgap):
207 S = seq
208
209 L = []
210
211 # Map each individual symbol to its position in the sequence
212 for k in range(0, len(seq)):
213 L.append((seq[k],k))
214
215
216 # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
217 Lr = {}
218
219 for k in range(0, len(L)):
220 if L[k][0] in Lr:
221 Lr[L[k][0]].append(L[k][1])
222 else:
223 Lr[L[k][0]] = [L[k][1]]
224
225 # Filter out the symbols with less than minsup support
226 Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
227 Lr = Lk
228
229 # Recursively combine->reduce
230 for k in range(1, K):
231 L = []
232 for l1 in Lk:
233 for l2 in Lr:
234
235 # Combine symbol1,..,symbolN-1 and symbolN
236 subseq = tuple(l1)+tuple(l2)
237
238 for v1 in Lk[l1]:
239 for v2 in Lr[l2]:
240 if k == 1:
241 v = [v1, v2]
242 else:
243 v = v1 + [v2]
244
245 # Keep only those that are sorted in order with a maximum gap of maxgap
246 if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
247 L.append((subseq, v))
248
249
250 # Reduce again and filter
251 Lk = {}
252
253 for k in range(0, len(L)):
254 if L[k][0] in Lk:
255 Lk[L[k][0]].append(L[k][1])
256 else:
257 Lk[L[k][0]] = [L[k][1]]
258
259 Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)
260
261 # Return a record with gap symbols and frequent subsequences
262 record = {}
263
264 for lk in Lk:
265 length = len(lk)
266
267
268 gap_symbols = []
269 for i in range(0, len(Lk[lk])):
270 gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
271 if gap:
272 gap_symbols += gap
273
274 record[lk] = (len(Lk[lk]), gap_symbols)
275
276
277
278 return record
279
280
281
282
283
284 print find_frequent_subsequences(seq,5, 4, 2)
285
286