e@0
|
1 # -*- coding: utf-8 -*-
|
e@0
|
2 """
|
e@0
|
3 Created on Fri May 15 12:37:53 2015
|
e@0
|
4
|
e@0
|
5 @author: mmxgn
|
e@0
|
6 """
|
e@0
|
7
|
e@0
|
8 # Trying VGS algorithm as described in the VOGUE paper
|
e@0
|
9 from numpy import *
|
e@0
|
10
|
e@0
|
11 S = "ACBDAHCBADFGAIEB"
|
e@0
|
12
|
e@0
|
13 S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
|
e@0
|
14 # Input Parameters
|
e@0
|
15 maxgap = 2
|
e@0
|
16 minsup = 2
|
e@0
|
17 k = 2
|
e@0
|
18
|
e@0
|
19 seq = list(S)
|
e@0
|
20
|
e@0
|
21 # For k = 1
|
e@0
|
22
|
e@0
|
23 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
|
e@0
|
24
|
e@0
|
25 # For k = 2
|
e@0
|
26
|
e@0
|
27 # Possible sequences of size 2
|
e@0
|
28
|
e@0
|
29 seq_2 = [(i,j) for i in seq_1 for j in seq_1]
|
e@0
|
30
|
e@0
|
31 # Generate positions of symbols
|
e@0
|
32 positions = {}
|
e@0
|
33 for i in range(0, len(seq)):
|
e@0
|
34 if seq[i] not in positions:
|
e@0
|
35 positions[seq[i]] = [i]
|
e@0
|
36 else:
|
e@0
|
37 if i not in positions[seq[i]]:
|
e@0
|
38 positions[seq[i]].append(i)
|
e@0
|
39
|
e@0
|
40 def all_subsequences (S, k, supp=1):
|
e@0
|
41 if k == 1:
|
e@0
|
42 l = [m for m in S]
|
e@0
|
43
|
e@0
|
44 support = {}
|
e@0
|
45
|
e@0
|
46 for m in l:
|
e@0
|
47 if m in support:
|
e@0
|
48 support[m] += 1
|
e@0
|
49 else:
|
e@0
|
50 support[m] = 1
|
e@0
|
51
|
e@0
|
52
|
e@0
|
53
|
e@0
|
54
|
e@0
|
55 s = [m for m in l if support[m] >= supp]
|
e@0
|
56
|
e@0
|
57
|
e@0
|
58 return s
|
e@0
|
59
|
e@0
|
60
|
e@0
|
61 if k == 2:
|
e@0
|
62 P_ = all_subsequences(S,1,supp)
|
e@0
|
63
|
e@0
|
64
|
e@0
|
65 S = [m for m in S if m in P_]
|
e@0
|
66
|
e@0
|
67
|
e@0
|
68 l = [(m,n) for m in S for n in S if m != n]
|
e@0
|
69
|
e@0
|
70
|
e@0
|
71
|
e@0
|
72 return l
|
e@0
|
73
|
e@0
|
74
|
e@0
|
75 else:
|
e@0
|
76 l = []
|
e@0
|
77 sseqs_ = all_subsequences(S, k-1, supp)
|
e@0
|
78 P_ = all_subsequences(S,1,supp)
|
e@0
|
79
|
e@0
|
80 S = [m for m in S if m in P_]
|
e@0
|
81
|
e@0
|
82 for m in S:
|
e@0
|
83 for n in sseqs_:
|
e@0
|
84 conc = (m, ) + n
|
e@0
|
85 if conc not in l:
|
e@0
|
86 l.append(conc)
|
e@0
|
87
|
e@0
|
88
|
e@0
|
89
|
e@0
|
90 return l
|
e@0
|
91
|
e@0
|
92
|
e@0
|
93 def subsequence_to_indices (sseq, seq):
|
e@0
|
94
|
e@0
|
95 positions = {}
|
e@0
|
96 for i in range(0, len(seq)):
|
e@0
|
97 if seq[i] not in positions:
|
e@0
|
98 positions[seq[i]] = [i]
|
e@0
|
99 else:
|
e@0
|
100 if i not in positions[seq[i]]:
|
e@0
|
101 positions[seq[i]].append(i)
|
e@0
|
102
|
e@0
|
103
|
e@0
|
104 for sq in sseq:
|
e@0
|
105 idx = []
|
e@0
|
106 for s in sq:
|
e@0
|
107 idx.append(positions[s])
|
e@0
|
108
|
e@0
|
109
|
e@0
|
110 def cartesian_product(*lists):
|
e@0
|
111 if len(lists) == 2:
|
e@0
|
112 return [(i,j) for i in lists[0] for j in lists[1]]
|
e@0
|
113 else:
|
e@0
|
114 return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
|
e@0
|
115
|
e@0
|
116 def possible_sequences(S, minsup, maxgap, K):
|
e@0
|
117 singletons = set(S)
|
e@0
|
118
|
e@0
|
119 # Show appearances
|
e@0
|
120
|
e@0
|
121 positions_singletons = {}
|
e@0
|
122 for i in range(0, len(S)):
|
e@0
|
123 if S[i] not in positions_singletons:
|
e@0
|
124 positions_singletons[S[i]] = [i]
|
e@0
|
125 else:
|
e@0
|
126 if i not in positions_singletons[S[i]]:
|
e@0
|
127 positions_singletons[S[i]].append(i)
|
e@0
|
128
|
e@0
|
129
|
e@0
|
130 toremove = []
|
e@0
|
131 for p in positions_singletons:
|
e@0
|
132 if len(positions_singletons[p]) < minsup:
|
e@0
|
133 singletons.remove(p)
|
e@0
|
134 toremove.append(p)
|
e@0
|
135
|
e@0
|
136
|
e@0
|
137 for i in toremove:
|
e@0
|
138 positions_singletons.pop(i)
|
e@0
|
139
|
e@0
|
140
|
e@0
|
141 positions = []
|
e@0
|
142
|
e@0
|
143 for p in positions_singletons:
|
e@0
|
144 positions.append(positions_singletons[p])
|
e@0
|
145 subsequences = []
|
e@0
|
146
|
e@0
|
147 for k in range(2, K+1):
|
e@0
|
148 subsequences_ = []
|
e@0
|
149
|
e@0
|
150 sseq = all_subsequences(list(singletons), k)
|
e@0
|
151
|
e@0
|
152 for seq in sseq:
|
e@0
|
153 pos = []
|
e@0
|
154 for s in seq:
|
e@0
|
155 pos.append(positions_singletons[s])
|
e@0
|
156 possible_sequences = cartesian_product(*pos)
|
e@0
|
157 subsequences_ = subsequences_+possible_sequences
|
e@0
|
158
|
e@0
|
159
|
e@0
|
160
|
e@0
|
161
|
e@0
|
162 subsequences = subsequences+subsequences_
|
e@0
|
163
|
e@0
|
164
|
e@0
|
165
|
e@0
|
166
|
e@0
|
167
|
e@0
|
168
|
e@0
|
169
|
e@0
|
170
|
e@0
|
171
|
e@0
|
172
|
e@0
|
173
|
e@0
|
174 subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
|
e@0
|
175 subsequences_alpha = []
|
e@0
|
176
|
e@0
|
177 support = {}
|
e@0
|
178 for s in subsequences:
|
e@0
|
179 seq = []
|
e@0
|
180 for i in s:
|
e@0
|
181 seq.append(S[i])
|
e@0
|
182
|
e@0
|
183 if tuple(seq) not in subsequences_alpha:
|
e@0
|
184 subsequences_alpha.append(tuple(seq))
|
e@0
|
185
|
e@0
|
186 if tuple(seq) in support:
|
e@0
|
187 support[tuple(seq)] += 1
|
e@0
|
188 else:
|
e@0
|
189 support[tuple(seq)] = 1
|
e@0
|
190
|
e@0
|
191 gap_sequences = []
|
e@0
|
192
|
e@0
|
193 # for s in subsequences:
|
e@0
|
194 # for i in range(0, len(s)-1):
|
e@0
|
195 #
|
e@0
|
196
|
e@0
|
197
|
e@0
|
198 return support
|
e@0
|
199
|
e@0
|
200 #pseq = possible_sequences(seq, 2, 2, 3)
|
e@0
|
201 # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
|
e@0
|
202
|
e@0
|
203 #print pseq
|
e@0
|
204
|
e@0
|
205
|
e@0
|
206 def find_frequent_subsequences(seq, K, minsup, maxgap):
|
e@0
|
207 S = seq
|
e@0
|
208
|
e@0
|
209 L = []
|
e@0
|
210
|
e@0
|
211 # Map each individual symbol to its position in the sequence
|
e@0
|
212 for k in range(0, len(seq)):
|
e@0
|
213 L.append((seq[k],k))
|
e@0
|
214
|
e@0
|
215
|
e@0
|
216 # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
|
e@0
|
217 Lr = {}
|
e@0
|
218
|
e@0
|
219 for k in range(0, len(L)):
|
e@0
|
220 if L[k][0] in Lr:
|
e@0
|
221 Lr[L[k][0]].append(L[k][1])
|
e@0
|
222 else:
|
e@0
|
223 Lr[L[k][0]] = [L[k][1]]
|
e@0
|
224
|
e@0
|
225 # Filter out the symbols with less than minsup support
|
e@0
|
226 Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
|
e@0
|
227 Lr = Lk
|
e@0
|
228
|
e@0
|
229 # Recursively combine->reduce
|
e@0
|
230 for k in range(1, K):
|
e@0
|
231 L = []
|
e@0
|
232 for l1 in Lk:
|
e@0
|
233 for l2 in Lr:
|
e@0
|
234
|
e@0
|
235 # Combine symbol1,..,symbolN-1 and symbolN
|
e@0
|
236 subseq = tuple(l1)+tuple(l2)
|
e@0
|
237
|
e@0
|
238 for v1 in Lk[l1]:
|
e@0
|
239 for v2 in Lr[l2]:
|
e@0
|
240 if k == 1:
|
e@0
|
241 v = [v1, v2]
|
e@0
|
242 else:
|
e@0
|
243 v = v1 + [v2]
|
e@0
|
244
|
e@0
|
245 # Keep only those that are sorted in order with a maximum gap of maxgap
|
e@0
|
246 if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
|
e@0
|
247 L.append((subseq, v))
|
e@0
|
248
|
e@0
|
249
|
e@0
|
250 # Reduce again and filter
|
e@0
|
251 Lk = {}
|
e@0
|
252
|
e@0
|
253 for k in range(0, len(L)):
|
e@0
|
254 if L[k][0] in Lk:
|
e@0
|
255 Lk[L[k][0]].append(L[k][1])
|
e@0
|
256 else:
|
e@0
|
257 Lk[L[k][0]] = [L[k][1]]
|
e@0
|
258
|
e@0
|
259 Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)
|
e@0
|
260
|
e@0
|
261 # Return a record with gap symbols and frequent subsequences
|
e@0
|
262 record = {}
|
e@0
|
263
|
e@0
|
264 for lk in Lk:
|
e@0
|
265 length = len(lk)
|
e@0
|
266
|
e@0
|
267
|
e@0
|
268 gap_symbols = []
|
e@0
|
269 for i in range(0, len(Lk[lk])):
|
e@0
|
270 gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
|
e@0
|
271 if gap:
|
e@0
|
272 gap_symbols += gap
|
e@0
|
273
|
e@0
|
274 record[lk] = (len(Lk[lk]), gap_symbols)
|
e@0
|
275
|
e@0
|
276
|
e@0
|
277
|
e@0
|
278 return record
|
e@0
|
279
|
e@0
|
280
|
e@0
|
281
|
e@0
|
282
|
e@0
|
283
|
e@0
|
284 print find_frequent_subsequences(seq,5, 4, 2)
|
e@0
|
285
|
e@0
|
286
|