annotate experiment-reverb/code/vgs.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
parents 246d5546657c
children
rev   line source
e@0 1 # -*- coding: utf-8 -*-
e@0 2 """
e@0 3 Created on Fri May 15 12:37:53 2015
e@0 4
e@0 5 @author: mmxgn
e@0 6 """
e@0 7
e@0 8 # Trying VGS algorithm as described in the VOGUE paper
e@0 9 from numpy import *
e@0 10
e@0 11 S = "ACBDAHCBADFGAIEB"
e@0 12
e@0 13 S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
e@0 14 # Input Parameters
e@0 15 maxgap = 2
e@0 16 minsup = 2
e@0 17 k = 2
e@0 18
e@0 19 seq = list(S)
e@0 20
e@0 21 # For k = 1
e@0 22
e@0 23 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
e@0 24
e@0 25 # For k = 2
e@0 26
e@0 27 # Possible sequences of size 2
e@0 28
e@0 29 seq_2 = [(i,j) for i in seq_1 for j in seq_1]
e@0 30
e@0 31 # Generate positions of symbols
e@0 32 positions = {}
e@0 33 for i in range(0, len(seq)):
e@0 34 if seq[i] not in positions:
e@0 35 positions[seq[i]] = [i]
e@0 36 else:
e@0 37 if i not in positions[seq[i]]:
e@0 38 positions[seq[i]].append(i)
e@0 39
e@0 40 def all_subsequences (S, k, supp=1):
e@0 41 if k == 1:
e@0 42 l = [m for m in S]
e@0 43
e@0 44 support = {}
e@0 45
e@0 46 for m in l:
e@0 47 if m in support:
e@0 48 support[m] += 1
e@0 49 else:
e@0 50 support[m] = 1
e@0 51
e@0 52
e@0 53
e@0 54
e@0 55 s = [m for m in l if support[m] >= supp]
e@0 56
e@0 57
e@0 58 return s
e@0 59
e@0 60
e@0 61 if k == 2:
e@0 62 P_ = all_subsequences(S,1,supp)
e@0 63
e@0 64
e@0 65 S = [m for m in S if m in P_]
e@0 66
e@0 67
e@0 68 l = [(m,n) for m in S for n in S if m != n]
e@0 69
e@0 70
e@0 71
e@0 72 return l
e@0 73
e@0 74
e@0 75 else:
e@0 76 l = []
e@0 77 sseqs_ = all_subsequences(S, k-1, supp)
e@0 78 P_ = all_subsequences(S,1,supp)
e@0 79
e@0 80 S = [m for m in S if m in P_]
e@0 81
e@0 82 for m in S:
e@0 83 for n in sseqs_:
e@0 84 conc = (m, ) + n
e@0 85 if conc not in l:
e@0 86 l.append(conc)
e@0 87
e@0 88
e@0 89
e@0 90 return l
e@0 91
e@0 92
e@0 93 def subsequence_to_indices (sseq, seq):
e@0 94
e@0 95 positions = {}
e@0 96 for i in range(0, len(seq)):
e@0 97 if seq[i] not in positions:
e@0 98 positions[seq[i]] = [i]
e@0 99 else:
e@0 100 if i not in positions[seq[i]]:
e@0 101 positions[seq[i]].append(i)
e@0 102
e@0 103
e@0 104 for sq in sseq:
e@0 105 idx = []
e@0 106 for s in sq:
e@0 107 idx.append(positions[s])
e@0 108
e@0 109
e@0 110 def cartesian_product(*lists):
e@0 111 if len(lists) == 2:
e@0 112 return [(i,j) for i in lists[0] for j in lists[1]]
e@0 113 else:
e@0 114 return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
e@0 115
e@0 116 def possible_sequences(S, minsup, maxgap, K):
e@0 117 singletons = set(S)
e@0 118
e@0 119 # Show appearances
e@0 120
e@0 121 positions_singletons = {}
e@0 122 for i in range(0, len(S)):
e@0 123 if S[i] not in positions_singletons:
e@0 124 positions_singletons[S[i]] = [i]
e@0 125 else:
e@0 126 if i not in positions_singletons[S[i]]:
e@0 127 positions_singletons[S[i]].append(i)
e@0 128
e@0 129
e@0 130 toremove = []
e@0 131 for p in positions_singletons:
e@0 132 if len(positions_singletons[p]) < minsup:
e@0 133 singletons.remove(p)
e@0 134 toremove.append(p)
e@0 135
e@0 136
e@0 137 for i in toremove:
e@0 138 positions_singletons.pop(i)
e@0 139
e@0 140
e@0 141 positions = []
e@0 142
e@0 143 for p in positions_singletons:
e@0 144 positions.append(positions_singletons[p])
e@0 145 subsequences = []
e@0 146
e@0 147 for k in range(2, K+1):
e@0 148 subsequences_ = []
e@0 149
e@0 150 sseq = all_subsequences(list(singletons), k)
e@0 151
e@0 152 for seq in sseq:
e@0 153 pos = []
e@0 154 for s in seq:
e@0 155 pos.append(positions_singletons[s])
e@0 156 possible_sequences = cartesian_product(*pos)
e@0 157 subsequences_ = subsequences_+possible_sequences
e@0 158
e@0 159
e@0 160
e@0 161
e@0 162 subsequences = subsequences+subsequences_
e@0 163
e@0 164
e@0 165
e@0 166
e@0 167
e@0 168
e@0 169
e@0 170
e@0 171
e@0 172
e@0 173
e@0 174 subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
e@0 175 subsequences_alpha = []
e@0 176
e@0 177 support = {}
e@0 178 for s in subsequences:
e@0 179 seq = []
e@0 180 for i in s:
e@0 181 seq.append(S[i])
e@0 182
e@0 183 if tuple(seq) not in subsequences_alpha:
e@0 184 subsequences_alpha.append(tuple(seq))
e@0 185
e@0 186 if tuple(seq) in support:
e@0 187 support[tuple(seq)] += 1
e@0 188 else:
e@0 189 support[tuple(seq)] = 1
e@0 190
e@0 191 gap_sequences = []
e@0 192
e@0 193 # for s in subsequences:
e@0 194 # for i in range(0, len(s)-1):
e@0 195 #
e@0 196
e@0 197
e@0 198 return support
e@0 199
e@0 200 #pseq = possible_sequences(seq, 2, 2, 3)
e@0 201 # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
e@0 202
e@0 203 #print pseq
e@0 204
e@0 205
e@0 206 def find_frequent_subsequences(seq, K, minsup, maxgap):
e@0 207 S = seq
e@0 208
e@0 209 L = []
e@0 210
e@0 211 # Map each individual symbol to its position in the sequence
e@0 212 for k in range(0, len(seq)):
e@0 213 L.append((seq[k],k))
e@0 214
e@0 215
e@0 216 # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
e@0 217 Lr = {}
e@0 218
e@0 219 for k in range(0, len(L)):
e@0 220 if L[k][0] in Lr:
e@0 221 Lr[L[k][0]].append(L[k][1])
e@0 222 else:
e@0 223 Lr[L[k][0]] = [L[k][1]]
e@0 224
e@0 225 # Filter out the symbols with less than minsup support
e@0 226 Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
e@0 227 Lr = Lk
e@0 228
e@0 229 # Recursively combine->reduce
e@0 230 for k in range(1, K):
e@0 231 L = []
e@0 232 for l1 in Lk:
e@0 233 for l2 in Lr:
e@0 234
e@0 235 # Combine symbol1,..,symbolN-1 and symbolN
e@0 236 subseq = tuple(l1)+tuple(l2)
e@0 237
e@0 238 for v1 in Lk[l1]:
e@0 239 for v2 in Lr[l2]:
e@0 240 if k == 1:
e@0 241 v = [v1, v2]
e@0 242 else:
e@0 243 v = v1 + [v2]
e@0 244
e@0 245 # Keep only those that are sorted in order with a maximum gap of maxgap
e@0 246 if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
e@0 247 L.append((subseq, v))
e@0 248
e@0 249
e@0 250 # Reduce again and filter
e@0 251 Lk = {}
e@0 252
e@0 253 for k in range(0, len(L)):
e@0 254 if L[k][0] in Lk:
e@0 255 Lk[L[k][0]].append(L[k][1])
e@0 256 else:
e@0 257 Lk[L[k][0]] = [L[k][1]]
e@0 258
e@0 259 Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)
e@0 260
e@0 261 # Return a record with gap symbols and frequent subsequences
e@0 262 record = {}
e@0 263
e@0 264 for lk in Lk:
e@0 265 length = len(lk)
e@0 266
e@0 267
e@0 268 gap_symbols = []
e@0 269 for i in range(0, len(Lk[lk])):
e@0 270 gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
e@0 271 if gap:
e@0 272 gap_symbols += gap
e@0 273
e@0 274 record[lk] = (len(Lk[lk]), gap_symbols)
e@0 275
e@0 276
e@0 277
e@0 278 return record
e@0 279
e@0 280
e@0 281
e@0 282
e@0 283
e@0 284 print find_frequent_subsequences(seq,5, 4, 2)
e@0 285
e@0 286