chourdakisreiss2016: experiment-reverb/code/vgs.py annotate

annotate experiment-reverb/code/vgs.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files

author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Sat, 30 Sep 2017 13:25:50 +0100
parents	246d5546657c
children

rev	line source
e@0	1 # -- coding: utf-8 --
e@0	2 """
e@0	3 Created on Fri May 15 12:37:53 2015
e@0	4
e@0	5 @author: mmxgn
e@0	6 """
e@0	7
e@0	8 # Trying VGS algorithm as described in the VOGUE paper
e@0	9 from numpy import *
e@0	10
e@0	11 S = "ACBDAHCBADFGAIEB"
e@0	12
e@0	13 S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
e@0	14 # Input Parameters
e@0	15 maxgap = 2
e@0	16 minsup = 2
e@0	17 k = 2
e@0	18
e@0	19 seq = list(S)
e@0	20
e@0	21 # For k = 1
e@0	22
e@0	23 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
e@0	24
e@0	25 # For k = 2
e@0	26
e@0	27 # Possible sequences of size 2
e@0	28
e@0	29 seq_2 = [(i,j) for i in seq_1 for j in seq_1]
e@0	30
e@0	31 # Generate positions of symbols
e@0	32 positions = {}
e@0	33 for i in range(0, len(seq)):
e@0	34 if seq[i] not in positions:
e@0	35 positions[seq[i]] = [i]
e@0	36 else:
e@0	37 if i not in positions[seq[i]]:
e@0	38 positions[seq[i]].append(i)
e@0	39
e@0	40 def all_subsequences (S, k, supp=1):
e@0	41 if k == 1:
e@0	42 l = [m for m in S]
e@0	43
e@0	44 support = {}
e@0	45
e@0	46 for m in l:
e@0	47 if m in support:
e@0	48 support[m] += 1
e@0	49 else:
e@0	50 support[m] = 1
e@0	51
e@0	52
e@0	53
e@0	54
e@0	55 s = [m for m in l if support[m] >= supp]
e@0	56
e@0	57
e@0	58 return s
e@0	59
e@0	60
e@0	61 if k == 2:
e@0	62 P_ = all_subsequences(S,1,supp)
e@0	63
e@0	64
e@0	65 S = [m for m in S if m in P_]
e@0	66
e@0	67
e@0	68 l = [(m,n) for m in S for n in S if m != n]
e@0	69
e@0	70
e@0	71
e@0	72 return l
e@0	73
e@0	74
e@0	75 else:
e@0	76 l = []
e@0	77 sseqs_ = all_subsequences(S, k-1, supp)
e@0	78 P_ = all_subsequences(S,1,supp)
e@0	79
e@0	80 S = [m for m in S if m in P_]
e@0	81
e@0	82 for m in S:
e@0	83 for n in sseqs_:
e@0	84 conc = (m, ) + n
e@0	85 if conc not in l:
e@0	86 l.append(conc)
e@0	87
e@0	88
e@0	89
e@0	90 return l
e@0	91
e@0	92
e@0	93 def subsequence_to_indices (sseq, seq):
e@0	94
e@0	95 positions = {}
e@0	96 for i in range(0, len(seq)):
e@0	97 if seq[i] not in positions:
e@0	98 positions[seq[i]] = [i]
e@0	99 else:
e@0	100 if i not in positions[seq[i]]:
e@0	101 positions[seq[i]].append(i)
e@0	102
e@0	103
e@0	104 for sq in sseq:
e@0	105 idx = []
e@0	106 for s in sq:
e@0	107 idx.append(positions[s])
e@0	108
e@0	109
e@0	110 def cartesian_product(*lists):
e@0	111 if len(lists) == 2:
e@0	112 return [(i,j) for i in lists[0] for j in lists[1]]
e@0	113 else:
e@0	114 return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
e@0	115
e@0	116 def possible_sequences(S, minsup, maxgap, K):
e@0	117 singletons = set(S)
e@0	118
e@0	119 # Show appearances
e@0	120
e@0	121 positions_singletons = {}
e@0	122 for i in range(0, len(S)):
e@0	123 if S[i] not in positions_singletons:
e@0	124 positions_singletons[S[i]] = [i]
e@0	125 else:
e@0	126 if i not in positions_singletons[S[i]]:
e@0	127 positions_singletons[S[i]].append(i)
e@0	128
e@0	129
e@0	130 toremove = []
e@0	131 for p in positions_singletons:
e@0	132 if len(positions_singletons[p]) < minsup:
e@0	133 singletons.remove(p)
e@0	134 toremove.append(p)
e@0	135
e@0	136
e@0	137 for i in toremove:
e@0	138 positions_singletons.pop(i)
e@0	139
e@0	140
e@0	141 positions = []
e@0	142
e@0	143 for p in positions_singletons:
e@0	144 positions.append(positions_singletons[p])
e@0	145 subsequences = []
e@0	146
e@0	147 for k in range(2, K+1):
e@0	148 subsequences_ = []
e@0	149
e@0	150 sseq = all_subsequences(list(singletons), k)
e@0	151
e@0	152 for seq in sseq:
e@0	153 pos = []
e@0	154 for s in seq:
e@0	155 pos.append(positions_singletons[s])
e@0	156 possible_sequences = cartesian_product(*pos)
e@0	157 subsequences_ = subsequences_+possible_sequences
e@0	158
e@0	159
e@0	160
e@0	161
e@0	162 subsequences = subsequences+subsequences_
e@0	163
e@0	164
e@0	165
e@0	166
e@0	167
e@0	168
e@0	169
e@0	170
e@0	171
e@0	172
e@0	173
e@0	174 subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
e@0	175 subsequences_alpha = []
e@0	176
e@0	177 support = {}
e@0	178 for s in subsequences:
e@0	179 seq = []
e@0	180 for i in s:
e@0	181 seq.append(S[i])
e@0	182
e@0	183 if tuple(seq) not in subsequences_alpha:
e@0	184 subsequences_alpha.append(tuple(seq))
e@0	185
e@0	186 if tuple(seq) in support:
e@0	187 support[tuple(seq)] += 1
e@0	188 else:
e@0	189 support[tuple(seq)] = 1
e@0	190
e@0	191 gap_sequences = []
e@0	192
e@0	193 # for s in subsequences:
e@0	194 # for i in range(0, len(s)-1):
e@0	195 #
e@0	196
e@0	197
e@0	198 return support
e@0	199
e@0	200 #pseq = possible_sequences(seq, 2, 2, 3)
e@0	201 # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
e@0	202
e@0	203 #print pseq
e@0	204
e@0	205
e@0	206 def find_frequent_subsequences(seq, K, minsup, maxgap):
e@0	207 S = seq
e@0	208
e@0	209 L = []
e@0	210
e@0	211 # Map each individual symbol to its position in the sequence
e@0	212 for k in range(0, len(seq)):
e@0	213 L.append((seq[k],k))
e@0	214
e@0	215
e@0	216 # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
e@0	217 Lr = {}
e@0	218
e@0	219 for k in range(0, len(L)):
e@0	220 if L[k][0] in Lr:
e@0	221 Lr[L[k][0]].append(L[k][1])
e@0	222 else:
e@0	223 Lr[L[k][0]] = [L[k][1]]
e@0	224
e@0	225 # Filter out the symbols with less than minsup support
e@0	226 Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
e@0	227 Lr = Lk
e@0	228
e@0	229 # Recursively combine->reduce
e@0	230 for k in range(1, K):
e@0	231 L = []
e@0	232 for l1 in Lk:
e@0	233 for l2 in Lr:
e@0	234
e@0	235 # Combine symbol1,..,symbolN-1 and symbolN
e@0	236 subseq = tuple(l1)+tuple(l2)
e@0	237
e@0	238 for v1 in Lk[l1]:
e@0	239 for v2 in Lr[l2]:
e@0	240 if k == 1:
e@0	241 v = [v1, v2]
e@0	242 else:
e@0	243 v = v1 + [v2]
e@0	244
e@0	245 # Keep only those that are sorted in order with a maximum gap of maxgap
e@0	246 if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
e@0	247 L.append((subseq, v))
e@0	248
e@0	249
e@0	250 # Reduce again and filter
e@0	251 Lk = {}
e@0	252
e@0	253 for k in range(0, len(L)):
e@0	254 if L[k][0] in Lk:
e@0	255 Lk[L[k][0]].append(L[k][1])
e@0	256 else:
e@0	257 Lk[L[k][0]] = [L[k][1]]
e@0	258
e@0	259 Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)
e@0	260
e@0	261 # Return a record with gap symbols and frequent subsequences
e@0	262 record = {}
e@0	263
e@0	264 for lk in Lk:
e@0	265 length = len(lk)
e@0	266
e@0	267
e@0	268 gap_symbols = []
e@0	269 for i in range(0, len(Lk[lk])):
e@0	270 gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
e@0	271 if gap:
e@0	272 gap_symbols += gap
e@0	273
e@0	274 record[lk] = (len(Lk[lk]), gap_symbols)
e@0	275
e@0	276
e@0	277
e@0	278 return record
e@0	279
e@0	280
e@0	281
e@0	282
e@0	283
e@0	284 print find_frequent_subsequences(seq,5, 4, 2)
e@0	285
e@0	286

Mercurial > hg > chourdakisreiss2016

annotate experiment-reverb/code/vgs.py @ 2:c87a9505f294 tip