Mercurial > hg > chourdakisreiss2016
view experiment-reverb/code/vgs.py @ 2:c87a9505f294 tip
Added LICENSE for code, removed .wav files
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Sat, 30 Sep 2017 13:25:50 +0100 |
parents | 246d5546657c |
children |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Created on Fri May 15 12:37:53 2015 @author: mmxgn """ # Trying VGS algorithm as described in the VOGUE paper from numpy import * S = "ACBDAHCBADFGAIEB" S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA" # Input Parameters maxgap = 2 minsup = 2 k = 2 seq = list(S) # For k = 1 seq_1 = unique(seq) # Unique sequences of length one (also the alphabet) # For k = 2 # Possible sequences of size 2 seq_2 = [(i,j) for i in seq_1 for j in seq_1] # Generate positions of symbols positions = {} for i in range(0, len(seq)): if seq[i] not in positions: positions[seq[i]] = [i] else: if i not in positions[seq[i]]: positions[seq[i]].append(i) def all_subsequences (S, k, supp=1): if k == 1: l = [m for m in S] support = {} for m in l: if m in support: support[m] += 1 else: support[m] = 1 s = [m for m in l if support[m] >= supp] return s if k == 2: P_ = all_subsequences(S,1,supp) S = [m for m in S if m in P_] l = [(m,n) for m in S for n in S if m != n] return l else: l = [] sseqs_ = all_subsequences(S, k-1, supp) P_ = all_subsequences(S,1,supp) S = [m for m in S if m in P_] for m in S: for n in sseqs_: conc = (m, ) + n if conc not in l: l.append(conc) return l def subsequence_to_indices (sseq, seq): positions = {} for i in range(0, len(seq)): if seq[i] not in positions: positions[seq[i]] = [i] else: if i not in positions[seq[i]]: positions[seq[i]].append(i) for sq in sseq: idx = [] for s in sq: idx.append(positions[s]) def cartesian_product(*lists): if len(lists) == 2: return [(i,j) for i in lists[0] for j in lists[1]] else: return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])] def possible_sequences(S, minsup, maxgap, K): singletons = set(S) # Show appearances positions_singletons = {} for i in range(0, len(S)): if S[i] not in positions_singletons: positions_singletons[S[i]] = [i] else: if i not in positions_singletons[S[i]]: positions_singletons[S[i]].append(i) toremove = [] for p in positions_singletons: if len(positions_singletons[p]) < minsup: singletons.remove(p) toremove.append(p) for i in toremove: positions_singletons.pop(i) positions = [] for p in positions_singletons: positions.append(positions_singletons[p]) subsequences = [] for k in range(2, K+1): subsequences_ = [] sseq = all_subsequences(list(singletons), k) for seq in sseq: pos = [] for s in seq: pos.append(positions_singletons[s]) possible_sequences = cartesian_product(*pos) subsequences_ = subsequences_+possible_sequences subsequences = subsequences+subsequences_ subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences) subsequences_alpha = [] support = {} for s in subsequences: seq = [] for i in s: seq.append(S[i]) if tuple(seq) not in subsequences_alpha: subsequences_alpha.append(tuple(seq)) if tuple(seq) in support: support[tuple(seq)] += 1 else: support[tuple(seq)] = 1 gap_sequences = [] # for s in subsequences: # for i in range(0, len(s)-1): # return support #pseq = possible_sequences(seq, 2, 2, 3) # PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm #print pseq def find_frequent_subsequences(seq, K, minsup, maxgap): S = seq L = [] # Map each individual symbol to its position in the sequence for k in range(0, len(seq)): L.append((seq[k],k)) # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]} Lr = {} for k in range(0, len(L)): if L[k][0] in Lr: Lr[L[k][0]].append(L[k][1]) else: Lr[L[k][0]] = [L[k][1]] # Filter out the symbols with less than minsup support Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup) Lr = Lk # Recursively combine->reduce for k in range(1, K): L = [] for l1 in Lk: for l2 in Lr: # Combine symbol1,..,symbolN-1 and symbolN subseq = tuple(l1)+tuple(l2) for v1 in Lk[l1]: for v2 in Lr[l2]: if k == 1: v = [v1, v2] else: v = v1 + [v2] # Keep only those that are sorted in order with a maximum gap of maxgap if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]): L.append((subseq, v)) # Reduce again and filter Lk = {} for k in range(0, len(L)): if L[k][0] in Lk: Lk[L[k][0]].append(L[k][1]) else: Lk[L[k][0]] = [L[k][1]] Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) # Return a record with gap symbols and frequent subsequences record = {} for lk in Lk: length = len(lk) gap_symbols = [] for i in range(0, len(Lk[lk])): gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]] if gap: gap_symbols += gap record[lk] = (len(Lk[lk]), gap_symbols) return record print find_frequent_subsequences(seq,5, 4, 2)