Mercurial > hg > chourdakisreiss2016
diff experiment-reverb/code/vgs.py @ 0:246d5546657c
initial commit, needs cleanup
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Wed, 14 Dec 2016 13:15:48 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/experiment-reverb/code/vgs.py Wed Dec 14 13:15:48 2016 +0000 @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri May 15 12:37:53 2015 + +@author: mmxgn +""" + +# Trying VGS algorithm as described in the VOGUE paper +from numpy import * + +S = "ACBDAHCBADFGAIEB" + +S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA" +# Input Parameters +maxgap = 2 +minsup = 2 +k = 2 + +seq = list(S) + +# For k = 1 + +seq_1 = unique(seq) # Unique sequences of length one (also the alphabet) + +# For k = 2 + +# Possible sequences of size 2 + +seq_2 = [(i,j) for i in seq_1 for j in seq_1] + +# Generate positions of symbols +positions = {} +for i in range(0, len(seq)): + if seq[i] not in positions: + positions[seq[i]] = [i] + else: + if i not in positions[seq[i]]: + positions[seq[i]].append(i) + +def all_subsequences (S, k, supp=1): + if k == 1: + l = [m for m in S] + + support = {} + + for m in l: + if m in support: + support[m] += 1 + else: + support[m] = 1 + + + + + s = [m for m in l if support[m] >= supp] + + + return s + + + if k == 2: + P_ = all_subsequences(S,1,supp) + + + S = [m for m in S if m in P_] + + + l = [(m,n) for m in S for n in S if m != n] + + + + return l + + + else: + l = [] + sseqs_ = all_subsequences(S, k-1, supp) + P_ = all_subsequences(S,1,supp) + + S = [m for m in S if m in P_] + + for m in S: + for n in sseqs_: + conc = (m, ) + n + if conc not in l: + l.append(conc) + + + + return l + + +def subsequence_to_indices (sseq, seq): + + positions = {} + for i in range(0, len(seq)): + if seq[i] not in positions: + positions[seq[i]] = [i] + else: + if i not in positions[seq[i]]: + positions[seq[i]].append(i) + + + for sq in sseq: + idx = [] + for s in sq: + idx.append(positions[s]) + + +def cartesian_product(*lists): + if len(lists) == 2: + return [(i,j) for i in lists[0] for j in lists[1]] + else: + return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])] + +def possible_sequences(S, minsup, maxgap, K): + singletons = set(S) + + # Show appearances + + positions_singletons = {} + for i in range(0, len(S)): + if S[i] not in positions_singletons: + positions_singletons[S[i]] = [i] + else: + if i not in positions_singletons[S[i]]: + positions_singletons[S[i]].append(i) + + + toremove = [] + for p in positions_singletons: + if len(positions_singletons[p]) < minsup: + singletons.remove(p) + toremove.append(p) + + + for i in toremove: + positions_singletons.pop(i) + + + positions = [] + + for p in positions_singletons: + positions.append(positions_singletons[p]) + subsequences = [] + + for k in range(2, K+1): + subsequences_ = [] + + sseq = all_subsequences(list(singletons), k) + + for seq in sseq: + pos = [] + for s in seq: + pos.append(positions_singletons[s]) + possible_sequences = cartesian_product(*pos) + subsequences_ = subsequences_+possible_sequences + + + + + subsequences = subsequences+subsequences_ + + + + + + + + + + + + subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences) + subsequences_alpha = [] + + support = {} + for s in subsequences: + seq = [] + for i in s: + seq.append(S[i]) + + if tuple(seq) not in subsequences_alpha: + subsequences_alpha.append(tuple(seq)) + + if tuple(seq) in support: + support[tuple(seq)] += 1 + else: + support[tuple(seq)] = 1 + + gap_sequences = [] + + # for s in subsequences: + # for i in range(0, len(s)-1): + # + + + return support + +#pseq = possible_sequences(seq, 2, 2, 3) +# PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm + +#print pseq + + +def find_frequent_subsequences(seq, K, minsup, maxgap): + S = seq + + L = [] + + # Map each individual symbol to its position in the sequence + for k in range(0, len(seq)): + L.append((seq[k],k)) + + + # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]} + Lr = {} + + for k in range(0, len(L)): + if L[k][0] in Lr: + Lr[L[k][0]].append(L[k][1]) + else: + Lr[L[k][0]] = [L[k][1]] + + # Filter out the symbols with less than minsup support + Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup) + Lr = Lk + + # Recursively combine->reduce + for k in range(1, K): + L = [] + for l1 in Lk: + for l2 in Lr: + + # Combine symbol1,..,symbolN-1 and symbolN + subseq = tuple(l1)+tuple(l2) + + for v1 in Lk[l1]: + for v2 in Lr[l2]: + if k == 1: + v = [v1, v2] + else: + v = v1 + [v2] + + # Keep only those that are sorted in order with a maximum gap of maxgap + if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]): + L.append((subseq, v)) + + + # Reduce again and filter + Lk = {} + + for k in range(0, len(L)): + if L[k][0] in Lk: + Lk[L[k][0]].append(L[k][1]) + else: + Lk[L[k][0]] = [L[k][1]] + + Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) + + # Return a record with gap symbols and frequent subsequences + record = {} + + for lk in Lk: + length = len(lk) + + + gap_symbols = [] + for i in range(0, len(Lk[lk])): + gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]] + if gap: + gap_symbols += gap + + record[lk] = (len(Lk[lk]), gap_symbols) + + + + return record + + + + + +print find_frequent_subsequences(seq,5, 4, 2) + +