e@0: # -*- coding: utf-8 -*- e@0: """ e@0: Created on Fri May 15 12:37:53 2015 e@0: e@0: @author: mmxgn e@0: """ e@0: e@0: # Trying VGS algorithm as described in the VOGUE paper e@0: from numpy import * e@0: e@0: S = "ACBDAHCBADFGAIEB" e@0: e@0: S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA" e@0: # Input Parameters e@0: maxgap = 2 e@0: minsup = 2 e@0: k = 2 e@0: e@0: seq = list(S) e@0: e@0: # For k = 1 e@0: e@0: seq_1 = unique(seq) # Unique sequences of length one (also the alphabet) e@0: e@0: # For k = 2 e@0: e@0: # Possible sequences of size 2 e@0: e@0: seq_2 = [(i,j) for i in seq_1 for j in seq_1] e@0: e@0: # Generate positions of symbols e@0: positions = {} e@0: for i in range(0, len(seq)): e@0: if seq[i] not in positions: e@0: positions[seq[i]] = [i] e@0: else: e@0: if i not in positions[seq[i]]: e@0: positions[seq[i]].append(i) e@0: e@0: def all_subsequences (S, k, supp=1): e@0: if k == 1: e@0: l = [m for m in S] e@0: e@0: support = {} e@0: e@0: for m in l: e@0: if m in support: e@0: support[m] += 1 e@0: else: e@0: support[m] = 1 e@0: e@0: e@0: e@0: e@0: s = [m for m in l if support[m] >= supp] e@0: e@0: e@0: return s e@0: e@0: e@0: if k == 2: e@0: P_ = all_subsequences(S,1,supp) e@0: e@0: e@0: S = [m for m in S if m in P_] e@0: e@0: e@0: l = [(m,n) for m in S for n in S if m != n] e@0: e@0: e@0: e@0: return l e@0: e@0: e@0: else: e@0: l = [] e@0: sseqs_ = all_subsequences(S, k-1, supp) e@0: P_ = all_subsequences(S,1,supp) e@0: e@0: S = [m for m in S if m in P_] e@0: e@0: for m in S: e@0: for n in sseqs_: e@0: conc = (m, ) + n e@0: if conc not in l: e@0: l.append(conc) e@0: e@0: e@0: e@0: return l e@0: e@0: e@0: def subsequence_to_indices (sseq, seq): e@0: e@0: positions = {} e@0: for i in range(0, len(seq)): e@0: if seq[i] not in positions: e@0: positions[seq[i]] = [i] e@0: else: e@0: if i not in positions[seq[i]]: e@0: positions[seq[i]].append(i) e@0: e@0: e@0: for sq in sseq: e@0: idx = [] e@0: for s in sq: e@0: idx.append(positions[s]) e@0: e@0: e@0: def cartesian_product(*lists): e@0: if len(lists) == 2: e@0: return [(i,j) for i in lists[0] for j in lists[1]] e@0: else: e@0: return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])] e@0: e@0: def possible_sequences(S, minsup, maxgap, K): e@0: singletons = set(S) e@0: e@0: # Show appearances e@0: e@0: positions_singletons = {} e@0: for i in range(0, len(S)): e@0: if S[i] not in positions_singletons: e@0: positions_singletons[S[i]] = [i] e@0: else: e@0: if i not in positions_singletons[S[i]]: e@0: positions_singletons[S[i]].append(i) e@0: e@0: e@0: toremove = [] e@0: for p in positions_singletons: e@0: if len(positions_singletons[p]) < minsup: e@0: singletons.remove(p) e@0: toremove.append(p) e@0: e@0: e@0: for i in toremove: e@0: positions_singletons.pop(i) e@0: e@0: e@0: positions = [] e@0: e@0: for p in positions_singletons: e@0: positions.append(positions_singletons[p]) e@0: subsequences = [] e@0: e@0: for k in range(2, K+1): e@0: subsequences_ = [] e@0: e@0: sseq = all_subsequences(list(singletons), k) e@0: e@0: for seq in sseq: e@0: pos = [] e@0: for s in seq: e@0: pos.append(positions_singletons[s]) e@0: possible_sequences = cartesian_product(*pos) e@0: subsequences_ = subsequences_+possible_sequences e@0: e@0: e@0: e@0: e@0: subsequences = subsequences+subsequences_ e@0: e@0: e@0: e@0: e@0: e@0: e@0: e@0: e@0: e@0: e@0: e@0: subsequences = filter(lambda x: all([x[i]= minsup) e@0: Lr = Lk e@0: e@0: # Recursively combine->reduce e@0: for k in range(1, K): e@0: L = [] e@0: for l1 in Lk: e@0: for l2 in Lr: e@0: e@0: # Combine symbol1,..,symbolN-1 and symbolN e@0: subseq = tuple(l1)+tuple(l2) e@0: e@0: for v1 in Lk[l1]: e@0: for v2 in Lr[l2]: e@0: if k == 1: e@0: v = [v1, v2] e@0: else: e@0: v = v1 + [v2] e@0: e@0: # Keep only those that are sorted in order with a maximum gap of maxgap e@0: if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]): e@0: L.append((subseq, v)) e@0: e@0: e@0: # Reduce again and filter e@0: Lk = {} e@0: e@0: for k in range(0, len(L)): e@0: if L[k][0] in Lk: e@0: Lk[L[k][0]].append(L[k][1]) e@0: else: e@0: Lk[L[k][0]] = [L[k][1]] e@0: e@0: Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) e@0: e@0: # Return a record with gap symbols and frequent subsequences e@0: record = {} e@0: e@0: for lk in Lk: e@0: length = len(lk) e@0: e@0: e@0: gap_symbols = [] e@0: for i in range(0, len(Lk[lk])): e@0: gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]] e@0: if gap: e@0: gap_symbols += gap e@0: e@0: record[lk] = (len(Lk[lk]), gap_symbols) e@0: e@0: e@0: e@0: return record e@0: e@0: e@0: e@0: e@0: e@0: print find_frequent_subsequences(seq,5, 4, 2) e@0: e@0: