view experiment-reverb/code/vgs.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
parents 246d5546657c
children
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Fri May 15 12:37:53 2015

@author: mmxgn
"""

# Trying VGS algorithm as described in the VOGUE paper
from numpy  import *

S = "ACBDAHCBADFGAIEB"

S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
# Input Parameters
maxgap = 2
minsup = 2
k = 2

seq = list(S)

# For k = 1

seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)

# For k = 2

# Possible sequences of size 2

seq_2 = [(i,j) for i in seq_1 for j in seq_1]

# Generate positions of symbols
positions = {}
for i in range(0, len(seq)):
    if seq[i] not in positions:
        positions[seq[i]] = [i]
    else:
        if i not in positions[seq[i]]:
            positions[seq[i]].append(i)
            
def all_subsequences (S, k, supp=1):
   if k == 1:
       l = [m for m in S]
       
       support = {}
       
       for m in l:
           if m in support:
               support[m] += 1
           else:
               support[m] = 1
               
               

               
       s = [m for m in l if support[m] >= supp]
 
       
       return s
       
        
   if k == 2:
       P_ = all_subsequences(S,1,supp)
       
       
       S = [m for m in S if m in P_]
               
               
       l = [(m,n) for m in S for n in S if m != n]
       
       
           
       return l
       
       
   else:
       l = []
       sseqs_ = all_subsequences(S, k-1, supp)
       P_ = all_subsequences(S,1,supp)
       
       S = [m for m in S if m in P_]
       
       for m in S:
           for n in sseqs_:
               conc = (m, ) + n
               if conc not in l:
                   l.append(conc)
                       
               
    
       return l
               
       
def subsequence_to_indices (sseq, seq):
    
    positions = {}
    for i in range(0, len(seq)):
        if seq[i] not in positions:
            positions[seq[i]] = [i]
        else:
            if i not in positions[seq[i]]:
                positions[seq[i]].append(i)    
    
    
    for sq in sseq:
        idx = []
        for s in sq:
            idx.append(positions[s])
            
            
def cartesian_product(*lists):
    if len(lists) == 2:
        return [(i,j) for i in lists[0] for j in lists[1]]
    else:
        return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]

def possible_sequences(S, minsup, maxgap, K):
    singletons = set(S)
    
    # Show appearances
    
    positions_singletons = {}
    for i in range(0, len(S)):
        if S[i] not in positions_singletons:
            positions_singletons[S[i]] = [i]
        else:
            if i not in positions_singletons[S[i]]:
                positions_singletons[S[i]].append(i)     
                
                
    toremove = []
    for p in positions_singletons:
        if len(positions_singletons[p]) < minsup:
            singletons.remove(p)
            toremove.append(p)
            
            
    for i in toremove:
        positions_singletons.pop(i)
        
        
    positions = []
    
    for p in positions_singletons:
        positions.append(positions_singletons[p])
    subsequences = []
    
    for k in range(2, K+1):
        subsequences_ = []
    
        sseq = all_subsequences(list(singletons), k)
        
        for seq in sseq:
            pos = []
            for s in seq:
                pos.append(positions_singletons[s])
            possible_sequences = cartesian_product(*pos)
            subsequences_ = subsequences_+possible_sequences
            
        

            
        subsequences = subsequences+subsequences_
        
        
                    
        
            
        
  
      
    
    

    subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
    subsequences_alpha = []
    
    support = {}
    for s in subsequences:
        seq = []
        for i in s:
            seq.append(S[i])
        
        if tuple(seq) not in subsequences_alpha:
            subsequences_alpha.append(tuple(seq))
            
        if tuple(seq) in support:
            support[tuple(seq)] += 1
        else:
            support[tuple(seq)] = 1
        
    gap_sequences = []
    
 #   for s in subsequences:
  #      for i in range(0, len(s)-1):
  #          
            
    
    return support
    
#pseq =  possible_sequences(seq, 2, 2, 3)
# PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm

#print pseq


def find_frequent_subsequences(seq, K,  minsup, maxgap):
    S = seq
    
    L = []
    
    # Map each individual symbol to its position in the sequence
    for k in range(0, len(seq)):
        L.append((seq[k],k))
        
        
    # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
    Lr = {}
    
    for k in range(0, len(L)):
        if L[k][0] in Lr:
            Lr[L[k][0]].append(L[k][1])
        else:
            Lr[L[k][0]] = [L[k][1]]
            
    # Filter out the symbols with less than minsup support     
    Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
    Lr = Lk
    
    # Recursively combine->reduce        
    for k in range(1, K):
        L = []
        for l1 in Lk:
            for l2 in Lr:

                # Combine symbol1,..,symbolN-1 and symbolN
                subseq = tuple(l1)+tuple(l2)
                
                for v1 in Lk[l1]:
                    for v2 in Lr[l2]:
                        if k == 1:
                            v = [v1, v2]
                        else:
                            v = v1 + [v2]
                        
                        # Keep only those that are sorted in order with a maximum gap  of maxgap
                        if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
                            L.append((subseq, v))
                            
                            
        # Reduce again and filter                               
        Lk = {}
        
        for k in range(0, len(L)):
            if L[k][0] in Lk:
                Lk[L[k][0]].append(L[k][1])
            else:
                Lk[L[k][0]] = [L[k][1]]
                
        Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) 
    
    # Return a record with gap symbols and frequent subsequences    
    record = {}
    
    for lk in Lk:
        length = len(lk)
        
        
        gap_symbols = []
        for i in range(0, len(Lk[lk])):
            gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
            if gap:
                gap_symbols += gap
        
        record[lk] = (len(Lk[lk]), gap_symbols)
            
        
        
    return  record


        


print find_frequent_subsequences(seq,5, 4, 2)