Mercurial > hg > chourdakisreiss2016

diff experiment-reverb/code/vgs.py @ 0:246d5546657c
initial commit, needs cleanup
author: Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date: Wed, 14 Dec 2016 13:15:48 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experiment-reverb/code/vgs.py	Wed Dec 14 13:15:48 2016 +0000
@@ -0,0 +1,286 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 15 12:37:53 2015
+
+@author: mmxgn
+"""
+
+# Trying VGS algorithm as described in the VOGUE paper
+from numpy  import *
+
+S = "ACBDAHCBADFGAIEB"
+
+S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
+# Input Parameters
+maxgap = 2
+minsup = 2
+k = 2
+
+seq = list(S)
+
+# For k = 1
+
+seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
+
+# For k = 2
+
+# Possible sequences of size 2
+
+seq_2 = [(i,j) for i in seq_1 for j in seq_1]
+
+# Generate positions of symbols
+positions = {}
+for i in range(0, len(seq)):
+    if seq[i] not in positions:
+        positions[seq[i]] = [i]
+    else:
+        if i not in positions[seq[i]]:
+            positions[seq[i]].append(i)
+            
+def all_subsequences (S, k, supp=1):
+   if k == 1:
+       l = [m for m in S]
+       
+       support = {}
+       
+       for m in l:
+           if m in support:
+               support[m] += 1
+           else:
+               support[m] = 1
+               
+               
+
+               
+       s = [m for m in l if support[m] >= supp]
+ 
+       
+       return s
+       
+        
+   if k == 2:
+       P_ = all_subsequences(S,1,supp)
+       
+       
+       S = [m for m in S if m in P_]
+               
+               
+       l = [(m,n) for m in S for n in S if m != n]
+       
+       
+           
+       return l
+       
+       
+   else:
+       l = []
+       sseqs_ = all_subsequences(S, k-1, supp)
+       P_ = all_subsequences(S,1,supp)
+       
+       S = [m for m in S if m in P_]
+       
+       for m in S:
+           for n in sseqs_:
+               conc = (m, ) + n
+               if conc not in l:
+                   l.append(conc)
+                       
+               
+    
+       return l
+               
+       
+def subsequence_to_indices (sseq, seq):
+    
+    positions = {}
+    for i in range(0, len(seq)):
+        if seq[i] not in positions:
+            positions[seq[i]] = [i]
+        else:
+            if i not in positions[seq[i]]:
+                positions[seq[i]].append(i)    
+    
+    
+    for sq in sseq:
+        idx = []
+        for s in sq:
+            idx.append(positions[s])
+            
+            
+def cartesian_product(*lists):
+    if len(lists) == 2:
+        return [(i,j) for i in lists[0] for j in lists[1]]
+    else:
+        return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
+
+def possible_sequences(S, minsup, maxgap, K):
+    singletons = set(S)
+    
+    # Show appearances
+    
+    positions_singletons = {}
+    for i in range(0, len(S)):
+        if S[i] not in positions_singletons:
+            positions_singletons[S[i]] = [i]
+        else:
+            if i not in positions_singletons[S[i]]:
+                positions_singletons[S[i]].append(i)     
+                
+                
+    toremove = []
+    for p in positions_singletons:
+        if len(positions_singletons[p]) < minsup:
+            singletons.remove(p)
+            toremove.append(p)
+            
+            
+    for i in toremove:
+        positions_singletons.pop(i)
+        
+        
+    positions = []
+    
+    for p in positions_singletons:
+        positions.append(positions_singletons[p])
+    subsequences = []
+    
+    for k in range(2, K+1):
+        subsequences_ = []
+    
+        sseq = all_subsequences(list(singletons), k)
+        
+        for seq in sseq:
+            pos = []
+            for s in seq:
+                pos.append(positions_singletons[s])
+            possible_sequences = cartesian_product(*pos)
+            subsequences_ = subsequences_+possible_sequences
+            
+        
+
+            
+        subsequences = subsequences+subsequences_
+        
+        
+                    
+        
+            
+        
+  
+      
+    
+    
+
+    subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
+    subsequences_alpha = []
+    
+    support = {}
+    for s in subsequences:
+        seq = []
+        for i in s:
+            seq.append(S[i])
+        
+        if tuple(seq) not in subsequences_alpha:
+            subsequences_alpha.append(tuple(seq))
+            
+        if tuple(seq) in support:
+            support[tuple(seq)] += 1
+        else:
+            support[tuple(seq)] = 1
+        
+    gap_sequences = []
+    
+ #   for s in subsequences:
+  #      for i in range(0, len(s)-1):
+  #          
+            
+    
+    return support
+    
+#pseq =  possible_sequences(seq, 2, 2, 3)
+# PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
+
+#print pseq
+
+
+def find_frequent_subsequences(seq, K,  minsup, maxgap):
+    S = seq
+    
+    L = []
+    
+    # Map each individual symbol to its position in the sequence
+    for k in range(0, len(seq)):
+        L.append((seq[k],k))
+        
+        
+    # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
+    Lr = {}
+    
+    for k in range(0, len(L)):
+        if L[k][0] in Lr:
+            Lr[L[k][0]].append(L[k][1])
+        else:
+            Lr[L[k][0]] = [L[k][1]]
+            
+    # Filter out the symbols with less than minsup support     
+    Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
+    Lr = Lk
+    
+    # Recursively combine->reduce        
+    for k in range(1, K):
+        L = []
+        for l1 in Lk:
+            for l2 in Lr:
+
+                # Combine symbol1,..,symbolN-1 and symbolN
+                subseq = tuple(l1)+tuple(l2)
+                
+                for v1 in Lk[l1]:
+                    for v2 in Lr[l2]:
+                        if k == 1:
+                            v = [v1, v2]
+                        else:
+                            v = v1 + [v2]
+                        
+                        # Keep only those that are sorted in order with a maximum gap  of maxgap
+                        if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
+                            L.append((subseq, v))
+                            
+                            
+        # Reduce again and filter                               
+        Lk = {}
+        
+        for k in range(0, len(L)):
+            if L[k][0] in Lk:
+                Lk[L[k][0]].append(L[k][1])
+            else:
+                Lk[L[k][0]] = [L[k][1]]
+                
+        Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup) 
+    
+    # Return a record with gap symbols and frequent subsequences    
+    record = {}
+    
+    for lk in Lk:
+        length = len(lk)
+        
+        
+        gap_symbols = []
+        for i in range(0, len(Lk[lk])):
+            gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
+            if gap:
+                gap_symbols += gap
+        
+        record[lk] = (len(Lk[lk]), gap_symbols)
+            
+        
+        
+    return  record
+
+
+        
+
+
+print find_frequent_subsequences(seq,5, 4, 2)
+
+
author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Wed, 14 Dec 2016 13:15:48 +0000
parents
children