chourdakisreiss2016: experiment-reverb/code/vgs.py comparison

comparison experiment-reverb/code/vgs.py @ 0:246d5546657c

initial commit, needs cleanup

author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Wed, 14 Dec 2016 13:15:48 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:246d5546657c
+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 15 12:37:53 2015
+@author: mmxgn
+"""
+# Trying VGS algorithm as described in the VOGUE paper
+from numpy  import *
+S = "ACBDAHCBADFGAIEB"
+S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
+# Input Parameters
+maxgap = 2
+minsup = 2
+k = 2
+seq = list(S)
+# For k = 1
+seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)
+# For k = 2
+# Possible sequences of size 2
+seq_2 = [(i,j) for i in seq_1 for j in seq_1]
+# Generate positions of symbols
+positions = {}
+for i in range(0, len(seq)):
+if seq[i] not in positions:
+positions[seq[i]] = [i]
+else:
+if i not in positions[seq[i]]:
+positions[seq[i]].append(i)
+def all_subsequences (S, k, supp=1):
+if k == 1:
+l = [m for m in S]
+support = {}
+for m in l:
+if m in support:
+support[m] += 1
+else:
+support[m] = 1
+s = [m for m in l if support[m] >= supp]
+return s
+if k == 2:
+P_ = all_subsequences(S,1,supp)
+S = [m for m in S if m in P_]
+l = [(m,n) for m in S for n in S if m != n]
+return l
+else:
+l = []
+sseqs_ = all_subsequences(S, k-1, supp)
+P_ = all_subsequences(S,1,supp)
+S = [m for m in S if m in P_]
+for m in S:
+for n in sseqs_:
+conc = (m, ) + n
+if conc not in l:
+l.append(conc)
+return l
+def subsequence_to_indices (sseq, seq):
+positions = {}
+for i in range(0, len(seq)):
+if seq[i] not in positions:
+positions[seq[i]] = [i]
+else:
+if i not in positions[seq[i]]:
+positions[seq[i]].append(i)
+for sq in sseq:
+idx = []
+for s in sq:
+idx.append(positions[s])
+def cartesian_product(*lists):
+if len(lists) == 2:
+return [(i,j) for i in lists[0] for j in lists[1]]
+else:
+return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]
+def possible_sequences(S, minsup, maxgap, K):
+singletons = set(S)
+# Show appearances
+positions_singletons = {}
+for i in range(0, len(S)):
+if S[i] not in positions_singletons:
+positions_singletons[S[i]] = [i]
+else:
+if i not in positions_singletons[S[i]]:
+positions_singletons[S[i]].append(i)
+toremove = []
+for p in positions_singletons:
+if len(positions_singletons[p]) < minsup:
+singletons.remove(p)
+toremove.append(p)
+for i in toremove:
+positions_singletons.pop(i)
+positions = []
+for p in positions_singletons:
+positions.append(positions_singletons[p])
+subsequences = []
+for k in range(2, K+1):
+subsequences_ = []
+sseq = all_subsequences(list(singletons), k)
+for seq in sseq:
+pos = []
+for s in seq:
+pos.append(positions_singletons[s])
+possible_sequences = cartesian_product(*pos)
+subsequences_ = subsequences_+possible_sequences
+subsequences = subsequences+subsequences_
+subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
+subsequences_alpha = []
+support = {}
+for s in subsequences:
+seq = []
+for i in s:
+seq.append(S[i])
+if tuple(seq) not in subsequences_alpha:
+subsequences_alpha.append(tuple(seq))
+if tuple(seq) in support:
+support[tuple(seq)] += 1
+else:
+support[tuple(seq)] = 1
+gap_sequences = []
+#   for s in subsequences:
+#      for i in range(0, len(s)-1):
+#
+return support
+#pseq =  possible_sequences(seq, 2, 2, 3)
+# PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm
+#print pseq
+def find_frequent_subsequences(seq, K,  minsup, maxgap):
+S = seq
+L = []
+# Map each individual symbol to its position in the sequence
+for k in range(0, len(seq)):
+L.append((seq[k],k))
+# Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
+Lr = {}
+for k in range(0, len(L)):
+if L[k][0] in Lr:
+Lr[L[k][0]].append(L[k][1])
+else:
+Lr[L[k][0]] = [L[k][1]]
+# Filter out the symbols with less than minsup support
+Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
+Lr = Lk
+# Recursively combine->reduce
+for k in range(1, K):
+L = []
+for l1 in Lk:
+for l2 in Lr:
+# Combine symbol1,..,symbolN-1 and symbolN
+subseq = tuple(l1)+tuple(l2)
+for v1 in Lk[l1]:
+for v2 in Lr[l2]:
+if k == 1:
+v = [v1, v2]
+else:
+v = v1 + [v2]
+# Keep only those that are sorted in order with a maximum gap  of maxgap
+if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
+L.append((subseq, v))
+# Reduce again and filter
+Lk = {}
+for k in range(0, len(L)):
+if L[k][0] in Lk:
+Lk[L[k][0]].append(L[k][1])
+else:
+Lk[L[k][0]] = [L[k][1]]
+Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)
+# Return a record with gap symbols and frequent subsequences
+record = {}
+for lk in Lk:
+length = len(lk)
+gap_symbols = []
+for i in range(0, len(Lk[lk])):
+gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
+if gap:
+gap_symbols += gap
+record[lk] = (len(Lk[lk]), gap_symbols)
+return  record
+print find_frequent_subsequences(seq,5, 4, 2)

Mercurial > hg > chourdakisreiss2016

comparison experiment-reverb/code/vgs.py @ 0:246d5546657c