Mercurial > hg > chourdakisreiss2016

# -*- coding: utf-8 -*-
"""
Created on Fri May 15 12:37:53 2015

@author: mmxgn
"""

# Trying VGS algorithm as described in the VOGUE paper
from numpy  import *

S = "ACBDAHCBADFGAIEB"

S = "AGTGATGATGCGAGATGCGAGATCGATGCAGCTA"
# Input Parameters
maxgap = 2
minsup = 2
k = 2

seq = list(S)

# For k = 1

seq_1 = unique(seq) # Unique sequences of length one (also the alphabet)

# For k = 2

# Possible sequences of size 2

seq_2 = [(i,j) for i in seq_1 for j in seq_1]

# Generate positions of symbols
positions = {}
for i in range(0, len(seq)):
    if seq[i] not in positions:
        positions[seq[i]] = [i]
    else:
        if i not in positions[seq[i]]:
            positions[seq[i]].append(i)

def all_subsequences (S, k, supp=1):
   if k == 1:
       l = [m for m in S]

       support = {}

       for m in l:
           if m in support:
               support[m] += 1
           else:
               support[m] = 1


       s = [m for m in l if support[m] >= supp]


       return s


   if k == 2:
       P_ = all_subsequences(S,1,supp)


       S = [m for m in S if m in P_]


       l = [(m,n) for m in S for n in S if m != n]


       return l


   else:
       l = []
       sseqs_ = all_subsequences(S, k-1, supp)
       P_ = all_subsequences(S,1,supp)

       S = [m for m in S if m in P_]

       for m in S:
           for n in sseqs_:
               conc = (m, ) + n
               if conc not in l:
                   l.append(conc)


       return l


def subsequence_to_indices (sseq, seq):

    positions = {}
    for i in range(0, len(seq)):
        if seq[i] not in positions:
            positions[seq[i]] = [i]
        else:
            if i not in positions[seq[i]]:
                positions[seq[i]].append(i)


    for sq in sseq:
        idx = []
        for s in sq:
            idx.append(positions[s])


def cartesian_product(*lists):
    if len(lists) == 2:
        return [(i,j) for i in lists[0] for j in lists[1]]
    else:
        return [(i,)+j for i in lists[0] for j in cartesian_product(*lists[1:])]

def possible_sequences(S, minsup, maxgap, K):
    singletons = set(S)

    # Show appearances

    positions_singletons = {}
    for i in range(0, len(S)):
        if S[i] not in positions_singletons:
            positions_singletons[S[i]] = [i]
        else:
            if i not in positions_singletons[S[i]]:
                positions_singletons[S[i]].append(i)


    toremove = []
    for p in positions_singletons:
        if len(positions_singletons[p]) < minsup:
            singletons.remove(p)
            toremove.append(p)


    for i in toremove:
        positions_singletons.pop(i)


    positions = []

    for p in positions_singletons:
        positions.append(positions_singletons[p])
    subsequences = []

    for k in range(2, K+1):
        subsequences_ = []

        sseq = all_subsequences(list(singletons), k)

        for seq in sseq:
            pos = []
            for s in seq:
                pos.append(positions_singletons[s])
            possible_sequences = cartesian_product(*pos)
            subsequences_ = subsequences_+possible_sequences


        subsequences = subsequences+subsequences_


    subsequences = filter(lambda x: all([x[i]<x[i+1] and x[i+1]-x[i] < maxgap+1 for i in range(0, len(x)-1)]),subsequences)
    subsequences_alpha = []

    support = {}
    for s in subsequences:
        seq = []
        for i in s:
            seq.append(S[i])

        if tuple(seq) not in subsequences_alpha:
            subsequences_alpha.append(tuple(seq))

        if tuple(seq) in support:
            support[tuple(seq)] += 1
        else:
            support[tuple(seq)] = 1

    gap_sequences = []

 #   for s in subsequences:
  #      for i in range(0, len(s)-1):
  #


    return support

#pseq =  possible_sequences(seq, 2, 2, 3)
# PROBLEM, MAXGAP=2 IT SHOULD PURGE SEQUENCES LARGER THAN THAT. filtering does not happen. hmm

#print pseq


def find_frequent_subsequences(seq, K,  minsup, maxgap):
    S = seq

    L = []

    # Map each individual symbol to its position in the sequence
    for k in range(0, len(seq)):
        L.append((seq[k],k))


    # Reduce [(symbol1, position1), (symbol2, position2), ... ] to {symbol1: [position1, position2, ...]}
    Lr = {}

    for k in range(0, len(L)):
        if L[k][0] in Lr:
            Lr[L[k][0]].append(L[k][1])
        else:
            Lr[L[k][0]] = [L[k][1]]

    # Filter out the symbols with less than minsup support
    Lk = dict((k,v) for k,v in Lr.items() if len(v) >= minsup)
    Lr = Lk

    # Recursively combine->reduce
    for k in range(1, K):
        L = []
        for l1 in Lk:
            for l2 in Lr:

                # Combine symbol1,..,symbolN-1 and symbolN
                subseq = tuple(l1)+tuple(l2)

                for v1 in Lk[l1]:
                    for v2 in Lr[l2]:
                        if k == 1:
                            v = [v1, v2]
                        else:
                            v = v1 + [v2]

                        # Keep only those that are sorted in order with a maximum gap  of maxgap
                        if all([v[i] < v[i+1] and v[i+1]-v[i] < maxgap+1 for i in range(0, len(v)-1)]):
                            L.append((subseq, v))


        # Reduce again and filter
        Lk = {}

        for k in range(0, len(L)):
            if L[k][0] in Lk:
                Lk[L[k][0]].append(L[k][1])
            else:
                Lk[L[k][0]] = [L[k][1]]

        Lk = dict((k,v) for k,v in Lk.items() if len(v) >= minsup)

    # Return a record with gap symbols and frequent subsequences
    record = {}

    for lk in Lk:
        length = len(lk)


        gap_symbols = []
        for i in range(0, len(Lk[lk])):
            gap = S[Lk[lk][i][0]+1:Lk[lk][i][1]]
            if gap:
                gap_symbols += gap

        record[lk] = (len(Lk[lk]), gap_symbols)


    return  record


print find_frequent_subsequences(seq,5, 4, 2)
author	Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date	Sat, 30 Sep 2017 13:25:50 +0100
parents	246d5546657c
children