Shortest Common Supersequence algorithms « Python recipes

The Shortest Common Supersequence (SCS) problem is NP-hard (https://en.wikipedia.org/wiki/Shortest_common_supersequence) and arises in various domains, e.g. bioinformatics. Given a set of strings, the goal is to find a common supersequence of minimal length. Below is a set of algorithms that I used to approximate and/or backtrack the optimal solution(s).

      import random
import copy
import time

def supersequence(x, y):
    """ True if and only if all elements in y occur in x in order (x is a supersequence of y).

        x must support .index(value, start) raising ValueError when the value is absent
        (lists, tuples and strings all do). An empty y is a subsequence of anything. """
    idx = 0
    try:
        for i in y:
            # Continue searching right after the previous match so the order is respected.
            idx = x.index(i, idx)+1
    except ValueError:
        # Some element of y could not be found in the remainder of x.
        return False
    return True


def subsequence(x, y):
    """ Mirror image of supersequence(): the answer to "is x a subsequence of y?"
        is obtained by asking whether y is a supersequence of x. """
    y_contains_x = supersequence(y, x)
    return y_contains_x


def is_supersequence_of_sequences(sequence, sequences):
    """ True when `sequence` is a supersequence of each member of `sequences`
        (vacuously True for an empty collection); stops at the first failure. """
    return all(supersequence(sequence, member) for member in sequences)


def remove_redundant_sequences(redundant_sequences, debug=False):
    """ Removes every doublure (duplicate) sequence and every sequence that is a subsequence of another.

        redundant_sequences: list of sequences; it is not modified.
        debug: when True, a trace of what is removed and the reduced set are printed.
        Returns a new list that references the surviving sequences of the input.

        The print calls pass a single pre-formatted string so the output is
        identical under Python 2 and Python 3. """
    if debug:
        print("Removing redundant sequences:")
    # Pass 1: drop duplicates. A sequence is dropped when an equal one occurs
    # later in the input, so the last occurrence of each duplicate is kept.
    sequences = []
    for i,s in enumerate(redundant_sequences):
        remove = False
        for j in range(i+1, len(redundant_sequences)):
            if s == redundant_sequences[j]:
                remove = True
                if debug:
                    print("  Found doublure      %s == %s" % (s, redundant_sequences[j]))
        if not remove:
            sequences.append(s)
    # Pass 2: drop every sequence that is covered by a different remaining sequence.
    # Duplicates are already gone, so two sequences cannot eliminate each other.
    sequences_p = []
    for i,s1 in enumerate(sequences):
        found = False
        for j,s2 in enumerate(sequences):
            if ((i != j) and (supersequence(s2, s1))):
                found = True
                if debug:
                    print("  Found supersequence %s for %s" % (s2, s1))
                break
        if not found:
            sequences_p.append(s1)
    if debug:
        print("Removed %s redundant sequence(s). Reduced set:"%(len(redundant_sequences)-len(sequences_p)))
        for i,s in enumerate(sequences_p):
            print(" s%s: %s" % (i, s))
    return sequences_p


def lower_bound(scs):
    """ Lower bound on the SCS length for the collection of sequences `scs`:
        the larger of two cheap estimates computed by the helpers below. """

    def pair_scs_length(a, b):
        """ Length of the shortest common supersequence of two sequences,
            by dynamic programming: table[i][j] is the answer for a[:i] and b[:j]. """
        width = len(b) + 1
        # Row 0: the SCS of an empty prefix of a and b[:j] is b[:j] itself.
        table = [list(range(width))]
        for i in range(1, len(a) + 1):
            above = table[-1]
            row = [i]   # column 0: the SCS of a[:i] and an empty prefix of b
            for j in range(1, width):
                best = min(above[j], row[j-1]) + 1
                if a[i-1] == b[j-1]:
                    # equal symbols can be shared by both sequences
                    best = min(best, above[j-1] + 1)
                row.append(best)
            table.append(row)
        return table[-1][-1]

    def largest_pairwise_scs(seqs):
        """ Largest pair_scs_length over all unordered pairs, or -1 when there is no pair.
            This bounds the SCS of the whole set from below but need not equal it. """
        lengths = [pair_scs_length(first, second)
                   for i, first in enumerate(seqs)
                   for second in seqs[i+1:]]
        return max([-1] + lengths)

    def summed_max_occurrences(seqs):
        """ For each symbol of the alphabet take its highest occurrence count in any
            single sequence; the SCS must be at least as long as the sum of these. """
        alphabet = set(v for seq in seqs for v in seq)
        return sum(max(seq.count(v) for seq in seqs) for v in alphabet)

    return max(summed_max_occurrences(scs), largest_pairwise_scs(scs))


def upper_bound(sequences, max_run_time_random_seconds = 1):
    """ Finds an upper bound based on some fast approximation algorithms.

        sequences: list of lists; only deep copies are modified.
        max_run_time_random_seconds: wall-clock budget for the two randomised heuristics,
            which are repeated until the budget is used up.
        Returns the smallest length among: the summed sequence lengths, the majority-merge
        result, the best randomised results, and the shallowest solution depth reported by
        bfs_genetic (run with keep_best=1000)."""

    def consume(seqs, symbol):
        """ Drop `symbol` from the head of every sequence that currently starts with it. """
        for s in seqs:
            if s and s[0] == symbol:
                del s[0]

    def alphabet_leftmost(sequences):
        """ Approximation algorithm by looping through a random permutation on the alphabet """
        seqs = copy.deepcopy(sequences)
        common_supersequence = []
        permutation = list(set(x for s in seqs for x in s))
        random.shuffle(permutation)
        i = 0
        while any(seqs):
            symbol = permutation[i]
            # Only emit the symbol when at least one sequence is waiting for it.
            if any(s and s[0] == symbol for s in seqs):
                consume(seqs, symbol)
                common_supersequence.append(symbol)
            i = (i+1) % len(permutation)
        return common_supersequence

    def alphabet_leftmost_rand(sequences):
        """ Approximation algorithm using random selection on alphabet values. """
        seqs = copy.deepcopy(sequences)
        common_supersequence = []
        while any(seqs):
            firsts = list(set(s[0] for s in seqs if s))
            symbol = random.choice(firsts)
            consume(seqs, symbol)
            common_supersequence.append(symbol)
        return common_supersequence

    def majority_merge(seqs):
        """ Quite fast approximation algorithm (weighted majority merge) """
        sequences = copy.deepcopy(seqs)
        common_supersequence = []
        while any(sequences):
            # Weight of a leading symbol = summed remaining length of the sequences it heads.
            weights = {}
            for s in sequences:
                if s:
                    weights[s[0]] = weights.get(s[0], 0) + len(s)   #use +1 if not weighted MM
            most_common = max(weights, key=weights.get)
            common_supersequence.append(most_common)
            # Exhausted sequences are left in place and skipped by the `if s` tests.
            # Removing them from the list while iterating over it made the loop skip
            # the next sequence, so its matching head was not consumed in that round.
            consume(sequences, most_common)
        return common_supersequence

    # Concatenating all sequences is always a common supersequence.
    trivial = sum(len(s) for s in sequences)
    mm = len(majority_merge(sequences))
    alm, almr = trivial, trivial
    deadline = time.time() + max_run_time_random_seconds
    while time.time() < deadline:
        alm = min(alm, len(alphabet_leftmost(sequences)))
        almr = min(almr, len(alphabet_leftmost_rand(sequences)))
    upperbounds = [trivial, mm, alm, almr]
    solutions = bfs_genetic(sequences, keep_best = 1000, best_so_far = min(upperbounds))
    if solutions:
        # bfs_genetic returns a dict keyed by depth; the smallest key is the shortest length found.
        upperbounds.append(min(solutions))
    return min(upperbounds)


def backtrack_shortest_common_supersequences(sequences, upperbound=None):
    """ backtrack with pruning, then filter out the shortest SCSs (they are not unique!)

        upperbound: optional known upper bound on the SCS length; a falsy value
            (None or 0) is replaced by 10000000.
        Returns the list of minimum-length solutions reported by backtrack_scs,
        or [] when backtrack_scs reports none."""
    if not upperbound:
        upperbound = 10000000
    result = backtrack_scs(sequences, [], upperbound)
    if not result:
        return []
    # Compute the minimum once instead of once per candidate.
    shortest = min(len(y) for y in result)
    return [x for x in result if len(x) == shortest]

def backtrack_all_common_supersequences(sequences):
    """ Collect supersequences of any length (not only the shortest): starts backtrack_scs
        from an empty prefix with the bound 10000000 and prune=False. """
    return backtrack_scs(sequences, [], 10000000, prune=False)

def backtrack_scs(sequences, choices, best_so_far, prune=True):
    """ Recursive DFS algorithm, potentially risky with large #sequences or long s in sequences
        (the recursion depth equals the length of the supersequence being built).

        sequences: indexable sequences (lists, tuples, strings); they are only read.
        choices: list with the symbols chosen so far; every solution starts with it.
        best_so_far: upper bound on the length of acceptable solutions.
        prune: when True, a branch is abandoned as soon as it cannot stay within the bound,
            and the bound is tightened whenever a shorter solution is found. Every shortest
            solution within the initial bound is still returned; longer ones may be cut.
            When False, every branch is explored at every depth.
        Returns a list of common supersequences (lists of symbols) in DFS order."""
    solutions = []
    # One-element list so the nested function can update the bound (works on Python 2 too).
    bound = [best_so_far]
    # positions[i] is the index of the first not-yet-covered element of sequences[i].
    positions = [0] * len(sequences)

    def explore(prefix):
        remaining = [len(s) - p for s, p in zip(sequences, positions)]
        longest = max(remaining) if remaining else 0
        if longest == 0:
            # Every sequence is fully covered: prefix is a common supersequence.
            solutions.append(prefix)
            if len(prefix) < bound[0]:
                bound[0] = len(prefix)
            return
        # no need to evaluate subsolution prefix if it will become longer than the bound:
        # at least `longest` more symbols are required.
        if prune and len(prefix) + longest > bound[0]:
            return
        firsts = set(s[p] for s, p in zip(sequences, positions) if p < len(s))
        for c in firsts:
            advanced = [i for i, s in enumerate(sequences)
                        if positions[i] < len(s) and s[positions[i]] == c]
            for i in advanced:
                positions[i] += 1
            explore(prefix + [c])
            # Undo the step before trying the next candidate symbol.
            for i in advanced:
                positions[i] -= 1

    explore(choices)
    return solutions


def bfs_genetic(sequences, keep_best = 100000000, best_so_far = 100000000):
    """ Breadth first search algorithm. After each level, keeps at most keep_best
        subsolutions (those with the lowest metric()) and discards the rest.

        sequences: sliceable sequences (e.g. lists); they are only read.
        keep_best: maximum number of partial solutions carried over to the next level.
        best_so_far: partial solutions that cannot finish within this length are dropped.
        Returns a dict mapping a solution length (the BFS depth) to the list of
        common supersequences of that length that were found."""
    def metric(state):
        # TODO: find good metric, given current choices and remaining sequences to solve.
        # return sum(len(s) for s in remaining_sequences)
        _choices, remaining_sequences = state
        return len(remove_redundant_sequences(copy.deepcopy(remaining_sequences)))

    def expand(choices, seqs):
        """ One child state per distinct leading symbol; children get fresh lists. """
        children = []
        for f in set(s[0] for s in seqs if s):
            advanced = [s[1:] if s and s[0] == f else s[:] for s in seqs]
            children.append((choices + [f], advanced))
        return children

    solutions = {}
    frontier = expand([], sequences)
    depth = 1   # every state in `frontier` has exactly `depth` choices
    while frontier:
        next_frontier = []
        for choices, seqs in frontier:
            # At least max(len) more symbols are needed to finish this state.
            if len(choices) + max(len(s) for s in seqs) > best_so_far:
                continue
            if any(seqs):
                next_frontier.extend(expand(choices, seqs))
            else:
                solutions.setdefault(depth, []).append(choices)
        if len(next_frontier) > keep_best:
            next_frontier = sorted(next_frontier, key=metric)[:keep_best]
        frontier = next_frontier
        depth += 1
    return solutions


def pretty_print_scs_with_sequences(scs, sequences):
    """ Prints scs on one line, followed by one line per sequence that shows which
        positions of scs that sequence uses (leftmost possible match); unused
        positions are shown as '.'.

        Raises ValueError (from scs.index) when a sequence is not a subsequence of scs.
        Each print call gets a single string, so the output is the same on Python 2 and 3."""
    print("SCS: " + "".join([str(s) for s in scs]))
    for i,aseq in enumerate(sequences):
        # Start from the placeholder itself rather than a -1 marker replaced afterwards,
        # so symbols whose text contains "-1" are displayed unchanged.
        row = ["."]*len(scs)
        idx = 0
        for v in aseq:
            idx = scs.index(v, idx)
            row[idx] = str(v)
            idx += 1
        print(" s%s: "%i + "".join(row))


if __name__=="__main__":
    def run_SCS_algorithms(sequences):
        """ Demo driver: reduce the input set, print a lower and an upper bound,
            backtrack the shortest common supersequences and pretty-print the first one.
            Each print call gets a single string, so the output is the same on Python 2 and 3. """
        sequences = remove_redundant_sequences(sequences, debug=True)
        lowerbound, upperbound = lower_bound(sequences), upper_bound(sequences)
        print("Lowerbound on Shortest Common Supersequence: %s" % (lowerbound,))
        print("Upperbound on Shortest Common Supersequence: %s" % (upperbound,))
        solutions = backtrack_shortest_common_supersequences(sequences, upperbound)
        print("Backtrack found %s SCSs of optimal length %s."%(len(solutions), len(solutions[0])))
        print("E.g. %s is valid: %s" % (solutions[0], is_supersequence_of_sequences(solutions[0], sequences)))
        pretty_print_scs_with_sequences(solutions[0], sequences)

    # Example inputs; symbols are encoded as integers.
    sequences_1 = [ [1,2,3],
                    [1,2,5],
                    [3,1,5,4],
                    [1,2,1,5],
                    [1,2,5]]

    # NOTE(review): the first three rows imply a=1, c=2, g=3, t=4; under that mapping the
    # last row [3,4,1,1,4] spells "gtaat", not "ctaat" - confirm which one is intended.
    sequences_2 = [ [1,2,1,2,3],    # "acacg"
                    [1,4,1,3,1],    # "ataga"
                    [2,1,2,3,4],    # "cacgt"
                    [3,4,1,1,4]]    # "ctaat"

    sequences_3 = [ [5,2,1,6,3,6,3,1],
                    [1,4,1,3,1,5,1,2],
                    [2,1,6,3,4,4,2,3],
                    [3,4,5,1,4,6,1,2]]

    sequences_4 = [ [5,1,3,5,2,6],
                    [1,5,2,1,3,2,3,1],
                    [5,1,3,5,2,1,6,2],
                    [3,5,1,6,2,4,6,1,2]]

    run_SCS_algorithms(sequences_4)

      

SCS is usually applied to DNA strings, i.e. nucleotide sequences such as "ataga" and "cacgt". My code operates on integers; the mapping between symbols and integers is left to the user. Running the above script results in the following output:

Removing redundant sequences:
  Found supersequence [5, 1, 3, 5, 2, 1, 6, 2] for [5, 1, 3, 5, 2, 6]
Removed 1 redundant sequence(s). Reduced set:
 s0: [1, 5, 2, 1, 3, 2, 3, 1]
 s1: [5, 1, 3, 5, 2, 1, 6, 2]
 s2: [3, 5, 1, 6, 2, 4, 6, 1, 2]
Lowerbound on Shortest Common Supersequence: 13
Upperbound on Shortest Common Supersequence: 14
Backtrack found 6 SCSs of optimal length 14.
E.g. [5, 1, 3, 5, 2, 1, 3, 6, 2, 3, 4, 6, 1, 2] is valid: True

SCS: 51352136234612
 s0: .1.5213.23..1.
 s1: 513521.62.....
 s2: ..35.1.62.4612

Tags: approximation, bound, breadth_first_search, common, depth_first_search, sequence, shortest, super

◄	Python recipes (4591)	►
◄	Rutger Saalmink's recipes (4)	►

Shortest Common Supersequence algorithms (Python recipe) by Rutger Saalmink
ActiveState Code (http://code.activestate.com/recipes/578678/)

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Shortest Common Supersequence algorithms (Python recipe) by Rutger Saalmink ActiveState Code (http://code.activestate.com/recipes/578678/)