Script to find the longest common subsequence (LCS) of 3+ strings using dynamic programming.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | def calc_cache_pos(strings, indexes):
factor = 1
pos = 0
for s, i in zip(strings, indexes):
pos += i * factor
factor *= len(s)
return pos
def lcs_back(strings, indexes, cache):
if -1 in indexes:
return ""
match = all(strings[0][indexes[0]] == s[i]
for s, i in zip(strings, indexes))
if match:
new_indexes = [i - 1 for i in indexes]
result = lcs_back(strings, new_indexes, cache) + strings[0][indexes[0]]
else:
substrings = [""] * len(strings)
for n in range(len(strings)):
if indexes[n] > 0:
new_indexes = indexes[:]
new_indexes[n] -= 1
cache_pos = calc_cache_pos(strings, new_indexes)
if cache[cache_pos] is None:
substrings[n] = lcs_back(strings, new_indexes, cache)
else:
substrings[n] = cache[cache_pos]
result = max(substrings, key=len)
cache[calc_cache_pos(strings, indexes)] = result
return result
def lcs(strings):
"""
>>> lcs(['666222054263314443712', '5432127413542377777', '6664664565464057425'])
'54442'
>>> lcs(['abacbdab', 'bdcaba', 'cbacaa'])
'baa'
"""
if len(strings) == 0:
return ""
elif len(strings) == 1:
return strings[0]
else:
cache_size = 1
for s in strings:
cache_size *= len(s)
cache = [None] * cache_size
indexes = [len(s) - 1 for s in strings]
return lcs_back(strings, indexes, cache)
|
The LCS of 3+ sequences could be used, for example, for calculating a 3-way diff. This script is a Python translation of a C# program written by Kent Munthe Caspersen: http://stackoverflow.com/a/19544946/393304