Welcome, guest | Sign In | My Account | Store | Cart

be kind and comment, especially if you downvote

levenshtein_distance() is an implementation of the iterative algorithm for the levenshtein distance (cf. http://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_full_matrix)

levenshtein_sequence() is an attempt to retrieve one of the levenshtein paths (the one that give priority to substitution, deletion, insertion, in this order). The result is a list of tuples made of:

  1. the operation ( =, -, +, * for respectively keep, delete, insert, substitute)
  2. the coordinate in the first
  3. and in the second string.
>>> levenshtein_sequence('saturday', 'sunday')  
[('=', 0, 0), ('-', 1, 0), ('-', 2, 0), ('=', 3, 1), ('*', 4, 2), ('=', 5, 3), ('=', 6, 4), ('=', 7, 5)]
>>> levenshtein_sequence('kitten', 'sitting')
[('*', 0, 0), ('=', 1, 1), ('=', 2, 2), ('=', 3, 3), ('*', 4, 4), ('=', 5, 5), ('+', 5, 6)]

This code is part of foreplays, in a plan I have to improve difflib with alternative SequenceMatchers \o/

/!\ tab indented, as usual.

Python, 85 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3

def square_list(a, b, value=0) :
	return [[value,] * b for j in range(a)]

def arg_min(* arg_list) :
	arg_s = None
	for i, arg in enumerate(arg_list) :
		if i == 0 or arg < arg_s :
			arg_s = arg
			i_s = i
	return i_s, arg_s

MODIFIED = 0
DELETED = 1
CREATED = 2

def levenshtein_distance(a, b) :
	""" return the levenshtein distance between two strings of list of """

	len_a = len(a)
	len_b = len(b)
	
	d = square_list(len_a+1, len_b+1)
	
	for i in range(1, len_a+1) :
		d[i][0] = i
	for j in range(1, len_b+1) :
		d[0][j] = j
		
	for j in range(1, len_b+1) :
		for i in range(1, len_a+1) :
			if a[i-1] == b[j-1] :
				d[i][j] = d[i-1][j-1]
			else :
				d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1]) + 1
	return d[-1][-1]

def levenshtein_sequence(a, b) :
	""" return an explicit list of difference between a and b """

	len_a = len(a)
	len_b = len(b)
	
	s = list()
	
	d = square_list(len_a+1, len_b+1)
	
	for i in range(1, len_a+1) :
		d[i][0] = i
	for j in range(1, len_b+1) :
		d[0][j] = j
	
	for j in range(1, len_b+1) :
		for i in range(1, len_a+1) :
			if a[i-1] == b[j-1] :
				d[i][j] = d[i-1][j-1]
			else :
				d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1]) + 1
				
	prev_i, prev_j = i, j
	while i > 0 and j > 0 :
		if i == 1 and j == 1 :
			if prev_i != i and prev_j != j :
				u = MODIFIED
			elif prev_i == i :
				u = CREATED
			elif prev_j == j :
				u = DELETED
			new_i, new_j = i-1, j-1
		elif i == 1 :
			new_i, new_j = i, j-1
			u = CREATED
		elif j == 1 :
			u = DELETED
			new_i, new_j = i-1, j
		else :
			u, null = arg_min(d[i-1][j-1], d[i-1][j], d[i][j-1])
			new_i, new_j = i - (1,1,0)[u], j - (1,0,1)[u]
		op = '*-+'[u] if d[i][j] != d[new_i][new_j] else '='
		s.append((op, i-1, j-1))
		prev_i, prev_j = i, j
		i, j = new_i, new_j
		
	return list(reversed(s))
Created by yota on Wed, 15 Jan 2014 (GPL3)
Python recipes (4591)
yota's recipes (13)

Required Modules

  • (none specified)

Other Information and Tasks

  • Licensed under the GPL 3
  • Viewed 7822 times
  • Revision 3 (updated 10 years ago)