| 1 |
from itertools import repeat |
|---|
| 2 |
from heapq import * |
|---|
| 3 |
__all__=[ |
|---|
| 4 |
'LEVENSHTEIN_DEFAULT_MAXDIST', |
|---|
| 5 |
|
|---|
| 6 |
'levenshtein_distance', 'levenshtein_selectone', '_levenshtein_select'] |
|---|
| 7 |
|
|---|
| 8 |
LEVENSHTEIN_DEFAULT_MAXDIST = 3 |
|---|
| 9 |
# this constant is simply used to accelerate distance function startup. |
|---|
| 10 |
# NOTE: tests with the profile module indicated the advandages of this were |
|---|
| 11 |
# marginal at best! Exhaustive tests with large data & input sets are TBD. |
|---|
| 12 |
_levenshtein_maxwordrange = 0 |
|---|
| 13 |
_levenshtein_firstcol = [] |
|---|
| 14 |
_levenshtein_firstrow = [] |
|---|
| 15 |
|
|---|
| 16 |
def _levenshtein_accomodate_wordlen(wordlenplusone): |
|---|
| 17 |
global _levenshtein_maxwordrange, _levenshtein_firstcol, _levenshtein_firstrow |
|---|
| 18 |
if wordlenplusone <= _levenshtein_maxwordrange: |
|---|
| 19 |
return _levenshtein_firstcol, _levenshtein_firstrow |
|---|
| 20 |
|
|---|
| 21 |
_levenshtein_firstcol.extend([((i,0),i) |
|---|
| 22 |
for i in xrange(_levenshtein_maxwordrange, wordlenplusone)]) |
|---|
| 23 |
_levenshtein_firstrow.extend([((0,j),j) |
|---|
| 24 |
for j in xrange(_levenshtein_maxwordrange, wordlenplusone)]) |
|---|
| 25 |
_levenshtein_maxwordrange = wordlenplusone |
|---|
| 26 |
return _levenshtein_firstcol, _levenshtein_firstrow |
|---|
| 27 |
|
|---|
| 28 |
|
|---|
| 29 |
def _levenshtein_prepare_dtab(a,b, dtab=None): |
|---|
| 30 |
dtab = dtab or {} |
|---|
| 31 |
dtab.clear() |
|---|
| 32 |
n,m=len(a), len(b) |
|---|
| 33 |
firstcol, firstrow = _levenshtein_accomodate_wordlen( |
|---|
| 34 |
max(n, m) + 1) |
|---|
| 35 |
dtab.update(firstcol[:n+1]) |
|---|
| 36 |
dtab.update(firstrow[:m+1]) |
|---|
| 37 |
return dtab, n, m |
|---|
| 38 |
|
|---|
| 39 |
def levenshtein_distance(a,b): |
|---|
| 40 |
"""Levenshtein word distance algorithm. |
|---|
| 41 |
|
|---|
| 42 |
a is the target word, b is the candidate. return the minimum edit distance |
|---|
| 43 |
from b to a. that is, the minimum number of insertions, deletions and |
|---|
| 44 |
substitutions required to produce a from b. |
|---|
| 45 |
|
|---|
| 46 |
Picked this implementation of the net. However it is described well in: |
|---|
| 47 |
Speech & Langauge Processing: ISBN-X013122798X, Ch 5. pp 154 |
|---|
| 48 |
|
|---|
| 49 |
Abridged from the above reference: |
|---|
| 50 |
|
|---|
| 51 |
This algorithm measures the 'minimum edit distance' between two |
|---|
| 52 |
strings a,b. med(a,b) is defined as the minimum number of editing |
|---|
| 53 |
operations required to transform one string into another. |
|---|
| 54 |
|
|---|
| 55 |
The operations are delete,substitute,insert |
|---|
| 56 |
|
|---|
| 57 |
Levenshtein solves this problem using 'dynamic programming', essentialy a |
|---|
| 58 |
table driven mechanism for solving a problem a bit at a time and |
|---|
| 59 |
accumulating the result. Dynamic programming solutions for sequences in |
|---|
| 60 |
general work by creating a distance matrix with one column for each item in |
|---|
| 61 |
the target sequence and one row for each item in the source - ie., target |
|---|
| 62 |
allong the bottom & source down the side. For computing the minimum edit |
|---|
| 63 |
distance, this matrix is then the edit-distance matrix. Each cell |
|---|
| 64 |
edit-distance[i,j] contains the distance between the first i characters of |
|---|
| 65 |
the target and the first j characters of the source. The value of each |
|---|
| 66 |
cell represents the minimum of the three possible paths through the matrix |
|---|
| 67 |
that arrive there. |
|---|
| 68 |
""" |
|---|
| 69 |
c, n, m = _levenshtein_prepare_dtab(a,b) |
|---|
| 70 |
#c, n, m = {}, len(a), len(b) |
|---|
| 71 |
#c.update([((i,0),i) for i in range(0,n+1)]) |
|---|
| 72 |
#c.update([((0,j),j) for j in range(0,m+1)]) |
|---|
| 73 |
for i in range(1,n+1): |
|---|
| 74 |
for j in range(1,m+1): |
|---|
| 75 |
x = c[i-1,j]+1 |
|---|
| 76 |
y = c[i,j-1]+1 |
|---|
| 77 |
if a[i-1] == b[j-1]: |
|---|
| 78 |
z = c[i-1,j-1] |
|---|
| 79 |
else: |
|---|
| 80 |
z = c[i-1,j-1]+1 |
|---|
| 81 |
c[i,j] = min(x,y,z) |
|---|
| 82 |
return c[n,m] |
|---|
| 83 |
|
|---|
| 84 |
def levenshtein_selectone(asequence, b, |
|---|
| 85 |
dtab=None, maxwordlen = 0, maxdist = LEVENSHTEIN_DEFAULT_MAXDIST): |
|---|
| 86 |
maxdist=min(len(b),maxdist) |
|---|
| 87 |
if len(asequence) == 1: |
|---|
| 88 |
d = levenshtein_distance(asequence[0], b) |
|---|
| 89 |
return d < maxdist and (0,d) or (-1,d) |
|---|
| 90 |
state = _levenshtein_select(asequence, b, dtab, maxwordlen, maxdist) |
|---|
| 91 |
h=state[1] |
|---|
| 92 |
return h[0][0] < maxdist and (h[0][-1], h[0][0]) or (-1, maxdist) # ic,d |
|---|
| 93 |
|
|---|
| 94 |
# performance tests for this routine, versus calling levenshtein_distance, are |
|---|
| 95 |
# still TBD. I beleive that theoretical worst case performance is no better than |
|---|
| 96 |
# levenshtein_distance. small scale tests _did_ show a small advantage. But |
|---|
| 97 |
# probably not enough to warant the considerable additional complexity. n'mind |
|---|
| 98 |
# it was fun to write! |
|---|
| 99 |
def _levenshtein_select(asequence, |
|---|
| 100 |
b, # ignored if state is not None, otherwise |
|---|
| 101 |
dtab = None, # ignored if state is not None |
|---|
| 102 |
maxwordlen = 0, # ignored if state is not None |
|---|
| 103 |
maxdist = LEVENSHTEIN_DEFAULT_MAXDIST, # ignored if state is not None, |
|---|
| 104 |
state = None |
|---|
| 105 |
): |
|---|
| 106 |
|
|---|
| 107 |
b, h, dtabs, maxwordlen, maxdist = state or ( |
|---|
| 108 |
b, None,None, maxwordlen, maxdist) |
|---|
| 109 |
if maxwordlen == 0: |
|---|
| 110 |
#assert state is None, "re-entry state must include original maxwordlen" |
|---|
| 111 |
asequence.append(b) |
|---|
| 112 |
maxwordlen = maxwordlen or reduce(max, map(len, asequence)) |
|---|
| 113 |
asequence.pop() # pop b |
|---|
| 114 |
_levenshtein_accomodate_wordlen(maxwordlen+1) |
|---|
| 115 |
|
|---|
| 116 |
dtabs = dtabs or [ |
|---|
| 117 |
[0, 0, c, n] |
|---|
| 118 |
for c, n, m in map( |
|---|
| 119 |
_levenshtein_prepare_dtab, asequence, repeat( |
|---|
| 120 |
b, len(asequence)), repeat(dtab, len(asequence)))] |
|---|
| 121 |
|
|---|
| 122 |
# obviously, m is invariant for all permutaions of 'asequence' with b |
|---|
| 123 |
m = len(b) |
|---|
| 124 |
#m = not state and m or len(b) |
|---|
| 125 |
|
|---|
| 126 |
# the purpose of 'h' is to enable efficient selection of an alternate match |
|---|
| 127 |
# of b against an item in asequence when the current selection starts to |
|---|
| 128 |
# deteriorate. |
|---|
| 129 |
# |
|---|
| 130 |
# h is a priority queue of the current match score of each item in |
|---|
| 131 |
# 'asequence' against 'b'. it's items are a 3 tuple: (d, m', ic). d is |
|---|
| 132 |
# the 'on diagonal' levenshtein distance of 'asequence[ic]' for m' |
|---|
| 133 |
# characters of b. |
|---|
| 134 |
# |
|---|
| 135 |
# heapq orders tuple keys lexicaly. h[0][1] = m' = characters of b consumed |
|---|
| 136 |
# for the best existing match. hence when h[0][1] = m' = m we know that |
|---|
| 137 |
# there are no further matches in the heap with un consumed characters. |
|---|
| 138 |
|
|---|
| 139 |
h = h or [(0, 0, i) for i in range(len(asequence))] |
|---|
| 140 |
htop = heappop(h) |
|---|
| 141 |
irange = range(1, maxwordlen+1) |
|---|
| 142 |
jrange = range(1, m+1) |
|---|
| 143 |
while htop[0] <= h[0][0] and htop[1] < m: |
|---|
| 144 |
ic = htop[-1] |
|---|
| 145 |
dstart, (istart, jstart, c, n) = htop[0], dtabs[ic] |
|---|
| 146 |
for i in irange[istart:n]: |
|---|
| 147 |
# range of j is len(b) and hence is invariant for all in asequence. |
|---|
| 148 |
for j in jrange[jstart:]: |
|---|
| 149 |
x,y = c[i-1,j]+1, c[i,j-1]+1 |
|---|
| 150 |
if asequence[ic][i-1] == b[j-1]: |
|---|
| 151 |
z = c[i-1,j-1] |
|---|
| 152 |
else: |
|---|
| 153 |
z = c[i-1,j-1]+1 |
|---|
| 154 |
d = c[i,j] = min(x,y,z) |
|---|
| 155 |
if i==n and j == m and d == 0: |
|---|
| 156 |
# * BEST MATCH *, early exit it is not possible to find a |
|---|
| 157 |
# better match. note, however, if the elements of |
|---|
| 158 |
# 'asequence' are not unique terminating here will ignore |
|---|
| 159 |
# equivelent matches. |
|---|
| 160 |
heappush(h, htop) |
|---|
| 161 |
return (b, h, dtabs, maxwordlen, maxdist) |
|---|
| 162 |
if i==j and d > dstart: |
|---|
| 163 |
# for any given word pair the 'on diagonal' distance will |
|---|
| 164 |
# remain constant or deteriorate. |
|---|
| 165 |
#if j < m: |
|---|
| 166 |
# dtabs[ic][0:2]=[i-1, j] |
|---|
| 167 |
#else: |
|---|
| 168 |
# dtabs[ic][0:2]=[i, 0] |
|---|
| 169 |
dtabs[ic][0:2] = j < m and [i-1, j] or [i, 0] |
|---|
| 170 |
# because of i==j above |
|---|
| 171 |
# i*m+h[q][1] >= i*m+j |
|---|
| 172 |
# for q[0:len(asequence)] and j[0:i] |
|---|
| 173 |
|
|---|
| 174 |
htop = heapreplace(h, (d, j, htop[-1])) |
|---|
| 175 |
break |
|---|
| 176 |
jstart = 0 |
|---|
| 177 |
if i==j and d > dstart: |
|---|
| 178 |
break |
|---|
| 179 |
# if we complete a full examination without a deteriation in distance |
|---|
| 180 |
# we need to ensure we cycle the heap. note that without this then |
|---|
| 181 |
# ['apple','apple','apple'], 'param' will not halt. |
|---|
| 182 |
if ic == htop[-1]: |
|---|
| 183 |
htop = heapreplace(h, (d, m, htop[-1])) |
|---|
| 184 |
if htop[0] > maxdist: |
|---|
| 185 |
return (b, h, dtabs, maxwordlen, maxdist) |
|---|
| 186 |
# if we get here there are no 0 distance matches and we have full considered |
|---|
| 187 |
# all permutations that share the best distance. |
|---|
| 188 |
|
|---|
| 189 |
# in the case where all matches have equivelent distance we take the first. |
|---|
| 190 |
# a side effect of the above is that in this case htop is going to be the |
|---|
| 191 |
# last item. |
|---|
| 192 |
# |
|---|
| 193 |
# this tends to be 'what you'd expect'. there is a cost in herent in |
|---|
| 194 |
# providing this. if it turns out not to be appropriate it will be removed. |
|---|
| 195 |
# |
|---|
| 196 |
heappush(h, htop) |
|---|
| 197 |
return (b, h, dtabs, maxwordlen, maxdist) |
|---|
| 198 |
|
|---|