root/freeform/trunk/freeform/levenshtein.py

Revision 48, 8.4 kB (checked in by robin, 3 years ago)

BUGFIX: force minimum edit distance to min(len(candidate), applicationdefault). NEW: examples. and quoted examples in top level documentaion

Line 
1 from itertools import repeat
2 from heapq import *
3 __all__=[
4     'LEVENSHTEIN_DEFAULT_MAXDIST',
5
6     'levenshtein_distance', 'levenshtein_selectone', '_levenshtein_select']
7
8 LEVENSHTEIN_DEFAULT_MAXDIST = 3
9 # this constant is simply used to accelerate distance function startup.
10 # NOTE: tests with the profile module indicated the advandages of this were
11 # marginal at best! Exhaustive tests with large data & input sets are TBD.
12 _levenshtein_maxwordrange = 0
13 _levenshtein_firstcol = []
14 _levenshtein_firstrow = []
15
16 def _levenshtein_accomodate_wordlen(wordlenplusone):
17     global _levenshtein_maxwordrange, _levenshtein_firstcol, _levenshtein_firstrow
18     if wordlenplusone <= _levenshtein_maxwordrange:
19         return _levenshtein_firstcol, _levenshtein_firstrow
20  
21     _levenshtein_firstcol.extend([((i,0),i)
22         for i in xrange(_levenshtein_maxwordrange, wordlenplusone)])
23     _levenshtein_firstrow.extend([((0,j),j)
24         for j in xrange(_levenshtein_maxwordrange, wordlenplusone)])   
25     _levenshtein_maxwordrange = wordlenplusone
26     return _levenshtein_firstcol, _levenshtein_firstrow
27
28
29 def _levenshtein_prepare_dtab(a,b, dtab=None):
30     dtab = dtab or {}
31     dtab.clear()
32     n,m=len(a), len(b)
33     firstcol, firstrow = _levenshtein_accomodate_wordlen(
34         max(n, m) + 1)
35     dtab.update(firstcol[:n+1])
36     dtab.update(firstrow[:m+1])
37     return dtab, n, m
38
39 def levenshtein_distance(a,b):
40     """Levenshtein word distance algorithm.
41
42     a is the target word, b is the candidate. return the minimum edit distance
43     from b to a. that is, the minimum number of insertions, deletions and
44     substitutions required to produce a from b.
45    
46     Picked this implementation of the net. However it is described well in:
47         Speech & Langauge Processing: ISBN-X013122798X, Ch 5. pp 154
48    
49     Abridged from the above reference:
50    
51     This algorithm measures the 'minimum edit distance' between two
52     strings a,b. med(a,b) is defined as the minimum number of editing
53     operations required to transform one string into another.
54    
55     The operations are delete,substitute,insert
56
57     Levenshtein solves this problem using 'dynamic programming', essentialy a
58     table driven mechanism for solving a problem a bit at a time and
59     accumulating the result. Dynamic programming solutions for sequences in
60     general work by creating a distance matrix with one column for each item in
61     the target sequence and one row for each item in the source - ie., target
62     allong the bottom & source down the side. For computing the minimum edit
63     distance, this matrix is then the edit-distance matrix. Each cell
64     edit-distance[i,j] contains the distance between the first i characters of
65     the target and the first j characters of the source. The value of each
66     cell represents the minimum of the three possible paths through the matrix
67     that arrive there.
68     """
69     c, n, m = _levenshtein_prepare_dtab(a,b)
70     #c, n, m = {}, len(a), len(b)
71     #c.update([((i,0),i) for i in range(0,n+1)])
72     #c.update([((0,j),j) for j in range(0,m+1)])
73     for i in range(1,n+1):
74         for j in range(1,m+1):
75             x = c[i-1,j]+1
76             y = c[i,j-1]+1
77             if a[i-1] == b[j-1]:
78                 z = c[i-1,j-1]
79             else:
80                 z = c[i-1,j-1]+1
81             c[i,j] = min(x,y,z)
82     return c[n,m]
83
84 def levenshtein_selectone(asequence, b,
85     dtab=None, maxwordlen = 0, maxdist = LEVENSHTEIN_DEFAULT_MAXDIST):
86     maxdist=min(len(b),maxdist)
87     if len(asequence) == 1:
88         d = levenshtein_distance(asequence[0], b)
89         return d < maxdist and (0,d) or (-1,d)
90     state = _levenshtein_select(asequence, b, dtab, maxwordlen, maxdist)
91     h=state[1]
92     return h[0][0] < maxdist and (h[0][-1], h[0][0]) or (-1, maxdist) # ic,d
93
94 # performance tests for this routine, versus calling levenshtein_distance, are
95 # still TBD. I beleive that theoretical worst case performance is no better than
96 # levenshtein_distance. small scale tests _did_ show a small advantage. But
97 # probably not enough to warant the considerable additional complexity. n'mind
98 # it was fun to write!
99 def _levenshtein_select(asequence,
100         b,                  # ignored if state is not None, otherwise 
101         dtab = None,        # ignored if state is not None
102         maxwordlen = 0,     # ignored if state is not None
103         maxdist = LEVENSHTEIN_DEFAULT_MAXDIST, # ignored if state is not None,
104         state = None
105         ):
106    
107     b, h, dtabs, maxwordlen, maxdist = state or (
108         b, None,None, maxwordlen, maxdist)
109     if maxwordlen == 0:
110         #assert state is None, "re-entry state must include original maxwordlen"
111         asequence.append(b)
112         maxwordlen = maxwordlen or reduce(max, map(len, asequence))
113         asequence.pop() # pop b
114         _levenshtein_accomodate_wordlen(maxwordlen+1)
115        
116     dtabs = dtabs or [
117         [0, 0, c, n]
118         for c, n, m in map(
119             _levenshtein_prepare_dtab, asequence, repeat(
120                 b, len(asequence)), repeat(dtab, len(asequence)))]       
121        
122     # obviously, m is invariant for all permutaions of 'asequence' with b
123     m = len(b)
124     #m = not state and m or len(b)
125    
126     # the purpose of 'h' is to enable efficient selection of an alternate match
127     # of b against an item in asequence when the current selection starts to
128     # deteriorate.
129     #
130     # h is a priority queue of the current match score of each item in
131     # 'asequence' against 'b'.  it's items are a 3 tuple: (d, m', ic).  d is
132     # the 'on diagonal' levenshtein distance of 'asequence[ic]' for m'
133     # characters of b.
134     #
135     # heapq orders tuple keys lexicaly. h[0][1] = m' = characters of b consumed
136     # for the best existing match. hence when h[0][1] = m' = m we know that
137     # there are no further matches in the heap with un consumed characters.
138
139     h = h or [(0, 0, i) for i in range(len(asequence))]
140     htop = heappop(h)
141     irange = range(1, maxwordlen+1)
142     jrange = range(1, m+1)
143     while htop[0] <= h[0][0] and htop[1] < m:
144         ic = htop[-1]
145         dstart, (istart, jstart, c, n) = htop[0], dtabs[ic]
146         for i in irange[istart:n]:
147             # range of j is len(b) and hence is invariant for all in asequence.
148             for j in jrange[jstart:]:
149                 x,y = c[i-1,j]+1, c[i,j-1]+1
150                 if asequence[ic][i-1] == b[j-1]:
151                     z = c[i-1,j-1]
152                 else:
153                     z = c[i-1,j-1]+1
154                 d = c[i,j] = min(x,y,z)
155                 if i==n and j == m and d == 0:
156                     # * BEST MATCH *, early exit it is not possible to find a
157                     # better match.  note, however, if the elements of
158                     # 'asequence' are not unique terminating here will ignore
159                     # equivelent matches.
160                     heappush(h, htop)
161                     return (b, h, dtabs, maxwordlen, maxdist)
162                 if i==j and d > dstart:
163                     # for any given word pair the 'on diagonal' distance will
164                     # remain constant or deteriorate.
165                     #if j < m:
166                     #    dtabs[ic][0:2]=[i-1, j]
167                     #else:
168                     #    dtabs[ic][0:2]=[i, 0]
169                     dtabs[ic][0:2] = j < m and [i-1, j] or [i, 0]
170                     # because of i==j above
171                     #   i*m+h[q][1] >= i*m+j
172                     #   for q[0:len(asequence)] and j[0:i]
173                    
174                     htop = heapreplace(h, (d, j, htop[-1]))
175                     break
176             jstart = 0
177             if i==j and d > dstart:
178                 break
179         # if we complete a full examination without a deteriation in distance
180         # we need to ensure we cycle the heap. note that without this then
181         # ['apple','apple','apple'], 'param' will not halt.
182         if ic == htop[-1]:
183             htop = heapreplace(h, (d, m, htop[-1]))
184         if htop[0] > maxdist:
185             return (b, h, dtabs, maxwordlen, maxdist)
186     # if we get here there are no 0 distance matches and we have full considered
187     # all permutations that share the best distance.
188    
189     # in the case where all matches have equivelent distance we take the first.
190     # a side effect of the above is that in this case htop is going to be the
191     # last item.
192     #
193     # this tends to be 'what you'd expect'. there is a cost in herent in
194     # providing this. if it turns out not to be appropriate it will be removed.
195     #
196     heappush(h, htop)
197     return (b, h, dtabs, maxwordlen, maxdist)
198
Note: See TracBrowser for help on using the browser.