root/freeform/trunk/freeform/match.py

Revision 156, 13.3 kB (checked in by robin, 2 years ago)

tweaked the param list regex, added yield_commands

Line 
1 from itertools import repeat
2 from heapq import *
3 from levenshtein import *
4 from pprint import pprint # for _debug_current_production
5
6 __all__=[
7     'formtable_prepare', 'match_scentence', 'match_command',   
8     # 'match_scentence_noplurals', needs fixing for latest formtable_prepare
9    
10     'FIELDTYPE_INVALID', 'FIELDTYPE_UNKNOWN', 'FIELDTYPE_KEYWORD',
11     'FIELDTYPE_WORD', 'FIELDTYPE_WORDS', 'FIELDTYPE_MENU',
12     'FIELDTYPE_LIST', 'FIELDTYPE_LISTMENU',
13    
14     'FIELDTYPE_FIRST','FIELDTYPE_LAST',
15    
16     'fieldtype_flavour',
17
18     'FIELDTYPE_FLAVOUR_FLAG_SINGULAR', 'FIELDTYPE_FLAVOUR_FLAG_PLURAL',
19     'FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE',
20     'FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION',
21     'FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL',
22    
23     'FIELDTYPE_FLAVOUR_FLAG_COUNT',
24
25     'repr_fieldtype', 'repr_fieldtype_compact', 'repr_fieldtype_flavour',
26     'repr_fieldtype_flavour_compact'
27 ]
28    
29 FIELDTYPE_INVALID=-1
30 FIELDTYPE_UNKNOWN=0
31 FIELDTYPE_KEYWORD=1
32 FIELDTYPE_WORD=2
33 FIELDTYPE_WORDS=3
34 FIELDTYPE_MENU=4
35 FIELDTYPE_LIST=5
36 FIELDTYPE_LISTMENU=6
37 FIELDTYPE_FIRST=FIELDTYPE_UNKNOWN
38 FIELDTYPE_LAST=FIELDTYPE_LISTMENU
39
40 _fieldtype_reprtab=dict([
41     (FIELDTYPE_INVALID,'INVALID'),
42     (FIELDTYPE_UNKNOWN, 'UNKNOWN'),
43     (FIELDTYPE_KEYWORD, 'KEYWORD'),
44     (FIELDTYPE_WORD, 'WORD'),
45     (FIELDTYPE_WORDS, 'WORDS'),
46     (FIELDTYPE_MENU, 'MENU'),
47     (FIELDTYPE_LIST, 'LIST'),
48     (FIELDTYPE_LISTMENU, 'LISTMENU')])
49
50 _fieldtype_reprtab_compact=dict([
51     (FIELDTYPE_INVALID,'IN'),
52     (FIELDTYPE_UNKNOWN, 'UN'),
53     (FIELDTYPE_KEYWORD, 'KE'),
54     (FIELDTYPE_WORD, 'WO'),
55     (FIELDTYPE_WORDS, 'WW'),
56     (FIELDTYPE_MENU, 'ME'),
57     (FIELDTYPE_LIST, 'LI'),
58     (FIELDTYPE_LISTMENU, 'LM')])
59        
60 def repr_fieldtype(typecode):
61     return "FIELDTYPE_%s" % _fieldtype_reprtab.get(typecode, 'INVALID')
62 def repr_fieldtype_compact(typecode):
63     return _fieldtype_reprtab_compact.get(typecode, 'IN')
64
65 FIELDTYPE_FLAVOUR_FLAG_SINGULAR = 1<<0
66 FIELDTYPE_FLAVOUR_FLAG_PLURAL = 1<<1
67 FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE = 1<<2
68 FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION = 1<<3
69 FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL = 1<<4
70 FIELDTYPE_FLAVOUR_FLAG_COUNT = 5
71
72 _fieldtype_flavour_reprtab = dict([
73     (FIELDTYPE_FLAVOUR_FLAG_SINGULAR,
74         'SINGULAR'),
75     (FIELDTYPE_FLAVOUR_FLAG_PLURAL,
76         'PLURAL'),
77     (FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE,
78         'SELECTSINGLE'),
79     (FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION,
80         'VALUE_ENUMERATION'),
81     (FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL,
82         'VALUE_LABEL')])
83
84 _fieldtype_flavour_reprtab_compact = dict([
85     (FIELDTYPE_FLAVOUR_FLAG_SINGULAR,
86         'SIN'),
87     (FIELDTYPE_FLAVOUR_FLAG_PLURAL,
88         'PLU'),
89     (FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE,
90         'VSELS'),
91     (FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION,
92         'VENU'),
93     (FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL,
94         'VLAB')])
95    
96 def repr_fieldtype_flavour(flags):
97     return "|".join([
98             "FIELDTYPE_FLAVOUR_FLAG_%s" % _fieldtype_flavour_reprtab[(1<<i)]
99         for i in range(0, FIELDTYPE_FLAVOUR_FLAG_COUNT)
100         if (1 << i) & flags])
101    
102 def repr_fieldtype_flavour_compact(flags):
103     if flags is None:
104         return 'None'
105     compact = "|".join([
106             _fieldtype_flavour_reprtab_compact[(1<<i)]
107         for i in range(0, FIELDTYPE_FLAVOUR_FLAG_COUNT)
108         if (1 << i) & flags])
109     return compact
110    
111 fieldtype_flavour=dict([
112     (FIELDTYPE_INVALID, -1),
113     (FIELDTYPE_UNKNOWN, 0),
114     (FIELDTYPE_KEYWORD,
115         FIELDTYPE_FLAVOUR_FLAG_SINGULAR),
116     (FIELDTYPE_WORD,
117         FIELDTYPE_FLAVOUR_FLAG_SINGULAR),
118     (FIELDTYPE_WORDS,
119         FIELDTYPE_FLAVOUR_FLAG_PLURAL),
120     (FIELDTYPE_MENU,
121         FIELDTYPE_FLAVOUR_FLAG_SINGULAR |
122         FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE |
123         FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL),
124     (FIELDTYPE_LIST,
125         FIELDTYPE_FLAVOUR_FLAG_SINGULAR |
126         FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE |
127         FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION),
128     (FIELDTYPE_LISTMENU,
129         FIELDTYPE_FLAVOUR_FLAG_SINGULAR |
130         FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE |
131         FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION |
132         FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL)])
133
134
135
136 def match_scentence(formtable, words):
137     """match a scentence, uses backtracking to handle plurals."""
138
139     wordcount = len(words)
140     maxfieldcount = formtable['maxfieldcount']
141     formcount = formtable['formcount']
142     fieldcount = min(maxfieldcount, wordcount)
143    
144     formfieldtypes = formtable['form2fieldtypes']
145     fieldformidmatchsequence = formtable['fieldformidmatchsequence']
146     fieldmatchformids = formtable['fieldmatchformids']
147     fieldexactmatchformids = formtable['fieldexactmatchformids']
148        
149     ipluralstart = -1
150     iword = 0
151     match=[]
152     ifield=0
153     candidateforms = [formid for formid in range(0, formcount)]
154     nextfieldforms = None
155     backtrack=[]
156     while ifield != fieldcount and iword < len(words):
157         matched=None
158         word=words[iword]
159         nextfieldforms = fieldexactmatchformids[ifield].get(
160             word, None)
161         #if word == 'x':
162         #    print "*" * 30
163         if not nextfieldforms:
164             # TODO: accelerate the asequence construction for first field
165             # by precomputing.
166             asequence = []
167             [asequence.extend(fieldformidmatchsequence[ifield][formid])
168                 for formid in candidateforms]
169             if asequence:
170                 asequence[:]=dict.fromkeys(asequence).keys()
171                 # TODO: we could implement levenshtein_selectbest to select the
172                 # best _equivelent_ matches (wrt edit distance) and use it here
173                 # instead. This would (correctly) defer full judgement on
174                 # ambiguity resolution to this algorithm. I'm just being lazy
175                 # for now.
176                 ic,d = levenshtein_selectone(asequence, word)
177                 #if word == 'x':
178                 #    print "%s:x, ic=%s,d=%s" % (str(asequence), ic,d)
179                 if ic > -1:
180                     word = asequence[ic] # *! take the corrected word
181                     nextfieldforms = fieldmatchformids[ifield][word]
182         if nextfieldforms:
183             match.append(word)
184             if ipluralstart > -1:
185                 if backtrack:
186                     backtrack[-1][1]=iword+1
187                     backtrack[-1][3]=candidateforms
188                 ipluralstart = -1
189             # else: to handle phrases that start with WW, we'd have to setup a
190             # backtracking context here.
191             ## candidates
192             candidateforms = [formid for formid in candidateforms
193                             if formid in nextfieldforms]           
194             ifield += 1
195         else:
196             # assume WO or WW match, candidates are all the remaining forms
197             # with words at the current position.           
198             if ipluralstart == -1:
199                 for formid in candidateforms:
200                     if formfieldtypes[formid][ifield] == FIELDTYPE_WORDS:
201                         match.append([])
202                         ipluralstart=iword
203                         ifield+=1 # lookahead
204                         backtrack.append([
205                             ifield, iword, ipluralstart,
206                             candidateforms])
207                         break
208             if ipluralstart > -1:
209                 match[-1].append(word)
210             else: # take it as a singular word, WO
211                 candidateforms = [
212                     formid for formid in candidateforms
213                     if formfieldtypes[formid][ifield]==FIELDTYPE_WORD]
214                 if candidateforms:
215                     match.append(word)
216                 ifield += 1
217         iword += 1
218         if not candidateforms:
219             if backtrack:
220                 (ifield, iword, ipluralstart, candidateforms
221                     ) = backtrack.pop()
222                 match = match[:ifield]
223                 # if ipluralstart == iword, the backtrack entry is redundant
224                 # but we are about to fall out the main loop anyway. hence the
225                 # iword-1 case.
226                 match[-1].append(
227                     ipluralstart == iword and words[iword] or words[iword-1])
228             else: break
229         #_debug_current_production(match, candidateforms, formfieldtypes,
230         #    ifield-1,ifield,iword,ipluralstart,fieldcount-1)
231     # if the last field was a plural, make sure we collect all the trailing
232     # words.
233     if ipluralstart > -1:
234         match[-1].extend(words[iword:])
235     return match, candidateforms
236
237 def match_command(formtable, scentence):
238     match, forms = match_scentence(formtable, scentence)
239     if not match or not forms:
240         return None, forms
241     if len(forms) > 1:
242         return match, forms
243     form=forms[0]
244     cmd = formtable['form2command'][form]
245     valuemap = {}
246     for type,name,value in zip(
247         formtable['form2fieldtypes'][form],
248         formtable['form2fieldnames'][form],
249         match):
250         if type != FIELDTYPE_KEYWORD:
251             valuemap.setdefault(name, []).append(value)
252     return (cmd,form), valuemap
253
254 def yield_commands(ft, source):
255     """Match and yield all commands in ``source``.
256
257     yield is (lineno,((command,formid),valuemap))
258
259     * '#' indicates the start of a comment.
260     * '#' can be excaped by prefixing with '\'.
261     * multiple escapes are not collapsed:
262         in 'foo\#bar#baz' and 'foo\\#bar#baz' #baz is the comment part
263     * exactly one escape char is striped before matching:
264        'foo\#bar#baz' becomes 'foo#bar', and
265        'foo\\#bar#baz' becomes 'foo\#bar'
266     * \-continuation is not supported.
267
268     """
269     for lineno, i in enumerate(source.split('\n')):
270         ln=[]
271         for j in i.split('#'):
272             ln.append(j)
273             if not j:
274                 break
275             if j[-1]!='\\':
276                 break
277             ln.pop()
278             ln.append(j[:-1]+'#')
279         ln=''.join(ln)
280         if ln:
281             yield lineno+1, match_command(ft, ln.split())
282  
283 def _debug_current_production(matched, candidatenextformids, formfieldtypes,
284     ifield, ifieldnext, iword, ipluralstart, ilastfield):
285     """This function carries a heavy buyer beware badge. Tweak it if you need to,
286     and call from match_scentence to get an idea of how the algorithm works."""
287
288     pprint([
289         matched, 'if=%s, ifn=%s, iw=%s, iwss=%s' % (ifield, ifieldnext,iword,ipluralstart),
290         candidatenextformids,
291         ' '.join(map(repr_fieldtype_compact,
292              [formfieldtypes[formid][ifield]
293               for formid in candidatenextformids])),
294         ' '.join(map(repr_fieldtype_compact, ifield < ilastfield and [
295             formfieldtypes[formid][ifieldnext]
296             for formid in candidatenextformids
297         ]
298         or []))
299         ])
300
301 def formtable_prepare(formtable):
302     """Prepare a formtable for use in match_ xxx above.
303    
304     Don't call repeatedly on the same table!
305
306     This seperation is preparing the way for dynamic asequence construction,
307     ie support for convenient dynamicaly  fild list and listmenu fields.
308     """
309
310     formcount = formtable['formcount']
311     maxfieldcount = formtable['maxfieldcount']
312     fieldformtypes = formtable['field2fieldtypes']
313     fieldselectsequences = formtable['field2selectvalues']
314
315     # prepare tables to enable 'on the fly' asequence construction based on
316     # the remaining formids at the current match field.
317     #
318     #asequence = []
319     #[asequence.extend(fieldformidmatchsequence[ifield][formid])
320     #    for formid in candidates]
321     # ic,d = levenshtein(asequence, word)
322     # nextforms = fieldmatchformids[ifield][asequence[ic]]
323
324     # menu: [ a, b, c]
325     # keyword: [matchword]
326     # list: [matchworda, matchwordb, ...]
327     # listmenu: [[matchworda, matchwordb, ... ], [a, b, ... ]]
328     fieldformidmatchsequence = []
329     fieldmatchformids = []
330     fieldexactmatchformids = []
331     for ifield in range(0, maxfieldcount):
332         fieldformidmatchsequence.append([])
333         fieldmatchformids.append({})
334         fieldexactmatchformids.append({})
335         for formid in range(0, formcount):
336             fieldformidmatchsequence[ifield].append([])
337             if fieldformtypes[ifield][formid] in [
338                 FIELDTYPE_KEYWORD, FIELDTYPE_LIST]:
339                 asequence=fieldselectsequences[ifield][formid]
340                 fieldformidmatchsequence[ifield][formid]=asequence
341                 for match in asequence:
342                     fieldmatchformids[ifield].setdefault(match, []).append(formid)
343             elif fieldformtypes[ifield][formid] is FIELDTYPE_LISTMENU:
344                 asequence=fieldselectsequences[ifield][formid][0]
345                 fieldformidmatchsequence[ifield][formid]=asequence
346                 for match in asequence:
347                     fieldmatchformids[ifield].setdefault(match, []).append(formid)
348                    
349                 exactsequence = fieldselectsequences[ifield][formid][1]
350                 for match in exactsequence:
351                     fieldexactmatchformids[ifield].setdefault(match, []).append(formid)
352
353             elif fieldformtypes[ifield][formid] is FIELDTYPE_MENU:
354                 exactsequence = fieldselectsequences[ifield][formid]
355                 for match in exactsequence:
356                     fieldexactmatchformids[ifield].setdefault(match, []).append(formid)
357     formtable['fieldformidmatchsequence'] = fieldformidmatchsequence
358     formtable['fieldmatchformids'] = fieldmatchformids
359     formtable['fieldexactmatchformids'] = fieldexactmatchformids
360    
361     return formtable
362
363
Note: See TracBrowser for help on using the browser.