| 1 |
from itertools import repeat |
|---|
| 2 |
from heapq import * |
|---|
| 3 |
from levenshtein import * |
|---|
| 4 |
from pprint import pprint # for _debug_current_production |
|---|
| 5 |
|
|---|
| 6 |
__all__=[ |
|---|
| 7 |
'formtable_prepare', 'match_scentence', 'match_command', |
|---|
| 8 |
# 'match_scentence_noplurals', needs fixing for latest formtable_prepare |
|---|
| 9 |
|
|---|
| 10 |
'FIELDTYPE_INVALID', 'FIELDTYPE_UNKNOWN', 'FIELDTYPE_KEYWORD', |
|---|
| 11 |
'FIELDTYPE_WORD', 'FIELDTYPE_WORDS', 'FIELDTYPE_MENU', |
|---|
| 12 |
'FIELDTYPE_LIST', 'FIELDTYPE_LISTMENU', |
|---|
| 13 |
|
|---|
| 14 |
'FIELDTYPE_FIRST','FIELDTYPE_LAST', |
|---|
| 15 |
|
|---|
| 16 |
'fieldtype_flavour', |
|---|
| 17 |
|
|---|
| 18 |
'FIELDTYPE_FLAVOUR_FLAG_SINGULAR', 'FIELDTYPE_FLAVOUR_FLAG_PLURAL', |
|---|
| 19 |
'FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE', |
|---|
| 20 |
'FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION', |
|---|
| 21 |
'FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL', |
|---|
| 22 |
|
|---|
| 23 |
'FIELDTYPE_FLAVOUR_FLAG_COUNT', |
|---|
| 24 |
|
|---|
| 25 |
'repr_fieldtype', 'repr_fieldtype_compact', 'repr_fieldtype_flavour', |
|---|
| 26 |
'repr_fieldtype_flavour_compact' |
|---|
| 27 |
] |
|---|
| 28 |
|
|---|
| 29 |
FIELDTYPE_INVALID=-1 |
|---|
| 30 |
FIELDTYPE_UNKNOWN=0 |
|---|
| 31 |
FIELDTYPE_KEYWORD=1 |
|---|
| 32 |
FIELDTYPE_WORD=2 |
|---|
| 33 |
FIELDTYPE_WORDS=3 |
|---|
| 34 |
FIELDTYPE_MENU=4 |
|---|
| 35 |
FIELDTYPE_LIST=5 |
|---|
| 36 |
FIELDTYPE_LISTMENU=6 |
|---|
| 37 |
FIELDTYPE_FIRST=FIELDTYPE_UNKNOWN |
|---|
| 38 |
FIELDTYPE_LAST=FIELDTYPE_LISTMENU |
|---|
| 39 |
|
|---|
| 40 |
_fieldtype_reprtab=dict([ |
|---|
| 41 |
(FIELDTYPE_INVALID,'INVALID'), |
|---|
| 42 |
(FIELDTYPE_UNKNOWN, 'UNKNOWN'), |
|---|
| 43 |
(FIELDTYPE_KEYWORD, 'KEYWORD'), |
|---|
| 44 |
(FIELDTYPE_WORD, 'WORD'), |
|---|
| 45 |
(FIELDTYPE_WORDS, 'WORDS'), |
|---|
| 46 |
(FIELDTYPE_MENU, 'MENU'), |
|---|
| 47 |
(FIELDTYPE_LIST, 'LIST'), |
|---|
| 48 |
(FIELDTYPE_LISTMENU, 'LISTMENU')]) |
|---|
| 49 |
|
|---|
| 50 |
_fieldtype_reprtab_compact=dict([ |
|---|
| 51 |
(FIELDTYPE_INVALID,'IN'), |
|---|
| 52 |
(FIELDTYPE_UNKNOWN, 'UN'), |
|---|
| 53 |
(FIELDTYPE_KEYWORD, 'KE'), |
|---|
| 54 |
(FIELDTYPE_WORD, 'WO'), |
|---|
| 55 |
(FIELDTYPE_WORDS, 'WW'), |
|---|
| 56 |
(FIELDTYPE_MENU, 'ME'), |
|---|
| 57 |
(FIELDTYPE_LIST, 'LI'), |
|---|
| 58 |
(FIELDTYPE_LISTMENU, 'LM')]) |
|---|
| 59 |
|
|---|
| 60 |
def repr_fieldtype(typecode): |
|---|
| 61 |
return "FIELDTYPE_%s" % _fieldtype_reprtab.get(typecode, 'INVALID') |
|---|
| 62 |
def repr_fieldtype_compact(typecode): |
|---|
| 63 |
return _fieldtype_reprtab_compact.get(typecode, 'IN') |
|---|
| 64 |
|
|---|
| 65 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR = 1<<0 |
|---|
| 66 |
FIELDTYPE_FLAVOUR_FLAG_PLURAL = 1<<1 |
|---|
| 67 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE = 1<<2 |
|---|
| 68 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION = 1<<3 |
|---|
| 69 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL = 1<<4 |
|---|
| 70 |
FIELDTYPE_FLAVOUR_FLAG_COUNT = 5 |
|---|
| 71 |
|
|---|
| 72 |
_fieldtype_flavour_reprtab = dict([ |
|---|
| 73 |
(FIELDTYPE_FLAVOUR_FLAG_SINGULAR, |
|---|
| 74 |
'SINGULAR'), |
|---|
| 75 |
(FIELDTYPE_FLAVOUR_FLAG_PLURAL, |
|---|
| 76 |
'PLURAL'), |
|---|
| 77 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE, |
|---|
| 78 |
'SELECTSINGLE'), |
|---|
| 79 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION, |
|---|
| 80 |
'VALUE_ENUMERATION'), |
|---|
| 81 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL, |
|---|
| 82 |
'VALUE_LABEL')]) |
|---|
| 83 |
|
|---|
| 84 |
_fieldtype_flavour_reprtab_compact = dict([ |
|---|
| 85 |
(FIELDTYPE_FLAVOUR_FLAG_SINGULAR, |
|---|
| 86 |
'SIN'), |
|---|
| 87 |
(FIELDTYPE_FLAVOUR_FLAG_PLURAL, |
|---|
| 88 |
'PLU'), |
|---|
| 89 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE, |
|---|
| 90 |
'VSELS'), |
|---|
| 91 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION, |
|---|
| 92 |
'VENU'), |
|---|
| 93 |
(FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL, |
|---|
| 94 |
'VLAB')]) |
|---|
| 95 |
|
|---|
| 96 |
def repr_fieldtype_flavour(flags): |
|---|
| 97 |
return "|".join([ |
|---|
| 98 |
"FIELDTYPE_FLAVOUR_FLAG_%s" % _fieldtype_flavour_reprtab[(1<<i)] |
|---|
| 99 |
for i in range(0, FIELDTYPE_FLAVOUR_FLAG_COUNT) |
|---|
| 100 |
if (1 << i) & flags]) |
|---|
| 101 |
|
|---|
| 102 |
def repr_fieldtype_flavour_compact(flags): |
|---|
| 103 |
if flags is None: |
|---|
| 104 |
return 'None' |
|---|
| 105 |
compact = "|".join([ |
|---|
| 106 |
_fieldtype_flavour_reprtab_compact[(1<<i)] |
|---|
| 107 |
for i in range(0, FIELDTYPE_FLAVOUR_FLAG_COUNT) |
|---|
| 108 |
if (1 << i) & flags]) |
|---|
| 109 |
return compact |
|---|
| 110 |
|
|---|
| 111 |
fieldtype_flavour=dict([ |
|---|
| 112 |
(FIELDTYPE_INVALID, -1), |
|---|
| 113 |
(FIELDTYPE_UNKNOWN, 0), |
|---|
| 114 |
(FIELDTYPE_KEYWORD, |
|---|
| 115 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR), |
|---|
| 116 |
(FIELDTYPE_WORD, |
|---|
| 117 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR), |
|---|
| 118 |
(FIELDTYPE_WORDS, |
|---|
| 119 |
FIELDTYPE_FLAVOUR_FLAG_PLURAL), |
|---|
| 120 |
(FIELDTYPE_MENU, |
|---|
| 121 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR | |
|---|
| 122 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE | |
|---|
| 123 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL), |
|---|
| 124 |
(FIELDTYPE_LIST, |
|---|
| 125 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR | |
|---|
| 126 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE | |
|---|
| 127 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION), |
|---|
| 128 |
(FIELDTYPE_LISTMENU, |
|---|
| 129 |
FIELDTYPE_FLAVOUR_FLAG_SINGULAR | |
|---|
| 130 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_SELECTSINGLE | |
|---|
| 131 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_ENUMERATION | |
|---|
| 132 |
FIELDTYPE_FLAVOUR_FLAG_VALUE_LABEL)]) |
|---|
| 133 |
|
|---|
| 134 |
|
|---|
| 135 |
|
|---|
| 136 |
def match_scentence(formtable, words): |
|---|
| 137 |
"""match a scentence, uses backtracking to handle plurals.""" |
|---|
| 138 |
|
|---|
| 139 |
wordcount = len(words) |
|---|
| 140 |
maxfieldcount = formtable['maxfieldcount'] |
|---|
| 141 |
formcount = formtable['formcount'] |
|---|
| 142 |
fieldcount = min(maxfieldcount, wordcount) |
|---|
| 143 |
|
|---|
| 144 |
formfieldtypes = formtable['form2fieldtypes'] |
|---|
| 145 |
fieldformidmatchsequence = formtable['fieldformidmatchsequence'] |
|---|
| 146 |
fieldmatchformids = formtable['fieldmatchformids'] |
|---|
| 147 |
fieldexactmatchformids = formtable['fieldexactmatchformids'] |
|---|
| 148 |
|
|---|
| 149 |
ipluralstart = -1 |
|---|
| 150 |
iword = 0 |
|---|
| 151 |
match=[] |
|---|
| 152 |
ifield=0 |
|---|
| 153 |
candidateforms = [formid for formid in range(0, formcount)] |
|---|
| 154 |
nextfieldforms = None |
|---|
| 155 |
backtrack=[] |
|---|
| 156 |
while ifield != fieldcount and iword < len(words): |
|---|
| 157 |
matched=None |
|---|
| 158 |
word=words[iword] |
|---|
| 159 |
nextfieldforms = fieldexactmatchformids[ifield].get( |
|---|
| 160 |
word, None) |
|---|
| 161 |
#if word == 'x': |
|---|
| 162 |
# print "*" * 30 |
|---|
| 163 |
if not nextfieldforms: |
|---|
| 164 |
# TODO: accelerate the asequence construction for first field |
|---|
| 165 |
# by precomputing. |
|---|
| 166 |
asequence = [] |
|---|
| 167 |
[asequence.extend(fieldformidmatchsequence[ifield][formid]) |
|---|
| 168 |
for formid in candidateforms] |
|---|
| 169 |
if asequence: |
|---|
| 170 |
asequence[:]=dict.fromkeys(asequence).keys() |
|---|
| 171 |
# TODO: we could implement levenshtein_selectbest to select the |
|---|
| 172 |
# best _equivelent_ matches (wrt edit distance) and use it here |
|---|
| 173 |
# instead. This would (correctly) defer full judgement on |
|---|
| 174 |
# ambiguity resolution to this algorithm. I'm just being lazy |
|---|
| 175 |
# for now. |
|---|
| 176 |
ic,d = levenshtein_selectone(asequence, word) |
|---|
| 177 |
#if word == 'x': |
|---|
| 178 |
# print "%s:x, ic=%s,d=%s" % (str(asequence), ic,d) |
|---|
| 179 |
if ic > -1: |
|---|
| 180 |
word = asequence[ic] # *! take the corrected word |
|---|
| 181 |
nextfieldforms = fieldmatchformids[ifield][word] |
|---|
| 182 |
if nextfieldforms: |
|---|
| 183 |
match.append(word) |
|---|
| 184 |
if ipluralstart > -1: |
|---|
| 185 |
if backtrack: |
|---|
| 186 |
backtrack[-1][1]=iword+1 |
|---|
| 187 |
backtrack[-1][3]=candidateforms |
|---|
| 188 |
ipluralstart = -1 |
|---|
| 189 |
# else: to handle phrases that start with WW, we'd have to setup a |
|---|
| 190 |
# backtracking context here. |
|---|
| 191 |
## candidates |
|---|
| 192 |
candidateforms = [formid for formid in candidateforms |
|---|
| 193 |
if formid in nextfieldforms] |
|---|
| 194 |
ifield += 1 |
|---|
| 195 |
else: |
|---|
| 196 |
# assume WO or WW match, candidates are all the remaining forms |
|---|
| 197 |
# with words at the current position. |
|---|
| 198 |
if ipluralstart == -1: |
|---|
| 199 |
for formid in candidateforms: |
|---|
| 200 |
if formfieldtypes[formid][ifield] == FIELDTYPE_WORDS: |
|---|
| 201 |
match.append([]) |
|---|
| 202 |
ipluralstart=iword |
|---|
| 203 |
ifield+=1 # lookahead |
|---|
| 204 |
backtrack.append([ |
|---|
| 205 |
ifield, iword, ipluralstart, |
|---|
| 206 |
candidateforms]) |
|---|
| 207 |
break |
|---|
| 208 |
if ipluralstart > -1: |
|---|
| 209 |
match[-1].append(word) |
|---|
| 210 |
else: # take it as a singular word, WO |
|---|
| 211 |
candidateforms = [ |
|---|
| 212 |
formid for formid in candidateforms |
|---|
| 213 |
if formfieldtypes[formid][ifield]==FIELDTYPE_WORD] |
|---|
| 214 |
if candidateforms: |
|---|
| 215 |
match.append(word) |
|---|
| 216 |
ifield += 1 |
|---|
| 217 |
iword += 1 |
|---|
| 218 |
if not candidateforms: |
|---|
| 219 |
if backtrack: |
|---|
| 220 |
(ifield, iword, ipluralstart, candidateforms |
|---|
| 221 |
) = backtrack.pop() |
|---|
| 222 |
match = match[:ifield] |
|---|
| 223 |
# if ipluralstart == iword, the backtrack entry is redundant |
|---|
| 224 |
# but we are about to fall out the main loop anyway. hence the |
|---|
| 225 |
# iword-1 case. |
|---|
| 226 |
match[-1].append( |
|---|
| 227 |
ipluralstart == iword and words[iword] or words[iword-1]) |
|---|
| 228 |
else: break |
|---|
| 229 |
#_debug_current_production(match, candidateforms, formfieldtypes, |
|---|
| 230 |
# ifield-1,ifield,iword,ipluralstart,fieldcount-1) |
|---|
| 231 |
# if the last field was a plural, make sure we collect all the trailing |
|---|
| 232 |
# words. |
|---|
| 233 |
if ipluralstart > -1: |
|---|
| 234 |
match[-1].extend(words[iword:]) |
|---|
| 235 |
return match, candidateforms |
|---|
| 236 |
|
|---|
| 237 |
def match_command(formtable, scentence): |
|---|
| 238 |
match, forms = match_scentence(formtable, scentence) |
|---|
| 239 |
if not match or not forms: |
|---|
| 240 |
return None, forms |
|---|
| 241 |
if len(forms) > 1: |
|---|
| 242 |
return match, forms |
|---|
| 243 |
form=forms[0] |
|---|
| 244 |
cmd = formtable['form2command'][form] |
|---|
| 245 |
valuemap = {} |
|---|
| 246 |
for type,name,value in zip( |
|---|
| 247 |
formtable['form2fieldtypes'][form], |
|---|
| 248 |
formtable['form2fieldnames'][form], |
|---|
| 249 |
match): |
|---|
| 250 |
if type != FIELDTYPE_KEYWORD: |
|---|
| 251 |
valuemap.setdefault(name, []).append(value) |
|---|
| 252 |
return (cmd,form), valuemap |
|---|
| 253 |
|
|---|
| 254 |
def yield_commands(ft, source): |
|---|
| 255 |
"""Match and yield all commands in ``source``. |
|---|
| 256 |
|
|---|
| 257 |
yield is (lineno,((command,formid),valuemap)) |
|---|
| 258 |
|
|---|
| 259 |
* '#' indicates the start of a comment. |
|---|
| 260 |
* '#' can be excaped by prefixing with '\'. |
|---|
| 261 |
* multiple escapes are not collapsed: |
|---|
| 262 |
in 'foo\#bar#baz' and 'foo\\#bar#baz' #baz is the comment part |
|---|
| 263 |
* exactly one escape char is striped before matching: |
|---|
| 264 |
'foo\#bar#baz' becomes 'foo#bar', and |
|---|
| 265 |
'foo\\#bar#baz' becomes 'foo\#bar' |
|---|
| 266 |
* \-continuation is not supported. |
|---|
| 267 |
|
|---|
| 268 |
""" |
|---|
| 269 |
for lineno, i in enumerate(source.split('\n')): |
|---|
| 270 |
ln=[] |
|---|
| 271 |
for j in i.split('#'): |
|---|
| 272 |
ln.append(j) |
|---|
| 273 |
if not j: |
|---|
| 274 |
break |
|---|
| 275 |
if j[-1]!='\\': |
|---|
| 276 |
break |
|---|
| 277 |
ln.pop() |
|---|
| 278 |
ln.append(j[:-1]+'#') |
|---|
| 279 |
ln=''.join(ln) |
|---|
| 280 |
if ln: |
|---|
| 281 |
yield lineno+1, match_command(ft, ln.split()) |
|---|
| 282 |
|
|---|
| 283 |
def _debug_current_production(matched, candidatenextformids, formfieldtypes, |
|---|
| 284 |
ifield, ifieldnext, iword, ipluralstart, ilastfield): |
|---|
| 285 |
"""This function carries a heavy buyer beware badge. Tweak it if you need to, |
|---|
| 286 |
and call from match_scentence to get an idea of how the algorithm works.""" |
|---|
| 287 |
|
|---|
| 288 |
pprint([ |
|---|
| 289 |
matched, 'if=%s, ifn=%s, iw=%s, iwss=%s' % (ifield, ifieldnext,iword,ipluralstart), |
|---|
| 290 |
candidatenextformids, |
|---|
| 291 |
' '.join(map(repr_fieldtype_compact, |
|---|
| 292 |
[formfieldtypes[formid][ifield] |
|---|
| 293 |
for formid in candidatenextformids])), |
|---|
| 294 |
' '.join(map(repr_fieldtype_compact, ifield < ilastfield and [ |
|---|
| 295 |
formfieldtypes[formid][ifieldnext] |
|---|
| 296 |
for formid in candidatenextformids |
|---|
| 297 |
] |
|---|
| 298 |
or [])) |
|---|
| 299 |
]) |
|---|
| 300 |
|
|---|
| 301 |
def formtable_prepare(formtable): |
|---|
| 302 |
"""Prepare a formtable for use in match_ xxx above. |
|---|
| 303 |
|
|---|
| 304 |
Don't call repeatedly on the same table! |
|---|
| 305 |
|
|---|
| 306 |
This seperation is preparing the way for dynamic asequence construction, |
|---|
| 307 |
ie support for convenient dynamicaly fild list and listmenu fields. |
|---|
| 308 |
""" |
|---|
| 309 |
|
|---|
| 310 |
formcount = formtable['formcount'] |
|---|
| 311 |
maxfieldcount = formtable['maxfieldcount'] |
|---|
| 312 |
fieldformtypes = formtable['field2fieldtypes'] |
|---|
| 313 |
fieldselectsequences = formtable['field2selectvalues'] |
|---|
| 314 |
|
|---|
| 315 |
# prepare tables to enable 'on the fly' asequence construction based on |
|---|
| 316 |
# the remaining formids at the current match field. |
|---|
| 317 |
# |
|---|
| 318 |
#asequence = [] |
|---|
| 319 |
#[asequence.extend(fieldformidmatchsequence[ifield][formid]) |
|---|
| 320 |
# for formid in candidates] |
|---|
| 321 |
# ic,d = levenshtein(asequence, word) |
|---|
| 322 |
# nextforms = fieldmatchformids[ifield][asequence[ic]] |
|---|
| 323 |
|
|---|
| 324 |
# menu: [ a, b, c] |
|---|
| 325 |
# keyword: [matchword] |
|---|
| 326 |
# list: [matchworda, matchwordb, ...] |
|---|
| 327 |
# listmenu: [[matchworda, matchwordb, ... ], [a, b, ... ]] |
|---|
| 328 |
fieldformidmatchsequence = [] |
|---|
| 329 |
fieldmatchformids = [] |
|---|
| 330 |
fieldexactmatchformids = [] |
|---|
| 331 |
for ifield in range(0, maxfieldcount): |
|---|
| 332 |
fieldformidmatchsequence.append([]) |
|---|
| 333 |
fieldmatchformids.append({}) |
|---|
| 334 |
fieldexactmatchformids.append({}) |
|---|
| 335 |
for formid in range(0, formcount): |
|---|
| 336 |
fieldformidmatchsequence[ifield].append([]) |
|---|
| 337 |
if fieldformtypes[ifield][formid] in [ |
|---|
| 338 |
FIELDTYPE_KEYWORD, FIELDTYPE_LIST]: |
|---|
| 339 |
asequence=fieldselectsequences[ifield][formid] |
|---|
| 340 |
fieldformidmatchsequence[ifield][formid]=asequence |
|---|
| 341 |
for match in asequence: |
|---|
| 342 |
fieldmatchformids[ifield].setdefault(match, []).append(formid) |
|---|
| 343 |
elif fieldformtypes[ifield][formid] is FIELDTYPE_LISTMENU: |
|---|
| 344 |
asequence=fieldselectsequences[ifield][formid][0] |
|---|
| 345 |
fieldformidmatchsequence[ifield][formid]=asequence |
|---|
| 346 |
for match in asequence: |
|---|
| 347 |
fieldmatchformids[ifield].setdefault(match, []).append(formid) |
|---|
| 348 |
|
|---|
| 349 |
exactsequence = fieldselectsequences[ifield][formid][1] |
|---|
| 350 |
for match in exactsequence: |
|---|
| 351 |
fieldexactmatchformids[ifield].setdefault(match, []).append(formid) |
|---|
| 352 |
|
|---|
| 353 |
elif fieldformtypes[ifield][formid] is FIELDTYPE_MENU: |
|---|
| 354 |
exactsequence = fieldselectsequences[ifield][formid] |
|---|
| 355 |
for match in exactsequence: |
|---|
| 356 |
fieldexactmatchformids[ifield].setdefault(match, []).append(formid) |
|---|
| 357 |
formtable['fieldformidmatchsequence'] = fieldformidmatchsequence |
|---|
| 358 |
formtable['fieldmatchformids'] = fieldmatchformids |
|---|
| 359 |
formtable['fieldexactmatchformids'] = fieldexactmatchformids |
|---|
| 360 |
|
|---|
| 361 |
return formtable |
|---|
| 362 |
|
|---|
| 363 |
|
|---|