Author: Tomas Rutkauskas
How to implement fuzzy search
Answer:
Solve 1:
This DLL calculates the Levenshtein Distance between two strings. Please note that
ShareMem must be the first unit in the Uses clause of the Interface section of your
unit, if your DLL exports procedures or functions, which pass string parameters or
function results. ShareMem is the interface to delphimm.dll, which you have to
distribute together with your own DLL. To avoid using delphimm.dll, pass string
parameters by using PChar or ShortString parameters.
1 library Levensh;
2
3 uses
4 ShareMem, SysUtils;
5
6 var
7 FiR0: integer;
8 FiP0: integer;
9 FiQ0: integer;
10
11 function Min(X, Y, Z: Integer): Integer;
12 begin
13 if (X < Y) then
14 Result := X
15 else
16 Result := Y;
17 if (Result > Z) then
18 Result := Z;
19 end;
20
21 procedure LevenshteinPQR(p, q, r: integer);
22 begin
23 FiP0 := p;
24 FiQ0 := q;
25 FiR0 := r;
26 end;
27
28 function LevenshteinDistance(const sString, sPattern: string): Integer;
29 const
30 MAX_SIZE = 50;
31 var
32 aiDistance: array[0..MAX_SIZE, 0..MAX_SIZE] of Integer;
33 i, j, iStringLength, iPatternLength, iMaxI, iMaxJ: Integer;
34 chChar: Char;
35 iP, iQ, iR, iPP: Integer;
36 begin
37 iStringLength := length(sString);
38 if (iStringLength > MAX_SIZE) then
39 iMaxI := MAX_SIZE
40 else
41 iMaxI := iStringLength;
42 iPatternLength := length(sPattern);
43 if (iPatternLength > MAX_SIZE) then
44 iMaxJ := MAX_SIZE
45 else
46 iMaxJ := iPatternLength;
47 aiDistance[0, 0] := 0;
48 for i := 1 to iMaxI do
49 aiDistance[i, 0] := aiDistance[i - 1, 0] + FiR0;
50 for j := 1 to iMaxJ do
51 begin
52 chChar := sPattern[j];
53 if ((chChar = '*') or (chChar = '?')) then
54 iP := 0
55 else
56 iP := FiP0;
57 if (chChar = '*') then
58 iQ := 0
59 else
60 iQ := FiQ0;
61 if (chChar = '*') then
62 iR := 0
63 else
64 iR := FiR0;
65 aiDistance[0, j] := aiDistance[0, j - 1] + iQ;
66 for i := 1 to iMaxI do
67 begin
68 if (sString[i] = sPattern[j]) then
69 iPP := 0
70 else
71 iPP := iP;
72 {aiDistance[i, j] := Minimum of 3 values}
73 aiDistance[i, j] := Min(aiDistance[i - 1, j - 1] + iPP,
74 aiDistance[i, j - 1] + iQ,
75 aiDistance[i - 1, j] + iR);
76 end;
77 end;
78 Result := aiDistance[iMaxI, iMaxJ];
79 end;
80
81 exports
82 LevenshteinDistance Index 1,
83 LevenshteinPQR Index 2;
84
85 begin
86 FiR0 := 1;
87 FiP0 := 1;
88 FiQ0 := 1;
89 end.
Solve 2:
This is an old Pascal code snippet, which is based on a C project published in the
C't magazine somewhen back in the 1990's. Can't remember where I found it on the
WWW. Please note that the code below accesses a simple *.txt file to search in.
90 program FuzzySearch;
91 {Translation from C to Pascal by Karsten Paulini and Simon Reinhardt}
92 const
93 MaxParLen = 255;
94 var
95 InFile: Text;
96 Filename: string;
97 InputStr: string;
98 SearchStr: string;
99 Treshold: Integer;
100
101 function PrepareTheString(OriginStr: string; var ConvStr: string): Integer;
102 var
103 i: Integer;
104 begin
105 ConvStr := OriginStr;
106 for i := 1 to Length(OriginStr) do
107 begin
108 ConvStr[i] := UpCase(ConvStr[i]);
109 if ConvStr[i] < '0' then
110 ConvStr[i] := ' '
111 else
112 case ConvStr[i] of
113 Chr(196): ConvStr[i] := Chr(228);
114 Chr(214): ConvStr[i] := Chr(246);
115 Chr(220): ConvStr[i] := Chr(252);
116 Chr(142): ConvStr[i] := Chr(132);
117 Chr(153): ConvStr[i] := Chr(148);
118 Chr(154): ConvStr[i] := Chr(129);
119 ':': ConvStr[i] := ' ';
120 ';': ConvStr[i] := ' ';
121 '<': ConvStr[i] := ' ';
122 '>': ConvStr[i] := ' ';
123 '=': ConvStr[i] := ' ';
124 '?': ConvStr[i] := ' ';
125 '[': ConvStr[i] := ' ';
126 ']': ConvStr[i] := ' ';
127 end;
128 end;
129 PrepareTheString := i;
130 end;
131
132 function NGramMatch(TextPara, SearchStr: string; SearchStrLen, NGramLen: Integer;
133 var MaxMatch: Integer): Integer;
134 var
135 NGram: string[8];
136 NGramCount: Integer;
137 i, Count: Integer;
138 begin
139 NGramCount := SearchStrLen - NGramLen + 1;
140 Count := 0;
141 MaxMatch := 0;
142 for i := 1 to NGramCount do
143 begin
144 NGram := Copy(SearchStr, i, NGramLen);
145 if (NGram[NGramLen - 1] = ' ') and (NGram[1] < > ' ') then
146 Inc(i, NGramLen - 3) {will be increased in the loop}
147 else
148 begin
149 Inc(MaxMatch, NGramLen);
150 if Pos(NGram, TextPara) > 0 then
151 Inc(Count);
152 end;
153 end;
154 NGramMatch := Count * NGramLen;
155 end;
156
157 procedure FuzzyMatching(SearchStr: string; Treshold: Integer; var InFile: Text);
158 var
159 TextPara: string;
160 TextBuffer: string;
161 TextLen: Integer;
162 SearchStrLen: Integer;
163 NGram1Len: Integer;
164 NGram2Len: Integer;
165 MatchCount1: Integer;
166 MatchCount2: Integer;
167 MaxMatch1: Integer;
168 MaxMatch2: Integer;
169 Similarity: Real;
170 BestSim: Real;
171 begin
172 BestSim := 0.0;
173 SearchStrLen := PrepareTheString(SearchStr, SearchStr);
174 NGram1Len := 3;
175 if SearchStrLen < 7 then
176 NGram2Len := 2
177 else
178 NGram2Len := 5;
179 while not Eof(InFile) do
180 begin
181 Readln(InFile, TextBuffer);
182 TextLen := PrepareTheString(TextBuffer, TextPara) + 1;
183 TextPara := Concat(' ', TextPara);
184 if TextLen < MaxParLen - 2 then
185 begin
186 MatchCount1 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram1Len,
187 MaxMatch1);
188 MatchCount2 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram2Len,
189 MaxMatch2);
190 Similarity := 100.0 * (MatchCount1 + MatchCount2) / (MaxMatch1 + MaxMatch2);
191 if Similarity > BestSim then
192 BestSim := Similarity;
193 if Similarity >= Treshold then
194 begin
195 Writeln;
196 Writeln('[', Similarity, '] ', TextBuffer);
197 end;
198 end;
199 else
200 Writeln('Paragraph too long');
201 end;
202 if BestSim < Treshold then
203 Writeln('No match; Best Match was ', BestSim);
204 end;
205
206 begin
207 Writeln;
208 Writeln('+------------------------------------------+');
209 Writeln('| Fuzzy Search in Information Retrieval |');
210 Writeln('| (C) 1997 Reinhard Rapp |');
211 Writeln('+------------------------------------------+');
212 Writeln;
213 write('Name of file to search in: ');
214 Readln(Filename);
215 write('Search string: ');
216 Readln(InputStr);
217 SearchStr := Concat(' ', InputStr, ' ');
218 write('Minimum hit quality in % : ');
219 Readln(Treshold);
220 if (Treshold > 0) and (Treshold <= 100) and (SearchStr < > '') and (Filename < >
221 '') then
222 begin
223 Assign(InFile, Filename);
224 Reset(InFile);
225 FuzzyMatching(SearchStr, Treshold, InFile);
226 Close(InFile);
227 end;
228 Writeln;
229 Writeln('Bye!');
230 end.
Solve 3:
231 unit FuzzyMatch;
232
233 {This unit provides a basic 'fuzzy match' index on how alike two strings are
234 The result is of type 'single': near 0 - poor match
235 near 1 - close match
236 The intention is that HowAlike(s1,s2)=HowAlike(s2,s1)
237 The Function is not case sensitive}
238
239 interface
240
241 uses sysutils;
242
243 function HowAlike(s1, s2: string): single;
244
245 implementation
246
247 function instr(start: integer; ToSearch, ToFind: string): integer;
248 begin
249 //This is a quick implementation of the VB InStr, since Pos just doesn't do what
250 is needed!!
251 //NB - case sensitive!!
252 if start > 1 then
253 Delete(ToSearch, 1, start - 1);
254 result := pos(ToFind, ToSearch);
255 if (result > 0) and (start > 1) then
256 inc(result, start);
257 end;
258
259 function HowAlike(s1, s2: string): single;
260 var
261 l1, l2, pass, position, size, foundpos, maxscore: integer;
262 score, scored, string1pos, string2pos, bestmatchpos: single;
263 swapstring, searchblock: string;
264 begin
265 s1 := Uppercase(trim(s1));
266 s2 := Uppercase(trim(s2));
267
268 score := 0;
269 maxscore := 0;
270 scored := 0;
271
272 //deal with zero length strings...
273 if (s1 = '') and (s2 = '') then
274 begin
275 result := 1;
276 exit;
277 end
278 else if (s1 = '') or (s2 = '') then
279 begin
280 result := 0;
281 exit;
282 end;
283
284 //why perform any mathematics is the result is clear?
285 if s1 = s2 then
286 begin
287 result := 1;
288 exit;
289 end;
290
291 //make two passes,
292 // with s1 and s2 each way round to ensure
293 // consistent results
294 for pass := 1 to 2 do
295 begin
296 l1 := length(s1);
297 l2 := length(s2);
298 for size := l1 downto 1 do
299 begin
300 for position := 1 to (l1 - size + 1) do
301 begin
302 //try to find implied block in the other string
303 //Big blocks score much better than small blocks
304 searchblock := copy(s1, position, size);
305 foundpos := pos(searchblock, s2);
306
307 if size = l1 then
308 string1pos := 0.5
309 else
310 string1pos := (position - 1) / (l1 - size);
311
312 if foundpos > 0 then
313 begin
314 //the string is in somewhere in there
315 // - find the 'closest' one.
316 bestmatchpos := -100; //won't find anything that far away!
317
318 repeat
319 if size = l2 then
320 string2pos := 0.5
321 else
322 string2pos := (foundpos - 1) / (l2 - size);
323
324 //If this closer than the previous best?
325 if abs(string2pos - string1pos) < abs(bestmatchpos - string1pos) then
326 bestmatchpos := string2pos;
327
328 foundpos := instr(foundpos + 1, s2, searchblock);
329 until foundpos = 0; //loop while foundpos>0..
330
331 //The closest position is now known: Score it!
332 //Score as follows: (1-distance of best match)
333 score := score + (1 - abs(string1pos - bestmatchpos));
334 end;
335
336 //Keep track if the maximum possible score
337 //BE CAREFUL IF CHANGING THIS FUNCTION!!!
338
339 //maxscore:=maxscore+1;
340 inc(maxscore);
341 end; //for position..
342 end; //for size..
343
344 if pass = 1 then
345 begin
346 //swap the strings around
347 swapstring := s1;
348 s1 := s2;
349 s2 := swapstring;
350 end;
351
352 //Each pass is weighted equally
353
354 scored := scored + (0.5 * (score / maxscore));
355 score := 0;
356 maxscore := 0;
357 end; //for pass..
358
359 //HowAlike=score/maxscore
360 result := scored;
361 end;
Solve 4:
A Delphi implementation of the Levenshtein Distance Algorithm
362 unit Levenshtein;
363
364 {Objeto que calcula la distancia de Levenshtein entre 2 cadenas.
365 Alvaro Jeria Madariaga. 04/10/2002
366 barbaro@hotpop.com}
367
368 interface
369
370 uses
371 sysutils, Math;
372
373 type
374 Tdistance = class(TObject)
375 private
376 function minimum(a, b, c: Integer): Integer;
377 public
378 function LD(s, t: string): Integer;
379 end;
380
381 implementation
382
383 function Tdistance.minimum(a, b, c: Integer): Integer;
384 var
385 mi: Integer;
386 begin
387 mi := a;
388 if (b < mi) then
389 mi := b;
390 if (c < mi) then
391 mi := c;
392 Result := mi;
393 end;
394
395 function Tdistance.LD(s, t: string): Integer;
396 var
397 d: array of array of Integer;
398 n, m, i, j, costo: Integer;
399 s_i, t_j: char;
400 begin
401 n := Length(s);
402 m := Length(t);
403 if (n = 0) then
404 begin
405 Result := m;
406 Exit;
407 end;
408 if m = 0 then
409 begin
410 Result := n;
411 Exit;
412 end;
413 setlength(d, n + 1, m + 1);
414 for i := 0 to n do
415 d[i, 0] := i;
416 for j := 0 to m do
417 d[0, j] := j;
418 for i := 1 to n do
419 begin
420 s_i := s[i];
421 for j := 1 to m do
422 begin
423 t_j := t[j];
424 if s_i = t_j then
425 costo := 0
426 else
427 costo := 1;
428 d[i, j] := Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + costo);
429 end;
430 end;
431 Result := d[n, m];
432 end;
433
434 end.
I've written some function that compares two strings and returns true, if they are
identical or similar ('house' is similar to 'mouse', 'hose', 'houses' or 'horse').
It works quite good, but I guess there are other implementations around, and I'd
like to compare mine to others. I'd also like to be able to find any identical or
similar substrings inside a longer string, what's not possible with my (very
simple) algorithm. Do you know about any sources or other documentation?
Levenshtein matching gives the number of steps (single character replacement or
addition) needed to transform StringA into String B. Ratcliffe matching gives the
percentage of possible character matches between StringA and StringB, based on the
longest matching sequences and subsequences between the two strings.
435
436 function CompareStrings_Levenshtein(const A, B: string; CaseSensitive: Boolean =
437 False): Integer;
438
439 function Minimum3(x, y, z: Integer): Integer;
440 begin
441 Result := Min(x, y);
442 Result := Min(Result, z);
443 end;
444
445 var
446 D: array of array of Integer;
447 n, m, i, j, Cost: Integer;
448 AI, BJ: Char;
449 A1, B1: string;
450 begin
451 n := Length(A);
452 m := Length(B);
453 if (n = 0) then
454 Result := m
455 else if m = 0 then
456 Result := n
457 else
458 begin
459 if CaseSensitive then
460 A1 := A
461 else
462 A1 := UpperCase(A);
463 if CaseSensitive then
464 B1 := B
465 else
466 B1 := UpperCase(B);
467 Setlength(D, n + 1, m + 1);
468 for i := 0 to n do
469 D[i, 0] := i;
470 for j := 0 to m do
471 D[0, j] := j;
472 for i := 1 to n do
473 begin
474 AI := A1[i];
475 for j := 1 to m do
476 begin
477 BJ := B1[j];
478 Cost := iff(AI = BJ, 0, 1);
479 D[i, j] := Minimum3(D[i - 1][j] + 1, D[i][j - 1] + 1, D[i - 1][j - 1] +
480 Cost);
481 end;
482 end;
483 Result := D[n, m];
484 end;
485 end;
486
487 function CompareStrings_Ratcliff(const A, B: string; CaseSensitive: Boolean =
488 False):
489 Double;
490 var
491 A1, B1: string;
492 LenA, LenB: Integer;
493
494 function CSRSub(StartA, EndA, StartB, EndB: Integer): Integer;
495 var
496 a, b, i, Matches, NewStartA, NewStartB: Integer;
497 begin
498 Result := 0;
499 NewStartA := 0;
500 NewStartB := 0;
501 if (StartA > EndA) or (StartB > EndB) or (StartA <= 0) or (StartB <= 0) then
502 Exit;
503 for a := StartA to EndA do
504 begin
505 for B := StartB to EndB do
506 begin
507 Matches := 0;
508 i := 0;
509 while (a + i <= EndA) and (b + i <= EndB) and (A1[a + i] = B1[b + i]) do
510 begin
511 Inc(Matches);
512 if Matches > Result then
513 begin
514 NewStartA := a;
515 NewStartB := b;
516 Result := Matches;
517 end;
518 Inc(i);
519 end;
520 end;
521 end;
522 if Result > 0 then
523 begin
524 Inc(Result, CSR_Sub(NewStartA + Result, EndA, NewStartB + Result, EndB));
525 Inc(Result, CSR_Sub(StartA, NewStartA - 1, StartB, NewStartB - 1));
526 end;
527 end;
528
529 begin
530 if CaseSensitive then
531 A1 := A
532 else
533 A1 := UpperCase(A);
534 if CaseSensitive then
535 B1 := B
536 else
537 B1 := UpperCase(B);
538 LenA := Length(A1);
539 LenB := Length(B1);
540 if A1 = B1 then
541 Result := 100
542 else if (LenA = 0) or (LenB = 0) then
543 Result := 0
544 else
545 Result := CSR_Sub(1, LenA, 1, LenB) * 200 / (LenA + LenB);
546 end;
|