Author: Paramjeet Reen
How to match strings based on the way they sound & not on their spellings.
Answer:
This article is in continuation of my previous article "Searching Strings by the
way they sound" and represents an attempt at making the SoundEx() more versatile so
as to theoratically accomodate languages other than English - the only restriction
being that the language should use the ASCII character set. Another advantage is
that the function can be "tuned" to peculiarities of a language e.g. "Knife" is
pronounced as "Nife" in English. There is theoratically no limit to this
"tunability" - of course with corresponding decrease in performance. But you can
get amazing results which are better than what SoundEx() gives.
I have chosen to post a new article rather than update the original one since the
original function has been modified quite significantly (in concept) thus making it
different from the industry standard SoundEx() function - which was implemented in
the original article.
Since the function now supports language "tuning", it can give different results
than the industry standard SoundEx(). I have thus renamed the function to
"Sound()". This also gives me the freedom to implement it differently.
Sound() returns the same value (M240) for each of Micael/Maical/Michael/Maichael.
Additionally, since it has been (partially) tuned for English, it will give the
same result (F500) for "Phone"/"Fone".
I guess the "Ultimate" Sound Matching logic will be based on phonemes - of which I
currently know very little. If you help me by providing me details of phonemes that
you may have, then I will make yet another attempt at improving "Sound()" even
further...
I thank Toninho Nunes and Joe Meyer for providing me ideas & inputs respectively.
Please save the code below in a file called "Sounds.pas". You will need to include
the file in your source (Uses Sounds) and then you will have access to the Sound()
function.
1
2 {********************************************************************}
3 {* Description: Modified Soundex function in which it is attempted to include *}
4 {* language pecularities which theoratically makes it adaptable to languages *}
5 {* other than English - the only restriction being that the language in *}
6 {* question should use ASCII character set *}
7 {********************************************************************}
8 {* Date Created : 15-Nov-2000 *}
9 {* Last Modified : 16-Nov-2000 *}
10 {* Version : 0.10 *}
11 {* Author : Paramjeet Reen *}
12 {* eMail : Paramjeet.Reen@EudoraMail.com *}
13 {******************************************************************************}
14 {* This program is based on an algorithm that I had found in a magazine, *}
15 {* merged with an algorithm of a program posted by Joe Meyer. I do not *}
16 {* gurantee the fitness of this program in any way. Use it at your own risk. *}
17 {********************************************************************}
18 {* Category: Freeware. *}
19 {********************************************************************}
20
21 unit Sounds;
22
23 interface
24
25 //Returns a code for InpStr depending upon how it sounds.
26 function Sound(const InpStr: ShortString): ShortString;
27
28 implementation
29
30 type
31 TReplacePos = (pStart, pMid, pEnd);
32 TReplacePosSet = set of TReplacePos;
33
34 const
35 {********************************************************************}
36 {* The following are selected letters of the alphabet which are divided *}
37 {* into their corresponding code (1-6). You might need to modify these for *}
38 {* different languages depending upon whether the language requires *}
39 {* alphabets other than the ones specified below *}
40 {********************************************************************}
41 Chars1 = ['B', 'P', 'F', 'V'];
42 Chars2 = ['C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z'];
43 Chars3 = ['D', 'T'];
44 Chars4 = ['L'];
45 Chars5 = ['M', 'N'];
46 Chars6 = ['R'];
47
48 procedure ReplaceStr(var InpStr: ShortString; const SubStr, WithStr: ShortString;
49 const ReplacePositions: TReplacePosSet);
50 var
51 i: Integer;
52 begin
53 if (pStart in ReplacePositions) then
54 begin
55 i := Pos(SubStr, InpStr);
56
57 if (i = 1) then
58 begin
59 Delete(InpStr, i, Length(SubStr));
60 Insert(WithStr, InpStr, i);
61 end;
62 end;
63
64 if (pMid in ReplacePositions) then
65 begin
66 i := Pos(SubStr, InpStr);
67
68 while (i > 1) and (i <= (Length(InpStr) - Length(SubStr))) do
69 begin
70 Delete(InpStr, i, Length(SubStr));
71 Insert(WithStr, InpStr, i);
72 i := Pos(SubStr, InpStr);
73 end;
74 end;
75
76 if (pEnd in ReplacePositions) then
77 begin
78 i := Pos(SubStr, InpStr);
79
80 if (i > 1) and (i > (Length(InpStr) - Length(SubStr))) then
81 begin
82 Delete(InpStr, i, Length(SubStr));
83 Insert(WithStr, InpStr, i);
84 end;
85 end;
86 end;
87
88 function Sound(const InpStr: ShortString): ShortString;
89 var
90 vStr: ShortString;
91 PrevCh: Char;
92 CurrCh: Char;
93 i: Word;
94 begin
95 {********************************************************************}
96 {* Uppercase & remove invalid characters from given string *}
97 {********************************************************************}
98 {* Please have a long & hard look at this code if you have modified any of *}
99 {* the constants Chars1,Chars2 ... Chars6 by increasing the overall range *}
100 {* of alphabets *}
101 {********************************************************************}
102 vStr := '';
103 for i := 1 to Length(InpStr) do
104 case InpStr[i] of
105 'a'..'z': vStr := vStr + UpCase(InpStr[i]);
106 'A'..'Z': vStr := vStr + InpStr[i];
107 end; {case}
108
109 if (vStr <> '') then
110 begin
111 {**************************************************************************}
112 {* Language Tweaking Section *}
113 {********************************************************************}
114 {* Tweak for language peculiarities e.g. "CAt"="KAt", "KNIfe"="NIfe" *}
115 {* "PHone"="Fone", "PSYchology"="SIchology", "EXcel"="Xcel" etc... *}
116 {* You will need to modify these for different languages. Optionally, you *}
117 {* may choose not to have this section at all, in which case, the output *}
118 {* of Sound() will correspond to that of SoundEx(). Please note however *}
119 {* the importance of what you replace & the order in which you replace. *}
120 {********************************************************************}
121 {* Also, please note that the following replacements are targeted for the *}
122 {* English language & that too is subject to improvements *}
123 {********************************************************************}
124 ReplaceStr(vStr, 'CA', 'KA', [pStart, pMid, pEnd]); //arCAde = arKAde
125 ReplaceStr(vStr, 'CL', 'KL', [pStart, pMid, pEnd]); //CLass = Klass
126 ReplaceStr(vStr, 'CK', 'K', [pStart, pMid, pEnd]); //baCK = baK
127 ReplaceStr(vStr, 'EX', 'X', [pStart, pMid, pEnd]); //EXcel = Xcel
128 ReplaceStr(vStr, 'X', 'Z', [pStart]); //Xylene = Zylene
129 ReplaceStr(vStr, 'PH', 'F', [pStart, pMid, pEnd]); //PHone = Fone
130 ReplaceStr(vStr, 'KN', 'N', [pStart]); //KNife = Nife
131 ReplaceStr(vStr, 'PSY', 'SI', [pStart]); //PSYche = SIche
132 ReplaceStr(vStr, 'SCE', 'CE', [pStart, pMid, pEnd]); //SCEne = CEne
133
134 {********************************************************************}
135 {* String Assembly Section *}
136 {********************************************************************}
137 PrevCh := #0;
138 Result := vStr[1];
139 for i := 2 to Length(vStr) do
140 begin
141 if Length(Result) = 4 then
142 break;
143
144 CurrCh := vStr[i];
145 if (CurrCh <> PrevCh) then
146 begin
147 if CurrCh in Chars1 then
148 Result := Result + '1'
149 else if CurrCh in Chars2 then
150 Result := Result + '2'
151 else if CurrCh in Chars3 then
152 Result := Result + '3'
153 else if CurrCh in Chars4 then
154 Result := Result + '4'
155 else if CurrCh in Chars5 then
156 Result := Result + '5'
157 else if CurrCh in Chars6 then
158 Result := Result + '6';
159
160 PrevCh := CurrCh;
161 end;
162 end;
163 end
164 else
165 Result := '';
166
167 while (Length(Result) < 4) do
168 Result := Result + '0';
169 end;
170
171 end.
|