How to search for a string using the Soundex algorithm (Views: 708)
Problem/Question/Abstract: How to search for a string using the Soundex algorithm Answer: Solve 1: unit SndxAlgs; interface uses SysUtils; function Soundex(in_str: string): string; function NumericSoundex(in_str: string): Smallint; function ExtendedSoundex(in_str: string): string; implementation {Calculate a normal Soundex encoding.} function Soundex(in_str: string): string; var no_vowels, coded, out_str: string; ch: Char; i: Integer; begin {Make upper case and remove leading and trailing spaces.} in_str := Trim(UpperCase(in_str)); {Remove vowels, spaces, H, W, and Y except for the first character.} no_vowels := in_str[1]; for i := 2 to Length(in_str) do begin ch := in_str[i]; case ch of 'A', 'E', 'I', 'O', 'U', ' ', 'H', 'W', 'Y': ; {Do nothing.} else no_vowels := no_vowels + ch; end; end; {Encode the characters.} for i := 1 to Length(no_vowels) do begin ch := no_vowels[i]; case ch of 'B', 'F', 'P', 'V': ch := '1'; 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z': ch := '2'; 'D', 'T': ch := '3'; 'L': ch := '4'; 'M', 'N': ch := '5'; 'R': ch := '6'; else {Vowels, H, W, and Y as the 1st letter.} ch := '0'; end; coded := coded + ch; end; {Use the first letter.} out_str := no_vowels[1]; {Find three non-repeating codes.} for i := 2 to Length(no_vowels) do begin {Look for a non-repeating code.} if (coded[i] <> coded[i - 1]) then begin {This one works.} out_str := out_str + coded[i]; if (Length(out_str) >= 4) then Break; end; end; Soundex := out_str; end; {Calculate a numeric Soundex encoding.} function NumericSoundex(in_str: string): Smallint; var value: Integer; begin {Calculate the normal Soundex encoding.} in_str := Soundex(in_str); {Convert this into a numeric value.} value := (Ord(in_str[1]) - Ord('A')) * 1000; if (Length(in_str) > 1) then value := value + StrToInt(Copy(in_str, 2, Length(in_str) - 1)); NumericSoundex := value; end; {Calculate an extended Soundex encoding.} function ExtendedSoundex(in_str: string): string; {Replace instances of fr_str with to_str in str.} procedure ReplaceString(var str: string; fr_str, to_str: string); var fr_len, i: Integer; begin fr_len := Length(fr_str); i := Pos(fr_str, str); while (i > 0) do begin str := Copy(str, 1, i - 1) + to_str + Copy(str, i + fr_len, Length(str) - i - fr_len + 1); i := Pos(fr_str, str); end; end; var no_vowels: string; ch, last_ch: Char; i: Integer; begin {Make upper case and remove leading and trailing spaces.} in_str := Trim(UpperCase(in_str)); {Remove internal spaces.} ReplaceString(in_str, ' ', ''); {Convert CHR to CR.} ReplaceString(in_str, 'CHR', 'CR'); {Convert PH to F.} ReplaceString(in_str, 'PH', 'F'); {Convert Z to S.} ReplaceString(in_str, 'Z', 'S'); {Remove vowels and repeats.} last_ch := in_str[1]; {The last character used.} no_vowels := last_ch; for i := 2 to Length(in_str) do begin ch := in_str[i]; case ch of 'A', 'E', 'I', 'O', 'U': ; {Do nothing.} else {Skip it if it's a duplicate.} if (ch <> last_ch) then begin no_vowels := no_vowels + ch; last_ch := ch; end; end; end; ExtendedSoundex := no_vowels; end; end. Used like this: unit Sndx; interface uses Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs, StdCtrls, ExtCtrls, SndxAlgs; type TForm1 = class(TForm) InputText: TEdit; Label1: TLabel; CmdEncode: TButton; Label2: TLabel; Label3: TLabel; Panel1: TPanel; SoundexLabel: TLabel; Panel2: TPanel; NumericLabel: TLabel; Label4: TLabel; Panel3: TPanel; ExtendedLabel: TLabel; procedure CmdEncodeClick(Sender: TObject); private { Private declarations } public { Public declarations } end; var Form1: TForm1; implementation {$R *.DFM} procedure TForm1.CmdEncodeClick(Sender: TObject); begin SoundexLabel.Caption := Soundex(InputText.Text); NumericLabel.Caption := Format('%d', [NumericSoundex(InputText.Text)]); ExtendedLabel.Caption := ExtendedSoundex(InputText.Text); end; end. Solve 2: The code below is designed for use in English language and does not work for special characters like French accents or German Umlauts function StrSoundEx(const OrgString: string): string; var s: string; PrevCh: Char; Ch: Char; i: Integer; begin s := UpperCase(Trim(OrgString)); if s <> '' then begin PrevCh := #0; result := s[1]; for i := 2 to Length(s) do begin if Length(result) = 4 then break; Ch := s[i]; if (Ch <> PrevCh) then begin if Ch in ['B', 'P', 'F', 'V'] then result := result + '1' else if Ch in ['C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z'] then result := result + '2' else if Ch in ['D', 'T'] then result := result + '3' else if Ch in ['L'] then result := result + '4' else if Ch in ['M', 'N'] then result := result + '5' else if Ch in ['R'] then result := result + '6'; PrevCh := Ch; end; end; end; while Length(result) < 4 do result := result + '0'; end; Solve 3: The following differs from the standard Russell Soundex algorithm in that it lets you set the size of the Soundex code to something other than four characters: {Given a string this fuction returns the Russell Soundex code for that string. Although the Russell Soundex code is limited to four characters this function allows you to get a code up to 16 characters in length. For names a six to eight character code reduces the number of false matches significantly. Parameters: TheWord: The string to be encoded. SoundexSize: The number of characters in the returned code. Returns: The Soundex code.} function dgGetSoundexCode(TheWord: string; SoundexSize: Integer): string; const MaxSize = 16; var I: Integer; WorkString1, WorkString2: string; begin {Raise an exception if the SoundexSize parameter is not in the allowed range} if not SoundexSize in [1..MaxSize] then raise Exception.Create('Soundex size must in the range 1 - 16.'); {Convert the word to upper case} TheWord := UpperCase(TheWord); {Copy the first letter} WorkString1 := TheWord[1]; {Copy the rest of the word to WordString1 deleting duplicate letters} for I := 2 to Length(TheWord) do if TheWord[I - 1] <> TheWord[I] then AppendStr(WorkString1, TheWord[I]); {Move the first letter to WorkString2} WorkString2 := WorkString1[1]; {Compute the Soundex codes for the remaining letters} for I := 2 to Length(WorkString1) do case WorkString1[I] of 'B', 'F', 'P', 'V': AppendStr(WorkString2, '1'); 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z': Appendstr(WorkString2, '2'); 'D', 'T': Appendstr(WorkString2, '3'); 'L': Appendstr(WorkString2, '4'); 'M', 'N': Appendstr(WorkString2, '5'); 'R': Appendstr(WorkString2, '6'); end; {Pad the string with zeros} WorkString1 := ''; WorkString1 := dgFillString('0', MaxSize); AppendStr(WorkString2, WorkString1); Result := Copy(WorkString2, 1, SoundexSize); end; |