The Soundex Algorithm (DE)

  • Comments 1
And for my German colleagues, here is a version of the Soundex algorithm suited to German pronunciation. It is based on the Cologne phonetics algorithm. More information can be found here:  [http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik]
 
Soundex
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
 
namespace Soundex
{
    /// <summary>
    /// Phonetic encoder for German words, following the Cologne phonetics
    /// (Kölner Phonetik) conversion table reproduced inside
    /// <see cref="createSoundexCodeDE"/>.
    /// </summary>
    public class SoundexClass
    {
        // Matches every run of characters that is neither an ASCII letter nor a space.
        private static readonly Regex NonLetterPattern = new Regex(@"[^A-Za-z ]+");

        // A word-initial "c" is coded 4 when followed by one of these letters, otherwise 8.
        private const string HardInitialCFollowers = "ahkloqrux";

        // A non-initial "c" is coded 4 when followed by one of these letters
        // (and not preceded by s or z), otherwise 8.
        private const string HardInnerCFollowers = "ahkoqux";

        /// <summary>
        /// Computes the phonetic code of a German word.
        /// </summary>
        /// <param name="word">
        /// The word to encode. Case is ignored; umlauts, "ß" and a few accented
        /// letters are transliterated; every other non-ASCII-letter character is dropped.
        /// </param>
        /// <returns>
        /// The digit string for <paramref name="word"/>, or an empty string when the
        /// input is null, empty, or contains no letter that produces a code
        /// (for example "123" or "h").
        /// </returns>
        public static string createSoundexCodeDE(string word)
        {
            if (string.IsNullOrEmpty(word)) { return ""; }

            // Invariant lower-casing: the result must not depend on the current culture
            // (under a Turkish culture "I" would become a dotless i and be stripped below).
            word = word.ToLowerInvariant();

            // Transliteration: ç->c, v->f, w->f, j->i, y->i, ph->f, ä->a, ö->o, ü->u, ß->ss,
            // é->e, è->e, ê->e, à->a, á->a, â->a, ë->e
            word = word.Replace("ç", "c").Replace("v", "f").Replace("w", "f").Replace("j", "i")
                       .Replace("y", "i").Replace("ph", "f").Replace("ä", "a").Replace("ö", "o")
                       .Replace("ü", "u").Replace("ß", "ss").Replace("é", "e").Replace("è", "e")
                       .Replace("ê", "e").Replace("à", "a").Replace("á", "a").Replace("â", "a")
                       .Replace("ë", "e");

            // Letters only (no digits, no special characters)
            word = NonLetterPattern.Replace(word, "");

            // Nothing left to encode (input consisted only of digits/punctuation).
            if (word.Length == 0) { return ""; }

            StringBuilder raw = new StringBuilder(word.Length + 1);
            int index = 0;

            // Special case at the start of the word: initial "c"
            if (word[0] == 'c')
            {
                bool hard = word.Length > 1 && HardInitialCFollowers.IndexOf(word[1]) >= 0;
                raw.Append(hard ? '4' : '8');
                index = 1;
            }

            //* Conversion table:
            //* ============================================
            //* Letter         Context                  Code
            //* -------------  -----------------------  ----
            //* A,E,I,J,O,U,Y                            0
            //* H                                        -
            //* B                                        1
            //* P              not before H              1
            //* D,T            not before C,S,Z          2
            //* F,V,W                                    3
            //* P              before H                  3
            //* G,K,Q                                    4
            //* C              at start of word
            //*                before A,H,K,L,O,Q,R,U,X  4
            //* C              before A,H,K,O,Q,U,X
            //*                except after S,Z          4
            //* X              not after C,K,Q          48
            //* L                                        5
            //* M,N                                      6
            //* R                                        7
            //* S,Z                                      8
            //* C              after S,Z                 8
            //* C              at start of word, except
            //*                before A,H,K,L,O,Q,R,U,X  8
            //* C              not before A,H,K,O,Q,U,X  8
            //* D,T            before C,S,Z              8
            //* X              after C,K,Q               8
            //* --------------------------------------------
            for (; index < word.Length; index++)
            {
                char current = word[index];
                // '\0' stands for "no neighbour" at either end of the word.
                char previous = index > 0 ? word[index - 1] : '\0';
                char next = index + 1 < word.Length ? word[index + 1] : '\0';

                switch (current)
                {
                    case 'a':
                    case 'e':
                    case 'i':
                    case 'o':
                    case 'u':
                        raw.Append('0');
                        break;
                    case 'b':
                    case 'p':
                        raw.Append('1');
                        break;
                    case 'd':
                    case 't':
                        raw.Append(next == 'c' || next == 's' || next == 'z' ? '8' : '2');
                        break;
                    case 'f':
                        raw.Append('3');
                        break;
                    case 'g':
                    case 'k':
                    case 'q':
                        raw.Append('4');
                        break;
                    case 'c':
                        {
                            bool hardC = HardInnerCFollowers.IndexOf(next) >= 0
                                         && previous != 's' && previous != 'z';
                            raw.Append(hardC ? '4' : '8');
                        }
                        break;
                    case 'x':
                        raw.Append(previous == 'c' || previous == 'k' || previous == 'q' ? "8" : "48");
                        break;
                    case 'l':
                        raw.Append('5');
                        break;
                    case 'm':
                    case 'n':
                        raw.Append('6');
                        break;
                    case 'r':
                        raw.Append('7');
                        break;
                    case 's':
                    case 'z':
                        raw.Append('8');
                        break;
                    // 'h', spaces and anything else produce no code.
                }
            }

            // No letter produced a code (e.g. the word was just "h").
            if (raw.Length == 0) { return ""; }

            // Keep the first code as-is, drop every later "0", and collapse runs of
            // identical codes. Comparing against the last code actually kept (rather
            // than against a fixed index) ensures the first consonant code after a
            // leading "0" is not lost.
            StringBuilder result = new StringBuilder(raw.Length);
            result.Append(raw[0]);
            for (int i = 1; i < raw.Length; i++)
            {
                char digit = raw[i];
                if (digit == '0') { continue; }
                if (digit != result[result.Length - 1])
                {
                    result.Append(digit);
                }
            }

            return result.ToString();
        }
    }
}
 
 

·         I’ve converted the above code to .Net from PHP, so please thoroughly test before putting into production. The original PHP script can be found here: [https://github.com/deezaster/germanphonetic/blob/master/x3m_soundex_ger.php]

 
The new X++ required to populate the Soundex Table is as follows:
 
Job to bulk populate Soundex Table (names + addresses)
// Job that rebuilds the German ("de") rows of the Soundex table: it deletes the
// existing "de" rows, then splits every DirPartyTable name and every
// LogisticsPostalAddress address into words and inserts one Soundex row per
// accepted word, with the code returned by Soundex.SoundexClass::createSoundexCodeDE.
static void PopulateSoundexDE(Args _args)
{
    DirPartyTable           objDirPartyTable;
    LogisticsPostalAddress  objLogisticsPostalAddress;
    Soundex                 objSoundex;
    List                    list;
    ListIterator            iterator;
    str                     word, text, code;
    int                     position;
   
    // noise words for entities (DE)
    container noiseWords1 = ["???", "???", "???"];
    // noise word list for addresses (DE)
    container noiseWords2 = ["???", "???", "???"];
 
    // format for a valid word: one upper-case letter followed by one or more
    // lower-case letters. The umlauts, "ß" and the accented letters that
    // createSoundexCodeDE transliterates are accepted too; otherwise words such
    // as "Müller" would never be indexed.
    System.Text.RegularExpressions.Regex objRegex1 = new System.Text.RegularExpressions.Regex("^[A-ZÄÖÜÉÈÊÀÁÂËÇ][a-zäöüßéèêàáâëç]+$");
       
    // clear the German rows of the [Soundex] table
    delete_from objSoundex where objSoundex.LanguageId == "de";
   
    // process entity names
    while select RecId, Name from objDirPartyTable
    {
        text = objDirPartyTable.Name;
       
        // replace any punctuation and formatting with a word separator
        text = strReplace(text, '\n', ' ');
        text = strReplace(text, '-', ' ');
        text = strReplace(text, '/', ' ');
        text = strReplace(text, ',', ' ');
        text = strReplace(text, '.', ' ');
       
        // split text on space
        list = Global::strSplit(text, " ");
        iterator = new ListIterator(list);
        position = 1;
        while(iterator.more())
        {
            word = iterator.value();
            if (objRegex1.IsMatch(word) && conFind(noiseWords1, word) == 0)
            {
                objSoundex.initValue();
                objSoundex.ContextTableId = tableName2id("DirPartyTable");
                objSoundex.ContextRecId = objDirPartyTable.RecId;
                objSoundex.LanguageId = "de";
                objSoundex.Position = position;
                objSoundex.Word = word;
               
                code = Soundex.SoundexClass::createSoundexCodeDE(word);
                // NOTE(review): the code is a digit string that starts with "0" for
                // vowel-initial words; converting it to an integer presumably drops
                // that leading zero - confirm this is acceptable for SoundexCode.
                objSoundex.SoundexCode = str2int(code);
                objSoundex.insert();
            }
            // position counts every token, including rejected ones
            position++;
            iterator.next();
        }
    }
 
    // process entity addresses
    while select RecId, Address from objLogisticsPostalAddress
    {
        text = objLogisticsPostalAddress.Address;
       
        // replace any punctuation and formatting with a word separator
        text = strReplace(text, '\n', ' ');
        text = strReplace(text, '-', ' ');
        text = strReplace(text, '/', ' ');
        text = strReplace(text, ',', ' ');
        text = strReplace(text, '.', ' ');
       
        // split text on space
        list = Global::strSplit(text, " ");
        iterator = new ListIterator(list);
        position = 1;
        while(iterator.more())
        {
            word = iterator.value();
            if (objRegex1.IsMatch(word) && conFind(noiseWords2, word) == 0)
            {
                objSoundex.initValue();
                objSoundex.ContextTableId = tableName2id("LogisticsPostalAddress");
                objSoundex.ContextRecId = objLogisticsPostalAddress.RecId;
                objSoundex.LanguageId = "de";
                objSoundex.Position = position;
                objSoundex.Word = word;
               
                code = Soundex.SoundexClass::createSoundexCodeDE(word);
                // NOTE(review): see the remark on str2int in the name loop - a leading
                // "0" in the code is presumably lost here as well.
                objSoundex.SoundexCode = str2int(code);
                objSoundex.insert();
            }
            // position counts every token, including rejected ones
            position++;
            iterator.next();
        }
    }
}
 
 
 
REGARDS
 
 
 
  • And again, great job, Khalid!

    I think we will use a combination of EN and DE Soundex Algorithm.

    For now I am again looking forward to your next post to use the created data...

    Regards,

    Paul