/*
 * GT CLIPPER STANDARD HEADER
 *
 * File......: fonetix.prg
 * Author....: Andy M Leighton
 * BBS.......: The Dark Knight Returns
 * Net/Node..: 050/069
 * User Name.: Andy Leighton
 * Date......: $Date$
 * Revision..: $Revision$
 *
 * This is an original work by Andy Leighton and is placed in the
 * public domain.
 *
 * Modification history:
 * ---------------------
 *
 * $Log$
 *
 */

/*  $DOC$
 *  $FUNCNAME$
 *       GT_FONETIX()
 *  $CATEGORY$
 *       String
 *  $ONELINER$
 *       Make a phonetic match string
 *  $SYNTAX$
 *       GT_Fonetix(<cStr>) --> cFonStr
 *  $ARGUMENTS$
 *      <cStr>    -  The input string
 *  $RETURNS$
 *      cFonStr   -  A phonetic representation of the input string
 *  $DESCRIPTION$
 *      Make a phonetic match string for a passed string.
 *      Only works in english.  Words with foreign roots
 *      may not match very well.  Words with more than one
 *      syllable also translate poorly in some cases.
 *      However this approach seems to match most intelligent
 *      misspellings and most of those made by children.
 *      Note the order in which these rules are applied are
 *      important, nevertheless please feel free to experiment
 *      with reordering or even adding new rules, after all you
 *      know what kind of strings/names are going to be matched.
 *
 *      Rules for producing the phonetic string
 *
 *      1.  Uppercase <cStr>
 *
 *      2.  Replace KN with N                       (Knight)
 *
 *      3.  Replace GN with N                       (Gnome)
 *
 *      4.  Replace WR with R                       (Wright)
 *
 *      5.  Replace WH with H if WH is followed by O
 *
 *      6.  Replace WH with W if WH isn't follwed by O
 *                                                   (Whately)
 *
 *      7.  Replace MC with MK
 *
 *      8.  Replace MAC with MK
 *
 *      9.  Replace EIGH with AY
 *          (Can't have it not matching *my* name :-)
 *
 *     10.  Replace IGHT with ITE                   (Wright)
 *
 *     11.  Replace C with S if followed by E, I or Y
 *
 *     12.  Replace C with K if not followed by E, I, Y, H
 *
 *     13.  Replace D[JG] with J
 *
 *     14.  Replace G with J if followed by E, I, or Y
 *
 *     15.  Replace GH with H
 *
 *     16.  Replace PH with F
 *
 *     17.  Replace Q with KW
 *
 *     18.  Replace TI with SH if it is followed by a vowel and
 *          is not at start of a word
 *
 *     19.  Replace X with KS
 *
 *     20   Replace Y with I if it is not the first or last
 *          character
 *
 *     21.  Replace Z with S
 *
 *     22.  Replace MB with M if MB is at end of string (thanks METAPH.PRG)
 *
 *     23.  Replace double consonants with just one of 'em
 *
 *     24.  Replace AIT with ATE
 *
 *     25.  Replace IE at end of word with Y
 *
 *     26.  Replace LE with L
 *
 *     27.  TERN at end of word gets replaced with TN
 *
 *     28.  EVE is replaced with EFE
 *
 *     29   Replace the schwa sound with "" if succeeded by
 *          R, L, M, N
 *
 *     30.  Replace SCH with SH
 *
 *     31.  ARY, IRY, ORY, ERY all get replaced with RY if
 *          at end of word
 *
 *     32.  Replace OO with U
 *
 *     33.  Replace OI with OY
 *
 *     34.  Remove vowels at end of word
 *
 *     35.  Replace all vowel groups with just the first
 *          vowel in the group (you can try last as well)
 *
 *  $EXAMPLES$
 *     use PERSONS                     // assume a personnel table
 *
 *     index on GT_fonetix(PERSONS->LAST_NAME) to LASTNAME
 *
 *     seek GT_fonetix("Leighton")
 *     ? found(), PERSONS_LAST_NAME    // .T., Leighton
 *
 *     seek GT_fonetix("Layton")
 *     ? found(), PERSONS->LAST_NAME   // .T., Leighton
 *
 *     Also compile with -DTEST
 *  $REFERENCES$
 *     .EXE Magazine Vol 4, Issue 3
 *     Soundex()
 *     METAPH.PRG in nanfor.lib
 *  $END$
 */

#include "gt_LIB.ch"

// a translate to make the code read nicer

#translate REPLACERULE(<cStr>, <cPhoneme>, <cRepl>) =>               ;
                     iif(<cPhoneme> $ <cStr>,                        ;
                              strtran(<cStr>, <cPhoneme>, <cRepl>),  ;
                              <cStr>)

/*
 * TEST HARNESS
 *
 * create a tbrowse of names, and their GT_Fonetix() equivalents
 */

#ifdef TEST
#include "inkey.ch"

static aNames := {}

function main()

   local tb, i, nKey

   cls

   aadd(aNames, "Adams"        )
   aadd(aNames, "Addams"       )
   aadd(aNames, "Smith"        )
   aadd(aNames, "Smythe"       )
   aadd(aNames, "Naylor"       )
   aadd(aNames, "Nailer"       )
   aadd(aNames, "Holberry"     )
   aadd(aNames, "Wholebary"    )
   aadd(aNames, "Jackson"      )
   aadd(aNames, "Jaksun"       )
   aadd(aNames, "Fischer"      )
   aadd(aNames, "Fisher"       )
   aadd(aNames, "Knight"       )
   aadd(aNames, "Nite"         )
   aadd(aNames, "Stephens"     )
   aadd(aNames, "Stevens"      )
   aadd(aNames, "Neilson"      )
   aadd(aNames, "Nelson"       )
   aadd(aNames, "Wright"       )
   aadd(aNames, "Write"        )
   aadd(aNames, "Right"        )
   aadd(aNames, "McLean"       )
   aadd(aNames, "McLane"       )
   aadd(aNames, "Maclean"      )
   aadd(aNames, "Leighton"     )
   aadd(aNames, "Layton"       )
   aadd(aNames, "Whately"      )
   aadd(aNames, "Waitly"       )
   aadd(aNames, "Swaine"       )
   aadd(aNames, "Swane"        )
   aadd(aNames, "Codie"        )
   aadd(aNames, "Cody"         )
   aadd(aNames, "Griffon"      )
   aadd(aNames, "Griphon"      )
   aadd(aNames, "Gryphon"      )
   aadd(aNames, "Pearson"      )
   aadd(aNames, "Peerson"      )
   aadd(aNames, "Peersun"      )
   aadd(aNames, "Chilton"      )
   aadd(aNames, "Chiltern"     )
   aadd(aNames, "Chiltun"      )

   i := 1
   tb := tbrowseNew(1, 1, 23, 78)
   tb:addColumn(tbColumnNew(padc("Name", 25),                           ;
                            {|| padc(aNames[i], 25)}))
   tb:addColumn(tbColumnNew(padc("Phoneme", 25),                        ;
                            {|| padc(GT_FONETIX(aNames[i]), 25)}))
   tb:skipBlock := {|SkipCnt| SkipIt(@i, SkipCnt, len(aNames)) }
   tb:goTopBlock := {|| i := 1}
   tb:goBottomBlock := {|| i := len(aNames)}

   do while lastkey() != K_ESC
      do while nextkey() = 0 .and. !tb:stabilize()
      enddo
      nKey = inkey(0)
      do case
         case nKey = K_DOWN
               tb:down()
         case nKey = K_UP
               tb:up()
         case nKey = K_PGDN
               tb:pagedown()
         case nKey = K_PGUP
               tb:pageup()
         case nKey = K_CTRL_PGUP
               tb:gotop()
         case nKey = K_CTRL_PGDN
               tb:gobottom()
      endcase
   enddo

   cls

return NIL

static function SkipIt(ele, skip_cnt, maxval)

   local movement := 0                 // this will be returned to TBROWSE

   if skip_cnt > 0
      do while ele + movement < maxval .and. movement < skip_cnt
         movement++
      enddo

   elseif skip_cnt < 0
      do while ele + movement > 1 .and. movement > skip_cnt
         movement--
      enddo
   endif
   ele += movement

return movement

#endif

/**/

function GT_fonetix(cStr)

   local cPhone := upper(cStr)
   local nPtr   := 1

   cPhone := REPLACERULE(cPhone, "KN", "N")
   cPhone := REPLACERULE(cPhone, "GN", "N")
   cPhone := REPLACERULE(cPhone, "WR", "R")
   cPhone := REPLACERULE(cPhone, "WHO", "HO")
   cPhone := REPLACERULE(cPhone, "WH", "W")
   cPhone := REPLACERULE(cPhone, "MAC", "MK")
   cPhone := REPLACERULE(cPhone, "MC", "MK")
   cPhone := REPLACERULE(cPhone, "EIGH", "AY")
   cPhone := REPLACERULE(cPhone, "IGHT", "ITE")

   cPhone := REPLACERULE(cPhone, "CE", "S")
   cPhone := REPLACERULE(cPhone, "CI", "S")
   cPhone := REPLACERULE(cPhone, "CY", "S")

/*
 * do not split the following 3 rules on pain of death
 */
   cPhone := REPLACERULE(cPhone, "CH", "||")
   cPhone := REPLACERULE(cPhone, "C",  "K")
   cPhone := REPLACERULE(cPhone, "||", "CH")

   cPhone := REPLACERULE(cPhone, "DG", "J")
   cPhone := REPLACERULE(cPhone, "DJ", "J")

   cPhone := REPLACERULE(cPhone, "GE", "JE")
   cPhone := REPLACERULE(cPhone, "GI", "JY")
   cPhone := REPLACERULE(cPhone, "GY", "JY")

   cPhone := REPLACERULE(cPhone, "GH", "H")

   cPhone := REPLACERULE(cPhone, "PH", "F")
   cPhone := REPLACERULE(cPhone, "Q", "KW")

/*
 * do not split the TI rules on pain of death
 */

   if substr(cPhone, 1, 2) == "TI"
      cPhone := "||" + substr(cPhone, 3)
   endif
   cPhone := REPLACERULE(cPhone, "TION", "SHUN")
   cPhone := REPLACERULE(cPhone, "TIA", "SHA")
   cPhone := REPLACERULE(cPhone, "TIE", "SHE")
   cPhone := REPLACERULE(cPhone, "TII", "SHI")       // ???
   cPhone := REPLACERULE(cPhone, "TIO", "SHO")
   cPhone := REPLACERULE(cPhone, "TIU", "SHU")
   cPhone := REPLACERULE(cPhone, "||", "TI")

   cPhone := REPLACERULE(cPhone, "X", "KS")

/*
 * do not split the Y rules on pain of death
 */
   if substr(cPhone, 1, 1) == "Y"
      cPhone := "|" + substr(cPhone, 2)
   endif
   if substr(cPhone, len(cPhone), 1) == "Y"
      cPhone := substr(cPhone, 1, len(cPhone) - 1) + '|'
   endif
   cPhone := REPLACERULE(cPhone, "Y", "I")
   cPhone := REPLACERULE(cPhone, "|", "Y")

   cPhone := REPLACERULE(cPhone, "Z", "S")

   if substr(cPhone, len(cPhone) - 1, 2) == 'MB'
      cPhone := substr(cPhone, 1, len(cPhone) - 1)
   endif

/*
 * double consonants NOTE no C, Q, X, or Z
 * they have been replaced away already
 */

   cPhone := REPLACERULE(cPhone, "BB", "B")
   cPhone := REPLACERULE(cPhone, "DD", "D")
   cPhone := REPLACERULE(cPhone, "FF", "F")
   cPhone := REPLACERULE(cPhone, "GG", "G")
   cPhone := REPLACERULE(cPhone, "HH", "H")
   cPhone := REPLACERULE(cPhone, "JJ", "J")
   cPhone := REPLACERULE(cPhone, "KK", "K")
   cPhone := REPLACERULE(cPhone, "LL", "L")
   cPhone := REPLACERULE(cPhone, "MM", "M")
   cPhone := REPLACERULE(cPhone, "NN", "N")
   cPhone := REPLACERULE(cPhone, "PP", "P")
   cPhone := REPLACERULE(cPhone, "RR", "R")
   cPhone := REPLACERULE(cPhone, "SS", "S")
   cPhone := REPLACERULE(cPhone, "TT", "T")
   cPhone := REPLACERULE(cPhone, "VV", "V")
   cPhone := REPLACERULE(cPhone, "WW", "W")

   cPhone := REPLACERULE(cPhone, "LE", "L")

   if substr(cPhone, len(cPhone) - 3, 4) == 'TERN'
      cPhone := substr(cPhone, 1, len(cPhone) - 4) + "TN"
   endif
   cPhone := REPLACERULE(cPhone, "EVE", "EFE")

   cPhone := gGT_FoneSchwa(cPhone, "E", "R")
   cPhone := gGT_FoneSchwa(cPhone, "E", "L")
   cPhone := gGT_FoneSchwa(cPhone, "E", "M")
   cPhone := gGT_FoneSchwa(cPhone, "E", "N")
   cPhone := gGT_FoneSchwa(cPhone, "OU", "R")
   cPhone := gGT_FoneSchwa(cPhone, "OU", "L")
   cPhone := gGT_FoneSchwa(cPhone, "OU", "M")
   cPhone := gGT_FoneSchwa(cPhone, "OU", "N")
   cPhone := gGT_FoneSchwa(cPhone, "O", "R")
   cPhone := gGT_FoneSchwa(cPhone, "O", "L")
   cPhone := gGT_FoneSchwa(cPhone, "O", "M")
   cPhone := gGT_FoneSchwa(cPhone, "O", "N")
   cPhone := gGT_FoneSchwa(cPhone, "I", "R")
   cPhone := gGT_FoneSchwa(cPhone, "I", "L")
   cPhone := gGT_FoneSchwa(cPhone, "I", "M")
   cPhone := gGT_FoneSchwa(cPhone, "I", "N")
   cPhone := gGT_FoneSchwa(cPhone, "U", "R")
   cPhone := gGT_FoneSchwa(cPhone, "U", "L")
   cPhone := gGT_FoneSchwa(cPhone, "U", "M")
   cPhone := gGT_FoneSchwa(cPhone, "U", "N")

   cPhone := REPLACERULE(cPhone, "SCH", "SH")

   cPhone := REPLACERULE(cPhone, "OO", "U")
   cPhone := REPLACERULE(cPhone, "OI", "OY")

   if substr(cPhone, len(cPhone) - 1, 2) == 'IE'
      cPhone := substr(cPhone, 1, len(cPhone) - 2) + "Y"
   endif

   if substr(cPhone, len(cPhone) - 1, 2) == 'RY'
      if substr(cPhone, len(cPhone) - 2, 1) $ [AEIOUY]
         cPhone := substr(cPhone, 1, len(cPhone) - 3) + "RY"
      endif
   endif

/*
 * remove trailing vowels
 */
   do while substr(cPhone, len(cPhone), 1) $ [AEIOU]
      cPhone := substr(cPhone, 1, len(cPhone) - 1)
   enddo

   do while nPtr <= len(cPhone)
      if substr(cPhone, nPtr, 1) $ [AEIOU]
         do while substr(cPhone, nPtr + 1, 1) $ [AEIOU]
            cPhone := substr(cPhone, 1, nPtr) + substr(cPhone, nPtr + 2)
         enddo
      endif
      nPtr++
   enddo

return cPhone




/*
 * Internal Function: gGT_FoneSchwa()
 *
 * handle a schwa phoneme.
 *
 * A schwa is the er or uh sound for example the o in carbon.
 *
 */

function gGT_FoneSchwa(cPhone, cSchwa, cFollow)

   local nPos

   do while (cSchwa + cFollow) $ cPhone
      nPos := at(cSchwa + cFollow, cPhone)

      if substr(cPhone, nPos - 1, 1) $ [BDGFJKLMPRSTVW]
         cPhone := substr(cPhone, 1, nPos - 1) +;
                   substr(cPhone, nPos + len(cSchwa))
      else
         exit
      endif
   enddo

return cPhone
