Newsportal USENET - Re: (Long post) Metaphone Algorithm In AWK

Okay, essentially complete (enough for me at least).

- added sorely needed 'TH' digraph handling

- helper functions refactored

- misc code cleanup

# Metaphone Algorithm In AWK v4: Michael Sanders - 2024
#
# https://en.wikipedia.org/wiki/Metaphone
#
# example invocation: awk -f metaphone.awk -v find=butter < words.txt

BEGIN { find_code = metaphone(find) }

# -----------------------------------------------------------------

# emit metaphone codes only
# { for (x = 1; x <= NF; x++) print $x " : " metaphone($x) }

# tweek levenshtein distance to open/constrain results...
{

for (x = 1; x <= NF; x++)
   if (metaphone($x) == find_code && levenshtein($x, find) <= 2)
print $x

}

# -----------------------------------------------------------------

function metaphone(w, m, c, n, z, i) {
w = toupper(w)
gsub(/[^A-Z]/, "", w) # strip non-alphabetic characters
z = length(w)

# handle initial letters
if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
w = substr(w, 2)
z--
}

for (i = 1; i <= z; i++) {
c = substr(w, i, 1)
n = (i < z) ? substr(w, i + 1, 1) : ""

# skip duplicate letters except for 'C'
if (i > 1 && c == substr(w, i - 1, 1) && c != "C") continue

# handle vowels: retain only if it's 1st letter
if (isvowel(c)) {
if (i == 1) m += c
}
# consonants
else if (c == "B") {
if (!(i == z && substr(w, i - 1, 1) == "M")) m += "B"
}
else if (c == "C") {
if (substr(w, i, 2) == "CH") {
m += "X"
i++
} else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
m += "S"
} else {
m += "K"
}
}
else if (c == "D") {
if (substr(w, i, 2) == "DG" && substr(w, i + 2, 1) ~ /[IEY]/) {
m += "J"
i += 2
} else {
m += "T"
}
}
else if (c == "G") {
if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
i++
} else if (substr(w, i, 2) == "GN" || (i == z && c == "G")) {
continue
} else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
m += "J"
} else {
m += "K"
}
}
else if (c == "H") {
if (i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) {
if (i < z && !isvowel(n)) {
m += "H"
}
}
}
else if (c == "K") {
if (i == 1 || substr(w, i - 1, 1) != "C") m += "K"
}
else if (c == "P") {
if (substr(w, i, 2) == "PH") {
m += "F"
i++
} else {
m += "P"
}
}
else if (c == "Q") {
m += "K"
}
else if (c == "S") {
if (substr(w, i, 2) == "SH") {
m += "X"
i++
} else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
m += "X"
i += 2
} else {
m += "S"
}
}
else if (c == "T") {
if (substr(w, i, 2) == "TH") {
m += "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
i++
} else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
m += "X"
i += 2
} else {
m += "T"
}
}
else if (c == "V") {
m += "F"
}
else if (c == "W" || c == "Y") {
if (i < z && isvowel(n)) m += c
}
else if (c == "X") {
m += "KS"
}
else if (c == "Z") {
m += "S"
}
# ensure 'M', 'N', and 'L' are always retained
else if (c == "M" || c == "N" || c == "L") {
m += c
}
}

return m
}

# -----------------------------------------------------------------

function levenshtein(w1, w2, l1, l2, i, j, cst, diz) {
l1 = length(w1)
l2 = length(w2)

# initialize distance array
for (i = 0; i <= l1; i++) diz[i, 0] = i
for (j = 0; j <= l2; j++) diz[0, j] = j

# compute distance
for (i = 1; i <= l1; i++) {
for (j = 1; j <= l2; j++) {
cst = (substr(w1, i, 1) == substr(w2, j, 1)) ? 0 : 1
diz[i, j] = min3(diz[i-1, j] + 1,    # deletion
   diz[i, j-1] + 1,    # insertion
   diz[i-1, j-1] + cst) # substitution
}
}

return diz[l1, l2]
}

# -----------------------------------------------------------------

# metaphone helper function
function isvowel(char) { return char ~ /[AEIOU]/ }

# -----------------------------------------------------------------

# levenshtein helper function
function min3(a, b, c) {
return (a < b ? (a < c ? a : c) : (b < c ? b : c))
}

# eof

--
:wq
Mike Sanders

Date	Sujet	#	Auteur
17 Aug 24	(Long post) Metaphone Algorithm In AWK	17	Mike Sanders
19 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	10	Ben Bacarisse
19 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Ben Bacarisse
19 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	2	Mike Sanders
19 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders
20 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	6	Mike Sanders
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	5	Ben Bacarisse
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	4	Mike Sanders
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	2	Ben Bacarisse
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders
20 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	3	Mike Sanders
21 Aug 24	AWK language trivia (Was: (Long post) Metaphone Algorithm In AWK)	2	Kenny McCormack
21 Aug 24	Re: AWK language trivia	1	Mike Sanders
21 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders
23 Aug 24	Re: (Long post) Metaphone Algorithm In AWK	1	Mike Sanders