1 | package Text::Soundex;
|
---|
2 | require 5.000;
|
---|
3 | require Exporter;
|
---|
4 |
|
---|
5 | @ISA = qw(Exporter);
|
---|
6 | @EXPORT = qw(&soundex $soundex_nocode);
|
---|
7 |
|
---|
8 | $VERSION = '1.01';
|
---|
9 |
|
---|
10 | # $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $
|
---|
11 | #
|
---|
12 | # Implementation of soundex algorithm as described by Knuth in volume
|
---|
13 | # 3 of The Art of Computer Programming, with ideas stolen from Ian
|
---|
14 | # Phillipps <[email protected]>.
|
---|
15 | #
|
---|
16 | # Mike Stok <[email protected]>, 2 March 1994.
|
---|
17 | #
|
---|
18 | # Knuth's test cases are:
|
---|
19 | #
|
---|
20 | # Euler, Ellery -> E460
|
---|
21 | # Gauss, Ghosh -> G200
|
---|
22 | # Hilbert, Heilbronn -> H416
|
---|
23 | # Knuth, Kant -> K530
|
---|
24 | # Lloyd, Ladd -> L300
|
---|
25 | # Lukasiewicz, Lissajous -> L222
|
---|
26 | #
|
---|
27 | # $Log: soundex.pl,v $
|
---|
28 | # Revision 1.2 1994/03/24 00:30:27 mike
|
---|
29 | # Subtle bug (any excuse :-) spotted by Rich Pinder <[email protected]>
|
---|
30 | # in the way I handles leasing characters which were different but had
|
---|
31 | # the same soundex code. This showed up comparing it with Oracle's
|
---|
32 | # soundex output.
|
---|
33 | #
|
---|
34 | # Revision 1.1 1994/03/02 13:01:30 mike
|
---|
35 | # Initial revision
|
---|
36 | #
|
---|
37 | #
|
---|
38 | ##############################################################################
|
---|
39 |
|
---|
40 | # $soundex_nocode is used to indicate a string doesn't have a soundex
|
---|
41 | # code, I like undef other people may want to set it to 'Z000'.
|
---|
42 |
|
---|
43 | $soundex_nocode = undef;
|
---|
44 |
|
---|
45 | sub soundex
|
---|
46 | {
|
---|
47 | local (@s, $f, $fc, $_) = @_;
|
---|
48 |
|
---|
49 | push @s, '' unless @s; # handle no args as a single empty string
|
---|
50 |
|
---|
51 | foreach (@s)
|
---|
52 | {
|
---|
53 | $_ = uc $_;
|
---|
54 | tr/A-Z//cd;
|
---|
55 |
|
---|
56 | if ($_ eq '')
|
---|
57 | {
|
---|
58 | $_ = $soundex_nocode;
|
---|
59 | }
|
---|
60 | else
|
---|
61 | {
|
---|
62 | ($f) = /^(.)/;
|
---|
63 | tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/;
|
---|
64 | ($fc) = /^(.)/;
|
---|
65 | s/^$fc+//;
|
---|
66 | tr///cs;
|
---|
67 | tr/0//d;
|
---|
68 | $_ = $f . $_ . '000';
|
---|
69 | s/^(.{4}).*/$1/;
|
---|
70 | }
|
---|
71 | }
|
---|
72 |
|
---|
73 | wantarray ? @s : shift @s;
|
---|
74 | }
|
---|
75 |
|
---|
76 | 1;
|
---|
77 |
|
---|
78 | __END__
|
---|
79 |
|
---|
80 | =head1 NAME
|
---|
81 |
|
---|
82 | Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth
|
---|
83 |
|
---|
84 | =head1 SYNOPSIS
|
---|
85 |
|
---|
86 | use Text::Soundex;
|
---|
87 |
|
---|
88 | $code = soundex $string; # get soundex code for a string
|
---|
89 | @codes = soundex @list; # get list of codes for list of strings
|
---|
90 |
|
---|
91 | # set value to be returned for strings without soundex code
|
---|
92 |
|
---|
93 | $soundex_nocode = 'Z000';
|
---|
94 |
|
---|
95 | =head1 DESCRIPTION
|
---|
96 |
|
---|
97 | This module implements the soundex algorithm as described by Donald Knuth
|
---|
98 | in Volume 3 of B<The Art of Computer Programming>. The algorithm is
|
---|
99 | intended to hash words (in particular surnames) into a small space using a
|
---|
100 | simple model which approximates the sound of the word when spoken by an English
|
---|
101 | speaker. Each word is reduced to a four character string, the first
|
---|
102 | character being an upper case letter and the remaining three being digits.
|
---|
103 |
|
---|
104 | If there is no soundex code representation for a string then the value of
|
---|
105 | C<$soundex_nocode> is returned. This is initially set to C<undef>, but
|
---|
106 | many people seem to prefer an I<unlikely> value like C<Z000>
|
---|
107 | (how unlikely this is depends on the data set being dealt with.) Any value
|
---|
108 | can be assigned to C<$soundex_nocode>.
|
---|
109 |
|
---|
110 | In scalar context C<soundex> returns the soundex code of its first
|
---|
111 | argument, and in list context a list is returned in which each element is the
|
---|
112 | soundex code for the corresponding argument passed to C<soundex> e.g.
|
---|
113 |
|
---|
114 | @codes = soundex qw(Mike Stok);
|
---|
115 |
|
---|
116 | leaves C<@codes> containing C<('M200', 'S320')>.
|
---|
117 |
|
---|
118 | =head1 EXAMPLES
|
---|
119 |
|
---|
120 | Knuth's examples of various names and the soundex codes they map to
|
---|
121 | are listed below:
|
---|
122 |
|
---|
123 | Euler, Ellery -> E460
|
---|
124 | Gauss, Ghosh -> G200
|
---|
125 | Hilbert, Heilbronn -> H416
|
---|
126 | Knuth, Kant -> K530
|
---|
127 | Lloyd, Ladd -> L300
|
---|
128 | Lukasiewicz, Lissajous -> L222
|
---|
129 |
|
---|
130 | so:
|
---|
131 |
|
---|
132 | $code = soundex 'Knuth'; # $code contains 'K530'
|
---|
133 | @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
|
---|
134 |
|
---|
135 | =head1 LIMITATIONS
|
---|
136 |
|
---|
137 | As the soundex algorithm was originally used a B<long> time ago in the US
|
---|
138 | it considers only the English alphabet and pronunciation.
|
---|
139 |
|
---|
140 | As it is mapping a large space (arbitrary length strings) onto a small
|
---|
141 | space (single letter plus 3 digits) no inference can be made about the
|
---|
142 | similarity of two strings which end up with the same soundex code. For
|
---|
143 | example, both C<Hilbert> and C<Heilbronn> end up with a soundex code
|
---|
144 | of C<H416>.
|
---|
145 |
|
---|
146 | =head1 AUTHOR
|
---|
147 |
|
---|
148 | This code was implemented by Mike Stok (C<[email protected]>) from the
|
---|
149 | description given by Knuth. Ian Phillipps (C<[email protected]>) and Rich Pinder
|
---|
150 | (C<[email protected]>) supplied ideas and spotted mistakes.
|
---|