Context Navigation

source: trunk/gsdl/perllib/classify/phind.pm@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago
Changes to get phind working under windows
Property svn:keywords set to `Author Date Id Revision`
File size: 39.2 KB

Line
1	###########################################################################
2	#
3	# phind.pm -- the Phind classifier
4	#
5	# Copyright (C) 2000 Gordon W. Paynter
6	# Copyright (C) 2000 New Zealand Digital Library Project
7	#
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# The phind classifier plugin.
30	# Options are described in the print_usage function.
31	# Type "classinfo.pl phind" at the command line for a summary.
32
33	package phind;
34
35	use BasClas;
36	use util;
37	use ghtml;
38	use unicode;
39
# Establish inheritance at compile time: phind is a BasClas classifier.
BEGIN {
    @ISA = qw(BasClas);
}
43
44
# Print a summary of the classifier's options to STDERR.
#
# Takes no arguments. The text is kept in a non-interpolating heredoc so
# that it can be laid out exactly as it should appear on screen.
sub print_usage {
    print STDERR <<'END_OF_USAGE';

  usage: classify phind [options]

  options:
   -text Fields      The text used to build the phrase hierarchy.
                     (default: 'section:Title,section:text')

   -title Title      The metadata field used to describe each document.
                     (default: 'Title')

   -button Name      The label for the classifier screen and button in
                     navigation bar.
                     (default: 'Phrase')

   -language Regex   Language or languages to use building hierarchy.
                     Languages are identified by two-letter country codes
                     like en (English), es (Spanish), and fr (French).
                     Language is a regular expression, so 'en|fr' (English or
                     French) and '..' (match any language) are valid.
                     (default: 'en'.)

   -savephrases File If set, the phrase information will be stored in
                     the given file as text. It is probably a good idea
                     to use an absolute path.
                     (default: not set)

   -suffixmode N     The smode parameter to the phrase extraction program. A
                     value of 0 means that stopwords are ignored, and of 1
                     means that stopwords are used.
                     (default: 1)

   -thesaurus Name   Name of a thesaurus stored in phind format in the
                     collection's etc directory.
                     (default: not set)

   -untidy           Don't remove working files.

END_OF_USAGE
}
84
85
# Suffixes of the index files that are kept when get_classify_info
# tidies up the phind directory; anything else found there is deleted.
%wanted_index_files = map { $_ => 1 }
    qw(td t ti tl tsd idb ib1 ib2 ib3 i il w wa);



# Phrase delimiter symbols - these should be abstracted out someplace

my ($colstart, $colend, $doclimit, $senlimit) =
    ("COLLECTIONSTART", "COLLECTIONEND", "DOCUMENTLIMIT", "SENTENCELIMIT");
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
109
110
111	# Create a new phind browser based on collect.cfg
112
# Create a new phind classifier from the options given in collect.cfg.
#
# Arguments: the class name, followed by the classifier options.
# Returns:   the classifier object, blessed into $class.
#
# Before any options are parsed the constructor verifies that the three
# Phind components (the "suffix" program, the phindcgi CGI program and the
# compiled Java applet) are present. If one is missing, installation
# instructions are printed to STDERR and the process exits with status 1.
# Unparseable options print the usage summary and die.
sub new {
    my $class = shift (@_);
    my $self = BasClas->new($class, @_);

    my $out = $self->{'outhandle'};

    # Matched case-insensitively, as get_classify_info does.
    my $on_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);

    # Phind installation check
    # The phind phrase browser is research software and is not installed
    # by default. If the user attempts to use it we warn them that it's a
    # bit dodgy, then tell them how to install it. If they can do that
    # and get all the files in place, then we let them proceed.

    print $out "Checking Phind phrase browser requirements...\n";

    # Ensure the Phind generate scripts are in place
    my $file1 = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $on_windows;
    my $src = util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");

    _require_phind_component($file1,
        "The phind \"suffix\" program is not installed. ",
        "To install it, change to the directory\n",
        " $src\n",
        "and type \"make install-phind\".\n\n");

    # Ensure the Phind CGI script is in place
    $file1 = util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
    $file1 .= ".exe" if $on_windows;
    $src = util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");

    _require_phind_component($file1,
        "The phind CGI program is not installed. ",
        "To install it, change to the directory\n",
        " $src\n",
        "and type \"make install-phind\".\n\n");

    # Ensure the Phind Java applet is in place
    $src = util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
    $file1 = util::filename_cat($src, "Phind.class");

    _require_phind_component($file1,
        "The phind Java classes are not compiled. ",
        "To compile them, change to the directory\n",
        " $src\n",
        "and use your Java compiler to compile Phind.java.\n",
        "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n");

    # Parse classifier arguments
    #
    # NOTE(review): print_usage describes -savephrases as taking a file
    # name, but the pattern below only accepts a digit. Left unchanged
    # because parsargv's matching rules are not visible here - confirm.
    my $builddir = "";
    my $language;
    if (!parsargv::parse(\@_,
                         q^text/.*/section:Title,section:text^, \$self->{'indexes'},
                         q^title/.*/Title^, \$self->{'titlefield'},
                         q^button/.*/Phrase^, \$self->{'buttonname'},
                         q^language/.*/en^, \$language,
                         q^builddir/.*/^, \$builddir,
                         q^savephrases/\d/0^, \$self->{'savephrases'},
                         q^suffixmode/\d/1^, \$self->{'suffixmode'},
                         q^thesaurus/.*/^, \$self->{'thesaurus'},
                         q^untidy^, \$self->{'untidy'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
        print_usage();
        die "\n";
    }

    # classifier information
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'};

    # limit languages
    $self->{'language_exp'} = $language;

    # collection directories
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
    if (!$builddir) {
        $builddir = util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'builddir'} = $builddir;

    $self->{'total'} = 0;

    return bless $self, $class;
}

# Return quietly if $path exists; otherwise print the given instruction
# strings to STDERR and exit with status 1.
sub _require_phind_component {
    my ($path, @instructions) = @_;

    return if (-e $path);

    print STDERR @instructions;
    exit(1);
}
203
204
205	# Initialise the phind classifier
206
# Initialise the phind classifier.
#
# Creates the first unused "phindN" directory inside the build directory
# and opens the two files filled in by classify(): "clauses" (the text)
# and "docs.txt" (the document details). The directory, its number and
# the two filehandle names are recorded in $self.
#
# Dies if the build directory does not exist or a file cannot be opened.
sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "phind: build directory '$builddir' does not exist\n"
        unless (-e $builddir);

    # create phind directory: phind1, phind2, ... whichever is free first
    my $phnumber = 1;
    my $phinddir = util::filename_cat($builddir, "phind1");
    while (-e $phinddir) {
        $phnumber++;
        $phinddir = util::filename_cat($builddir, "phind$phnumber");
    }
    util::mk_dir($phinddir);
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # open filehandles for documents and text
    #
    # The handles are deliberately kept as symbolic names numbered after
    # the phind directory, since that is the form stored in $self and
    # used by classify() and get_classify_info().
    my $clausefile = util::filename_cat($phinddir, "clauses");
    util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = util::filename_cat($phinddir, "docs.txt");
    util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
241
242
243	# Classify each document.
244	#
245	# Each document is passed here in turn. The classifier extracts the
246	# text of each and stores it in the clauses file. Document details are
247	# stored in the docs.txt file.
248
# Classify one document.
#
# Arguments: $self and the document object to process.
#
# Documents whose "Language" metadata does not match the classifier's
# language expression are skipped. For every other document one
# "<Document>" line (OID and title) is written to the docs.txt handle,
# and a document delimiter followed by the document's tokenised text is
# written to the clauses handle.
#
# Dies if an entry in the index specification is malformed or names an
# unknown level.
sub classify {
    my $self = shift (@_);
    my $doc_obj = shift @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'titlefield'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    #
    # The original trace line tested $self->{'$verbosity'} (a key that
    # never exists) and printed an undefined $file, so it could never
    # produce output. It now reports the title, at the same verbosity
    # as the "process:" line above.
    $self->{'total'} ++;
    print "file $self->{'total'}: $title\n" if ($verbosity > 2);


    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";

    foreach my $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        my ($level, $field) = split(/:/, $part);
        die "phind: malformed index specification '$part' (expected level:field)\n"
            unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            my $data = "";
            my $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Extract a metadata field from a document
        # (If there is more than one element of the given type, get them all.)
        elsif ($level eq "document") {
            my $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
            foreach my $data (@$dataref) {
                $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
            }
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            my $data = "";
            my $section = $doc_obj->get_top_section();
            while (defined($section)) {
                my $dataref = $doc_obj->get_metadata($section, $field);
                $data .= join("\n", @$dataref) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }

    }

    # output the text, with runs of newlines squeezed to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
338
339
340	# Construct the classifier from the information already gathered
341	#
342	# When get_classify_info is called, the clauses and docs.txt files have
343	# already been constructed in the phind directory. This function will
344	# translate them into compressed, indexed MGPP files that can be read by
345	# the phindcgi script. It will also register our classifier so that it
346	# shows up in the navigation bar.
347
# Build the phind indexes and describe the classifier.
#
# Closes the clauses and docs.txt handles, extracts the vocabulary, runs
# the "suffix" program, renumbers the phrases, then runs the mgpp tools
# to build the phrase, word and document databases in the phind
# directory. Unless the "untidy" option is set, every file in that
# directory whose suffix is not listed in %wanted_index_files is deleted.
#
# Returns a reference to the classifier-information hash (type, title,
# parameters and a single "contains" entry carrying the phind URL).
# Dies if the phind directory cannot be read during tidying; a failing
# external command terminates the process inside execute().
sub get_classify_info {
    my $self = shift (@_);

    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # the extra "-d /" argument is passed to the mgpp tools everywhere
    # except on windows
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    }

    if ($verbosity) {
        print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
        print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    renumber_phrases($self);

    print $out "\nCreating phrase databases\n";
    my $mg_input = util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = util::filename_cat($phinddir, "pdata");

    execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
    execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);

    # create the mg index of words
    print $out "\nCreating word-level search indexes\n";
    $mg_input = util::filename_cat($phinddir, "pword.txt");
    $mg_stem = util::filename_cat($phinddir, "pword");

    execute("mgpp_passes $osextra -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
    execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    execute("mgpp_perf_hash_build $osextra -f $mg_stem", $verbosity, $out);
    execute("mgpp_passes $osextra -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
    execute("mgpp_weights_build $osextra -f $mg_stem", $verbosity, $out);
    execute("mgpp_invf_dict $osextra -f $mg_stem", $verbosity, $out);

    execute("mgpp_stem_idx $osextra -f $mg_stem -s 1", $verbosity, $out);
    execute("mgpp_stem_idx $osextra -f $mg_stem -s 2", $verbosity, $out);
    execute("mgpp_stem_idx $osextra -f $mg_stem -s 3", $verbosity, $out);

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    $mg_input = util::filename_cat($phinddir, "docs.txt");
    $mg_stem = util::filename_cat($phinddir, "docs");

    execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
    execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
    execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);

    # Tidy up stray files
    if (!$self->{'untidy'}) {
        print $out "\nCleaning up\n" if ($verbosity > 2);
        opendir (my $dirhandle, $phinddir)
            || die "phind: cannot read directory $phinddir: $!\n";
        my @files = readdir $dirhandle;
        closedir $dirhandle;

        foreach my $file (@files) {
            next if $file =~ /^\.\.?$/;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $out "deleting $file\n"; # if $verbosity > 2;
                util::rm (util::filename_cat ($phinddir, $file));
            }
        }
    }

    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
                        'childtype'=>'Phind',
                        'Title'=>$self->{'buttonname'},
                        'parameters'=>"phindnumber=$self->{'phindnumber'}",
                        'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
449
450
451
# Convert GML/HTML text into "clause format": one clause per line, with
# the words of each clause separated by single spaces.
#
# Arguments: the classifier's language expression and the text.
# Returns:   the converted text.
#
# If the language expression matches /en/ the work is handed to
# convert_gml_to_tokens_EN. Otherwise tags are stripped, the escaped
# markup (&amp; &lt; &gt;) is decoded and stripped again, remaining
# character entities are converted through unicode::ascii2utf8 and
# ghtml::getcharequiv, and every non-word character becomes a clause
# break. The caller's $_ is left untouched.
sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    if ($language_exp =~ /en/) {
        return convert_gml_to_tokens_EN($text);
    }

    # 1. remove GML tags

    # Remove everything that is in a tag
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /isg;
    $text =~ s/\s<br>\s/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Decode them, then remove these tags.
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /isg;
    $text =~ s/\s<br>\s/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    $text =~ s/\s+/ /gs;
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;


    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    $text =~ s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    $text =~ s/\b/ /g;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    $text =~ s/[^\w ]/\n/g;

    # remove extraneous whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\S*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;
}
515
516	# A version of convert_gml_to_tokens that is fine-tuned to the English language.
517
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language.
#
# Argument: the text to convert.
# Returns:  the text in "clause format", one clause per line, with lines
#           of one word or less removed.
#
# Tags are stripped, escaped markup (&lt; &gt; &amp;) is decoded and
# stripped again, a few punctuation cases are handled specially, and
# the remaining punctuation is turned into clause breaks. The caller's
# $_ is left untouched.
sub convert_gml_to_tokens_EN {
    my $text = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;

    # Remove everything that is in a tag
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /isg;
    $text =~ s/\s<br>\s/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Decode them, then remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /isg;
    $text =~ s/\s<br>\s/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    $text =~ s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    $text =~ s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    $text =~ tr/A-Za-z0-9 //cs;

    # save email addresses
    # $text =~ s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    $text =~ s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\w*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;

}
604
605
606
607	# Execute a system command
608
# Execute a system command, terminating the build if it fails.
#
# Arguments: the command line, the verbosity level, and the handle that
#            receives the "Executing:" trace (printed when verbosity > 2).
# Returns:   nothing on success; does not return on failure.
#
# system() returns a wait status, not an exit code: the exit code is in
# the high byte and the terminating signal in the low seven bits. Passing
# that value straight to exit() loses the high byte, so a child that
# failed with code 1 (status 256) would make this process exit with 0.
# The status is therefore decoded first, and the exit code is never 0.
sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    return if ($status == 0);

    # -1 means the command could not be started; only then is $! meaningful
    if ($status == -1) {
        print STDERR "phind - Error executing '$command': $!\n";
        exit(1);
    }

    my $exit_code = $status >> 8;
    my $signal = $status & 127;
    if ($signal) {
        print STDERR "phind - Error executing '$command': terminated by signal $signal\n";
    } else {
        print STDERR "phind - Error executing '$command': exit status $exit_code\n";
    }
    exit($exit_code || 1);
}
619
620
621	# Generate the vocabulary, symbol statistics, and numbers file from the
622	# clauses file. This is legacy code, so is a bit messy and probably wont
623	# run under windows.
624
# Generate the vocabulary, symbol statistics, and numbers files from the
# clauses file.
#
# Argument: the classifier object.
#
# Reads:  stopword files (names ending in "sw") from the language
#         directories under GSDLHOME/etc/stopwords that match the
#         language expression; the thesaurus ".lnk" and term files from
#         the collection's etc directory when a thesaurus is set; and the
#         clauses file in the phind directory.
# Writes, in the phind directory: clauses.vocab (one word per symbol
#         number), clauses.stats (the symbol-number ranges),
#         clauses.numbers (the text as symbol numbers) and, with a
#         thesaurus, "<thesaurus>.numbers".
#
# Symbol numbers are assigned in this order: delimiters, stopwords that
# occur in the text, thesaurus terms, then all other words.
#
# Dies if a thesaurus file is missing or if any file that is read or
# written here cannot be opened. Missing or unreadable stopword
# directories and files are skipped, as they were before.
sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language_exp'};
    my $thesaurus = $self->{'thesaurus'};

    my $clausefile = "$phinddir/clauses";

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (lower-cased word -> word as listed)
    my $stopdir = util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
    my %stopwords;

    # Examine each directory in the stopword directory
    if (opendir(my $stopdh, $stopdir)) {
        foreach my $language (readdir $stopdh) {

            # "." and ".." are never language directories, even though
            # an expression such as '..' matches the latter
            next if ($language =~ /^\.\.?$/);

            # Ignore entries that do not match the classifier's language
            next unless ($language =~ /$language_exp/);
            my $language_dir = util::filename_cat($stopdir, $language);
            next unless (-d $language_dir);

            opendir(my $langdh, $language_dir) or next;
            foreach my $file (readdir $langdh) {

                # Ignore entries that are not stopword files
                next unless ($file =~ /sw$/);
                my $file_name = util::filename_cat($language_dir, $file);
                next unless (-f $file_name);

                # Read the stopwords: the first whitespace-delimited
                # token of each line. The /s flag lets the second
                # substitution remove the line terminator too, even after
                # a carriage return or trailing text.
                open(my $swfh, '<', $file_name) or next;
                while (my $word = <$swfh>) {
                    $word =~ s/^\s+//;
                    $word =~ s/\s.*//s;
                    $stopwords{lc($word)} = $word;
                }
                close $swfh;

            }
            closedir $langdh;
        }
        closedir $stopdh;
    }

    # Read thesaurus information
    my ($thesaurus_links, $thesaurus_terms, %thesaurus);
    if ($thesaurus) {

        # link file exists
        $thesaurus_links = util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
        die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

        # ensure term file exists in the correct language
        my $language = ($language_exp =~ /^([a-z][a-z])/) ? $1 : 'en';
        $thesaurus_terms = util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
        die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");

        # Read the thesaurus terms
        open(my $termfh, '<', $thesaurus_terms)
            or die "Cannot open $thesaurus_terms: $!";
        while (my $entry = <$termfh>) {
            $entry =~ s/^\d+ //;
            $entry =~ s/\(.*\)//;
            foreach my $w (split(/\s+/, $entry)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close $termfh;
    }

    # Read words in the text and count occurrences
    my %freq;
    open(my $txtfh, '<', $clausefile) or die "Cannot open $clausefile: $!";

    while (my $line = <$txtfh>) {
        next unless ($line =~ /./);

        foreach my $w (split(/\s+/, $line)) {
            my $l = lc($w);
            # stopwords and thesaurus terms are counted case-insensitively
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }

    close $txtfh;

    # Calculate the "best" form of each word
    my (%bestform, %totalfreq, %bestfreq);

    foreach my $w (sort (keys %freq)) {
        my $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            # otherwise the most frequent capitalisation wins
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab, %symbol);

    # Delimiters
    my $first_delimiter = 1;

    # NOTE: $word aliases the elements of @delimiters, so this loop
    # lower-cases the file-level list in place, exactly as it always has.
    foreach my $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    my $last_delimiter = $nextsymbol - 1;

    # Stopwords
    my $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    my $last_stopword = $nextsymbol - 1;
    my $first_contentword = $nextsymbol;

    # Thesaurus terms
    my ($first_thesaurusword, $last_thesaurusword);
    if ($thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    my $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    my $last_extractword = $nextsymbol - 1;
    my $last_contentword = $nextsymbol - 1;


    # Output the words
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(my $vocfh, '>', "$phinddir/clauses.vocab")
        or die "Cannot open $phinddir/clauses.vocab: $!";

    for (my $i = 1; $i < $nextsymbol; $i++) {
        my $w = $vocab[$i];

        print $vocfh "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close $vocfh;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(my $statfh, '>', "$phinddir/clauses.stats")
        or die "Cannot open $phinddir/clauses.stats: $!";

    print $statfh "first_delimiter $first_delimiter\n";
    print $statfh "last_delimiter $last_delimiter\n";
    print $statfh "first_stopword $first_stopword\n";
    print $statfh "last_stopword $last_stopword\n";
    if ($thesaurus) {
        print $statfh "first_thesaurusword $first_thesaurusword\n";
        print $statfh "last_thesaurusword $last_thesaurusword\n";
    }
    print $statfh "first_extractword $first_extractword\n";
    print $statfh "last_extractword $last_extractword\n";
    print $statfh "first_contentword $first_contentword\n";
    print $statfh "last_contentword $last_contentword\n";
    print $statfh "first_symbol $first_delimiter\n";
    print $statfh "last_symbol $last_contentword\n";
    print $statfh "first_word $first_stopword\n";
    print $statfh "last_word $last_contentword\n";
    close $statfh;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open($txtfh, '<', $clausefile) or die "Cannot open $clausefile: $!";
    open(my $numfh, '>', "$phinddir/clauses.numbers")
        or die "Cannot open $phinddir/clauses.numbers: $!";

    # the special symbols that delimit the collection and its sentences
    my $phrasedelimiter = $symbol{lc($senlimit)};
    print $numfh "$symbol{lc($colstart)}\n";

    while (my $line = <$txtfh>) {

        # split sentence into a list of tokens
        next unless ($line =~ /./);

        # output one token at a time
        foreach my $word (split(/\s+/, $line)) {
            $word = lc($word);
            print $numfh "$symbol{$word}\n";
        }

        # output phrase delimiter
        print $numfh "$phrasedelimiter\n";
    }

    close $txtfh;
    print $numfh "$symbol{lc($colend)}\n";
    close $numfh;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

        my $thesaurusfile = util::filename_cat($phinddir, "$thesaurus.numbers");


        print $out "Saving thesaurus as numbers in $thesaurusfile\n"
            if ($verbosity > 1);

        # Read the thesaurus terms
        my %thes_symbols;

        open(my $termfh, '<', $thesaurus_terms)
            or die "Cannot open $thesaurus_terms: $!";
        while (my $entry = <$termfh>) {
            chomp $entry;
            my @words = split(/\s+/, $entry);
            my $num = shift @words;
            my $text = "";

            # translate words into symbol numbers
            foreach my $word (@words) {
                $word = lc($word);
                if ($symbol{$word}) {
                    $text .= "s$symbol{$word} ";
                } elsif ($verbosity) {
                    print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
                }
            }
            $text =~ s/ $//;
            $thes_symbols{$num} = $text;
        }
        close $termfh;

        # Read the thesaurus links and write the corresponding data
        open(my $linkfh, '<', $thesaurus_links)
            or die "Cannot open $thesaurus_links: $!";
        open(my $thoutfh, '>', $thesaurusfile)
            or die "Cannot open $thesaurusfile: $!";

        while (my $entry = <$linkfh>) {
            chomp $entry;
            my ($num, $text) = split(/:/, $entry);

            if (defined($thes_symbols{$num})) {
                print $thoutfh "$num:$thes_symbols{$num}:$text\n";
            } else {
                print $thoutfh "$num:untranslated:$text\n";
            }
        }
        close $linkfh;
        close $thoutfh;
    }

}
961
962
963	# renumber_phrases
964	#
965	# Prepare the phrases file to be input to mgpp. The biggest problem is
966	# reconciling the phrase identifiers used by the suffix program (which
967	# we'll call suffix-id numbers) with the numbers used in the thesaurus
968	# (thesaurus-id) to create a common set of phind id numbers (phind-id).
969	# Phind-id numbers must be sorted by frequency of occurrence.
970	#
971	# Start creating a set of phind-id numbers from the sorted suffix-id
972	# numbers and (if required) the thesaurus-id numbers. Then add any other
973	# phrases occuring in the thesaurus.
974	#
975	# The last thing we have to do is restore the vocabulary information to the
976	# phrase file so that the phrases are stored as words, not as symbol
977	# numbers.
978
979	# The original phrases file looks something like this:
980	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
981	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
982	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
983	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
984
985
# Run the three phrase-file translation passes, in order, for the given
# classifier object. The result of the final pass is passed back.
sub renumber_phrases {
    my $self = $_[0];

    # phrases -> phrases.2 (phind keys instead of suffix keys)
    renumber_suffix_data($self);

    # phrases.2 -> phrases.3 (thesaurus data added if available)
    renumber_thesaurus_data($self);

    # put the vocabulary back so phrases are stored as words
    return restore_vocabulary_data($self);
}
994
995
996
997	# renumber_suffix_data
998	#
999	# Translate phrases file to phrases.2 using phind keys instead
1000	# of suffix keys and sorting the expansion data.
1001
# renumber_suffix_data
#
# Translate the "phrases" file to "phrases.2", using phind keys instead
# of suffix keys and sorting the expansion and document data.
#
# Argument: the classifier object.
#
# The input is read twice. The first pass gives each phrase a phind-id
# (its 1-based position in the file) and records its total frequency.
# The second pass rewrites every line with the phrase and its expansions
# translated to phind-ids, the expansions sorted by descending total
# frequency, and the documents sorted with by_doc_frequency.
#
# Dies if either file cannot be opened, or if a line refers to a
# suffix-id that the first pass did not see.
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
        if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = util::filename_cat($phinddir, 'phrases');
    my $outfile = util::filename_cat($phinddir, 'phrases.2');

    # Read the phrase file. Calculate initial set of phind-id
    # numbers and store (suffixid -> frequency) relation.

    my %suffixtophind;
    my @totalfrequency;    # phind-id -> total frequency of the phrase
    my $nextphind = 1;

    open(my $in, '<', $infile) or die "Cannot open $infile: $!";
    while (my $line = <$in>) {

        chomp $line;
        my @fields = split(/:/, $line);

        # get next suffixid and phindid
        my $suffixid = shift @fields;
        $suffixtophind{$suffixid} = $nextphind;

        # store total frequency (the field after the phrase symbols)
        shift @fields;
        $totalfrequency[$nextphind] = shift @fields;

        $nextphind++;
    }
    close $in;


    # Translate phrases file to phrases.2. Use phind keys (not suffix
    # keys), sort expansion and document occurrence data in order of
    # descending frequency.
    open($in, '<', $infile) or die "Cannot open $infile: $!";
    open(my $outfh, '>', $outfile) or die "Cannot open $outfile: $!";

    my $linenumber = 0;

    while (my $line = <$in>) {

        # read the line
        chomp $line;
        my @fields = split(/:/, $line);

        # get a phrase number for this line
        my $suffixid = shift @fields;
        die "ERROR - no phindid for: $suffixid"
            unless (defined($suffixtophind{$suffixid}));
        my $phindid = $suffixtophind{$suffixid};

        # get the symbols in the phrase
        my $text = shift @fields;

        # output status information
        $linenumber++;
        if ($verbosity > 2) {
            if ($linenumber % 1000 == 0) {
                print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
            }
            # this line used to print two variables that were never set
            print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
        }

        # get the phrase frequency
        my $tf = shift @fields;

        # get the number of expansions
        my $countexp = shift @fields;

        # get the expansions, convert them into phind-id numbers, and sort them
        my $expansions = shift @fields;
        my @newexp = ();
        foreach my $k (split(/,/, $expansions)) {
            die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
            push @newexp, $suffixtophind{$k};
        }
        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

        # get the number of documents
        my $countdocs = shift @fields;

        # get the documents and sort them
        my $documents = shift @fields;
        $documents =~ s/d//g;
        my @documents = split(/;/, $documents);
        @documents = sort by_doc_frequency @documents;

        # output the phrase data
        print $outfh "$phindid:$text:$tf:$countexp:$countdocs:";
        print $outfh join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close $in;
    close $outfh;
}
1108
1109
1110	# renumber_thesaurus_data
1111	#
1112	# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1113
sub renumber_thesaurus_data {
    my ($self) = @_;

    # Translate <phinddir>/phrases.2 into <phinddir>/phrases.3.
    #
    # When $self->{'thesaurus'} is false, phrases.2 is simply moved to
    # phrases.3.  Otherwise <phinddir>/<thesaurus>.numbers is read (lines
    # of the form thesid:symbols:relations), every thesaurus-id is mapped
    # to a phind-id, the translated thesaurus is written to
    # <phinddir>/<thesaurus>.phindid, and each phrase line gains
    # ":linkcount:relations" when the phrase is a thesaurus entry.
    # Thesaurus entries that are not phrases are appended as new lines
    # with zero frequencies.
    #
    # Dies if a file cannot be opened, if an output file cannot be
    # closed, or if the thesaurus data is inconsistent.

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
        print $out "Translate phrases.2: no thesaurus data\n"
            if ($verbosity);
        &util::mv($infile, $outfile);
        return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
        if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    open(TH, "<$thesaurusfile")
        || die "Cannot open $thesaurusfile: $!";

    while (<TH>) {

        chomp;
        @fields = split(/:/, $_);

        # get id and text
        $thesid = shift @fields;
        $symbols = shift @fields;
        $symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid.  Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);
    my $maxphindid = 0;         # largest phind-id seen in phrases.2

    open(IN, "<$infile")
        || die "Cannot open $infile: $!";

    while (<IN>) {

        chomp;
        @fields = split(/:/, $_);

        # phindid, symbols and frequency for this line
        $phindid = shift @fields;
        $symbols = shift @fields;
        $freq = shift @fields;

        # Track the largest id rather than trusting the last line, so a
        # blank or out-of-order line cannot cause an id collision in step 3.
        $maxphindid = $phindid
            if (defined($phindid) && $phindid > $maxphindid);

        # do we have a thesaurus id corresponding to this phrase?
        if (defined($symbolstothesid{$symbols})) {
            $thesid = $symbolstothesid{$symbols};
            $thesaurustophindid{$thesid} = $phindid;
            $phindidtofrequency{$phindid} = $freq;
        }
    }
    close IN;

    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    my $nextphindid = $maxphindid + 1;

    open(TH, "<$thesaurusfile")
        || die "Cannot open $thesaurusfile: $!";
    while (<TH>) {

        chomp;
        @fields = split(/:/, $_);

        # read thesaurus-id and ensure it has a corresponding phind-id
        $thesid = shift @fields;
        if (!defined($thesaurustophindid{$thesid})) {
            $thesaurustophindid{$thesid} = $nextphindid;
            $phindidtofrequency{$nextphindid} = 0;
            $nextphindid++;
        }
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile")
        || die "Cannot open $thesaurusfile: $!";
    open(TO, ">$newthesaurusfile")
        || die "Cannot open $newthesaurusfile: $!";
    while (<TH>) {

        chomp;
        @fields = split(/:/, $_);

        # thesaurus-id, symbols and relations for this line
        ($thesid, $symbols, $relations) = @fields;

        die "ERROR - malformed line in $thesaurusfile: $_"
            unless ($thesid && $symbols);
        die "ERROR - no phindid for thesaurus entry: $thesid"
            unless $thesaurustophindid{$thesid};
        $phindid = $thesaurustophindid{$thesid};

        # an entry without relations simply has no links
        $relations = "" unless (defined($relations));

        # convert each part of the relation string to use phind-id numbers
        # at the same time, we want to sort the list by frequency.
        %linkidtotype = ();

        foreach $linktext (split(/;/, $relations)) {
            @linkdata = split(/,/, $linktext);

            # remember the linktype (e.g. BT, NT)
            $linktype = shift @linkdata;

            # store the type of each link
            foreach $thesid (@linkdata) {
                die "ERROR - no phindid for linked thesaurus entry: $thesid"
                    unless (defined($thesaurustophindid{$thesid}));
                $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
            }
        }

        # sort the list of links, first by frequency, then by type.
        @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
                            or ($linkidtotype{$a} cmp $linkidtotype{$b}) }
                      (keys %linkidtotype);
        $linkcounter = (scalar @links);

        # create a string describing the link information: the type is
        # emitted each time it changes, followed by the ids of that run
        $linktype = (@links) ? $linkidtotype{$links[0]} : "";
        $newrelation = $linktype;
        foreach $linkid (@links) {
            if ($linkidtotype{$linkid} ne $linktype) {
                $linktype = $linkidtotype{$linkid};
                $newrelation .= ";" . $linktype;
            }
            $newrelation .= "," . $linkid;
        }
        $newrelation .= ";";


        # output the new line
        print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close(TO)
        || die "Cannot close $newthesaurusfile: $!";

    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile")
        || die "Cannot open $newthesaurusfile: $!";
    while (<TH>) {
        chomp;
        ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
        die "ERROR - malformed line in $newthesaurusfile: $_"
            unless ($phindid && $symbols);
        $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    open(IN, "<$infile")
        || die "Cannot open $infile: $!";
    open(OUT, ">$outfile")
        || die "Cannot open $outfile: $!";

    # Update existing phrases
    while (<IN>) {

        chomp;
        @fields = split(/:/, $_);

        # get data for this line
        $phindid = shift @fields;

        # output the phrase data, with thesaurus information
        print OUT "$phindid:", join(":", @fields);

        # add thesaurus data
        if (defined($thesaurusdata{$phindid})) {
            @fields = split(/:/, $thesaurusdata{$phindid});
            shift @fields;      # discard the symbols
            $linkcounter = shift @fields;
            $relations = shift @fields;

            print OUT ":$linkcounter:$relations";

            # mark this entry as written so it is skipped below
            $thesaurusdata{$phindid} = "";
        }
        print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file
    foreach $phindid (sort numerically keys %thesaurusdata) {
        next unless ($thesaurusdata{$phindid});

        @fields = split(/:/, $thesaurusdata{$phindid});
        $symbols = shift @fields;
        $linkcounter = shift @fields;
        $relations = shift @fields;

        print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close(OUT)
        || die "Cannot close $outfile: $!";

}
1337
1338	# restore_vocabulary_data
1339	#
1340	# Read phrases.3 and restore vocabulary information. Then write
1341	# this data to the MGPP input files (pword.txt and pdata.txt) and
1342	# (if requested) to the saved phrases file.
1343
sub restore_vocabulary_data {
    my ($self) = @_;

    # Read <phinddir>/phrases.3, replace each "s<number>" symbol in the
    # phrase text with line <number> of <phinddir>/clauses.vocab, and
    # write the result to pdata.txt (phrase records) and pword.txt (word
    # index records).  When $self->{'savephrases'} is set, a
    # tab-separated summary of each phrase is also written to that file.
    #
    # Dies if a file cannot be opened or an output file cannot be closed.

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file; symbol numbers start at 1
    open(V, "<$vocabfile")
        || die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while (<V>) {
        chomp;
        $symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $isThesaurus, $line);
    my $word = "";
    my @fields;

    open(IN, "<$infile")
        || die "Cannot open $infile: $!";
    open(DATA, ">$datafile")
        || die "Cannot open $datafile: $!";
    open(WORD, ">$wordfile")
        || die "Cannot open $wordfile: $!";

    # Save the phrases in a separate text file
    if ($savephrases) {
        print $out "Saving phrases in $savephrases\n" if ($verbosity);
        open(SAVE, ">$savephrases")
            || die "Cannot open $savephrases: $!";
    }

    while (<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;

        # restore the text of the phrase
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;

        # Only single-word phrases go into the word index.
        # NOTE(review): when $text is 'untranslated', $word keeps its
        # value from the previous line -- confirm this is intended.
        if ($text =~ / /) {
            $word = "";
        } elsif ($text ne 'untranslated') {
            $word = $text;
        }

        # output the phrase data
        print DATA "<Document>";
        print DATA "$key:$text:", join(":", @fields), ":\n";

        # output the word index search data
        print WORD "<Document>$word\n";

        # output the phrases to a text file
        if ($savephrases) {
            # seven remaining fields means the line carries the extra
            # link-count and relations fields added with thesaurus data
            if ((scalar @fields) == 7) {
                $isThesaurus = 1;
            } else {
                $isThesaurus = 0;
            }
            print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
        }
    }
    close IN;
    close(WORD)
        || die "Cannot close $wordfile: $!";
    close(DATA)
        || die "Cannot close $datafile: $!";
    if ($savephrases) {
        close(SAVE)
            || die "Cannot close $savephrases: $!";
    }

}
1429
1430
1431
1432	# sort routines used to renumber phrases
1433
# Sort comparator: ascending numeric order of the sort variables $a and $b.
sub numerically {
    return $a <=> $b;
}
1435
# Sort comparator: descending order of the frequency attached to each of
# the sort variables $a and $b.  For an entry containing a comma, the
# frequency is what remains after removing the first "<digits>," run;
# an entry without a comma counts as frequency 1.
sub by_doc_frequency {
    my ($freq_a, $freq_b) = map {
        my $entry = $_;
        my $freq = 1;
        if ($entry =~ /,/) {
            ($freq = $entry) =~ s/\d+,//;
        }
        $freq;
    } ($a, $b);

    return ($freq_b <=> $freq_a);
}
1450
1451	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: