source: trunk/gsdl/perllib/classify/Phind.pm@ 3472

Last change on this file since 3472 was 3472, checked in by kjdon, 22 years ago

renamed phind.pm to Phind.pm in keeping with the names of the other classifiers

  • Property svn:keywords set to Author Date Id Revision
File size: 39.7 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl Phind" at the command line for a summary.
32
33package Phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
55
56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
66
67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
84sub print_usage {
85 print STDERR "
86 usage: classify Phind [options]
87
88 options:
89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
91
92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in the
96 navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program. A
112 value of 0 means that stopwords are ignored, and a value of 1
113 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
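# For illustration, a Phind classifier of the kind described above might be
# requested from a collection's collect.cfg with a line such as the following
# (the field and option values are only examples, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title -button Phrase -language en
#
# Running "classinfo.pl Phind" prints the option summary shown above.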
123
124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
131
132
133# Create a new Phind browser based on collect.cfg
134
135sub new {
136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
138
139 my $out = $self->{'outhandle'};
140
141 # Ensure the Phind generate scripts are in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
147 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
148 exit(1);
149 }
150
151 # Parse classifier arguments
152 my $builddir = "";
153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
157 q^language/.*/en^, \$language,
158 q^builddir/.*/^, \$builddir,
159 q^savephrases/.*/^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
168 }
169
170 # classifier information
171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173 # limit languages
174 $self->{'language_exp'} = $language;
175
176 # collection directories
177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180 }
181 $self->{'builddir'} = $builddir;
182
183 $self->{'total'} = 0;
184
185 return bless $self, $class;
186}
187
188
189# Initialise the Phind classifier
190
191sub init {
192 my $self = shift (@_);
193
194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create Phind directory
199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
204 }
205 &util::mk_dir("$phinddir");
206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
208
209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
211 # open filehandles for documents and text
212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
213 &util::rm($clausefile) if (-e $clausefile);
214
215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
220 &util::rm($docfile) if (-e $docfile);
221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
225
226}
227
228
229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
234
235sub classify {
236 my $self = shift (@_);
237 my $doc_obj = shift @_;
238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
245 print "process: $title\n" if ($verbosity > 2);
246
247 # Only consider the file if it is in the correct language
248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($self->{'verbosity'});
255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
272 my ($part, $level, $field, $section, $data, $dataref);
273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
279
280 # Extract the text from every section
281 # (In Phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
290 }
291
292 # Extract a metadata field from a document
293 # (If there is more than one element of the given type, get them all.)
294 elsif ($level eq "document") {
295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
296 foreach $data (@$dataref) {
297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
298 }
299 }
300
301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in Phind index ($part)\n";
316 }
317
318 }
319
320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
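# For illustration, each call to classify() above adds one tab-separated
# record to docs.txt of the form
#
#   <Document>\tOID\tTitle        e.g.  <Document>\tHASHxxxxxxxx\tFarming snails
#
# (the OID and title shown are hypothetical), and appends the document's text
# to the clauses file as a DOCUMENTLIMIT line followed by roughly one clause
# per line.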
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
329# already been constructed in the phind directory. This function will
330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
332# shows up in the navigation bar.
333
334sub get_classify_info {
335 my $self = shift (@_);
336
337 close $self->{'dochandle'};
338 close $self->{'txthandle'};
339 my $verbosity = $self->{'verbosity'};
340 my $out = $self->{'outhandle'};
341 my $phinddir = $self->{'phinddir'};
342
343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
348 if ($verbosity) {
349 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
350 print $out "*** in ", $self->{'phinddir'}, "\n";
351 }
352
353 # Construct Phind indexes
354 my $suffixmode = $self->{'suffixmode'};
355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
360 &extract_vocabulary($self);
361
362 # Use the suffix program to generate the phind/phrases file
363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
365
366
367 # check that we generated some files. It's not necessarily an error if
368 # we didn't (execute() would have quit on error), but we can't go on.
369 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
370 if (! -r $phrasesfile) {
371 print $out "\nNo phrases found for Phind classifier!\n";
372 return;
373 }
374
375 # Create the phrase file and put phrase numbers in phind/phrases
376 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
377 &renumber_phrases($self);
378
379 print $out "\nCreating phrase databases\n";
380 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
381 my $mg_stem = &util::filename_cat($phinddir, "pdata");
382
383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
386
387 # create the mg index of words
388 print $out "\nCreating word-level search indexes\n";
389 $mg_input = &util::filename_cat($phinddir, "pword.txt");
390 $mg_stem = &util::filename_cat($phinddir, "pword");
391
392 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
393 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
394 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
395 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
396 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
397 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
398
399 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
400 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
401 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
402
403 # create the mg document information database
404 print $out "\nCreating document information databases\n";
405 $mg_input = &util::filename_cat($phinddir, "docs.txt");
406 $mg_stem = &util::filename_cat($phinddir, "docs");
407
408 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
409 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
411
412 # Return the information about the classifier that we'll later want to
413 # use to create macros when the Phind classifier document is displayed.
414 my %classifyinfo = ('thistype'=>'Invisible',
415 'childtype'=>'Phind',
416 'Title'=>$self->{'buttonname'},
417 'parameters'=>"phindnumber=$self->{'phindnumber'}",
418 'contains'=>[]);
419
420 my $collection = $self->{'collection'};
421 my $url = "library?a=p&p=phind&c=$collection";
422 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
423
424 return \%classifyinfo;
425}
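# For illustration, after get_classify_info() has run, the phind working
# directory (e.g. building/phind1) can be expected to hold the intermediate
# files produced above -- clauses, clauses.vocab, clauses.stats,
# clauses.numbers, phrases, phrases.2, phrases.3, docs.txt, pdata.txt and
# pword.txt -- together with the mgpp index files built on the docs, pdata
# and pword stems (files whose suffixes appear in %wanted_index_files, such
# as pword.i or docs.td). Unless -untidy was given, the END block at the top
# of this file later deletes everything except those index files.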
426
427
428
429sub convert_gml_to_tokens {
430
431 my ($language_exp, $text) = @_;
432
433 # escape any magic words... - jrm21
434 foreach my $delim (@delimiters) {
435 my $replacement=lc($delim);
436 my $num= $text=~ s/$delim/$replacement/g;
437 if (!$num) {$num=0;}
438 }
439
440 if ($language_exp =~ /en/) {
441 return &convert_gml_to_tokens_EN($text);
442 }
443 if ($language_exp =~ /zh/) {
444 return &convert_gml_to_tokens_ZH($text);
445 }
446
447 $_ = $text;
448
449 # 1. remove GML tags
450
451 # Remove everything that is in a tag
452 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
453 s/\s*<br>\s*/ LINEBREAK /isgo;
454 s/<[^>]*>/ /sgo;
455
456 # Now we have the text, but it may contain HTML
457 # elements coded as &gt; etc. Remove these tags.
458 s/&amp;/&/sgo;
459 s/&lt;/</sgo;
460 s/&gt;/>/sgo;
461 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
462 s/\s*<br>\s*/ LINEBREAK /isgo;
463 s/<[^>]*>/ /sgo;
464
465 # replace <p> and <br> placeholders with clause break symbol (\n)
466 s/\s+/ /gso;
467 s/PARAGRAPHBREAK/\n/sgo;
468 s/LINEBREAK/\n/sgo;
469
470
471
472
473 # 2. Split the remaining text into space-delimited tokens
474
475 # Convert entities to their UTF8 equivalents
476 s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
477
478 # Split text at word boundaries
479 s/\b/ /go;
480
481 # 3. Convert the remaining text to "clause format"
482
483 # Insert newline if the end of a sentence is detected
484 # (delimiter is: "[\.\?\!]\s")
485 # s/\s*[\.\?\!]\s+/\n/go;
486
487 # remove unnecessary punctuation and replace with clause break symbol (\n)
488 s/[^\w ]/\n/go;
489
490 # remove extraneous whitespace
491 s/ +/ /sgo;
492 s/^\s+//mgo;
493 s/\s*$/\n/mgo;
494
495 # remove lines that contain one word or less
496 s/^\S*$//mgo;
497 s/^\s*$//mgo;
498 tr/\n//s;
499
500 return $_;
501}
502
503# A Chinese version of convert_gml_to_tokens.
504sub convert_gml_to_tokens_ZH {
505
506 $_ = shift @_;
507
508 # Replace all whitespace with a simple space
509 s/\s+/ /gs;
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Remove these tags.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace <p> and <br> placeholders with carriage returns
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536# print STDERR "text:$_\n";
537 return $_;
538}
539# A version of convert_gml_to_tokens that is fine-tuned to the English language.
540
541sub convert_gml_to_tokens_EN {
542 $_ = shift @_;
543
544 # FIRST, remove GML tags
545
546 # Replace all whitespace with a simple space
547 s/\s+/ /gs;
548
549 # Remove everything that is in a tag
550 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
551 s/\s*<br>\s*/ LINEBREAK /isg;
552 s/<[^>]*>/ /sg;
553
554 # Now we have the text, but it may contain HTML
555 # elements coded as &gt; etc. Remove these tags.
556 s/&lt;/</sg;
557 s/&gt;/>/sg;
558
559 s/\s+/ /sg;
560 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
561 s/\s*<br>\s*/ LINEBREAK /isg;
562 s/<[^>]*>/ /sg;
563
564 # remove &amp; and other miscellaneous markup tags
565 s/&amp;/&/sg;
566 s/&lt;/</sg;
567 s/&gt;/>/sg;
568 s/&amp;/&/sg;
569
570 # replace <p> and <br> placeholders with carriage returns
571 s/PARAGRAPHBREAK/\n/sg;
572 s/LINEBREAK/\n/sg;
573
574
575 # Exceptional punctuation
576 #
577 # We make special cases of some punctuation
578
579 # remove any apostrophe that indicates omitted letters
580 s/(\w+)\'(\w*\s)/ $1$2 /g;
581
583 # remove periods that appear in a person's initials
583 s/\s([A-Z])\./ $1 /g;
584
585 # replace hyphens in hyphenated words and names with a space
586 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
587
588 # Convert the remaining text to "clause format".
589 # This means removing all excess punctuation and garbage text,
590 # normalising valid punctuation to full stops and commas,
591 # then putting one clause on each line.
592
593 # Insert newline when the end of a sentence is detected
594 # (delimiter is: "[\.\?\!]\s")
595 s/\s*[\.\?\!]\s+/\n/g;
596
597 # split numbers after four digits
598 s/(\d\d\d\d)/$1 /g;
599
600 # split words after 32 characters
601
602 # squash repeated punctuation
603 tr/A-Za-z0-9 //cs;
604
605 # save email addresses
606 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
607
608 # normalise clause breaks (mostly punctuation symbols) to commas
609 s/[^A-Za-z0-9 \n]+/ , /g;
610
611 # Remove repeated commas, and replace with newline
612 s/\s*,[, ]+/\n/g;
613
614 # remove extra whitespace
615 s/ +/ /sg;
616 s/^\s+//mg;
617 s/\s*$/\n/mg;
618
619 # remove lines that contain one word or less
620 s/^\w*$//mg;
621 s/^\s*$//mg;
622 tr/\n//s;
623
624 return $_;
625
626}
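# For illustration, a fragment such as
#
#   <p>The quick brown fox jumps over the lazy dog.  It was over 1234 metres away!</p>
#
# is reduced by the substitutions above to roughly this clause output
# (markup and punctuation removed, one clause per line):
#
#   The quick brown fox jumps over the lazy dog
#   It was over 1234 metres away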
627
628
629
630# Execute a system command
631
632sub execute {
633 my ($command, $verbosity, $outhandle) = @_;
634 print $outhandle "Executing: $command\n" if ($verbosity > 2);
635 $! = 0;
636 my $status = system($command);
637 if ($status != 0) {
638 print STDERR "Phind - Error executing '$command': $!\n";
639 exit($status); # this causes the build to fail...
640 }
641}
642
643
644# Generate the vocabulary, symbol statistics, and numbers file from the
645# clauses file. This is legacy code, so it is a bit messy and probably won't
646# run under Windows.
647
648sub extract_vocabulary {
649 my ($self) = @_;
650
651 my $verbosity = $self->{'verbosity'};
652 my $out = $self->{'outhandle'};
653
654 my $collectiondir = $self->{'collectiondir'};
655 my $phinddir = $self->{'phinddir'};
656
657 my $language_exp = $self->{'language_exp'};
658
659 my ($w, $l, $line, $word);
660
661 my ($first_delimiter, $last_delimiter,
662 $first_stopword, $last_stopword,
663 $first_extractword, $last_extractword,
664 $first_contentword, $last_contentword,
665 $phrasedelimiter);
666
667 my $thesaurus = $self->{'thesaurus'};
668 my ($thesaurus_links, $thesaurus_terms,
669 %thesaurus, $first_thesaurusword, $last_thesaurusword);
670
671 my %symbol;
672 my (%freq);
673
674 print $out "Calculating vocabulary\n" if ($verbosity > 1);
675
676 # Read and store the stopwords
677 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
678 my $stopword_files = ();
679 my ($language, $language_dir, $file, $file_name);
680 my %stopwords;
681
682 # Examine each directory in the stopword directory
683 opendir(STOPDIR, $stopdir);
684 foreach $language (readdir STOPDIR) {
685
686 # Ignore entries that do not match the classifier's language
687 next unless ($language =~ /$language_exp/);
688 $language_dir = &util::filename_cat($stopdir, $language);
689 next unless (-d "$language_dir");
690
691 opendir(LANGDIR, $language_dir);
692 foreach $file (readdir LANGDIR) {
693
694 # Ignore entries that are not stopword files
695 next unless ($file =~ /sw$/);
696 $file_name = &util::filename_cat($language_dir, $file);
697 next unless (-f "$file_name");
698
699 # Read the stopwords
700 open(STOPFILE, "<$file_name");
701 while (<STOPFILE>) {
702 s/^\s+//;
703 s/\s.*//;
704 $word = $_;
705 $l = lc($word);
706 $stopwords{$l} = $word;
707 }
708 close STOPFILE;
709
710 }
711 closedir LANGDIR;
712 }
713 closedir STOPDIR;
714
715 # Read thesaurus information
716 if ($thesaurus) {
717
718 # link file exists
719 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
720 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
721
722 # ensure term file exists in the correct language
723 if ($language_exp =~ /^([a-z][a-z])/) {
724 $language = $1;
725 } else {
726 $language = 'en';
727 }
728 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
729 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
730
731
732 # Read the thesaurus terms
733 open(TH, "<$thesaurus_terms");
734 while(<TH>) {
735 s/^\d+ //;
736 s/\(.*\)//;
737 foreach $w (split(/\s+/, $_)) {
738 $thesaurus{lc($w)} = $w;
739 }
740 }
741 close TH;
742 }
743
744 # Read words in the text and count occurrences
745 open(TXT, "<$phinddir/clauses");
746
747 my @words;
748 while(<TXT>) {
749 $line = $_;
750 next unless ($line =~ /./);
751
752 @words = split(/\s+/, $line);
753 foreach $w (@words) {
754 $l = lc($w);
755 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
756 $freq{$w}++;
757 }
758 $freq{$senlimit}++;
759 }
760
761 close TXT;
762
763 # Calculate the "best" form of each word
764 my (%bestform, %totalfreq, %bestfreq);
765
766 foreach $w (sort (keys %freq)) {
767 $l = lc($w);
768
769 # totalfreq is the number of times a term appears in any form
770 $totalfreq{$l} += $freq{$w};
771
772 if (defined $stopwords{$l}) {
773 $bestform{$l} = $stopwords{$l};
774
775 } elsif (defined $thesaurus{$l}) {
776 $bestform{$l} = $thesaurus{$l};
777
778 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
779 $bestfreq{$l} = $freq{$w};
780 $bestform{$l} = $w;
781 }
782 }
783 undef %freq;
784 undef %bestfreq;
785
786
787 # Assign symbol numbers to tokens
788 my $nextsymbol = 1;
789 my (@vocab);
790
791 # Delimiters
792 $first_delimiter = 1;
793
794 foreach $word (@delimiters) {
795
796# $word = lc($word); # jrm21
797 $word = uc($word);
798 $bestform{$word} = $word;
799 $vocab[$nextsymbol] = $word;
800 $symbol{$word} = $nextsymbol;
801 $nextsymbol++;
802 }
803 $last_delimiter = $nextsymbol - 1;
804 # Stopwords
805 $first_stopword = $nextsymbol;
806
807 foreach my $word (sort keys %stopwords) {
808 # don't include stopword unless it occurs in the text
809 $word = lc($word);
810 next unless ($totalfreq{$word});
811 next if ($symbol{$word});
812
813 $vocab[$nextsymbol] = $word;
814 $symbol{$word} = $nextsymbol;
815 $nextsymbol++;
816 }
817 $last_stopword = $nextsymbol - 1;
818 $first_contentword = $nextsymbol;
819
820 # Thesaurus terms
821 if ($thesaurus) {
822 $first_thesaurusword = $nextsymbol;
823
824 foreach my $word (sort keys %thesaurus) {
825
826 $word = lc($word);
827 next if ($symbol{$word});
828 $bestform{$word} = $thesaurus{$word};
829
830 $vocab[$nextsymbol] = $word;
831 $symbol{$word} = $nextsymbol;
832 $nextsymbol++;
833
834 }
835 $last_thesaurusword = $nextsymbol - 1;
836 }
837
838 # Other content words
839 $first_extractword = $nextsymbol;
840
841 foreach my $word (sort (keys %bestform)) {
842
843 next if ($symbol{$word});
844
845 $vocab[$nextsymbol] = $word;
846 $symbol{$word} = $nextsymbol;
847 $nextsymbol++;
848 }
849 $last_extractword = $nextsymbol - 1;
850 $last_contentword = $nextsymbol - 1;
851
852 # Output the words
853 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
854 open(VOC, ">$phinddir/clauses.vocab");
855
856 for (my $i = 1; $i < $nextsymbol; $i++) {
857 $w = $vocab[$i];
858
859 print VOC "$bestform{$w}\n";
860 $totalfreq{$w} = 0 unless ($totalfreq{$w});
861 }
862 close VOC;
863
864
865 # Create statistics file
866 # Output statistics about the vocabulary
867 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
868 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
869
870 open(STAT, ">$phinddir/clauses.stats")
871 || die "Cannot open $phinddir/clauses.stats: $!";
872
873 print STAT "first_delimiter $first_delimiter\n";
874 print STAT "last_delimiter $last_delimiter\n";
875 print STAT "first_stopword $first_stopword\n";
876 print STAT "last_stopword $last_stopword\n";
877 if ($thesaurus) {
878 print STAT "first_thesaurusword $first_thesaurusword\n";
879 print STAT "last_thesaurusword $last_thesaurusword\n";
880 }
881 print STAT "first_extractword $first_extractword\n";
882 print STAT "last_extractword $last_extractword\n";
883 print STAT "first_contentword $first_contentword\n";
884 print STAT "last_contentword $last_contentword\n";
885 print STAT "first_symbol $first_delimiter\n";
886 print STAT "last_symbol $last_contentword\n";
887 print STAT "first_word $first_stopword\n";
888 print STAT "last_word $last_contentword\n";
889 close STAT;
890
891 undef @vocab;
892
893
894 # Create numbers file
895 # Save text as symbol numbers
896 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
897
898 open(TXT, "<$phinddir/clauses");
899 open(NUM, ">$phinddir/clauses.numbers");
900
901## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
902## print NUM "$symbol{lc($colstart)}\n"; # jrm21
903 $phrasedelimiter = $symbol{$senlimit};
904 print NUM "$symbol{$colstart}\n";
905
906 # set up the special symbols that delimit documents and sentences
907 while(<TXT>) {
908
909 # split sentence into a list of tokens
910 $line = $_;
911 next unless ($line =~ /./);
912 @words = split(/\s+/, $line);
913
914 # output one token at a time
915 foreach $word (@words) {
916# don't lower-case special delimiters - jrm21
917 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
918 $word = lc($word);
919 }
920 print NUM "$symbol{$word}\n";
921 }
922
923 # output phrase delimiter
924 print NUM "$phrasedelimiter\n";
925 }
926
927 close TXT;
928# print NUM "$symbol{lc($colend)}\n";# jrm21
929 print NUM "$symbol{$colend}\n";
930 close NUM;
931
932 # Save thesaurus data in one convenient file
933 if ($thesaurus) {
934
935 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
936
937
938 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
939 if ($verbosity > 1);
940
941 # Read the thesaurus terms
942 my ($num, $text, %thes_symbols);
943
944 open(TH, "<$thesaurus_terms");
945 while(<TH>) {
946 chomp;
947 @words = split(/\s+/, $_);
948 $num = shift @words;
949 $text = "";
950
951 # translate words into symbol numbers
952 foreach $word (@words) {
953 $word = lc($word);
954 if ($symbol{$word}) {
955 $text .= "s$symbol{$word} ";
956 } elsif ($verbosity) {
957 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
958 }
959 }
960 $text =~ s/ $//;
961 $thes_symbols{$num} = $text;
962 }
963 close TH;
964
965 # Read the thesaurus links and write the corresponding data
966 open(TH, "<$thesaurus_links");
967 open(THOUT, ">$thesaurusfile");
968
969 while(<TH>) {
970 chomp;
971 ($num, $text) = split(/:/, $_);
972
973 if (defined($thes_symbols{$num})) {
974 print THOUT "$num:$thes_symbols{$num}:$text\n";
975 } else {
976 print THOUT "$num:untranslated:$text\n";
977 }
978 }
979 close TH;
980 close THOUT;
981 }
982
983
984
985
986}
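# For illustration, clauses.stats records the symbol-number ranges computed
# above as simple "name value" lines; with hypothetical values (and no
# thesaurus) it might read:
#
#   first_delimiter 1
#   last_delimiter 4
#   first_stopword 5
#   last_stopword 187
#   first_extractword 188
#   last_extractword 5325
#   first_contentword 188
#   last_contentword 5325
#   first_symbol 1
#   last_symbol 5325
#   first_word 5
#   last_word 5325
#
# When a thesaurus is used, first_thesaurusword and last_thesaurusword lines
# are written between the stopword and extractword entries.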
987
988
989# renumber_phrases
990#
991# Prepare the phrases file to be input to mgpp. The biggest problem is
992# reconciling the phrase identifiers used by the suffix program (which
993# we'll call suffix-id numbers) with the numbers used in the thesaurus
994# (thesaurus-id) to create a common set of phind id numbers (phind-id).
995# Phind-id numbers must be sorted by frequency of occurrence.
996#
997# Start creating a set of phind-id numbers from the sorted suffix-id
998# numbers and (if required) the thesaurus-id numbers. Then add any other
999# phrases occurring in the thesaurus.
1000#
1001# The last thing we have to do is restore the vocabulary information to the
1002# phrase file so that the phrases are stored as words, not as symbol
1003# numbers.
1004
1005# The original phrases file looks something like this:
1006# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1007# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1008# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1009# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1010
1011
1012sub renumber_phrases {
1013 my ($self) = @_;
1014
1015 renumber_suffix_data($self);
1016 renumber_thesaurus_data($self);
1017 restore_vocabulary_data($self);
1018
1019}
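# For illustration only: a minimal sketch (not called anywhere in this
# classifier) of how one line of the raw phrases file, in the format shown
# in the comment above, could be unpacked into named fields. The helper name
# and hash keys are hypothetical.
sub example_parse_raw_phrase_line {
    my ($line) = @_;
    chomp $line;
    my ($suffixid, $symbols, $tf, $countexp, $expansions, $countdocs, $documents)
        = split(/:/, $line);
    return { 'suffixid'   => $suffixid,                    # e.g. "159396-1"
             'symbols'    => $symbols,                     # e.g. "s5175"
             'frequency'  => $tf,                          # total occurrences
             'expansions' => [ split(/,/, $expansions) ],  # suffix-ids of expansions
             'documents'  => [ split(/;/, $documents) ] }; # e.g. "d2240,2"
}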
1020
1021
1022
1023# renumber_suffix_data
1024#
1025# Translate phrases file to phrases.2 using phind keys instead
1026# of suffix keys and sorting the expansion data.
1027
1028sub renumber_suffix_data {
1029 my ($self) = @_;
1030
1031 my $verbosity = $self->{'verbosity'};
1032 my $out = $self->{'outhandle'};
1033 print $out "Translate phrases: suffix-ids become phind-ids\n"
1034 if ($verbosity);
1035
1036 my $phinddir = $self->{'phinddir'};
1037 my $infile = &util::filename_cat($phinddir, 'phrases');
1038 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1039
1040 # Read the phrase file. Calculate initial set of phind-id
1041 # numbers and store (suffixid -> frequency) relation.
1042
1043 my %suffixtophind;
1044 my @totalfrequency;
1045 my (@fields, $suffixid);
1046 my $nextphind = 1;
1047
1048 open(IN, "<$infile");
1049 while(<IN>) {
1050
1051 chomp;
1052 @fields = split(/:/, $_);
1053
1054 # get next suffixid and phindid
1055 $suffixid = shift @fields;
1056 $suffixtophind{$suffixid} = $nextphind;
1057
1058 # store total frequency
1059 shift @fields;
1060 $totalfrequency[$nextphind] = shift @fields;
1061
1062 $nextphind++;
1063 }
1064 close IN;
1065
1066
1067 # Translate phrases file to phrases.2. Use phind keys (not suffix
1068 # keys), sort expansion and document occurrence data in order of
1069 # descending frequency.
1070 open(IN, "<$infile");
1071 open(OUT, ">$outfile");
1072
1073 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1074 my (@documents, @newexp, $k, $n);
1075 my $linenumber = 0;
1076
1077 while(<IN>) {
1078
1079 # read the line
1080 chomp;
1081 @fields = split(/:/, $_);
1082
1083 # get a phrase number for this line
1084 $suffixid = shift @fields;
1085 die unless (defined($suffixtophind{$suffixid}));
1086 $phindid = $suffixtophind{$suffixid};
1087
1088 # get the symbols in the phrase
1089 $text = shift @fields;
1090
1091 # output status information
1092 $linenumber++;
1093 if ($verbosity > 2) {
1094 if ($linenumber % 1000 == 0) {
1095 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1096 }
1097 print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
1098 }
1099
1100 # get the phrase frequency
1101 $tf = shift @fields;
1102
1103 # get the number of expansions
1104 $countexp = shift @fields;
1105
1106 # get the expansions, convert them into phind-id numbers, and sort them
1107 $expansions = shift @fields;
1108 @newexp = ();
1109 foreach $k (split(/,/, $expansions)) {
1110 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1111 $n = $suffixtophind{$k};
1112 push @newexp, $n;
1113 }
1114 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1115
1116 # get the number of documents
1117 $countdocs = shift @fields;
1118
1119 # get the documents and sort them
1120 $documents = shift @fields;
1121 $documents =~ s/d//g;
1122 @documents = split(/;/, $documents);
1123 @documents = sort by_doc_frequency @documents;
1124
1125 # output the phrase data
1126 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1127 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1128
1129 }
1130
1131 close IN;
1132 close OUT;
1133}
1134
1135
1136# renumber_thesaurus_data
1137#
1138# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1139
1140sub renumber_thesaurus_data {
1141 my ($self) = @_;
1142
1143 my $out = $self->{'outhandle'};
1144 my $verbosity = $self->{'verbosity'};
1145 my $thesaurus = $self->{'thesaurus'};
1146
1147 my $phinddir = $self->{'phinddir'};
1148 my $infile = &util::filename_cat($phinddir, "phrases.2");
1149 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1150
1151
1152 # If no thesaurus is defined, simply move the phrases file.
1153 if (!$thesaurus) {
1154 print $out "Translate phrases.2: no thesaurus data\n"
1155 if ($verbosity);
1156 &util::mv($infile, $outfile);
1157 return;
1158 }
1159
1160 print $out "Translate phrases.2: add thesaurus data\n"
1161 if ($verbosity);
1162
1163 # 1.
1164 # Read thesaurus file and store (symbols->thesaurusid) mapping
1165 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1166 my %symbolstothesid;
1167 my (@fields, $thesid, $symbols);
1168
1169 open(TH, "<$thesaurusfile");
1170
1171 while (<TH>) {
1172
1173 chomp;
1174 @fields = split(/:/, $_);
1175
1176 # get id and text
1177 $thesid = shift @fields;
1178 $symbols = shift @fields;
1179 $symbolstothesid{$symbols} = $thesid;
1180 }
1181 close TH;
1182
1183 # 2.
1184 # Read phrases file to find thesaurus entries that already
1185 # have a phindid. Store their phind-ids for later translation,
1186 # and store their frequency for later sorting.
1187 my %thesaurustophindid;
1188 my %phindidtofrequency;
1189 my ($phindid, $freq);
1190
1191 open(IN, "<$infile");
1192
1193 while(<IN>) {
1194
1195 chomp;
1196 @fields = split(/:/, $_);
1197
1198 # phindid and symbols for this line
1199 $phindid = shift @fields;
1200 $symbols = shift @fields;
1201 $freq = shift @fields;
1202
1203 # do we have a thesaurus id corresponding to this phrase?
1204 if (defined($symbolstothesid{$symbols})) {
1205 $thesid = $symbolstothesid{$symbols};
1206 $thesaurustophindid{$thesid} = $phindid;
1207 $phindidtofrequency{$phindid} = $freq;
1208 }
1209 }
1210 close IN;
1211
1212 undef %symbolstothesid;
1213
1214 # 3.
1215 # Create phind-id numbers for remaining thesaurus entries,
1216 # and note that their frequency is 0 for later sorting.
1217 my $nextphindid = $phindid + 1;
1218
1219 open(TH, "<$thesaurusfile");
1220 while(<TH>) {
1221
1222 chomp;
1223 @fields = split(/:/, $_);
1224
1225 # read thesaurus-id and ensure it has a corresponding phind-id
1226 $thesid = shift @fields;
1227 if (!defined($thesaurustophindid{$thesid})) {
1228 $thesaurustophindid{$thesid} = $nextphindid;
1229 $phindidtofrequency{$nextphindid} = 0;
1230 $nextphindid++;
1231 }
1232 }
1233 close TH;
1234
1235 # 4.
1236 # Translate thesaurus file, replacing thesaurus-id numbers with
1237 # phind-id numbers.
1238 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1239 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1240 my (@links, $linkid, %linkidtotype, $newrelation);
1241
1242 open(TH, "<$thesaurusfile");
1243 open(TO, ">$newthesaurusfile");
1244 while(<TH>) {
1245
1246 chomp;
1247 @fields = split(/:/, $_);
1248
1249 # phindid and symbols for this line
1250 ($thesid, $symbols, $relations) = @fields;
1251
1252 die unless ($thesid && $symbols);
1253 die unless $thesaurustophindid{$thesid};
1254 $phindid = $thesaurustophindid{$thesid};
1255
1256 # convert each part of the relation string to use phind-id numbers
1257 # at the same time, we want to sort the list by frequency.
1258 undef %linkidtotype;
1259
1260 foreach $linktext (split(/;/, $relations)) {
1261 @linkdata = split(/,/, $linktext);
1262
1263 # remember the linktype (e.g. BT, NT)
1264 $linktype = shift @linkdata;
1265
1266 # store the type of each link
1267 foreach $thesid (@linkdata) {
1268 die unless (defined($thesaurustophindid{$thesid}));
1269 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1270 }
1271 }
1272
1273 # sort the list of links, first by frequency, then by type.
1274 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1275 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1276 $linkcounter = (scalar @links);
1277
1278 # create a string describing the link information
1279 $linktype = $linkidtotype{$links[0]};
1280 $newrelation = $linktype;
1281 foreach $linkid (@links) {
1282 if ($linkidtotype{$linkid} ne $linktype) {
1283 $linktype = $linkidtotype{$linkid};
1284 $newrelation .= ";" . $linktype;
1285 }
1286 $newrelation .= "," . $linkid;
1287 }
1288 $newrelation .= ";";
1289
1290
1291 # output the new line
1292 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1293 }
1294 close TH;
1295 close TO;
1296
1297 undef %thesaurustophindid;
1298 undef %linkidtotype;
1299 undef %phindidtofrequency;
1300
1301 # 5.
1302 # Read thesaurus data (in phind-id format) into memory
1303 my %thesaurusdata;
1304
1305 open(TH, "<$newthesaurusfile");
1306 while(<TH>) {
1307 chomp;
1308 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1309 die unless ($phindid && $symbols);
1310 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1311 }
1312 close TH;
1313
1314 # 6.
1315 # Add thesaurus data to phrases file
1316 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1317 my (@documents, @newexp, $k, $n);
1318 my $linenumber = 0;
1319
1320 open(IN, "<$infile");
1321 open(OUT, ">$outfile");
1322
1323 # Update existing phrases
1324 while(<IN>) {
1325
1326 chomp;
1327 @fields = split(/:/, $_);
1328
1329 # get data for this line
1330 $phindid = shift @fields;
1331
1332 # output the phrase data, with thesaurus information
1333 print OUT "$phindid:", join(":", @fields);
1334
1335 # add thesaurus data
1336 if (defined($thesaurusdata{$phindid})) {
1337 @fields = split(/:/, $thesaurusdata{$phindid});
1338 shift @fields;
1339 $linkcounter = shift @fields;
1340 $relations = shift @fields;
1341
1342 print OUT ":$linkcounter:$relations";
1343 $thesaurusdata{$phindid} = "";
1344 }
1345 print OUT "\n";
1346 }
1347 close IN;
1348
1349 # Add phrases that aren't already in the file
1350 foreach $phindid (sort numerically keys %thesaurusdata) {
1351 next unless ($thesaurusdata{$phindid});
1352
1353 @fields = split(/:/, $thesaurusdata{$phindid});
1354 $symbols = shift @fields;
1355 $linkcounter = shift @fields;
1356 $relations = shift @fields;
1357
1358 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1359 }
1360 close OUT;
1361
1362}
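# For illustration, each line of the "$thesaurus.phindid" file written above
# pairs a phind-id with its symbol string, a link count, and the re-sorted
# relation string (all values here are hypothetical):
#
#   1203:s14 s871:3:BT,1047;NT,2210,88;:
#
# i.e. phrase 1203 has three links: a broader term (phind-id 1047) and two
# narrower terms (2210 and 88).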
1363
1364# restore_vocabulary_data
1365#
1366# Read phrases.3 and restore vocabulary information. Then write
1367# this data to the MGPP input files (pword.txt and pdata.txt) and
1368# (if requested) to the saved phrases file.
1369
1370sub restore_vocabulary_data {
1371 my ($self) = @_;
1372
1373 my $out = $self->{'outhandle'};
1374 my $verbosity = $self->{'verbosity'};
1375 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1376
1377 my $phinddir = $self->{'phinddir'};
1378 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1379 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1380 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1381 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1382
1383 my $savephrases = $self->{'savephrases'};
1384
1385 # 1.
1386 # Read the vocabulary file
1387 open(V, "<$vocabfile")
1388 || die "Cannot open $vocabfile: $!";
1389 my @symbol;
1390 my $i = 1;
1391 while(<V>) {
1392 chomp;
1393 $symbol[$i++] = $_;
1394 }
1395 close V;
1396
1397 # 2.
1398 # Translate phrases.3 to MGPP input files
1399 my ($key, $text, $word, $isThesaurus);
1400 my @fields;
1401 my $linenumber = 0;
1402
1403 open(IN, "<$infile");
1404 open(DATA, ">$datafile");
1405 open(WORD, ">$wordfile");
1406
1407 # Save the phrases in a separate text file
1408 if ($savephrases) {
1409 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1410 open(SAVE, ">$savephrases");
1411 }
1412
1413 while(<IN>) {
1414
1415 # read the line
1416 chomp;
1417 $line = $_;
1418 @fields = split(/:/, $line);
1419
1420 # get a phrase number for this line
1421 $key = shift @fields;
1422
1423 # restore the text of the phrase
1424 $text = shift @fields;
1425 $text =~ s/s(\d+)/$symbol[$1]/g;
1426 if ($text =~ / /) {
1427 $word = "";
1428 } elsif ($text ne 'untranslated') {
1429 $word = $text;
1430 }
1431
1432 # output the phrase data
1433 print DATA "<Document>";
1434 print DATA "$key:$text:", join(":", @fields), ":\n";
1435
1436 # output the word index search data
1437 print WORD "<Document>$word\n";
1438
1439 # output the phrases to a text file
1440 if ($savephrases) {
1441 if ((scalar @fields) == 7) {
1442 $isThesaurus = 1;
1443 } else {
1444 $isThesaurus = 0;
1445 }
1446 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1447 }
1448 }
1449 close IN;
1450 close WORD;
1451 close DATA;
1452 close SAVE if ($savephrases);
1453
1454}
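# For illustration, once the vocabulary has been restored the two MGPP input
# files written above hold one <Document> record per phrase (values below
# are hypothetical). A single-word phrase gives
#
#   pdata.txt:  <Document>8:forest:4:3:3:10,17,31,:d4;d9,2;d23;:
#   pword.txt:  <Document>forest
#
# while a multi-word phrase gets an empty pword.txt entry, since $word is
# only set when the restored text contains no spaces.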
1455
1456
1457
1458# sort routines used to renumber phrases
1459
1460sub numerically { $a <=> $b }
1461
1462sub by_doc_frequency {
1463 my $fa = 1;
1464 if ($a =~ /,/) {
1465 $fa = $a;
1466 $fa =~ s/\d+,//;
1467 }
1468 my $fb = 1;
1469 if ($b =~ /,/) {
1470 $fb = $b;
1471 $fb =~ s/\d+,//;
1472 }
1473
1474 return ($fb <=> $fa);
1475}
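# For illustration, by_doc_frequency() orders the per-document entries sorted
# in renumber_suffix_data() by their frequency suffix: an entry like "2240,2"
# (document 2240, frequency 2) sorts before "2253", which has no explicit
# frequency and so defaults to 1.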
1476
14771;