source: trunk/gsdl/perllib/classify/Phind.pm@ 3472

Last change on this file since 3472 was 3472, checked in by kjdon, 22 years ago

renamed phind.pm to Phind.pm in keeping with the names of the other classifiers

  • Property svn:keywords set to Author Date Id Revision
File size: 39.7 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl Phind" at the command line for a summary.
32
33package Phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
55
56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
66
67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
84sub print_usage {
85 print STDERR "
86 usage: classify Phind [options]
87
88 options:
89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
91
92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in the
96 navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program. A
112 value of 0 means that stopwords are ignored, and a value of 1
113 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
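# For illustration, a Phind classifier of the kind described above might be
# requested from a collection's collect.cfg with a line such as the following
# (the field and option values are only examples, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title -button Phrase -language en
#
# Running "classinfo.pl Phind" prints the option summary shown above.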
123
124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
131
132
133# Create a new Phind browser based on collect.cfg
134
135sub new {
136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
138
139 my $out = $self->{'outhandle'};
140
141 # Ensure the Phind generate scripts are in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
147 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
148 exit(1);
149 }
150
151 # Parse classifier arguments
152 my $builddir = "";
153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
157 q^language/.*/en^, \$language,
158 q^builddir/.*/^, \$builddir,
159 q^savephrases/.*/^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
168 }
169
170 # classifier information
171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173 # limit languages
174 $self->{'language_exp'} = $language;
175
176 # collection directories
177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180 }
181 $self->{'builddir'} = $builddir;
182
183 $self->{'total'} = 0;
184
185 return bless $self, $class;
186}
187
188
189# Initialise the Phind classifier
190
191sub init {
192 my $self = shift (@_);
193
194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create Phind directory
199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
204 }
205 &util::mk_dir("$phinddir");
206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
208
209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
211 # open filehandles for documents and text
212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
213 &util::rm($clausefile) if (-e $clausefile);
214
215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
220 &util::rm($docfile) if (-e $docfile);
221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
225
226}
227
228
229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
234
235sub classify {
236 my $self = shift (@_);
237 my $doc_obj = shift @_;
238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
245 print "process: $title\n" if ($verbosity > 2);
246
247 # Only consider the file if it is in the correct language
248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($self->{'verbosity'});
255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
272 my ($part, $level, $field, $section, $data, $dataref);
273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
279
280 # Extract the text from every section
281 # (In Phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
290 }
291
292 # Extract a metadata field from a document
293 # (If there is more than one element of the given type, get them all.)
294 elsif ($level eq "document") {
295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
296 foreach $data (@$dataref) {
297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
298 }
299 }
300
301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in Phind index ($part)\n";
316 }
317
318 }
319
320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
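# For illustration, each call to classify() above adds one tab-separated
# record to docs.txt of the form
#
#   <Document>\tOID\tTitle        e.g.  <Document>\tHASHxxxxxxxx\tFarming snails
#
# (the OID and title shown are hypothetical), and appends the document's text
# to the clauses file as a DOCUMENTLIMIT line followed by roughly one clause
# per line.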
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
329# already been constructed in the phind directory. This function will
330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
332# shows up in the navigation bar.
333
334sub get_classify_info {
335 my $self = shift (@_);
336
337 close $self->{'dochandle'};
338 close $self->{'txthandle'};
339 my $verbosity = $self->{'verbosity'};
340 my $out = $self->{'outhandle'};
341 my $phinddir = $self->{'phinddir'};
342
343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
348 if ($verbosity) {
349 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
350 print $out "*** in ", $self->{'phinddir'}, "\n";
351 }
352
353 # Construct Phind indexes
354 my $suffixmode = $self->{'suffixmode'};
355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
360 &extract_vocabulary($self);
361
362 # Use the suffix program to generate the phind/phrases file
363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
365
366
367 # check that we generated some files. It's not necessarily an error if
368 # we didn't (execute() would have quit on error), but we can't go on.
369 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
370 if (! -r $phrasesfile) {
371 print $out "\nNo phrases found for Phind classifier!\n";
372 return;
373 }
374
375 # Create the phrase file and put phrase numbers in phind/phrases
376 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
377 &renumber_phrases($self);
378
379 print $out "\nCreating phrase databases\n";
380 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
381 my $mg_stem = &util::filename_cat($phinddir, "pdata");
382
383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
386
387 # create the mg index of words
388 print $out "\nCreating word-level search indexes\n";
389 $mg_input = &util::filename_cat($phinddir, "pword.txt");
390 $mg_stem = &util::filename_cat($phinddir, "pword");
391
392 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
393 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
394 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
395 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
396 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
397 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
398
399 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
400 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
401 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
402
403 # create the mg document information database
404 print $out "\nCreating document information databases\n";
405 $mg_input = &util::filename_cat($phinddir, "docs.txt");
406 $mg_stem = &util::filename_cat($phinddir, "docs");
407
408 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
409 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
411
412 # Return the information about the classifier that we'll later want to
413 # use to create macros when the Phind classifier document is displayed.
414 my %classifyinfo = ('thistype'=>'Invisible',
415 'childtype'=>'Phind',
416 'Title'=>$self->{'buttonname'},
417 'parameters'=>"phindnumber=$self->{'phindnumber'}",
418 'contains'=>[]);
419
420 my $collection = $self->{'collection'};
421 my $url = "library?a=p&p=phind&c=$collection";
422 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
423
424 return \%classifyinfo;
425}
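# For illustration, after get_classify_info() has run, the phind working
# directory (e.g. building/phind1) can be expected to hold the intermediate
# files produced above -- clauses, clauses.vocab, clauses.stats,
# clauses.numbers, phrases, phrases.2, phrases.3, docs.txt, pdata.txt and
# pword.txt -- together with the mgpp index files built on the docs, pdata
# and pword stems (files whose suffixes appear in %wanted_index_files, such
# as pword.i or docs.td). Unless -untidy was given, the END block at the top
# of this file later deletes everything except those index files.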
426
427
428
429sub convert_gml_to_tokens {
430
431 my ($language_exp, $text) = @_;
432
433 # escape any magic words... - jrm21
434 foreach my $delim (@delimiters) {
435 my $replacement=lc($delim);
436 my $num= $text=~ s/$delim/$replacement/g;
437 if (!$num) {$num=0;}
438 }
439
440 if ($language_exp =~ /en/) {
441 return &convert_gml_to_tokens_EN($text);
442 }
443 if ($language_exp =~ /zh/) {
444 return &convert_gml_to_tokens_ZH($text);
445 }
446
447 $_ = $text;
448
449 # 1. remove GML tags
450
451 # Remove everything that is in a tag
452 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
453 s/\s*<br>\s*/ LINEBREAK /isgo;
454 s/<[^>]*>/ /sgo;
455
456 # Now we have the text, but it may contain HTML
457 # elements coded as &gt; etc. Remove these tags.
458 s/&amp;/&/sgo;
459 s/&lt;/</sgo;
460 s/&gt;/>/sgo;
461 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
462 s/\s*<br>\s*/ LINEBREAK /isgo;
463 s/<[^>]*>/ /sgo;
464
465 # replace <p> and <br> placeholders with clause break symbol (\n)
466 s/\s+/ /gso;
467 s/PARAGRAPHBREAK/\n/sgo;
468 s/LINEBREAK/\n/sgo;
469
470
471
472
473 # 2. Split the remaining text into space-delimited tokens
474
475 # Convert entities to their UTF8 equivalents
476 s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
477
478 # Split text at word boundaries
479 s/\b/ /go;
480
481 # 3. Convert the remaining text to "clause format"
482
483 # Insert newline if the end of a sentence is detected
484 # (delimiter is: "[\.\?\!]\s")
485 # s/\s*[\.\?\!]\s+/\n/go;
486
487 # remove unnecessary punctuation and replace with clause break symbol (\n)
488 s/[^\w ]/\n/go;
489
490 # remove extraneous whitespace
491 s/ +/ /sgo;
492 s/^\s+//mgo;
493 s/\s*$/\n/mgo;
494
495 # remove lines that contain one word or less
496 s/^\S*$//mgo;
497 s/^\s*$//mgo;
498 tr/\n//s;
499
500 return $_;
501}
502
503# A Chinese version of convert_gml_to_tokens.
504sub convert_gml_to_tokens_ZH {
505
506 $_ = shift @_;
507
508 # Replace all whitespace with a simple space
509 s/\s+/ /gs;
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Remove these tags.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace <p> and <br> placeholders with carriage returns
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536# print STDERR "text:$_\n";
537 return $_;
538}
539# A version of convert_gml_to_tokens that is fine-tuned to the English language.
540
541sub convert_gml_to_tokens_EN {
542 $_ = shift @_;
543
544 # FIRST, remove GML tags
545
546 # Replace all whitespace with a simple space
547 s/\s+/ /gs;
548
549 # Remove everything that is in a tag
550 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
551 s/\s*<br>\s*/ LINEBREAK /isg;
552 s/<[^>]*>/ /sg;
553
554 # Now we have the text, but it may contain HTML
555 # elements coded as &gt; etc. Remove these tags.
556 s/&lt;/</sg;
557 s/&gt;/>/sg;
558
559 s/\s+/ /sg;
560 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
561 s/\s*<br>\s*/ LINEBREAK /isg;
562 s/<[^>]*>/ /sg;
563
564 # remove &amp; and other miscellaneous markup tags
565 s/&amp;/&/sg;
566 s/&lt;/</sg;
567 s/&gt;/>/sg;
568 s/&amp;/&/sg;
569
570 # replace <p> and <br> placeholders with carriage returns
571 s/PARAGRAPHBREAK/\n/sg;
572 s/LINEBREAK/\n/sg;
573
574
575 # Exceptional punctuation
576 #
577 # We make special cases of some punctuation
578
579 # remove any apostrophe that indicates omitted letters
580 s/(\w+)\'(\w*\s)/ $1$2 /g;
581
583 # remove periods that appear in a person's initials
583 s/\s([A-Z])\./ $1 /g;
584
585 # replace hyphens in hyphenated words and names with a space
586 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
587
588 # Convert the remaining text to "clause format".
589 # This means removing all excess punctuation and garbage text,
590 # normalising valid punctuation to full stops and commas,
591 # then putting one clause on each line.
592
593 # Insert newline when the end of a sentence is detected
594 # (delimiter is: "[\.\?\!]\s")
595 s/\s*[\.\?\!]\s+/\n/g;
596
597 # split numbers after four digits
598 s/(\d\d\d\d)/$1 /g;
599
600 # split words after 32 characters
601
602 # squash repeated punctuation
603 tr/A-Za-z0-9 //cs;
604
605 # save email addresses
606 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
607
608 # normalise clause breaks (mostly punctuation symbols) to commas
609 s/[^A-Za-z0-9 \n]+/ , /g;
610
611 # Remove repeated commas, and replace with newline
612 s/\s*,[, ]+/\n/g;
613
614 # remove extra whitespace
615 s/ +/ /sg;
616 s/^\s+//mg;
617 s/\s*$/\n/mg;
618
619 # remove lines that contain one word or less
620 s/^\w*$//mg;
621 s/^\s*$//mg;
622 tr/\n//s;
623
624 return $_;
625
626}
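# For illustration, a fragment such as
#
#   <p>The quick brown fox jumps over the lazy dog.  It was over 1234 metres away!</p>
#
# is reduced by the substitutions above to roughly this clause output
# (markup and punctuation removed, one clause per line):
#
#   The quick brown fox jumps over the lazy dog
#   It was over 1234 metres away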
627
628
629
630# Execute a system command
631
632sub execute {
633 my ($command, $verbosity, $outhandle) = @_;
634 print $outhandle "Executing: $command\n" if ($verbosity > 2);
635 $! = 0;
636 my $status = system($command);
637 if ($status != 0) {
638 print STDERR "Phind - Error executing '$command': $!\n";
639 exit($status); # this causes the build to fail...
640 }
641}
642
643
644# Generate the vocabulary, symbol statistics, and numbers file from the
645# clauses file. This is legacy code, so it is a bit messy and probably won't
646# run under Windows.
647
648sub extract_vocabulary {
649 my ($self) = @_;
650
651 my $verbosity = $self->{'verbosity'};
652 my $out = $self->{'outhandle'};
653
654 my $collectiondir = $self->{'collectiondir'};
655 my $phinddir = $self->{'phinddir'};
656
657 my $language_exp = $self->{'language_exp'};
658
659 my ($w, $l, $line, $word);
660
661 my ($first_delimiter, $last_delimiter,
662 $first_stopword, $last_stopword,
663 $first_extractword, $last_extractword,
664 $first_contentword, $last_contentword,
665 $phrasedelimiter);
666
667 my $thesaurus = $self->{'thesaurus'};
668 my ($thesaurus_links, $thesaurus_terms,
669 %thesaurus, $first_thesaurusword, $last_thesaurusword);
670
671 my %symbol;
672 my (%freq);
673
674 print $out "Calculating vocabulary\n" if ($verbosity > 1);
675
676 # Read and store the stopwords
677 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
678 my $stopword_files = ();
679 my ($language, $language_dir, $file, $file_name);
680 my %stopwords;
681
682 # Examine each directory in the stopword directory
683 opendir(STOPDIR, $stopdir);
684 foreach $language (readdir STOPDIR) {
685
686 # Ignore entries that do not match the classifier's language
687 next unless ($language =~ /$language_exp/);
688 $language_dir = &util::filename_cat($stopdir, $language);
689 next unless (-d "$language_dir");
690
691 opendir(LANGDIR, $language_dir);
692 foreach $file (readdir LANGDIR) {
693
694 # Ignore entries that are not stopword files
695 next unless ($file =~ /sw$/);
696 $file_name = &util::filename_cat($language_dir, $file);
697 next unless (-f "$file_name");
698
699 # Read the stopwords
700 open(STOPFILE, "<$file_name");
701 while (<STOPFILE>) {
702 s/^\s+//;
703 s/\s.*//;
704 $word = $_;
705 $l = lc($word);
706 $stopwords{$l} = $word;
707 }
708 close STOPFILE;
709
710 }
711 closedir LANGDIR;
712 }
713 closedir STOPDIR;
714
715 # Read thesaurus information
716 if ($thesaurus) {
717
718 # link file exists
719 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
720 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
721
722 # ensure term file exists in the correct language
723 if ($language_exp =~ /^([a-z][a-z])/) {
724 $language = $1;
725 } else {
726 $language = 'en';
727 }
728 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
729 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
730
731
732 # Read the thesaurus terms
733 open(TH, "<$thesaurus_terms");
734 while(<TH>) {
735 s/^\d+ //;
736 s/\(.*\)//;
737 foreach $w (split(/\s+/, $_)) {
738 $thesaurus{lc($w)} = $w;
739 }
740 }
741 close TH;
742 }
743
744 # Read words in the text and count occurrences
745 open(TXT, "<$phinddir/clauses");
746
747 my @words;
748 while(<TXT>) {
749 $line = $_;
750 next unless ($line =~ /./);
751
752 @words = split(/\s+/, $line);
753 foreach $w (@words) {
754 $l = lc($w);
755 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
756 $freq{$w}++;
757 }
758 $freq{$senlimit}++;
759 }
760
761 close TXT;
762
763 # Calculate the "best" form of each word
764 my (%bestform, %totalfreq, %bestfreq);
765
766 foreach $w (sort (keys %freq)) {
767 $l = lc($w);
768
769 # totalfreq is the number of times a term appears in any form
770 $totalfreq{$l} += $freq{$w};
771
772 if (defined $stopwords{$l}) {
773 $bestform{$l} = $stopwords{$l};
774
775 } elsif (defined $thesaurus{$l}) {
776 $bestform{$l} = $thesaurus{$l};
777
778 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
779 $bestfreq{$l} = $freq{$w};
780 $bestform{$l} = $w;
781 }
782 }
783 undef %freq;
784 undef %bestfreq;
785
786
787 # Assign symbol numbers to tokens
788 my $nextsymbol = 1;
789 my (@vocab);
790
791 # Delimiters
792 $first_delimiter = 1;
793
794 foreach $word (@delimiters) {
795
796# $word = lc($word); # jrm21
797 $word = uc($word);
798 $bestform{$word} = $word;
799 $vocab[$nextsymbol] = $word;
800 $symbol{$word} = $nextsymbol;
801 $nextsymbol++;
802 }
803 $last_delimiter = $nextsymbol - 1;
804 # Stopwords
805 $first_stopword = $nextsymbol;
806
807 foreach my $word (sort keys %stopwords) {
808 # don't include stopword unless it occurs in the text
809 $word = lc($word);
810 next unless ($totalfreq{$word});
811 next if ($symbol{$word});
812
813 $vocab[$nextsymbol] = $word;
814 $symbol{$word} = $nextsymbol;
815 $nextsymbol++;
816 }
817 $last_stopword = $nextsymbol - 1;
818 $first_contentword = $nextsymbol;
819
820 # Thesaurus terms
821 if ($thesaurus) {
822 $first_thesaurusword = $nextsymbol;
823
824 foreach my $word (sort keys %thesaurus) {
825
826 $word = lc($word);
827 next if ($symbol{$word});
828 $bestform{$word} = $thesaurus{$word};
829
830 $vocab[$nextsymbol] = $word;
831 $symbol{$word} = $nextsymbol;
832 $nextsymbol++;
833
834 }
835 $last_thesaurusword = $nextsymbol - 1;
836 }
837
838 # Other content words
839 $first_extractword = $nextsymbol;
840
841 foreach my $word (sort (keys %bestform)) {
842
843 next if ($symbol{$word});
844
845 $vocab[$nextsymbol] = $word;
846 $symbol{$word} = $nextsymbol;
847 $nextsymbol++;
848 }
849 $last_extractword = $nextsymbol - 1;
850 $last_contentword = $nextsymbol - 1;
851
852 # Output the words
853 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
854 open(VOC, ">$phinddir/clauses.vocab");
855
856 for (my $i = 1; $i < $nextsymbol; $i++) {
857 $w = $vocab[$i];
858
859 print VOC "$bestform{$w}\n";
860 $totalfreq{$w} = 0 unless ($totalfreq{$w});
861 }
862 close VOC;
863
864
865 # Create statistics file
866 # Output statistics about the vocabulary
867 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
868 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
869
870 open(STAT, ">$phinddir/clauses.stats")
871 || die "Cannot open $phinddir/clauses.stats: $!";
872
873 print STAT "first_delimiter $first_delimiter\n";
874 print STAT "last_delimiter $last_delimiter\n";
875 print STAT "first_stopword $first_stopword\n";
876 print STAT "last_stopword $last_stopword\n";
877 if ($thesaurus) {
878 print STAT "first_thesaurusword $first_thesaurusword\n";
879 print STAT "last_thesaurusword $last_thesaurusword\n";
880 }
881 print STAT "first_extractword $first_extractword\n";
882 print STAT "last_extractword $last_extractword\n";
883 print STAT "first_contentword $first_contentword\n";
884 print STAT "last_contentword $last_contentword\n";
885 print STAT "first_symbol $first_delimiter\n";
886 print STAT "last_symbol $last_contentword\n";
887 print STAT "first_word $first_stopword\n";
888 print STAT "last_word $last_contentword\n";
889 close STAT;
890
891 undef @vocab;
892
893
894 # Create numbers file
895 # Save text as symbol numbers
896 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
897
898 open(TXT, "<$phinddir/clauses");
899 open(NUM, ">$phinddir/clauses.numbers");
900
901## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
902## print NUM "$symbol{lc($colstart)}\n"; # jrm21
903 $phrasedelimiter = $symbol{$senlimit};
904 print NUM "$symbol{$colstart}\n";
905
906 # set up the special symbols that delimit documents and sentences
907 while(<TXT>) {
908
909 # split sentence into a list of tokens
910 $line = $_;
911 next unless ($line =~ /./);
912 @words = split(/\s+/, $line);
913
914 # output one token at a time
915 foreach $word (@words) {
916# don't lower-case special delimiters - jrm21
917 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
918 $word = lc($word);
919 }
920 print NUM "$symbol{$word}\n";
921 }
922
923 # output phrase delimiter
924 print NUM "$phrasedelimiter\n";
925 }
926
927 close TXT;
928# print NUM "$symbol{lc($colend)}\n";# jrm21
929 print NUM "$symbol{$colend}\n";
930 close NUM;
931
932 # Save thesaurus data in one convenient file
933 if ($thesaurus) {
934
935 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
936
937
938 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
939 if ($verbosity > 1);
940
941 # Read the thesaurus terms
942 my ($num, $text, %thes_symbols);
943
944 open(TH, "<$thesaurus_terms");
945 while(<TH>) {
946 chomp;
947 @words = split(/\s+/, $_);
948 $num = shift @words;
949 $text = "";
950
951 # translate words into symbol numbers
952 foreach $word (@words) {
953 $word = lc($word);
954 if ($symbol{$word}) {
955 $text .= "s$symbol{$word} ";
956 } elsif ($verbosity) {
957 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
958 }
959 }
960 $text =~ s/ $//;
961 $thes_symbols{$num} = $text;
962 }
963 close TH;
964
965 # Read the thesaurus links and write the corresponding data
966 open(TH, "<$thesaurus_links");
967 open(THOUT, ">$thesaurusfile");
968
969 while(<TH>) {
970 chomp;
971 ($num, $text) = split(/:/, $_);
972
973 if (defined($thes_symbols{$num})) {
974 print THOUT "$num:$thes_symbols{$num}:$text\n";
975 } else {
976 print THOUT "$num:untranslated:$text\n";
977 }
978 }
979 close TH;
980 close THOUT;
981 }
982
983
984
985
986}
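# For illustration, clauses.stats records the symbol-number ranges computed
# above as simple "name value" lines; with hypothetical values (and no
# thesaurus) it might read:
#
#   first_delimiter 1
#   last_delimiter 4
#   first_stopword 5
#   last_stopword 187
#   first_extractword 188
#   last_extractword 5325
#   first_contentword 188
#   last_contentword 5325
#   first_symbol 1
#   last_symbol 5325
#   first_word 5
#   last_word 5325
#
# When a thesaurus is used, first_thesaurusword and last_thesaurusword lines
# are written between the stopword and extractword entries.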
987
988
989# renumber_phrases
990#
991# Prepare the phrases file to be input to mgpp. The biggest problem is
992# reconciling the phrase identifiers used by the suffix program (which
993# we'll call suffix-id numbers) with the numbers used in the thesaurus
994# (thesaurus-id) to create a common set of phind id numbers (phind-id).
995# Phind-id numbers must be sorted by frequency of occurrence.
996#
997# Start creating a set of phind-id numbers from the sorted suffix-id
998# numbers and (if required) the thesaurus-id numbers. Then add any other
999# phrases occurring in the thesaurus.
1000#
1001# The last thing we have to do is restore the vocabulary information to the
1002# phrase file so that the phrases are stored as words, not as symbol
1003# numbers.
1004
1005# The original phrases file looks something like this:
1006# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1007# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1008# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1009# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1010
1011
1012sub renumber_phrases {
1013 my ($self) = @_;
1014
1015 renumber_suffix_data($self);
1016 renumber_thesaurus_data($self);
1017 restore_vocabulary_data($self);
1018
1019}
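# For illustration only: a minimal sketch (not called anywhere in this
# classifier) of how one line of the raw phrases file, in the format shown
# in the comment above, could be unpacked into named fields. The helper name
# and hash keys are hypothetical.
sub example_parse_raw_phrase_line {
    my ($line) = @_;
    chomp $line;
    my ($suffixid, $symbols, $tf, $countexp, $expansions, $countdocs, $documents)
        = split(/:/, $line);
    return { 'suffixid'   => $suffixid,                    # e.g. "159396-1"
             'symbols'    => $symbols,                     # e.g. "s5175"
             'frequency'  => $tf,                          # total occurrences
             'expansions' => [ split(/,/, $expansions) ],  # suffix-ids of expansions
             'documents'  => [ split(/;/, $documents) ] }; # e.g. "d2240,2"
}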
1020
1021
1022
1023# renumber_suffix_data
1024#
1025# Translate phrases file to phrases.2 using phind keys instead
1026# of suffix keys and sorting the expansion data.
1027
1028sub renumber_suffix_data {
1029 my ($self) = @_;
1030
1031 my $verbosity = $self->{'verbosity'};
1032 my $out = $self->{'outhandle'};
1033 print $out "Translate phrases: suffix-ids become phind-ids\n"
1034 if ($verbosity);
1035
1036 my $phinddir = $self->{'phinddir'};
1037 my $infile = &util::filename_cat($phinddir, 'phrases');
1038 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1039
1040 # Read the phrase file. Calculate initial set of phind-id
1041 # numbers and store (suffixid -> frequency) relation.
1042
1043 my %suffixtophind;
1044 my @totalfrequency;
1045 my (@fields, $suffixid);
1046 my $nextphind = 1;
1047
1048 open(IN, "<$infile");
1049 while(<IN>) {
1050
1051 chomp;
1052 @fields = split(/:/, $_);
1053
1054 # get next suffixid and phindid
1055 $suffixid = shift @fields;
1056 $suffixtophind{$suffixid} = $nextphind;
1057
1058 # store total frequency
1059 shift @fields;
1060 $totalfrequency[$nextphind] = shift @fields;
1061
1062 $nextphind++;
1063 }
1064 close IN;
1065
1066
1067 # Translate phrases file to phrases.2. Use phind keys (not suffix
1068 # keys), sort expansion and document occurrence data in order of
1069 # descending frequency.
1070 open(IN, "<$infile");
1071 open(OUT, ">$outfile");
1072
1073 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1074 my (@documents, @newexp, $k, $n);
1075 my $linenumber = 0;
1076
1077 while(<IN>) {
1078
1079 # read the line
1080 chomp;
1081 @fields = split(/:/, $_);
1082
1083 # get a phrase number for this line
1084 $suffixid = shift @fields;
1085 die unless (defined($suffixtophind{$suffixid}));
1086 $phindid = $suffixtophind{$suffixid};
1087
1088 # get the symbols in the phrase
1089 $text = shift @fields;
1090
1091 # output status information
1092 $linenumber++;
1093 if ($verbosity > 2) {
1094 if ($linenumber % 1000 == 0) {
1095 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1096 }
1097 print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
1098 }
1099
1100 # get the phrase frequency
1101 $tf = shift @fields;
1102
1103 # get the number of expansions
1104 $countexp = shift @fields;
1105
1106 # get the expansions, convert them into phind-id numbers, and sort them
1107 $expansions = shift @fields;
1108 @newexp = ();
1109 foreach $k (split(/,/, $expansions)) {
1110 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1111 $n = $suffixtophind{$k};
1112 push @newexp, $n;
1113 }
1114 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1115
1116 # get the number of documents
1117 $countdocs = shift @fields;
1118
1119 # get the documents and sort them
1120 $documents = shift @fields;
1121 $documents =~ s/d//g;
1122 @documents = split(/;/, $documents);
1123 @documents = sort by_doc_frequency @documents;
1124
1125 # output the phrase data
1126 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1127 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1128
1129 }
1130
1131 close IN;
1132 close OUT;
1133}
1134
1135
1136# renumber_thesaurus_data
1137#
1138# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1139
1140sub renumber_thesaurus_data {
1141 my ($self) = @_;
1142
1143 my $out = $self->{'outhandle'};
1144 my $verbosity = $self->{'verbosity'};
1145 my $thesaurus = $self->{'thesaurus'};
1146
1147 my $phinddir = $self->{'phinddir'};
1148 my $infile = &util::filename_cat($phinddir, "phrases.2");
1149 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1150
1151
1152 # If no thesaurus is defined, simply move the phrases file.
1153 if (!$thesaurus) {
1154 print $out "Translate phrases.2: no thesaurus data\n"
1155 if ($verbosity);
1156 &util::mv($infile, $outfile);
1157 return;
1158 }
1159
1160 print $out "Translate phrases.2: add thesaurus data\n"
1161 if ($verbosity);
1162
1163 # 1.
1164 # Read thesaurus file and store (symbols->thesaurusid) mapping
1165 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1166 my %symbolstothesid;
1167 my (@fields, $thesid, $symbols);
1168
1169 open(TH, "<$thesaurusfile");
1170
1171 while (<TH>) {
1172
1173 chomp;
1174 @fields = split(/:/, $_);
1175
1176 # get id and text
1177 $thesid = shift @fields;
1178 $symbols = shift @fields;
1179 $symbolstothesid{$symbols} = $thesid;
1180 }
1181 close TH;
1182
1183 # 2.
1184 # Read phrases file to find thesaurus entries that already
1185 # have a phindid. Store their phind-ids for later translation,
1186 # and store their frequency for later sorting.
1187 my %thesaurustophindid;
1188 my %phindidtofrequency;
1189 my ($phindid, $freq);
1190
1191 open(IN, "<$infile");
1192
1193 while(<IN>) {
1194
1195 chomp;
1196 @fields = split(/:/, $_);
1197
1198 # phindid and symbols for this line
1199 $phindid = shift @fields;
1200 $symbols = shift @fields;
1201 $freq = shift @fields;
1202
1203 # do we have a thesaurus id corresponding to this phrase?
1204 if (defined($symbolstothesid{$symbols})) {
1205 $thesid = $symbolstothesid{$symbols};
1206 $thesaurustophindid{$thesid} = $phindid;
1207 $phindidtofrequency{$phindid} = $freq;
1208 }
1209 }
1210 close IN;
1211
1212 undef %symbolstothesid;
1213
1214 # 3.
1215 # Create phind-id numbers for remaining thesaurus entries,
1216 # and note that their frequency is 0 for later sorting.
1217 my $nextphindid = $phindid + 1;
1218
1219 open(TH, "<$thesaurusfile");
1220 while(<TH>) {
1221
1222 chomp;
1223 @fields = split(/:/, $_);
1224
1225 # read thesaurus-id and ensure it has a corresponding phind-id
1226 $thesid = shift @fields;
1227 if (!defined($thesaurustophindid{$thesid})) {
1228 $thesaurustophindid{$thesid} = $nextphindid;
1229 $phindidtofrequency{$nextphindid} = 0;
1230 $nextphindid++;
1231 }
1232 }
1233 close TH;
1234
1235 # 4.
1236 # Translate thesaurus file, replacing thesaurus-id numbers with
1237 # phind-id numbers.
1238 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1239 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1240 my (@links, $linkid, %linkidtotype, $newrelation);
1241
1242 open(TH, "<$thesaurusfile");
1243 open(TO, ">$newthesaurusfile");
1244 while(<TH>) {
1245
1246 chomp;
1247 @fields = split(/:/, $_);
1248
1249 # phindid and symbols for this line
1250 ($thesid, $symbols, $relations) = @fields;
1251
1252 die unless ($thesid && $symbols);
1253 die unless $thesaurustophindid{$thesid};
1254 $phindid = $thesaurustophindid{$thesid};
1255
1256 # convert each part of the relation string to use phind-id numbers
1257 # at the same time, we want to sort the list by frequency.
1258 undef %linkidtotype;
1259
1260 foreach $linktext (split(/;/, $relations)) {
1261 @linkdata = split(/,/, $linktext);
1262
1263 # remember the linktype (e.g. BT, NT)
1264 $linktype = shift @linkdata;
1265
1266 # store the type of each link
1267 foreach $thesid (@linkdata) {
1268 die unless (defined($thesaurustophindid{$thesid}));
1269 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1270 }
1271 }
1272
1273 # sort the list of links, first by frequency, then by type.
1274 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1275 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1276 $linkcounter = (scalar @links);
1277
1278 # create a string describing the link information
1279 $linktype = $linkidtotype{$links[0]};
1280 $newrelation = $linktype;
1281 foreach $linkid (@links) {
1282 if ($linkidtotype{$linkid} ne $linktype) {
1283 $linktype = $linkidtotype{$linkid};
1284 $newrelation .= ";" . $linktype;
1285 }
1286 $newrelation .= "," . $linkid;
1287 }
1288 $newrelation .= ";";
1289
1290
1291 # output the new line
1292 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1293 }
1294 close TH;
1295 close TO;
1296
1297 undef %thesaurustophindid;
1298 undef %linkidtotype;
1299 undef %phindidtofrequency;
1300
1301 # 5.
1302 # Read thesaurus data (in phind-id format) into memory
1303 my %thesaurusdata;
1304
1305 open(TH, "<$newthesaurusfile");
1306 while(<TH>) {
1307 chomp;
1308 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1309 die unless ($phindid && $symbols);
1310 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1311 }
1312 close TH;
1313
1314 # 6.
1315 # Add thesaurus data to phrases file
1316 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1317 my (@documents, @newexp, $k, $n);
1318 my $linenumber = 0;
1319
1320 open(IN, "<$infile");
1321 open(OUT, ">$outfile");
1322
1323 # Update existing phrases
1324 while(<IN>) {
1325
1326 chomp;
1327 @fields = split(/:/, $_);
1328
1329 # get data for this line
1330 $phindid = shift @fields;
1331
1332 # output the phrase data, with thesaurus information
1333 print OUT "$phindid:", join(":", @fields);
1334
1335 # add thesaurus data
1336 if (defined($thesaurusdata{$phindid})) {
1337 @fields = split(/:/, $thesaurusdata{$phindid});
1338 shift @fields;
1339 $linkcounter = shift @fields;
1340 $relations = shift @fields;
1341
1342 print OUT ":$linkcounter:$relations";
1343 $thesaurusdata{$phindid} = "";
1344 }
1345 print OUT "\n";
1346 }
1347 close IN;
1348
1349 # Add phrases that aren't already in the file
1350 foreach $phindid (sort numerically keys %thesaurusdata) {
1351 next unless ($thesaurusdata{$phindid});
1352
1353 @fields = split(/:/, $thesaurusdata{$phindid});
1354 $symbols = shift @fields;
1355 $linkcounter = shift @fields;
1356 $relations = shift @fields;
1357
1358 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1359 }
1360 close OUT;
1361
1362}
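# For illustration, each line of the "$thesaurus.phindid" file written above
# pairs a phind-id with its symbol string, a link count, and the re-sorted
# relation string (all values here are hypothetical):
#
#   1203:s14 s871:3:BT,1047;NT,2210,88;:
#
# i.e. phrase 1203 has three links: a broader term (phind-id 1047) and two
# narrower terms (2210 and 88).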
1363
1364# restore_vocabulary_data
1365#
1366# Read phrases.3 and restore vocabulary information. Then write
1367# this data to the MGPP input files (pword.txt and pdata.txt) and
1368# (if requested) to the saved phrases file.
1369
1370sub restore_vocabulary_data {
1371 my ($self) = @_;
1372
1373 my $out = $self->{'outhandle'};
1374 my $verbosity = $self->{'verbosity'};
1375 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1376
1377 my $phinddir = $self->{'phinddir'};
1378 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1379 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1380 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1381 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1382
1383 my $savephrases = $self->{'savephrases'};
1384
1385 # 1.
1386 # Read the vocabulary file
1387 open(V, "<$vocabfile")
1388 || die "Cannot open $vocabfile: $!";
1389 my @symbol;
1390 my $i = 1;
1391 while(<V>) {
1392 chomp;
1393 $symbol[$i++] = $_;
1394 }
1395 close V;
1396
1397 # 2.
1398 # Translate phrases.3 to MGPP input files
1399 my ($key, $text, $word, $isThesaurus);
1400 my @fields;
1401 my $linenumber = 0;
1402
1403 open(IN, "<$infile");
1404 open(DATA, ">$datafile");
1405 open(WORD, ">$wordfile");
1406
1407 # Save the phrases in a separate text file
1408 if ($savephrases) {
1409 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1410 open(SAVE, ">$savephrases");
1411 }
1412
1413 while(<IN>) {
1414
1415 # read the line
1416 chomp;
1417 $line = $_;
1418 @fields = split(/:/, $line);
1419
1420 # get a phrase number for this line
1421 $key = shift @fields;
1422
1423 # restore the text of the phrase
1424 $text = shift @fields;
1425 $text =~ s/s(\d+)/$symbol[$1]/g;
1426 if ($text =~ / /) {
1427 $word = "";
1428 } elsif ($text ne 'untranslated') {
1429 $word = $text;
1430 }
1431
1432 # output the phrase data
1433 print DATA "<Document>";
1434 print DATA "$key:$text:", join(":", @fields), ":\n";
1435
1436 # output the word index search data
1437 print WORD "<Document>$word\n";
1438
1439 # output the phrases to a text file
1440 if ($savephrases) {
1441 if ((scalar @fields) == 7) {
1442 $isThesaurus = 1;
1443 } else {
1444 $isThesaurus = 0;
1445 }
1446 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1447 }
1448 }
1449 close IN;
1450 close WORD;
1451 close DATA;
1452 close SAVE if ($savephrases);
1453
1454}
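# For illustration, once the vocabulary has been restored the two MGPP input
# files written above hold one <Document> record per phrase (values below
# are hypothetical). A single-word phrase gives
#
#   pdata.txt:  <Document>8:forest:4:3:3:10,17,31,:d4;d9,2;d23;:
#   pword.txt:  <Document>forest
#
# while a multi-word phrase gets an empty pword.txt entry, since $word is
# only set when the restored text contains no spaces.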
1455
1456
1457
1458# sort routines used to renumber phrases
1459
1460sub numerically { $a <=> $b }
1461
1462sub by_doc_frequency {
1463 my $fa = 1;
1464 if ($a =~ /,/) {
1465 $fa = $a;
1466 $fa =~ s/\d+,//;
1467 }
1468 my $fb = 1;
1469 if ($b =~ /,/) {
1470 $fb = $b;
1471 $fb =~ s/\d+,//;
1472 }
1473
1474 return ($fb <=> $fa);
1475}
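# For illustration, by_doc_frequency() orders the per-document entries sorted
# in renumber_suffix_data() by their frequency suffix: an entry like "2240,2"
# (document 2240, frequency 2) sorts before "2253", which has no explicit
# frequency and so defaults to 1.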
1476
14771;