source: trunk/gsdl/perllib/classify/Phind.pm@ 8362

Last change on this file since 8362 was 8362, checked in by kjdon, 20 years ago

Added a new option to the Phind classifier: min_occurs. This is the minimum phrase frequency needed for a phrase to be included in the hierarchy.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.5 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41my @removedirs = ();
42
43my %wanted_index_files = ('td'=>1,
44 't'=>1,
45 'ti'=>1,
46 'tl'=>1,
47 'tsd'=>1,
48 'idb'=>1,
49 'ib1'=>1,
50 'ib2'=>1,
51 'ib3'=>1,
52 'i'=>1,
53 'il'=>1,
54 'w'=>1,
55 'wa'=>1);
56
57sub BEGIN {
58 @ISA = ('BasClas');
59}
60
61sub END {
62
63 # Tidy up stray files - we do this here as there's some weird problem
64 # preventing us from doing it in the get_classify_info() function (on
65 # windows at least) where the close() appears to fail on txthandle and
66 # dochandle, thus preventing us from deleting those files
67
68 foreach my $dir (@removedirs) {
69 if (-d $dir && opendir (DIR, $dir)) {
70 my @files = readdir DIR;
71 closedir DIR;
72
73 foreach $file (@files) {
74 next if $file =~ /^\.\.?$/;
75 my ($suffix) = $file =~ /\.([^\.]+)$/;
76 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
77 # delete it!
78 &util::rm (&util::filename_cat ($dir, $file));
79 }
80 }
81 }
82 }
83}
84
85my $arguments =
86 [ { 'name' => "text",
87 'desc' => "{Phind.text}",
88 'type' => "string",
89 'deft' => "section:Title,section:text",
90 'reqd' => "no" },
91 { 'name' => "title",
92 'desc' => "{Phind.title}",
93 'type' => "metadata",
94 'deft' => "Title",
95 'reqd' => "no" },
96 { 'name' => "buttonname",
97 'desc' => "{BasClas.buttonname}",
98 'type' => "string",
99 'deft' => "Phrase",
100 'reqd' => "no" },
101 { 'name' => "language",
102 'desc' => "{Phind.language}",
103 'type' => "string",
104 'deft' => "en",
105 'reqd' => "no" },
106 { 'name' => "savephrases",
107 'desc' => "{Phind.savephrases}",
108 'type' => "string",
109 'deft' => "",
110 'reqd' => "no" },
111 { 'name' => "suffixmode",
112 'desc' => "{Phind.suffixmode}",
113 'type' => "int",
114 'deft' => "1",
115 'range' => "0,1",
116 'reqd' => "no" },
117 { 'name' => "min_occurs",
118 'desc' => "{Phind.min_occurs}",
119 'type' => "int",
120 'deft' => "2",
121 'range' => "1,",
122 'reqd' => "no" },
123 { 'name' => "thesaurus",
124 'desc' => "{Phind.thesaurus}",
125 'type' => "string",
126 'deft' => "",
127 'reqd' => "no" },
128 { 'name' => "untidy",
129 'desc' => "{Phind.untidy}",
130 'type' => "flag",
131 'reqd' => "no" } ];
132
133my $options = { 'name' => "Phind",
134 'desc' => "{Phind.desc}",
135 'abstract' => "no",
136 'inherits' => "yes",
137 'args' => $arguments };
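# For reference, these options are normally set from the collection's
# collect.cfg file. A minimal sketch of such an entry (illustrative only --
# the option values shown here are examples, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title
#            -buttonname Phrase -min_occurs 2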
138
139
140# Phrase delimiter symbols - these should be abstracted out someplace
141
142my $colstart = "COLLECTIONSTART";
143my $colend = "COLLECTIONEND";
144my $doclimit = "DOCUMENTLIMIT";
145my $senlimit = "SENTENCELIMIT";
146my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
147
148
149# Create a new Phind browser based on collect.cfg
150
151sub new {
152 my $class = shift (@_);
153 my $self = new BasClas($class, @_);
154
155 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
156 my $option_list = $self->{'option_list'};
157 push( @{$option_list}, $options );
158
159 if ($self->{'info_only'}) {
160 # created from classinfo.pl - don't need to parse the arguments
161 return bless $self, $class;
162 }
163
164 my $out = $self->{'outhandle'};
165
166 # Ensure the Phind generate scripts are in place
167 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
168 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
169 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
170
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
176 # Parse classifier arguments
177 my $builddir = "";
178 if (!parsargv::parse(\@_,
179 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
180 q^title/.*/Title^, \$self->{'titlefield'},
181 q^buttonname/.*/Phrase^, \$self->{'buttonname'},
182 q^language/.*/en^, \$language,
183 q^savephrases/.*/^, \$self->{'savephrases'},
184 q^suffixmode/\d/1^, \$self->{'suffixmode'},
185 q^min_occurs/\d/2^, \$self->{'min_occurs'},
186 q^thesaurus/.*/^, \$self->{'thesaurus'},
187 q^untidy^, \$self->{'untidy'},
188 "allow_extra_options")) {
189
190 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
191 $self->print_txt_usage(""); # Use default resource bundle
192 die "\n";
193 }
194
195 # classifier information
196 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
197
198 # limit languages
199 $self->{'language_exp'} = $language;
200
201 # collection directories
202 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
203
204 if (! defined $self->{'builddir'}) {
205 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
206 }
207
208 $self->{'total'} = 0;
209
210 return bless $self, $class;
211}
212
213
214# Initialise the Phind classifier
215
216sub init {
217 my $self = shift (@_);
218
219 # ensure we have a build directory
220 my $builddir = $self->{'builddir'};
221 die unless (-e "$builddir");
222
223 # create Phind directory
224 my $phnumber = 1;
225 my $phinddir = &util::filename_cat($builddir, "phind1");
226 while (-e "$phinddir") {
227 $phnumber++;
228 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
229 }
230 &util::mk_dir("$phinddir");
231 $self->{'phinddir'} = $phinddir;
232 $self->{'phindnumber'} = $phnumber;
233
234 push(@removedirs, $phinddir) unless $self->{'untidy'};
235
236 # open filehandles for documents and text
237 my $clausefile = &util::filename_cat("$phinddir", "clauses");
238 &util::rm($clausefile) if (-e $clausefile);
239
240 my $txthandle = 'TEXT' . $phnumber;
241 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
242 $self->{'txthandle'} = $txthandle;
243
244 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
245 &util::rm($docfile) if (-e $docfile);
246
247 my $dochandle = 'DOC' . $phnumber;
248 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
249 $self->{'dochandle'} = $dochandle;
250
251}
252
253
254# Classify each document.
255#
256# Each document is passed here in turn. The classifier extracts the
257# text of each and stores it in the clauses file. Document details are
258# stored in the docs.txt file.
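# For illustration (the OID and title below are made up), each call appends
# one line of the form
#   <Document>\tHASH0158f56086efffe592636058\tSome Document Title
# to docs.txt, and appends the document's extracted text to the clauses file,
# preceded by a DOCUMENTLIMIT line marking the start of that document.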
259
260sub classify {
261 my $self = shift (@_);
262 my $doc_obj = shift @_;
263
264 my $verbosity = $self->{'verbosity'};
265 my $top_section = $doc_obj->get_top_section();
266
267 my $titlefield = $self->{'titlefield'};
268
269 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
270 if (!defined($title)) {
271 $title = "";
272 print STDERR "Phind: document has no title\n";
273 }
274 print "process: $title\n" if ($verbosity > 2);
275
276 # Only consider the file if it is in the correct language
277 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
278 my $phrlanguage = $self->{'language_exp'};
279 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
280
281 # record this file
282 $self->{'total'} ++;
283 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
284
285
286 # Store document details
287 my $OID = $doc_obj->get_OID();
288 $OID = "NULL" unless defined $OID;
289 my $dochandle = $self->{'dochandle'};
290 print $dochandle "<Document>\t$OID\t$title\n";
291
292 # Store the text occurring in this object
293
294 # output the document delimiter
295 my $txthandle = $self->{'txthandle'};
296 print $txthandle "$doclimit\n";
297
298 # iterate over the required indexes and store their text
299 my $indexes = $self->{'indexes'};
300 my $text = "";
301 my ($part, $level, $field, $section, $data, $dataref);
302
303 foreach $part (split(/,/, $indexes)) {
304
305 # Each field has a level and a data element (e.g. document:Title)
306 ($level, $field) = split(/:/, $part);
307 die unless ($level && $field);
308
309 # Extract the text from every section
310 # (In phind, document:text and section:text are equivalent)
311 if ($field eq "text") {
312 $data = "";
313 $section = $doc_obj->get_top_section();
314 while (defined($section)) {
315 $data .= $doc_obj->get_text($section) . "\n";
316 $section = $doc_obj->get_next_section($section);
317 }
318 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
319 }
320
321 # Extract a metadata field from a document
322 # (If there is more than one element of the given type, get them all.)
323 elsif ($level eq "document") {
324 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
325 foreach $data (@$dataref) {
326 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
327 }
328 }
329
330 # Extract metadata from every section in a document
331 elsif ($level eq "section") {
332 $data = "";
333 $section = $doc_obj->get_top_section();
334 while (defined($section)) {
335 $dataref = $doc_obj->get_metadata($section, $field);
336 $data .= join("\n", @$dataref) . "\n";
337 $section = $doc_obj->get_next_section($section);
338 }
339 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
340 }
341
342 # Any other level specification is an error
343 else {
344 die "Unknown level ($level) in Phind index ($part)\n";
345 }
346
347 }
348
349 # output the text
350 $text =~ tr/\n//s;
351 print $txthandle "$text";
352}
353
354
355# Construct the classifier from the information already gathered
356#
357# When get_classify_info is called, the clauses and docs.txt files have
358# already been constructed in the Phind directory. This function will
359# translate them into compressed, indexed MGPP files that can be read by
360# the phindcgi script. It will also register our classifier so that it
361# shows up in the navigation bar.
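# In outline, the steps below are: extract_vocabulary (vocabulary, statistics
# and numbers files), the external "suffix" program (phrase extraction),
# renumber_phrases (sorting and renumbering for mgpp), and finally the mgpp_*
# tools, which build the phrase, word-level and document databases.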
362
363sub get_classify_info {
364 my $self = shift (@_);
365 my ($gli) = @_;
366
367 close $self->{'dochandle'};
368 close $self->{'txthandle'};
369 my $verbosity = $self->{'verbosity'};
370 my $out = $self->{'outhandle'};
371 my $phinddir = $self->{'phinddir'};
372
373 my $osextra = "";
374 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
375 $osextra = " -d /";
376 }
377
378 print STDERR "</Stage>\n" if $gli;
379
380 if ($verbosity) {
381 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
382 print $out "*** in ", $self->{'phinddir'}, "\n";
383 }
384
385 print STDERR "<Stage name='Phind'>\n" if $gli;
386
387 # Construct phind indexes
388 my $suffixmode = $self->{'suffixmode'};
389 my $min_occurs = $self->{'min_occurs'};
390 my ($command, $status);
391
392 # Generate the vocabulary, symbol statistics, and numbers file
393 # from the clauses file
394 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
395 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
396 &extract_vocabulary($self);
397
398 # Use the suffix program to generate the phind/phrases file
399 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
400 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
401 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
402
403 # check that we generated some files. It's not necessarily an error if
404 # we didn't (execute() would have quit on error), but we can't go on.
405 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
406 if (! -r $phrasesfile) {
407 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
408 print $out "\nNo phrases found for Phind classifier!\n";
409 return;
410 }
411
412 # Create the phrase file and put phrase numbers in phind/phrases
413 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
414 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
415 &renumber_phrases($self);
416
417 print $out "\nCreating phrase databases\n";
418 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
419 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
420 my $mg_stem = &util::filename_cat($phinddir, "pdata");
421
422 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
423 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
424 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
425
426 # create the mg index of words
427 print $out "\nCreating word-level search indexes\n";
428 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
429 $mg_input = &util::filename_cat($phinddir, "pword.txt");
430 $mg_stem = &util::filename_cat($phinddir, "pword");
431
432 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
433 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
434 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
435 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
436 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
437 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
438
439 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
440 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
441 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
442
443 # create the mg document information database
444 print $out "\nCreating document information databases\n";
445 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
446 $mg_input = &util::filename_cat($phinddir, "docs.txt");
447 $mg_stem = &util::filename_cat($phinddir, "docs");
448
449 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
450 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
451 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
452
453 # Return the information about the classifier that we'll later want to
454 # use to create macros when the Phind classifier document is displayed.
455 my %classifyinfo = ('thistype'=>'Invisible',
456 'childtype'=>'Phind',
457 'Title'=>$self->{'buttonname'},
458 'parameters'=>"phindnumber=$self->{'phindnumber'}",
459 'contains'=>[]);
460
461 my $collection = $self->{'collection'};
462 my $url = "library?a=p&p=phind&c=$collection";
463 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
464
465 return \%classifyinfo;
466}
467
468
469
470sub convert_gml_to_tokens {
471
472 my ($language_exp, $text) = @_;
473
474 # escape any magic words... - jrm21
475 foreach my $delim (@delimiters) {
476 my $replacement=lc($delim);
477 my $num= $text=~ s/$delim/$replacement/g;
478 if (!$num) {$num=0;}
479 }
480
481 if ($language_exp =~ /en/) {
482 return &convert_gml_to_tokens_EN($text);
483 }
484
485 if ($language_exp =~ /zh/) {
486 return &convert_gml_to_tokens_ZH($text);
487 }
488
489 $_ = $text;
490
491 # 1. remove GML tags
492
493 # Remove everything that is in a tag
494 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
495 s/\s*<br>\s*/ LINEBREAK /isgo;
496 s/<[^>]*>/ /sgo;
497
498 # Now we have the text, but it may contain HTML
499 # elements coded as &gt; etc. Remove these tags.
500 s/&amp;/&/sgo;
501 s/&lt;/</sgo;
502 s/&gt;/>/sgo;
503 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
504 s/\s*<br>\s*/ LINEBREAK /isgo;
505 s/<[^>]*>/ /sgo;
506
507 # replace <p> and <br> placeholders with the clause break symbol (\n)
508 s/\s+/ /gso;
509 s/PARAGRAPHBREAK/\n/sgo;
510 s/LINEBREAK/\n/sgo;
511
512
513
514
515 # 2. Split the remaining text into space-delimited tokens
516
517 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
518 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
519
520 # Split text at word boundaries
521 s/\b/ /go;
522
523 # 3. Convert the remaining text to "clause format"
524
525 # Insert newline if the end of a sentence is detected
526 # (delimiter is: "[\.\?\!]\s")
527 # s/\s*[\.\?\!]\s+/\n/go;
528
529 # remove unnecessary punctuation and replace with clause break symbol (\n)
530 s/[^\w ]/\n/go;
531
532 # remove extraneous whitespace
533 s/ +/ /sgo;
534 s/^\s+//mgo;
535 s/\s*$/\n/mgo;
536
537 # remove lines that contain one word or less
538 s/^\S*$//mgo;
539 s/^\s*$//mgo;
540 tr/\n//s;
541
542 return $_;
543}
544
545# a chinese version
546sub convert_gml_to_tokens_ZH {
547
548 $_ = shift @_;
549
550 # Replace all whitespace with a simple space
551 s/\s+/ /gs;
552 # Remove everything that is in a tag
553 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
554 s/\s*<br>\s*/ LINEBREAK /isg;
555 s/<[^>]*>/ /sg;
556
557 # Now we have the text, but it may contain HTML
558 # elements coded as &gt; etc. Remove these tags.
559 s/&lt;/</sg;
560 s/&gt;/>/sg;
561
562 s/\s+/ /sg;
563 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
564 s/\s*<br>\s*/ LINEBREAK /isg;
565 s/<[^>]*>/ /sg;
566
567 # remove &amp; and other miscellaneous markup tags
568 s/&amp;/&/sg;
569 s/&lt;/</sg;
570 s/&gt;/>/sg;
571 s/&amp;/&/sg;
572
573 # replace <p> and <br> placeholders with carriage returns
574 s/PARAGRAPHBREAK/\n/sg;
575 s/LINEBREAK/\n/sg;
576
577
578# print STDERR "text:$_\n";
579 return $_;
580}
581
582# A version of convert_gml_to_tokens that is fine-tuned to the English language.
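# As a rough illustration of the transformation below (the input fragment is
# made up):
#   <p>The quick-brown fox jumped. It ran away!</p>
# is reduced to the clause lines
#   The quick brown fox jumped
#   It ran away
# i.e. markup and punctuation are dropped, hyphens become spaces, and each
# sentence or clause ends up on its own line.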
583
584sub convert_gml_to_tokens_EN {
585 $_ = shift @_;
586
587 # FIRST, remove GML tags
588
589 # Replace all whitespace with a simple space
590 s/\s+/ /gs;
591
592 # Remove everything that is in a tag
593 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
594 s/\s*<br>\s*/ LINEBREAK /isg;
595 s/<[^>]*>/ /sg;
596
597 # Now we have the text, but it may contain HTML
598 # elements coded as &gt; etc. Remove these tags.
599 s/&lt;/</sg;
600 s/&gt;/>/sg;
601
602 s/\s+/ /sg;
603 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
604 s/\s*<br>\s*/ LINEBREAK /isg;
605 s/<[^>]*>/ /sg;
606
607 # remove &amp; and other miscellaneous markup tags
608 s/&amp;/&/sg;
609 s/&lt;/</sg;
610 s/&gt;/>/sg;
611 s/&amp;/&/sg;
612
613 # replace <p> and <br> placeholders with carriage returns
614 s/PARAGRAPHBREAK/\n/sg;
615 s/LINEBREAK/\n/sg;
616
617
618 # Exceptional punctuation
619 #
620 # We make special cases of some punctuation
621
622 # remove any apostrophe that indicates omitted letters
623 s/(\w+)\'(\w*\s)/ $1$2 /g;
624
625 # remove the period that appears in a person's initials
626 s/\s([A-Z])\./ $1 /g;
627
628 # replace hyphens in hyphenated words and names with a space
629 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
630
631 # Convert the remaining text to "clause format".
632 # This means removing all excess punctuation and garbage text,
633 # normalising valid punctuation to fullstops and commas,
634 # then putting one clause on each line.
635
636 # Insert newline when the end of a sentence is detected
637 # (delimiter is: "[\.\?\!]\s")
638 s/\s*[\.\?\!]\s+/\n/g;
639
640 # split numbers after four digits
641 s/(\d\d\d\d)/$1 /g;
642
643 # split words after 32 characters
644
645 # squash repeated punctuation
646 tr/A-Za-z0-9 //cs;
647
648 # save email addresses
649 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
650
651 # normalise clause breaks (mostly punctuation symbols) to commas
652 s/[^A-Za-z0-9 \n]+/ , /g;
653
654 # Remove repeated commas, and replace with newline
655 s/\s*,[, ]+/\n/g;
656
657 # remove extra whitespace
658 s/ +/ /sg;
659 s/^\s+//mg;
660 s/\s*$/\n/mg;
661
662 # remove lines that contain one word or less
663 s/^\w*$//mg;
664 s/^\s*$//mg;
665 tr/\n//s;
666
667 return $_;
668
669}
670
671
672
673# Execute a system command
674
675sub execute {
676 my ($command, $verbosity, $outhandle) = @_;
677 print $outhandle "Executing: $command\n" if ($verbosity > 2);
678 $! = 0;
679 my $status = system($command);
680 if ($status != 0) {
681 print STDERR "Phind - Error executing '$command': $!\n";
682 exit($status); # this causes the build to fail...
683 }
684}
685
686
687# Generate the vocabulary, symbol statistics, and numbers file from the
688# clauses file. This is legacy code, so is a bit messy and probably won't
689# run under Windows.
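# A sketch of the files written here (derived from the code below):
#   clauses.vocab   - the "best" form of each symbol, one per line, in
#                     symbol-number order (delimiters, stopwords, thesaurus
#                     terms, then other content words)
#   clauses.stats   - the symbol-number ranges for each class of symbol
#   clauses.numbers - the clauses file re-expressed as symbol numbers, one
#                     per line, bracketed by the COLLECTIONSTART and
#                     COLLECTIONEND symbols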
690
691sub extract_vocabulary {
692 my ($self) = @_;
693
694 my $verbosity = $self->{'verbosity'};
695 my $out = $self->{'outhandle'};
696
697 my $collectiondir = $self->{'collectiondir'};
698 my $phinddir = $self->{'phinddir'};
699
700 my $language_exp = $self->{'language_exp'};
701
702 my ($w, $l, $line, $word);
703
704 my ($first_delimiter, $last_delimiter,
705 $first_stopword, $last_stopword,
706 $first_extractword, $last_extractword,
707 $first_contentword, $last_contentword,
708 $phrasedelimiter);
709
710 my $thesaurus = $self->{'thesaurus'};
711 my ($thesaurus_links, $thesaurus_terms,
712 %thesaurus, $first_thesaurusword, $last_thesaurusword);
713
714 my %symbol;
715 my (%freq);
716
717 print $out "Calculating vocabulary\n" if ($verbosity > 1);
718
719 # Read and store the stopwords
720 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
721 my $stopword_files = ();
722 my ($language, $language_dir, $file, $file_name);
723 my %stopwords;
724
725 # Examine each directory in the stopword directory
726 opendir(STOPDIR, $stopdir);
727 foreach $language (readdir STOPDIR) {
728
729 # Ignore entries that do not match the classifier's language
730 next unless ($language =~ /$language_exp/);
731 $language_dir = &util::filename_cat($stopdir, $language);
732 next unless (-d "$language_dir");
733
734 opendir(LANGDIR, $language_dir);
735 foreach $file (readdir LANGDIR) {
736
737 # Ignore entries that are not stopword files
738 next unless ($file =~ /sw$/);
739 $file_name = &util::filename_cat($language_dir, $file);
740 next unless (-f "$file_name");
741
742 # Read the stopwords
743 open(STOPFILE, "<$file_name");
744 while (<STOPFILE>) {
745 s/^\s+//;
746 s/\s.*//;
747 $word = $_;
748 $l = lc($word);
749 $stopwords{$l} = $word;
750 }
751 close STOPFILE;
752
753 }
754 closedir LANGDIR;
755 }
756 closedir STOPDIR;
757
758 # Read thesaurus information
759 if ($thesaurus) {
760
761 # link file exists
762 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
763 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
764
765 # ensure term file exists in the correct language
766 if ($language_exp =~ /^([a-z][a-z])/) {
767 $language = $1;
768 } else {
769 $language = 'en';
770 }
771 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
772 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
773
774
775 # Read the thesaurus terms
776 open(TH, "<$thesaurus_terms");
777 while(<TH>) {
778 s/^\d+ //;
779 s/\(.*\)//;
780 foreach $w (split(/\s+/, $_)) {
781 $thesaurus{lc($w)} = $w;
782 }
783 }
784 close TH;
785 }
786
787 # Read words in the text and count occurrences
788 open(TXT, "<$phinddir/clauses");
789
790 my @words;
791 while(<TXT>) {
792 $line = $_;
793 next unless ($line =~ /./);
794
795 @words = split(/\s+/, $line);
796 foreach $w (@words) {
797 $l = lc($w);
798 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
799 $freq{$w}++;
800 }
801 $freq{$senlimit}++;
802 }
803
804 close TXT;
805
806 # Calculate the "best" form of each word
807 my (%bestform, %totalfreq, %bestfreq);
808
809 foreach $w (sort (keys %freq)) {
810 $l = lc($w);
811
812 # totalfreq is the number of times a term appears in any form
813 $totalfreq{$l} += $freq{$w};
814
815 if (defined $stopwords{$l}) {
816 $bestform{$l} = $stopwords{$l};
817
818 } elsif (defined $thesaurus{$l}) {
819 $bestform{$l} = $thesaurus{$l};
820
821 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
822 $bestfreq{$l} = $freq{$w};
823 $bestform{$l} = $w;
824 }
825 }
826 undef %freq;
827 undef %bestfreq;
828
829
830 # Assign symbol numbers to tokens
831 my $nextsymbol = 1;
832 my (@vocab);
833
834 # Delimiters
835 $first_delimiter = 1;
836
837 foreach $word (@delimiters) {
838
839# $word = lc($word); # jrm21
840 $word = uc($word);
841 $bestform{$word} = $word;
842 $vocab[$nextsymbol] = $word;
843 $symbol{$word} = $nextsymbol;
844 $nextsymbol++;
845 }
846 $last_delimiter = $nextsymbol - 1;
847 # Stopwords
848 $first_stopword = $nextsymbol;
849
850 foreach my $word (sort keys %stopwords) {
851 # don't include stopword unless it occurs in the text
852 $word = lc($word);
853 next unless ($totalfreq{$word});
854 next if ($symbol{$word});
855
856 $vocab[$nextsymbol] = $word;
857 $symbol{$word} = $nextsymbol;
858 $nextsymbol++;
859 }
860 $last_stopword = $nextsymbol - 1;
861 $first_contentword = $nextsymbol;
862
863 # Thesaurus terms
864 if ($thesaurus) {
865 $first_thesaurusword = $nextsymbol;
866
867 foreach my $word (sort keys %thesaurus) {
868
869 $word = lc($word);
870 next if ($symbol{$word});
871 $bestform{$word} = $thesaurus{$word};
872
873 $vocab[$nextsymbol] = $word;
874 $symbol{$word} = $nextsymbol;
875 $nextsymbol++;
876
877 }
878 $last_thesaurusword = $nextsymbol - 1;
879 }
880
881 # Other content words
882 $first_extractword = $nextsymbol;
883
884 foreach my $word (sort (keys %bestform)) {
885
886 next if ($symbol{$word});
887
888 $vocab[$nextsymbol] = $word;
889 $symbol{$word} = $nextsymbol;
890 $nextsymbol++;
891 }
892 $last_extractword = $nextsymbol - 1;
893 $last_contentword = $nextsymbol - 1;
894
895 # Output the words
896 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
897 open(VOC, ">$phinddir/clauses.vocab");
898
899 for (my $i = 1; $i < $nextsymbol; $i++) {
900 $w = $vocab[$i];
901
902 print VOC "$bestform{$w}\n";
903 $totalfreq{$w} = 0 unless ($totalfreq{$w});
904 }
905 close VOC;
906
907
908 # Create statistics file
909 # Output statistics about the vocabulary
910 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
911 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
912
913 open(STAT, ">$phinddir/clauses.stats")
914 || die "Cannot open $phinddir/clauses.stats: $!";
915
916 print STAT "first_delimiter $first_delimiter\n";
917 print STAT "last_delimiter $last_delimiter\n";
918 print STAT "first_stopword $first_stopword\n";
919 print STAT "last_stopword $last_stopword\n";
920 if ($thesaurus) {
921 print STAT "first_thesaurusword $first_thesaurusword\n";
922 print STAT "last_thesaurusword $last_thesaurusword\n";
923 }
924 print STAT "first_extractword $first_extractword\n";
925 print STAT "last_extractword $last_extractword\n";
926 print STAT "first_contentword $first_contentword\n";
927 print STAT "last_contentword $last_contentword\n";
928 print STAT "first_symbol $first_delimiter\n";
929 print STAT "last_symbol $last_contentword\n";
930 print STAT "first_word $first_stopword\n";
931 print STAT "last_word $last_contentword\n";
932 close STAT;
933
934 undef @vocab;
935
936
937 # Create numbers file
938 # Save text as symbol numbers
939 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
940
941 open(TXT, "<$phinddir/clauses");
942 open(NUM, ">$phinddir/clauses.numbers");
943
944## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
945## print NUM "$symbol{lc($colstart)}\n"; # jrm21
946 $phrasedelimiter = $symbol{$senlimit};
947 print NUM "$symbol{$colstart}\n";
948
949 # set up the special symbols that delimit documents and sentences
950 while(<TXT>) {
951
952 # split sentence into a list of tokens
953 $line = $_;
954 next unless ($line =~ /./);
955 @words = split(/\s+/, $line);
956
957 # output one token at a time
958 foreach $word (@words) {
959# don't lower-case special delimiters - jrm21
960 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
961 $word = lc($word);
962 }
963 print NUM "$symbol{$word}\n";
964 }
965
966 # output phrase delimiter
967 print NUM "$phrasedelimiter\n";
968 }
969
970 close TXT;
971# print NUM "$symbol{lc($colend)}\n";# jrm21
972 print NUM "$symbol{$colend}\n";
973 close NUM;
974
975 # Save thesaurus data in one convenient file
976 if ($thesaurus) {
977
978 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
979
980
981 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
982 if ($verbosity > 1);
983
984 # Read the thesaurus terms
985 my ($num, $text, %thes_symbols);
986
987 open(TH, "<$thesaurus_terms");
988 while(<TH>) {
989 chomp;
990 @words = split(/\s+/, $_);
991 $num = shift @words;
992 $text = "";
993
994 # translate words into symbol numbers
995 foreach $word (@words) {
996 $word = lc($word);
997 if ($symbol{$word}) {
998 $text .= "s$symbol{$word} ";
999 } elsif ($verbosity) {
1000 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1001 }
1002 }
1003 $text =~ s/ $//;
1004 $thes_symbols{$num} = $text;
1005 }
1006 close TH;
1007
1008 # Read the thesaurus links and write the corresponding data
1009 open(TH, "<$thesaurus_links");
1010 open(THOUT, ">$thesaurusfile");
1011
1012 while(<TH>) {
1013 chomp;
1014 ($num, $text) = split(/:/, $_);
1015
1016 if (defined($thes_symbols{$num})) {
1017 print THOUT "$num:$thes_symbols{$num}:$text\n";
1018 } else {
1019 print THOUT "$num:untranslated:$text\n";
1020 }
1021 }
1022 close TH;
1023 close THOUT;
1024 }
1025
1026
1027
1028
1029}
1030
1031
1032# renumber_phrases
1033#
1034# Prepare the phrases file to be input to mgpp. The biggest problem is
1035# reconciling the phrase identifiers used by the suffix program (which
1036# we'll call suffix-id numbers) with the numbers used in the thesaurus
1037# (thesaurus-id) to create a common set of phind-id numbers.
1038# Phind-id numbers must be sorted by frequency of occurrence.
1039#
1040# Start creating a set of phind-id numbers from the sorted suffix-id
1041# numbers and (if required) the thesaurus-id numbers. Then add any other
1042# phrases occurring in the thesaurus.
1043#
1044# The last thing we have to do is restore the vocabulary information to the
1045# phrase file so that the phrases are stored as words, not as symbol
1046# numbers.
1047
1048# The original phrases file looks something like this:
1049# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1050# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1051# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1052# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1053
1054
1055sub renumber_phrases {
1056 my ($self) = @_;
1057
1058 renumber_suffix_data($self);
1059 renumber_thesaurus_data($self);
1060 restore_vocabulary_data($self);
1061
1062}
1063
1064
1065
1066# renumber_suffix_data
1067#
1068# Translate phrases file to phrases.2 using phind keys instead
1069# of suffix keys and sorting the expansion data.
1070
1071sub renumber_suffix_data {
1072 my ($self) = @_;
1073
1074 my $verbosity = $self->{'verbosity'};
1075 my $out = $self->{'outhandle'};
1076 print $out "Translate phrases: suffix-ids become phind-id's\n"
1077 if ($verbosity);
1078
1079 my $phinddir = $self->{'phinddir'};
1080 my $infile = &util::filename_cat($phinddir, 'phrases');
1081 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1082
1083 # Read the phrase file. Calculate initial set of phind-id
1084 # numbers and store (suffixid -> frequency) relation.
1085
1086 my %suffixtophind;
1087 my @totalfrequency;
1088 my (@fields, $suffixid);
1089 my $nextphind = 1;
1090
1091 open(IN, "<$infile");
1092 while(<IN>) {
1093
1094 chomp;
1095 @fields = split(/:/, $_);
1096
1097 # get next suffixid and phindid
1098 $suffixid = shift @fields;
1099 $suffixtophind{$suffixid} = $nextphind;
1100
1101 # store total frequency
1102 shift @fields;
1103 $totalfrequency[$nextphind] = shift @fields;
1104
1105 $nextphind++;
1106 }
1107 close IN;
1108
1109
1110 # Translate phrases file to phrases.2. Use phind keys (not suffix
1111 # keys), sort expansion and document occurrence data in order of
1112 # descending frequency.
1113 open(IN, "<$infile");
1114 open(OUT, ">$outfile");
1115
1116 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1117 my (@documents, @newexp, $k, $n);
1118 my $linenumber = 0;
1119
1120 while(<IN>) {
1121
1122 # read the line
1123 chomp;
1124 @fields = split(/:/, $_);
1125
1126 # get a phrase number for this line
1127 $suffixid = shift @fields;
1128 die unless (defined($suffixtophind{$suffixid}));
1129 $phindid = $suffixtophind{$suffixid};
1130
1131 # get the symbols in the phrase
1132 $text = shift @fields;
1133
1134 # output status information
1135 $linenumber++;
1136 if ($verbosity > 2) {
1137 if ($linenumber % 1000 == 0) {
1138 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1139 }
1140 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1141 }
1142
1143 # get the phrase frequency
1144 $tf = shift @fields;
1145
1146 # get the number of expansions
1147 $countexp = shift @fields;
1148
1149 # get the expansions, convert them into phind-id numbers, and sort them
1150 $expansions = shift @fields;
1151 @newexp = ();
1152 foreach $k (split(/,/, $expansions)) {
1153 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1154 $n = $suffixtophind{$k};
1155 push @newexp, $n;
1156 }
1157 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1158
1159 # get the number of documents
1160 $countdocs = shift @fields;
1161
1162 # get the documents and sort them
1163 $documents = shift @fields;
1164 $documents =~ s/d//g;
1165 @documents = split(/;/, $documents);
1166 @documents = sort by_doc_frequency @documents;
1167
1168 # output the phrase data
1169 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1170 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1171
1172 }
1173
1174 close IN;
1175 close OUT;
1176}
1177
1178
1179# renumber_thesaurus_data
1180#
1181# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1182
1183sub renumber_thesaurus_data {
1184 my ($self) = @_;
1185
1186 my $out = $self->{'outhandle'};
1187 my $verbosity = $self->{'verbosity'};
1188 my $thesaurus = $self->{'thesaurus'};
1189
1190 my $phinddir = $self->{'phinddir'};
1191 my $infile = &util::filename_cat($phinddir, "phrases.2");
1192 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1193
1194
1195 # If no thesaurus is defined, simply move the phrases file.
1196 if (!$thesaurus) {
1197 print $out "Translate phrases.2: no thesaurus data\n"
1198 if ($verbosity);
1199 &util::mv($infile, $outfile);
1200 return;
1201 }
1202
1203 print $out "Translate phrases.2: add thesaurus data\n"
1204 if ($verbosity);
1205
1206 # 1.
1207 # Read thesaurus file and store (symbols->thesaurusid) mapping
1208 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1209 my %symbolstothesid;
1210 my (@fields, $thesid, $symbols);
1211
1212 open(TH, "<$thesaurusfile");
1213
1214 while (<TH>) {
1215
1216 chomp;
1217 @fields = split(/:/, $_);
1218
1219 # get id and text
1220 $thesid = shift @fields;
1221 $symbols = shift @fields;
1222 $symbolstothesid{$symbols} = $thesid;
1223 }
1224 close TH;
1225
1226 # 2.
1227 # Read phrases file to find thesaurus entries that already
1228 # have a phindid. Store their phind-ids for later translation,
1229 # and store their frequency for later sorting.
1230 my %thesaurustophindid;
1231 my %phindidtofrequency;
1232 my ($phindid, $freq);
1233
1234 open(IN, "<$infile");
1235
1236 while(<IN>) {
1237
1238 chomp;
1239 @fields = split(/:/, $_);
1240
1241 # phindid and symbols for this line
1242 $phindid = shift @fields;
1243 $symbols = shift @fields;
1244 $freq = shift @fields;
1245
1246 # do we have a thesaurus id corresponding to this phrase?
1247 if (defined($symbolstothesid{$symbols})) {
1248 $thesid = $symbolstothesid{$symbols};
1249 $thesaurustophindid{$thesid} = $phindid;
1250 $phindidtofrequency{$phindid} = $freq;
1251 }
1252 }
1253 close IN;
1254
1255 undef %symbolstothesid;
1256
1257 # 3.
1258 # Create phind-id numbers for remaining thesaurus entries,
1259 # and note that their frequency is 0 for later sorting.
1260 my $nextphindid = $phindid + 1;
1261
1262 open(TH, "<$thesaurusfile");
1263 while(<TH>) {
1264
1265 chomp;
1266 @fields = split(/:/, $_);
1267
1268 # read thesaurus-id and ensure it has a corresponding phind-id
1269 $thesid = shift @fields;
1270 if (!defined($thesaurustophindid{$thesid})) {
1271 $thesaurustophindid{$thesid} = $nextphindid;
1272 $phindidtofrequency{$nextphindid} = 0;
1273 $nextphindid++;
1274 }
1275 }
1276 close TH;
1277
1278 # 4.
1279 # Translate thesaurus file, replacing thesaurus-id numbers with
1280 # phind-id numbers.
1281 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1282 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1283 my (@links, $linkid, %linkidtotype, $newrelation);
1284
1285 open(TH, "<$thesaurusfile");
1286 open(TO, ">$newthesaurusfile");
1287 while(<TH>) {
1288
1289 chomp;
1290 @fields = split(/:/, $_);
1291
1292 # phindid and symbols for this line
1293 ($thesid, $symbols, $relations) = @fields;
1294
1295 die unless ($thesid && $symbols);
1296 die unless $thesaurustophindid{$thesid};
1297 $phindid = $thesaurustophindid{$thesid};
1298
1299 # convert each part of the relation string to use phind-id numbers
1300 # at the same time, we want to sort the list by frequency.
1301 undef %linkidtotype;
1302
1303 foreach $linktext (split(/;/, $relations)) {
1304 @linkdata = split(/,/, $linktext);
1305
1306 # remember the linktype (e.g. BT, NT)
1307 $linktype = shift @linkdata;
1308
1309 # store the type of each link
1310 foreach $thesid (@linkdata) {
1311 die unless (defined($thesaurustophindid{$thesid}));
1312 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1313 }
1314 }
1315
1316 # sort the list of links, first by frequency, then by type.
1317 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1318 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1319 $linkcounter = (scalar @links);
1320
1321 # create a string describing the link information
1322 $linktype = $linkidtotype{$links[0]};
1323 $newrelation = $linktype;
1324 foreach $linkid (@links) {
1325 if ($linkidtotype{$linkid} ne $linktype) {
1326 $linktype = $linkidtotype{$linkid};
1327 $newrelation .= ";" . $linktype;
1328 }
1329 $newrelation .= "," . $linkid;
1330 }
1331 $newrelation .= ";";
1332
1333
1334 # output the new line
1335 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1336 }
1337 close TH;
1338 close TO;
1339
1340 undef %thesaurustophindid;
1341 undef %linkidtotype;
1342 undef %phindidtofrequency;
1343
1344 # 5.
1345 # Read thesaurus data (in phind-id format) into memory
1346 my %thesaurusdata;
1347
1348 open(TH, "<$newthesaurusfile");
1349 while(<TH>) {
1350 chomp;
1351 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1352 die unless ($phindid && $symbols);
1353 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1354 }
1355 close TH;
1356
1357 # 6.
1358 # Add thesaurus data to phrases file
1359 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1360 my (@documents, @newexp, $k, $n);
1361 my $linenumber = 0;
1362
1363 open(IN, "<$infile");
1364 open(OUT, ">$outfile");
1365
1366 # Update existing phrases
1367 while(<IN>) {
1368
1369 chomp;
1370 @fields = split(/:/, $_);
1371
1372 # get data for this line
1373 $phindid = shift @fields;
1374
1375 # output the phrase data, with thesaurus information
1376 print OUT "$phindid:", join(":", @fields);
1377
1378 # add thesaurus data
1379 if (defined($thesaurusdata{$phindid})) {
1380 @fields = split(/:/, $thesaurusdata{$phindid});
1381 shift @fields;
1382 $linkcounter = shift @fields;
1383 $relations = shift @fields;
1384
1385 print OUT ":$linkcounter:$relations";
1386 $thesaurusdata{$phindid} = "";
1387 }
1388 print OUT "\n";
1389 }
1390 close IN;
1391
1392 # Add phrases that aren't already in the file
1393 foreach $phindid (sort numerically keys %thesaurusdata) {
1394 next unless ($thesaurusdata{$phindid});
1395
1396 @fields = split(/:/, $thesaurusdata{$phindid});
1397 $symbols = shift @fields;
1398 $linkcounter = shift @fields;
1399 $relations = shift @fields;
1400
1401 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1402 }
1403 close OUT;
1404
1405}
1406
1407# restore_vocabulary_data
1408#
1409# Read phrases.3 and restore vocabulary information. Then write
1410# this data to the MGPP input files (pword.txt and pdata.txt) and
1411# (if requested) to the saved phrases file.
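# A sketch of the output formats (field values below are illustrative only):
#   pdata.txt - one "<Document>" record per phrase, e.g.
#       <Document>42:forest management:25:3:10:...
#     with the symbol numbers of the phrase expanded back into words
#   pword.txt - one "<Document>" line per phrase, holding the phrase text only
#     when the phrase is a single word (it is left empty otherwise)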
1412
1413sub restore_vocabulary_data {
1414 my ($self) = @_;
1415
1416 my $out = $self->{'outhandle'};
1417 my $verbosity = $self->{'verbosity'};
1418 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1419
1420 my $phinddir = $self->{'phinddir'};
1421 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1422 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1423 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1424 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1425
1426 my $savephrases = $self->{'savephrases'};
1427
1428 # 1.
1429 # Read the vocabulary file
1430 open(V, "<$vocabfile")
1431 || die "Cannot open $vocabfile: $!";
1432 my @symbol;
1433 my $i = 1;
1434 while(<V>) {
1435 chomp;
1436 $symbol[$i++] = $_;
1437 }
1438 close V;
1439
1440 # 2.
1441 # Translate phrases.3 to MGPP input files
1442 my ($key, $text, $word, $isThesaurus);
1443 my @fields;
1444 my $linenumber = 0;
1445
1446 open(IN, "<$infile");
1447 open(DATA, ">$datafile");
1448 open(WORD, ">$wordfile");
1449
1450 # Save the phrases in a separate text file
1451 if ($savephrases) {
1452 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1453 open(SAVE, ">$savephrases");
1454 }
1455
1456 while(<IN>) {
1457
1458 # read the line
1459 chomp;
1460 $line = $_;
1461 @fields = split(/:/, $line);
1462
1463 # get a phrase number for this line
1464 $key = shift @fields;
1465
1466 # restore the text of the phrase
1467 $text = shift @fields;
1468 $text =~ s/s(\d+)/$symbol[$1]/g;
1469 if ($text =~ / /) {
1470 $word = "";
1471 } elsif ($text ne 'untranslated') {
1472 $word = $text;
1473 }
1474
1475 # output the phrase data
1476 print DATA "<Document>";
1477 print DATA "$key:$text:", join(":", @fields), ":\n";
1478
1479 # output the word index search data
1480 print WORD "<Document>$word\n";
1481
1482 # output the phrases to a text file
1483 if ($savephrases) {
1484 if ((scalar @fields) == 7) {
1485 $isThesaurus = 1;
1486 } else {
1487 $isThesaurus = 0;
1488 }
1489 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1490 }
1491 }
1492 close IN;
1493 close WORD;
1494 close DATA;
1495 close SAVE if ($savephrases);
1496
1497}
1498
1499
1500
1501# sort routines used to renumber phrases
1502
1503sub numerically { $a <=> $b }
1504
1505sub by_doc_frequency {
1506 my $fa = 1;
1507 if ($a =~ /,/) {
1508 $fa = $a;
1509 $fa =~ s/\d+,//;
1510 }
1511 my $fb = 1;
1512 if ($b =~ /,/) {
1513 $fb = $b;
1514 $fb =~ s/\d+,//;
1515 }
1516
1517 return ($fb <=> $fa);
1518}
1519
15201;