source: trunk/gsdl/perllib/classify/Phind.pm@ 5645

Last change on this file since 5645 was 5645, checked in by mdewsnip, 21 years ago

Moved classifier descriptions into the resource bundle (perllib/strings.rb).

  • Property svn:keywords set to Author Date Id Revision
File size: 41.3 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41my @removedirs = ();
42
43my %wanted_index_files = ('td'=>1,
44 't'=>1,
45 'ti'=>1,
46 'tl'=>1,
47 'tsd'=>1,
48 'idb'=>1,
49 'ib1'=>1,
50 'ib2'=>1,
51 'ib3'=>1,
52 'i'=>1,
53 'il'=>1,
54 'w'=>1,
55 'wa'=>1);
56
57sub BEGIN {
58 @ISA = ('BasClas');
59}
60
61sub END {
62
63 # Tidy up stray files - we do this here as there's some weird problem
64 # preventing us from doing it in the get_classify_info() function (on
65 # windows at least) where the close() appears to fail on txthandle and
66 # dochandle, thus preventing us from deleting those files
67
68 foreach my $dir (@removedirs) {
69 if (-d $dir && opendir (DIR, $dir)) {
70 my @files = readdir DIR;
71 closedir DIR;
72
73 foreach $file (@files) {
74 next if $file =~ /^\.\.?$/;
75 my ($suffix) = $file =~ /\.([^\.]+)$/;
76 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
77 # delete it!
78 &util::rm (&util::filename_cat ($dir, $file));
79 }
80 }
81 }
82 }
83}
84
85my $arguments =
86 [ { 'name' => "text",
87 'desc' => "{Phind.text}",
88 'type' => "string",
89 'deft' => "section:Title,section:text",
90 'reqd' => "no" },
91 { 'name' => "title",
92 'desc' => "{Phind.title}",
93 'type' => "metadata",
94 'deft' => "Title",
95 'reqd' => "no" },
96 { 'name' => "button",
97 'desc' => "{Phind.button}",
98 'type' => "string",
99 'deft' => "Phrase",
100 'reqd' => "no" },
101 { 'name' => "language",
102 'desc' => "{Phind.language}",
103 'type' => "language",
104 'deft' => "en",
105 'reqd' => "no" },
106 { 'name' => "savephrases",
107 'desc' => "{Phind.savephrases}",
108 'type' => "string",
109 'deft' => "",
110 'reqd' => "no" },
111 { 'name' => "builddir",
112 'desc' => "{BasClas.builddir}",
113 'type' => "string",
114 'deft' => "" },
115 { 'name' => "suffixmode",
116 'desc' => "{Phind.suffixmode}",
117 'type' => "int",
118 'deft' => "1",
119 'reqd' => "no" },
120 { 'name' => "thesaurus",
121 'desc' => "{Phind.thesaurus}",
122 'type' => "string",
123 'deft' => "",
124 'reqd' => "no" },
125 { 'name' => "untidy",
126 'desc' => "{Phind.untidy}",
127 'type' => "flag",
128 'reqd' => "no" } ];
129
130my $options = { 'name' => "Phind",
131 'desc' => "{Phind.desc}",
132 'inherits' => "Yes",
133 'args' => $arguments };
134
135# sub print_usage {
136# print STDERR "
137# usage: classify Phind [options]
138
139# options:
140# -text Fields The text used to build the phrase hierarchy.
141# (default: 'section:Title,section:text')
142
143# -title Title The metadata field used to describe each document.
144# (default: 'Title')
145
146# -button Name The label for the classifier screen and button in
147# navigation bar.
148# (default: 'Phrase')
149
150# -language Regex Language or languages to use building hierarchy.
151# Languages are identified by two-letter country codes
152# like en (English), es (Spanish), and fr (French).
153# Language is a regular expression, so 'en|fr' (English or
154# French) and '..' (match any language) are valid.
155# (default: 'en'.)
156
157# -savephrases File If set, the phrase infomation will be stored in
158# the given file as text. It is probably a good idea
159# to use an absolute path.
160# (default: not set)
161
162# -suffixmode N The smode parameter to the phrase extraction program. A
163# value of 0 means that stopwords are ignored, and of 1
164# means that stopwords are used.
165# (default: 1)
166
167# -thesaurus Name Name of a thesaurus stored in Phind format in the
168# collection's etc directory.
169# (default: not set)
170
171# -untidy Don't remove working files.
172
173# "; }
174
175# Phrase delimiter symbols - these should be abstracted out someplace
176
177my $colstart = "COLLECTIONSTART";
178my $colend = "COLLECTIONEND";
179my $doclimit = "DOCUMENTLIMIT";
180my $senlimit = "SENTENCELIMIT";
181my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
182
183
184# Create a new Phind browser based on collect.cfg
185
186sub new {
187 my $class = shift (@_);
188 my $self = new BasClas($class, @_);
189
190 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
191 my $option_list = $self->{'option_list'};
192 push( @{$option_list}, $options );
193
194 my $out = $self->{'outhandle'};
195
196 # Ensure the Phind generate scripts are in place
197 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
198 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
199 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
200
201 if (!(-e $file1)) {
202 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
203 exit(1);
204 }
205
206 # Parse classifier arguments
207 my $builddir = "";
208 if (!parsargv::parse(\@_,
209 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
210 q^title/.*/Title^, \$self->{'titlefield'},
211 q^button/.*/Phrase^, \$self->{'buttonname'},
212 q^language/.*/en^, \$language,
213 q^builddir/.*/^, \$builddir,
214 q^savephrases/.*/^, \$self->{'savephrases'},
215 q^suffixmode/\d/1^, \$self->{'suffixmode'},
216 q^thesaurus/.*/^, \$self->{'thesaurus'},
217 q^untidy^, \$self->{'untidy'},
218 "allow_extra_options")) {
219
220 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
221 $self->print_txt_usage(""); # Use default resource bundle
222 die "\n";
223 }
224
225 # classifier information
226 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
227
228 # limit languages
229 $self->{'language_exp'} = $language;
230
231 # collection directories
232 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
233 if (!$builddir) {
234 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
235 }
236 $self->{'builddir'} = $builddir;
237
238 $self->{'total'} = 0;
239
240 return bless $self, $class;
241}
242
243
244# Initialise the Phind classifier
245
246sub init {
247 my $self = shift (@_);
248
249 # ensure we have a build directory
250 my $builddir = $self->{'builddir'};
251 die unless (-e "$builddir");
252
253 # create Phind directory
254 my $phnumber = 1;
255 my $phinddir = &util::filename_cat($builddir, "phind1");
256 while (-e "$phinddir") {
257 $phnumber++;
258 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
259 }
260 &util::mk_dir("$phinddir");
261 $self->{'phinddir'} = $phinddir;
262 $self->{'phindnumber'} = $phnumber;
263
264 push(@removedirs, $phinddir) unless $self->{'untidy'};
265
266 # open filehandles for documents and text
267 my $clausefile = &util::filename_cat("$phinddir", "clauses");
268 &util::rm($clausefile) if (-e $clausefile);
269
270 my $txthandle = 'TEXT' . $phnumber;
271 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
272 $self->{'txthandle'} = $txthandle;
273
274 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
275 &util::rm($docfile) if (-e $docfile);
276
277 my $dochandle = 'DOC' . $phnumber;
278 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
279 $self->{'dochandle'} = $dochandle;
280
281}
282
283
284# Classify each document.
285#
286# Each document is passed here in turn. The classifier extracts the
287# text of each and stores it in the clauses file. Document details are
288# stored in the docs.txt file.
289
290sub classify {
291 my $self = shift (@_);
292 my $doc_obj = shift @_;
293
294 my $verbosity = $self->{'verbosity'};
295 my $top_section = $doc_obj->get_top_section();
296
297 my $titlefield = $self->{'titlefield'};
298
299 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
300 if (!defined($title)) {
301 $title = "";
302 print STDERR "Phind: document has no title\n";
303 }
304 print "process: $title\n" if ($verbosity > 2);
305
306 # Only consider the file if it is in the correct language
307 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
308 my $phrlanguage = $self->{'language_exp'};
309 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
310
311 # record this file
312 $self->{'total'} ++;
313 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
314
315
316 # Store document details
317 my $OID = $doc_obj->get_OID();
318 $OID = "NULL" unless defined $OID;
319 my $dochandle = $self->{'dochandle'};
320 print $dochandle "<Document>\t$OID\t$title\n";
321
322 # Store the text occuring in this object
323
324 # output the document delimiter
325 my $txthandle = $self->{'txthandle'};
326 print $txthandle "$doclimit\n";
327
328 # iterarate over the required indexes and store their text
329 my $indexes = $self->{'indexes'};
330 my $text = "";
331 my ($part, $level, $field, $section, $data, $dataref);
332
333 foreach $part (split(/,/, $indexes)) {
334
335 # Each field has a level and a data element ((e.g. document:Title)
336 ($level, $field) = split(/:/, $part);
337 die unless ($level && $field);
338
339 # Extract the text from every section
340 # (In phind, document:text and section:text are equivalent)
341 if ($field eq "text") {
342 $data = "";
343 $section = $doc_obj->get_top_section();
344 while (defined($section)) {
345 $data .= $doc_obj->get_text($section) . "\n";
346 $section = $doc_obj->get_next_section($section);
347 }
348 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
349 }
350
351 # Extract a metadata field from a document
352 # (If there is more than one element of the given type, get them all.)
353 elsif ($level eq "document") {
354 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
355 foreach $data (@$dataref) {
356 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
357 }
358 }
359
360 # Extract metadata from every section in a document
361 elsif ($level eq "section") {
362 $data = "";
363 $section = $doc_obj->get_top_section();
364 while (defined($section)) {
365 $dataref = $doc_obj->get_metadata($section, $field);
366 $data .= join("\n", @$dataref) . "\n";
367 $section = $doc_obj->get_next_section($section);
368 }
369 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
370 }
371
372 # Some sort of specification which I don't understand
373 else {
374 die "Unknown level ($level) in Phind index ($part)\n";
375 }
376
377 }
378
379 # output the text
380 $text =~ tr/\n//s;
381 print $txthandle "$text";
382}
383
384
385# Construct the classifier from the information already gathered
386#
387# When get_classify_info is called, the clauses and docs.txt files have
388# already been constructed in the Phind directory. This function will
389# translate them into compressed, indexed MGPP files that can be read by
390# the phindcgi script. It will also register our classifier so that it
391# shows up in the navigation bar.
392
393sub get_classify_info {
394 my $self = shift (@_);
395
396 close $self->{'dochandle'};
397 close $self->{'txthandle'};
398 my $verbosity = $self->{'verbosity'};
399 my $out = $self->{'outhandle'};
400 my $phinddir = $self->{'phinddir'};
401
402 my $osextra = "";
403 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
404 $osextra = " -d /";
405 }
406
407 if ($verbosity) {
408 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
409 print $out "*** in ", $self->{'phinddir'}, "\n";
410 }
411
412 # Construct phind indexes
413 my $suffixmode = $self->{'suffixmode'};
414 my ($command, $status);
415
416 # Generate the vocabulary, symbol statistics, and numbers file
417 # from the clauses file
418 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
419 &extract_vocabulary($self);
420
421 # Use the suffix program to generate the phind/phrases file
422 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
423 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
424
425 # check that we generated some files. It's not necessarily an error if
426 # we didn't (execute() would have quit on error), but we can't go on.
427 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
428 if (! -r $phrasesfile) {
429 print $out "\nNo phrases found for Phind classifier!\n";
430 return;
431 }
432
433 # Create the phrase file and put phrase numbers in phind/phrases
434 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
435 &renumber_phrases($self);
436
437 print $out "\nCreating phrase databases\n";
438 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
439 my $mg_stem = &util::filename_cat($phinddir, "pdata");
440
441 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
442 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
443 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
444
445 # create the mg index of words
446 print $out "\nCreating word-level search indexes\n";
447 $mg_input = &util::filename_cat($phinddir, "pword.txt");
448 $mg_stem = &util::filename_cat($phinddir, "pword");
449
450 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
451 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
452 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
453 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
454 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
455 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
456
457 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
458 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
459 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
460
461 # create the mg document information database
462 print $out "\nCreating document information databases\n";
463 $mg_input = &util::filename_cat($phinddir, "docs.txt");
464 $mg_stem = &util::filename_cat($phinddir, "docs");
465
466 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
467 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
468 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
469
470 # Return the information about the classifier that we'll later want to
471 # use to create macros when the Phind classifier document is displayed.
472 my %classifyinfo = ('thistype'=>'Invisible',
473 'childtype'=>'Phind',
474 'Title'=>$self->{'buttonname'},
475 'parameters'=>"phindnumber=$self->{'phindnumber'}",
476 'contains'=>[]);
477
478 my $collection = $self->{'collection'};
479 my $url = "library?a=p&p=phind&c=$collection";
480 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
481
482 return \%classifyinfo;
483}
484
485
486
487sub convert_gml_to_tokens {
488
489 my ($language_exp, $text) = @_;
490
491 # escape any magic words... - jrm21
492 foreach my $delim (@delimiters) {
493 my $replacement=lc($delim);
494 my $num= $text=~ s/$delim/$replacement/g;
495 if (!$num) {$num=0;}
496 }
497
498 if ($language_exp =~ /en/) {
499 return &convert_gml_to_tokens_EN($text);
500 }
501
502 if ($language_exp =~ /zh/) {
503 return &convert_gml_to_tokens_ZH($text);
504 }
505
506 $_ = $text;
507
508 # 1. remove GML tags
509
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
512 s/\s*<br>\s*/ LINEBREAK /isgo;
513 s/<[^>]*>/ /sgo;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Remove these tags.
517 s/&amp;/&/sgo;
518 s/&lt;/</sgo;
519 s/&gt;/>/sgo;
520 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
521 s/\s*<br>\s*/ LINEBREAK /isgo;
522 s/<[^>]*>/ /sgo;
523
524 # replace<p> and <br> placeholders with clause break symbol (\n)
525 s/\s+/ /gso;
526 s/PARAGRAPHBREAK/\n/sgo;
527 s/LINEBREAK/\n/sgo;
528
529
530
531
532 # 2. Split the remaining text into space-delimited tokens
533
534 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
535 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
536
537 # Split text at word boundaries
538 s/\b/ /go;
539
540 # 3. Convert the remaining text to "clause format"
541
542 # Insert newline if the end of a sentence is detected
543 # (delimter is: "[\.\?\!]\s")
544 # s/\s*[\.\?\!]\s+/\n/go;
545
546 # remove unnecessary punctuation and replace with clause break symbol (\n)
547 s/[^\w ]/\n/go;
548
549 # remove extraneous whitespace
550 s/ +/ /sgo;
551 s/^\s+//mgo;
552 s/\s*$/\n/mgo;
553
554 # remove lines that contain one word or less
555 s/^\S*$//mgo;
556 s/^\s*$//mgo;
557 tr/\n//s;
558
559 return $_;
560}
561
562# a chinese version
563sub convert_gml_to_tokens_ZH {
564
565 $_ = shift @_;
566
567 # Replace all whitespace with a simple space
568 s/\s+/ /gs;
569 # Remove everything that is in a tag
570 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
571 s/\s*<br>\s*/ LINEBREAK /isg;
572 s/<[^>]*>/ /sg;
573
574 # Now we have the text, but it may contain HTML
575 # elements coded as &gt; etc. Remove these tags.
576 s/&lt;/</sg;
577 s/&gt;/>/sg;
578
579 s/\s+/ /sg;
580 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
581 s/\s*<br>\s*/ LINEBREAK /isg;
582 s/<[^>]*>/ /sg;
583
584 # remove &amp; and other miscellaneous markup tags
585 s/&amp;/&/sg;
586 s/&lt;/</sg;
587 s/&gt;/>/sg;
588 s/&amp;/&/sg;
589
590 # replace<p> and <br> placeholders with carriage returns
591 s/PARAGRAPHBREAK/\n/sg;
592 s/LINEBREAK/\n/sg;
593
594
595# print STDERR "text:$_\n";
596 return $_;
597}
598
599# A version of convert_gml_to_tokens that is fine-tuned to the English language.
600
601sub convert_gml_to_tokens_EN {
602 $_ = shift @_;
603
604 # FIRST, remove GML tags
605
606 # Replace all whitespace with a simple space
607 s/\s+/ /gs;
608
609 # Remove everything that is in a tag
610 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
611 s/\s*<br>\s*/ LINEBREAK /isg;
612 s/<[^>]*>/ /sg;
613
614 # Now we have the text, but it may contain HTML
615 # elements coded as &gt; etc. Remove these tags.
616 s/&lt;/</sg;
617 s/&gt;/>/sg;
618
619 s/\s+/ /sg;
620 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
621 s/\s*<br>\s*/ LINEBREAK /isg;
622 s/<[^>]*>/ /sg;
623
624 # remove &amp; and other miscellaneous markup tags
625 s/&amp;/&/sg;
626 s/&lt;/</sg;
627 s/&gt;/>/sg;
628 s/&amp;/&/sg;
629
630 # replace<p> and <br> placeholders with carriage returns
631 s/PARAGRAPHBREAK/\n/sg;
632 s/LINEBREAK/\n/sg;
633
634
635 # Exceptional punctuation
636 #
637 # We make special cases of some punctuation
638
639 # remove any apostrophe that indicates omitted letters
640 s/(\w+)\'(\w*\s)/ $1$2 /g;
641
642 # remove period that appears in a person's initals
643 s/\s([A-Z])\./ $1 /g;
644
645 # replace hyphens in hypheanted words and names with a space
646 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
647
648 # Convert the remaining text to "clause format",
649 # This means removing all excess punctuation and garbage text,
650 # normalising valid punctuation to fullstops and commas,
651 # then putting one cluse on each line.
652
653 # Insert newline when the end of a sentence is detected
654 # (delimter is: "[\.\?\!]\s")
655 s/\s*[\.\?\!]\s+/\n/g;
656
657 # split numbers after four digits
658 s/(\d\d\d\d)/$1 /g;
659
660 # split words after 32 characters
661
662 # squash repeated punctuation
663 tr/A-Za-z0-9 //cs;
664
665 # save email addresses
666 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
667
668 # normalise clause breaks (mostly punctuation symbols) to commas
669 s/[^A-Za-z0-9 \n]+/ , /g;
670
671 # Remove repeated commas, and replace with newline
672 s/\s*,[, ]+/\n/g;
673
674 # remove extra whitespace
675 s/ +/ /sg;
676 s/^\s+//mg;
677 s/\s*$/\n/mg;
678
679 # remove lines that contain one word or less
680 s/^\w*$//mg;
681 s/^\s*$//mg;
682 tr/\n//s;
683
684 return $_;
685
686}
687
688
689
690# Execute a system command
691
692sub execute {
693 my ($command, $verbosity, $outhandle) = @_;
694 print $outhandle "Executing: $command\n" if ($verbosity > 2);
695 $! = 0;
696 my $status = system($command);
697 if ($status != 0) {
698 print STDERR "Phind - Error executing '$command': $!\n";
699 exit($status); # this causes the build to fail...
700 }
701}
702
703
704# Generate the vocabulary, symbol statistics, and numbers file from the
705# clauses file. This is legacy code, so is a bit messy and probably wont
706# run under windows.
707
708sub extract_vocabulary {
709 my ($self) = @_;
710
711 my $verbosity = $self->{'verbosity'};
712 my $out = $self->{'outhandle'};
713
714 my $collectiondir = $self->{'collectiondir'};
715 my $phinddir = $self->{'phinddir'};
716
717 my $language_exp = $self->{'language_exp'};
718
719 my ($w, $l, $line, $word);
720
721 my ($first_delimiter, $last_delimiter,
722 $first_stopword, $last_stopword,
723 $first_extractword, $last_extractword,
724 $first_contentword, $last_contentword,
725 $phrasedelimiter);
726
727 my $thesaurus = $self->{'thesaurus'};
728 my ($thesaurus_links, $thesaurus_terms,
729 %thesaurus, $first_thesaurusword, $last_thesaurusword);
730
731 my %symbol;
732 my (%freq);
733
734 print $out "Calculating vocabulary\n" if ($verbosity > 1);
735
736 # Read and store the stopwords
737 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
738 my $stopword_files = ();
739 my ($language, $language_dir, $file, $file_name);
740 my %stopwords;
741
742 # Examine each directory in the stopword directory
743 opendir(STOPDIR, $stopdir);
744 foreach $language (readdir STOPDIR) {
745
746 # Ignore entries that do not match the classifier's language
747 next unless ($language =~ /$language_exp/);
748 $language_dir = &util::filename_cat($stopdir, $language);
749 next unless (-d "$language_dir");
750
751 opendir(LANGDIR, $language_dir);
752 foreach $file (readdir LANGDIR) {
753
754 # Ignore entries that are not stopword files
755 next unless ($file =~ /sw$/);
756 $file_name = &util::filename_cat($language_dir, $file);
757 next unless (-f "$file_name");
758
759 # Read the stopwords
760 open(STOPFILE, "<$file_name");
761 while (<STOPFILE>) {
762 s/^\s+//;
763 s/\s.*//;
764 $word = $_;
765 $l = lc($word);
766 $stopwords{$l} = $word;
767 }
768 close STOPFILE;
769
770 }
771 closedir LANGDIR;
772 }
773 closedir STOPDIR;
774
775 # Read thesaurus information
776 if ($thesaurus) {
777
778 # link file exists
779 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
780 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
781
782 # ensure term file exists in the correct language
783 if ($language_exp =~ /^([a-z][a-z])/) {
784 $language = $1;
785 } else {
786 $language = 'en';
787 }
788 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
789 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
790
791
792 # Read the thesaurus terms
793 open(TH, "<$thesaurus_terms");
794 while(<TH>) {
795 s/^\d+ //;
796 s/\(.*\)//;
797 foreach $w (split(/\s+/, $_)) {
798 $thesaurus{lc($w)} = $w;
799 }
800 }
801 close TH;
802 }
803
804 # Read words in the text and count occurences
805 open(TXT, "<$phinddir/clauses");
806
807 my @words;
808 while(<TXT>) {
809 $line = $_;
810 next unless ($line =~ /./);
811
812 @words = split(/\s+/, $line);
813 foreach $w (@words) {
814 $l = lc($w);
815 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
816 $freq{$w}++;
817 }
818 $freq{$senlimit}++;
819 }
820
821 close TXT;
822
823 # Calculate the "best" form of each word
824 my (%bestform, %totalfreq, %bestfreq);
825
826 foreach $w (sort (keys %freq)) {
827 $l = lc($w);
828
829 # totalfreq is the number of times a term appears in any form
830 $totalfreq{$l} += $freq{$w};
831
832 if (defined $stopwords{$l}) {
833 $bestform{$l} = $stopwords{$l};
834
835 } elsif (defined $thesaurus{$l}) {
836 $bestform{$l} = $thesaurus{$l};
837
838 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
839 $bestfreq{$l} = $freq{$w};
840 $bestform{$l} = $w;
841 }
842 }
843 undef %freq;
844 undef %bestfreq;
845
846
847 # Assign symbol numbers to tokens
848 my $nextsymbol = 1;
849 my (@vocab);
850
851 # Delimiters
852 $first_delimiter = 1;
853
854 foreach $word (@delimiters) {
855
856# $word = lc($word); # jrm21
857 $word = uc($word);
858 $bestform{$word} = $word;
859 $vocab[$nextsymbol] = $word;
860 $symbol{$word} = $nextsymbol;
861 $nextsymbol++;
862 }
863 $last_delimiter = $nextsymbol - 1;
864 # Stopwords
865 $first_stopword = $nextsymbol;
866
867 foreach my $word (sort keys %stopwords) {
868 # don't include stopword unless it occurs in the text
869 $word = lc($word);
870 next unless ($totalfreq{$word});
871 next if ($symbol{$word});
872
873 $vocab[$nextsymbol] = $word;
874 $symbol{$word} = $nextsymbol;
875 $nextsymbol++;
876 }
877 $last_stopword = $nextsymbol - 1;
878 $first_contentword = $nextsymbol;
879
880 # Thesaurus terms
881 if ($thesaurus) {
882 $first_thesaurusword = $nextsymbol;
883
884 foreach my $word (sort keys %thesaurus) {
885
886 $word = lc($word);
887 next if ($symbol{$word});
888 $bestform{$word} = $thesaurus{$word};
889
890 $vocab[$nextsymbol] = $word;
891 $symbol{$word} = $nextsymbol;
892 $nextsymbol++;
893
894 }
895 $last_thesaurusword = $nextsymbol - 1;
896 }
897
898 # Other content words
899 $first_extractword = $nextsymbol;
900
901 foreach my $word (sort (keys %bestform)) {
902
903 next if ($symbol{$word});
904
905 $vocab[$nextsymbol] = $word;
906 $symbol{$word} = $nextsymbol;
907 $nextsymbol++;
908 }
909 $last_extractword = $nextsymbol - 1;
910 $last_contentword = $nextsymbol - 1;
911
912 # Outut the words
913 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
914 open(VOC, ">$phinddir/clauses.vocab");
915
916 for (my $i = 1; $i < $nextsymbol; $i++) {
917 $w = $vocab[$i];
918
919 print VOC "$bestform{$w}\n";
920 $totalfreq{$w} = 0 unless ($totalfreq{$w});
921 }
922 close VOC;
923
924
925 # Create statistics file
926 # Output statistics about the vocablary
927 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
928 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
929
930 open(STAT, ">$phinddir/clauses.stats")
931 || die "Cannot open $phinddir/clauses.stats: $!";
932
933 print STAT "first_delimiter $first_delimiter\n";
934 print STAT "last_delimiter $last_delimiter\n";
935 print STAT "first_stopword $first_stopword\n";
936 print STAT "last_stopword $last_stopword\n";
937 if ($thesaurus) {
938 print STAT "first_thesaurusword $first_thesaurusword\n";
939 print STAT "last_thesaurusword $last_thesaurusword\n";
940 }
941 print STAT "first_extractword $first_extractword\n";
942 print STAT "last_extractword $last_extractword\n";
943 print STAT "first_contentword $first_contentword\n";
944 print STAT "last_contentword $last_contentword\n";
945 print STAT "first_symbol $first_delimiter\n";
946 print STAT "last_symbol $last_contentword\n";
947 print STAT "first_word $first_stopword\n";
948 print STAT "last_word $last_contentword\n";
949 close STAT;
950
951 undef @vocab;
952
953
954 # Create numbers file
955 # Save text as symbol numbers
956 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
957
958 open(TXT, "<$phinddir/clauses");
959 open(NUM, ">$phinddir/clauses.numbers");
960
961## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
962## print NUM "$symbol{lc($colstart)}\n"; # jrm21
963 $phrasedelimiter = $symbol{$senlimit};
964 print NUM "$symbol{$colstart}\n";
965
966 # set up the special symbols that delimit documents and sentences
967 while(<TXT>) {
968
969 # split sentence into a list of tokens
970 $line = $_;
971 next unless ($line =~ /./);
972 @words = split(/\s+/, $line);
973
974 # output one token at a time
975 foreach $word (@words) {
976# don't lower-case special delimiters - jrm21
977 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
978 $word = lc($word);
979 }
980 print NUM "$symbol{$word}\n";
981 }
982
983 # output phrase delimiter
984 print NUM "$phrasedelimiter\n";
985 }
986
987 close TXT;
988# print NUM "$symbol{lc($colend)}\n";# jrm21
989 print NUM "$symbol{$colend}\n";
990 close NUM;
991
992 # Save thesaurus data in one convienient file
993 if ($thesaurus) {
994
995 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
996
997
998 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
999 if ($verbosity > 1);
1000
1001 # Read the thesaurus terms
1002 my ($num, $text, %thes_symbols);
1003
1004 open(TH, "<$thesaurus_terms");
1005 while(<TH>) {
1006 chomp;
1007 @words = split(/\s+/, $_);
1008 $num = shift @words;
1009 $text = "";
1010
1011 # translate words into symbol numbers
1012 foreach $word (@words) {
1013 $word = lc($word);
1014 if ($symbol{$word}) {
1015 $text .= "s$symbol{$word} ";
1016 } elsif ($verbosity) {
1017 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1018 }
1019 }
1020 $text =~ s/ $//;
1021 $thes_symbols{$num} = $text;
1022 }
1023 close TH;
1024
1025 # Read the thesaurus links and write the corresponding data
1026 open(TH, "<$thesaurus_links");
1027 open(THOUT, ">$thesaurusfile");
1028
1029 while(<TH>) {
1030 chomp;
1031 ($num, $text) = split(/:/, $_);
1032
1033 if (defined($thes_symbols{$num})) {
1034 print THOUT "$num:$thes_symbols{$num}:$text\n";
1035 } else {
1036 print THOUT "$num:untranslated:$text\n";
1037 }
1038 }
1039 close TH;
1040 close THOUT;
1041 }
1042
1043
1044
1045
1046}
1047
1048
1049# renumber_phrases
1050#
1051# Prepare the phrases file to be input to mgpp. The biggest problem is
1052# reconciling the phrase identifiers used by the suffix program (which
1053# we'll call suffix-id numbers) with the numbers used in the thesaurus
1054# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1055# Phind-id numbers must be sorted by frequency of occurance.
1056#
1057# Start creating a set of phind-id numbers from the sorted suffix-id
1058# numbers and (if required) the thesaurus-id numbers. Then add any other
1059# phrases occuring in the thesaurus.
1060#
1061# The last thing we have to do is restore the vocabulary information to the
1062# phrase file so that the phrases are stored as words, not as symbol
1063# numbers.
1064
1065# The original phrases file looks something like this:
1066# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1067# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1068# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1069# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1070
1071
1072sub renumber_phrases {
1073 my ($self) = @_;
1074
1075 renumber_suffix_data($self);
1076 renumber_thesaurus_data($self);
1077 restore_vocabulary_data($self);
1078
1079}
1080
1081
1082
1083# renumber_suffix_data
1084#
1085# Translate phrases file to phrases.2 using phind keys instead
1086# of suffix keys and sorting the expansion data.
1087
1088sub renumber_suffix_data {
1089 my ($self) = @_;
1090
1091 my $verbosity = $self->{'verbosity'};
1092 my $out = $self->{'outhandle'};
1093 print $out "Translate phrases: suffix-ids become phind-id's\n"
1094 if ($verbosity);
1095
1096 my $phinddir = $self->{'phinddir'};
1097 my $infile = &util::filename_cat($phinddir, 'phrases');
1098 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1099
1100 # Read the phrase file. Calculate initial set of phind-id
1101 # numbers and store (suffixid -> frequency) relation.
1102
1103 my %suffixtophind;
1104 my @phindfrequency;
1105 my (@fields, $suffixid);
1106 my $nextphind = 1;
1107
1108 open(IN, "<$infile");
1109 while(<IN>) {
1110
1111 chomp;
1112 @fields = split(/:/, $_);
1113
1114 # get next suffixid and phindid
1115 $suffixid = shift @fields;
1116 $suffixtophind{$suffixid} = $nextphind;
1117
1118 # store total frequency
1119 shift @fields;
1120 $totalfrequency[$nextphind] = shift @fields;
1121
1122 $nextphind++;
1123 }
1124 close IN;
1125
1126
1127 # Translate phrases file to phrases.2. Use phind keys (not suffix
1128 # keys), sort expansion and document occurance data in order of
1129 # descending frequency..
1130 open(IN, "<$infile");
1131 open(OUT, ">$outfile");
1132
1133 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1134 my (@documwents, @newexp, $k, $n);
1135 my $linenumber = 0;
1136
1137 while(<IN>) {
1138
1139 # read the line
1140 chomp;
1141 @fields = split(/:/, $_);
1142
1143 # get a phrase number for this line
1144 $suffixid = shift @fields;
1145 die unless (defined($suffixtophind{$suffixid}));
1146 $phindid = $suffixtophind{$suffixid};
1147
1148 # get the symbols in the phrase
1149 $text = shift @fields;
1150
1151 # output status information
1152 $linenumber++;
1153 if ($verbosity > 2) {
1154 if ($linenumber % 1000 == 0) {
1155 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1156 }
1157 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1158 }
1159
1160 # get the phrase frequency
1161 $tf = shift @fields;
1162
1163 # get the number of expansions
1164 $countexp = shift @fields;
1165
1166 # get the expansions, convert them into phind-id numbers, and sort them
1167 $expansions = shift @fields;
1168 @newexp = ();
1169 foreach $k (split(/,/, $expansions)) {
1170 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1171 $n = $suffixtophind{$k};
1172 push @newexp, $n;
1173 }
1174 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1175
1176 # get the number of documents
1177 $countdocs = shift @fields;
1178
1179 # get the documents and sort them
1180 $documents = shift @fields;
1181 $documents =~ s/d//g;
1182 @documents = split(/;/, $documents);
1183 @documents = sort by_doc_frequency @documents;
1184
1185 # output the phrase data
1186 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1187 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1188
1189 }
1190
1191 close IN;
1192 close OUT;
1193}
1194
1195
1196# renumber_thesaurus_data
1197#
1198# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1199
1200sub renumber_thesaurus_data {
1201 my ($self) = @_;
1202
1203 my $out = $self->{'outhandle'};
1204 my $verbosity = $self->{'verbosity'};
1205 my $thesaurus = $self->{'thesaurus'};
1206
1207 my $phinddir = $self->{'phinddir'};
1208 my $infile = &util::filename_cat($phinddir, "phrases.2");
1209 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1210
1211
1212 # If no thesaurus is defined, simply move the phrases file.
1213 if (!$thesaurus) {
1214 print $out "Translate phrases.2: no thesaurus data\n"
1215 if ($verbosity);
1216 &util::mv($infile, $outfile);
1217 return;
1218 }
1219
1220 print $out "Translate phrases.2: add thesaurus data\n"
1221 if ($verbosity);
1222
1223 # 1.
1224 # Read thesaurus file and store (symbols->thesaurusid) mapping
1225 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1226 my %symbolstothesid;
1227 my (@fields, $thesid, $symbols);
1228
1229 open(TH, "<$thesaurusfile");
1230
1231 while (<TH>) {
1232
1233 chomp;
1234 @fields = split(/:/, $_);
1235
1236 # get id and text
1237 $thesid = shift @fields;
1238 $symbols = shift @fields;
1239 $symbolstothesid{$symbols} = $thesid;
1240 }
1241 close TH;
1242
1243 # 2.
1244 # Read phrases file to find thesaurus entries that already
1245 # have a phindid. Store their phind-ids for later translation,
1246 # and store their frequency for later sorting.
1247 my %thesaurustophindid;
1248 my %phindidtofrequency;
1249 my ($phindid, $freq);
1250
1251 open(IN, "<$infile");
1252
1253 while(<IN>) {
1254
1255 chomp;
1256 @fields = split(/:/, $_);
1257
1258 # phindid and symbols for this line
1259 $phindid = shift @fields;
1260 $symbols = shift @fields;
1261 $freq = shift @fields;
1262
1263 # do we have a thesaurus id corresponding to this phrase?
1264 if (defined($symbolstothesid{$symbols})) {
1265 $thesid = $symbolstothesid{$symbols};
1266 $thesaurustophindid{$thesid} = $phindid;
1267 $phindidtofrequency{$phindid} = $freq;
1268 }
1269 }
1270 close IN;
1271
1272 undef %symbolstothesid;
1273
1274 # 3.
1275 # Create phind-id numbers for remaining thesaurus entries,
1276 # and note that their frequency is 0 for later sorting.
1277 my $nextphindid = $phindid + 1;
1278
1279 open(TH, "<$thesaurusfile");
1280 while(<TH>) {
1281
1282 chomp;
1283 @fields = split(/:/, $_);
1284
1285 # read thesaurus-id and ensure it has a corresponding phind-id
1286 $thesid = shift @fields;
1287 if (!defined($thesaurustophindid{$thesid})) {
1288 $thesaurustophindid{$thesid} = $nextphindid;
1289 $phindidtofrequency{$nextphindid} = 0;
1290 $nextphindid++;
1291 }
1292 }
1293 close TH;
1294
1295 # 4.
1296 # Translate thesaurus file, replacing thesaurus-id numbers with
1297 # phind-id numbers.
1298 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1299 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1300 my (@links, $linkid, %linkidtotype, $newrelation);
1301
1302 open(TH, "<$thesaurusfile");
1303 open(TO, ">$newthesaurusfile");
1304 while(<TH>) {
1305
1306 chomp;
1307 @fields = split(/:/, $_);
1308
1309 # phindid and symbols for this line
1310 ($thesid, $symbols, $relations) = @fields;
1311
1312 die unless ($thesid && $symbols);
1313 die unless $thesaurustophindid{$thesid};
1314 $phindid = $thesaurustophindid{$thesid};
1315
1316 # convert each part of the relation string to use phind-id numbers
1317 # at the same time, we want to sort the list by frequency.
1318 undef %linkidtotype;
1319
1320 foreach $linktext (split(/;/, $relations)) {
1321 @linkdata = split(/,/, $linktext);
1322
1323 # remember the linktype (e.g. BT, NT)
1324 $linktype = shift @linkdata;
1325
1326 # store the type of each link
1327 foreach $thesid (@linkdata) {
1328 die unless (defined($thesaurustophindid{$thesid}));
1329 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1330 }
1331 }
1332
1333 # sort the list of links, first by frequency, then by type.
1334 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1335 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1336 $linkcounter = (scalar @links);
1337
1338 # create a string describing the link information
1339 $linktype = $linkidtotype{$links[0]};
1340 $newrelation = $linktype;
1341 foreach $linkid (@links) {
1342 if ($linkidtotype{$linkid} ne $linktype) {
1343 $linktype = $linkidtotype{$linkid};
1344 $newrelation .= ";" . $linktype;
1345 }
1346 $newrelation .= "," . $linkid;
1347 }
1348 $newrelation .= ";";
1349
1350
1351 # output the new line
1352 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1353 }
1354 close TH;
1355 close TO;
1356
1357 undef %thesaurustophindid;
1358 undef %linkidtotype;
1359 undef %phindidtofrequency;
1360
1361 # 5.
1362 # Read thesaurus data (in phind-id format) into memory
1363 my %thesaurusdata;
1364
1365 open(TH, "<$newthesaurusfile");
1366 while(<TH>) {
1367 chomp;
1368 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1369 die unless ($phindid && $symbols);
1370 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1371 }
1372 close TH;
1373
1374 # 6.
1375 # Add thesaurus data to phrases file
1376 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1377 my (@documwents, @newexp, $k, $n);
1378 my $linenumber = 0;
1379
1380 open(IN, "<$infile");
1381 open(OUT, ">$outfile");
1382
1383 # Update existing phrases
1384 while(<IN>) {
1385
1386 chomp;
1387 @fields = split(/:/, $_);
1388
1389 # get data for this line
1390 $phindid = shift @fields;
1391
1392 # output the phrase data, with thesaurus information
1393 print OUT "$phindid:", join(":", @fields);
1394
1395 # add thesaurus data
1396 if (defined($thesaurusdata{$phindid})) {
1397 @fields = split(/:/, $thesaurusdata{$phindid});
1398 shift @fields;
1399 $linkcounter = shift @fields;
1400 $relations = shift @fields;
1401
1402 print OUT ":$linkcounter:$relations";
1403 $thesaurusdata{$phindid} = "";
1404 }
1405 print OUT "\n";
1406 }
1407 close IN;
1408
1409 # Add phrases that aren't already in the file
1410 foreach $phindid (sort numerically keys %thesaurusdata) {
1411 next unless ($thesaurusdata{$phindid});
1412
1413 @fields = split(/:/, $thesaurusdata{$phindid});
1414 $symbols = shift @fields;
1415 $linkcounter = shift @fields;
1416 $relations = shift @fields;
1417
1418 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1419 }
1420 close OUT;
1421
1422}
1423
1424# restore_vocabulary_data
1425#
1426# Read phrases.3 and restore vocabulary information. Then write
1427# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1428# (if requested) to the saved phrases file.
1429
1430sub restore_vocabulary_data {
1431 my ($self) = @_;
1432
1433 my $out = $self->{'outhandle'};
1434 my $verbosity = $self->{'verbosity'};
1435 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1436
1437 my $phinddir = $self->{'phinddir'};
1438 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1439 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1440 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1441 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1442
1443 my $savephrases = $self->{'savephrases'};
1444
1445 # 1.
1446 # Read the vocabulary file
1447 open(V, "<$vocabfile")
1448 || die "Cannot open $vocabfile: $!";
1449 my @symbol;
1450 my $i = 1;
1451 while(<V>) {
1452 chomp;
1453 $symbol[$i++] = $_;
1454 }
1455 close V;
1456
1457 # 2.
1458 # Translate phrases.3 to MGPP input files
1459 my ($key, $text, $word, $isThesaurus);
1460 my @fields;
1461 my $linenumber = 0;
1462
1463 open(IN, "<$infile");
1464 open(DATA, ">$datafile");
1465 open(WORD, ">$wordfile");
1466
1467 # Save the phrases in a separate text file
1468 if ($savephrases) {
1469 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1470 open(SAVE, ">$savephrases");
1471 }
1472
1473 while(<IN>) {
1474
1475 # read the line
1476 chomp;
1477 $line = $_;
1478 @fields = split(/:/, $line);
1479
1480 # get a phrase number for this line
1481 $key = shift @fields;
1482
1483 # restore the text of the phrase
1484 $text = shift @fields;
1485 $text =~ s/s(\d+)/$symbol[$1]/g;
1486 if ($text =~ / /) {
1487 $word = "";
1488 } elsif ($text ne 'untranslated') {
1489 $word = $text;
1490 }
1491
1492 # output the phrase data
1493 print DATA "<Document>";
1494 print DATA "$key:$text:", join(":", @fields), ":\n";
1495
1496 # output the word index search data
1497 print WORD "<Document>$word\n";
1498
1499 # output the phrases to a text file
1500 if ($savephrases) {
1501 if ((scalar @fields) == 7) {
1502 $isThesaurus = 1;
1503 } else {
1504 $isThesaurus = 0;
1505 }
1506 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1507 }
1508 }
1509 close IN;
1510 close WORD;
1511 close DATA;
1512 close SAVE if ($savephrases);
1513
1514}
1515
1516
1517
1518# sort routines used to renumber phrases
1519
1520sub numerically { $a <=> $b }
1521
1522sub by_doc_frequency {
1523 my $fa = 1;
1524 if ($a =~ /,/) {
1525 $fa = $a;
1526 $fa =~ s/\d+,//;
1527 }
1528 my $fb = 1;
1529 if ($b =~ /,/) {
1530 $fb = $b;
1531 $fb =~ s/\d+,//;
1532 }
1533
1534 return ($fb <=> $fa);
1535}
1536
15371;
Note: See TracBrowser for help on using the repository browser.