source: gsdl/trunk/perllib/classify/Phind.pm@ 20454

Last change on this file since 20454 was 20454, checked in by kjdon, 12 years ago

strip ex from metadata type args

  • Property svn:keywords set to Author Date Id Revision
File size: 40.5 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BaseClassifier;
37use util;
38use ghtml;
39use unicode;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and viceversa
43
44my @removedirs = ();
45
46my %wanted_index_files = ('td'=>1,
47 't'=>1,
48 'ti'=>1,
49 'tl'=>1,
50 'tsd'=>1,
51 'idb'=>1,
52 'ib1'=>1,
53 'ib2'=>1,
54 'ib3'=>1,
55 'i'=>1,
56 'il'=>1,
57 'w'=>1,
58 'wa'=>1);
59
60sub BEGIN {
61 @Phind::ISA = ('BaseClassifier');
62}
63
64sub END {
65
66 # Tidy up stray files - we do this here as there's some weird problem
67 # preventing us from doing it in the get_classify_info() function (on
68 # windows at least) where the close() appears to fail on txthandle and
69 # dochandle, thus preventing us from deleting those files
70
71 foreach my $dir (@removedirs) {
72 if (-d $dir && opendir (DIR, $dir)) {
73 my @files = readdir DIR;
74 closedir DIR;
75
76 foreach my $file (@files) {
77 next if $file =~ /^\.\.?$/;
78 my ($suffix) = $file =~ /\.([^\.]+)$/;
79 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
80 # delete it!
81 &util::rm (&util::filename_cat ($dir, $file));
82 }
83 }
84 }
85 }
86}
87
88my $arguments =
89 [ { 'name' => "text",
90 'desc' => "{Phind.text}",
91 'type' => "string",
92 'deft' => "section:Title,section:text",
93 'reqd' => "no" },
94 { 'name' => "title",
95 'desc' => "{Phind.title}",
96 'type' => "metadata",
97 'deft' => "Title",
98 'reqd' => "no" },
99 { 'name' => "buttonname",
100 'desc' => "{BasClas.buttonname}",
101 'type' => "string",
102 'deft' => "Phrase",
103 'reqd' => "no" },
104 { 'name' => "language",
105 'desc' => "{Phind.language}",
106 'type' => "string",
107 'deft' => "en",
108 'reqd' => "no" },
109 { 'name' => "savephrases",
110 'desc' => "{Phind.savephrases}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "suffixmode",
115 'desc' => "{Phind.suffixmode}",
116 'type' => "int",
117 'deft' => "1",
118 'range' => "0,1",
119 'reqd' => "no" },
120 { 'name' => "min_occurs",
121 'desc' => "{Phind.min_occurs}",
122 'type' => "int",
123 'deft' => "2",
124 'range' => "1,",
125 'reqd' => "no" },
126 { 'name' => "thesaurus",
127 'desc' => "{Phind.thesaurus}",
128 'type' => "string",
129 'deft' => "",
130 'reqd' => "no" },
131 { 'name' => "untidy",
132 'desc' => "{Phind.untidy}",
133 'type' => "flag",
134 'reqd' => "no" } ];
135
136my $options = { 'name' => "Phind",
137 'desc' => "{Phind.desc}",
138 'abstract' => "no",
139 'inherits' => "yes",
140 'args' => $arguments };
141
142
143# Phrase delimiter symbols - these should be abstracted out someplace
144
145my $colstart = "COLLECTIONSTART";
146my $colend = "COLLECTIONEND";
147my $doclimit = "DOCUMENTLIMIT";
148my $senlimit = "SENTENCELIMIT";
149my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
150
151
152# Create a new Phind browser based on collect.cfg
153
154sub new {
155 my ($class) = shift (@_);
156 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157 push(@$classifierslist, $class);
158
159 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
160 push(@{$hashArgOptLists->{"OptList"}},$options);
161
162 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
163
164 if ($self->{'info_only'}) {
165 # don't worry about any options etc
166 return bless $self, $class;
167 }
168
169 # Ensure the Phind generate scripts are in place
170 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173 if (!(-e $file1)) {
174 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175 exit(1);
176 }
177
178 # things that may have ex. in them that need to be stripped off
179 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
180 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
181
182 # Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
183
184 $self->{"indexes"} = $self->{"text"};
185
186 # Further setup
187 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
188 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
189 if (! defined $self->{'builddir'}) {
190 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
191 }
192 $self->{'total'} = 0;
193
194 # Clean out the unused keys
195 delete $self->{"text"};
196
197 return bless $self, $class;
198}
199
200
201# Initialise the Phind classifier
202
203sub init {
204 my $self = shift (@_);
205
206 # ensure we have a build directory
207 my $builddir = $self->{'builddir'};
208 die unless (-e "$builddir");
209
210 # create Phind directory
211 my $phnumber = 1;
212 my $phinddir = &util::filename_cat($builddir, "phind1");
213 while (-e "$phinddir") {
214 $phnumber++;
215 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
216 }
217 &util::mk_dir("$phinddir");
218 $self->{'phinddir'} = $phinddir;
219 $self->{'phindnumber'} = $phnumber;
220
221 push(@removedirs, $phinddir) unless $self->{'untidy'};
222
223 # open filehandles for documents and text
224 my $clausefile = &util::filename_cat("$phinddir", "clauses");
225 &util::rm($clausefile) if (-e $clausefile);
226
227 my $txthandle = 'TEXT' . $phnumber;
228 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
229 $self->{'txthandle'} = $txthandle;
230
231 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
232 &util::rm($docfile) if (-e $docfile);
233
234 my $dochandle = 'DOC' . $phnumber;
235 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
236 $self->{'dochandle'} = $dochandle;
237
238}
239
240
241# Classify each document.
242#
243# Each document is passed here in turn. The classifier extracts the
244# text of each and stores it in the clauses file. Document details are
245# stored in the docs.txt file.
246
247sub classify {
248 my $self = shift (@_);
249 my ($doc_obj,$edit_mode) = @_;
250
251 my $verbosity = $self->{'verbosity'};
252 my $top_section = $doc_obj->get_top_section();
253
254 my $titlefield = $self->{'title'};
255
256 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
257 if (!defined($title)) {
258 $title = "";
259 print STDERR "Phind: document has no title\n";
260 }
261 print "process: $title\n" if ($verbosity > 2);
262
263 # Only consider the file if it is in the correct language
264 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
265 my $phrlanguage = $self->{'language'};
266 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
267
268 if ($edit_mode eq "delete") {
269 # This classifier works quite differently to the others
270 # Probably doesn't support incremental building anyway
271 return;
272 }
273
274 # record this file
275 $self->{'total'} ++;
276 # what is $file ???
277 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
278
279 # Store document details
280 my $OID = $doc_obj->get_OID();
281 $OID = "NULL" unless defined $OID;
282 my $dochandle = $self->{'dochandle'};
283 print $dochandle "<Document>\t$OID\t$title\n";
284
285 # Store the text occuring in this object
286
287 # output the document delimiter
288 my $txthandle = $self->{'txthandle'};
289 print $txthandle "$doclimit\n";
290
291 # iterate over the required indexes and store their text
292 my $indexes = $self->{'indexes'};
293 my $text = "";
294 my ($part, $level, $field, $section, $data, $dataref);
295
296 foreach $part (split(/,/, $indexes)) {
297
298 # Each field has a level and a data element ((e.g. document:Title)
299 ($level, $field) = split(/:/, $part);
300 die unless ($level && $field);
301
302 # Extract the text from every section
303 # (In phind, document:text and section:text are equivalent)
304 if ($field eq "text") {
305 $data = "";
306 $section = $doc_obj->get_top_section();
307 while (defined($section)) {
308 $data .= $doc_obj->get_text($section) . "\n";
309 $section = $doc_obj->get_next_section($section);
310 }
311 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
312 }
313
314 # Extract a metadata field from a document
315 # (If there is more than one element of the given type, get them all.)
316 elsif ($level eq "document") {
317 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
318 foreach $data (@$dataref) {
319 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
320 }
321 }
322
323 # Extract metadata from every section in a document
324 elsif ($level eq "section") {
325 $data = "";
326 $section = $doc_obj->get_top_section();
327 while (defined($section)) {
328 $dataref = $doc_obj->get_metadata($section, $field);
329 $data .= join("\n", @$dataref) . "\n";
330 $section = $doc_obj->get_next_section($section);
331 }
332 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
333 }
334
335 # Some sort of specification which I don't understand
336 else {
337 die "Unknown level ($level) in Phind index ($part)\n";
338 }
339
340 }
341
342 # output the text
343 $text =~ tr/\n//s;
344 print $txthandle "$text";
345}
346
347
348# Construct the classifier from the information already gathered
349#
350# When get_classify_info is called, the clauses and docs.txt files have
351# already been constructed in the Phind directory. This function will
352# translate them into compressed, indexed MGPP files that can be read by
353# the phindcgi script. It will also register our classifier so that it
354# shows up in the navigation bar.
355
356sub get_classify_info {
357 my $self = shift (@_);
358 my ($gli) = @_;
359
360 close $self->{'dochandle'};
361 close $self->{'txthandle'};
362 my $verbosity = $self->{'verbosity'};
363 my $out = $self->{'outhandle'};
364 my $phinddir = $self->{'phinddir'};
365
366 my $osextra = "";
367 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
368 $osextra = " -d /";
369 }
370
371 print STDERR "</Stage>\n" if $gli;
372
373 if ($verbosity) {
374 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
375 print $out "*** in ", $self->{'phinddir'}, "\n";
376 }
377
378 print STDERR "<Stage name='Phind'>\n" if $gli;
379
380 # Construct phind indexes
381 my $suffixmode = $self->{'suffixmode'};
382 my $min_occurs = $self->{'min_occurs'};
383 my ($command, $status);
384
385 # Generate the vocabulary, symbol statistics, and numbers file
386 # from the clauses file
387 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
388 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
389 &extract_vocabulary($self);
390
391 # Use the suffix program to generate the phind/phrases file
392 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
393 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
394 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
395
396 # check that we generated some files. It's not necessarily an error if
397 # we didn't (execute() would have quit on error), but we can't go on.
398 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
399 if (! -r $phrasesfile) {
400 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
401 print $out "\nNo phrases found for Phind classifier!\n";
402 return;
403 }
404
405 # Create the phrase file and put phrase numbers in phind/phrases
406 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
407 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
408 &renumber_phrases($self);
409
410 print $out "\nCreating phrase databases\n";
411 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
412 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
413 my $mg_stem = &util::filename_cat($phinddir, "pdata");
414
415 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
416 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
417 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
418
419 # create the mg index of words
420 print $out "\nCreating word-level search indexes\n";
421 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
422 $mg_input = &util::filename_cat($phinddir, "pword.txt");
423 $mg_stem = &util::filename_cat($phinddir, "pword");
424
425 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
426 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
427 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
429 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
430 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
431
432 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
433 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
434 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
435
436 # create the mg document information database
437 print $out "\nCreating document information databases\n";
438 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
439 $mg_input = &util::filename_cat($phinddir, "docs.txt");
440 $mg_stem = &util::filename_cat($phinddir, "docs");
441
442 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
443 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
444 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
445
446 # Return the information about the classifier that we'll later want to
447 # use to create macros when the Phind classifier document is displayed.
448 my %classifyinfo = ('thistype'=>'Invisible',
449 'childtype'=>'Phind',
450 'Title'=>$self->{'buttonname'},
451 'parameters'=>"phindnumber=$self->{'phindnumber'}",
452 'contains'=>[]);
453
454 my $collection = $self->{'collection'};
455 my $url = "library?a=p&p=phind&c=$collection";
456 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
457
458 return \%classifyinfo;
459}
460
461
462
463sub convert_gml_to_tokens {
464
465 my ($language_exp, $text) = @_;
466
467 # escape any magic words... - jrm21
468 foreach my $delim (@delimiters) {
469 my $replacement=lc($delim);
470 my $num= $text=~ s/$delim/$replacement/g;
471 if (!$num) {$num=0;}
472 }
473
474 if ($language_exp =~ /en/) {
475 return &convert_gml_to_tokens_EN($text);
476 }
477
478 if ($language_exp =~ /zh/) {
479 return &convert_gml_to_tokens_ZH($text);
480 }
481
482 $_ = $text;
483
484 # 1. remove GML tags
485
486 # Remove everything that is in a tag
487 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
488 s/\s*<br>\s*/ LINEBREAK /isgo;
489 s/<[^>]*>/ /sgo;
490
491 # Now we have the text, but it may contain HTML
492 # elements coded as &gt; etc. Remove these tags.
493 s/&amp;/&/sgo;
494 s/&lt;/</sgo;
495 s/&gt;/>/sgo;
496 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
497 s/\s*<br>\s*/ LINEBREAK /isgo;
498 s/<[^>]*>/ /sgo;
499
500 # replace<p> and <br> placeholders with clause break symbol (\n)
501 s/\s+/ /gso;
502 s/PARAGRAPHBREAK/\n/sgo;
503 s/LINEBREAK/\n/sgo;
504
505
506
507
508 # 2. Split the remaining text into space-delimited tokens
509
510 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
511 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
512
513 # Split text at word boundaries
514 s/\b/ /go;
515
516 # 3. Convert the remaining text to "clause format"
517
518 # Insert newline if the end of a sentence is detected
519 # (delimter is: "[\.\?\!]\s")
520 # s/\s*[\.\?\!]\s+/\n/go;
521
522 # remove unnecessary punctuation and replace with clause break symbol (\n)
523 s/[^\w ]/\n/go;
524
525 # remove extraneous whitespace
526 s/ +/ /sgo;
527 s/^\s+//mgo;
528 s/\s*$/\n/mgo;
529
530 # remove lines that contain one word or less
531 s/^\S*$//mgo;
532 s/^\s*$//mgo;
533 tr/\n//s;
534
535 return $_;
536}
537
538# a chinese version
539sub convert_gml_to_tokens_ZH {
540
541 $_ = shift @_;
542
543 # Replace all whitespace with a simple space
544 s/\s+/ /gs;
545 # Remove everything that is in a tag
546 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
547 s/\s*<br>\s*/ LINEBREAK /isg;
548 s/<[^>]*>/ /sg;
549
550 # Now we have the text, but it may contain HTML
551 # elements coded as &gt; etc. Remove these tags.
552 s/&lt;/</sg;
553 s/&gt;/>/sg;
554
555 s/\s+/ /sg;
556 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
557 s/\s*<br>\s*/ LINEBREAK /isg;
558 s/<[^>]*>/ /sg;
559
560 # remove &amp; and other miscellaneous markup tags
561 s/&amp;/&/sg;
562 s/&lt;/</sg;
563 s/&gt;/>/sg;
564 s/&amp;/&/sg;
565
566 # replace<p> and <br> placeholders with carriage returns
567 s/PARAGRAPHBREAK/\n/sg;
568 s/LINEBREAK/\n/sg;
569
570
571# print STDERR "text:$_\n";
572 return $_;
573}
574
575# A version of convert_gml_to_tokens that is fine-tuned to the English language.
576
577sub convert_gml_to_tokens_EN {
578 $_ = shift @_;
579
580 # FIRST, remove GML tags
581
582 # Replace all whitespace with a simple space
583 s/\s+/ /gs;
584
585 # Remove everything that is in a tag
586 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
587 s/\s*<br>\s*/ LINEBREAK /isg;
588 s/<[^>]*>/ /sg;
589
590 # Now we have the text, but it may contain HTML
591 # elements coded as &gt; etc. Remove these tags.
592 s/&lt;/</sg;
593 s/&gt;/>/sg;
594
595 s/\s+/ /sg;
596 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
597 s/\s*<br>\s*/ LINEBREAK /isg;
598 s/<[^>]*>/ /sg;
599
600 # remove &amp; and other miscellaneous markup tags
601 s/&amp;/&/sg;
602 s/&lt;/</sg;
603 s/&gt;/>/sg;
604 s/&amp;/&/sg;
605
606 # replace<p> and <br> placeholders with carriage returns
607 s/PARAGRAPHBREAK/\n/sg;
608 s/LINEBREAK/\n/sg;
609
610
611 # Exceptional punctuation
612 #
613 # We make special cases of some punctuation
614
615 # remove any apostrophe that indicates omitted letters
616 s/(\w+)\'(\w*\s)/ $1$2 /g;
617
618 # remove period that appears in a person's initals
619 s/\s([A-Z])\./ $1 /g;
620
621 # replace hyphens in hypheanted words and names with a space
622 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
623
624 # Convert the remaining text to "clause format",
625 # This means removing all excess punctuation and garbage text,
626 # normalising valid punctuation to fullstops and commas,
627 # then putting one cluse on each line.
628
629 # Insert newline when the end of a sentence is detected
630 # (delimter is: "[\.\?\!]\s")
631 s/\s*[\.\?\!]\s+/\n/g;
632
633 # split numbers after four digits
634 s/(\d\d\d\d)/$1 /g;
635
636 # split words after 32 characters
637
638 # squash repeated punctuation
639 tr/A-Za-z0-9 //cs;
640
641 # save email addresses
642 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
643
644 # normalise clause breaks (mostly punctuation symbols) to commas
645 s/[^A-Za-z0-9 \n]+/ , /g;
646
647 # Remove repeated commas, and replace with newline
648 s/\s*,[, ]+/\n/g;
649
650 # remove extra whitespace
651 s/ +/ /sg;
652 s/^\s+//mg;
653 s/\s*$/\n/mg;
654
655 # remove lines that contain one word or less
656 s/^\w*$//mg;
657 s/^\s*$//mg;
658 tr/\n//s;
659
660 return $_;
661
662}
663
664
665
666# Execute a system command
667
668sub execute {
669 my ($command, $verbosity, $outhandle) = @_;
670 print $outhandle "Executing: $command\n" if ($verbosity > 2);
671 $! = 0;
672 my $status = system($command);
673 if ($status != 0) {
674 print STDERR "Phind - Error executing '$command': $!\n";
675 exit($status); # this causes the build to fail...
676 }
677}
678
679
680# Generate the vocabulary, symbol statistics, and numbers file from the
681# clauses file. This is legacy code, so is a bit messy and probably wont
682# run under windows.
683
684sub extract_vocabulary {
685 my ($self) = @_;
686
687 my $verbosity = $self->{'verbosity'};
688 my $out = $self->{'outhandle'};
689
690 my $collectiondir = $self->{'collectiondir'};
691 my $phinddir = $self->{'phinddir'};
692
693 my $language_exp = $self->{'language'};
694
695 my ($w, $l, $line, $word);
696
697 my ($first_delimiter, $last_delimiter,
698 $first_stopword, $last_stopword,
699 $first_extractword, $last_extractword,
700 $first_contentword, $last_contentword,
701 $phrasedelimiter);
702
703 my $thesaurus = $self->{'thesaurus'};
704 my ($thesaurus_links, $thesaurus_terms,
705 %thesaurus, $first_thesaurusword, $last_thesaurusword);
706
707 my %symbol;
708 my (%freq);
709
710 print $out "Calculating vocabulary\n" if ($verbosity > 1);
711
712 # Read and store the stopwords
713 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
714 my $stopword_files = ();
715 my ($language, $language_dir, $file, $file_name);
716 my %stopwords;
717
718 # Examine each directory in the stopword directory
719 opendir(STOPDIR, $stopdir);
720 foreach $language (readdir STOPDIR) {
721
722 # Ignore entries that do not match the classifier's language
723 next unless ($language =~ /$language_exp/);
724 $language_dir = &util::filename_cat($stopdir, $language);
725 next unless (-d "$language_dir");
726
727 opendir(LANGDIR, $language_dir);
728 foreach $file (readdir LANGDIR) {
729
730 # Ignore entries that are not stopword files
731 next unless ($file =~ /sw$/);
732 $file_name = &util::filename_cat($language_dir, $file);
733 next unless (-f "$file_name");
734
735 # Read the stopwords
736 open(STOPFILE, "<$file_name");
737 while (<STOPFILE>) {
738 s/^\s+//;
739 s/\s.*//;
740 $word = $_;
741 $l = lc($word);
742 $stopwords{$l} = $word;
743 }
744 close STOPFILE;
745
746 }
747 closedir LANGDIR;
748 }
749 closedir STOPDIR;
750
751 # Read thesaurus information
752 if ($thesaurus) {
753
754 # link file exists
755 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
756 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
757
758 # ensure term file exists in the correct language
759 if ($language_exp =~ /^([a-z][a-z])/) {
760 $language = $1;
761 } else {
762 $language = 'en';
763 }
764 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
765 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
766
767
768 # Read the thesaurus terms
769 open(TH, "<$thesaurus_terms");
770 while(<TH>) {
771 s/^\d+ //;
772 s/\(.*\)//;
773 foreach $w (split(/\s+/, $_)) {
774 $thesaurus{lc($w)} = $w;
775 }
776 }
777 close TH;
778 }
779
780 # Read words in the text and count occurences
781 open(TXT, "<$phinddir/clauses");
782
783 my @words;
784 while(<TXT>) {
785 $line = $_;
786 next unless ($line =~ /./);
787
788 @words = split(/\s+/, $line);
789 foreach $w (@words) {
790 $l = lc($w);
791 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
792 $freq{$w}++;
793 }
794 $freq{$senlimit}++;
795 }
796
797 close TXT;
798
799 # Calculate the "best" form of each word
800 my (%bestform, %totalfreq, %bestfreq);
801
802 foreach $w (sort (keys %freq)) {
803 $l = lc($w);
804
805 # totalfreq is the number of times a term appears in any form
806 $totalfreq{$l} += $freq{$w};
807
808 if (defined $stopwords{$l}) {
809 $bestform{$l} = $stopwords{$l};
810
811 } elsif (defined $thesaurus{$l}) {
812 $bestform{$l} = $thesaurus{$l};
813
814 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
815 $bestfreq{$l} = $freq{$w};
816 $bestform{$l} = $w;
817 }
818 }
819 undef %freq;
820 undef %bestfreq;
821
822
823 # Assign symbol numbers to tokens
824 my $nextsymbol = 1;
825 my (@vocab);
826
827 # Delimiters
828 $first_delimiter = 1;
829
830 foreach $word (@delimiters) {
831
832# $word = lc($word); # jrm21
833 $word = uc($word);
834 $bestform{$word} = $word;
835 $vocab[$nextsymbol] = $word;
836 $symbol{$word} = $nextsymbol;
837 $nextsymbol++;
838 }
839 $last_delimiter = $nextsymbol - 1;
840 # Stopwords
841 $first_stopword = $nextsymbol;
842
843 foreach my $word (sort keys %stopwords) {
844 # don't include stopword unless it occurs in the text
845 $word = lc($word);
846 next unless ($totalfreq{$word});
847 next if ($symbol{$word});
848
849 $vocab[$nextsymbol] = $word;
850 $symbol{$word} = $nextsymbol;
851 $nextsymbol++;
852 }
853 $last_stopword = $nextsymbol - 1;
854 $first_contentword = $nextsymbol;
855
856 # Thesaurus terms
857 if ($thesaurus) {
858 $first_thesaurusword = $nextsymbol;
859
860 foreach my $word (sort keys %thesaurus) {
861
862 $word = lc($word);
863 next if ($symbol{$word});
864 $bestform{$word} = $thesaurus{$word};
865
866 $vocab[$nextsymbol] = $word;
867 $symbol{$word} = $nextsymbol;
868 $nextsymbol++;
869
870 }
871 $last_thesaurusword = $nextsymbol - 1;
872 }
873
874 # Other content words
875 $first_extractword = $nextsymbol;
876
877 foreach my $word (sort (keys %bestform)) {
878
879 next if ($symbol{$word});
880
881 $vocab[$nextsymbol] = $word;
882 $symbol{$word} = $nextsymbol;
883 $nextsymbol++;
884 }
885 $last_extractword = $nextsymbol - 1;
886 $last_contentword = $nextsymbol - 1;
887
888 # Outut the words
889 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
890 open(VOC, ">$phinddir/clauses.vocab");
891
892 for (my $i = 1; $i < $nextsymbol; $i++) {
893 $w = $vocab[$i];
894
895 print VOC "$bestform{$w}\n";
896 $totalfreq{$w} = 0 unless ($totalfreq{$w});
897 }
898 close VOC;
899
900
901 # Create statistics file
902 # Output statistics about the vocablary
903 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
904 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
905
906 open(STAT, ">$phinddir/clauses.stats")
907 || die "Cannot open $phinddir/clauses.stats: $!";
908
909 print STAT "first_delimiter $first_delimiter\n";
910 print STAT "last_delimiter $last_delimiter\n";
911 print STAT "first_stopword $first_stopword\n";
912 print STAT "last_stopword $last_stopword\n";
913 if ($thesaurus) {
914 print STAT "first_thesaurusword $first_thesaurusword\n";
915 print STAT "last_thesaurusword $last_thesaurusword\n";
916 }
917 print STAT "first_extractword $first_extractword\n";
918 print STAT "last_extractword $last_extractword\n";
919 print STAT "first_contentword $first_contentword\n";
920 print STAT "last_contentword $last_contentword\n";
921 print STAT "first_symbol $first_delimiter\n";
922 print STAT "last_symbol $last_contentword\n";
923 print STAT "first_word $first_stopword\n";
924 print STAT "last_word $last_contentword\n";
925 close STAT;
926
927 undef @vocab;
928
929
930 # Create numbers file
931 # Save text as symbol numbers
932 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
933
934 open(TXT, "<$phinddir/clauses");
935 open(NUM, ">$phinddir/clauses.numbers");
936
937## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
938## print NUM "$symbol{lc($colstart)}\n"; # jrm21
939 $phrasedelimiter = $symbol{$senlimit};
940 print NUM "$symbol{$colstart}\n";
941
942 # set up the special symbols that delimit documents and sentences
943 while(<TXT>) {
944
945 # split sentence into a list of tokens
946 $line = $_;
947 next unless ($line =~ /./);
948 @words = split(/\s+/, $line);
949
950 # output one token at a time
951 foreach $word (@words) {
952# don't lower-case special delimiters - jrm21
953 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
954 $word = lc($word);
955 }
956 print NUM "$symbol{$word}\n";
957 }
958
959 # output phrase delimiter
960 print NUM "$phrasedelimiter\n";
961 }
962
963 close TXT;
964# print NUM "$symbol{lc($colend)}\n";# jrm21
965 print NUM "$symbol{$colend}\n";
966 close NUM;
967
968 # Save thesaurus data in one convienient file
969 if ($thesaurus) {
970
971 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
972
973
974 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
975 if ($verbosity > 1);
976
977 # Read the thesaurus terms
978 my ($num, $text, %thes_symbols);
979
980 open(TH, "<$thesaurus_terms");
981 while(<TH>) {
982 chomp;
983 @words = split(/\s+/, $_);
984 $num = shift @words;
985 $text = "";
986
987 # translate words into symbol numbers
988 foreach $word (@words) {
989 $word = lc($word);
990 if ($symbol{$word}) {
991 $text .= "s$symbol{$word} ";
992 } elsif ($verbosity) {
993 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
994 }
995 }
996 $text =~ s/ $//;
997 $thes_symbols{$num} = $text;
998 }
999 close TH;
1000
1001 # Read the thesaurus links and write the corresponding data
1002 open(TH, "<$thesaurus_links");
1003 open(THOUT, ">$thesaurusfile");
1004
1005 while(<TH>) {
1006 chomp;
1007 ($num, $text) = split(/:/, $_);
1008
1009 if (defined($thes_symbols{$num})) {
1010 print THOUT "$num:$thes_symbols{$num}:$text\n";
1011 } else {
1012 print THOUT "$num:untranslated:$text\n";
1013 }
1014 }
1015 close TH;
1016 close THOUT;
1017 }
1018
1019
1020
1021
1022}
1023
1024
1025# renumber_phrases
1026#
1027# Prepare the phrases file to be input to mgpp. The biggest problem is
1028# reconciling the phrase identifiers used by the suffix program (which
1029# we'll call suffix-id numbers) with the numbers used in the thesaurus
1030# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1031# Phind-id numbers must be sorted by frequency of occurance.
1032#
1033# Start creating a set of phind-id numbers from the sorted suffix-id
1034# numbers and (if required) the thesaurus-id numbers. Then add any other
1035# phrases occuring in the thesaurus.
1036#
1037# The last thing we have to do is restore the vocabulary information to the
1038# phrase file so that the phrases are stored as words, not as symbol
1039# numbers.
1040
1041# The original phrases file looks something like this:
1042# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1043# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1044# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1045# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1046
1047
1048sub renumber_phrases {
1049 my ($self) = @_;
1050
1051 renumber_suffix_data($self);
1052 renumber_thesaurus_data($self);
1053 restore_vocabulary_data($self);
1054
1055}
1056
1057
1058
1059# renumber_suffix_data
1060#
1061# Translate phrases file to phrases.2 using phind keys instead
1062# of suffix keys and sorting the expansion data.
1063
1064sub renumber_suffix_data {
1065 my ($self) = @_;
1066
1067 my $verbosity = $self->{'verbosity'};
1068 my $out = $self->{'outhandle'};
1069 print $out "Translate phrases: suffix-ids become phind-id's\n"
1070 if ($verbosity);
1071
1072 my $phinddir = $self->{'phinddir'};
1073 my $infile = &util::filename_cat($phinddir, 'phrases');
1074 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1075
1076 # Read the phrase file. Calculate initial set of phind-id
1077 # numbers and store (suffixid -> frequency) relation.
1078
1079 my %suffixtophind;
1080 my @totalfrequency;
1081 my (@fields, $suffixid);
1082 my $nextphind = 1;
1083
1084 open(IN, "<$infile");
1085 while(<IN>) {
1086
1087 chomp;
1088 @fields = split(/:/, $_);
1089
1090 # get next suffixid and phindid
1091 $suffixid = shift @fields;
1092 $suffixtophind{$suffixid} = $nextphind;
1093
1094 # store total frequency
1095 shift @fields;
1096 $totalfrequency[$nextphind] = shift @fields;
1097
1098 $nextphind++;
1099 }
1100 close IN;
1101
1102
1103 # Translate phrases file to phrases.2. Use phind keys (not suffix
1104 # keys), sort expansion and document occurance data in order of
1105 # descending frequency..
1106 open(IN, "<$infile");
1107 open(OUT, ">$outfile");
1108
1109 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1110 my (@documwents, @newexp, $k, $n);
1111 my $linenumber = 0;
1112
1113 while(<IN>) {
1114
1115 # read the line
1116 chomp;
1117 @fields = split(/:/, $_);
1118
1119 # get a phrase number for this line
1120 $suffixid = shift @fields;
1121 die unless (defined($suffixtophind{$suffixid}));
1122 $phindid = $suffixtophind{$suffixid};
1123
1124 # get the symbols in the phrase
1125 $text = shift @fields;
1126
1127 # output status information
1128 $linenumber++;
1129 if ($verbosity > 2) {
1130 if ($linenumber % 1000 == 0) {
1131 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1132 }
1133 # what are $num and $key??
1134 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1135 }
1136
1137 # get the phrase frequency
1138 $tf = shift @fields;
1139
1140 # get the number of expansions
1141 $countexp = shift @fields;
1142
1143 # get the expansions, convert them into phind-id numbers, and sort them
1144 $expansions = shift @fields;
1145 @newexp = ();
1146 foreach $k (split(/,/, $expansions)) {
1147 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1148 $n = $suffixtophind{$k};
1149 push @newexp, $n;
1150 }
1151 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1152
1153 # get the number of documents
1154 $countdocs = shift @fields;
1155
1156 # get the documents and sort them
1157 $documents = shift @fields;
1158 $documents =~ s/d//g;
1159 my @documents = split(/;/, $documents);
1160 @documents = sort by_doc_frequency @documents;
1161
1162 # output the phrase data
1163 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1164 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1165
1166 }
1167
1168 close IN;
1169 close OUT;
1170}
1171
1172
1173# renumber_thesaurus_data
1174#
1175# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1176
1177sub renumber_thesaurus_data {
1178 my ($self) = @_;
1179
1180 my $out = $self->{'outhandle'};
1181 my $verbosity = $self->{'verbosity'};
1182 my $thesaurus = $self->{'thesaurus'};
1183
1184 my $phinddir = $self->{'phinddir'};
1185 my $infile = &util::filename_cat($phinddir, "phrases.2");
1186 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1187
1188
1189 # If no thesaurus is defined, simply move the phrases file.
1190 if (!$thesaurus) {
1191 print $out "Translate phrases.2: no thesaurus data\n"
1192 if ($verbosity);
1193 &util::mv($infile, $outfile);
1194 return;
1195 }
1196
1197 print $out "Translate phrases.2: add thesaurus data\n"
1198 if ($verbosity);
1199
1200 # 1.
1201 # Read thesaurus file and store (symbols->thesaurusid) mapping
1202 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1203 my %symbolstothesid;
1204 my (@fields, $thesid, $symbols);
1205
1206 open(TH, "<$thesaurusfile");
1207
1208 while (<TH>) {
1209
1210 chomp;
1211 @fields = split(/:/, $_);
1212
1213 # get id and text
1214 $thesid = shift @fields;
1215 $symbols = shift @fields;
1216 $symbolstothesid{$symbols} = $thesid;
1217 }
1218 close TH;
1219
1220 # 2.
1221 # Read phrases file to find thesaurus entries that already
1222 # have a phindid. Store their phind-ids for later translation,
1223 # and store their frequency for later sorting.
1224 my %thesaurustophindid;
1225 my %phindidtofrequency;
1226 my ($phindid, $freq);
1227
1228 open(IN, "<$infile");
1229
1230 while(<IN>) {
1231
1232 chomp;
1233 @fields = split(/:/, $_);
1234
1235 # phindid and symbols for this line
1236 $phindid = shift @fields;
1237 $symbols = shift @fields;
1238 $freq = shift @fields;
1239
1240 # do we have a thesaurus id corresponding to this phrase?
1241 if (defined($symbolstothesid{$symbols})) {
1242 $thesid = $symbolstothesid{$symbols};
1243 $thesaurustophindid{$thesid} = $phindid;
1244 $phindidtofrequency{$phindid} = $freq;
1245 }
1246 }
1247 close IN;
1248
1249 undef %symbolstothesid;
1250
1251 # 3.
1252 # Create phind-id numbers for remaining thesaurus entries,
1253 # and note that their frequency is 0 for later sorting.
1254 my $nextphindid = $phindid + 1;
1255
1256 open(TH, "<$thesaurusfile");
1257 while(<TH>) {
1258
1259 chomp;
1260 @fields = split(/:/, $_);
1261
1262 # read thesaurus-id and ensure it has a corresponding phind-id
1263 $thesid = shift @fields;
1264 if (!defined($thesaurustophindid{$thesid})) {
1265 $thesaurustophindid{$thesid} = $nextphindid;
1266 $phindidtofrequency{$nextphindid} = 0;
1267 $nextphindid++;
1268 }
1269 }
1270 close TH;
1271
1272 # 4.
1273 # Translate thesaurus file, replacing thesaurus-id numbers with
1274 # phind-id numbers.
1275 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1276 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1277 my (@links, $linkid, %linkidtotype, $newrelation);
1278
1279 open(TH, "<$thesaurusfile");
1280 open(TO, ">$newthesaurusfile");
1281 while(<TH>) {
1282
1283 chomp;
1284 @fields = split(/:/, $_);
1285
1286 # phindid and symbols for this line
1287 ($thesid, $symbols, $relations) = @fields;
1288
1289 die unless ($thesid && $symbols);
1290 die unless $thesaurustophindid{$thesid};
1291 $phindid = $thesaurustophindid{$thesid};
1292
1293 # convert each part of the relation string to use phind-id numbers
1294 # at the same time, we want to sort the list by frequency.
1295 undef %linkidtotype;
1296
1297 foreach $linktext (split(/;/, $relations)) {
1298 @linkdata = split(/,/, $linktext);
1299
1300 # remember the linktype (e.g. BT, NT)
1301 $linktype = shift @linkdata;
1302
1303 # store the type of each link
1304 foreach $thesid (@linkdata) {
1305 die unless (defined($thesaurustophindid{$thesid}));
1306 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1307 }
1308 }
1309
1310 # sort the list of links, first by frequency, then by type.
1311 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1312 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1313 $linkcounter = (scalar @links);
1314
1315 # create a string describing the link information
1316 $linktype = $linkidtotype{$links[0]};
1317 $newrelation = $linktype;
1318 foreach $linkid (@links) {
1319 if ($linkidtotype{$linkid} ne $linktype) {
1320 $linktype = $linkidtotype{$linkid};
1321 $newrelation .= ";" . $linktype;
1322 }
1323 $newrelation .= "," . $linkid;
1324 }
1325 $newrelation .= ";";
1326
1327
1328 # output the new line
1329 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1330 }
1331 close TH;
1332 close TO;
1333
1334 undef %thesaurustophindid;
1335 undef %linkidtotype;
1336 undef %phindidtofrequency;
1337
1338 # 5.
1339 # Read thesaurus data (in phind-id format) into memory
1340 my %thesaurusdata;
1341
1342 open(TH, "<$newthesaurusfile");
1343 while(<TH>) {
1344 chomp;
1345 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1346 die unless ($phindid && $symbols);
1347 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1348 }
1349 close TH;
1350
1351 # 6.
1352 # Add thesaurus data to phrases file
1353 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1354 my (@documwents, @newexp, $k, $n);
1355 my $linenumber = 0;
1356
1357 open(IN, "<$infile");
1358 open(OUT, ">$outfile");
1359
1360 # Update existing phrases
1361 while(<IN>) {
1362
1363 chomp;
1364 @fields = split(/:/, $_);
1365
1366 # get data for this line
1367 $phindid = shift @fields;
1368
1369 # output the phrase data, with thesaurus information
1370 print OUT "$phindid:", join(":", @fields);
1371
1372 # add thesaurus data
1373 if (defined($thesaurusdata{$phindid})) {
1374 @fields = split(/:/, $thesaurusdata{$phindid});
1375 shift @fields;
1376 $linkcounter = shift @fields;
1377 $relations = shift @fields;
1378
1379 print OUT ":$linkcounter:$relations";
1380 $thesaurusdata{$phindid} = "";
1381 }
1382 print OUT "\n";
1383 }
1384 close IN;
1385
1386 # Add phrases that aren't already in the file
1387 foreach $phindid (sort numerically keys %thesaurusdata) {
1388 next unless ($thesaurusdata{$phindid});
1389
1390 @fields = split(/:/, $thesaurusdata{$phindid});
1391 $symbols = shift @fields;
1392 $linkcounter = shift @fields;
1393 $relations = shift @fields;
1394
1395 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1396 }
1397 close OUT;
1398
1399}
1400
1401# restore_vocabulary_data
1402#
1403# Read phrases.3 and restore vocabulary information. Then write
1404# this data to the MGPP input files (pword.txt and pdata.txt) and
1405# (if requested) to the saved phrases file.
1406
1407sub restore_vocabulary_data {
1408 my ($self) = @_;
1409
1410 my $out = $self->{'outhandle'};
1411 my $verbosity = $self->{'verbosity'};
1412 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1413
1414 my $phinddir = $self->{'phinddir'};
1415 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1416 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1417 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1418 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1419
1420 my $savephrases = $self->{'savephrases'};
1421
1422 # 1.
1423 # Read the vocabulary file
1424 open(V, "<$vocabfile")
1425 || die "Cannot open $vocabfile: $!";
1426 my @symbol;
1427 my $i = 1;
1428 while(<V>) {
1429 chomp;
1430 $symbol[$i++] = $_;
1431 }
1432 close V;
1433
1434 # 2.
1435 # Translate phrases.3 to MGPP input files
1436 my ($key, $text, $word, $isThesaurus, $line);
1437 my @fields;
1438 my $linenumber = 0;
1439
1440 open(IN, "<$infile");
1441 open(DATA, ">$datafile");
1442 open(WORD, ">$wordfile");
1443
1444 # Save the phrases in a separate text file
1445 if ($savephrases) {
1446 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1447 open(SAVE, ">$savephrases");
1448 }
1449
1450 while(<IN>) {
1451
1452 # read the line
1453 chomp;
1454 $line = $_;
1455 @fields = split(/:/, $line);
1456
1457 # get a phrase number for this line
1458 $key = shift @fields;
1459
1460 # restore the text of the phrase
1461 $text = shift @fields;
1462 $text =~ s/s(\d+)/$symbol[$1]/g;
1463 if ($text =~ / /) {
1464 $word = "";
1465 } elsif ($text ne 'untranslated') {
1466 $word = $text;
1467 }
1468
1469 # output the phrase data
1470 print DATA "<Document>";
1471 print DATA "$key:$text:", join(":", @fields), ":\n";
1472
1473 # output the word index search data
1474 print WORD "<Document>$word\n";
1475
1476 # output the phrases to a text file
1477 if ($savephrases) {
1478 if ((scalar @fields) == 7) {
1479 $isThesaurus = 1;
1480 } else {
1481 $isThesaurus = 0;
1482 }
1483 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1484 }
1485 }
1486 close IN;
1487 close WORD;
1488 close DATA;
1489 close SAVE if ($savephrases);
1490
1491}
1492
1493
1494
1495# sort routines used to renumber phrases
1496
1497sub numerically { $a <=> $b }
1498
1499sub by_doc_frequency {
1500 my $fa = 1;
1501 if ($a =~ /,/) {
1502 $fa = $a;
1503 $fa =~ s/\d+,//;
1504 }
1505 my $fb = 1;
1506 if ($b =~ /,/) {
1507 $fb = $b;
1508 $fb =~ s/\d+,//;
1509 }
1510
1511 return ($fb <=> $fa);
1512}
1513
15141;
Note: See TracBrowser for help on using the repository browser.