source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 22731

Last change on this file since 22731 was 22382, checked in by kjdon, 14 years ago

if language is ar for phind, then we set textorientation parameter to rtl.

  • Property svn:keywords set to Author Date Id Revision
File size: 41.1 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
# Directories created during building (one phindN dir per classifier
# instance); they are tidied up in the END block unless -untidy was given.
my @removedirs = ();

# File extensions of the MGPP/phind index files that must be KEPT in each
# phind directory; at exit, any other file found there is treated as a
# stray working file and deleted.
my %wanted_index_files = ('td'=>1,
			  't'=>1,
			  'ti'=>1,
			  'tl'=>1,
			  'tsd'=>1,
			  'idb'=>1,
			  'ib1'=>1,
			  'ib2'=>1,
			  'ib3'=>1,
			  'i'=>1,
			  'il'=>1,
			  'w'=>1,
			  'wa'=>1);
57
# Establish inheritance before the rest of the file is compiled:
# Phind extends BaseClassifier.
sub BEGIN {
    @Phind::ISA = ('BaseClassifier');
}
61
sub END {

    # Tidy up stray files.  We do this here as there's some weird problem
    # preventing us from doing it in the get_classify_info() function (on
    # windows at least) where the close() appears to fail on txthandle and
    # dochandle, thus preventing us from deleting those files.

    foreach my $dir (@removedirs) {
	# Skip directories that have already vanished or can't be read;
	# use a lexical dirhandle so we can't clobber a global bareword
	# handle belonging to other code running at interpreter shutdown.
	next unless (-d $dir);
	opendir(my $dirhandle, $dir) || next;
	my @files = readdir $dirhandle;
	closedir $dirhandle;

	foreach my $file (@files) {
	    # ignore the "." and ".." directory entries
	    next if $file =~ /^\.\.?$/;
	    # keep only files whose extension marks them as index files
	    my ($suffix) = $file =~ /\.([^\.]+)$/;
	    if (!defined $suffix || !exists $wanted_index_files{$suffix}) {
		# not a wanted index file - delete it!
		&util::rm (&util::filename_cat ($dir, $file));
	    }
	}
    }
}
85
# Configuration arguments accepted by this classifier (parsed from
# collect.cfg and reported by "classinfo.pl Phind").
my $arguments =
    [ { 'name' => "text",
	'desc' => "{Phind.text}",
	'type' => "string",
	'deft' => "section:Title,section:text",
	'reqd' => "no" },
      { 'name' => "title",
	'desc' => "{Phind.title}",
	'type' => "metadata",
	'deft' => "Title",
	'reqd' => "no" },
      { 'name' => "buttonname",
	'desc' => "{BasClas.buttonname}",
	'type' => "string",
	'deft' => "Phrase",
	'reqd' => "no" },
      { 'name' => "language",
	'desc' => "{Phind.language}",
	'type' => "string",
	'deft' => "en",
	'reqd' => "no" },
      { 'name' => "savephrases",
	'desc' => "{Phind.savephrases}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no" },
      { 'name' => "suffixmode",
	'desc' => "{Phind.suffixmode}",
	'type' => "int",
	'deft' => "1",
	'range' => "0,1",
	'reqd' => "no" },
      { 'name' => "min_occurs",
	'desc' => "{Phind.min_occurs}",
	'type' => "int",
	'deft' => "2",
	'range' => "1,",
	'reqd' => "no" },
      { 'name' => "thesaurus",
	'desc' => "{Phind.thesaurus}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no" },
      { 'name' => "untidy",
	'desc' => "{Phind.untidy}",
	'type' => "flag",
	'reqd' => "no" } ];

# Classifier description used by the plugin-information machinery.
my $options = { 'name' => "Phind",
		'desc' => "{Phind.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };


# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark collection/document/sentence boundaries in the clauses file
# consumed by the suffix program.

my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
149
# Create a new Phind browser based on collect.cfg.
#
# Arguments follow the standard Greenstone classifier constructor
# protocol: a list of classifier names, the raw input arguments, and the
# hash of argument/option descriptions to be extended.  Returns a blessed
# Phind instance; exits the process if the required "suffix" helper
# program is not installed.

sub new {
    my ($class) = shift (@_);
    my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    # register our argument and option descriptions with the base class
    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # direct method call rather than the deprecated indirect-object syntax
    my $self = BaseClassifier->new($classifierslist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    # Ensure the Phind "suffix" phrase-extraction program is in place
    # (/i for consistency with the GSDLOS test in get_classify_info)
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    if (!(-e $file1)) {
	print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
	exit(1);
    }

    # things that may have ex. in them that need to be stripped off
    $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
    $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});

    # Transfer value from Auto Parsing to the variable name that was used
    # in previous versions of Greenstone.
    $self->{"indexes"} = $self->{"text"};

    # Further setup
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
    if (! defined $self->{'builddir'}) {
	$self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'total'} = 0; # number of documents classified so far

    # we set phind to be rtl if language is arabic
    # (guard against an undef language to avoid a warning under -w)
    if (defined $self->{'language'} && $self->{'language'} eq "ar") {
	$self->{'textorientation'} = "rtl";
    }
    # Clean out the unused keys
    delete $self->{"text"};

    return bless $self, $class;
}
201
202
# Initialise the Phind classifier.
#
# Creates a fresh phindN working directory under the build directory and
# opens the two output files this classifier writes during classify():
# "clauses" (the extracted text) and "docs.txt" (document details).
# Dies if the build directory does not exist or a file cannot be opened.

sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "Phind.pm: build directory '$builddir' does not exist\n" unless (-e "$builddir");

    # create Phind directory - pick the first unused phindN name so
    # several Phind classifiers can coexist in one collection
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e "$phinddir") {
	$phnumber++;
	$phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # schedule the directory for cleanup at exit unless -untidy was given
    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text.  The handle names are
    # strings (allowed by "no strict 'refs'" above) so each phindN
    # instance gets its own handle; three-argument open is used so the
    # file name can never be misread as a mode.
    my $clausefile = &util::filename_cat("$phinddir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, ">", $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, ">", $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
241
242
# Classify each document.
#
# Each document is passed here in turn. The classifier extracts the
# text of each and stores it in the clauses file. Document details are
# stored in the docs.txt file.  Documents whose Language metadata does
# not match the classifier's language are skipped.

sub classify {
    my $self = shift (@_);
    my ($doc_obj,$edit_mode) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'title'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    if (!defined($title)) {
	$title = "";
	print STDERR "Phind: document has no title\n";
    }
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    if ($edit_mode eq "delete") {
	# This classifier works quite differently to the others
	# Probably doesn't support incremental building anyway
	return;
    }

    # record this file
    $self->{'total'} ++;
    # what is $file ???
    # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});

    # Store document details (tab-separated record read back later by
    # get_classify_info / the mgpp docs database build)
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter so the suffix program can tell
    # documents apart in the clauses file
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data, $dataref);

    foreach $part (split(/,/, $indexes)) {

	# Each field has a level and a data element (e.g. document:Title)
	($level, $field) = split(/:/, $part);
	die unless ($level && $field);

	# Extract the text from every section
	# (In phind, document:text and section:text are equivalent)
	if ($field eq "text") {
	    $data = "";
	    $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		$data .= $doc_obj->get_text($section) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}

	# Extract a metadata field from a document
	# (If there is more than one element of the given type, get them all.)
	elsif ($level eq "document") {
	    $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
	    foreach $data (@$dataref) {
		$text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	    }
	}

	# Extract metadata from every section in a document
	elsif ($level eq "section") {
	    $data = "";
	    $section = $doc_obj->get_top_section();
	    while (defined($section)) {
		$dataref = $doc_obj->get_metadata($section, $field);
		$data .= join("\n", @$dataref) . "\n";
		$section = $doc_obj->get_next_section($section);
	    }
	    $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
	}

	# Some sort of specification which I don't understand
	else {
	    die "Unknown level ($level) in Phind index ($part)\n";
	}

    }

    # output the text, squashing runs of blank clause breaks to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
348
349
# Construct the classifier from the information already gathered
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the Phind directory. This function will
# translate them into compressed, indexed MGPP files that can be read by
# the phindcgi script. It will also register our classifier so that it
# shows up in the navigation bar.
#
# The <Stage>/<Phase> lines printed to STDERR are progress markers
# consumed by the GLI when $gli is true.

sub get_classify_info {
    my $self = shift (@_);
    my ($gli) = @_;

    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # extra flag passed to the mgpp tools on non-Windows platforms
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	$osextra = " -d /";
    }

    print STDERR "</Stage>\n" if $gli;

    if ($verbosity) {
	print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
	print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    print STDERR "<Stage name='Phind'>\n" if $gli;

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $min_occurs = $self->{'min_occurs'};
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);

    # check that we generated some files. It's not necessarily an error if
    # we didn't (execute() would have quit on error), but we can't go on.
    my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
    if (! -r $phrasesfile) {
	print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
	print $out "\nNo phrases found for Phind classifier!\n";
	return;
    }

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
    &renumber_phrases($self);

    # Build the mgpp phrase-data database (pass 1, dictionary, pass 2)
    print $out "\nCreating phrase databases\n";
    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = &util::filename_cat($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # create the mg index of words
    print $out "\nCreating word-level search indexes\n";
    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    $mg_stem = &util::filename_cat($phinddir, "pword");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);

    # stemmed indexes: case folding (1), accent folding (2), both (3)
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);

    # create the mg document information database
    print $out "\nCreating document information databases\n";
    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    $mg_stem = &util::filename_cat($phinddir, "docs");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # parameters passed through to the phind applet/CGI
    my $parameters = "phindnumber=$self->{'phindnumber'}";
    if (defined ($self->{'textorientation'})) {
	$parameters .= ";textorientation=$self->{'textorientation'}";
    }
    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
			'childtype'=>'Phind',
			'Title'=>$self->{'buttonname'},
			'parameters'=>$parameters,
			'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
466
467
468
# Convert GML-marked-up text into the one-clause-per-line token format
# consumed by the suffix program.  English and Chinese are dispatched to
# specially tuned versions; all other languages take the generic path.

sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # escape any magic words... - jrm21
    # (lower-casing them stops document text from forging the special
    # collection/document/sentence delimiter symbols)
    foreach my $delim (@delimiters) {
	my $replacement=lc($delim);
	$text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /^en$/) {
	return &convert_gml_to_tokens_EN($text);
    }

    if ($language_exp =~ /zh/) {
	return &convert_gml_to_tokens_ZH($text);
    }

    # localise $_ so the substitutions below don't clobber the caller's
    # $_ as a side effect
    local $_ = $text;

    # 1. remove GML tags

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    s/&amp;/&/sgo;
    s/&lt;/</sgo;
    s/&gt;/>/sgo;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    s/\s+/ /gso;
    s/PARAGRAPHBREAK/\n/sgo;
    s/LINEBREAK/\n/sgo;


    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    s/\b/ /go;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # s/\s*[\.\?\!]\s+/\n/go;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
    #s/[^\w ]/\n/go;
    # replace punct with new lines - is this what we want??
    s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
    # then remove other punct with space
    s/[\'\`\\\_]/ /go;

    # remove extraneous whitespace
    s/ +/ /sgo;
    s/^\s+//mgo;
    s/\s*$/\n/mgo;

    # remove lines that contain one word or less
    s/^\S*$//mgo;
    s/^\s*$//mgo;
    tr/\n//s;

    return $_;
}
546
# A Chinese version of convert_gml_to_tokens: strips GML/HTML markup,
# decodes the common entities, and turns <p>/<br> into clause breaks
# (newlines).  No word-level tokenising is attempted.
#
# Works on a lexical copy of the argument - the original assigned to the
# global $_, clobbering the caller's $_ as a side effect.
sub convert_gml_to_tokens_ZH {

    my $text = shift @_;

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;
    # Replace paragraph/line breaks with placeholder words, then remove
    # everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (the second &amp; pass catches entities exposed by the first,
    # e.g. "&amp;amp;" -> "&amp;" -> "&")
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;

    return $text;
}
583
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language: strips GML/HTML markup, applies English-specific punctuation
# handling (omitted-letter apostrophes, initials, hyphenated words), and
# returns the text as one clause per line.
#
# Works on a lexical copy of the argument - the original assigned to the
# global $_, clobbering the caller's $_ as a side effect.

sub convert_gml_to_tokens_EN {
    my $text = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /gs;

    # Remove everything that is in a tag
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc. Remove these tags.
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;

    $text =~ s/\s+/ /sg;
    $text =~ s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    $text =~ s/\s*<br>\s*/ LINEBREAK /isg;
    $text =~ s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (the second &amp; pass catches entities exposed by the first)
    $text =~ s/&amp;/&/sg;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    $text =~ s/PARAGRAPHBREAK/\n/sg;
    $text =~ s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    $text =~ s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    $text =~ s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    $text =~ tr/A-Za-z0-9 //cs;

    # save email addresses
    # $text =~ s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    $text =~ s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    $text =~ s/ +/ /sg;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # remove lines that contain one word or less
    $text =~ s/^\w*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;

}
672
673
674
# Execute a system command, echoing it first when verbosity is high.
# On failure the whole build is aborted with a non-zero exit code.
#
# NOTE: system() returns the 16-bit wait status, not the child's exit
# code.  The old code did exit($status), but exit() truncates its
# argument to 8 bits, so a child exit code of 1 (status 256) made this
# process exit 0 and a failed build looked successful.  We now exit with
# the child's real exit code ($status >> 8), or 1 if the child died on a
# signal (in which case $status >> 8 is 0).

sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n" if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
	print STDERR "Phind - Error executing '$command': $!\n";
	exit(($status >> 8) || 1); # this causes the build to fail...
    }
}
687
688
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably wont
# run under windows.
#
# Outputs (all in the phind working directory):
#   clauses.vocab   - one "best form" word per line; line number = symbol
#   clauses.stats   - first/last symbol numbers for each word category
#   clauses.numbers - the clauses text re-encoded as symbol numbers
#   <thesaurus>.numbers - thesaurus links/terms as symbol numbers
#                         (only when a thesaurus is configured)

sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language'};

    my ($w, $l, $line, $word);

    # boundaries of each symbol category, written to clauses.stats below
    my ($first_delimiter, $last_delimiter,
	$first_stopword, $last_stopword,
	$first_extractword, $last_extractword,
	$first_contentword, $last_contentword,
	$phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
	%thesaurus, $first_thesaurusword, $last_thesaurusword);

    my %symbol;   # word (or delimiter) -> symbol number
    my (%freq);   # token -> occurrence count in the clauses file

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    my $stopword_files = (); # NOTE(review): never used below - looks vestigial
    my ($language, $language_dir, $file, $file_name);
    my %stopwords;

    # Examine each directory in the stopword directory
    opendir(STOPDIR, $stopdir);
    foreach $language (readdir STOPDIR) {

	# Ignore entries that do not match the classifier's language
	next unless ($language =~ /$language_exp/);
	$language_dir = &util::filename_cat($stopdir, $language);
	next unless (-d "$language_dir");

	opendir(LANGDIR, $language_dir);
	foreach $file (readdir LANGDIR) {

	    # Ignore entries that are not stopword files
	    next unless ($file =~ /sw$/);
	    $file_name = &util::filename_cat($language_dir, $file);
	    next unless (-f "$file_name");

	    # Read the stopwords: first whitespace-delimited token on
	    # each line, keyed by its lower-cased form
	    open(STOPFILE, "<$file_name");
	    while (<STOPFILE>) {
		s/^\s+//;
		s/\s.*//;
		$word = $_;
		$l = lc($word);
		$stopwords{$l} = $word;
	    }
	    close STOPFILE;

	}
	closedir LANGDIR;
    }
    closedir STOPDIR;

    # Read thesaurus information
    if ($thesaurus) {

	# link file exists
	$thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
	die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

	# ensure term file exists in the correct language
	# (two-letter language code; fall back to English)
	if ($language_exp =~ /^([a-z][a-z])/) {
	    $language = $1;
	} else {
	    $language = 'en';
	}
	$thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
	die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


	# Read the thesaurus terms, stripping the leading record number
	# and any parenthesised qualifier
	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    s/^\d+ //;
	    s/\(.*\)//;
	    foreach $w (split(/\s+/, $_)) {
		$thesaurus{lc($w)} = $w;
	    }
	}
	close TH;
    }

    # Read words in the text and count occurrences.  Stopwords and
    # thesaurus terms are counted under their lower-cased form.
    open(TXT, "<$phinddir/clauses");

    my @words;
    while(<TXT>) {
	$line = $_;
	next unless ($line =~ /./);

	@words = split(/\s+/, $line);
	foreach $w (@words) {
	    $l = lc($w);
	    $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
	    $freq{$w}++;
	}
	# every clause line implies a sentence delimiter
	$freq{$senlimit}++;
    }

    close TXT;

    # Calculate the "best" form of each word: the stopword/thesaurus
    # spelling if known, otherwise the most frequent capitalisation
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
	$l = lc($w);

	# totalfreq is the number of times a term appears in any form
	$totalfreq{$l} += $freq{$w};

	if (defined $stopwords{$l}) {
	    $bestform{$l} = $stopwords{$l};

	} elsif (defined $thesaurus{$l}) {
	    $bestform{$l} = $thesaurus{$l};

	} elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
	    $bestfreq{$l} = $freq{$w};
	    $bestform{$l} = $w;
	}
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, in category order: delimiters,
    # then stopwords, then thesaurus terms, then other content words
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

#	$word = lc($word); # jrm21
	$word = uc($word);
	$bestform{$word} = $word;
	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
	# don't include stopword unless it occurs in the text
	$word = lc($word);
	next unless ($totalfreq{$word});
	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
	$first_thesaurusword = $nextsymbol;

	foreach my $word (sort keys %thesaurus) {

	    $word = lc($word);
	    next if ($symbol{$word});
	    $bestform{$word} = $thesaurus{$word};

	    $vocab[$nextsymbol] = $word;
	    $symbol{$word} = $nextsymbol;
	    $nextsymbol++;

	}
	$last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

	next if ($symbol{$word});

	$vocab[$nextsymbol] = $word;
	$symbol{$word} = $nextsymbol;
	$nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words: line N of clauses.vocab is the best form of the
    # word with symbol number N
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phinddir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
	$w = $vocab[$i];

	print VOC "$bestform{$w}\n";
	$totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(STAT, ">$phinddir/clauses.stats")
	|| die "Cannot open $phinddir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($thesaurus) {
	print STAT "first_thesaurusword $first_thesaurusword\n";
	print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phinddir/clauses");
    open(NUM, ">$phinddir/clauses.numbers");

##    $phrasedelimiter = $symbol{lc($senlimit)};  # jrm21
##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print NUM "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

	# split sentence into a list of tokens
	$line = $_;
	next unless ($line =~ /./);
	@words = split(/\s+/, $line);

	# output one token at a time
	foreach $word (@words) {
# don't lower-case special delimiters - jrm21
	    if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
		$word = lc($word);
	    }
	    print NUM "$symbol{$word}\n";
	}

	# output phrase delimiter
	print NUM "$phrasedelimiter\n";
    }

    close TXT;
#    print NUM "$symbol{lc($colend)}\n";# jrm21
    print NUM "$symbol{$colend}\n";
    close NUM;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

	my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");


	print $out "Saving thesaurus as numbers in $thesaurusfile\n"
	    if ($verbosity > 1);

	# Read the thesaurus terms
	my ($num, $text, %thes_symbols);

	open(TH, "<$thesaurus_terms");
	while(<TH>) {
	    chomp;
	    @words = split(/\s+/, $_);
	    $num = shift @words;
	    $text = "";

	    # translate words into symbol numbers
	    foreach $word (@words) {
		$word = lc($word);
		if ($symbol{$word}) {
		    $text .= "s$symbol{$word} ";
		} elsif ($verbosity) {
		    print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
		}
	    }
	    $text =~ s/ $//;
	    $thes_symbols{$num} = $text;
	}
	close TH;

	# Read the thesaurus links and write the corresponding data
	open(TH, "<$thesaurus_links");
	open(THOUT, ">$thesaurusfile");

	while(<TH>) {
	    chomp;
	    ($num, $text) = split(/:/, $_);

	    if (defined($thes_symbols{$num})) {
		print THOUT "$num:$thes_symbols{$num}:$text\n";
	    } else {
		print THOUT "$num:untranslated:$text\n";
	    }
	}
	close TH;
	close THOUT;
    }




}
1032
1033
1034# renumber_phrases
1035#
# Prepare the phrases file to be input to mgpp. The biggest problem is
# reconciling the phrase identifiers used by the suffix program (which
# we'll call suffix-id numbers) with the numbers used in the thesaurus
# (thesaurus-id) to create a common set of phind id numbers (phind-id).
# Phind-id numbers must be sorted by frequency of occurrence.
1041#
# Start creating a set of phind-id numbers from the sorted suffix-id
# numbers and (if required) the thesaurus-id numbers. Then add any other
# phrases occurring in the thesaurus.
1045#
1046# The last thing we have to do is restore the vocabulary information to the
1047# phrase file so that the phrases are stored as words, not as symbol
1048# numbers.
1049
1050# The original phrases file looks something like this:
1051# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1052# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1053# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1054# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1055
1056
# Run the three renumbering passes over the phrases file (see the block
# comment above).  The order appears deliberate: suffix ids are mapped to
# phind ids first, thesaurus data is merged second, and the symbol
# numbers are translated back into words last.
sub renumber_phrases {
    my ($self) = @_;

    renumber_suffix_data($self);
    renumber_thesaurus_data($self);
    restore_vocabulary_data($self);

}
1065
1066
1067
1068# renumber_suffix_data
1069#
1070# Translate phrases file to phrases.2 using phind keys instead
1071# of suffix keys and sorting the expansion data.
1072
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
	if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Pass 1: read the phrase file, allocate a phind-id to each
    # suffix-id in file order, and record each phrase's total frequency
    # (indexed by phind-id) so that expansion lists can be sorted by
    # descending frequency in pass 2.
    my %suffixtophind;
    my @totalfrequency;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    while (<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get next suffixid and phindid
	$suffixid = shift @fields;
	$suffixtophind{$suffixid} = $nextphind;

	# skip the phrase text, then store the total frequency
	shift @fields;
	$totalfrequency[$nextphind] = shift @fields;

	$nextphind++;
    }
    close IN;


    # Pass 2: translate the phrase file to phrases.2, using phind keys
    # (not suffix keys) and sorting the expansion and document
    # occurrence data in order of descending frequency.
    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    open(OUT, ">", $outfile)
	or die "Cannot open $outfile for writing: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {

	# read the line
	chomp;
	@fields = split(/:/, $_);

	# get a phrase number for this line
	$suffixid = shift @fields;
	die "ERROR - no phind-id for suffix-id: $suffixid"
	    unless (defined($suffixtophind{$suffixid}));
	$phindid = $suffixtophind{$suffixid};

	# get the symbols in the phrase
	$text = shift @fields;

	# output status information
	$linenumber++;
	if ($verbosity > 2) {
	    if ($linenumber % 1000 == 0) {
		print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
	    }
	}

	# get the phrase frequency
	$tf = shift @fields;

	# get the number of expansions
	$countexp = shift @fields;

	# get the expansions, convert them into phind-id numbers, and
	# sort them by descending total frequency
	$expansions = shift @fields;
	@newexp = ();
	foreach $k (split(/,/, $expansions)) {
	    die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
	    $n = $suffixtophind{$k};
	    push @newexp, $n;
	}
	@newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

	# get the number of documents
	$countdocs = shift @fields;

	# get the documents (e.g. "d2240,2;d2253;...") and sort them by
	# descending within-document frequency
	$documents = shift @fields;
	$documents =~ s/d//g;
	my @documents = split(/;/, $documents);
	@documents = sort by_doc_frequency @documents;

	# output the phrase data
	print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
	print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close IN;
    close OUT
	or die "Cannot close $outfile: $!";
}
1180
1181
1182# renumber_thesaurus_data
1183#
1184# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1185
# Translate phrases.2 to phrases.3. If no thesaurus is configured the
# file is simply renamed; otherwise thesaurus entries are mapped onto
# the phind-id number space and their link data is appended to the
# matching phrase lines (new lines are added for thesaurus entries that
# never occur as phrases).
#
# NOTE(review): all opens in this sub are unchecked, so a missing input
# file silently produces empty data rather than an error.
sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
	print $out "Translate phrases.2: no thesaurus data\n"
	    if ($verbosity);
	&util::mv($infile, $outfile);
	return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
	if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping.
    # Each line of "$thesaurus.numbers" is "thesid:symbols:relations".
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    open(TH, "<$thesaurusfile");

    while (<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# get id and text
	$thesid = shift @fields;
	$symbols = shift @fields;
	$symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid. Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);

    open(IN, "<$infile");

    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	$phindid = shift @fields;
	$symbols = shift @fields;
	$freq = shift @fields;

	# do we have a thesaurus id corresponding to this phrase?
	if (defined($symbolstothesid{$symbols})) {
	    $thesid = $symbolstothesid{$symbols};
	    $thesaurustophindid{$thesid} = $phindid;
	    $phindidtofrequency{$phindid} = $freq;
	}
    }
    close IN;

    # the mapping is no longer needed; free the memory
    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    #
    # NOTE(review): $phindid here still holds the id read from the LAST
    # line of phrases.2 — this assumes phind-ids in phrases.2 are dense
    # and ascending (true for renumber_suffix_data output) and that
    # phrases.2 is non-empty; an empty file would leave $phindid undef.
    my $nextphindid = $phindid + 1;

    open(TH, "<$thesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# read thesaurus-id and ensure it has a corresponding phind-id
	$thesid = shift @fields;
	if (!defined($thesaurustophindid{$thesid})) {
	    $thesaurustophindid{$thesid} = $nextphindid;
	    $phindidtofrequency{$nextphindid} = 0;
	    $nextphindid++;
	}
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers. Output goes to "$thesaurus.phindid" as
    # "phindid:symbols:linkcount:relations:".
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile");
    open(TO, ">$newthesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	($thesid, $symbols, $relations) = @fields;

	die unless ($thesid && $symbols);
	die unless $thesaurustophindid{$thesid};
	$phindid = $thesaurustophindid{$thesid};

	# convert each part of the relation string to use phind-id numbers
	# at the same time, we want to sort the list by frequency.
	undef %linkidtotype;

	# $relations looks like "TYPE,id,id;TYPE,id;..." where TYPE is a
	# link type such as BT or NT
	foreach $linktext (split(/;/, $relations)) {
	    @linkdata = split(/,/, $linktext);

	    # remember the linktype (e.g. BT, NT)
	    $linktype = shift @linkdata;

	    # store the type of each link
	    foreach $thesid (@linkdata) {
		die unless (defined($thesaurustophindid{$thesid}));
		$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	    }
	}

	# sort the list of links, first by frequency, then by type.
	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
			    or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	$linkcounter = (scalar @links);

	# create a string describing the link information, grouping
	# consecutive links of the same type under one type label
	#
	# NOTE(review): assumes every thesaurus entry has at least one
	# link; an empty relation list would leave $links[0] undefined.
	$linktype = $linkidtotype{$links[0]};
	$newrelation = $linktype;
	foreach $linkid (@links) {
	    if ($linkidtotype{$linkid} ne $linktype) {
		$linktype = $linkidtotype{$linkid};
		$newrelation .= ";" . $linktype;
	    }
	    $newrelation .= "," . $linkid;
	}
	$newrelation .= ";";


	# output the new line
	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close TO;

    # free the translation tables before the final pass
    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile");
    while(<TH>) {
	chomp;
	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	die unless ($phindid && $symbols);
	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    #
    # NOTE(review): most of these variables (@documwents, @newexp, $k,
    # $n, $text, $tf, ...) are unused here — they look copied from
    # renumber_suffix_data.
    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
    my (@documwents, @newexp, $k, $n);
    my $linenumber = 0;

    open(IN, "<$infile");
    open(OUT, ">$outfile");

    # Update existing phrases: append ":linkcount:relations" to any
    # phrase line whose phind-id has thesaurus data, and blank that
    # entry so it is not emitted again below.
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get data for this line
	$phindid = shift @fields;

	# output the phrase data, with thesaurus information
	print OUT "$phindid:", join(":", @fields);

	# add thesaurus data
	if (defined($thesaurusdata{$phindid})) {
	    @fields = split(/:/, $thesaurusdata{$phindid});
	    shift @fields;
	    $linkcounter = shift @fields;
	    $relations = shift @fields;

	    print OUT ":$linkcounter:$relations";
	    $thesaurusdata{$phindid} = "";
	}
	print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file: thesaurus-only
    # entries get zero frequency/expansion/document counts.
    foreach $phindid (sort numerically keys %thesaurusdata) {
	next unless ($thesaurusdata{$phindid});

	@fields = split(/:/, $thesaurusdata{$phindid});
	$symbols = shift @fields;
	$linkcounter = shift @fields;
	$relations = shift @fields;

	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close OUT;

}
1409
1410# restore_vocabulary_data
1411#
1412# Read phrases.3 and restore vocabulary information. Then write
1413# this data to the MGPP input files (pword.txt and pdata.txt) and
1414# (if requested) to the saved phrases file.
1415
sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file: line N holds the word for symbol
    # number N (symbol numbers start at 1).
    open(V, "<", $vocabfile)
	or die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while (<V>) {
	chomp;
	$symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $word, $isThesaurus, $line);
    my @fields;
    my $linenumber = 0;

    open(IN, "<", $infile)
	or die "Cannot open $infile: $!";
    open(DATA, ">", $datafile)
	or die "Cannot open $datafile for writing: $!";
    open(WORD, ">", $wordfile)
	or die "Cannot open $wordfile for writing: $!";

    # Save the phrases in a separate text file
    if ($savephrases) {
	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	open(SAVE, ">", $savephrases)
	    or die "Cannot open $savephrases for writing: $!";
    }

    while (<IN>) {

	# read the line
	chomp;
	$line = $_;
	@fields = split(/:/, $line);

	# get a phrase number for this line
	$key = shift @fields;

	# restore the text of the phrase by expanding each "s<num>"
	# symbol into its vocabulary word
	$text = shift @fields;
	$text =~ s/s(\d+)/$symbol[$1]/g;

	# a single-word phrase is indexed under that word; multi-word
	# phrases and untranslated thesaurus entries get an empty entry.
	# Bug fix: previously $word was never reset when $text was
	# 'untranslated', so the previous line's word leaked into the
	# word index.
	if ($text =~ / / || $text eq 'untranslated') {
	    $word = "";
	} else {
	    $word = $text;
	}

	# output the phrase data
	print DATA "<Document>";
	print DATA "$key:$text:", join(":", @fields), ":\n";

	# output the word index search data
	print WORD "<Document>$word\n";

	# output the phrases to a text file
	if ($savephrases) {
	    # a thesaurus-derived line carries two extra fields
	    # (link count and relations)
	    if ((scalar @fields) == 7) {
		$isThesaurus = 1;
	    } else {
		$isThesaurus = 0;
	    }
	    print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	}
    }
    close IN;
    close WORD or die "Cannot close $wordfile: $!";
    close DATA or die "Cannot close $datafile: $!";
    close SAVE if ($savephrases);

}
1501
1502
1503
1504# sort routines used to renumber phrases
1505
# Ascending numeric comparison for sort().
sub numerically { return $a <=> $b; }
1507
# Compare two document-occurrence entries for sort(), ordering by
# descending frequency. Each entry is either "docnum" (frequency
# defaults to 1) or "docnum,frequency".
sub by_doc_frequency {
    my ($fa, $fb) = ($a, $b);
    foreach ($fa, $fb) {
	if (/,/) {
	    s/\d+,//;    # strip the leading "docnum," to leave the frequency
	} else {
	    $_ = 1;      # no comma: frequency defaults to 1
	}
    }
    return ($fb <=> $fa);
}
1522
15231;
Note: See TracBrowser for help on using the repository browser.