source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 21876

Last change on this file since 21876 was 21876, checked in by kjdon, 14 years ago

only process into english clauses if english is the only language, not for eg with ar|en. Don't remove all non \w characters - this removes all non alphanumeric chars. I have made up a punctuation match, some replaced with new lines, some with space.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.8 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use util;
36use ghtml;
37use unicode;
38
39use strict;
40no strict 'refs'; # allow filehandles to be variables and vice versa
41
42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45 't'=>1,
46 'ti'=>1,
47 'tl'=>1,
48 'tsd'=>1,
49 'idb'=>1,
50 'ib1'=>1,
51 'ib2'=>1,
52 'ib3'=>1,
53 'i'=>1,
54 'il'=>1,
55 'w'=>1,
56 'wa'=>1);
57
58sub BEGIN {
59 @Phind::ISA = ('BaseClassifier');
60}
61
62sub END {
63
64 # Tidy up stray files - we do this here as there's some weird problem
65 # preventing us from doing it in the get_classify_info() function (on
66 # windows at least) where the close() appears to fail on txthandle and
67 # dochandle, thus preventing us from deleting those files
68
69 foreach my $dir (@removedirs) {
70 if (-d $dir && opendir (DIR, $dir)) {
71 my @files = readdir DIR;
72 closedir DIR;
73
74 foreach my $file (@files) {
75 next if $file =~ /^\.\.?$/;
76 my ($suffix) = $file =~ /\.([^\.]+)$/;
77 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78 # delete it!
79 &util::rm (&util::filename_cat ($dir, $file));
80 }
81 }
82 }
83 }
84}
85
86my $arguments =
87 [ { 'name' => "text",
88 'desc' => "{Phind.text}",
89 'type' => "string",
90 'deft' => "section:Title,section:text",
91 'reqd' => "no" },
92 { 'name' => "title",
93 'desc' => "{Phind.title}",
94 'type' => "metadata",
95 'deft' => "Title",
96 'reqd' => "no" },
97 { 'name' => "buttonname",
98 'desc' => "{BasClas.buttonname}",
99 'type' => "string",
100 'deft' => "Phrase",
101 'reqd' => "no" },
102 { 'name' => "language",
103 'desc' => "{Phind.language}",
104 'type' => "string",
105 'deft' => "en",
106 'reqd' => "no" },
107 { 'name' => "savephrases",
108 'desc' => "{Phind.savephrases}",
109 'type' => "string",
110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "suffixmode",
113 'desc' => "{Phind.suffixmode}",
114 'type' => "int",
115 'deft' => "1",
116 'range' => "0,1",
117 'reqd' => "no" },
118 { 'name' => "min_occurs",
119 'desc' => "{Phind.min_occurs}",
120 'type' => "int",
121 'deft' => "2",
122 'range' => "1,",
123 'reqd' => "no" },
124 { 'name' => "thesaurus",
125 'desc' => "{Phind.thesaurus}",
126 'type' => "string",
127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "untidy",
130 'desc' => "{Phind.untidy}",
131 'type' => "flag",
132 'reqd' => "no" } ];
133
134my $options = { 'name' => "Phind",
135 'desc' => "{Phind.desc}",
136 'abstract' => "no",
137 'inherits' => "yes",
138 'args' => $arguments };
139
140
141# Phrase delimiter symbols - these should be abstracted out someplace
142
143my $colstart = "COLLECTIONSTART";
144my $colend = "COLLECTIONEND";
145my $doclimit = "DOCUMENTLIMIT";
146my $senlimit = "SENTENCELIMIT";
147my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
148
149
150# Create a new Phind browser based on collect.cfg
151
152sub new {
153 my ($class) = shift (@_);
154 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
155 push(@$classifierslist, $class);
156
157 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
158 push(@{$hashArgOptLists->{"OptList"}},$options);
159
160 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
161
162 if ($self->{'info_only'}) {
163 # don't worry about any options etc
164 return bless $self, $class;
165 }
166
167 # Ensure the Phind generate scripts are in place
168 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
169 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
170 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
176 # these options may have an "ex." metadata prefix that needs to be stripped off
177 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
178 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
179
180# Transfer the value from the Auto Parsing option to the variable name used in previous Greenstone versions.
181
182 $self->{"indexes"} = $self->{"text"};
183
184 # Further setup
185 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
186 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
187 if (! defined $self->{'builddir'}) {
188 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
189 }
190 $self->{'total'} = 0;
191
192 # Clean out the unused keys
193 delete $self->{"text"};
194
195 return bless $self, $class;
196}
197
198
199# Initialise the Phind classifier
200
201sub init {
202 my $self = shift (@_);
203
204 # ensure we have a build directory
205 my $builddir = $self->{'builddir'};
206 die unless (-e "$builddir");
207
208 # create Phind directory
209 my $phnumber = 1;
210 my $phinddir = &util::filename_cat($builddir, "phind1");
211 while (-e "$phinddir") {
212 $phnumber++;
213 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
214 }
215 &util::mk_dir("$phinddir");
216 $self->{'phinddir'} = $phinddir;
217 $self->{'phindnumber'} = $phnumber;
218
219 push(@removedirs, $phinddir) unless $self->{'untidy'};
220
221 # open filehandles for documents and text
222 my $clausefile = &util::filename_cat("$phinddir", "clauses");
223 &util::rm($clausefile) if (-e $clausefile);
224
225 my $txthandle = 'TEXT' . $phnumber;
226 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
227 $self->{'txthandle'} = $txthandle;
228
229 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
230 &util::rm($docfile) if (-e $docfile);
231
232 my $dochandle = 'DOC' . $phnumber;
233 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
234 $self->{'dochandle'} = $dochandle;
235
236}
237
238
239# Classify each document.
240#
241# Each document is passed here in turn. The classifier extracts the
242# text of each and stores it in the clauses file. Document details are
243# stored in the docs.txt file.
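# Each docs.txt record is one tab-separated line of the form
# "<Document>\tOID\ttitle", while the clauses file receives a
# DOCUMENTLIMIT marker followed by the document's tokenised text,
# one clause per line (see the code below).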
244
245sub classify {
246 my $self = shift (@_);
247 my ($doc_obj,$edit_mode) = @_;
248
249 my $verbosity = $self->{'verbosity'};
250 my $top_section = $doc_obj->get_top_section();
251
252 my $titlefield = $self->{'title'};
253
254 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
255 if (!defined($title)) {
256 $title = "";
257 print STDERR "Phind: document has no title\n";
258 }
259 print "process: $title\n" if ($verbosity > 2);
260
261 # Only consider the file if it is in the correct language
262 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
263 my $phrlanguage = $self->{'language'};
264 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
265
266 if ($edit_mode eq "delete") {
267 # This classifier works quite differently to the others
268 # Probably doesn't support incremental building anyway
269 return;
270 }
271
272 # record this file
273 $self->{'total'} ++;
274 # what is $file ???
275 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
276
277 # Store document details
278 my $OID = $doc_obj->get_OID();
279 $OID = "NULL" unless defined $OID;
280 my $dochandle = $self->{'dochandle'};
281 print $dochandle "<Document>\t$OID\t$title\n";
282
283 # Store the text occurring in this object
284
285 # output the document delimiter
286 my $txthandle = $self->{'txthandle'};
287 print $txthandle "$doclimit\n";
288
289 # iterate over the required indexes and store their text
290 my $indexes = $self->{'indexes'};
291 my $text = "";
292 my ($part, $level, $field, $section, $data, $dataref);
293
294 foreach $part (split(/,/, $indexes)) {
295
296 # Each field has a level and a data element (e.g. document:Title)
297 ($level, $field) = split(/:/, $part);
298 die unless ($level && $field);
299
300 # Extract the text from every section
301 # (In phind, document:text and section:text are equivalent)
302 if ($field eq "text") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $data .= $doc_obj->get_text($section) . "\n";
307 $section = $doc_obj->get_next_section($section);
308 }
309 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
310 }
311
312 # Extract a metadata field from a document
313 # (If there is more than one element of the given type, get them all.)
314 elsif ($level eq "document") {
315 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
316 foreach $data (@$dataref) {
317 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
318 }
319 }
320
321 # Extract metadata from every section in a document
322 elsif ($level eq "section") {
323 $data = "";
324 $section = $doc_obj->get_top_section();
325 while (defined($section)) {
326 $dataref = $doc_obj->get_metadata($section, $field);
327 $data .= join("\n", @$dataref) . "\n";
328 $section = $doc_obj->get_next_section($section);
329 }
330 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
331 }
332
333 # Some sort of specification which I don't understand
334 else {
335 die "Unknown level ($level) in Phind index ($part)\n";
336 }
337
338 }
339
340 # output the text
341 $text =~ tr/\n//s;
342 print $txthandle "$text";
343}
344
345
346# Construct the classifier from the information already gathered
347#
348# When get_classify_info is called, the clauses and docs.txt files have
349# already been constructed in the Phind directory. This function will
350# translate them into compressed, indexed MGPP files that can be read by
351# the phindcgi script. It will also register our classifier so that it
352# shows up in the navigation bar.
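#
# In outline, the files involved are (roughly, as used by the code below):
#
#   clauses, docs.txt                  - written during classify()
#   clauses.vocab, .stats, .numbers    - written by extract_vocabulary()
#   phrases                            - written by the external "suffix" program
#   phrases.2, phrases.3               - written by renumber_phrases()
#   pdata.txt, pword.txt               - written by restore_vocabulary_data()
#   pdata.*, pword.*, docs.* indexes   - built by the mgpp_* tools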
353
354sub get_classify_info {
355 my $self = shift (@_);
356 my ($gli) = @_;
357
358 close $self->{'dochandle'};
359 close $self->{'txthandle'};
360 my $verbosity = $self->{'verbosity'};
361 my $out = $self->{'outhandle'};
362 my $phinddir = $self->{'phinddir'};
363
364 my $osextra = "";
365 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
366 $osextra = " -d /";
367 }
368
369 print STDERR "</Stage>\n" if $gli;
370
371 if ($verbosity) {
372 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
373 print $out "*** in ", $self->{'phinddir'}, "\n";
374 }
375
376 print STDERR "<Stage name='Phind'>\n" if $gli;
377
378 # Construct phind indexes
379 my $suffixmode = $self->{'suffixmode'};
380 my $min_occurs = $self->{'min_occurs'};
381 my ($command, $status);
382
383 # Generate the vocabulary, symbol statistics, and numbers file
384 # from the clauses file
385 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
386 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
387 &extract_vocabulary($self);
388
389 # Use the suffix program to generate the phind/phrases file
390 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
391 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
392 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
393
394 # check that we generated some files. It's not necessarily an error if
395 # we didn't (execute() would have quit on error), but we can't go on.
396 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
397 if (! -r $phrasesfile) {
398 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
399 print $out "\nNo phrases found for Phind classifier!\n";
400 return;
401 }
402
403 # Create the phrase file and put phrase numbers in phind/phrases
404 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
405 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
406 &renumber_phrases($self);
407
408 print $out "\nCreating phrase databases\n";
409 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
410 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
411 my $mg_stem = &util::filename_cat($phinddir, "pdata");
412
413 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
414 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
415 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
416
417 # create the mg index of words
418 print $out "\nCreating word-level search indexes\n";
419 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
420 $mg_input = &util::filename_cat($phinddir, "pword.txt");
421 $mg_stem = &util::filename_cat($phinddir, "pword");
422
423 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
424 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
425 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
427 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
429
430 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
431 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
432 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
433
434 # create the mg document information database
435 print $out "\nCreating document information databases\n";
436 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
437 $mg_input = &util::filename_cat($phinddir, "docs.txt");
438 $mg_stem = &util::filename_cat($phinddir, "docs");
439
440 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
441 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
442 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
443
444 # Return the information about the classifier that we'll later want to
445 # use to create macros when the Phind classifier document is displayed.
446 my %classifyinfo = ('thistype'=>'Invisible',
447 'childtype'=>'Phind',
448 'Title'=>$self->{'buttonname'},
449 'parameters'=>"phindnumber=$self->{'phindnumber'}",
450 'contains'=>[]);
451
452 my $collection = $self->{'collection'};
453 my $url = "library?a=p&p=phind&c=$collection";
454 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
455
456 return \%classifyinfo;
457}
458
459
460
461sub convert_gml_to_tokens {
462
463 my ($language_exp, $text) = @_;
464
465 # escape any magic words... - jrm21
466 foreach my $delim (@delimiters) {
467 my $replacement=lc($delim);
468 my $num= $text=~ s/$delim/$replacement/g;
469 if (!$num) {$num=0;}
470 }
471
472 if ($language_exp =~ /^en$/) {
473 return &convert_gml_to_tokens_EN($text);
474 }
475
476 if ($language_exp =~ /zh/) {
477 return &convert_gml_to_tokens_ZH($text);
478 }
479
480 $_ = $text;
481
482 # 1. remove GML tags
483
484 # Remove everything that is in a tag
485 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
486 s/\s*<br>\s*/ LINEBREAK /isgo;
487 s/<[^>]*>/ /sgo;
488
489 # Now we have the text, but it may still contain HTML elements
490 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
491 s/&amp;/&/sgo;
492 s/&lt;/</sgo;
493 s/&gt;/>/sgo;
494 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
495 s/\s*<br>\s*/ LINEBREAK /isgo;
496 s/<[^>]*>/ /sgo;
497
498 # replace the <p> and <br> placeholders with the clause break symbol (\n)
499 s/\s+/ /gso;
500 s/PARAGRAPHBREAK/\n/sgo;
501 s/LINEBREAK/\n/sgo;
502
503
504 # 2. Split the remaining text into space-delimited tokens
505
506 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
507 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
508
509 # Split text at word boundaries
510 s/\b/ /go;
511
512 # 3. Convert the remaining text to "clause format"
513
514 # Insert newline if the end of a sentence is detected
515 # (delimiter is: "[\.\?\!]\s")
516 # s/\s*[\.\?\!]\s+/\n/go;
517
518 # remove unnecessary punctuation and replace with clause break symbol (\n)
519 # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
520 #s/[^\w ]/\n/go;
521 # replace punct with new lines - is this what we want??
522 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
523 # then remove other punct with space
524 s/[\'\`\\\_]/ /go;
525
526 # remove extraneous whitespace
527 s/ +/ /sgo;
528 s/^\s+//mgo;
529 s/\s*$/\n/mgo;
530
531 # remove lines that contain one word or less
532 s/^\S*$//mgo;
533 s/^\s*$//mgo;
534 tr/\n//s;
535
536 return $_;
537}
538
539# a Chinese version of convert_gml_to_tokens
540sub convert_gml_to_tokens_ZH {
541
542 $_ = shift @_;
543
544 # Replace all whitespace with a simple space
545 s/\s+/ /gs;
546 # Remove everything that is in a tag
547 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
548 s/\s*<br>\s*/ LINEBREAK /isg;
549 s/<[^>]*>/ /sg;
550
551 # Now we have the text, but it may still contain HTML elements
552 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
553 s/&lt;/</sg;
554 s/&gt;/>/sg;
555
556 s/\s+/ /sg;
557 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
558 s/\s*<br>\s*/ LINEBREAK /isg;
559 s/<[^>]*>/ /sg;
560
561 # remove &amp; and other miscellaneous markup tags
562 s/&amp;/&/sg;
563 s/&lt;/</sg;
564 s/&gt;/>/sg;
565 s/&amp;/&/sg;
566
567 # replace the <p> and <br> placeholders with newlines
568 s/PARAGRAPHBREAK/\n/sg;
569 s/LINEBREAK/\n/sg;
570
571
572# print STDERR "text:$_\n";
573 return $_;
574}
575
576# A version of convert_gml_to_tokens that is fine-tuned to the English language.
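#
# As a rough illustration (the sample text is made up), input like
#
#   <p>The cat sat on the mat. It was happy!</p>
#
# should come out as one clause per line:
#
#   The cat sat on the mat
#   It was happy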
577
578sub convert_gml_to_tokens_EN {
579 $_ = shift @_;
580
581 # FIRST, remove GML tags
582
583 # Replace all whitespace with a simple space
584 s/\s+/ /gs;
585
586 # Remove everything that is in a tag
587 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
588 s/\s*<br>\s*/ LINEBREAK /isg;
589 s/<[^>]*>/ /sg;
590
591 # Now we have the text, but it may still contain HTML elements
592 # coded as entities (&gt; etc.). Decode them, then strip any tags this reveals.
593 s/&lt;/</sg;
594 s/&gt;/>/sg;
595
596 s/\s+/ /sg;
597 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
598 s/\s*<br>\s*/ LINEBREAK /isg;
599 s/<[^>]*>/ /sg;
600
601 # remove &amp; and other miscellaneous markup tags
602 s/&amp;/&/sg;
603 s/&lt;/</sg;
604 s/&gt;/>/sg;
605 s/&amp;/&/sg;
606
607 # replace the <p> and <br> placeholders with newlines
608 s/PARAGRAPHBREAK/\n/sg;
609 s/LINEBREAK/\n/sg;
610
611
612 # Exceptional punctuation
613 #
614 # We make special cases of some punctuation
615
616 # remove any apostrophe that indicates omitted letters
617 s/(\w+)\'(\w*\s)/ $1$2 /g;
618
619 # remove a period that appears in a person's initials
620 s/\s([A-Z])\./ $1 /g;
621
622 # replace hyphens in hyphenated words and names with a space
623 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
624
625 # Convert the remaining text to "clause format".
626 # This means removing all excess punctuation and garbage text,
627 # normalising valid punctuation to fullstops and commas,
628 # then putting one clause on each line.
629
630 # Insert newline when the end of a sentence is detected
631 # (delimiter is: "[\.\?\!]\s")
632 s/\s*[\.\?\!]\s+/\n/g;
633
634 # split numbers after four digits
635 s/(\d\d\d\d)/$1 /g;
636
637 # split words after 32 characters
638
639 # squash repeated punctuation
640 tr/A-Za-z0-9 //cs;
641
642 # save email addresses
643 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
644
645 # normalise clause breaks (mostly punctuation symbols) to commas
646 s/[^A-Za-z0-9 \n]+/ , /g;
647
648 # Remove repeated commas, and replace with newline
649 s/\s*,[, ]+/\n/g;
650
651 # remove extra whitespace
652 s/ +/ /sg;
653 s/^\s+//mg;
654 s/\s*$/\n/mg;
655
656 # remove lines that contain one word or less
657 s/^\w*$//mg;
658 s/^\s*$//mg;
659 tr/\n//s;
660
661 return $_;
662
663}
664
665
666
667# Execute a system command
668
669sub execute {
670 my ($command, $verbosity, $outhandle) = @_;
671 print $outhandle "Executing: $command\n" if ($verbosity > 2);
672 $! = 0;
673 my $status = system($command);
674 if ($status != 0) {
675 print STDERR "Phind - Error executing '$command': $!\n";
676 exit($status); # this causes the build to fail...
677 }
678}
679
680
681# Generate the vocabulary, symbol statistics, and numbers file from the
682# clauses file. This is legacy code, so it is a bit messy and probably won't
683# run under Windows.
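#
# The files written here (all in the phind build directory) are roughly:
#   clauses.vocab   - one token per line; line i holds the "best" form of symbol i
#   clauses.stats   - symbol-number ranges, e.g. "first_stopword <n>"
#   clauses.numbers - the clauses text again, one symbol number per line
# plus <thesaurus>.numbers when a thesaurus is configured.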
684
685sub extract_vocabulary {
686 my ($self) = @_;
687
688 my $verbosity = $self->{'verbosity'};
689 my $out = $self->{'outhandle'};
690
691 my $collectiondir = $self->{'collectiondir'};
692 my $phinddir = $self->{'phinddir'};
693
694 my $language_exp = $self->{'language'};
695
696 my ($w, $l, $line, $word);
697
698 my ($first_delimiter, $last_delimiter,
699 $first_stopword, $last_stopword,
700 $first_extractword, $last_extractword,
701 $first_contentword, $last_contentword,
702 $phrasedelimiter);
703
704 my $thesaurus = $self->{'thesaurus'};
705 my ($thesaurus_links, $thesaurus_terms,
706 %thesaurus, $first_thesaurusword, $last_thesaurusword);
707
708 my %symbol;
709 my (%freq);
710
711 print $out "Calculating vocabulary\n" if ($verbosity > 1);
712
713 # Read and store the stopwords
714 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
715 my $stopword_files = ();
716 my ($language, $language_dir, $file, $file_name);
717 my %stopwords;
718
719 # Examine each directory in the stopword directory
720 opendir(STOPDIR, $stopdir);
721 foreach $language (readdir STOPDIR) {
722
723 # Ignore entries that do not match the classifier's language
724 next unless ($language =~ /$language_exp/);
725 $language_dir = &util::filename_cat($stopdir, $language);
726 next unless (-d "$language_dir");
727
728 opendir(LANGDIR, $language_dir);
729 foreach $file (readdir LANGDIR) {
730
731 # Ignore entries that are not stopword files
732 next unless ($file =~ /sw$/);
733 $file_name = &util::filename_cat($language_dir, $file);
734 next unless (-f "$file_name");
735
736 # Read the stopwords
737 open(STOPFILE, "<$file_name");
738 while (<STOPFILE>) {
739 s/^\s+//;
740 s/\s.*//;
741 $word = $_;
742 $l = lc($word);
743 $stopwords{$l} = $word;
744 }
745 close STOPFILE;
746
747 }
748 closedir LANGDIR;
749 }
750 closedir STOPDIR;
751
752 # Read thesaurus information
753 if ($thesaurus) {
754
755 # link file exists
756 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
757 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
758
759 # ensure term file exists in the correct language
760 if ($language_exp =~ /^([a-z][a-z])/) {
761 $language = $1;
762 } else {
763 $language = 'en';
764 }
765 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
766 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
767
768
769 # Read the thesaurus terms
770 open(TH, "<$thesaurus_terms");
771 while(<TH>) {
772 s/^\d+ //;
773 s/\(.*\)//;
774 foreach $w (split(/\s+/, $_)) {
775 $thesaurus{lc($w)} = $w;
776 }
777 }
778 close TH;
779 }
780
781 # Read the words in the text and count occurrences
782 open(TXT, "<$phinddir/clauses");
783
784 my @words;
785 while(<TXT>) {
786 $line = $_;
787 next unless ($line =~ /./);
788
789 @words = split(/\s+/, $line);
790 foreach $w (@words) {
791 $l = lc($w);
792 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
793 $freq{$w}++;
794 }
795 $freq{$senlimit}++;
796 }
797
798 close TXT;
799
800 # Calculate the "best" form of each word
801 my (%bestform, %totalfreq, %bestfreq);
802
803 foreach $w (sort (keys %freq)) {
804 $l = lc($w);
805
806 # totalfreq is the number of times a term appears in any form
807 $totalfreq{$l} += $freq{$w};
808
809 if (defined $stopwords{$l}) {
810 $bestform{$l} = $stopwords{$l};
811
812 } elsif (defined $thesaurus{$l}) {
813 $bestform{$l} = $thesaurus{$l};
814
815 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
816 $bestfreq{$l} = $freq{$w};
817 $bestform{$l} = $w;
818 }
819 }
820 undef %freq;
821 undef %bestfreq;
822
823
824 # Assign symbol numbers to tokens
825 my $nextsymbol = 1;
826 my (@vocab);
827
828 # Delimiters
829 $first_delimiter = 1;
830
831 foreach $word (@delimiters) {
832
833# $word = lc($word); # jrm21
834 $word = uc($word);
835 $bestform{$word} = $word;
836 $vocab[$nextsymbol] = $word;
837 $symbol{$word} = $nextsymbol;
838 $nextsymbol++;
839 }
840 $last_delimiter = $nextsymbol - 1;
841 # Stopwords
842 $first_stopword = $nextsymbol;
843
844 foreach my $word (sort keys %stopwords) {
845 # don't include stopword unless it occurs in the text
846 $word = lc($word);
847 next unless ($totalfreq{$word});
848 next if ($symbol{$word});
849
850 $vocab[$nextsymbol] = $word;
851 $symbol{$word} = $nextsymbol;
852 $nextsymbol++;
853 }
854 $last_stopword = $nextsymbol - 1;
855 $first_contentword = $nextsymbol;
856
857 # Thesaurus terms
858 if ($thesaurus) {
859 $first_thesaurusword = $nextsymbol;
860
861 foreach my $word (sort keys %thesaurus) {
862
863 $word = lc($word);
864 next if ($symbol{$word});
865 $bestform{$word} = $thesaurus{$word};
866
867 $vocab[$nextsymbol] = $word;
868 $symbol{$word} = $nextsymbol;
869 $nextsymbol++;
870
871 }
872 $last_thesaurusword = $nextsymbol - 1;
873 }
874
875 # Other content words
876 $first_extractword = $nextsymbol;
877
878 foreach my $word (sort (keys %bestform)) {
879
880 next if ($symbol{$word});
881
882 $vocab[$nextsymbol] = $word;
883 $symbol{$word} = $nextsymbol;
884 $nextsymbol++;
885 }
886 $last_extractword = $nextsymbol - 1;
887 $last_contentword = $nextsymbol - 1;
888
889 # Output the words
890 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
891 open(VOC, ">$phinddir/clauses.vocab");
892
893 for (my $i = 1; $i < $nextsymbol; $i++) {
894 $w = $vocab[$i];
895
896 print VOC "$bestform{$w}\n";
897 $totalfreq{$w} = 0 unless ($totalfreq{$w});
898 }
899 close VOC;
900
901
902 # Create statistics file
903 # Output statistics about the vocabulary
904 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
905 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
906
907 open(STAT, ">$phinddir/clauses.stats")
908 || die "Cannot open $phinddir/clauses.stats: $!";
909
910 print STAT "first_delimiter $first_delimiter\n";
911 print STAT "last_delimiter $last_delimiter\n";
912 print STAT "first_stopword $first_stopword\n";
913 print STAT "last_stopword $last_stopword\n";
914 if ($thesaurus) {
915 print STAT "first_thesaurusword $first_thesaurusword\n";
916 print STAT "last_thesaurusword $last_thesaurusword\n";
917 }
918 print STAT "first_extractword $first_extractword\n";
919 print STAT "last_extractword $last_extractword\n";
920 print STAT "first_contentword $first_contentword\n";
921 print STAT "last_contentword $last_contentword\n";
922 print STAT "first_symbol $first_delimiter\n";
923 print STAT "last_symbol $last_contentword\n";
924 print STAT "first_word $first_stopword\n";
925 print STAT "last_word $last_contentword\n";
926 close STAT;
927
928 undef @vocab;
929
930
931 # Create numbers file
932 # Save text as symbol numbers
933 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
934
935 open(TXT, "<$phinddir/clauses");
936 open(NUM, ">$phinddir/clauses.numbers");
937
938## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
939## print NUM "$symbol{lc($colstart)}\n"; # jrm21
940 $phrasedelimiter = $symbol{$senlimit};
941 print NUM "$symbol{$colstart}\n";
942
943 # translate each clause into symbol numbers (the special symbols that delimit documents and sentences were set up above)
944 while(<TXT>) {
945
946 # split sentence into a list of tokens
947 $line = $_;
948 next unless ($line =~ /./);
949 @words = split(/\s+/, $line);
950
951 # output one token at a time
952 foreach $word (@words) {
953# don't lower-case special delimiters - jrm21
954 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
955 $word = lc($word);
956 }
957 print NUM "$symbol{$word}\n";
958 }
959
960 # output phrase delimiter
961 print NUM "$phrasedelimiter\n";
962 }
963
964 close TXT;
965# print NUM "$symbol{lc($colend)}\n";# jrm21
966 print NUM "$symbol{$colend}\n";
967 close NUM;
968
969 # Save thesaurus data in one convenient file
970 if ($thesaurus) {
971
972 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
973
974
975 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
976 if ($verbosity > 1);
977
978 # Read the thesaurus terms
979 my ($num, $text, %thes_symbols);
980
981 open(TH, "<$thesaurus_terms");
982 while(<TH>) {
983 chomp;
984 @words = split(/\s+/, $_);
985 $num = shift @words;
986 $text = "";
987
988 # translate words into symbol numbers
989 foreach $word (@words) {
990 $word = lc($word);
991 if ($symbol{$word}) {
992 $text .= "s$symbol{$word} ";
993 } elsif ($verbosity) {
994 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
995 }
996 }
997 $text =~ s/ $//;
998 $thes_symbols{$num} = $text;
999 }
1000 close TH;
1001
1002 # Read the thesaurus links and write the corresponding data
1003 open(TH, "<$thesaurus_links");
1004 open(THOUT, ">$thesaurusfile");
1005
1006 while(<TH>) {
1007 chomp;
1008 ($num, $text) = split(/:/, $_);
1009
1010 if (defined($thes_symbols{$num})) {
1011 print THOUT "$num:$thes_symbols{$num}:$text\n";
1012 } else {
1013 print THOUT "$num:untranslated:$text\n";
1014 }
1015 }
1016 close TH;
1017 close THOUT;
1018 }
1019
1020
1021
1022
1023}
1024
1025
1026# renumber_phrases
1027#
1028# Prepare the phrases file to be input to mgpp. The biggest problem is
1029# reconciling the phrase identifiers used by the suffix program (which
1030# we'll call suffix-id numbers) with the numbers used in the thesaurus
1031# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1032# Phind-id numbers must be sorted by frequency of occurrence.
1033#
1034# Start creating a set of phind-id numbers from the sorted suffix-id
1035# numbers and (if required) the thesaurus-id numbers. Then add any other
1036# phrases occurring in the thesaurus.
1037#
1038# The last thing we have to do is restore the vocabulary information to the
1039# phrase file so that the phrases are stored as words, not as symbol
1040# numbers.
1041
1042# The original phrases file looks something like this:
1043# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1044# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1045# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1046# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1047
1048
1049sub renumber_phrases {
1050 my ($self) = @_;
1051
1052 renumber_suffix_data($self);
1053 renumber_thesaurus_data($self);
1054 restore_vocabulary_data($self);
1055
1056}
1057
1058
1059
1060# renumber_suffix_data
1061#
1062# Translate phrases file to phrases.2 using phind keys instead
1063# of suffix keys and sorting the expansion data.
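#
# Each phrases.2 line, as written below, has the form
#
#   phind-id : symbols : total-frequency : expansion-count : document-count :
#       expansions (phind-ids, comma-terminated) : documents (semicolon-terminated)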
1064
1065sub renumber_suffix_data {
1066 my ($self) = @_;
1067
1068 my $verbosity = $self->{'verbosity'};
1069 my $out = $self->{'outhandle'};
1070 print $out "Translate phrases: suffix-ids become phind-id's\n"
1071 if ($verbosity);
1072
1073 my $phinddir = $self->{'phinddir'};
1074 my $infile = &util::filename_cat($phinddir, 'phrases');
1075 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1076
1077 # Read the phrase file. Calculate initial set of phind-id
1078 # numbers and store (suffixid -> frequency) relation.
1079
1080 my %suffixtophind;
1081 my @totalfrequency;
1082 my (@fields, $suffixid);
1083 my $nextphind = 1;
1084
1085 open(IN, "<$infile");
1086 while(<IN>) {
1087
1088 chomp;
1089 @fields = split(/:/, $_);
1090
1091 # get next suffixid and phindid
1092 $suffixid = shift @fields;
1093 $suffixtophind{$suffixid} = $nextphind;
1094
1095 # store total frequency
1096 shift @fields;
1097 $totalfrequency[$nextphind] = shift @fields;
1098
1099 $nextphind++;
1100 }
1101 close IN;
1102
1103
1104 # Translate phrases file to phrases.2. Use phind keys (not suffix
1105 # keys), sort expansion and document occurrence data in order of
1106 # descending frequency.
1107 open(IN, "<$infile");
1108 open(OUT, ">$outfile");
1109
1110 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1111 my (@documwents, @newexp, $k, $n);
1112 my $linenumber = 0;
1113
1114 while(<IN>) {
1115
1116 # read the line
1117 chomp;
1118 @fields = split(/:/, $_);
1119
1120 # get a phrase number for this line
1121 $suffixid = shift @fields;
1122 die unless (defined($suffixtophind{$suffixid}));
1123 $phindid = $suffixtophind{$suffixid};
1124
1125 # get the symbols in the phrase
1126 $text = shift @fields;
1127
1128 # output status information
1129 $linenumber++;
1130 if ($verbosity > 2) {
1131 if ($linenumber % 1000 == 0) {
1132 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1133 }
1134 # what are $num and $key??
1135 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1136 }
1137
1138 # get the phrase frequency
1139 $tf = shift @fields;
1140
1141 # get the number of expansions
1142 $countexp = shift @fields;
1143
1144 # get the expansions, convert them into phind-id numbers, and sort them
1145 $expansions = shift @fields;
1146 @newexp = ();
1147 foreach $k (split(/,/, $expansions)) {
1148 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1149 $n = $suffixtophind{$k};
1150 push @newexp, $n;
1151 }
1152 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1153
1154 # get the number of documents
1155 $countdocs = shift @fields;
1156
1157 # get the documents and sort them
1158 $documents = shift @fields;
1159 $documents =~ s/d//g;
1160 my @documents = split(/;/, $documents);
1161 @documents = sort by_doc_frequency @documents;
1162
1163 # output the phrase data
1164 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1165 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1166
1167 }
1168
1169 close IN;
1170 close OUT;
1171}
1172
1173
1174# renumber_thesaurus_data
1175#
1176# Translate phrases.2 to phrases.3, adding thesaurus data if available.
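#
# Each line of <thesaurus>.numbers (written by extract_vocabulary) looks
# roughly like
#
#   thesaurus-id : s<symbol> s<symbol> ... : TYPE,id,id;TYPE,id;...
#
# where TYPE is a thesaurus link type (e.g. BT, NT) and the ids are
# thesaurus-ids.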
1177
1178sub renumber_thesaurus_data {
1179 my ($self) = @_;
1180
1181 my $out = $self->{'outhandle'};
1182 my $verbosity = $self->{'verbosity'};
1183 my $thesaurus = $self->{'thesaurus'};
1184
1185 my $phinddir = $self->{'phinddir'};
1186 my $infile = &util::filename_cat($phinddir, "phrases.2");
1187 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1188
1189
1190 # If no thesaurus is defined, simply move the phrases file.
1191 if (!$thesaurus) {
1192 print $out "Translate phrases.2: no thesaurus data\n"
1193 if ($verbosity);
1194 &util::mv($infile, $outfile);
1195 return;
1196 }
1197
1198 print $out "Translate phrases.2: add thesaurus data\n"
1199 if ($verbosity);
1200
1201 # 1.
1202 # Read thesaurus file and store (symbols->thesaurusid) mapping
1203 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1204 my %symbolstothesid;
1205 my (@fields, $thesid, $symbols);
1206
1207 open(TH, "<$thesaurusfile");
1208
1209 while (<TH>) {
1210
1211 chomp;
1212 @fields = split(/:/, $_);
1213
1214 # get id and text
1215 $thesid = shift @fields;
1216 $symbols = shift @fields;
1217 $symbolstothesid{$symbols} = $thesid;
1218 }
1219 close TH;
1220
1221 # 2.
1222 # Read phrases file to find thesaurus entries that already
1223 # have a phindid. Store their phind-ids for later translation,
1224 # and store their frequency for later sorting.
1225 my %thesaurustophindid;
1226 my %phindidtofrequency;
1227 my ($phindid, $freq);
1228
1229 open(IN, "<$infile");
1230
1231 while(<IN>) {
1232
1233 chomp;
1234 @fields = split(/:/, $_);
1235
1236 # phindid and symbols for this line
1237 $phindid = shift @fields;
1238 $symbols = shift @fields;
1239 $freq = shift @fields;
1240
1241 # do we have a thesaurus id corresponding to this phrase?
1242 if (defined($symbolstothesid{$symbols})) {
1243 $thesid = $symbolstothesid{$symbols};
1244 $thesaurustophindid{$thesid} = $phindid;
1245 $phindidtofrequency{$phindid} = $freq;
1246 }
1247 }
1248 close IN;
1249
1250 undef %symbolstothesid;
1251
1252 # 3.
1253 # Create phind-id numbers for remaining thesaurus entries,
1254 # and note that their frequency is 0 for later sorting.
1255 my $nextphindid = $phindid + 1;
1256
1257 open(TH, "<$thesaurusfile");
1258 while(<TH>) {
1259
1260 chomp;
1261 @fields = split(/:/, $_);
1262
1263 # read thesaurus-id and ensure it has a corresponding phind-id
1264 $thesid = shift @fields;
1265 if (!defined($thesaurustophindid{$thesid})) {
1266 $thesaurustophindid{$thesid} = $nextphindid;
1267 $phindidtofrequency{$nextphindid} = 0;
1268 $nextphindid++;
1269 }
1270 }
1271 close TH;
1272
1273 # 4.
1274 # Translate thesaurus file, replacing thesaurus-id numbers with
1275 # phind-id numbers.
1276 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1277 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1278 my (@links, $linkid, %linkidtotype, $newrelation);
1279
1280 open(TH, "<$thesaurusfile");
1281 open(TO, ">$newthesaurusfile");
1282 while(<TH>) {
1283
1284 chomp;
1285 @fields = split(/:/, $_);
1286
1287 # phindid and symbols for this line
1288 ($thesid, $symbols, $relations) = @fields;
1289
1290 die unless ($thesid && $symbols);
1291 die unless $thesaurustophindid{$thesid};
1292 $phindid = $thesaurustophindid{$thesid};
1293
1294 # convert each part of the relation string to use phind-id numbers
1295 # at the same time, we want to sort the list by frequency.
1296 undef %linkidtotype;
1297
1298 foreach $linktext (split(/;/, $relations)) {
1299 @linkdata = split(/,/, $linktext);
1300
1301 # remember the linktype (e.g. BT, NT)
1302 $linktype = shift @linkdata;
1303
1304 # store the type of each link
1305 foreach $thesid (@linkdata) {
1306 die unless (defined($thesaurustophindid{$thesid}));
1307 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1308 }
1309 }
1310
1311 # sort the list of links, first by frequency, then by type.
1312 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1313 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1314 $linkcounter = (scalar @links);
1315
1316 # create a string describing the link information
1317 $linktype = $linkidtotype{$links[0]};
1318 $newrelation = $linktype;
1319 foreach $linkid (@links) {
1320 if ($linkidtotype{$linkid} ne $linktype) {
1321 $linktype = $linkidtotype{$linkid};
1322 $newrelation .= ";" . $linktype;
1323 }
1324 $newrelation .= "," . $linkid;
1325 }
1326 $newrelation .= ";";
1327
1328
1329 # output the new line
1330 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1331 }
1332 close TH;
1333 close TO;
1334
1335 undef %thesaurustophindid;
1336 undef %linkidtotype;
1337 undef %phindidtofrequency;
1338
1339 # 5.
1340 # Read thesaurus data (in phind-id format) into memory
1341 my %thesaurusdata;
1342
1343 open(TH, "<$newthesaurusfile");
1344 while(<TH>) {
1345 chomp;
1346 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1347 die unless ($phindid && $symbols);
1348 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1349 }
1350 close TH;
1351
1352 # 6.
1353 # Add thesaurus data to phrases file
1354 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1355 my (@documwents, @newexp, $k, $n);
1356 my $linenumber = 0;
1357
1358 open(IN, "<$infile");
1359 open(OUT, ">$outfile");
1360
1361 # Update existing phrases
1362 while(<IN>) {
1363
1364 chomp;
1365 @fields = split(/:/, $_);
1366
1367 # get data for this line
1368 $phindid = shift @fields;
1369
1370 # output the phrase data, with thesaurus information
1371 print OUT "$phindid:", join(":", @fields);
1372
1373 # add thesaurus data
1374 if (defined($thesaurusdata{$phindid})) {
1375 @fields = split(/:/, $thesaurusdata{$phindid});
1376 shift @fields;
1377 $linkcounter = shift @fields;
1378 $relations = shift @fields;
1379
1380 print OUT ":$linkcounter:$relations";
1381 $thesaurusdata{$phindid} = "";
1382 }
1383 print OUT "\n";
1384 }
1385 close IN;
1386
1387 # Add phrases that aren't already in the file
1388 foreach $phindid (sort numerically keys %thesaurusdata) {
1389 next unless ($thesaurusdata{$phindid});
1390
1391 @fields = split(/:/, $thesaurusdata{$phindid});
1392 $symbols = shift @fields;
1393 $linkcounter = shift @fields;
1394 $relations = shift @fields;
1395
1396 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1397 }
1398 close OUT;
1399
1400}
1401
1402# restore_vocabulary_data
1403#
1404# Read phrases.3 and restore vocabulary information. Then write
1405# this data to the MGPP input files (pword.txt and pdata.txt) and
1406# (if requested) to the saved phrases file.
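#
# Each phrase becomes one "<Document>" record in both output files:
# pdata.txt holds the full colon-separated phrase record with symbol
# numbers expanded back into words, while pword.txt holds just the phrase
# text when the phrase is a single word (and an empty entry otherwise).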
1407
1408sub restore_vocabulary_data {
1409 my ($self) = @_;
1410
1411 my $out = $self->{'outhandle'};
1412 my $verbosity = $self->{'verbosity'};
1413 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1414
1415 my $phinddir = $self->{'phinddir'};
1416 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1417 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1418 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1419 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1420
1421 my $savephrases = $self->{'savephrases'};
1422
1423 # 1.
1424 # Read the vocabulary file
1425 open(V, "<$vocabfile")
1426 || die "Cannot open $vocabfile: $!";
1427 my @symbol;
1428 my $i = 1;
1429 while(<V>) {
1430 chomp;
1431 $symbol[$i++] = $_;
1432 }
1433 close V;
1434
1435 # 2.
1436 # Translate phrases.3 to MGPP input files
1437 my ($key, $text, $word, $isThesaurus, $line);
1438 my @fields;
1439 my $linenumber = 0;
1440
1441 open(IN, "<$infile");
1442 open(DATA, ">$datafile");
1443 open(WORD, ">$wordfile");
1444
1445 # Save the phrases in a separate text file
1446 if ($savephrases) {
1447 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1448 open(SAVE, ">$savephrases");
1449 }
1450
1451 while(<IN>) {
1452
1453 # read the line
1454 chomp;
1455 $line = $_;
1456 @fields = split(/:/, $line);
1457
1458 # get a phrase number for this line
1459 $key = shift @fields;
1460
1461 # restore the text of the phrase
1462 $text = shift @fields;
1463 $text =~ s/s(\d+)/$symbol[$1]/g;
1464 if ($text =~ / /) {
1465 $word = "";
1466 } elsif ($text ne 'untranslated') {
1467 $word = $text;
1468 }
1469
1470 # output the phrase data
1471 print DATA "<Document>";
1472 print DATA "$key:$text:", join(":", @fields), ":\n";
1473
1474 # output the word index search data
1475 print WORD "<Document>$word\n";
1476
1477 # output the phrases to a text file
1478 if ($savephrases) {
1479 if ((scalar @fields) == 7) {
1480 $isThesaurus = 1;
1481 } else {
1482 $isThesaurus = 0;
1483 }
1484 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1485 }
1486 }
1487 close IN;
1488 close WORD;
1489 close DATA;
1490 close SAVE if ($savephrases);
1491
1492}
1493
1494
1495
1496# sort routines used to renumber phrases
1497
1498sub numerically { $a <=> $b }
1499
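# Compare document entries of the form "docnum,frequency" (a missing
# ",frequency" part counts as 1), sorting by descending frequency.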
1500sub by_doc_frequency {
1501 my $fa = 1;
1502 if ($a =~ /,/) {
1503 $fa = $a;
1504 $fa =~ s/\d+,//;
1505 }
1506 my $fb = 1;
1507 if ($b =~ /,/) {
1508 $fb = $b;
1509 $fb =~ s/\d+,//;
1510 }
1511
1512 return ($fb <=> $fa);
1513}
1514
15151;