source: trunk/gsdl/perllib/classify/Phind.pm@ 3590

Last change on this file since 3590 was 3540, checked in by kjdon, 22 years ago

added John T's changes into CVS - added info to enable retrieval of usage info in xml

  • Property svn:keywords set to Author Date Id Revision
File size: 41.8 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Options are dexcribed in the print_usage function.
31# Type "classinfo.pl Phind" at the command line for a summary.
32
33# 12/05/02 Added usage datastructure - John Thompson
34
35package Phind;
36
37use BasClas;
38use util;
39use ghtml;
40use unicode;
41
42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45 't'=>1,
46 'ti'=>1,
47 'tl'=>1,
48 'tsd'=>1,
49 'idb'=>1,
50 'ib1'=>1,
51 'ib2'=>1,
52 'ib3'=>1,
53 'i'=>1,
54 'il'=>1,
55 'w'=>1,
56 'wa'=>1);
57
58sub BEGIN {
59 @ISA = ('BasClas');
60}
61
62sub END {
63
64 # Tidy up stray files - we do this here as there's some weird problem
65 # preventing us from doing it in the get_classify_info() function (on
66 # windows at least) where the close() appears to fail on txthandle and
67 # dochandle, thus preventing us from deleting those files
68
69 foreach my $dir (@removedirs) {
70 if (-d $dir && opendir (DIR, $dir)) {
71 my @files = readdir DIR;
72 closedir DIR;
73
74 foreach $file (@files) {
75 next if $file =~ /^\.\.?$/;
76 my ($suffix) = $file =~ /\.([^\.]+)$/;
77 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78 # delete it!
79 &util::rm (&util::filename_cat ($dir, $file));
80 }
81 }
82 }
83 }
84}
85
86my $arguments =
87[ { 'name' => "text",
88 'desc' => "The text used to build the phrase hierarchy (default: 'section:Title,section:text').",
89 'type' => "string",
90 'reqd' => "no" } ,
91{ 'name' => "title",
92 'desc' => "The metadata field used to describe each document (default: 'Title').",
93 'type' => "metadata",
94 'reqd' => "no" } ,
95{ 'name' => "button",
96 'desc' => "The label for the classifier screen and button in navigation bar (default: 'Phrase').",
97 'type' => "string",
98 'reqd' => "no" } ,
99{ 'name' => "language",
100 'desc' => "Language or languages to use building hierarchy. Languages are identified by two-letter country codes like en (English), es (Spanish), and fr (French). Language is a regular expression, so 'en|fr' (English or French) and '..' (match any language) are valid (default: 'en').",
101 'type' => "language",
102 'reqd' => "no" } ,
103{ 'name' => "savephrases",
104 'desc' => "If set, the phrase infomation will be stored in the given file as text. It is probably a good idea to use an absolute path (default: not set).",
105 'type' => "string",
106 'reqd' => "no" } ,
107{ 'name' => "suffixmode",
108 'desc' => "The smode parameter to the phrase extraction program. A value of 0 means that stopwords are ignored, and of 1 means that stopwords are used (default: 1).",
109 'type' => "int",
110 'reqd' => "no" } ,
111{ 'name' => "thesaurus",
112 'desc' => "Name of a thesaurus stored in Phind format in the collection's etc directory (default: not set).",
113 'type' => "string",
114 'reqd' => "no" } ,
115{ 'name' => "untidy",
116 'desc' => "Don't remove working files.",
117 'type' => "flag",
118 'reqd' => "no" }
119];
120
121my $options =
122{ 'name' => "Phind",
123 'desc' => "The Phind clasifier plugin.",
124 'inherits' => "Yes",
125 'args' => $arguments };
126
127sub print_usage {
128 print STDERR "
129 usage: classify Phind [options]
130
131 options:
132 -text Fields The text used to build the phrase hierarchy.
133 (default: 'section:Title,section:text')
134
135 -title Title The metadata field used to describe each document.
136 (default: 'Title')
137
138 -button Name The label for the classifier screen and button in
139 navigation bar.
140 (default: 'Phrase')
141
142 -language Regex Language or languages to use building hierarchy.
143 Languages are identified by two-letter country codes
144 like en (English), es (Spanish), and fr (French).
145 Language is a regular expression, so 'en|fr' (English or
146 French) and '..' (match any language) are valid.
147 (default: 'en'.)
148
149 -savephrases File If set, the phrase infomation will be stored in
150 the given file as text. It is probably a good idea
151 to use an absolute path.
152 (default: not set)
153
154 -suffixmode N The smode parameter to the phrase extraction program. A
155 value of 0 means that stopwords are ignored, and of 1
156 means that stopwords are used.
157 (default: 1)
158
159 -thesaurus Name Name of a thesaurus stored in Phind format in the
160 collection's etc directory.
161 (default: not set)
162
163 -untidy Don't remove working files.
164
165"; }
166
167# Phrase delimiter symbols - these should be abstracted out someplace
168
169my $colstart = "COLLECTIONSTART";
170my $colend = "COLLECTIONEND";
171my $doclimit = "DOCUMENTLIMIT";
172my $senlimit = "SENTENCELIMIT";
173my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
174
175
176# Create a new Phind browser based on collect.cfg
177
178sub new {
179 my $class = shift (@_);
180 my $self = new BasClas($class, @_);
181
182 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
183 my $option_list = $self->{'option_list'};
184 push( @{$option_list}, $options );
185
186 my $out = $self->{'outhandle'};
187
188 # Ensure the Phind generate scripts are in place
189 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
190 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
191 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
192
193 if (!(-e $file1)) {
194 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
195 exit(1);
196 }
197
198 # Parse classifier arguments
199 my $builddir = "";
200 if (!parsargv::parse(\@_,
201 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
202 q^title/.*/Title^, \$self->{'titlefield'},
203 q^button/.*/Phrase^, \$self->{'buttonname'},
204 q^language/.*/en^, \$language,
205 q^builddir/.*/^, \$builddir,
206 q^savephrases/.*/^, \$self->{'savephrases'},
207 q^suffixmode/\d/1^, \$self->{'suffixmode'},
208 q^thesaurus/.*/^, \$self->{'thesaurus'},
209 q^untidy^, \$self->{'untidy'},
210 "allow_extra_options")) {
211
212 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
213 &print_usage();
214 die "\n";
215 }
216
217 # classifier information
218 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
219
220 # limit languages
221 $self->{'language_exp'} = $language;
222
223 # collection directories
224 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
225 if (!$builddir) {
226 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
227 }
228 $self->{'builddir'} = $builddir;
229
230 $self->{'total'} = 0;
231
232 return bless $self, $class;
233}
234
235
236# Initialise the Phind classifier
237
238sub init {
239 my $self = shift (@_);
240
241 # ensure we have a build directory
242 my $builddir = $self->{'builddir'};
243 die unless (-e "$builddir");
244
245 # create Phind directory
246 my $phnumber = 1;
247 my $phinddir = &util::filename_cat($builddir, "phind1");
248 while (-e "$phinddir") {
249 $phnumber++;
250 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
251 }
252 &util::mk_dir("$phinddir");
253 $self->{'phinddir'} = $phinddir;
254 $self->{'phindnumber'} = $phnumber;
255
256 push(@removedirs, $phinddir) unless $self->{'untidy'};
257
258 # open filehandles for documents and text
259 my $clausefile = &util::filename_cat("$phinddir", "clauses");
260 &util::rm($clausefile) if (-e $clausefile);
261
262 my $txthandle = 'TEXT' . $phnumber;
263 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
264 $self->{'txthandle'} = $txthandle;
265
266 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
267 &util::rm($docfile) if (-e $docfile);
268
269 my $dochandle = 'DOC' . $phnumber;
270 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
271 $self->{'dochandle'} = $dochandle;
272
273}
274
275
276# Classify each document.
277#
278# Each document is passed here in turn. The classifier extracts the
279# text of each and stores it in the clauses file. Document details are
280# stored in the docs.txt file.
281
282sub classify {
283 my $self = shift (@_);
284 my $doc_obj = shift @_;
285
286 my $verbosity = $self->{'verbosity'};
287 my $top_section = $doc_obj->get_top_section();
288
289 my $titlefield = $self->{'titlefield'};
290
291 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
292 if (!defined($title)) {
293 $title = "";
294 print STDERR "Phind: document has no title\n";
295 }
296 print "process: $title\n" if ($verbosity > 2);
297
298 # Only consider the file if it is in the correct language
299 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
300 my $phrlanguage = $self->{'language_exp'};
301 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
302
303 # record this file
304 $self->{'total'} ++;
305 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
306
307
308 # Store document details
309 my $OID = $doc_obj->get_OID();
310 $OID = "NULL" unless defined $OID;
311 my $dochandle = $self->{'dochandle'};
312 print $dochandle "<Document>\t$OID\t$title\n";
313
314 # Store the text occuring in this object
315
316 # output the document delimiter
317 my $txthandle = $self->{'txthandle'};
318 print $txthandle "$doclimit\n";
319
320 # iterarate over the required indexes and store their text
321 my $indexes = $self->{'indexes'};
322 my $text = "";
323 my ($part, $level, $field, $section, $data, $dataref);
324
325 foreach $part (split(/,/, $indexes)) {
326
327 # Each field has a level and a data element ((e.g. document:Title)
328 ($level, $field) = split(/:/, $part);
329 die unless ($level && $field);
330
331 # Extract the text from every section
332 # (In phind, document:text and section:text are equivalent)
333 if ($field eq "text") {
334 $data = "";
335 $section = $doc_obj->get_top_section();
336 while (defined($section)) {
337 $data .= $doc_obj->get_text($section) . "\n";
338 $section = $doc_obj->get_next_section($section);
339 }
340 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
341 }
342
343 # Extract a metadata field from a document
344 # (If there is more than one element of the given type, get them all.)
345 elsif ($level eq "document") {
346 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
347 foreach $data (@$dataref) {
348 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
349 }
350 }
351
352 # Extract metadata from every section in a document
353 elsif ($level eq "section") {
354 $data = "";
355 $section = $doc_obj->get_top_section();
356 while (defined($section)) {
357 $dataref = $doc_obj->get_metadata($section, $field);
358 $data .= join("\n", @$dataref) . "\n";
359 $section = $doc_obj->get_next_section($section);
360 }
361 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
362 }
363
364 # Some sort of specification which I don't understand
365 else {
366 die "Unknown level ($level) in Phind index ($part)\n";
367 }
368
369 }
370
371 # output the text
372 $text =~ tr/\n//s;
373 print $txthandle "$text";
374}
375
376
377# Construct the classifier from the information already gathered
378#
379# When get_classify_info is called, the clauses and docs.txt files have
380# already been constructed in the Phind directory. This function will
381# translate them into compressed, indexed MGPP files that can be read by
382# the phindcgi script. It will also register our classifier so that it
383# shows up in the navigation bar.
384
385sub get_classify_info {
386 my $self = shift (@_);
387
388 close $self->{'dochandle'};
389 close $self->{'txthandle'};
390 my $verbosity = $self->{'verbosity'};
391 my $out = $self->{'outhandle'};
392 my $phinddir = $self->{'phinddir'};
393
394 my $osextra = "";
395 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
396 $osextra = " -d /";
397 }
398
399 if ($verbosity) {
400 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
401 print $out "*** in ", $self->{'phinddir'}, "\n";
402 }
403
404 # Construct phind indexes
405 my $suffixmode = $self->{'suffixmode'};
406 my ($command, $status);
407
408 # Generate the vocabulary, symbol statistics, and numbers file
409 # from the clauses file
410 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
411 &extract_vocabulary($self);
412
413 # Use the suffix program to generate the phind/phrases file
414 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
415 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
416
417 # check that we generated some files. It's not necessarily an error if
418 # we didn't (execute() would have quit on error), but we can't go on.
419 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
420 if (! -r $phrasesfile) {
421 print $out "\nNo phrases found for Phind classifier!\n";
422 return;
423 }
424
425 # Create the phrase file and put phrase numbers in phind/phrases
426 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
427 &renumber_phrases($self);
428
429 print $out "\nCreating phrase databases\n";
430 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
431 my $mg_stem = &util::filename_cat($phinddir, "pdata");
432
433 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
434 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
435 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
436
437 # create the mg index of words
438 print $out "\nCreating word-level search indexes\n";
439 $mg_input = &util::filename_cat($phinddir, "pword.txt");
440 $mg_stem = &util::filename_cat($phinddir, "pword");
441
442 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
443 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
444 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
445 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
446 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
447 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
448
449 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
450 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
451 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
452
453 # create the mg document information database
454 print $out "\nCreating document information databases\n";
455 $mg_input = &util::filename_cat($phinddir, "docs.txt");
456 $mg_stem = &util::filename_cat($phinddir, "docs");
457
458 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
459 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
460 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
461
462 # Return the information about the classifier that we'll later want to
463 # use to create macros when the Phind classifier document is displayed.
464 my %classifyinfo = ('thistype'=>'Invisible',
465 'childtype'=>'Phind',
466 'Title'=>$self->{'buttonname'},
467 'parameters'=>"phindnumber=$self->{'phindnumber'}",
468 'contains'=>[]);
469
470 my $collection = $self->{'collection'};
471 my $url = "library?a=p&p=phind&c=$collection";
472 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
473
474 return \%classifyinfo;
475}
476
477
478
479sub convert_gml_to_tokens {
480
481 my ($language_exp, $text) = @_;
482
483 # escape any magic words... - jrm21
484 foreach my $delim (@delimiters) {
485 my $replacement=lc($delim);
486 my $num= $text=~ s/$delim/$replacement/g;
487 if (!$num) {$num=0;}
488 }
489
490 if ($language_exp =~ /en/) {
491 return &convert_gml_to_tokens_EN($text);
492 }
493
494 if ($language_exp =~ /zh/) {
495 return &convert_gml_to_tokens_ZH($text);
496 }
497
498 $_ = $text;
499
500 # 1. remove GML tags
501
502 # Remove everything that is in a tag
503 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
504 s/\s*<br>\s*/ LINEBREAK /isgo;
505 s/<[^>]*>/ /sgo;
506
507 # Now we have the text, but it may contain HTML
508 # elements coded as &gt; etc. Remove these tags.
509 s/&amp;/&/sgo;
510 s/&lt;/</sgo;
511 s/&gt;/>/sgo;
512 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
513 s/\s*<br>\s*/ LINEBREAK /isgo;
514 s/<[^>]*>/ /sgo;
515
516 # replace<p> and <br> placeholders with clause break symbol (\n)
517 s/\s+/ /gso;
518 s/PARAGRAPHBREAK/\n/sgo;
519 s/LINEBREAK/\n/sgo;
520
521
522
523
524 # 2. Split the remaining text into space-delimited tokens
525
526 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
527 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
528
529 # Split text at word boundaries
530 s/\b/ /go;
531
532 # 3. Convert the remaining text to "clause format"
533
534 # Insert newline if the end of a sentence is detected
535 # (delimter is: "[\.\?\!]\s")
536 # s/\s*[\.\?\!]\s+/\n/go;
537
538 # remove unnecessary punctuation and replace with clause break symbol (\n)
539 s/[^\w ]/\n/go;
540
541 # remove extraneous whitespace
542 s/ +/ /sgo;
543 s/^\s+//mgo;
544 s/\s*$/\n/mgo;
545
546 # remove lines that contain one word or less
547 s/^\S*$//mgo;
548 s/^\s*$//mgo;
549 tr/\n//s;
550
551 return $_;
552}
553
554# a chinese version
555sub convert_gml_to_tokens_ZH {
556
557 $_ = shift @_;
558
559 # Replace all whitespace with a simple space
560 s/\s+/ /gs;
561 # Remove everything that is in a tag
562 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
563 s/\s*<br>\s*/ LINEBREAK /isg;
564 s/<[^>]*>/ /sg;
565
566 # Now we have the text, but it may contain HTML
567 # elements coded as &gt; etc. Remove these tags.
568 s/&lt;/</sg;
569 s/&gt;/>/sg;
570
571 s/\s+/ /sg;
572 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
573 s/\s*<br>\s*/ LINEBREAK /isg;
574 s/<[^>]*>/ /sg;
575
576 # remove &amp; and other miscellaneous markup tags
577 s/&amp;/&/sg;
578 s/&lt;/</sg;
579 s/&gt;/>/sg;
580 s/&amp;/&/sg;
581
582 # replace<p> and <br> placeholders with carriage returns
583 s/PARAGRAPHBREAK/\n/sg;
584 s/LINEBREAK/\n/sg;
585
586
587# print STDERR "text:$_\n";
588 return $_;
589}
590
591# A version of convert_gml_to_tokens that is fine-tuned to the English language.
592
593sub convert_gml_to_tokens_EN {
594 $_ = shift @_;
595
596 # FIRST, remove GML tags
597
598 # Replace all whitespace with a simple space
599 s/\s+/ /gs;
600
601 # Remove everything that is in a tag
602 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
603 s/\s*<br>\s*/ LINEBREAK /isg;
604 s/<[^>]*>/ /sg;
605
606 # Now we have the text, but it may contain HTML
607 # elements coded as &gt; etc. Remove these tags.
608 s/&lt;/</sg;
609 s/&gt;/>/sg;
610
611 s/\s+/ /sg;
612 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
613 s/\s*<br>\s*/ LINEBREAK /isg;
614 s/<[^>]*>/ /sg;
615
616 # remove &amp; and other miscellaneous markup tags
617 s/&amp;/&/sg;
618 s/&lt;/</sg;
619 s/&gt;/>/sg;
620 s/&amp;/&/sg;
621
622 # replace<p> and <br> placeholders with carriage returns
623 s/PARAGRAPHBREAK/\n/sg;
624 s/LINEBREAK/\n/sg;
625
626
627 # Exceptional punctuation
628 #
629 # We make special cases of some punctuation
630
631 # remove any apostrophe that indicates omitted letters
632 s/(\w+)\'(\w*\s)/ $1$2 /g;
633
634 # remove period that appears in a person's initals
635 s/\s([A-Z])\./ $1 /g;
636
637 # replace hyphens in hypheanted words and names with a space
638 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
639
640 # Convert the remaining text to "clause format",
641 # This means removing all excess punctuation and garbage text,
642 # normalising valid punctuation to fullstops and commas,
643 # then putting one cluse on each line.
644
645 # Insert newline when the end of a sentence is detected
646 # (delimter is: "[\.\?\!]\s")
647 s/\s*[\.\?\!]\s+/\n/g;
648
649 # split numbers after four digits
650 s/(\d\d\d\d)/$1 /g;
651
652 # split words after 32 characters
653
654 # squash repeated punctuation
655 tr/A-Za-z0-9 //cs;
656
657 # save email addresses
658 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
659
660 # normalise clause breaks (mostly punctuation symbols) to commas
661 s/[^A-Za-z0-9 \n]+/ , /g;
662
663 # Remove repeated commas, and replace with newline
664 s/\s*,[, ]+/\n/g;
665
666 # remove extra whitespace
667 s/ +/ /sg;
668 s/^\s+//mg;
669 s/\s*$/\n/mg;
670
671 # remove lines that contain one word or less
672 s/^\w*$//mg;
673 s/^\s*$//mg;
674 tr/\n//s;
675
676 return $_;
677
678}
679
680
681
682# Execute a system command
683
684sub execute {
685 my ($command, $verbosity, $outhandle) = @_;
686 print $outhandle "Executing: $command\n" if ($verbosity > 2);
687 $! = 0;
688 my $status = system($command);
689 if ($status != 0) {
690 print STDERR "Phind - Error executing '$command': $!\n";
691 exit($status); # this causes the build to fail...
692 }
693}
694
695
696# Generate the vocabulary, symbol statistics, and numbers file from the
697# clauses file. This is legacy code, so is a bit messy and probably wont
698# run under windows.
699
700sub extract_vocabulary {
701 my ($self) = @_;
702
703 my $verbosity = $self->{'verbosity'};
704 my $out = $self->{'outhandle'};
705
706 my $collectiondir = $self->{'collectiondir'};
707 my $phinddir = $self->{'phinddir'};
708
709 my $language_exp = $self->{'language_exp'};
710
711 my ($w, $l, $line, $word);
712
713 my ($first_delimiter, $last_delimiter,
714 $first_stopword, $last_stopword,
715 $first_extractword, $last_extractword,
716 $first_contentword, $last_contentword,
717 $phrasedelimiter);
718
719 my $thesaurus = $self->{'thesaurus'};
720 my ($thesaurus_links, $thesaurus_terms,
721 %thesaurus, $first_thesaurusword, $last_thesaurusword);
722
723 my %symbol;
724 my (%freq);
725
726 print $out "Calculating vocabulary\n" if ($verbosity > 1);
727
728 # Read and store the stopwords
729 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
730 my $stopword_files = ();
731 my ($language, $language_dir, $file, $file_name);
732 my %stopwords;
733
734 # Examine each directory in the stopword directory
735 opendir(STOPDIR, $stopdir);
736 foreach $language (readdir STOPDIR) {
737
738 # Ignore entries that do not match the classifier's language
739 next unless ($language =~ /$language_exp/);
740 $language_dir = &util::filename_cat($stopdir, $language);
741 next unless (-d "$language_dir");
742
743 opendir(LANGDIR, $language_dir);
744 foreach $file (readdir LANGDIR) {
745
746 # Ignore entries that are not stopword files
747 next unless ($file =~ /sw$/);
748 $file_name = &util::filename_cat($language_dir, $file);
749 next unless (-f "$file_name");
750
751 # Read the stopwords
752 open(STOPFILE, "<$file_name");
753 while (<STOPFILE>) {
754 s/^\s+//;
755 s/\s.*//;
756 $word = $_;
757 $l = lc($word);
758 $stopwords{$l} = $word;
759 }
760 close STOPFILE;
761
762 }
763 closedir LANGDIR;
764 }
765 closedir STOPDIR;
766
767 # Read thesaurus information
768 if ($thesaurus) {
769
770 # link file exists
771 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
772 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
773
774 # ensure term file exists in the correct language
775 if ($language_exp =~ /^([a-z][a-z])/) {
776 $language = $1;
777 } else {
778 $language = 'en';
779 }
780 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
781 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
782
783
784 # Read the thesaurus terms
785 open(TH, "<$thesaurus_terms");
786 while(<TH>) {
787 s/^\d+ //;
788 s/\(.*\)//;
789 foreach $w (split(/\s+/, $_)) {
790 $thesaurus{lc($w)} = $w;
791 }
792 }
793 close TH;
794 }
795
796 # Read words in the text and count occurences
797 open(TXT, "<$phinddir/clauses");
798
799 my @words;
800 while(<TXT>) {
801 $line = $_;
802 next unless ($line =~ /./);
803
804 @words = split(/\s+/, $line);
805 foreach $w (@words) {
806 $l = lc($w);
807 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
808 $freq{$w}++;
809 }
810 $freq{$senlimit}++;
811 }
812
813 close TXT;
814
815 # Calculate the "best" form of each word
816 my (%bestform, %totalfreq, %bestfreq);
817
818 foreach $w (sort (keys %freq)) {
819 $l = lc($w);
820
821 # totalfreq is the number of times a term appears in any form
822 $totalfreq{$l} += $freq{$w};
823
824 if (defined $stopwords{$l}) {
825 $bestform{$l} = $stopwords{$l};
826
827 } elsif (defined $thesaurus{$l}) {
828 $bestform{$l} = $thesaurus{$l};
829
830 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
831 $bestfreq{$l} = $freq{$w};
832 $bestform{$l} = $w;
833 }
834 }
835 undef %freq;
836 undef %bestfreq;
837
838
839 # Assign symbol numbers to tokens
840 my $nextsymbol = 1;
841 my (@vocab);
842
843 # Delimiters
844 $first_delimiter = 1;
845
846 foreach $word (@delimiters) {
847
848# $word = lc($word); # jrm21
849 $word = uc($word);
850 $bestform{$word} = $word;
851 $vocab[$nextsymbol] = $word;
852 $symbol{$word} = $nextsymbol;
853 $nextsymbol++;
854 }
855 $last_delimiter = $nextsymbol - 1;
856 # Stopwords
857 $first_stopword = $nextsymbol;
858
859 foreach my $word (sort keys %stopwords) {
860 # don't include stopword unless it occurs in the text
861 $word = lc($word);
862 next unless ($totalfreq{$word});
863 next if ($symbol{$word});
864
865 $vocab[$nextsymbol] = $word;
866 $symbol{$word} = $nextsymbol;
867 $nextsymbol++;
868 }
869 $last_stopword = $nextsymbol - 1;
870 $first_contentword = $nextsymbol;
871
872 # Thesaurus terms
873 if ($thesaurus) {
874 $first_thesaurusword = $nextsymbol;
875
876 foreach my $word (sort keys %thesaurus) {
877
878 $word = lc($word);
879 next if ($symbol{$word});
880 $bestform{$word} = $thesaurus{$word};
881
882 $vocab[$nextsymbol] = $word;
883 $symbol{$word} = $nextsymbol;
884 $nextsymbol++;
885
886 }
887 $last_thesaurusword = $nextsymbol - 1;
888 }
889
890 # Other content words
891 $first_extractword = $nextsymbol;
892
893 foreach my $word (sort (keys %bestform)) {
894
895 next if ($symbol{$word});
896
897 $vocab[$nextsymbol] = $word;
898 $symbol{$word} = $nextsymbol;
899 $nextsymbol++;
900 }
901 $last_extractword = $nextsymbol - 1;
902 $last_contentword = $nextsymbol - 1;
903
904 # Outut the words
905 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
906 open(VOC, ">$phinddir/clauses.vocab");
907
908 for (my $i = 1; $i < $nextsymbol; $i++) {
909 $w = $vocab[$i];
910
911 print VOC "$bestform{$w}\n";
912 $totalfreq{$w} = 0 unless ($totalfreq{$w});
913 }
914 close VOC;
915
916
917 # Create statistics file
918 # Output statistics about the vocablary
919 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
920 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
921
922 open(STAT, ">$phinddir/clauses.stats")
923 || die "Cannot open $phinddir/clauses.stats: $!";
924
925 print STAT "first_delimiter $first_delimiter\n";
926 print STAT "last_delimiter $last_delimiter\n";
927 print STAT "first_stopword $first_stopword\n";
928 print STAT "last_stopword $last_stopword\n";
929 if ($thesaurus) {
930 print STAT "first_thesaurusword $first_thesaurusword\n";
931 print STAT "last_thesaurusword $last_thesaurusword\n";
932 }
933 print STAT "first_extractword $first_extractword\n";
934 print STAT "last_extractword $last_extractword\n";
935 print STAT "first_contentword $first_contentword\n";
936 print STAT "last_contentword $last_contentword\n";
937 print STAT "first_symbol $first_delimiter\n";
938 print STAT "last_symbol $last_contentword\n";
939 print STAT "first_word $first_stopword\n";
940 print STAT "last_word $last_contentword\n";
941 close STAT;
942
943 undef @vocab;
944
945
946 # Create numbers file
947 # Save text as symbol numbers
948 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
949
950 open(TXT, "<$phinddir/clauses");
951 open(NUM, ">$phinddir/clauses.numbers");
952
953## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
954## print NUM "$symbol{lc($colstart)}\n"; # jrm21
955 $phrasedelimiter = $symbol{$senlimit};
956 print NUM "$symbol{$colstart}\n";
957
958 # set up the special symbols that delimit documents and sentences
959 while(<TXT>) {
960
961 # split sentence into a list of tokens
962 $line = $_;
963 next unless ($line =~ /./);
964 @words = split(/\s+/, $line);
965
966 # output one token at a time
967 foreach $word (@words) {
968# don't lower-case special delimiters - jrm21
969 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
970 $word = lc($word);
971 }
972 print NUM "$symbol{$word}\n";
973 }
974
975 # output phrase delimiter
976 print NUM "$phrasedelimiter\n";
977 }
978
979 close TXT;
980# print NUM "$symbol{lc($colend)}\n";# jrm21
981 print NUM "$symbol{$colend}\n";
982 close NUM;
983
984 # Save thesaurus data in one convienient file
985 if ($thesaurus) {
986
987 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
988
989
990 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
991 if ($verbosity > 1);
992
993 # Read the thesaurus terms
994 my ($num, $text, %thes_symbols);
995
996 open(TH, "<$thesaurus_terms");
997 while(<TH>) {
998 chomp;
999 @words = split(/\s+/, $_);
1000 $num = shift @words;
1001 $text = "";
1002
1003 # translate words into symbol numbers
1004 foreach $word (@words) {
1005 $word = lc($word);
1006 if ($symbol{$word}) {
1007 $text .= "s$symbol{$word} ";
1008 } elsif ($verbosity) {
1009 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1010 }
1011 }
1012 $text =~ s/ $//;
1013 $thes_symbols{$num} = $text;
1014 }
1015 close TH;
1016
1017 # Read the thesaurus links and write the corresponding data
1018 open(TH, "<$thesaurus_links");
1019 open(THOUT, ">$thesaurusfile");
1020
1021 while(<TH>) {
1022 chomp;
1023 ($num, $text) = split(/:/, $_);
1024
1025 if (defined($thes_symbols{$num})) {
1026 print THOUT "$num:$thes_symbols{$num}:$text\n";
1027 } else {
1028 print THOUT "$num:untranslated:$text\n";
1029 }
1030 }
1031 close TH;
1032 close THOUT;
1033 }
1034
1035
1036
1037
1038}
1039
1040
1041# renumber_phrases
1042#
1043# Prepare the phrases file to be input to mgpp. The biggest problem is
1044# reconciling the phrase identifiers used by the suffix program (which
1045# we'll call suffix-id numbers) with the numbers used in the thesaurus
1046# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1047# Phind-id numbers must be sorted by frequency of occurance.
1048#
1049# Start creating a set of phind-id numbers from the sorted suffix-id
1050# numbers and (if required) the thesaurus-id numbers. Then add any other
1051# phrases occuring in the thesaurus.
1052#
1053# The last thing we have to do is restore the vocabulary information to the
1054# phrase file so that the phrases are stored as words, not as symbol
1055# numbers.
1056
1057# The original phrases file looks something like this:
1058# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1059# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1060# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1061# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1062
1063
1064sub renumber_phrases {
1065 my ($self) = @_;
1066
1067 renumber_suffix_data($self);
1068 renumber_thesaurus_data($self);
1069 restore_vocabulary_data($self);
1070
1071}
1072
1073
1074
1075# renumber_suffix_data
1076#
1077# Translate phrases file to phrases.2 using phind keys instead
1078# of suffix keys and sorting the expansion data.
1079
1080sub renumber_suffix_data {
1081 my ($self) = @_;
1082
1083 my $verbosity = $self->{'verbosity'};
1084 my $out = $self->{'outhandle'};
1085 print $out "Translate phrases: suffix-ids become phind-id's\n"
1086 if ($verbosity);
1087
1088 my $phinddir = $self->{'phinddir'};
1089 my $infile = &util::filename_cat($phinddir, 'phrases');
1090 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1091
1092 # Read the phrase file. Calculate initial set of phind-id
1093 # numbers and store (suffixid -> frequency) relation.
1094
1095 my %suffixtophind;
1096 my @phindfrequency;
1097 my (@fields, $suffixid);
1098 my $nextphind = 1;
1099
1100 open(IN, "<$infile");
1101 while(<IN>) {
1102
1103 chomp;
1104 @fields = split(/:/, $_);
1105
1106 # get next suffixid and phindid
1107 $suffixid = shift @fields;
1108 $suffixtophind{$suffixid} = $nextphind;
1109
1110 # store total frequency
1111 shift @fields;
1112 $totalfrequency[$nextphind] = shift @fields;
1113
1114 $nextphind++;
1115 }
1116 close IN;
1117
1118
1119 # Translate phrases file to phrases.2. Use phind keys (not suffix
1120 # keys), sort expansion and document occurance data in order of
1121 # descending frequency..
1122 open(IN, "<$infile");
1123 open(OUT, ">$outfile");
1124
1125 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1126 my (@documwents, @newexp, $k, $n);
1127 my $linenumber = 0;
1128
1129 while(<IN>) {
1130
1131 # read the line
1132 chomp;
1133 @fields = split(/:/, $_);
1134
1135 # get a phrase number for this line
1136 $suffixid = shift @fields;
1137 die unless (defined($suffixtophind{$suffixid}));
1138 $phindid = $suffixtophind{$suffixid};
1139
1140 # get the symbols in the phrase
1141 $text = shift @fields;
1142
1143 # output status information
1144 $linenumber++;
1145 if ($verbosity > 2) {
1146 if ($linenumber % 1000 == 0) {
1147 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1148 }
1149 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1150 }
1151
1152 # get the phrase frequency
1153 $tf = shift @fields;
1154
1155 # get the number of expansions
1156 $countexp = shift @fields;
1157
1158 # get the expansions, convert them into phind-id numbers, and sort them
1159 $expansions = shift @fields;
1160 @newexp = ();
1161 foreach $k (split(/,/, $expansions)) {
1162 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1163 $n = $suffixtophind{$k};
1164 push @newexp, $n;
1165 }
1166 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1167
1168 # get the number of documents
1169 $countdocs = shift @fields;
1170
1171 # get the documents and sort them
1172 $documents = shift @fields;
1173 $documents =~ s/d//g;
1174 @documents = split(/;/, $documents);
1175 @documents = sort by_doc_frequency @documents;
1176
1177 # output the phrase data
1178 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1179 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1180
1181 }
1182
1183 close IN;
1184 close OUT;
1185}
1186
1187
1188# renumber_thesaurus_data
1189#
1190# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1191
1192sub renumber_thesaurus_data {
1193 my ($self) = @_;
1194
1195 my $out = $self->{'outhandle'};
1196 my $verbosity = $self->{'verbosity'};
1197 my $thesaurus = $self->{'thesaurus'};
1198
1199 my $phinddir = $self->{'phinddir'};
1200 my $infile = &util::filename_cat($phinddir, "phrases.2");
1201 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1202
1203
1204 # If no thesaurus is defined, simply move the phrases file.
1205 if (!$thesaurus) {
1206 print $out "Translate phrases.2: no thesaurus data\n"
1207 if ($verbosity);
1208 &util::mv($infile, $outfile);
1209 return;
1210 }
1211
1212 print $out "Translate phrases.2: add thesaurus data\n"
1213 if ($verbosity);
1214
1215 # 1.
1216 # Read thesaurus file and store (symbols->thesaurusid) mapping
1217 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1218 my %symbolstothesid;
1219 my (@fields, $thesid, $symbols);
1220
1221 open(TH, "<$thesaurusfile");
1222
1223 while (<TH>) {
1224
1225 chomp;
1226 @fields = split(/:/, $_);
1227
1228 # get id and text
1229 $thesid = shift @fields;
1230 $symbols = shift @fields;
1231 $symbolstothesid{$symbols} = $thesid;
1232 }
1233 close TH;
1234
1235 # 2.
1236 # Read phrases file to find thesaurus entries that already
1237 # have a phindid. Store their phind-ids for later translation,
1238 # and store their frequency for later sorting.
1239 my %thesaurustophindid;
1240 my %phindidtofrequency;
1241 my ($phindid, $freq);
1242
1243 open(IN, "<$infile");
1244
1245 while(<IN>) {
1246
1247 chomp;
1248 @fields = split(/:/, $_);
1249
1250 # phindid and symbols for this line
1251 $phindid = shift @fields;
1252 $symbols = shift @fields;
1253 $freq = shift @fields;
1254
1255 # do we have a thesaurus id corresponding to this phrase?
1256 if (defined($symbolstothesid{$symbols})) {
1257 $thesid = $symbolstothesid{$symbols};
1258 $thesaurustophindid{$thesid} = $phindid;
1259 $phindidtofrequency{$phindid} = $freq;
1260 }
1261 }
1262 close IN;
1263
1264 undef %symbolstothesid;
1265
1266 # 3.
1267 # Create phind-id numbers for remaining thesaurus entries,
1268 # and note that their frequency is 0 for later sorting.
1269 my $nextphindid = $phindid + 1;
1270
1271 open(TH, "<$thesaurusfile");
1272 while(<TH>) {
1273
1274 chomp;
1275 @fields = split(/:/, $_);
1276
1277 # read thesaurus-id and ensure it has a corresponding phind-id
1278 $thesid = shift @fields;
1279 if (!defined($thesaurustophindid{$thesid})) {
1280 $thesaurustophindid{$thesid} = $nextphindid;
1281 $phindidtofrequency{$nextphindid} = 0;
1282 $nextphindid++;
1283 }
1284 }
1285 close TH;
1286
1287 # 4.
1288 # Translate thesaurus file, replacing thesaurus-id numbers with
1289 # phind-id numbers.
1290 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1291 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1292 my (@links, $linkid, %linkidtotype, $newrelation);
1293
1294 open(TH, "<$thesaurusfile");
1295 open(TO, ">$newthesaurusfile");
1296 while(<TH>) {
1297
1298 chomp;
1299 @fields = split(/:/, $_);
1300
1301 # phindid and symbols for this line
1302 ($thesid, $symbols, $relations) = @fields;
1303
1304 die unless ($thesid && $symbols);
1305 die unless $thesaurustophindid{$thesid};
1306 $phindid = $thesaurustophindid{$thesid};
1307
1308 # convert each part of the relation string to use phind-id numbers
1309 # at the same time, we want to sort the list by frequency.
1310 undef %linkidtotype;
1311
1312 foreach $linktext (split(/;/, $relations)) {
1313 @linkdata = split(/,/, $linktext);
1314
1315 # remember the linktype (e.g. BT, NT)
1316 $linktype = shift @linkdata;
1317
1318 # store the type of each link
1319 foreach $thesid (@linkdata) {
1320 die unless (defined($thesaurustophindid{$thesid}));
1321 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1322 }
1323 }
1324
1325 # sort the list of links, first by frequency, then by type.
1326 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1327 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1328 $linkcounter = (scalar @links);
1329
1330 # create a string describing the link information
1331 $linktype = $linkidtotype{$links[0]};
1332 $newrelation = $linktype;
1333 foreach $linkid (@links) {
1334 if ($linkidtotype{$linkid} ne $linktype) {
1335 $linktype = $linkidtotype{$linkid};
1336 $newrelation .= ";" . $linktype;
1337 }
1338 $newrelation .= "," . $linkid;
1339 }
1340 $newrelation .= ";";
1341
1342
1343 # output the new line
1344 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1345 }
1346 close TH;
1347 close TO;
1348
1349 undef %thesaurustophindid;
1350 undef %linkidtotype;
1351 undef %phindidtofrequency;
1352
1353 # 5.
1354 # Read thesaurus data (in phind-id format) into memory
1355 my %thesaurusdata;
1356
1357 open(TH, "<$newthesaurusfile");
1358 while(<TH>) {
1359 chomp;
1360 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1361 die unless ($phindid && $symbols);
1362 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1363 }
1364 close TH;
1365
1366 # 6.
1367 # Add thesaurus data to phrases file
1368 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1369 my (@documwents, @newexp, $k, $n);
1370 my $linenumber = 0;
1371
1372 open(IN, "<$infile");
1373 open(OUT, ">$outfile");
1374
1375 # Update existing phrases
1376 while(<IN>) {
1377
1378 chomp;
1379 @fields = split(/:/, $_);
1380
1381 # get data for this line
1382 $phindid = shift @fields;
1383
1384 # output the phrase data, with thesaurus information
1385 print OUT "$phindid:", join(":", @fields);
1386
1387 # add thesaurus data
1388 if (defined($thesaurusdata{$phindid})) {
1389 @fields = split(/:/, $thesaurusdata{$phindid});
1390 shift @fields;
1391 $linkcounter = shift @fields;
1392 $relations = shift @fields;
1393
1394 print OUT ":$linkcounter:$relations";
1395 $thesaurusdata{$phindid} = "";
1396 }
1397 print OUT "\n";
1398 }
1399 close IN;
1400
1401 # Add phrases that aren't already in the file
1402 foreach $phindid (sort numerically keys %thesaurusdata) {
1403 next unless ($thesaurusdata{$phindid});
1404
1405 @fields = split(/:/, $thesaurusdata{$phindid});
1406 $symbols = shift @fields;
1407 $linkcounter = shift @fields;
1408 $relations = shift @fields;
1409
1410 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1411 }
1412 close OUT;
1413
1414}
1415
1416# restore_vocabulary_data
1417#
1418# Read phrases.3 and restore vocabulary information. Then write
1419# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1420# (if requested) to the saved phrases file.
1421
1422sub restore_vocabulary_data {
1423 my ($self) = @_;
1424
1425 my $out = $self->{'outhandle'};
1426 my $verbosity = $self->{'verbosity'};
1427 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1428
1429 my $phinddir = $self->{'phinddir'};
1430 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1431 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1432 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1433 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1434
1435 my $savephrases = $self->{'savephrases'};
1436
1437 # 1.
1438 # Read the vocabulary file
1439 open(V, "<$vocabfile")
1440 || die "Cannot open $vocabfile: $!";
1441 my @symbol;
1442 my $i = 1;
1443 while(<V>) {
1444 chomp;
1445 $symbol[$i++] = $_;
1446 }
1447 close V;
1448
1449 # 2.
1450 # Translate phrases.3 to MGPP input files
1451 my ($key, $text, $word, $isThesaurus);
1452 my @fields;
1453 my $linenumber = 0;
1454
1455 open(IN, "<$infile");
1456 open(DATA, ">$datafile");
1457 open(WORD, ">$wordfile");
1458
1459 # Save the phrases in a separate text file
1460 if ($savephrases) {
1461 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1462 open(SAVE, ">$savephrases");
1463 }
1464
1465 while(<IN>) {
1466
1467 # read the line
1468 chomp;
1469 $line = $_;
1470 @fields = split(/:/, $line);
1471
1472 # get a phrase number for this line
1473 $key = shift @fields;
1474
1475 # restore the text of the phrase
1476 $text = shift @fields;
1477 $text =~ s/s(\d+)/$symbol[$1]/g;
1478 if ($text =~ / /) {
1479 $word = "";
1480 } elsif ($text ne 'untranslated') {
1481 $word = $text;
1482 }
1483
1484 # output the phrase data
1485 print DATA "<Document>";
1486 print DATA "$key:$text:", join(":", @fields), ":\n";
1487
1488 # output the word index search data
1489 print WORD "<Document>$word\n";
1490
1491 # output the phrases to a text file
1492 if ($savephrases) {
1493 if ((scalar @fields) == 7) {
1494 $isThesaurus = 1;
1495 } else {
1496 $isThesaurus = 0;
1497 }
1498 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1499 }
1500 }
1501 close IN;
1502 close WORD;
1503 close DATA;
1504 close SAVE if ($savephrases);
1505
1506}
1507
1508
1509
1510# sort routines used to renumber phrases
1511
1512sub numerically { $a <=> $b }
1513
1514sub by_doc_frequency {
1515 my $fa = 1;
1516 if ($a =~ /,/) {
1517 $fa = $a;
1518 $fa =~ s/\d+,//;
1519 }
1520 my $fb = 1;
1521 if ($b =~ /,/) {
1522 $fb = $b;
1523 $fb =~ s/\d+,//;
1524 }
1525
1526 return ($fb <=> $fa);
1527}
1528
15291;
Note: See TracBrowser for help on using the repository browser.