source: trunk/gsdl/perllib/classify/Phind.pm@ 7023

Last change on this file since 7023 was 6989, checked in by kjdon, 20 years ago

changed the language option to type string so you can have more than one lang when configuring it in GLI

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41my @removedirs = ();
42
43my %wanted_index_files = ('td'=>1,
44 't'=>1,
45 'ti'=>1,
46 'tl'=>1,
47 'tsd'=>1,
48 'idb'=>1,
49 'ib1'=>1,
50 'ib2'=>1,
51 'ib3'=>1,
52 'i'=>1,
53 'il'=>1,
54 'w'=>1,
55 'wa'=>1);
56
57sub BEGIN {
58 @ISA = ('BasClas');
59}
60
61sub END {
62
63 # Tidy up stray files - we do this here as there's some weird problem
64 # preventing us from doing it in the get_classify_info() function (on
65 # windows at least) where the close() appears to fail on txthandle and
66 # dochandle, thus preventing us from deleting those files
67
68 foreach my $dir (@removedirs) {
69 if (-d $dir && opendir (DIR, $dir)) {
70 my @files = readdir DIR;
71 closedir DIR;
72
73 foreach $file (@files) {
74 next if $file =~ /^\.\.?$/;
75 my ($suffix) = $file =~ /\.([^\.]+)$/;
76 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
77 # delete it!
78 &util::rm (&util::filename_cat ($dir, $file));
79 }
80 }
81 }
82 }
83}
84
85my $arguments =
86 [ { 'name' => "text",
87 'desc' => "{Phind.text}",
88 'type' => "string",
89 'deft' => "section:Title,section:text",
90 'reqd' => "no" },
91 { 'name' => "title",
92 'desc' => "{Phind.title}",
93 'type' => "metadata",
94 'deft' => "Title",
95 'reqd' => "no" },
96 { 'name' => "buttonname",
97 'desc' => "{BasClas.buttonname}",
98 'type' => "string",
99 'deft' => "Phrase",
100 'reqd' => "no" },
101 { 'name' => "language",
102 'desc' => "{Phind.language}",
103 'type' => "string",
104 'deft' => "en",
105 'reqd' => "no" },
106 { 'name' => "savephrases",
107 'desc' => "{Phind.savephrases}",
108 'type' => "string",
109 'deft' => "",
110 'reqd' => "no" },
111 { 'name' => "suffixmode",
112 'desc' => "{Phind.suffixmode}",
113 'type' => "int",
114 'deft' => "1",
115 'reqd' => "no" },
116 { 'name' => "thesaurus",
117 'desc' => "{Phind.thesaurus}",
118 'type' => "string",
119 'deft' => "",
120 'reqd' => "no" },
121 { 'name' => "untidy",
122 'desc' => "{Phind.untidy}",
123 'type' => "flag",
124 'reqd' => "no" } ];
125
126my $options = { 'name' => "Phind",
127 'desc' => "{Phind.desc}",
128 'abstract' => "no",
129 'inherits' => "yes",
130 'args' => $arguments };
131
132
133# Phrase delimiter symbols - these should be abstracted out someplace
134
135my $colstart = "COLLECTIONSTART";
136my $colend = "COLLECTIONEND";
137my $doclimit = "DOCUMENTLIMIT";
138my $senlimit = "SENTENCELIMIT";
139my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
140
141
142# Create a new Phind browser based on collect.cfg
143
144sub new {
145 my $class = shift (@_);
146 my $self = new BasClas($class, @_);
147
148 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
149 my $option_list = $self->{'option_list'};
150 push( @{$option_list}, $options );
151
152 if ($self->{'info_only'}) {
153 # created from classinfo.pl - don't need to parse the arguments
154 return bless $self, $class;
155 }
156
157 my $out = $self->{'outhandle'};
158
159 # Ensure the Phind generate scripts are in place
160 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
161 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
162 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
163
164 if (!(-e $file1)) {
165 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
166 exit(1);
167 }
168
169 # Parse classifier arguments
170 my $builddir = "";
171 if (!parsargv::parse(\@_,
172 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
173 q^title/.*/Title^, \$self->{'titlefield'},
174 q^buttonname/.*/Phrase^, \$self->{'buttonname'},
175 q^language/.*/en^, \$language,
176 q^savephrases/.*/^, \$self->{'savephrases'},
177 q^suffixmode/\d/1^, \$self->{'suffixmode'},
178 q^thesaurus/.*/^, \$self->{'thesaurus'},
179 q^untidy^, \$self->{'untidy'},
180 "allow_extra_options")) {
181
182 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
183 $self->print_txt_usage(""); # Use default resource bundle
184 die "\n";
185 }
186
187 # classifier information
188 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
189
190 # limit languages
191 $self->{'language_exp'} = $language;
192
193 # collection directories
194 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
195
196 if (! defined $self->{'builddir'}) {
197 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
198 }
199
200 $self->{'total'} = 0;
201
202 return bless $self, $class;
203}
204
205
206# Initialise the Phind classifier
207
208sub init {
209 my $self = shift (@_);
210
211 # ensure we have a build directory
212 my $builddir = $self->{'builddir'};
213 die unless (-e "$builddir");
214
215 # create Phind directory
216 my $phnumber = 1;
217 my $phinddir = &util::filename_cat($builddir, "phind1");
218 while (-e "$phinddir") {
219 $phnumber++;
220 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
221 }
222 &util::mk_dir("$phinddir");
223 $self->{'phinddir'} = $phinddir;
224 $self->{'phindnumber'} = $phnumber;
225
226 push(@removedirs, $phinddir) unless $self->{'untidy'};
227
228 # open filehandles for documents and text
229 my $clausefile = &util::filename_cat("$phinddir", "clauses");
230 &util::rm($clausefile) if (-e $clausefile);
231
232 my $txthandle = 'TEXT' . $phnumber;
233 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
234 $self->{'txthandle'} = $txthandle;
235
236 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
237 &util::rm($docfile) if (-e $docfile);
238
239 my $dochandle = 'DOC' . $phnumber;
240 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
241 $self->{'dochandle'} = $dochandle;
242
243}
244
245
246# Classify each document.
247#
248# Each document is passed here in turn. The classifier extracts the
249# text of each and stores it in the clauses file. Document details are
250# stored in the docs.txt file.
251
252sub classify {
253 my $self = shift (@_);
254 my $doc_obj = shift @_;
255
256 my $verbosity = $self->{'verbosity'};
257 my $top_section = $doc_obj->get_top_section();
258
259 my $titlefield = $self->{'titlefield'};
260
261 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
262 if (!defined($title)) {
263 $title = "";
264 print STDERR "Phind: document has no title\n";
265 }
266 print "process: $title\n" if ($verbosity > 2);
267
268 # Only consider the file if it is in the correct language
269 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
270 my $phrlanguage = $self->{'language_exp'};
271 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
272
273 # record this file
274 $self->{'total'} ++;
275 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
276
277
278 # Store document details
279 my $OID = $doc_obj->get_OID();
280 $OID = "NULL" unless defined $OID;
281 my $dochandle = $self->{'dochandle'};
282 print $dochandle "<Document>\t$OID\t$title\n";
283
284 # Store the text occuring in this object
285
286 # output the document delimiter
287 my $txthandle = $self->{'txthandle'};
288 print $txthandle "$doclimit\n";
289
290 # iterarate over the required indexes and store their text
291 my $indexes = $self->{'indexes'};
292 my $text = "";
293 my ($part, $level, $field, $section, $data, $dataref);
294
295 foreach $part (split(/,/, $indexes)) {
296
297 # Each field has a level and a data element ((e.g. document:Title)
298 ($level, $field) = split(/:/, $part);
299 die unless ($level && $field);
300
301 # Extract the text from every section
302 # (In phind, document:text and section:text are equivalent)
303 if ($field eq "text") {
304 $data = "";
305 $section = $doc_obj->get_top_section();
306 while (defined($section)) {
307 $data .= $doc_obj->get_text($section) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Extract a metadata field from a document
314 # (If there is more than one element of the given type, get them all.)
315 elsif ($level eq "document") {
316 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
317 foreach $data (@$dataref) {
318 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
319 }
320 }
321
322 # Extract metadata from every section in a document
323 elsif ($level eq "section") {
324 $data = "";
325 $section = $doc_obj->get_top_section();
326 while (defined($section)) {
327 $dataref = $doc_obj->get_metadata($section, $field);
328 $data .= join("\n", @$dataref) . "\n";
329 $section = $doc_obj->get_next_section($section);
330 }
331 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
332 }
333
334 # Some sort of specification which I don't understand
335 else {
336 die "Unknown level ($level) in Phind index ($part)\n";
337 }
338
339 }
340
341 # output the text
342 $text =~ tr/\n//s;
343 print $txthandle "$text";
344}
345
346
347# Construct the classifier from the information already gathered
348#
349# When get_classify_info is called, the clauses and docs.txt files have
350# already been constructed in the Phind directory. This function will
351# translate them into compressed, indexed MGPP files that can be read by
352# the phindcgi script. It will also register our classifier so that it
353# shows up in the navigation bar.
354
355sub get_classify_info {
356 my $self = shift (@_);
357 my ($gli) = @_;
358
359 close $self->{'dochandle'};
360 close $self->{'txthandle'};
361 my $verbosity = $self->{'verbosity'};
362 my $out = $self->{'outhandle'};
363 my $phinddir = $self->{'phinddir'};
364
365 my $osextra = "";
366 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
367 $osextra = " -d /";
368 }
369
370 print STDERR "</Stage>\n" if $gli;
371
372 if ($verbosity) {
373 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
374 print $out "*** in ", $self->{'phinddir'}, "\n";
375 }
376
377 print STDERR "<Stage name='Phind'>\n" if $gli;
378
379 # Construct phind indexes
380 my $suffixmode = $self->{'suffixmode'};
381 my ($command, $status);
382
383 # Generate the vocabulary, symbol statistics, and numbers file
384 # from the clauses file
385 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
386 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
387 &extract_vocabulary($self);
388
389 # Use the suffix program to generate the phind/phrases file
390 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
391 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
392 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
393
394 # check that we generated some files. It's not necessarily an error if
395 # we didn't (execute() would have quit on error), but we can't go on.
396 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
397 if (! -r $phrasesfile) {
398 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
399 print $out "\nNo phrases found for Phind classifier!\n";
400 return;
401 }
402
403 # Create the phrase file and put phrase numbers in phind/phrases
404 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
405 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
406 &renumber_phrases($self);
407
408 print $out "\nCreating phrase databases\n";
409 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
410 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
411 my $mg_stem = &util::filename_cat($phinddir, "pdata");
412
413 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
414 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
415 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
416
417 # create the mg index of words
418 print $out "\nCreating word-level search indexes\n";
419 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
420 $mg_input = &util::filename_cat($phinddir, "pword.txt");
421 $mg_stem = &util::filename_cat($phinddir, "pword");
422
423 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
424 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
425 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
427 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
429
430 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
431 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
432 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
433
434 # create the mg document information database
435 print $out "\nCreating document information databases\n";
436 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
437 $mg_input = &util::filename_cat($phinddir, "docs.txt");
438 $mg_stem = &util::filename_cat($phinddir, "docs");
439
440 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
441 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
442 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
443
444 # Return the information about the classifier that we'll later want to
445 # use to create macros when the Phind classifier document is displayed.
446 my %classifyinfo = ('thistype'=>'Invisible',
447 'childtype'=>'Phind',
448 'Title'=>$self->{'buttonname'},
449 'parameters'=>"phindnumber=$self->{'phindnumber'}",
450 'contains'=>[]);
451
452 my $collection = $self->{'collection'};
453 my $url = "library?a=p&p=phind&c=$collection";
454 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
455
456 return \%classifyinfo;
457}
458
459
460
461sub convert_gml_to_tokens {
462
463 my ($language_exp, $text) = @_;
464
465 # escape any magic words... - jrm21
466 foreach my $delim (@delimiters) {
467 my $replacement=lc($delim);
468 my $num= $text=~ s/$delim/$replacement/g;
469 if (!$num) {$num=0;}
470 }
471
472 if ($language_exp =~ /en/) {
473 return &convert_gml_to_tokens_EN($text);
474 }
475
476 if ($language_exp =~ /zh/) {
477 return &convert_gml_to_tokens_ZH($text);
478 }
479
480 $_ = $text;
481
482 # 1. remove GML tags
483
484 # Remove everything that is in a tag
485 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
486 s/\s*<br>\s*/ LINEBREAK /isgo;
487 s/<[^>]*>/ /sgo;
488
489 # Now we have the text, but it may contain HTML
490 # elements coded as &gt; etc. Remove these tags.
491 s/&amp;/&/sgo;
492 s/&lt;/</sgo;
493 s/&gt;/>/sgo;
494 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
495 s/\s*<br>\s*/ LINEBREAK /isgo;
496 s/<[^>]*>/ /sgo;
497
498 # replace<p> and <br> placeholders with clause break symbol (\n)
499 s/\s+/ /gso;
500 s/PARAGRAPHBREAK/\n/sgo;
501 s/LINEBREAK/\n/sgo;
502
503
504
505
506 # 2. Split the remaining text into space-delimited tokens
507
508 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
509 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
510
511 # Split text at word boundaries
512 s/\b/ /go;
513
514 # 3. Convert the remaining text to "clause format"
515
516 # Insert newline if the end of a sentence is detected
517 # (delimter is: "[\.\?\!]\s")
518 # s/\s*[\.\?\!]\s+/\n/go;
519
520 # remove unnecessary punctuation and replace with clause break symbol (\n)
521 s/[^\w ]/\n/go;
522
523 # remove extraneous whitespace
524 s/ +/ /sgo;
525 s/^\s+//mgo;
526 s/\s*$/\n/mgo;
527
528 # remove lines that contain one word or less
529 s/^\S*$//mgo;
530 s/^\s*$//mgo;
531 tr/\n//s;
532
533 return $_;
534}
535
536# a chinese version
537sub convert_gml_to_tokens_ZH {
538
539 $_ = shift @_;
540
541 # Replace all whitespace with a simple space
542 s/\s+/ /gs;
543 # Remove everything that is in a tag
544 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
545 s/\s*<br>\s*/ LINEBREAK /isg;
546 s/<[^>]*>/ /sg;
547
548 # Now we have the text, but it may contain HTML
549 # elements coded as &gt; etc. Remove these tags.
550 s/&lt;/</sg;
551 s/&gt;/>/sg;
552
553 s/\s+/ /sg;
554 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
555 s/\s*<br>\s*/ LINEBREAK /isg;
556 s/<[^>]*>/ /sg;
557
558 # remove &amp; and other miscellaneous markup tags
559 s/&amp;/&/sg;
560 s/&lt;/</sg;
561 s/&gt;/>/sg;
562 s/&amp;/&/sg;
563
564 # replace<p> and <br> placeholders with carriage returns
565 s/PARAGRAPHBREAK/\n/sg;
566 s/LINEBREAK/\n/sg;
567
568
569# print STDERR "text:$_\n";
570 return $_;
571}
572
573# A version of convert_gml_to_tokens that is fine-tuned to the English language.
574
575sub convert_gml_to_tokens_EN {
576 $_ = shift @_;
577
578 # FIRST, remove GML tags
579
580 # Replace all whitespace with a simple space
581 s/\s+/ /gs;
582
583 # Remove everything that is in a tag
584 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
585 s/\s*<br>\s*/ LINEBREAK /isg;
586 s/<[^>]*>/ /sg;
587
588 # Now we have the text, but it may contain HTML
589 # elements coded as &gt; etc. Remove these tags.
590 s/&lt;/</sg;
591 s/&gt;/>/sg;
592
593 s/\s+/ /sg;
594 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
595 s/\s*<br>\s*/ LINEBREAK /isg;
596 s/<[^>]*>/ /sg;
597
598 # remove &amp; and other miscellaneous markup tags
599 s/&amp;/&/sg;
600 s/&lt;/</sg;
601 s/&gt;/>/sg;
602 s/&amp;/&/sg;
603
604 # replace<p> and <br> placeholders with carriage returns
605 s/PARAGRAPHBREAK/\n/sg;
606 s/LINEBREAK/\n/sg;
607
608
609 # Exceptional punctuation
610 #
611 # We make special cases of some punctuation
612
613 # remove any apostrophe that indicates omitted letters
614 s/(\w+)\'(\w*\s)/ $1$2 /g;
615
616 # remove period that appears in a person's initals
617 s/\s([A-Z])\./ $1 /g;
618
619 # replace hyphens in hypheanted words and names with a space
620 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
621
622 # Convert the remaining text to "clause format",
623 # This means removing all excess punctuation and garbage text,
624 # normalising valid punctuation to fullstops and commas,
625 # then putting one cluse on each line.
626
627 # Insert newline when the end of a sentence is detected
628 # (delimter is: "[\.\?\!]\s")
629 s/\s*[\.\?\!]\s+/\n/g;
630
631 # split numbers after four digits
632 s/(\d\d\d\d)/$1 /g;
633
634 # split words after 32 characters
635
636 # squash repeated punctuation
637 tr/A-Za-z0-9 //cs;
638
639 # save email addresses
640 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
641
642 # normalise clause breaks (mostly punctuation symbols) to commas
643 s/[^A-Za-z0-9 \n]+/ , /g;
644
645 # Remove repeated commas, and replace with newline
646 s/\s*,[, ]+/\n/g;
647
648 # remove extra whitespace
649 s/ +/ /sg;
650 s/^\s+//mg;
651 s/\s*$/\n/mg;
652
653 # remove lines that contain one word or less
654 s/^\w*$//mg;
655 s/^\s*$//mg;
656 tr/\n//s;
657
658 return $_;
659
660}
661
662
663
664# Execute a system command
665
666sub execute {
667 my ($command, $verbosity, $outhandle) = @_;
668 print $outhandle "Executing: $command\n" if ($verbosity > 2);
669 $! = 0;
670 my $status = system($command);
671 if ($status != 0) {
672 print STDERR "Phind - Error executing '$command': $!\n";
673 exit($status); # this causes the build to fail...
674 }
675}
676
677
678# Generate the vocabulary, symbol statistics, and numbers file from the
679# clauses file. This is legacy code, so is a bit messy and probably wont
680# run under windows.
681
682sub extract_vocabulary {
683 my ($self) = @_;
684
685 my $verbosity = $self->{'verbosity'};
686 my $out = $self->{'outhandle'};
687
688 my $collectiondir = $self->{'collectiondir'};
689 my $phinddir = $self->{'phinddir'};
690
691 my $language_exp = $self->{'language_exp'};
692
693 my ($w, $l, $line, $word);
694
695 my ($first_delimiter, $last_delimiter,
696 $first_stopword, $last_stopword,
697 $first_extractword, $last_extractword,
698 $first_contentword, $last_contentword,
699 $phrasedelimiter);
700
701 my $thesaurus = $self->{'thesaurus'};
702 my ($thesaurus_links, $thesaurus_terms,
703 %thesaurus, $first_thesaurusword, $last_thesaurusword);
704
705 my %symbol;
706 my (%freq);
707
708 print $out "Calculating vocabulary\n" if ($verbosity > 1);
709
710 # Read and store the stopwords
711 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
712 my $stopword_files = ();
713 my ($language, $language_dir, $file, $file_name);
714 my %stopwords;
715
716 # Examine each directory in the stopword directory
717 opendir(STOPDIR, $stopdir);
718 foreach $language (readdir STOPDIR) {
719
720 # Ignore entries that do not match the classifier's language
721 next unless ($language =~ /$language_exp/);
722 $language_dir = &util::filename_cat($stopdir, $language);
723 next unless (-d "$language_dir");
724
725 opendir(LANGDIR, $language_dir);
726 foreach $file (readdir LANGDIR) {
727
728 # Ignore entries that are not stopword files
729 next unless ($file =~ /sw$/);
730 $file_name = &util::filename_cat($language_dir, $file);
731 next unless (-f "$file_name");
732
733 # Read the stopwords
734 open(STOPFILE, "<$file_name");
735 while (<STOPFILE>) {
736 s/^\s+//;
737 s/\s.*//;
738 $word = $_;
739 $l = lc($word);
740 $stopwords{$l} = $word;
741 }
742 close STOPFILE;
743
744 }
745 closedir LANGDIR;
746 }
747 closedir STOPDIR;
748
749 # Read thesaurus information
750 if ($thesaurus) {
751
752 # link file exists
753 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
754 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
755
756 # ensure term file exists in the correct language
757 if ($language_exp =~ /^([a-z][a-z])/) {
758 $language = $1;
759 } else {
760 $language = 'en';
761 }
762 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
763 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
764
765
766 # Read the thesaurus terms
767 open(TH, "<$thesaurus_terms");
768 while(<TH>) {
769 s/^\d+ //;
770 s/\(.*\)//;
771 foreach $w (split(/\s+/, $_)) {
772 $thesaurus{lc($w)} = $w;
773 }
774 }
775 close TH;
776 }
777
778 # Read words in the text and count occurences
779 open(TXT, "<$phinddir/clauses");
780
781 my @words;
782 while(<TXT>) {
783 $line = $_;
784 next unless ($line =~ /./);
785
786 @words = split(/\s+/, $line);
787 foreach $w (@words) {
788 $l = lc($w);
789 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
790 $freq{$w}++;
791 }
792 $freq{$senlimit}++;
793 }
794
795 close TXT;
796
797 # Calculate the "best" form of each word
798 my (%bestform, %totalfreq, %bestfreq);
799
800 foreach $w (sort (keys %freq)) {
801 $l = lc($w);
802
803 # totalfreq is the number of times a term appears in any form
804 $totalfreq{$l} += $freq{$w};
805
806 if (defined $stopwords{$l}) {
807 $bestform{$l} = $stopwords{$l};
808
809 } elsif (defined $thesaurus{$l}) {
810 $bestform{$l} = $thesaurus{$l};
811
812 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
813 $bestfreq{$l} = $freq{$w};
814 $bestform{$l} = $w;
815 }
816 }
817 undef %freq;
818 undef %bestfreq;
819
820
821 # Assign symbol numbers to tokens
822 my $nextsymbol = 1;
823 my (@vocab);
824
825 # Delimiters
826 $first_delimiter = 1;
827
828 foreach $word (@delimiters) {
829
830# $word = lc($word); # jrm21
831 $word = uc($word);
832 $bestform{$word} = $word;
833 $vocab[$nextsymbol] = $word;
834 $symbol{$word} = $nextsymbol;
835 $nextsymbol++;
836 }
837 $last_delimiter = $nextsymbol - 1;
838 # Stopwords
839 $first_stopword = $nextsymbol;
840
841 foreach my $word (sort keys %stopwords) {
842 # don't include stopword unless it occurs in the text
843 $word = lc($word);
844 next unless ($totalfreq{$word});
845 next if ($symbol{$word});
846
847 $vocab[$nextsymbol] = $word;
848 $symbol{$word} = $nextsymbol;
849 $nextsymbol++;
850 }
851 $last_stopword = $nextsymbol - 1;
852 $first_contentword = $nextsymbol;
853
854 # Thesaurus terms
855 if ($thesaurus) {
856 $first_thesaurusword = $nextsymbol;
857
858 foreach my $word (sort keys %thesaurus) {
859
860 $word = lc($word);
861 next if ($symbol{$word});
862 $bestform{$word} = $thesaurus{$word};
863
864 $vocab[$nextsymbol] = $word;
865 $symbol{$word} = $nextsymbol;
866 $nextsymbol++;
867
868 }
869 $last_thesaurusword = $nextsymbol - 1;
870 }
871
872 # Other content words
873 $first_extractword = $nextsymbol;
874
875 foreach my $word (sort (keys %bestform)) {
876
877 next if ($symbol{$word});
878
879 $vocab[$nextsymbol] = $word;
880 $symbol{$word} = $nextsymbol;
881 $nextsymbol++;
882 }
883 $last_extractword = $nextsymbol - 1;
884 $last_contentword = $nextsymbol - 1;
885
886 # Outut the words
887 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
888 open(VOC, ">$phinddir/clauses.vocab");
889
890 for (my $i = 1; $i < $nextsymbol; $i++) {
891 $w = $vocab[$i];
892
893 print VOC "$bestform{$w}\n";
894 $totalfreq{$w} = 0 unless ($totalfreq{$w});
895 }
896 close VOC;
897
898
899 # Create statistics file
900 # Output statistics about the vocablary
901 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
902 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
903
904 open(STAT, ">$phinddir/clauses.stats")
905 || die "Cannot open $phinddir/clauses.stats: $!";
906
907 print STAT "first_delimiter $first_delimiter\n";
908 print STAT "last_delimiter $last_delimiter\n";
909 print STAT "first_stopword $first_stopword\n";
910 print STAT "last_stopword $last_stopword\n";
911 if ($thesaurus) {
912 print STAT "first_thesaurusword $first_thesaurusword\n";
913 print STAT "last_thesaurusword $last_thesaurusword\n";
914 }
915 print STAT "first_extractword $first_extractword\n";
916 print STAT "last_extractword $last_extractword\n";
917 print STAT "first_contentword $first_contentword\n";
918 print STAT "last_contentword $last_contentword\n";
919 print STAT "first_symbol $first_delimiter\n";
920 print STAT "last_symbol $last_contentword\n";
921 print STAT "first_word $first_stopword\n";
922 print STAT "last_word $last_contentword\n";
923 close STAT;
924
925 undef @vocab;
926
927
928 # Create numbers file
929 # Save text as symbol numbers
930 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
931
932 open(TXT, "<$phinddir/clauses");
933 open(NUM, ">$phinddir/clauses.numbers");
934
935## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
936## print NUM "$symbol{lc($colstart)}\n"; # jrm21
937 $phrasedelimiter = $symbol{$senlimit};
938 print NUM "$symbol{$colstart}\n";
939
940 # set up the special symbols that delimit documents and sentences
941 while(<TXT>) {
942
943 # split sentence into a list of tokens
944 $line = $_;
945 next unless ($line =~ /./);
946 @words = split(/\s+/, $line);
947
948 # output one token at a time
949 foreach $word (@words) {
950# don't lower-case special delimiters - jrm21
951 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
952 $word = lc($word);
953 }
954 print NUM "$symbol{$word}\n";
955 }
956
957 # output phrase delimiter
958 print NUM "$phrasedelimiter\n";
959 }
960
961 close TXT;
962# print NUM "$symbol{lc($colend)}\n";# jrm21
963 print NUM "$symbol{$colend}\n";
964 close NUM;
965
966 # Save thesaurus data in one convienient file
967 if ($thesaurus) {
968
969 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
970
971
972 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
973 if ($verbosity > 1);
974
975 # Read the thesaurus terms
976 my ($num, $text, %thes_symbols);
977
978 open(TH, "<$thesaurus_terms");
979 while(<TH>) {
980 chomp;
981 @words = split(/\s+/, $_);
982 $num = shift @words;
983 $text = "";
984
985 # translate words into symbol numbers
986 foreach $word (@words) {
987 $word = lc($word);
988 if ($symbol{$word}) {
989 $text .= "s$symbol{$word} ";
990 } elsif ($verbosity) {
991 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
992 }
993 }
994 $text =~ s/ $//;
995 $thes_symbols{$num} = $text;
996 }
997 close TH;
998
999 # Read the thesaurus links and write the corresponding data
1000 open(TH, "<$thesaurus_links");
1001 open(THOUT, ">$thesaurusfile");
1002
1003 while(<TH>) {
1004 chomp;
1005 ($num, $text) = split(/:/, $_);
1006
1007 if (defined($thes_symbols{$num})) {
1008 print THOUT "$num:$thes_symbols{$num}:$text\n";
1009 } else {
1010 print THOUT "$num:untranslated:$text\n";
1011 }
1012 }
1013 close TH;
1014 close THOUT;
1015 }
1016
1017
1018
1019
1020}
1021
1022
1023# renumber_phrases
1024#
1025# Prepare the phrases file to be input to mgpp. The biggest problem is
1026# reconciling the phrase identifiers used by the suffix program (which
1027# we'll call suffix-id numbers) with the numbers used in the thesaurus
1028# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1029# Phind-id numbers must be sorted by frequency of occurance.
1030#
1031# Start creating a set of phind-id numbers from the sorted suffix-id
1032# numbers and (if required) the thesaurus-id numbers. Then add any other
1033# phrases occuring in the thesaurus.
1034#
1035# The last thing we have to do is restore the vocabulary information to the
1036# phrase file so that the phrases are stored as words, not as symbol
1037# numbers.
1038
1039# The original phrases file looks something like this:
1040# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1041# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1042# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1043# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1044
1045
1046sub renumber_phrases {
1047 my ($self) = @_;
1048
1049 renumber_suffix_data($self);
1050 renumber_thesaurus_data($self);
1051 restore_vocabulary_data($self);
1052
1053}
1054
1055
1056
1057# renumber_suffix_data
1058#
1059# Translate phrases file to phrases.2 using phind keys instead
1060# of suffix keys and sorting the expansion data.
1061
1062sub renumber_suffix_data {
1063 my ($self) = @_;
1064
1065 my $verbosity = $self->{'verbosity'};
1066 my $out = $self->{'outhandle'};
1067 print $out "Translate phrases: suffix-ids become phind-id's\n"
1068 if ($verbosity);
1069
1070 my $phinddir = $self->{'phinddir'};
1071 my $infile = &util::filename_cat($phinddir, 'phrases');
1072 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1073
1074 # Read the phrase file. Calculate initial set of phind-id
1075 # numbers and store (suffixid -> frequency) relation.
1076
1077 my %suffixtophind;
1078 my @phindfrequency;
1079 my (@fields, $suffixid);
1080 my $nextphind = 1;
1081
1082 open(IN, "<$infile");
1083 while(<IN>) {
1084
1085 chomp;
1086 @fields = split(/:/, $_);
1087
1088 # get next suffixid and phindid
1089 $suffixid = shift @fields;
1090 $suffixtophind{$suffixid} = $nextphind;
1091
1092 # store total frequency
1093 shift @fields;
1094 $totalfrequency[$nextphind] = shift @fields;
1095
1096 $nextphind++;
1097 }
1098 close IN;
1099
1100
1101 # Translate phrases file to phrases.2. Use phind keys (not suffix
1102 # keys), sort expansion and document occurance data in order of
1103 # descending frequency..
1104 open(IN, "<$infile");
1105 open(OUT, ">$outfile");
1106
1107 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1108 my (@documwents, @newexp, $k, $n);
1109 my $linenumber = 0;
1110
1111 while(<IN>) {
1112
1113 # read the line
1114 chomp;
1115 @fields = split(/:/, $_);
1116
1117 # get a phrase number for this line
1118 $suffixid = shift @fields;
1119 die unless (defined($suffixtophind{$suffixid}));
1120 $phindid = $suffixtophind{$suffixid};
1121
1122 # get the symbols in the phrase
1123 $text = shift @fields;
1124
1125 # output status information
1126 $linenumber++;
1127 if ($verbosity > 2) {
1128 if ($linenumber % 1000 == 0) {
1129 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1130 }
1131 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1132 }
1133
1134 # get the phrase frequency
1135 $tf = shift @fields;
1136
1137 # get the number of expansions
1138 $countexp = shift @fields;
1139
1140 # get the expansions, convert them into phind-id numbers, and sort them
1141 $expansions = shift @fields;
1142 @newexp = ();
1143 foreach $k (split(/,/, $expansions)) {
1144 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1145 $n = $suffixtophind{$k};
1146 push @newexp, $n;
1147 }
1148 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1149
1150 # get the number of documents
1151 $countdocs = shift @fields;
1152
1153 # get the documents and sort them
1154 $documents = shift @fields;
1155 $documents =~ s/d//g;
1156 @documents = split(/;/, $documents);
1157 @documents = sort by_doc_frequency @documents;
1158
1159 # output the phrase data
1160 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1161 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1162
1163 }
1164
1165 close IN;
1166 close OUT;
1167}
1168
1169
1170# renumber_thesaurus_data
1171#
1172# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1173
1174sub renumber_thesaurus_data {
1175 my ($self) = @_;
1176
1177 my $out = $self->{'outhandle'};
1178 my $verbosity = $self->{'verbosity'};
1179 my $thesaurus = $self->{'thesaurus'};
1180
1181 my $phinddir = $self->{'phinddir'};
1182 my $infile = &util::filename_cat($phinddir, "phrases.2");
1183 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1184
1185
1186 # If no thesaurus is defined, simply move the phrases file.
1187 if (!$thesaurus) {
1188 print $out "Translate phrases.2: no thesaurus data\n"
1189 if ($verbosity);
1190 &util::mv($infile, $outfile);
1191 return;
1192 }
1193
1194 print $out "Translate phrases.2: add thesaurus data\n"
1195 if ($verbosity);
1196
1197 # 1.
1198 # Read thesaurus file and store (symbols->thesaurusid) mapping
1199 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1200 my %symbolstothesid;
1201 my (@fields, $thesid, $symbols);
1202
1203 open(TH, "<$thesaurusfile");
1204
1205 while (<TH>) {
1206
1207 chomp;
1208 @fields = split(/:/, $_);
1209
1210 # get id and text
1211 $thesid = shift @fields;
1212 $symbols = shift @fields;
1213 $symbolstothesid{$symbols} = $thesid;
1214 }
1215 close TH;
1216
1217 # 2.
1218 # Read phrases file to find thesaurus entries that already
1219 # have a phindid. Store their phind-ids for later translation,
1220 # and store their frequency for later sorting.
1221 my %thesaurustophindid;
1222 my %phindidtofrequency;
1223 my ($phindid, $freq);
1224
1225 open(IN, "<$infile");
1226
1227 while(<IN>) {
1228
1229 chomp;
1230 @fields = split(/:/, $_);
1231
1232 # phindid and symbols for this line
1233 $phindid = shift @fields;
1234 $symbols = shift @fields;
1235 $freq = shift @fields;
1236
1237 # do we have a thesaurus id corresponding to this phrase?
1238 if (defined($symbolstothesid{$symbols})) {
1239 $thesid = $symbolstothesid{$symbols};
1240 $thesaurustophindid{$thesid} = $phindid;
1241 $phindidtofrequency{$phindid} = $freq;
1242 }
1243 }
1244 close IN;
1245
1246 undef %symbolstothesid;
1247
1248 # 3.
1249 # Create phind-id numbers for remaining thesaurus entries,
1250 # and note that their frequency is 0 for later sorting.
1251 my $nextphindid = $phindid + 1;
1252
1253 open(TH, "<$thesaurusfile");
1254 while(<TH>) {
1255
1256 chomp;
1257 @fields = split(/:/, $_);
1258
1259 # read thesaurus-id and ensure it has a corresponding phind-id
1260 $thesid = shift @fields;
1261 if (!defined($thesaurustophindid{$thesid})) {
1262 $thesaurustophindid{$thesid} = $nextphindid;
1263 $phindidtofrequency{$nextphindid} = 0;
1264 $nextphindid++;
1265 }
1266 }
1267 close TH;
1268
1269 # 4.
1270 # Translate thesaurus file, replacing thesaurus-id numbers with
1271 # phind-id numbers.
1272 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1273 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1274 my (@links, $linkid, %linkidtotype, $newrelation);
1275
1276 open(TH, "<$thesaurusfile");
1277 open(TO, ">$newthesaurusfile");
1278 while(<TH>) {
1279
1280 chomp;
1281 @fields = split(/:/, $_);
1282
1283 # phindid and symbols for this line
1284 ($thesid, $symbols, $relations) = @fields;
1285
1286 die unless ($thesid && $symbols);
1287 die unless $thesaurustophindid{$thesid};
1288 $phindid = $thesaurustophindid{$thesid};
1289
1290 # convert each part of the relation string to use phind-id numbers
1291 # at the same time, we want to sort the list by frequency.
1292 undef %linkidtotype;
1293
1294 foreach $linktext (split(/;/, $relations)) {
1295 @linkdata = split(/,/, $linktext);
1296
1297 # remember the linktype (e.g. BT, NT)
1298 $linktype = shift @linkdata;
1299
1300 # store the type of each link
1301 foreach $thesid (@linkdata) {
1302 die unless (defined($thesaurustophindid{$thesid}));
1303 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1304 }
1305 }
1306
1307 # sort the list of links, first by frequency, then by type.
1308 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1309 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1310 $linkcounter = (scalar @links);
1311
1312 # create a string describing the link information
1313 $linktype = $linkidtotype{$links[0]};
1314 $newrelation = $linktype;
1315 foreach $linkid (@links) {
1316 if ($linkidtotype{$linkid} ne $linktype) {
1317 $linktype = $linkidtotype{$linkid};
1318 $newrelation .= ";" . $linktype;
1319 }
1320 $newrelation .= "," . $linkid;
1321 }
1322 $newrelation .= ";";
1323
1324
1325 # output the new line
1326 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1327 }
1328 close TH;
1329 close TO;
1330
1331 undef %thesaurustophindid;
1332 undef %linkidtotype;
1333 undef %phindidtofrequency;
1334
1335 # 5.
1336 # Read thesaurus data (in phind-id format) into memory
1337 my %thesaurusdata;
1338
1339 open(TH, "<$newthesaurusfile");
1340 while(<TH>) {
1341 chomp;
1342 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1343 die unless ($phindid && $symbols);
1344 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1345 }
1346 close TH;
1347
1348 # 6.
1349 # Add thesaurus data to phrases file
1350 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1351 my (@documwents, @newexp, $k, $n);
1352 my $linenumber = 0;
1353
1354 open(IN, "<$infile");
1355 open(OUT, ">$outfile");
1356
1357 # Update existing phrases
1358 while(<IN>) {
1359
1360 chomp;
1361 @fields = split(/:/, $_);
1362
1363 # get data for this line
1364 $phindid = shift @fields;
1365
1366 # output the phrase data, with thesaurus information
1367 print OUT "$phindid:", join(":", @fields);
1368
1369 # add thesaurus data
1370 if (defined($thesaurusdata{$phindid})) {
1371 @fields = split(/:/, $thesaurusdata{$phindid});
1372 shift @fields;
1373 $linkcounter = shift @fields;
1374 $relations = shift @fields;
1375
1376 print OUT ":$linkcounter:$relations";
1377 $thesaurusdata{$phindid} = "";
1378 }
1379 print OUT "\n";
1380 }
1381 close IN;
1382
1383 # Add phrases that aren't already in the file
1384 foreach $phindid (sort numerically keys %thesaurusdata) {
1385 next unless ($thesaurusdata{$phindid});
1386
1387 @fields = split(/:/, $thesaurusdata{$phindid});
1388 $symbols = shift @fields;
1389 $linkcounter = shift @fields;
1390 $relations = shift @fields;
1391
1392 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1393 }
1394 close OUT;
1395
1396}
1397
1398# restore_vocabulary_data
1399#
1400# Read phrases.3 and restore vocabulary information. Then write
1401# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1402# (if requested) to the saved phrases file.
1403
1404sub restore_vocabulary_data {
1405 my ($self) = @_;
1406
1407 my $out = $self->{'outhandle'};
1408 my $verbosity = $self->{'verbosity'};
1409 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1410
1411 my $phinddir = $self->{'phinddir'};
1412 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1413 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1414 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1415 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1416
1417 my $savephrases = $self->{'savephrases'};
1418
1419 # 1.
1420 # Read the vocabulary file
1421 open(V, "<$vocabfile")
1422 || die "Cannot open $vocabfile: $!";
1423 my @symbol;
1424 my $i = 1;
1425 while(<V>) {
1426 chomp;
1427 $symbol[$i++] = $_;
1428 }
1429 close V;
1430
1431 # 2.
1432 # Translate phrases.3 to MGPP input files
1433 my ($key, $text, $word, $isThesaurus);
1434 my @fields;
1435 my $linenumber = 0;
1436
1437 open(IN, "<$infile");
1438 open(DATA, ">$datafile");
1439 open(WORD, ">$wordfile");
1440
1441 # Save the phrases in a separate text file
1442 if ($savephrases) {
1443 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1444 open(SAVE, ">$savephrases");
1445 }
1446
1447 while(<IN>) {
1448
1449 # read the line
1450 chomp;
1451 $line = $_;
1452 @fields = split(/:/, $line);
1453
1454 # get a phrase number for this line
1455 $key = shift @fields;
1456
1457 # restore the text of the phrase
1458 $text = shift @fields;
1459 $text =~ s/s(\d+)/$symbol[$1]/g;
1460 if ($text =~ / /) {
1461 $word = "";
1462 } elsif ($text ne 'untranslated') {
1463 $word = $text;
1464 }
1465
1466 # output the phrase data
1467 print DATA "<Document>";
1468 print DATA "$key:$text:", join(":", @fields), ":\n";
1469
1470 # output the word index search data
1471 print WORD "<Document>$word\n";
1472
1473 # output the phrases to a text file
1474 if ($savephrases) {
1475 if ((scalar @fields) == 7) {
1476 $isThesaurus = 1;
1477 } else {
1478 $isThesaurus = 0;
1479 }
1480 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1481 }
1482 }
1483 close IN;
1484 close WORD;
1485 close DATA;
1486 close SAVE if ($savephrases);
1487
1488}
1489
1490
1491
1492# sort routines used to renumber phrases
1493
1494sub numerically { $a <=> $b }
1495
1496sub by_doc_frequency {
1497 my $fa = 1;
1498 if ($a =~ /,/) {
1499 $fa = $a;
1500 $fa =~ s/\d+,//;
1501 }
1502 my $fb = 1;
1503 if ($b =~ /,/) {
1504 $fb = $b;
1505 $fb =~ s/\d+,//;
1506 }
1507
1508 return ($fb <=> $fa);
1509}
1510
15111;
Note: See TracBrowser for help on using the repository browser.