source: trunk/gsdl/perllib/classify/Phind.pm@ 10253

Last change on this file since 10253 was 10253, checked in by kjdon, 19 years ago

added 'use strict' to all classifiers, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage data structure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and vice versa
43
44my @removedirs = ();
45
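# File suffixes of the mgpp index files we want to keep when the END block
# below tidies up a phind directory; any other file left in the directory
# is deleted.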
46my %wanted_index_files = ('td'=>1,
47 't'=>1,
48 'ti'=>1,
49 'tl'=>1,
50 'tsd'=>1,
51 'idb'=>1,
52 'ib1'=>1,
53 'ib2'=>1,
54 'ib3'=>1,
55 'i'=>1,
56 'il'=>1,
57 'w'=>1,
58 'wa'=>1);
59
60sub BEGIN {
61 @Phind::ISA = ('BasClas');
62}
63
64sub END {
65
66 # Tidy up stray files - we do this here as there's some weird problem
67 # preventing us from doing it in the get_classify_info() function (on
68 # windows at least) where the close() appears to fail on txthandle and
69 # dochandle, thus preventing us from deleting those files
70
71 foreach my $dir (@removedirs) {
72 if (-d $dir && opendir (DIR, $dir)) {
73 my @files = readdir DIR;
74 closedir DIR;
75
76 foreach my $file (@files) {
77 next if $file =~ /^\.\.?$/;
78 my ($suffix) = $file =~ /\.([^\.]+)$/;
79 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
80 # delete it!
81 &util::rm (&util::filename_cat ($dir, $file));
82 }
83 }
84 }
85 }
86}
87
88my $arguments =
89 [ { 'name' => "text",
90 'desc' => "{Phind.text}",
91 'type' => "string",
92 'deft' => "section:Title,section:text",
93 'reqd' => "no" },
94 { 'name' => "title",
95 'desc' => "{Phind.title}",
96 'type' => "metadata",
97 'deft' => "Title",
98 'reqd' => "no" },
99 { 'name' => "buttonname",
100 'desc' => "{BasClas.buttonname}",
101 'type' => "string",
102 'deft' => "Phrase",
103 'reqd' => "no" },
104 { 'name' => "language",
105 'desc' => "{Phind.language}",
106 'type' => "string",
107 'deft' => "en",
108 'reqd' => "no" },
109 { 'name' => "savephrases",
110 'desc' => "{Phind.savephrases}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "suffixmode",
115 'desc' => "{Phind.suffixmode}",
116 'type' => "int",
117 'deft' => "1",
118 'range' => "0,1",
119 'reqd' => "no" },
120 { 'name' => "min_occurs",
121 'desc' => "{Phind.min_occurs}",
122 'type' => "int",
123 'deft' => "2",
124 'range' => "1,",
125 'reqd' => "no" },
126 { 'name' => "thesaurus",
127 'desc' => "{Phind.thesaurus}",
128 'type' => "string",
129 'deft' => "",
130 'reqd' => "no" },
131 { 'name' => "untidy",
132 'desc' => "{Phind.untidy}",
133 'type' => "flag",
134 'reqd' => "no" } ];
135
136my $options = { 'name' => "Phind",
137 'desc' => "{Phind.desc}",
138 'abstract' => "no",
139 'inherits' => "yes",
140 'args' => $arguments };
141
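# An illustrative collect.cfg entry (a sketch only -- the option values shown
# are simply the defaults from the argument list above, not requirements):
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase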
142
143# Phrase delimiter symbols - these should be abstracted out someplace
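# These sentinel tokens mark collection, document and sentence boundaries in
# the clauses stream. convert_gml_to_tokens() lower-cases any copies that
# happen to occur in the document text so they cannot be mistaken for the
# real sentinels, and extract_vocabulary() assigns them the first symbol
# numbers.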
144
145my $colstart = "COLLECTIONSTART";
146my $colend = "COLLECTIONEND";
147my $doclimit = "DOCUMENTLIMIT";
148my $senlimit = "SENTENCELIMIT";
149my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
150
151
152# Create a new Phind browser based on collect.cfg
153
154sub new {
155 my ($class) = shift (@_);
156 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157 push(@$classifierslist, $class);
158
159 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
160 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
161
162 my $self = (defined $hashArgOptLists)? new BasClas($classifierslist,$inputargs,$hashArgOptLists): new BasClas($classifierslist,$inputargs);
163
164 if ($self->{'info_only'}) {
165 # don't worry about any options etc
166 return bless $self, $class;
167 }
168
169 # Ensure the Phind generate scripts are in place
170 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173 if (!(-e $file1)) {
174 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175 exit(1);
176 }
177
178# Transfer the value from Auto Parsing to the variable name used in previous Greenstone versions.
179 $self->{"indexes"} = $self->{"text"};
180
181 # Further setup
182 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
183 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
184 if (! defined $self->{'builddir'}) {
185 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
186 }
187 $self->{'total'} = 0;
188
189 # Clean out the unused keys
190 delete $self->{"text"};
191
192 return bless $self, $class;
193}
194
195
196# Initialise the Phind classifier
197
198sub init {
199 my $self = shift (@_);
200
201 # ensure we have a build directory
202 my $builddir = $self->{'builddir'};
203 die unless (-e "$builddir");
204
205 # create Phind directory
206 my $phnumber = 1;
207 my $phinddir = &util::filename_cat($builddir, "phind1");
208 while (-e "$phinddir") {
209 $phnumber++;
210 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
211 }
212 &util::mk_dir("$phinddir");
213 $self->{'phinddir'} = $phinddir;
214 $self->{'phindnumber'} = $phnumber;
215
216 push(@removedirs, $phinddir) unless $self->{'untidy'};
217
218 # open filehandles for documents and text
219 my $clausefile = &util::filename_cat("$phinddir", "clauses");
220 &util::rm($clausefile) if (-e $clausefile);
221
222 my $txthandle = 'TEXT' . $phnumber;
223 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
224 $self->{'txthandle'} = $txthandle;
225
226 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
227 &util::rm($docfile) if (-e $docfile);
228
229 my $dochandle = 'DOC' . $phnumber;
230 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
231 $self->{'dochandle'} = $dochandle;
232
233}
234
235
236# Classify each document.
237#
238# Each document is passed here in turn. The classifier extracts the
239# text of each and stores it in the clauses file. Document details are
240# stored in the docs.txt file.
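# Each document contributes one tab-separated line to docs.txt of the form
#   <Document>\tOID\ttitle
# (OID and title stand in for the document's real identifier and title).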
241
242sub classify {
243 my $self = shift (@_);
244 my $doc_obj = shift @_;
245
246 my $verbosity = $self->{'verbosity'};
247 my $top_section = $doc_obj->get_top_section();
248
249 my $titlefield = $self->{'title'};
250
251 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
252 if (!defined($title)) {
253 $title = "";
254 print STDERR "Phind: document has no title\n";
255 }
256 print "process: $title\n" if ($verbosity > 2);
257
258 # Only consider the file if it is in the correct language
259 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
260 my $phrlanguage = $self->{'language'};
261 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
262
263 # record this file
264 $self->{'total'} ++;
265 # what is $file ???
266 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
267
268 # Store document details
269 my $OID = $doc_obj->get_OID();
270 $OID = "NULL" unless defined $OID;
271 my $dochandle = $self->{'dochandle'};
272 print $dochandle "<Document>\t$OID\t$title\n";
273
274 # Store the text occurring in this object
275
276 # output the document delimiter
277 my $txthandle = $self->{'txthandle'};
278 print $txthandle "$doclimit\n";
279
280 # iterate over the required indexes and store their text
281 my $indexes = $self->{'indexes'};
282 my $text = "";
283 my ($part, $level, $field, $section, $data, $dataref);
284
285 foreach $part (split(/,/, $indexes)) {
286
287 # Each field has a level and a data element (e.g. document:Title)
288 ($level, $field) = split(/:/, $part);
289 die unless ($level && $field);
290
291 # Extract the text from every section
292 # (In phind, document:text and section:text are equivalent)
293 if ($field eq "text") {
294 $data = "";
295 $section = $doc_obj->get_top_section();
296 while (defined($section)) {
297 $data .= $doc_obj->get_text($section) . "\n";
298 $section = $doc_obj->get_next_section($section);
299 }
300 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
301 }
302
303 # Extract a metadata field from a document
304 # (If there is more than one element of the given type, get them all.)
305 elsif ($level eq "document") {
306 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
307 foreach $data (@$dataref) {
308 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
309 }
310 }
311
312 # Extract metadata from every section in a document
313 elsif ($level eq "section") {
314 $data = "";
315 $section = $doc_obj->get_top_section();
316 while (defined($section)) {
317 $dataref = $doc_obj->get_metadata($section, $field);
318 $data .= join("\n", @$dataref) . "\n";
319 $section = $doc_obj->get_next_section($section);
320 }
321 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
322 }
323
324 # Some sort of specification which I don't understand
325 else {
326 die "Unknown level ($level) in Phind index ($part)\n";
327 }
328
329 }
330
331 # output the text
332 $text =~ tr/\n//s;
333 print $txthandle "$text";
334}
335
336
337# Construct the classifier from the information already gathered
338#
339# When get_classify_info is called, the clauses and docs.txt files have
340# already been constructed in the Phind directory. This function will
341# translate them into compressed, indexed MGPP files that can be read by
342# the phindcgi script. It will also register our classifier so that it
343# shows up in the navigation bar.
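# In outline (as implemented below): extract_vocabulary() writes the
# clauses.vocab, clauses.stats and clauses.numbers files; the external
# "suffix" program produces the phrases file; renumber_phrases() rewrites it
# as phrases.2 and phrases.3 and restores the vocabulary into pdata.txt and
# pword.txt; finally the mgpp_* tools build the pdata, pword and docs
# databases that phindcgi reads.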
344
345sub get_classify_info {
346 my $self = shift (@_);
347 my ($gli) = @_;
348
349 close $self->{'dochandle'};
350 close $self->{'txthandle'};
351 my $verbosity = $self->{'verbosity'};
352 my $out = $self->{'outhandle'};
353 my $phinddir = $self->{'phinddir'};
354
355 my $osextra = "";
356 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
357 $osextra = " -d /";
358 }
359
360 print STDERR "</Stage>\n" if $gli;
361
362 if ($verbosity) {
363 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
364 print $out "*** in ", $self->{'phinddir'}, "\n";
365 }
366
367 print STDERR "<Stage name='Phind'>\n" if $gli;
368
369 # Construct phind indexes
370 my $suffixmode = $self->{'suffixmode'};
371 my $min_occurs = $self->{'min_occurs'};
372 my ($command, $status);
373
374 # Generate the vocabulary, symbol statistics, and numbers file
375 # from the clauses file
376 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
377 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
378 &extract_vocabulary($self);
379
380 # Use the suffix program to generate the phind/phrases file
381 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
382 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
383 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
384
385 # check that we generated some files. It's not necessarily an error if
386 # we didn't (execute() would have quit on error), but we can't go on.
387 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
388 if (! -r $phrasesfile) {
389 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
390 print $out "\nNo phrases found for Phind classifier!\n";
391 return;
392 }
393
394 # Create the phrase file and put phrase numbers in phind/phrases
395 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
396 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
397 &renumber_phrases($self);
398
399 print $out "\nCreating phrase databases\n";
400 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
401 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
402 my $mg_stem = &util::filename_cat($phinddir, "pdata");
403
404 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
405 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
406 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
407
408 # create the mg index of words
409 print $out "\nCreating word-level search indexes\n";
410 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
411 $mg_input = &util::filename_cat($phinddir, "pword.txt");
412 $mg_stem = &util::filename_cat($phinddir, "pword");
413
414 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
415 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
416 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
417 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
418 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
419 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
420
421 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
422 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
423 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
424
425 # create the mg document information database
426 print $out "\nCreating document information databases\n";
427 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
428 $mg_input = &util::filename_cat($phinddir, "docs.txt");
429 $mg_stem = &util::filename_cat($phinddir, "docs");
430
431 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
432 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
433 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
434
435 # Return the information about the classifier that we'll later want to
436 # use to create macros when the Phind classifier document is displayed.
437 my %classifyinfo = ('thistype'=>'Invisible',
438 'childtype'=>'Phind',
439 'Title'=>$self->{'buttonname'},
440 'parameters'=>"phindnumber=$self->{'phindnumber'}",
441 'contains'=>[]);
442
443 my $collection = $self->{'collection'};
444 my $url = "library?a=p&p=phind&c=$collection";
445 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
446
447 return \%classifyinfo;
448}
449
450
451
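# Convert a clause of (possibly GML/HTML marked-up) text into the token
# stream stored in the clauses file. Dispatches to the English or Chinese
# variants below when the language expression matches "en" or "zh", and
# falls back to the generic tokeniser otherwise.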
452sub convert_gml_to_tokens {
453
454 my ($language_exp, $text) = @_;
455
456 # escape any magic words... - jrm21
457 foreach my $delim (@delimiters) {
458 my $replacement=lc($delim);
459 my $num= $text=~ s/$delim/$replacement/g;
460 if (!$num) {$num=0;}
461 }
462
463 if ($language_exp =~ /en/) {
464 return &convert_gml_to_tokens_EN($text);
465 }
466
467 if ($language_exp =~ /zh/) {
468 return &convert_gml_to_tokens_ZH($text);
469 }
470
471 $_ = $text;
472
473 # 1. remove GML tags
474
475 # Remove everything that is in a tag
476 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
477 s/\s*<br>\s*/ LINEBREAK /isgo;
478 s/<[^>]*>/ /sgo;
479
480 # Now we have the text, but it may contain HTML
481 # elements coded as &gt; etc. Remove these tags.
482 s/&amp;/&/sgo;
483 s/&lt;/</sgo;
484 s/&gt;/>/sgo;
485 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
486 s/\s*<br>\s*/ LINEBREAK /isgo;
487 s/<[^>]*>/ /sgo;
488
489 # replace <p> and <br> placeholders with the clause break symbol (\n)
490 s/\s+/ /gso;
491 s/PARAGRAPHBREAK/\n/sgo;
492 s/LINEBREAK/\n/sgo;
493
494
495
496
497 # 2. Split the remaining text into space-delimited tokens
498
499 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
500 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
501
502 # Split text at word boundaries
503 s/\b/ /go;
504
505 # 3. Convert the remaining text to "clause format"
506
507 # Insert newline if the end of a sentence is detected
508 # (delimiter is: "[\.\?\!]\s")
509 # s/\s*[\.\?\!]\s+/\n/go;
510
511 # remove unnecessary punctuation and replace with clause break symbol (\n)
512 s/[^\w ]/\n/go;
513
514 # remove extraneous whitespace
515 s/ +/ /sgo;
516 s/^\s+//mgo;
517 s/\s*$/\n/mgo;
518
519 # remove lines that contain one word or less
520 s/^\S*$//mgo;
521 s/^\s*$//mgo;
522 tr/\n//s;
523
524 return $_;
525}
526
527# a chinese version
528sub convert_gml_to_tokens_ZH {
529
530 $_ = shift @_;
531
532 # Replace all whitespace with a simple space
533 s/\s+/ /gs;
534 # Remove everything that is in a tag
535 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
536 s/\s*<br>\s*/ LINEBREAK /isg;
537 s/<[^>]*>/ /sg;
538
539 # Now we have the text, but it may contain HTML
540 # elements coded as &gt; etc. Remove these tags.
541 s/&lt;/</sg;
542 s/&gt;/>/sg;
543
544 s/\s+/ /sg;
545 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
546 s/\s*<br>\s*/ LINEBREAK /isg;
547 s/<[^>]*>/ /sg;
548
549 # remove &amp; and other miscellaneous markup tags
550 s/&amp;/&/sg;
551 s/&lt;/</sg;
552 s/&gt;/>/sg;
553 s/&amp;/&/sg;
554
555 # replace <p> and <br> placeholders with newlines
556 s/PARAGRAPHBREAK/\n/sg;
557 s/LINEBREAK/\n/sg;
558
559
560# print STDERR "text:$_\n";
561 return $_;
562}
563
564# A version of convert_gml_to_tokens that is fine-tuned to the English language.
565
566sub convert_gml_to_tokens_EN {
567 $_ = shift @_;
568
569 # FIRST, remove GML tags
570
571 # Replace all whitespace with a simple space
572 s/\s+/ /gs;
573
574 # Remove everything that is in a tag
575 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
576 s/\s*<br>\s*/ LINEBREAK /isg;
577 s/<[^>]*>/ /sg;
578
579 # Now we have the text, but it may contain HTML
580 # elements coded as &gt; etc. Remove these tags.
581 s/&lt;/</sg;
582 s/&gt;/>/sg;
583
584 s/\s+/ /sg;
585 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
586 s/\s*<br>\s*/ LINEBREAK /isg;
587 s/<[^>]*>/ /sg;
588
589 # remove &amp; and other miscellaneous markup tags
590 s/&amp;/&/sg;
591 s/&lt;/</sg;
592 s/&gt;/>/sg;
593 s/&amp;/&/sg;
594
595 # replace <p> and <br> placeholders with newlines
596 s/PARAGRAPHBREAK/\n/sg;
597 s/LINEBREAK/\n/sg;
598
599
600 # Exceptional punctuation
601 #
602 # We make special cases of some punctuation
603
604 # remove any apostrophe that indicates omitted letters
605 s/(\w+)\'(\w*\s)/ $1$2 /g;
606
607 # remove periods that appear in a person's initials
608 s/\s([A-Z])\./ $1 /g;
609
610 # replace hyphens in hyphenated words and names with a space
611 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
612
613 # Convert the remaining text to "clause format".
614 # This means removing all excess punctuation and garbage text,
615 # normalising valid punctuation to full stops and commas,
616 # then putting one clause on each line.
617
618 # Insert newline when the end of a sentence is detected
619 # (delimiter is: "[\.\?\!]\s")
620 s/\s*[\.\?\!]\s+/\n/g;
621
622 # split numbers after four digits
623 s/(\d\d\d\d)/$1 /g;
624
625 # split words after 32 characters
626
627 # squash repeated punctuation
628 tr/A-Za-z0-9 //cs;
629
630 # save email addresses
631 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
632
633 # normalise clause breaks (mostly punctuation symbols) to commas
634 s/[^A-Za-z0-9 \n]+/ , /g;
635
636 # Remove repeated commas, and replace with newline
637 s/\s*,[, ]+/\n/g;
638
639 # remove extra whitespace
640 s/ +/ /sg;
641 s/^\s+//mg;
642 s/\s*$/\n/mg;
643
644 # remove lines that contain one word or less
645 s/^\w*$//mg;
646 s/^\s*$//mg;
647 tr/\n//s;
648
649 return $_;
650
651}
652
653
654
655# Execute a system command
656
657sub execute {
658 my ($command, $verbosity, $outhandle) = @_;
659 print $outhandle "Executing: $command\n" if ($verbosity > 2);
660 $! = 0;
661 my $status = system($command);
662 if ($status != 0) {
663 print STDERR "Phind - Error executing '$command': $!\n";
664 exit($status); # this causes the build to fail...
665 }
666}
667
668
669# Generate the vocabulary, symbol statistics, and numbers file from the
670# clauses file. This is legacy code, so it is a bit messy and probably won't
671# run under Windows.
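# In brief: read the stopword lists and (optionally) the thesaurus, count
# word frequencies in the clauses file, then write clauses.vocab (one "best
# form" per symbol), clauses.stats (the symbol number ranges) and
# clauses.numbers (the clauses re-expressed as symbol numbers), plus a
# <thesaurus>.numbers file when a thesaurus is in use.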
672
673sub extract_vocabulary {
674 my ($self) = @_;
675
676 my $verbosity = $self->{'verbosity'};
677 my $out = $self->{'outhandle'};
678
679 my $collectiondir = $self->{'collectiondir'};
680 my $phinddir = $self->{'phinddir'};
681
682 my $language_exp = $self->{'language'};
683
684 my ($w, $l, $line, $word);
685
686 my ($first_delimiter, $last_delimiter,
687 $first_stopword, $last_stopword,
688 $first_extractword, $last_extractword,
689 $first_contentword, $last_contentword,
690 $phrasedelimiter);
691
692 my $thesaurus = $self->{'thesaurus'};
693 my ($thesaurus_links, $thesaurus_terms,
694 %thesaurus, $first_thesaurusword, $last_thesaurusword);
695
696 my %symbol;
697 my (%freq);
698
699 print $out "Calculating vocabulary\n" if ($verbosity > 1);
700
701 # Read and store the stopwords
702 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
703 my $stopword_files = ();
704 my ($language, $language_dir, $file, $file_name);
705 my %stopwords;
706
707 # Examine each directory in the stopword directory
708 opendir(STOPDIR, $stopdir);
709 foreach $language (readdir STOPDIR) {
710
711 # Ignore entries that do not match the classifier's language
712 next unless ($language =~ /$language_exp/);
713 $language_dir = &util::filename_cat($stopdir, $language);
714 next unless (-d "$language_dir");
715
716 opendir(LANGDIR, $language_dir);
717 foreach $file (readdir LANGDIR) {
718
719 # Ignore entries that are not stopword files
720 next unless ($file =~ /sw$/);
721 $file_name = &util::filename_cat($language_dir, $file);
722 next unless (-f "$file_name");
723
724 # Read the stopwords
725 open(STOPFILE, "<$file_name");
726 while (<STOPFILE>) {
727 s/^\s+//;
728 s/\s.*//;
729 $word = $_;
730 $l = lc($word);
731 $stopwords{$l} = $word;
732 }
733 close STOPFILE;
734
735 }
736 closedir LANGDIR;
737 }
738 closedir STOPDIR;
739
740 # Read thesaurus information
741 if ($thesaurus) {
742
743 # link file exists
744 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
745 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
746
747 # ensure term file exists in the correct language
748 if ($language_exp =~ /^([a-z][a-z])/) {
749 $language = $1;
750 } else {
751 $language = 'en';
752 }
753 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
754 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
755
756
757 # Read the thesaurus terms
758 open(TH, "<$thesaurus_terms");
759 while(<TH>) {
760 s/^\d+ //;
761 s/\(.*\)//;
762 foreach $w (split(/\s+/, $_)) {
763 $thesaurus{lc($w)} = $w;
764 }
765 }
766 close TH;
767 }
768
769 # Read words in the text and count occurrences
770 open(TXT, "<$phinddir/clauses");
771
772 my @words;
773 while(<TXT>) {
774 $line = $_;
775 next unless ($line =~ /./);
776
777 @words = split(/\s+/, $line);
778 foreach $w (@words) {
779 $l = lc($w);
780 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
781 $freq{$w}++;
782 }
783 $freq{$senlimit}++;
784 }
785
786 close TXT;
787
788 # Calculate the "best" form of each word
789 my (%bestform, %totalfreq, %bestfreq);
790
791 foreach $w (sort (keys %freq)) {
792 $l = lc($w);
793
794 # totalfreq is the number of times a term appears in any form
795 $totalfreq{$l} += $freq{$w};
796
797 if (defined $stopwords{$l}) {
798 $bestform{$l} = $stopwords{$l};
799
800 } elsif (defined $thesaurus{$l}) {
801 $bestform{$l} = $thesaurus{$l};
802
803 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
804 $bestfreq{$l} = $freq{$w};
805 $bestform{$l} = $w;
806 }
807 }
808 undef %freq;
809 undef %bestfreq;
810
811
812 # Assign symbol numbers to tokens
813 my $nextsymbol = 1;
814 my (@vocab);
815
816 # Delimiters
817 $first_delimiter = 1;
818
819 foreach $word (@delimiters) {
820
821# $word = lc($word); # jrm21
822 $word = uc($word);
823 $bestform{$word} = $word;
824 $vocab[$nextsymbol] = $word;
825 $symbol{$word} = $nextsymbol;
826 $nextsymbol++;
827 }
828 $last_delimiter = $nextsymbol - 1;
829 # Stopwords
830 $first_stopword = $nextsymbol;
831
832 foreach my $word (sort keys %stopwords) {
833 # don't include stopword unless it occurs in the text
834 $word = lc($word);
835 next unless ($totalfreq{$word});
836 next if ($symbol{$word});
837
838 $vocab[$nextsymbol] = $word;
839 $symbol{$word} = $nextsymbol;
840 $nextsymbol++;
841 }
842 $last_stopword = $nextsymbol - 1;
843 $first_contentword = $nextsymbol;
844
845 # Thesaurus terms
846 if ($thesaurus) {
847 $first_thesaurusword = $nextsymbol;
848
849 foreach my $word (sort keys %thesaurus) {
850
851 $word = lc($word);
852 next if ($symbol{$word});
853 $bestform{$word} = $thesaurus{$word};
854
855 $vocab[$nextsymbol] = $word;
856 $symbol{$word} = $nextsymbol;
857 $nextsymbol++;
858
859 }
860 $last_thesaurusword = $nextsymbol - 1;
861 }
862
863 # Other content words
864 $first_extractword = $nextsymbol;
865
866 foreach my $word (sort (keys %bestform)) {
867
868 next if ($symbol{$word});
869
870 $vocab[$nextsymbol] = $word;
871 $symbol{$word} = $nextsymbol;
872 $nextsymbol++;
873 }
874 $last_extractword = $nextsymbol - 1;
875 $last_contentword = $nextsymbol - 1;
876
877 # Output the words
878 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
879 open(VOC, ">$phinddir/clauses.vocab");
880
881 for (my $i = 1; $i < $nextsymbol; $i++) {
882 $w = $vocab[$i];
883
884 print VOC "$bestform{$w}\n";
885 $totalfreq{$w} = 0 unless ($totalfreq{$w});
886 }
887 close VOC;
888
889
890 # Create statistics file
891 # Output statistics about the vocabulary
892 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
893 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
894
895 open(STAT, ">$phinddir/clauses.stats")
896 || die "Cannot open $phinddir/clauses.stats: $!";
897
898 print STAT "first_delimiter $first_delimiter\n";
899 print STAT "last_delimiter $last_delimiter\n";
900 print STAT "first_stopword $first_stopword\n";
901 print STAT "last_stopword $last_stopword\n";
902 if ($thesaurus) {
903 print STAT "first_thesaurusword $first_thesaurusword\n";
904 print STAT "last_thesaurusword $last_thesaurusword\n";
905 }
906 print STAT "first_extractword $first_extractword\n";
907 print STAT "last_extractword $last_extractword\n";
908 print STAT "first_contentword $first_contentword\n";
909 print STAT "last_contentword $last_contentword\n";
910 print STAT "first_symbol $first_delimiter\n";
911 print STAT "last_symbol $last_contentword\n";
912 print STAT "first_word $first_stopword\n";
913 print STAT "last_word $last_contentword\n";
914 close STAT;
915
916 undef @vocab;
917
918
919 # Create numbers file
920 # Save text as symbol numbers
921 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
922
923 open(TXT, "<$phinddir/clauses");
924 open(NUM, ">$phinddir/clauses.numbers");
925
926## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
927## print NUM "$symbol{lc($colstart)}\n"; # jrm21
928 $phrasedelimiter = $symbol{$senlimit};
929 print NUM "$symbol{$colstart}\n";
930
931 # set up the special symbols that delimit documents and sentences
932 while(<TXT>) {
933
934 # split sentence into a list of tokens
935 $line = $_;
936 next unless ($line =~ /./);
937 @words = split(/\s+/, $line);
938
939 # output one token at a time
940 foreach $word (@words) {
941# don't lower-case special delimiters - jrm21
942 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
943 $word = lc($word);
944 }
945 print NUM "$symbol{$word}\n";
946 }
947
948 # output phrase delimiter
949 print NUM "$phrasedelimiter\n";
950 }
951
952 close TXT;
953# print NUM "$symbol{lc($colend)}\n";# jrm21
954 print NUM "$symbol{$colend}\n";
955 close NUM;
956
957 # Save thesaurus data in one convenient file
958 if ($thesaurus) {
959
960 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
961
962
963 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
964 if ($verbosity > 1);
965
966 # Read the thesaurus terms
967 my ($num, $text, %thes_symbols);
968
969 open(TH, "<$thesaurus_terms");
970 while(<TH>) {
971 chomp;
972 @words = split(/\s+/, $_);
973 $num = shift @words;
974 $text = "";
975
976 # translate words into symbol numbers
977 foreach $word (@words) {
978 $word = lc($word);
979 if ($symbol{$word}) {
980 $text .= "s$symbol{$word} ";
981 } elsif ($verbosity) {
982 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
983 }
984 }
985 $text =~ s/ $//;
986 $thes_symbols{$num} = $text;
987 }
988 close TH;
989
990 # Read the thesaurus links and write the corresponding data
991 open(TH, "<$thesaurus_links");
992 open(THOUT, ">$thesaurusfile");
993
994 while(<TH>) {
995 chomp;
996 ($num, $text) = split(/:/, $_);
997
998 if (defined($thes_symbols{$num})) {
999 print THOUT "$num:$thes_symbols{$num}:$text\n";
1000 } else {
1001 print THOUT "$num:untranslated:$text\n";
1002 }
1003 }
1004 close TH;
1005 close THOUT;
1006 }
1007
1008
1009
1010
1011}
1012
1013
1014# renumber_phrases
1015#
1016# Prepare the phrases file to be input to mgpp. The biggest problem is
1017# reconciling the phrase identifiers used by the suffix program (which
1018# we'll call suffix-id numbers) with the numbers used in the thesaurus
1019# (thesaurus-id) to create a common set of phind id numbers (phind-id).
1020# Phind-id numbers must be sorted by frequency of occurrence.
1021#
1022# Start creating a set of phind-id numbers from the sorted suffix-id
1023# numbers and (if required) the thesaurus-id numbers. Then add any other
1024# phrases occurring in the thesaurus.
1025#
1026# The last thing we have to do is restore the vocabulary information to the
1027# phrase file so that the phrases are stored as words, not as symbol
1028# numbers.
1029
1030# The original phrases file looks something like this:
1031# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1032# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1033# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1034# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
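# Reading off renumber_suffix_data() below, the colon-separated fields are:
#   suffix-id : symbols : total frequency : expansion count : expansions :
#   document count : document list
# where each document entry has the form "d<docnum>[,<frequency>]".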
1035
1036
1037sub renumber_phrases {
1038 my ($self) = @_;
1039
1040 renumber_suffix_data($self);
1041 renumber_thesaurus_data($self);
1042 restore_vocabulary_data($self);
1043
1044}
1045
1046
1047
1048# renumber_suffix_data
1049#
1050# Translate phrases file to phrases.2 using phind keys instead
1051# of suffix keys and sorting the expansion data.
1052
1053sub renumber_suffix_data {
1054 my ($self) = @_;
1055
1056 my $verbosity = $self->{'verbosity'};
1057 my $out = $self->{'outhandle'};
1058 print $out "Translate phrases: suffix-ids become phind-ids\n"
1059 if ($verbosity);
1060
1061 my $phinddir = $self->{'phinddir'};
1062 my $infile = &util::filename_cat($phinddir, 'phrases');
1063 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1064
1065 # Read the phrase file. Calculate initial set of phind-id
1066 # numbers and store (suffixid -> frequency) relation.
1067
1068 my %suffixtophind;
1069 my @totalfrequency;
1070 my (@fields, $suffixid);
1071 my $nextphind = 1;
1072
1073 open(IN, "<$infile");
1074 while(<IN>) {
1075
1076 chomp;
1077 @fields = split(/:/, $_);
1078
1079 # get next suffixid and phindid
1080 $suffixid = shift @fields;
1081 $suffixtophind{$suffixid} = $nextphind;
1082
1083 # store total frequency
1084 shift @fields;
1085 $totalfrequency[$nextphind] = shift @fields;
1086
1087 $nextphind++;
1088 }
1089 close IN;
1090
1091
1092 # Translate phrases file to phrases.2. Use phind keys (not suffix
1093# keys), sort expansion and document occurrence data in order of
1094# descending frequency.
1095 open(IN, "<$infile");
1096 open(OUT, ">$outfile");
1097
1098 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1099 my (@newexp, $k, $n);
1100 my $linenumber = 0;
1101
1102 while(<IN>) {
1103
1104 # read the line
1105 chomp;
1106 @fields = split(/:/, $_);
1107
1108 # get a phrase number for this line
1109 $suffixid = shift @fields;
1110 die unless (defined($suffixtophind{$suffixid}));
1111 $phindid = $suffixtophind{$suffixid};
1112
1113 # get the symbols in the phrase
1114 $text = shift @fields;
1115
1116 # output status information
1117 $linenumber++;
1118 if ($verbosity > 2) {
1119 if ($linenumber % 1000 == 0) {
1120 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1121 }
1122 # what are $num and $key??
1123 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1124 }
1125
1126 # get the phrase frequency
1127 $tf = shift @fields;
1128
1129 # get the number of expansions
1130 $countexp = shift @fields;
1131
1132 # get the expansions, convert them into phind-id numbers, and sort them
1133 $expansions = shift @fields;
1134 @newexp = ();
1135 foreach $k (split(/,/, $expansions)) {
1136 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1137 $n = $suffixtophind{$k};
1138 push @newexp, $n;
1139 }
1140 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1141
1142 # get the number of documents
1143 $countdocs = shift @fields;
1144
1145 # get the documents and sort them
1146 $documents = shift @fields;
1147 $documents =~ s/d//g;
1148 my @documents = split(/;/, $documents);
1149 @documents = sort by_doc_frequency @documents;
1150
1151 # output the phrase data
1152 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1153 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1154
1155 }
1156
1157 close IN;
1158 close OUT;
1159}
1160
1161
1162# renumber_thesaurus_data
1163#
1164# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1165
1166sub renumber_thesaurus_data {
1167 my ($self) = @_;
1168
1169 my $out = $self->{'outhandle'};
1170 my $verbosity = $self->{'verbosity'};
1171 my $thesaurus = $self->{'thesaurus'};
1172
1173 my $phinddir = $self->{'phinddir'};
1174 my $infile = &util::filename_cat($phinddir, "phrases.2");
1175 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1176
1177
1178 # If no thesaurus is defined, simply move the phrases file.
1179 if (!$thesaurus) {
1180 print $out "Translate phrases.2: no thesaurus data\n"
1181 if ($verbosity);
1182 &util::mv($infile, $outfile);
1183 return;
1184 }
1185
1186 print $out "Translate phrases.2: add thesaurus data\n"
1187 if ($verbosity);
1188
1189 # 1.
1190 # Read thesaurus file and store (symbols->thesaurusid) mapping
1191 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1192 my %symbolstothesid;
1193 my (@fields, $thesid, $symbols);
1194
1195 open(TH, "<$thesaurusfile");
1196
1197 while (<TH>) {
1198
1199 chomp;
1200 @fields = split(/:/, $_);
1201
1202 # get id and text
1203 $thesid = shift @fields;
1204 $symbols = shift @fields;
1205 $symbolstothesid{$symbols} = $thesid;
1206 }
1207 close TH;
1208
1209 # 2.
1210 # Read phrases file to find thesaurus entries that already
1211 # have a phindid. Store their phind-ids for later translation,
1212 # and store their frequency for later sorting.
1213 my %thesaurustophindid;
1214 my %phindidtofrequency;
1215 my ($phindid, $freq);
1216
1217 open(IN, "<$infile");
1218
1219 while(<IN>) {
1220
1221 chomp;
1222 @fields = split(/:/, $_);
1223
1224 # phindid and symbols for this line
1225 $phindid = shift @fields;
1226 $symbols = shift @fields;
1227 $freq = shift @fields;
1228
1229 # do we have a thesaurus id corresponding to this phrase?
1230 if (defined($symbolstothesid{$symbols})) {
1231 $thesid = $symbolstothesid{$symbols};
1232 $thesaurustophindid{$thesid} = $phindid;
1233 $phindidtofrequency{$phindid} = $freq;
1234 }
1235 }
1236 close IN;
1237
1238 undef %symbolstothesid;
1239
1240 # 3.
1241 # Create phind-id numbers for remaining thesaurus entries,
1242 # and note that their frequency is 0 for later sorting.
1243 my $nextphindid = $phindid + 1;
1244
1245 open(TH, "<$thesaurusfile");
1246 while(<TH>) {
1247
1248 chomp;
1249 @fields = split(/:/, $_);
1250
1251 # read thesaurus-id and ensure it has a corresponding phind-id
1252 $thesid = shift @fields;
1253 if (!defined($thesaurustophindid{$thesid})) {
1254 $thesaurustophindid{$thesid} = $nextphindid;
1255 $phindidtofrequency{$nextphindid} = 0;
1256 $nextphindid++;
1257 }
1258 }
1259 close TH;
1260
1261 # 4.
1262 # Translate thesaurus file, replacing thesaurus-id numbers with
1263 # phind-id numbers.
1264 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1265 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1266 my (@links, $linkid, %linkidtotype, $newrelation);
1267
1268 open(TH, "<$thesaurusfile");
1269 open(TO, ">$newthesaurusfile");
1270 while(<TH>) {
1271
1272 chomp;
1273 @fields = split(/:/, $_);
1274
1275 # phindid and symbols for this line
1276 ($thesid, $symbols, $relations) = @fields;
1277
1278 die unless ($thesid && $symbols);
1279 die unless $thesaurustophindid{$thesid};
1280 $phindid = $thesaurustophindid{$thesid};
1281
1282 # convert each part of the relation string to use phind-id numbers
1283 # at the same time, we want to sort the list by frequency.
1284 undef %linkidtotype;
1285
1286 foreach $linktext (split(/;/, $relations)) {
1287 @linkdata = split(/,/, $linktext);
1288
1289 # remember the linktype (e.g. BT, NT)
1290 $linktype = shift @linkdata;
1291
1292 # store the type of each link
1293 foreach $thesid (@linkdata) {
1294 die unless (defined($thesaurustophindid{$thesid}));
1295 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1296 }
1297 }
1298
1299 # sort the list of links, first by frequency, then by type.
1300 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1301 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1302 $linkcounter = (scalar @links);
1303
1304 # create a string describing the link information
1305 $linktype = $linkidtotype{$links[0]};
1306 $newrelation = $linktype;
1307 foreach $linkid (@links) {
1308 if ($linkidtotype{$linkid} ne $linktype) {
1309 $linktype = $linkidtotype{$linkid};
1310 $newrelation .= ";" . $linktype;
1311 }
1312 $newrelation .= "," . $linkid;
1313 }
1314 $newrelation .= ";";
1315
1316
1317 # output the new line
1318 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1319 }
1320 close TH;
1321 close TO;
1322
1323 undef %thesaurustophindid;
1324 undef %linkidtotype;
1325 undef %phindidtofrequency;
1326
1327 # 5.
1328 # Read thesaurus data (in phind-id format) into memory
1329 my %thesaurusdata;
1330
1331 open(TH, "<$newthesaurusfile");
1332 while(<TH>) {
1333 chomp;
1334 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1335 die unless ($phindid && $symbols);
1336 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1337 }
1338 close TH;
1339
1340 # 6.
1341 # Add thesaurus data to phrases file
1342 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1343 my (@newexp, $k, $n);
1344 my $linenumber = 0;
1345
1346 open(IN, "<$infile");
1347 open(OUT, ">$outfile");
1348
1349 # Update existing phrases
1350 while(<IN>) {
1351
1352 chomp;
1353 @fields = split(/:/, $_);
1354
1355 # get data for this line
1356 $phindid = shift @fields;
1357
1358 # output the phrase data, with thesaurus information
1359 print OUT "$phindid:", join(":", @fields);
1360
1361 # add thesaurus data
1362 if (defined($thesaurusdata{$phindid})) {
1363 @fields = split(/:/, $thesaurusdata{$phindid});
1364 shift @fields;
1365 $linkcounter = shift @fields;
1366 $relations = shift @fields;
1367
1368 print OUT ":$linkcounter:$relations";
1369 $thesaurusdata{$phindid} = "";
1370 }
1371 print OUT "\n";
1372 }
1373 close IN;
1374
1375 # Add phrases that aren't already in the file
1376 foreach $phindid (sort numerically keys %thesaurusdata) {
1377 next unless ($thesaurusdata{$phindid});
1378
1379 @fields = split(/:/, $thesaurusdata{$phindid});
1380 $symbols = shift @fields;
1381 $linkcounter = shift @fields;
1382 $relations = shift @fields;
1383
1384 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1385 }
1386 close OUT;
1387
1388}
1389
1390# restore_vocabulary_data
1391#
1392# Read phrases.3 and restore vocabulary information. Then write
1393# this data to the MGPP input files (pword.txt and pdata.txt) and
1394# (if requested) to the saved phrases file.
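# Each pdata.txt record has the form "<Document>phind-id:phrase text:...";
# the matching pword.txt record is "<Document>" plus the phrase text for
# single-word phrases, and is left empty for multi-word phrases, so only
# single words end up word-searchable.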
1395
1396sub restore_vocabulary_data {
1397 my ($self) = @_;
1398
1399 my $out = $self->{'outhandle'};
1400 my $verbosity = $self->{'verbosity'};
1401 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1402
1403 my $phinddir = $self->{'phinddir'};
1404 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1405 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1406 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1407 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1408
1409 my $savephrases = $self->{'savephrases'};
1410
1411 # 1.
1412 # Read the vocabulary file
1413 open(V, "<$vocabfile")
1414 || die "Cannot open $vocabfile: $!";
1415 my @symbol;
1416 my $i = 1;
1417 while(<V>) {
1418 chomp;
1419 $symbol[$i++] = $_;
1420 }
1421 close V;
1422
1423 # 2.
1424 # Translate phrases.3 to MGPP input files
1425 my ($key, $text, $word, $isThesaurus, $line);
1426 my @fields;
1427 my $linenumber = 0;
1428
1429 open(IN, "<$infile");
1430 open(DATA, ">$datafile");
1431 open(WORD, ">$wordfile");
1432
1433 # Save the phrases in a separate text file
1434 if ($savephrases) {
1435 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1436 open(SAVE, ">$savephrases");
1437 }
1438
1439 while(<IN>) {
1440
1441 # read the line
1442 chomp;
1443 $line = $_;
1444 @fields = split(/:/, $line);
1445
1446 # get a phrase number for this line
1447 $key = shift @fields;
1448
1449 # restore the text of the phrase
1450 $text = shift @fields;
1451 $text =~ s/s(\d+)/$symbol[$1]/g;
1452 if ($text =~ / /) {
1453 $word = "";
1454 } elsif ($text ne 'untranslated') {
1455 $word = $text;
1456 }
1457
1458 # output the phrase data
1459 print DATA "<Document>";
1460 print DATA "$key:$text:", join(":", @fields), ":\n";
1461
1462 # output the word index search data
1463 print WORD "<Document>$word\n";
1464
1465 # output the phrases to a text file
1466 if ($savephrases) {
1467 if ((scalar @fields) == 7) {
1468 $isThesaurus = 1;
1469 } else {
1470 $isThesaurus = 0;
1471 }
1472 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1473 }
1474 }
1475 close IN;
1476 close WORD;
1477 close DATA;
1478 close SAVE if ($savephrases);
1479
1480}
1481
1482
1483
1484# sort routines used to renumber phrases
1485
1486sub numerically { $a <=> $b }
1487
1488sub by_doc_frequency {
1489 my $fa = 1;
1490 if ($a =~ /,/) {
1491 $fa = $a;
1492 $fa =~ s/\d+,//;
1493 }
1494 my $fb = 1;
1495 if ($b =~ /,/) {
1496 $fb = $b;
1497 $fb =~ s/\d+,//;
1498 }
1499
1500 return ($fb <=> $fa);
1501}
1502
15031;
Note: See TracBrowser for help on using the repository browser.