source: trunk/gsdl/perllib/classify/Phind.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 40.0 KB
Line 
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BasClas;
37use util;
38use ghtml;
39use unicode;
40
# Phind working directories scheduled for cleanup in the END block
# (populated by init() unless the user asked for an untidy build).
my @removedirs = ();

# File extensions of legitimate build products in a phind directory;
# anything else found there is deleted by the END block below.
my %wanted_index_files = ('td'=>1,
                          't'=>1,
                          'ti'=>1,
                          'tl'=>1,
                          'tsd'=>1,
                          'idb'=>1,
                          'ib1'=>1,
                          'ib2'=>1,
                          'ib3'=>1,
                          'i'=>1,
                          'il'=>1,
                          'w'=>1,
                          'wa'=>1);
56
sub BEGIN {
    # Phind inherits from the base classifier class.
    @ISA = ('BasClas');
}
60
# END block: remove stray working files from each registered phind
# build directory, keeping only the index files listed in
# %wanted_index_files.
#
# Tidying is done here (rather than in get_classify_info()) because of
# a problem - on Windows at least - where close() appears to fail on
# txthandle and dochandle, preventing those files from being deleted
# any earlier.
sub END {
    foreach my $dir (@removedirs) {
        # Skip anything that is not a readable directory.
        next unless (-d $dir);
        next unless opendir(my $dh, $dir);
        my @files = readdir $dh;
        closedir $dh;

        foreach my $file (@files) {
            # ignore "." and ".."
            next if $file =~ /^\.\.?$/;
            # Delete any file whose extension is not a wanted index file
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
                &util::rm(&util::filename_cat($dir, $file));
            }
        }
    }
}
84
# Argument specifications for the Phind classifier, consumed by the
# classifier framework to parse and document collect.cfg options.
# {Phind.*} descriptions are resolved from the resource bundles.
my $arguments =
    [ { 'name' => "text",
        'desc' => "{Phind.text}",
        'type' => "string",
        'deft' => "section:Title,section:text",
        'reqd' => "no" },
      { 'name' => "title",
        'desc' => "{Phind.title}",
        'type' => "metadata",
        'deft' => "Title",
        'reqd' => "no" },
      { 'name' => "buttonname",
        'desc' => "{BasClas.buttonname}",
        'type' => "string",
        'deft' => "Phrase",
        'reqd' => "no" },
      { 'name' => "language",
        'desc' => "{Phind.language}",
        'type' => "string",
        'deft' => "en",
        'reqd' => "no" },
      { 'name' => "savephrases",
        'desc' => "{Phind.savephrases}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "suffixmode",
        'desc' => "{Phind.suffixmode}",
        'type' => "int",
        'deft' => "1",
        'range' => "0,1",
        'reqd' => "no" },
      { 'name' => "min_occurs",
        'desc' => "{Phind.min_occurs}",
        'type' => "int",
        'deft' => "2",
        'range' => "1,",
        'reqd' => "no" },
      { 'name' => "thesaurus",
        'desc' => "{Phind.thesaurus}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      { 'name' => "untidy",
        'desc' => "{Phind.untidy}",
        'type' => "flag",
        'reqd' => "no" } ];
132
# Classifier metadata handed to the framework: name, description,
# inheritance flags, and the argument list declared above.
my $options = { 'name' => "Phind",
                'desc' => "{Phind.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args' => $arguments };
138
139
# Phrase delimiter symbols - these should be abstracted out someplace.
# They mark collection, document and sentence boundaries in the
# "clauses" stream that is fed to the suffix program.

my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
147
148
149# Create a new Phind browser based on collect.cfg
150
# Create a new Phind browser based on collect.cfg.
#
# Arguments: class name, classifier list, input args and the framework's
# argument/option hash.  Registers this classifier's arguments/options,
# constructs the BasClas base object, and records collection settings.
# Exits with status 1 if the phind "suffix" program is not installed.
# Returns the blessed Phind object.
sub new {
    my ($class) = shift (@_);
    my ($classifierslist, $inputargs, $hashArgOptLists) = @_;
    push(@$classifierslist, $class);

    # Register our arguments and options with the framework
    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = (defined $hashArgOptLists)
        ? BasClas->new($classifierslist, $inputargs, $hashArgOptLists)
        : BasClas->new($classifierslist, $inputargs);

    # Ensure the Phind "suffix" generation program is in place.
    # (Match "windows" case-insensitively, consistent with the GSDLOS
    # test in get_classify_info().)
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
    if (!(-e $file1)) {
        print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
        exit(1);
    }

    # Transfer the parsed "text" value to the variable name ("indexes")
    # that was used in previous versions of Greenstone.
    $self->{"indexes"} = $self->{"text"};

    # Further setup
    $self->{'collection'} = $ENV{'GSDLCOLLECTION'};     # classifier information
    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};  # collection directories
    if (! defined $self->{'builddir'}) {
        $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
    }
    $self->{'total'} = 0;

    # Clean out the unused keys
    delete $self->{"text"};

    return bless $self, $class;
}
186
187
188# Initialise the Phind classifier
189
# Initialise the Phind classifier.
#
# Creates a fresh phindN working directory under the build directory
# (phind1, phind2, ... - first unused number) and opens the "clauses"
# and "docs.txt" output files, storing their handles in $self.
# Dies if the build directory does not exist or a file cannot be opened.
sub init {
    my $self = shift (@_);

    # ensure we have a build directory
    my $builddir = $self->{'builddir'};
    die "Phind: build directory '$builddir' does not exist" unless (-e "$builddir");

    # create Phind directory, picking the first unused phindN name
    my $phnumber = 1;
    my $phinddir = &util::filename_cat($builddir, "phind1");
    while (-e "$phinddir") {
        $phnumber++;
        $phinddir = &util::filename_cat($builddir, "phind$phnumber");
    }
    &util::mk_dir("$phinddir");
    $self->{'phinddir'} = $phinddir;
    $self->{'phindnumber'} = $phnumber;

    # schedule the directory for cleanup unless the user wants it kept
    push(@removedirs, $phinddir) unless $self->{'untidy'};

    # open filehandles for documents and text.  Symbolic handle names
    # carry the phind number so multiple instances don't collide.
    my $clausefile = &util::filename_cat("$phinddir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);

    my $txthandle = 'TEXT' . $phnumber;
    open($txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phinddir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);

    my $dochandle = 'DOC' . $phnumber;
    open($dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;
}
226
227
228# Classify each document.
229#
230# Each document is passed here in turn. The classifier extracts the
231# text of each and stores it in the clauses file. Document details are
232# stored in the docs.txt file.
233
# Classify each document.
#
# Each document is passed here in turn.  The classifier extracts the
# text of each and stores it in the clauses file.  Document details are
# stored in the docs.txt file.  Documents whose Language metadata does
# not match the classifier's language are skipped.
sub classify {
    my $self = shift (@_);
    my $doc_obj = shift @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $titlefield = $self->{'title'};

    my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
    if (!defined($title)) {
        $title = "";
        print STDERR "Phind: document has no title\n";
    }
    print "process: $title\n" if ($verbosity > 2);

    # Only consider the file if it is in the correct language
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    $self->{'total'} ++;
    # BUGFIX: this used to test $self->{'$verbosity'} - a literal
    # '$verbosity' hash key that never exists, so the message never
    # printed - and interpolated $file, which is not defined in this sub.
    print "file $self->{'total'}: $title\n" if ($verbosity);

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data, $dataref);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die "Phind: invalid index specification '$part'" unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Extract a metadata field from a document
        # (If there is more than one element of the given type, get them all.)
        elsif ($level eq "document") {
            $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
            foreach $data (@$dataref) {
                $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
            }
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $dataref = $doc_obj->get_metadata($section, $field);
                $data .= join("\n", @$dataref) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
        }

        # Any other level specification is an error
        else {
            die "Unknown level ($level) in Phind index ($part)\n";
        }

    }

    # output the text, squashing runs of newlines down to one
    $text =~ tr/\n//s;
    print $txthandle "$text";
}
326
327
328# Construct the classifier from the information already gathered
329#
330# When get_classify_info is called, the clauses and docs.txt files have
331# already been constructed in the Phind directory. This function will
332# translate them into compressed, indexed MGPP files that can be read by
333# the phindcgi script. It will also register our classifier so that it
334# shows up in the navigation bar.
335
# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the Phind directory.  This function
# translates them into compressed, indexed MGPP files that can be read
# by the phindcgi script, and returns the classification structure that
# registers the classifier in the navigation bar.
#
# Arguments: $gli - truthy when running under the GLI (emits progress
# markup on STDERR).  Returns a \%classifyinfo hashref, or nothing if
# no phrases were found.
sub get_classify_info {
    my $self = shift (@_);
    my ($gli) = @_;

    # Close the handles opened in init() so the files can be read back.
    close $self->{'dochandle'};
    close $self->{'txthandle'};
    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    my $phinddir = $self->{'phinddir'};

    # Extra flags for the mgpp tools on non-Windows systems.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    }

    # Close the framework's previous progress stage before opening ours.
    print STDERR "</Stage>\n" if $gli;

    if ($verbosity) {
        print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
        print $out "*** in ", $self->{'phinddir'}, "\n";
    }

    print STDERR "<Stage name='Phind'>\n" if $gli;

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $min_occurs = $self->{'min_occurs'};
    my ($command, $status);  # NOTE(review): declared but never used below

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
    print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
    &extract_vocabulary($self);

    # Use the suffix program to generate the phind/phrases file
    print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
    &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);

    # check that we generated some files. It's not necessarily an error if
    # we didn't (execute() would have quit on error), but we can't go on.
    my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
    if (! -r $phrasesfile) {
        print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
        print $out "\nNo phrases found for Phind classifier!\n";
        return;
    }

    # Create the phrase file and put phrase numbers in phind/phrases
    print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
    print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
    &renumber_phrases($self);

    # Build the phrase-data database (pdata): two mgpp passes around
    # the compression-dictionary build.
    print $out "\nCreating phrase databases\n";
    print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
    my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
    my $mg_stem = &util::filename_cat($phinddir, "pdata");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # create the mg index of words (pword), including inverted file,
    # weights and stemmed indexes for phrase searching
    print $out "\nCreating word-level search indexes\n";
    print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "pword.txt");
    $mg_stem = &util::filename_cat($phinddir, "pword");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);

    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
    &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);

    # create the mg document information database (docs)
    print $out "\nCreating document information databases\n";
    print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
    $mg_input = &util::filename_cat($phinddir, "docs.txt");
    $mg_stem = &util::filename_cat($phinddir, "docs");

    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
    &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
    &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);

    # Return the information about the classifier that we'll later want to
    # use to create macros when the Phind classifier document is displayed.
    my %classifyinfo = ('thistype'=>'Invisible',
                        'childtype'=>'Phind',
                        'Title'=>$self->{'buttonname'},
                        'parameters'=>"phindnumber=$self->{'phindnumber'}",
                        'contains'=>[]);

    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});

    return \%classifyinfo;
}
440
441
442
# Convert a GML/HTML fragment into Phind "clause format".
#
# Arguments: a language expression (e.g. "en") and the raw text.
# Returns the text with markup stripped and one clause per line.
# English and Chinese are dispatched to fine-tuned versions; every
# other language falls through to the generic tokeniser below.
sub convert_gml_to_tokens {

    my ($language_exp, $text) = @_;

    # Escape any magic delimiter words appearing literally in the text
    # by lower-casing them, so they cannot be confused with the real
    # delimiters inserted later. - jrm21
    # (The old code captured the substitution count into an unused $num
    # variable; that dead bookkeeping has been removed.)
    foreach my $delim (@delimiters) {
        my $replacement = lc($delim);
        $text =~ s/$delim/$replacement/g;
    }

    if ($language_exp =~ /en/) {
        return &convert_gml_to_tokens_EN($text);
    }

    if ($language_exp =~ /zh/) {
        return &convert_gml_to_tokens_ZH($text);
    }

    $_ = $text;

    # 1. remove GML tags

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    s/&amp;/&/sgo;
    s/&lt;/</sgo;
    s/&gt;/>/sgo;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
    s/\s*<br>\s*/ LINEBREAK /isgo;
    s/<[^>]*>/ /sgo;

    # replace <p> and <br> placeholders with clause break symbol (\n)
    s/\s+/ /gso;
    s/PARAGRAPHBREAK/\n/sgo;
    s/LINEBREAK/\n/sgo;

    # 2. Split the remaining text into space-delimited tokens

    # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;

    # Split text at word boundaries
    s/\b/ /go;

    # 3. Convert the remaining text to "clause format"

    # Insert newline if the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    # s/\s*[\.\?\!]\s+/\n/go;

    # remove unnecessary punctuation and replace with clause break symbol (\n)
    s/[^\w ]/\n/go;

    # remove extraneous whitespace
    s/ +/ /sgo;
    s/^\s+//mgo;
    s/\s*$/\n/mgo;

    # remove lines that contain one word or less
    s/^\S*$//mgo;
    s/^\s*$//mgo;
    tr/\n//s;

    return $_;
}
517
# Chinese version of convert_gml_to_tokens: strips GML/HTML markup,
# decodes the basic entities, and turns <p>/<br> into clause breaks
# (newlines).  Takes the text as its only argument and returns the
# converted string.
sub convert_gml_to_tokens_ZH {

    $_ = shift @_;

    # Collapse every run of whitespace to a single space.
    s{\s+}{ }gs;

    # Replace paragraph/line breaks with placeholder words, then strip
    # every remaining tag.
    s{\s*<p>\s*}{ PARAGRAPHBREAK }isg;
    s{\s*<br>\s*}{ LINEBREAK }isg;
    s{<[^>]*>}{ }sg;

    # The text may still carry markup encoded as entities (&lt;p&gt;
    # etc.).  Decode the angle brackets and repeat the stripping pass.
    s{&lt;}{<}sg;
    s{&gt;}{>}sg;

    s{\s+}{ }sg;
    s{\s*<p>\s*}{ PARAGRAPHBREAK }isg;
    s{\s*<br>\s*}{ LINEBREAK }isg;
    s{<[^>]*>}{ }sg;

    # Decode the remaining common entities.  Note: &amp; is decoded
    # both before and after &lt;/&gt;, so doubly-encoded "&amp;amp;"
    # ends up as a bare "&".
    s{&amp;}{&}sg;
    s{&lt;}{<}sg;
    s{&gt;}{>}sg;
    s{&amp;}{&}sg;

    # Turn the placeholders into clause breaks.
    s{PARAGRAPHBREAK}{\n}sg;
    s{LINEBREAK}{\n}sg;

    return $_;
}
554
# A version of convert_gml_to_tokens that is fine-tuned to the English
# language.  Takes raw GML/HTML text as its only argument and returns
# "clause format": markup removed, punctuation normalised, one clause
# per line.

sub convert_gml_to_tokens_EN {
    $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Remove everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Remove these tags.
    s/&lt;/</sg;
    s/&gt;/>/sg;

    s/\s+/ /sg;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    # (&amp; is decoded twice, so doubly-encoded "&amp;amp;" decodes fully)
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;

    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format",
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters
    # (NOTE(review): stated intent, but no implementing code follows)

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;

}
643
644
645
646# Execute a system command
647
# Execute a system command, echoing it first when verbosity > 2.
# If the command fails, report the failure on STDERR and abort the
# build with a non-zero exit status.
sub execute {
    my ($command, $verbosity, $outhandle) = @_;
    print $outhandle "Executing: $command\n"  if ($verbosity > 2);
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
        print STDERR "Phind - Error executing '$command': $!\n";
        # BUGFIX: system() returns the raw wait status with the child's
        # exit code in the high byte.  Passing that raw status straight
        # to exit() can truncate to 0 (e.g. 256 & 0xFF == 0), making a
        # failed build look successful.  Exit with the child's real
        # code, falling back to 1 (or 127 when the command failed to
        # start at all).
        my $exit_code = ($status == -1) ? 127 : ($status >> 8);
        $exit_code = 1 if ($exit_code == 0);
        exit($exit_code); # this causes the build to fail...
    }
}
658
659
660# Generate the vocabulary, symbol statistics, and numbers file from the
661# clauses file. This is legacy code, so is a bit messy and probably wont
662# run under windows.
663
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under Windows.
#
# Reads $phinddir/clauses (written by classify()) plus the per-language
# stopword files and optional thesaurus files, then writes:
#   clauses.vocab   - one "best form" word per symbol number
#   clauses.stats   - first/last symbol-number ranges for each category
#   clauses.numbers - the clause text re-encoded as symbol numbers
#   <thesaurus>.numbers - thesaurus links re-encoded (when configured)
sub extract_vocabulary {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};

    my $collectiondir = $self->{'collectiondir'};
    my $phinddir = $self->{'phinddir'};

    my $language_exp = $self->{'language'};

    my ($w, $l, $line, $word);

    # Symbol-range bookkeeping written to clauses.stats below.
    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my $thesaurus = $self->{'thesaurus'};
    my ($thesaurus_links, $thesaurus_terms,
        %thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token -> symbol number; %freq counts occurrences.
    my %symbol;
    my (%freq);

    print $out "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
    my $stopword_files = ();  # NOTE(review): assigns undef to a scalar and is never used
    my ($language, $language_dir, $file, $file_name);
    my %stopwords;

    # Examine each directory in the stopword directory
    opendir(STOPDIR, $stopdir);
    foreach $language (readdir STOPDIR) {

        # Ignore entries that do not match the classifier's language
        next unless ($language =~ /$language_exp/);
        $language_dir = &util::filename_cat($stopdir, $language);
        next unless (-d "$language_dir");

        opendir(LANGDIR, $language_dir);
        foreach $file (readdir LANGDIR) {

            # Ignore entries that are not stopword files (*.sw)
            next unless ($file =~ /sw$/);
            $file_name = &util::filename_cat($language_dir, $file);
            next unless (-f "$file_name");

            # Read the stopwords: first token of each line, keyed by
            # its lower-cased form so case variants collapse together.
            open(STOPFILE, "<$file_name");
            while (<STOPFILE>) {
                s/^\s+//;
                s/\s.*//;
                $word = $_;
                $l = lc($word);
                $stopwords{$l} = $word;
            }
            close STOPFILE;

        }
        closedir LANGDIR;
    }
    closedir STOPDIR;

    # Read thesaurus information
    if ($thesaurus) {

        # link file exists
        $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
        die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");

        # ensure term file exists in the correct language
        # (first two letters of the language expression, defaulting to en)
        if ($language_exp =~ /^([a-z][a-z])/) {
            $language = $1;
        } else {
            $language = 'en';
        }
        $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
        die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");


        # Read the thesaurus terms, stripping the leading id and any
        # parenthesised qualifier; key each word by its lower-cased form.
        open(TH, "<$thesaurus_terms");
        while(<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phinddir/clauses");

    my @words;
    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # count stopwords/thesaurus terms under their lower-cased form
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        # every clause line implies a sentence delimiter
        $freq{$senlimit}++;
    }

    close TXT;

    # Calculate the "best" form of each word: stopword/thesaurus
    # spelling wins, otherwise the most frequent case variant.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }
    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, in category order:
    # delimiters, stopwords, thesaurus terms, then other content words.
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

#       $word = lc($word); # jrm21
        $word = uc($word);
        $bestform{$word} = $word;
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;
    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {
        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words, one best-form per line, in symbol-number order
    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phinddir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Create statistics file
    # Output statistics about the vocabulary
    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");

    open(STAT, ">$phinddir/clauses.stats")
        || die "Cannot open $phinddir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Create numbers file
    # Save text as symbol numbers
    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phinddir/clauses");
    open(NUM, ">$phinddir/clauses.numbers");

##    $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
##    print NUM "$symbol{lc($colstart)}\n"; # jrm21
    $phrasedelimiter = $symbol{$senlimit};
    print NUM "$symbol{$colstart}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
# don't lower-case special delimiters - jrm21
            if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
                $word = lc($word);
            }
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    close TXT;
#    print NUM "$symbol{lc($colend)}\n"; # jrm21
    print NUM "$symbol{$colend}\n";
    close NUM;

    # Save thesaurus data in one convenient file
    if ($thesaurus) {

        my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");


        print $out "Saving thesaurus as numbers in $thesaurusfile\n"
            if ($verbosity > 1);

        # Read the thesaurus terms, mapping each term id to its words
        # encoded as "sNNN" symbol references.
        my ($num, $text, %thes_symbols);

        open(TH, "<$thesaurus_terms");
        while(<TH>) {
            chomp;
            @words = split(/\s+/, $_);
            $num = shift @words;
            $text = "";

            # translate words into symbol numbers
            foreach $word (@words) {
                $word = lc($word);
                if ($symbol{$word}) {
                    $text .= "s$symbol{$word} ";
                } elsif ($verbosity) {
                    print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
                }
            }
            $text =~ s/ $//;
            $thes_symbols{$num} = $text;
        }
        close TH;

        # Read the thesaurus links and write the corresponding data
        open(TH, "<$thesaurus_links");
        open(THOUT, ">$thesaurusfile");

        while(<TH>) {
            chomp;
            ($num, $text) = split(/:/, $_);

            if (defined($thes_symbols{$num})) {
                print THOUT "$num:$thes_symbols{$num}:$text\n";
            } else {
                print THOUT "$num:untranslated:$text\n";
            }
        }
        close TH;
        close THOUT;
    }




}
1003
1004
1005# renumber_phrases
1006#
# Prepare the phrases file to be input to mgpp. The biggest problem is
# reconciling the phrase identifiers used by the suffix program (which
# we'll call suffix-id numbers) with the numbers used in the thesaurus
# (thesaurus-id) to create a common set of phind id numbers (phind-id).
# Phind-id numbers must be sorted by frequency of occurrence.
#
# Start creating a set of phind-id numbers from the sorted suffix-id
# numbers and (if required) the thesaurus-id numbers. Then add any other
# phrases occurring in the thesaurus.
#
# The last thing we have to do is restore the vocabulary information to the
# phrase file so that the phrases are stored as words, not as symbol
# numbers.
1020
1021# The original phrases file looks something like this:
1022# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1023# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1024# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1025# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1026
1027
# renumber_phrases: prepare the phrases file for mgpp input by running
# the three renumbering passes in order - translate suffix ids to
# phind ids, merge in thesaurus ids, then restore the vocabulary text.
sub renumber_phrases {
    my ($self) = @_;

    foreach my $pass (\&renumber_suffix_data,
                      \&renumber_thesaurus_data,
                      \&restore_vocabulary_data) {
        $pass->($self);
    }
}
1036
1037
1038
1039# renumber_suffix_data
1040#
1041# Translate phrases file to phrases.2 using phind keys instead
1042# of suffix keys and sorting the expansion data.
1043
sub renumber_suffix_data {
    my ($self) = @_;

    my $verbosity = $self->{'verbosity'};
    my $out = $self->{'outhandle'};
    print $out "Translate phrases: suffix-ids become phind-id's\n"
        if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases');
    my $outfile = &util::filename_cat($phinddir, 'phrases.2');

    # Pass 1: read the phrase file, assign each suffix-id a phind-id in
    # order of appearance, and record each phrase's total frequency so
    # expansions can later be sorted by descending frequency.
    my %suffixtophind;
    # Was declared as the unused 'my @phindfrequency' while the code used
    # an undeclared package array @totalfrequency; declare the real one.
    my @totalfrequency;
    my (@fields, $suffixid);
    my $nextphind = 1;

    open(IN, "<", $infile) or die "Cannot open $infile: $!";
    while (<IN>) {

        chomp;
        @fields = split(/:/, $_);

        # get next suffixid and phindid
        $suffixid = shift @fields;
        $suffixtophind{$suffixid} = $nextphind;

        # skip the phrase text, then store the total frequency
        shift @fields;
        $totalfrequency[$nextphind] = shift @fields;

        $nextphind++;
    }
    close IN;


    # Pass 2: translate phrases to phrases.2.  Use phind keys (not suffix
    # keys), and sort expansion and document occurrence data in order of
    # descending frequency.
    open(IN, "<", $infile) or die "Cannot open $infile: $!";
    open(OUT, ">", $outfile) or die "Cannot open $outfile: $!";

    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
    # (fixed: previous code declared a typo'd '@documwents' and used an
    # undeclared global @documents)
    my (@documents, @newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {

        # read the line
        chomp;
        @fields = split(/:/, $_);

        # get a phrase number for this line
        $suffixid = shift @fields;
        die "ERROR - no phindid for suffix-id: $suffixid"
            unless (defined($suffixtophind{$suffixid}));
        $phindid = $suffixtophind{$suffixid};

        # get the symbols in the phrase
        $text = shift @fields;

        # output status information
        $linenumber++;
        if ($verbosity > 2) {
            if ($linenumber % 1000 == 0) {
                print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
            }
            # (fixed: this trace formerly printed the undefined variables
            # $num and $key)
            print $out "$phindid: $suffixid\t($text)\n" if ($verbosity > 3);
        }

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions, convert them into phind-id numbers, and sort
        # them by descending total frequency
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
            $n = $suffixtophind{$k};
            push @newexp, $n;
        }
        @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents and sort them by descending frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_doc_frequency @documents;

        # output the phrase data
        print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
        print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";

    }

    close IN;
    close OUT;
}
1150
1151
1152# renumber_thesaurus_data
1153#
1154# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1155
# Translate phrases.2 to phrases.3.  If a thesaurus is configured, merge
# its entries into the phrase data: reuse the phind-ids of phrases that
# already occur in phrases.2, mint new phind-ids for the rest, and append
# each entry's relation links (sorted by frequency, then link type) to
# the corresponding output line.  Without a thesaurus this is a plain
# file rename.
sub renumber_thesaurus_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    my $thesaurus = $self->{'thesaurus'};

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, "phrases.2");
    my $outfile = &util::filename_cat($phinddir, "phrases.3");


    # If no thesaurus is defined, simply move the phrases file.
    if (!$thesaurus) {
	print $out "Translate phrases.2: no thesaurus data\n"
	    if ($verbosity);
	&util::mv($infile, $outfile);
	return;
    }

    print $out "Translate phrases.2: add thesaurus data\n"
	if ($verbosity);

    # 1.
    # Read thesaurus file and store (symbols->thesaurusid) mapping.
    # Each line of "$thesaurus.numbers" is "thesid:symbols:relations".
    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
    my %symbolstothesid;
    my (@fields, $thesid, $symbols);

    # NOTE(review): open is unchecked here (and below); a missing file
    # silently yields an empty mapping.
    open(TH, "<$thesaurusfile");

    while (<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# get id and text
	$thesid = shift @fields;
	$symbols = shift @fields;
	$symbolstothesid{$symbols} = $thesid;
    }
    close TH;

    # 2.
    # Read phrases file to find thesaurus entries that already
    # have a phindid. Store their phind-ids for later translation,
    # and store their frequency for later sorting.
    my %thesaurustophindid;
    my %phindidtofrequency;
    my ($phindid, $freq);

    open(IN, "<$infile");

    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	$phindid = shift @fields;
	$symbols = shift @fields;
	$freq = shift @fields;

	# do we have a thesaurus id corresponding to this phrase?
	if (defined($symbolstothesid{$symbols})) {
	    $thesid = $symbolstothesid{$symbols};
	    $thesaurustophindid{$thesid} = $phindid;
	    $phindidtofrequency{$phindid} = $freq;
	}
    }
    close IN;

    # free the symbol mapping; only id mappings are needed from here on
    undef %symbolstothesid;

    # 3.
    # Create phind-id numbers for remaining thesaurus entries,
    # and note that their frequency is 0 for later sorting.
    # NOTE(review): $phindid still holds the id read from the *last* line
    # of phrases.2 — this assumes phind-ids appear in ascending order so
    # that the last one is the maximum; verify against the file producer.
    my $nextphindid = $phindid + 1;

    open(TH, "<$thesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# read thesaurus-id and ensure it has a corresponding phind-id
	$thesid = shift @fields;
	if (!defined($thesaurustophindid{$thesid})) {
	    $thesaurustophindid{$thesid} = $nextphindid;
	    $phindidtofrequency{$nextphindid} = 0;
	    $nextphindid++;
	}
    }
    close TH;

    # 4.
    # Translate thesaurus file, replacing thesaurus-id numbers with
    # phind-id numbers.  Output goes to "$thesaurus.phindid" in the form
    # "phindid:symbols:linkcount:relations:".
    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
    my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
    my (@links, $linkid, %linkidtotype, $newrelation);

    open(TH, "<$thesaurusfile");
    open(TO, ">$newthesaurusfile");
    while(<TH>) {

	chomp;
	@fields = split(/:/, $_);

	# phindid and symbols for this line
	($thesid, $symbols, $relations) = @fields;

	die unless ($thesid && $symbols);
	die unless $thesaurustophindid{$thesid};
	$phindid = $thesaurustophindid{$thesid};

	# convert each part of the relation string to use phind-id numbers
	# at the same time, we want to sort the list by frequency.
	undef %linkidtotype;

	# relations look like "TYPE,id,id;TYPE,id;..." (e.g. BT/NT groups)
	foreach $linktext (split(/;/, $relations)) {
	    @linkdata = split(/,/, $linktext);

	    # remember the linktype (e.g. BT, NT)
	    $linktype = shift @linkdata;

	    # store the type of each link
	    foreach $thesid (@linkdata) {
		die unless (defined($thesaurustophindid{$thesid}));
		$linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
	    }
	}

	# sort the list of links, first by frequency, then by type.
	@links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
			or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
	$linkcounter = (scalar @links);

	# create a string describing the link information
	# NOTE(review): if a line has no relations, @links is empty and
	# $links[0] is undef, leaving $newrelation as "undef;" — confirm
	# the thesaurus format guarantees at least one relation per line.
	$linktype = $linkidtotype{$links[0]};
	$newrelation = $linktype;
	foreach $linkid (@links) {
	    # emit the type tag once per run of same-typed links
	    if ($linkidtotype{$linkid} ne $linktype) {
		$linktype = $linkidtotype{$linkid};
		$newrelation .= ";" . $linktype;
	    }
	    $newrelation .= "," . $linkid;
	}
	$newrelation .= ";";


	# output the new line
	print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
    }
    close TH;
    close TO;

    # release mappings no longer needed
    undef %thesaurustophindid;
    undef %linkidtotype;
    undef %phindidtofrequency;

    # 5.
    # Read thesaurus data (in phind-id format) into memory
    my %thesaurusdata;

    open(TH, "<$newthesaurusfile");
    while(<TH>) {
	chomp;
	($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
	die unless ($phindid && $symbols);
	$thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
    }
    close TH;

    # 6.
    # Add thesaurus data to phrases file
    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
    # NOTE(review): '@documwents' looks like a typo of '@documents';
    # harmless here as it is declared but never used.
    my (@documwents, @newexp, $k, $n);
    my $linenumber = 0;

    open(IN, "<$infile");
    open(OUT, ">$outfile");

    # Update existing phrases
    while(<IN>) {

	chomp;
	@fields = split(/:/, $_);

	# get data for this line
	$phindid = shift @fields;

	# output the phrase data, with thesaurus information
	print OUT "$phindid:", join(":", @fields);

	# add thesaurus data; entries already emitted are blanked so they
	# are not duplicated in the loop below
	if (defined($thesaurusdata{$phindid})) {
	    @fields = split(/:/, $thesaurusdata{$phindid});
	    shift @fields;
	    $linkcounter = shift @fields;
	    $relations = shift @fields;

	    print OUT ":$linkcounter:$relations";
	    $thesaurusdata{$phindid} = "";
	}
	print OUT "\n";
    }
    close IN;

    # Add phrases that aren't already in the file
    # (frequency/expansion/document fields are all zero/empty for these)
    foreach $phindid (sort numerically keys %thesaurusdata) {
	next unless ($thesaurusdata{$phindid});

	@fields = split(/:/, $thesaurusdata{$phindid});
	$symbols = shift @fields;
	$linkcounter = shift @fields;
	$relations = shift @fields;

	print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
    }
    close OUT;

}
1379
1380# restore_vocabulary_data
1381#
1382# Read phrases.3 and restore vocabulary information. Then write
1383# this data to the MGPP input files (pword.txt and pdata.txt) and
1384# (if requested) to the saved phrases file.
1385
# Read phrases.3, replace each "s<number>" symbol with the word stored at
# that index in clauses.vocab, and write the MGPP input files pdata.txt
# (one "<Document>" record per phrase) and pword.txt (one searchable word
# per phrase).  If 'savephrases' is set, also dump a tab-separated
# summary of the phrases to that file.
sub restore_vocabulary_data {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};
    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);

    my $phinddir = $self->{'phinddir'};
    my $infile = &util::filename_cat($phinddir, 'phrases.3');
    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');

    my $savephrases = $self->{'savephrases'};

    # 1.
    # Read the vocabulary file: one word per line, stored 1-indexed so
    # symbol numbers map directly onto @symbol subscripts.
    open(V, "<$vocabfile")
	|| die "Cannot open $vocabfile: $!";
    my @symbol;
    my $i = 1;
    while(<V>) {
	chomp;
	$symbol[$i++] = $_;
    }
    close V;

    # 2.
    # Translate phrases.3 to MGPP input files
    my ($key, $text, $word, $isThesaurus);
    my @fields;
    my $linenumber = 0;

    # NOTE(review): these opens are unchecked, unlike the vocab open above.
    open(IN, "<$infile");
    open(DATA, ">$datafile");
    open(WORD, ">$wordfile");

    # Save the phrases in a separate text file
    if ($savephrases) {
	print $out "Saving phrases in $savephrases\n" if ($verbosity);
	open(SAVE, ">$savephrases");
    }

    while(<IN>) {

	# read the line
	chomp;
	# NOTE(review): $line is not declared with 'my' (this file does not
	# use strict), so it is a package global.
	$line = $_;
	@fields = split(/:/, $line);

	# get a phrase number for this line
	$key = shift @fields;

	# restore the text of the phrase by expanding each sNNN symbol
	$text = shift @fields;
	$text =~ s/s(\d+)/$symbol[$1]/g;
	# single-word phrases become the searchable word; multi-word
	# phrases get an empty word entry
	if ($text =~ / /) {
	    $word = "";
	} elsif ($text ne 'untranslated') {
	    $word = $text;
	}
	# NOTE(review): when $text is exactly 'untranslated', neither
	# branch assigns $word, so it retains its value from the previous
	# iteration — confirm whether that carry-over is intended.

	# output the phrase data
	print DATA "<Document>";
	print DATA "$key:$text:", join(":", @fields), ":\n";

	# output the word index search data
	print WORD "<Document>$word\n";

	# output the phrases to a text file
	if ($savephrases) {
	    # thesaurus-augmented lines carry two extra fields
	    if ((scalar @fields) == 7) {
		$isThesaurus = 1;
	    } else {
		$isThesaurus = 0;
	    }
	    print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
	}
    }
    close IN;
    close WORD;
    close DATA;
    close SAVE if ($savephrases);

}
1471
1472
1473
1474# sort routines used to renumber phrases
1475
# Numeric comparison routine for sort: orders values ascending.
sub numerically {
    return $a <=> $b;
}
1477
# Comparison routine for sort: orders document-occurrence entries by
# descending frequency.  Each entry is either "docnum" (which counts as
# a frequency of 1) or "docnum,freq".
sub by_doc_frequency {
    my @parts_a = split(/,/, $a);
    my @parts_b = split(/,/, $b);

    my $freq_a = (@parts_a > 1) ? $parts_a[1] : 1;
    my $freq_b = (@parts_b > 1) ? $parts_b[1] : 1;

    return ($freq_b <=> $freq_a);
}
1492
14931;
Note: See TracBrowser for help on using the repository browser.