source: gsdl/trunk/perllib/classify/Phind.pm @ 18455

Last change on this file since 18455 was 18455, checked in by davidb, 15 years ago

Addition of 'edit_mode' parameter to classify(). This can be either 'add' 'delete' or 'reindex' (should think about renaming the last one to something more appropriate, e.g. update).

  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
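#
# For illustration only (the option values below are invented, not taken from
# a real collection), a collect.cfg entry for this classifier might look
# something like:
#
#   classify Phind -text section:Title,section:text -title Title -buttonname Phrase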
31
32# 12/05/02 Added usage datastructure - John Thompson
33
34package Phind;
35
36use BaseClassifier;
37use util;
38use ghtml;
39use unicode;
40
41use strict;
42no strict 'refs'; # allow filehandles to be variables and vice versa
43
44my @removedirs = ();
45
46my %wanted_index_files = ('td'=>1,
47 't'=>1,
48 'ti'=>1,
49 'tl'=>1,
50 'tsd'=>1,
51 'idb'=>1,
52 'ib1'=>1,
53 'ib2'=>1,
54 'ib3'=>1,
55 'i'=>1,
56 'il'=>1,
57 'w'=>1,
58 'wa'=>1);
59
60sub BEGIN {
61 @Phind::ISA = ('BaseClassifier');
62}
63
64sub END {
65
66 # Tidy up stray files - we do this here as there's some weird problem
67 # preventing us from doing it in the get_classify_info() function (on
68 # windows at least) where the close() appears to fail on txthandle and
69 # dochandle, thus preventing us from deleting those files
70
71 foreach my $dir (@removedirs) {
72 if (-d $dir && opendir (DIR, $dir)) {
73 my @files = readdir DIR;
74 closedir DIR;
75
76 foreach my $file (@files) {
77 next if $file =~ /^\.\.?$/;
78 my ($suffix) = $file =~ /\.([^\.]+)$/;
79 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
80 # delete it!
81 &util::rm (&util::filename_cat ($dir, $file));
82 }
83 }
84 }
85 }
86}
87
88my $arguments =
89 [ { 'name' => "text",
90 'desc' => "{Phind.text}",
91 'type' => "string",
92 'deft' => "section:Title,section:text",
93 'reqd' => "no" },
94 { 'name' => "title",
95 'desc' => "{Phind.title}",
96 'type' => "metadata",
97 'deft' => "Title",
98 'reqd' => "no" },
99 { 'name' => "buttonname",
100 'desc' => "{BasClas.buttonname}",
101 'type' => "string",
102 'deft' => "Phrase",
103 'reqd' => "no" },
104 { 'name' => "language",
105 'desc' => "{Phind.language}",
106 'type' => "string",
107 'deft' => "en",
108 'reqd' => "no" },
109 { 'name' => "savephrases",
110 'desc' => "{Phind.savephrases}",
111 'type' => "string",
112 'deft' => "",
113 'reqd' => "no" },
114 { 'name' => "suffixmode",
115 'desc' => "{Phind.suffixmode}",
116 'type' => "int",
117 'deft' => "1",
118 'range' => "0,1",
119 'reqd' => "no" },
120 { 'name' => "min_occurs",
121 'desc' => "{Phind.min_occurs}",
122 'type' => "int",
123 'deft' => "2",
124 'range' => "1,",
125 'reqd' => "no" },
126 { 'name' => "thesaurus",
127 'desc' => "{Phind.thesaurus}",
128 'type' => "string",
129 'deft' => "",
130 'reqd' => "no" },
131 { 'name' => "untidy",
132 'desc' => "{Phind.untidy}",
133 'type' => "flag",
134 'reqd' => "no" } ];
135
136my $options = { 'name' => "Phind",
137 'desc' => "{Phind.desc}",
138 'abstract' => "no",
139 'inherits' => "yes",
140 'args' => $arguments };
141
142
143# Phrase delimiter symbols - these should be abstracted out someplace
144
145my $colstart = "COLLECTIONSTART";
146my $colend = "COLLECTIONEND";
147my $doclimit = "DOCUMENTLIMIT";
148my $senlimit = "SENTENCELIMIT";
149my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
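# How these are used below: classify() writes DOCUMENTLIMIT into the clauses
# file between documents, while extract_vocabulary() writes the symbol numbers
# for COLLECTIONSTART and COLLECTIONEND at the start and end of clauses.numbers
# and uses the SENTENCELIMIT symbol as the clause (phrase) delimiter there.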
150
151
152# Create a new Phind browser based on collect.cfg
153
154sub new {
155 my ($class) = shift (@_);
156 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
157 push(@$classifierslist, $class);
158
159 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
160 push(@{$hashArgOptLists->{"OptList"}},$options);
161
162 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
163
164 if ($self->{'info_only'}) {
165 # don't worry about any options etc
166 return bless $self, $class;
167 }
168
169 # Ensure the Phind generate scripts are in place
170 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
171 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
172 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
173 if (!(-e $file1)) {
174 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
175 exit(1);
176 }
177
178# Transfer the value from Auto Parsing to the variable name that was used in previous versions of Greenstone.
179 $self->{"indexes"} = $self->{"text"};
180
181 # Further setup
182 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
183 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
184 if (! defined $self->{'builddir'}) {
185 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
186 }
187 $self->{'total'} = 0;
188
189 # Clean out the unused keys
190 delete $self->{"text"};
191
192 return bless $self, $class;
193}
194
195
196# Initialise the Phind classifier
197
198sub init {
199 my $self = shift (@_);
200
201 # ensure we have a build directory
202 my $builddir = $self->{'builddir'};
203 die unless (-e "$builddir");
204
205 # create Phind directory
206 my $phnumber = 1;
207 my $phinddir = &util::filename_cat($builddir, "phind1");
208 while (-e "$phinddir") {
209 $phnumber++;
210 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
211 }
212 &util::mk_dir("$phinddir");
213 $self->{'phinddir'} = $phinddir;
214 $self->{'phindnumber'} = $phnumber;
215
216 push(@removedirs, $phinddir) unless $self->{'untidy'};
217
218 # open filehandles for documents and text
219 my $clausefile = &util::filename_cat("$phinddir", "clauses");
220 &util::rm($clausefile) if (-e $clausefile);
221
222 my $txthandle = 'TEXT' . $phnumber;
223 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
224 $self->{'txthandle'} = $txthandle;
225
226 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
227 &util::rm($docfile) if (-e $docfile);
228
229 my $dochandle = 'DOC' . $phnumber;
230 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
231 $self->{'dochandle'} = $dochandle;
232
233}
234
235
236# Classify each document.
237#
238# Each document is passed here in turn. The classifier extracts the
239# text of each and stores it in the clauses file. Document details are
240# stored in the docs.txt file.
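#
# Each docs.txt entry is a single tab-separated line of the form
#   <Document>\tOID\tTitle
# e.g. (illustrative values only) "<Document>\tHASHa1b2c3\tIntroduction".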
241
242sub classify {
243 my $self = shift (@_);
244 my ($doc_obj,$edit_mode) = @_;
245
246 my $verbosity = $self->{'verbosity'};
247 my $top_section = $doc_obj->get_top_section();
248
249 my $titlefield = $self->{'title'};
250
251 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
252 if (!defined($title)) {
253 $title = "";
254 print STDERR "Phind: document has no title\n";
255 }
256 print "process: $title\n" if ($verbosity > 2);
257
258 # Only consider the file if it is in the correct language
259 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
260 my $phrlanguage = $self->{'language'};
261 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
262
263 if ($edit_mode eq "delete") {
264 # This classifier works quite differently to the others
265 # Probably doesn't support incremental building anyway
266 return;
267 }
268
269 # record this file
270 $self->{'total'} ++;
271 # what is $file ???
272 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
273
274 # Store document details
275 my $OID = $doc_obj->get_OID();
276 $OID = "NULL" unless defined $OID;
277 my $dochandle = $self->{'dochandle'};
278 print $dochandle "<Document>\t$OID\t$title\n";
279
280# Store the text occurring in this object
281
282 # output the document delimiter
283 my $txthandle = $self->{'txthandle'};
284 print $txthandle "$doclimit\n";
285
286 # iterate over the required indexes and store their text
287 my $indexes = $self->{'indexes'};
288 my $text = "";
289 my ($part, $level, $field, $section, $data, $dataref);
290
291 foreach $part (split(/,/, $indexes)) {
292
293# Each field has a level and a data element (e.g. document:Title)
294 ($level, $field) = split(/:/, $part);
295 die unless ($level && $field);
296
297 # Extract the text from every section
298 # (In phind, document:text and section:text are equivalent)
299 if ($field eq "text") {
300 $data = "";
301 $section = $doc_obj->get_top_section();
302 while (defined($section)) {
303 $data .= $doc_obj->get_text($section) . "\n";
304 $section = $doc_obj->get_next_section($section);
305 }
306 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
307 }
308
309 # Extract a metadata field from a document
310 # (If there is more than one element of the given type, get them all.)
311 elsif ($level eq "document") {
312 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
313 foreach $data (@$dataref) {
314 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
315 }
316 }
317
318 # Extract metadata from every section in a document
319 elsif ($level eq "section") {
320 $data = "";
321 $section = $doc_obj->get_top_section();
322 while (defined($section)) {
323 $dataref = $doc_obj->get_metadata($section, $field);
324 $data .= join("\n", @$dataref) . "\n";
325 $section = $doc_obj->get_next_section($section);
326 }
327 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
328 }
329
330 # Some sort of specification which I don't understand
331 else {
332 die "Unknown level ($level) in Phind index ($part)\n";
333 }
334
335 }
336
337 # output the text
338 $text =~ tr/\n//s;
339 print $txthandle "$text";
340}
341
342
343# Construct the classifier from the information already gathered
344#
345# When get_classify_info is called, the clauses and docs.txt files have
346# already been constructed in the Phind directory. This function will
347# translate them into compressed, indexed MGPP files that can be read by
348# the phindcgi script. It will also register our classifier so that it
349# shows up in the navigation bar.
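#
# Concretely, three MGPP databases are built in the phind directory below:
# "pdata" (one record per phrase), "pword" (a word-level index used to look
# up phrases by keyword), and "docs" (the document OID/title records).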
350
351sub get_classify_info {
352 my $self = shift (@_);
353 my ($gli) = @_;
354
355 close $self->{'dochandle'};
356 close $self->{'txthandle'};
357 my $verbosity = $self->{'verbosity'};
358 my $out = $self->{'outhandle'};
359 my $phinddir = $self->{'phinddir'};
360
361 my $osextra = "";
362 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
363 $osextra = " -d /";
364 }
365
366 print STDERR "</Stage>\n" if $gli;
367
368 if ($verbosity) {
369 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
370 print $out "*** in ", $self->{'phinddir'}, "\n";
371 }
372
373 print STDERR "<Stage name='Phind'>\n" if $gli;
374
375 # Construct phind indexes
376 my $suffixmode = $self->{'suffixmode'};
377 my $min_occurs = $self->{'min_occurs'};
378 my ($command, $status);
379
380 # Generate the vocabulary, symbol statistics, and numbers file
381 # from the clauses file
382 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
383 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
384 &extract_vocabulary($self);
385
386 # Use the suffix program to generate the phind/phrases file
387 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
388 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
389 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
390
391 # check that we generated some files. It's not necessarily an error if
392 # we didn't (execute() would have quit on error), but we can't go on.
393 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
394 if (! -r $phrasesfile) {
395 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
396 print $out "\nNo phrases found for Phind classifier!\n";
397 return;
398 }
399
400 # Create the phrase file and put phrase numbers in phind/phrases
401 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
402 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
403 &renumber_phrases($self);
404
405 print $out "\nCreating phrase databases\n";
406 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
407 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
408 my $mg_stem = &util::filename_cat($phinddir, "pdata");
409
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
411 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
412 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
413
414 # create the mg index of words
415 print $out "\nCreating word-level search indexes\n";
416 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
417 $mg_input = &util::filename_cat($phinddir, "pword.txt");
418 $mg_stem = &util::filename_cat($phinddir, "pword");
419
420 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
421 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
422 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
423 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
424 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
425 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
426
427 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
428 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
429 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
430
431 # create the mg document information database
432 print $out "\nCreating document information databases\n";
433 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
434 $mg_input = &util::filename_cat($phinddir, "docs.txt");
435 $mg_stem = &util::filename_cat($phinddir, "docs");
436
437 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
438 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
439 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
440
441 # Return the information about the classifier that we'll later want to
442 # use to create macros when the Phind classifier document is displayed.
443 my %classifyinfo = ('thistype'=>'Invisible',
444 'childtype'=>'Phind',
445 'Title'=>$self->{'buttonname'},
446 'parameters'=>"phindnumber=$self->{'phindnumber'}",
447 'contains'=>[]);
448
449 my $collection = $self->{'collection'};
450 my $url = "library?a=p&p=phind&c=$collection";
451 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
452
453 return \%classifyinfo;
454}
455
456
457
458sub convert_gml_to_tokens {
459
460 my ($language_exp, $text) = @_;
461
462 # escape any magic words... - jrm21
463 foreach my $delim (@delimiters) {
464 my $replacement=lc($delim);
465 my $num= $text=~ s/$delim/$replacement/g;
466 if (!$num) {$num=0;}
467 }
468
469 if ($language_exp =~ /en/) {
470 return &convert_gml_to_tokens_EN($text);
471 }
472
473 if ($language_exp =~ /zh/) {
474 return &convert_gml_to_tokens_ZH($text);
475 }
476
477 $_ = $text;
478
479 # 1. remove GML tags
480
481 # Remove everything that is in a tag
482 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
483 s/\s*<br>\s*/ LINEBREAK /isgo;
484 s/<[^>]*>/ /sgo;
485
486 # Now we have the text, but it may contain HTML
487 # elements coded as &gt; etc. Remove these tags.
488 s/&amp;/&/sgo;
489 s/&lt;/</sgo;
490 s/&gt;/>/sgo;
491 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
492 s/\s*<br>\s*/ LINEBREAK /isgo;
493 s/<[^>]*>/ /sgo;
494
495# replace <p> and <br> placeholders with the clause break symbol (\n)
496 s/\s+/ /gso;
497 s/PARAGRAPHBREAK/\n/sgo;
498 s/LINEBREAK/\n/sgo;
499
500
501
502
503 # 2. Split the remaining text into space-delimited tokens
504
505 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
506 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
507
508 # Split text at word boundaries
509 s/\b/ /go;
510
511 # 3. Convert the remaining text to "clause format"
512
513 # Insert newline if the end of a sentence is detected
514# (delimiter is: "[\.\?\!]\s")
515 # s/\s*[\.\?\!]\s+/\n/go;
516
517 # remove unnecessary punctuation and replace with clause break symbol (\n)
518 s/[^\w ]/\n/go;
519
520 # remove extraneous whitespace
521 s/ +/ /sgo;
522 s/^\s+//mgo;
523 s/\s*$/\n/mgo;
524
525 # remove lines that contain one word or less
526 s/^\S*$//mgo;
527 s/^\s*$//mgo;
528 tr/\n//s;
529
530 return $_;
531}
532
533# A Chinese version of convert_gml_to_tokens
534sub convert_gml_to_tokens_ZH {
535
536 $_ = shift @_;
537
538 # Replace all whitespace with a simple space
539 s/\s+/ /gs;
540 # Remove everything that is in a tag
541 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
542 s/\s*<br>\s*/ LINEBREAK /isg;
543 s/<[^>]*>/ /sg;
544
545 # Now we have the text, but it may contain HTML
546 # elements coded as &gt; etc. Remove these tags.
547 s/&lt;/</sg;
548 s/&gt;/>/sg;
549
550 s/\s+/ /sg;
551 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
552 s/\s*<br>\s*/ LINEBREAK /isg;
553 s/<[^>]*>/ /sg;
554
555 # remove &amp; and other miscellaneous markup tags
556 s/&amp;/&/sg;
557 s/&lt;/</sg;
558 s/&gt;/>/sg;
559 s/&amp;/&/sg;
560
561# replace <p> and <br> placeholders with carriage returns
562 s/PARAGRAPHBREAK/\n/sg;
563 s/LINEBREAK/\n/sg;
564
565
566# print STDERR "text:$_\n";
567 return $_;
568}
569
570# A version of convert_gml_to_tokens that is fine-tuned to the English language.
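# For example (an illustrative sketch rather than captured output), an input
# fragment such as
#   <p>The World-Wide Web, a global system.</p>
# comes out as roughly two clauses:
#   The World Wide Web
#   a global system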
571
572sub convert_gml_to_tokens_EN {
573 $_ = shift @_;
574
575 # FIRST, remove GML tags
576
577 # Replace all whitespace with a simple space
578 s/\s+/ /gs;
579
580 # Remove everything that is in a tag
581 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
582 s/\s*<br>\s*/ LINEBREAK /isg;
583 s/<[^>]*>/ /sg;
584
585 # Now we have the text, but it may contain HTML
586 # elements coded as &gt; etc. Remove these tags.
587 s/&lt;/</sg;
588 s/&gt;/>/sg;
589
590 s/\s+/ /sg;
591 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
592 s/\s*<br>\s*/ LINEBREAK /isg;
593 s/<[^>]*>/ /sg;
594
595 # remove &amp; and other miscellaneous markup tags
596 s/&amp;/&/sg;
597 s/&lt;/</sg;
598 s/&gt;/>/sg;
599 s/&amp;/&/sg;
600
601# replace <p> and <br> placeholders with carriage returns
602 s/PARAGRAPHBREAK/\n/sg;
603 s/LINEBREAK/\n/sg;
604
605
606 # Exceptional punctuation
607 #
608 # We make special cases of some punctuation
609
610 # remove any apostrophe that indicates omitted letters
611 s/(\w+)\'(\w*\s)/ $1$2 /g;
612
614# remove periods that appear in a person's initials
614 s/\s([A-Z])\./ $1 /g;
615
617# replace hyphens in hyphenated words and names with a space
617 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
618
619# Convert the remaining text to "clause format".
620# This means removing all excess punctuation and garbage text,
621# normalising valid punctuation to full stops and commas,
622# then putting one clause on each line.
623
624 # Insert newline when the end of a sentence is detected
626# (delimiter is: "[\.\?\!]\s")
626 s/\s*[\.\?\!]\s+/\n/g;
627
628 # split numbers after four digits
629 s/(\d\d\d\d)/$1 /g;
630
631 # split words after 32 characters
632
633 # squash repeated punctuation
634 tr/A-Za-z0-9 //cs;
635
636 # save email addresses
637 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
638
639 # normalise clause breaks (mostly punctuation symbols) to commas
640 s/[^A-Za-z0-9 \n]+/ , /g;
641
642 # Remove repeated commas, and replace with newline
643 s/\s*,[, ]+/\n/g;
644
645 # remove extra whitespace
646 s/ +/ /sg;
647 s/^\s+//mg;
648 s/\s*$/\n/mg;
649
650 # remove lines that contain one word or less
651 s/^\w*$//mg;
652 s/^\s*$//mg;
653 tr/\n//s;
654
655 return $_;
656
657}
658
659
660
661# Execute a system command
662
663sub execute {
664 my ($command, $verbosity, $outhandle) = @_;
665 print $outhandle "Executing: $command\n" if ($verbosity > 2);
666 $! = 0;
667 my $status = system($command);
668 if ($status != 0) {
669 print STDERR "Phind - Error executing '$command': $!\n";
670 exit($status); # this causes the build to fail...
671 }
672}
673
674
675# Generate the vocabulary, symbol statistics, and numbers file from the
676# clauses file. This is legacy code, so it is a bit messy and probably won't
677# run under Windows.
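#
# In outline, it reads the clauses file and writes, into the phind build
# directory: clauses.vocab (one "best form" word per line, indexed by symbol
# number), clauses.stats (the symbol-range statistics), clauses.numbers (the
# clauses re-expressed as symbol numbers) and, when a thesaurus is configured,
# <thesaurus>.numbers.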
678
679sub extract_vocabulary {
680 my ($self) = @_;
681
682 my $verbosity = $self->{'verbosity'};
683 my $out = $self->{'outhandle'};
684
685 my $collectiondir = $self->{'collectiondir'};
686 my $phinddir = $self->{'phinddir'};
687
688 my $language_exp = $self->{'language'};
689
690 my ($w, $l, $line, $word);
691
692 my ($first_delimiter, $last_delimiter,
693 $first_stopword, $last_stopword,
694 $first_extractword, $last_extractword,
695 $first_contentword, $last_contentword,
696 $phrasedelimiter);
697
698 my $thesaurus = $self->{'thesaurus'};
699 my ($thesaurus_links, $thesaurus_terms,
700 %thesaurus, $first_thesaurusword, $last_thesaurusword);
701
702 my %symbol;
703 my (%freq);
704
705 print $out "Calculating vocabulary\n" if ($verbosity > 1);
706
707 # Read and store the stopwords
708 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
709 my $stopword_files = ();
710 my ($language, $language_dir, $file, $file_name);
711 my %stopwords;
712
713 # Examine each directory in the stopword directory
714 opendir(STOPDIR, $stopdir);
715 foreach $language (readdir STOPDIR) {
716
717 # Ignore entries that do not match the classifier's language
718 next unless ($language =~ /$language_exp/);
719 $language_dir = &util::filename_cat($stopdir, $language);
720 next unless (-d "$language_dir");
721
722 opendir(LANGDIR, $language_dir);
723 foreach $file (readdir LANGDIR) {
724
725 # Ignore entries that are not stopword files
726 next unless ($file =~ /sw$/);
727 $file_name = &util::filename_cat($language_dir, $file);
728 next unless (-f "$file_name");
729
730 # Read the stopwords
731 open(STOPFILE, "<$file_name");
732 while (<STOPFILE>) {
733 s/^\s+//;
734 s/\s.*//;
735 $word = $_;
736 $l = lc($word);
737 $stopwords{$l} = $word;
738 }
739 close STOPFILE;
740
741 }
742 closedir LANGDIR;
743 }
744 closedir STOPDIR;
745
746 # Read thesaurus information
747 if ($thesaurus) {
748
749 # link file exists
750 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
751 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
752
753 # ensure term file exists in the correct language
754 if ($language_exp =~ /^([a-z][a-z])/) {
755 $language = $1;
756 } else {
757 $language = 'en';
758 }
759 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
760 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
761
762
763 # Read the thesaurus terms
764 open(TH, "<$thesaurus_terms");
765 while(<TH>) {
766 s/^\d+ //;
767 s/\(.*\)//;
768 foreach $w (split(/\s+/, $_)) {
769 $thesaurus{lc($w)} = $w;
770 }
771 }
772 close TH;
773 }
774
775# Read words in the text and count occurrences
776 open(TXT, "<$phinddir/clauses");
777
778 my @words;
779 while(<TXT>) {
780 $line = $_;
781 next unless ($line =~ /./);
782
783 @words = split(/\s+/, $line);
784 foreach $w (@words) {
785 $l = lc($w);
786 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
787 $freq{$w}++;
788 }
789 $freq{$senlimit}++;
790 }
791
792 close TXT;
793
794 # Calculate the "best" form of each word
795 my (%bestform, %totalfreq, %bestfreq);
796
797 foreach $w (sort (keys %freq)) {
798 $l = lc($w);
799
800 # totalfreq is the number of times a term appears in any form
801 $totalfreq{$l} += $freq{$w};
802
803 if (defined $stopwords{$l}) {
804 $bestform{$l} = $stopwords{$l};
805
806 } elsif (defined $thesaurus{$l}) {
807 $bestform{$l} = $thesaurus{$l};
808
809 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
810 $bestfreq{$l} = $freq{$w};
811 $bestform{$l} = $w;
812 }
813 }
814 undef %freq;
815 undef %bestfreq;
816
817
818 # Assign symbol numbers to tokens
819 my $nextsymbol = 1;
820 my (@vocab);
821
822 # Delimiters
823 $first_delimiter = 1;
824
825 foreach $word (@delimiters) {
826
827# $word = lc($word); # jrm21
828 $word = uc($word);
829 $bestform{$word} = $word;
830 $vocab[$nextsymbol] = $word;
831 $symbol{$word} = $nextsymbol;
832 $nextsymbol++;
833 }
834 $last_delimiter = $nextsymbol - 1;
835 # Stopwords
836 $first_stopword = $nextsymbol;
837
838 foreach my $word (sort keys %stopwords) {
839 # don't include stopword unless it occurs in the text
840 $word = lc($word);
841 next unless ($totalfreq{$word});
842 next if ($symbol{$word});
843
844 $vocab[$nextsymbol] = $word;
845 $symbol{$word} = $nextsymbol;
846 $nextsymbol++;
847 }
848 $last_stopword = $nextsymbol - 1;
849 $first_contentword = $nextsymbol;
850
851 # Thesaurus terms
852 if ($thesaurus) {
853 $first_thesaurusword = $nextsymbol;
854
855 foreach my $word (sort keys %thesaurus) {
856
857 $word = lc($word);
858 next if ($symbol{$word});
859 $bestform{$word} = $thesaurus{$word};
860
861 $vocab[$nextsymbol] = $word;
862 $symbol{$word} = $nextsymbol;
863 $nextsymbol++;
864
865 }
866 $last_thesaurusword = $nextsymbol - 1;
867 }
868
869 # Other content words
870 $first_extractword = $nextsymbol;
871
872 foreach my $word (sort (keys %bestform)) {
873
874 next if ($symbol{$word});
875
876 $vocab[$nextsymbol] = $word;
877 $symbol{$word} = $nextsymbol;
878 $nextsymbol++;
879 }
880 $last_extractword = $nextsymbol - 1;
881 $last_contentword = $nextsymbol - 1;
882
883# Output the words
884 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
885 open(VOC, ">$phinddir/clauses.vocab");
886
887 for (my $i = 1; $i < $nextsymbol; $i++) {
888 $w = $vocab[$i];
889
890 print VOC "$bestform{$w}\n";
891 $totalfreq{$w} = 0 unless ($totalfreq{$w});
892 }
893 close VOC;
894
895
896 # Create statistics file
897 # Output statistics about the vocabulary
898 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
899 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
900
901 open(STAT, ">$phinddir/clauses.stats")
902 || die "Cannot open $phinddir/clauses.stats: $!";
903
904 print STAT "first_delimiter $first_delimiter\n";
905 print STAT "last_delimiter $last_delimiter\n";
906 print STAT "first_stopword $first_stopword\n";
907 print STAT "last_stopword $last_stopword\n";
908 if ($thesaurus) {
909 print STAT "first_thesaurusword $first_thesaurusword\n";
910 print STAT "last_thesaurusword $last_thesaurusword\n";
911 }
912 print STAT "first_extractword $first_extractword\n";
913 print STAT "last_extractword $last_extractword\n";
914 print STAT "first_contentword $first_contentword\n";
915 print STAT "last_contentword $last_contentword\n";
916 print STAT "first_symbol $first_delimiter\n";
917 print STAT "last_symbol $last_contentword\n";
918 print STAT "first_word $first_stopword\n";
919 print STAT "last_word $last_contentword\n";
920 close STAT;
921
922 undef @vocab;
923
924
925 # Create numbers file
926 # Save text as symbol numbers
927 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
928
929 open(TXT, "<$phinddir/clauses");
930 open(NUM, ">$phinddir/clauses.numbers");
931
932## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
933## print NUM "$symbol{lc($colstart)}\n"; # jrm21
934 $phrasedelimiter = $symbol{$senlimit};
935 print NUM "$symbol{$colstart}\n";
936
937 # set up the special symbols that delimit documents and sentences
938 while(<TXT>) {
939
940 # split sentence into a list of tokens
941 $line = $_;
942 next unless ($line =~ /./);
943 @words = split(/\s+/, $line);
944
945 # output one token at a time
946 foreach $word (@words) {
947# don't lower-case special delimiters - jrm21
948 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
949 $word = lc($word);
950 }
951 print NUM "$symbol{$word}\n";
952 }
953
954 # output phrase delimiter
955 print NUM "$phrasedelimiter\n";
956 }
957
958 close TXT;
959# print NUM "$symbol{lc($colend)}\n";# jrm21
960 print NUM "$symbol{$colend}\n";
961 close NUM;
962
963 # Save thesaurus data in one convenient file
964 if ($thesaurus) {
965
966 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
967
968
969 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
970 if ($verbosity > 1);
971
972 # Read the thesaurus terms
973 my ($num, $text, %thes_symbols);
974
975 open(TH, "<$thesaurus_terms");
976 while(<TH>) {
977 chomp;
978 @words = split(/\s+/, $_);
979 $num = shift @words;
980 $text = "";
981
982 # translate words into symbol numbers
983 foreach $word (@words) {
984 $word = lc($word);
985 if ($symbol{$word}) {
986 $text .= "s$symbol{$word} ";
987 } elsif ($verbosity) {
988 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
989 }
990 }
991 $text =~ s/ $//;
992 $thes_symbols{$num} = $text;
993 }
994 close TH;
995
996 # Read the thesaurus links and write the corresponding data
997 open(TH, "<$thesaurus_links");
998 open(THOUT, ">$thesaurusfile");
999
1000 while(<TH>) {
1001 chomp;
1002 ($num, $text) = split(/:/, $_);
1003
1004 if (defined($thes_symbols{$num})) {
1005 print THOUT "$num:$thes_symbols{$num}:$text\n";
1006 } else {
1007 print THOUT "$num:untranslated:$text\n";
1008 }
1009 }
1010 close TH;
1011 close THOUT;
1012 }
1013
1014
1015
1016
1017}
1018
1019
1020# renumber_phrases
1021#
1022# Prepare the phrases file to be input to mgpp. The biggest problem is
1023# reconciling the phrase identifiers used by the suffix program (which
1024# we'll call suffix-id numbers) with the numbers used in the thesaurus
1025# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1026# Phind-id numbers must be sorted by frequency of occurrence.
1027#
1028# Start creating a set of phind-id numbers from the sorted suffix-id
1029# numbers and (if required) the thesaurus-id numbers. Then add any other
1030# phrases occurring in the thesaurus.
1031#
1032# The last thing we have to do is restore the vocabulary information to the
1033# phrase file so that the phrases are stored as words, not as symbol
1034# numbers.
1035
1036# The original phrases file looks something like this:
1037# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1038# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1039# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1040# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
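#
# Reading each line as colon-separated fields, the layout (as consumed by
# renumber_suffix_data below) is:
#   suffix-id : symbols : total frequency : expansion count :
#   comma-separated expansion suffix-ids : document count :
#   semicolon-separated documents, each "dNUM" or "dNUM,frequency"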
1041
1042
1043sub renumber_phrases {
1044 my ($self) = @_;
1045
1046 renumber_suffix_data($self);
1047 renumber_thesaurus_data($self);
1048 restore_vocabulary_data($self);
1049
1050}
1051
1052
1053
1054# renumber_suffix_data
1055#
1056# Translate phrases file to phrases.2 using phind keys instead
1057# of suffix keys and sorting the expansion data.
1058
1059sub renumber_suffix_data {
1060 my ($self) = @_;
1061
1062 my $verbosity = $self->{'verbosity'};
1063 my $out = $self->{'outhandle'};
1064 print $out "Translate phrases: suffix-ids become phind-ids\n"
1065 if ($verbosity);
1066
1067 my $phinddir = $self->{'phinddir'};
1068 my $infile = &util::filename_cat($phinddir, 'phrases');
1069 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1070
1071 # Read the phrase file. Calculate initial set of phind-id
1072 # numbers and store (suffixid -> frequency) relation.
1073
1074 my %suffixtophind;
1075 my @totalfrequency;
1076 my (@fields, $suffixid);
1077 my $nextphind = 1;
1078
1079 open(IN, "<$infile");
1080 while(<IN>) {
1081
1082 chomp;
1083 @fields = split(/:/, $_);
1084
1085 # get next suffixid and phindid
1086 $suffixid = shift @fields;
1087 $suffixtophind{$suffixid} = $nextphind;
1088
1089 # store total frequency
1090 shift @fields;
1091 $totalfrequency[$nextphind] = shift @fields;
1092
1093 $nextphind++;
1094 }
1095 close IN;
1096
1097
1098 # Translate phrases file to phrases.2. Use phind keys (not suffix
1099# keys), sort expansion and document occurrence data in order of
1100# descending frequency.
1101 open(IN, "<$infile");
1102 open(OUT, ">$outfile");
1103
1104 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1105 my (@documents, @newexp, $k, $n);
1106 my $linenumber = 0;
1107
1108 while(<IN>) {
1109
1110 # read the line
1111 chomp;
1112 @fields = split(/:/, $_);
1113
1114 # get a phrase number for this line
1115 $suffixid = shift @fields;
1116 die unless (defined($suffixtophind{$suffixid}));
1117 $phindid = $suffixtophind{$suffixid};
1118
1119 # get the symbols in the phrase
1120 $text = shift @fields;
1121
1122 # output status information
1123 $linenumber++;
1124 if ($verbosity > 2) {
1125 if ($linenumber % 1000 == 0) {
1126 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1127 }
1128 # what are $num and $key??
1129 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1130 }
1131
1132 # get the phrase frequency
1133 $tf = shift @fields;
1134
1135 # get the number of expansions
1136 $countexp = shift @fields;
1137
1138 # get the expansions, convert them into phind-id numbers, and sort them
1139 $expansions = shift @fields;
1140 @newexp = ();
1141 foreach $k (split(/,/, $expansions)) {
1142 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1143 $n = $suffixtophind{$k};
1144 push @newexp, $n;
1145 }
1146 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1147
1148 # get the number of documents
1149 $countdocs = shift @fields;
1150
1151 # get the documents and sort them
1152 $documents = shift @fields;
1153 $documents =~ s/d//g;
1154 my @documents = split(/;/, $documents);
1155 @documents = sort by_doc_frequency @documents;
1156
1157 # output the phrase data
1158 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1159 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1160
1161 }
1162
1163 close IN;
1164 close OUT;
1165}
1166
1167
1168# renumber_thesaurus_data
1169#
1170# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1171
1172sub renumber_thesaurus_data {
1173 my ($self) = @_;
1174
1175 my $out = $self->{'outhandle'};
1176 my $verbosity = $self->{'verbosity'};
1177 my $thesaurus = $self->{'thesaurus'};
1178
1179 my $phinddir = $self->{'phinddir'};
1180 my $infile = &util::filename_cat($phinddir, "phrases.2");
1181 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1182
1183
1184 # If no thesaurus is defined, simply move the phrases file.
1185 if (!$thesaurus) {
1186 print $out "Translate phrases.2: no thesaurus data\n"
1187 if ($verbosity);
1188 &util::mv($infile, $outfile);
1189 return;
1190 }
1191
1192 print $out "Translate phrases.2: add thesaurus data\n"
1193 if ($verbosity);
1194
1195 # 1.
1196 # Read thesaurus file and store (symbols->thesaurusid) mapping
1197 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1198 my %symbolstothesid;
1199 my (@fields, $thesid, $symbols);
1200
1201 open(TH, "<$thesaurusfile");
1202
1203 while (<TH>) {
1204
1205 chomp;
1206 @fields = split(/:/, $_);
1207
1208 # get id and text
1209 $thesid = shift @fields;
1210 $symbols = shift @fields;
1211 $symbolstothesid{$symbols} = $thesid;
1212 }
1213 close TH;
1214
1215 # 2.
1216 # Read phrases file to find thesaurus entries that already
1217 # have a phindid. Store their phind-ids for later translation,
1218 # and store their frequency for later sorting.
1219 my %thesaurustophindid;
1220 my %phindidtofrequency;
1221 my ($phindid, $freq);
1222
1223 open(IN, "<$infile");
1224
1225 while(<IN>) {
1226
1227 chomp;
1228 @fields = split(/:/, $_);
1229
1230 # phindid and symbols for this line
1231 $phindid = shift @fields;
1232 $symbols = shift @fields;
1233 $freq = shift @fields;
1234
1235 # do we have a thesaurus id corresponding to this phrase?
1236 if (defined($symbolstothesid{$symbols})) {
1237 $thesid = $symbolstothesid{$symbols};
1238 $thesaurustophindid{$thesid} = $phindid;
1239 $phindidtofrequency{$phindid} = $freq;
1240 }
1241 }
1242 close IN;
1243
1244 undef %symbolstothesid;
1245
1246 # 3.
1247 # Create phind-id numbers for remaining thesaurus entries,
1248 # and note that their frequency is 0 for later sorting.
1249 my $nextphindid = $phindid + 1;
1250
1251 open(TH, "<$thesaurusfile");
1252 while(<TH>) {
1253
1254 chomp;
1255 @fields = split(/:/, $_);
1256
1257 # read thesaurus-id and ensure it has a corresponding phind-id
1258 $thesid = shift @fields;
1259 if (!defined($thesaurustophindid{$thesid})) {
1260 $thesaurustophindid{$thesid} = $nextphindid;
1261 $phindidtofrequency{$nextphindid} = 0;
1262 $nextphindid++;
1263 }
1264 }
1265 close TH;
1266
1267 # 4.
1268 # Translate thesaurus file, replacing thesaurus-id numbers with
1269 # phind-id numbers.
1270 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1271 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1272 my (@links, $linkid, %linkidtotype, $newrelation);
1273
1274 open(TH, "<$thesaurusfile");
1275 open(TO, ">$newthesaurusfile");
1276 while(<TH>) {
1277
1278 chomp;
1279 @fields = split(/:/, $_);
1280
1281 # phindid and symbols for this line
1282 ($thesid, $symbols, $relations) = @fields;
1283
1284 die unless ($thesid && $symbols);
1285 die unless $thesaurustophindid{$thesid};
1286 $phindid = $thesaurustophindid{$thesid};
1287
1288# convert each part of the relation string to use phind-id numbers.
1289# At the same time, we want to sort the list by frequency.
1290 undef %linkidtotype;
1291
1292 foreach $linktext (split(/;/, $relations)) {
1293 @linkdata = split(/,/, $linktext);
1294
1295 # remember the linktype (e.g. BT, NT)
1296 $linktype = shift @linkdata;
1297
1298 # store the type of each link
1299 foreach $thesid (@linkdata) {
1300 die unless (defined($thesaurustophindid{$thesid}));
1301 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1302 }
1303 }
1304
1305 # sort the list of links, first by frequency, then by type.
1306 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1307 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1308 $linkcounter = (scalar @links);
1309
1310 # create a string describing the link information
1311 $linktype = $linkidtotype{$links[0]};
1312 $newrelation = $linktype;
1313 foreach $linkid (@links) {
1314 if ($linkidtotype{$linkid} ne $linktype) {
1315 $linktype = $linkidtotype{$linkid};
1316 $newrelation .= ";" . $linktype;
1317 }
1318 $newrelation .= "," . $linkid;
1319 }
1320 $newrelation .= ";";
1321
1322
1323 # output the new line
1324 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1325 }
1326 close TH;
1327 close TO;
1328
1329 undef %thesaurustophindid;
1330 undef %linkidtotype;
1331 undef %phindidtofrequency;
1332
1333 # 5.
1334 # Read thesaurus data (in phind-id format) into memory
1335 my %thesaurusdata;
1336
1337 open(TH, "<$newthesaurusfile");
1338 while(<TH>) {
1339 chomp;
1340 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1341 die unless ($phindid && $symbols);
1342 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1343 }
1344 close TH;
1345
1346 # 6.
1347 # Add thesaurus data to phrases file
1348 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1349 my (@documents, @newexp, $k, $n);
1350 my $linenumber = 0;
1351
1352 open(IN, "<$infile");
1353 open(OUT, ">$outfile");
1354
1355 # Update existing phrases
1356 while(<IN>) {
1357
1358 chomp;
1359 @fields = split(/:/, $_);
1360
1361 # get data for this line
1362 $phindid = shift @fields;
1363
1364 # output the phrase data, with thesaurus information
1365 print OUT "$phindid:", join(":", @fields);
1366
1367 # add thesaurus data
1368 if (defined($thesaurusdata{$phindid})) {
1369 @fields = split(/:/, $thesaurusdata{$phindid});
1370 shift @fields;
1371 $linkcounter = shift @fields;
1372 $relations = shift @fields;
1373
1374 print OUT ":$linkcounter:$relations";
1375 $thesaurusdata{$phindid} = "";
1376 }
1377 print OUT "\n";
1378 }
1379 close IN;
1380
1381 # Add phrases that aren't already in the file
1382 foreach $phindid (sort numerically keys %thesaurusdata) {
1383 next unless ($thesaurusdata{$phindid});
1384
1385 @fields = split(/:/, $thesaurusdata{$phindid});
1386 $symbols = shift @fields;
1387 $linkcounter = shift @fields;
1388 $relations = shift @fields;
1389
1390 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1391 }
1392 close OUT;
1393
1394}
1395
1396# restore_vocabulary_data
1397#
1398# Read phrases.3 and restore vocabulary information. Then write
1399# this data to the MGPP input files (pword.txt and pdata.txt) and
1400# (if requested) to the saved phrases file.
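#
# Both MGPP input files hold one "<Document>" record per phrase: pdata.txt
# lines look like
#   <Document>phind-id:phrase text:tf:expansion count:document count:expansions:documents:...
# (with thesaurus link fields appended when present), while pword.txt lines
# are just "<Document>word", where word is (roughly) empty unless the phrase
# is a single word.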
1401
1402sub restore_vocabulary_data {
1403 my ($self) = @_;
1404
1405 my $out = $self->{'outhandle'};
1406 my $verbosity = $self->{'verbosity'};
1407 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1408
1409 my $phinddir = $self->{'phinddir'};
1410 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1411 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1412 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1413 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1414
1415 my $savephrases = $self->{'savephrases'};
1416
1417 # 1.
1418 # Read the vocabulary file
1419 open(V, "<$vocabfile")
1420 || die "Cannot open $vocabfile: $!";
1421 my @symbol;
1422 my $i = 1;
1423 while(<V>) {
1424 chomp;
1425 $symbol[$i++] = $_;
1426 }
1427 close V;
1428
1429 # 2.
1430 # Translate phrases.3 to MGPP input files
1431 my ($key, $text, $word, $isThesaurus, $line);
1432 my @fields;
1433 my $linenumber = 0;
1434
1435 open(IN, "<$infile");
1436 open(DATA, ">$datafile");
1437 open(WORD, ">$wordfile");
1438
1439 # Save the phrases in a separate text file
1440 if ($savephrases) {
1441 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1442 open(SAVE, ">$savephrases");
1443 }
1444
1445 while(<IN>) {
1446
1447 # read the line
1448 chomp;
1449 $line = $_;
1450 @fields = split(/:/, $line);
1451
1452 # get a phrase number for this line
1453 $key = shift @fields;
1454
1455 # restore the text of the phrase
1456 $text = shift @fields;
1457 $text =~ s/s(\d+)/$symbol[$1]/g;
1458 if ($text =~ / /) {
1459 $word = "";
1460 } elsif ($text ne 'untranslated') {
1461 $word = $text;
1462 }
1463
1464 # output the phrase data
1465 print DATA "<Document>";
1466 print DATA "$key:$text:", join(":", @fields), ":\n";
1467
1468 # output the word index search data
1469 print WORD "<Document>$word\n";
1470
1471 # output the phrases to a text file
1472 if ($savephrases) {
1473 if ((scalar @fields) == 7) {
1474 $isThesaurus = 1;
1475 } else {
1476 $isThesaurus = 0;
1477 }
1478 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1479 }
1480 }
1481 close IN;
1482 close WORD;
1483 close DATA;
1484 close SAVE if ($savephrases);
1485
1486}
1487
1488
1489
1490# sort routines used to renumber phrases
1491
1492sub numerically { $a <=> $b }
1493
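# Each document entry is either "NUM" (implicit frequency 1) or "NUM,freq";
# by_doc_frequency orders entries by that frequency, highest first. For
# example (illustrative), ("12,5", "7", "3,2") sorts to ("12,5", "3,2", "7").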
1494sub by_doc_frequency {
1495 my $fa = 1;
1496 if ($a =~ /,/) {
1497 $fa = $a;
1498 $fa =~ s/\d+,//;
1499 }
1500 my $fb = 1;
1501 if ($b =~ /,/) {
1502 $fb = $b;
1503 $fb =~ s/\d+,//;
1504 }
1505
1506 return ($fb <=> $fa);
1507}
1508
15091;