source: main/trunk/greenstone2/perllib/classify/Phind.pm

Last change on this file was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:keywords set to Author Date Id Revision
File size: 41.6 KB
1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind classifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
34use BaseClassifier;
35use FileUtils;
36use util;
37use ghtml;
38use unicode;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and vice versa
42
43my @removedirs = ();
44
45my %wanted_index_files = ('td'=>1,
46 't'=>1,
47 'ti'=>1,
48 'tl'=>1,
49 'tsd'=>1,
50 'idb'=>1,
51 'ib1'=>1,
52 'ib2'=>1,
53 'ib3'=>1,
54 'i'=>1,
55 'il'=>1,
56 'w'=>1,
57 'wa'=>1);
58
59sub BEGIN {
60 @Phind::ISA = ('BaseClassifier');
61}
62
63sub END {
64
65 # Tidy up stray files - we do this here as there's some weird problem
66 # preventing us from doing it in the get_classify_info() function (on
67 # windows at least) where the close() appears to fail on txthandle and
68 # dochandle, thus preventing us from deleting those files
69
70 foreach my $dir (@removedirs) {
71 if (-d $dir && opendir (DIR, $dir)) {
72 my @files = readdir DIR;
73 closedir DIR;
74
75 foreach my $file (@files) {
76 next if $file =~ /^\.\.?$/;
77 my ($suffix) = $file =~ /\.([^\.]+)$/;
78 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
79 # delete it!
80 &FileUtils::removeFiles (&FileUtils::filenameConcatenate ($dir, $file));
81 }
82 }
83 }
84 }
85}
86
87my $arguments =
88 [ { 'name' => "text",
89 'desc' => "{Phind.text}",
90 'type' => "string",
91 'deft' => "section:Title,section:text",
92 'reqd' => "no" },
93 { 'name' => "title",
94 'desc' => "{Phind.title}",
95 'type' => "metadata",
96 'deft' => "Title",
97 'reqd' => "no" },
98 { 'name' => "buttonname",
99 'desc' => "{BasClas.buttonname}",
100 'type' => "string",
101 'deft' => "Phrase",
102 'reqd' => "no" },
103 { 'name' => "language",
104 'desc' => "{Phind.language}",
105 'type' => "string",
106 'deft' => "en",
107 'reqd' => "no" },
108 { 'name' => "savephrases",
109 'desc' => "{Phind.savephrases}",
110 'type' => "string",
111 'deft' => "",
112 'reqd' => "no" },
113 { 'name' => "suffixmode",
114 'desc' => "{Phind.suffixmode}",
115 'type' => "int",
116 'deft' => "1",
117 'range' => "0,1",
118 'reqd' => "no" },
119 { 'name' => "min_occurs",
120 'desc' => "{Phind.min_occurs}",
121 'type' => "int",
122 'deft' => "2",
123 'range' => "1,",
124 'reqd' => "no" },
125 { 'name' => "thesaurus",
126 'desc' => "{Phind.thesaurus}",
127 'type' => "string",
128 'deft' => "",
129 'reqd' => "no" },
130 { 'name' => "untidy",
131 'desc' => "{Phind.untidy}",
132 'type' => "flag",
133 'reqd' => "no" } ];
134
135my $options = { 'name' => "Phind",
136 'desc' => "{Phind.desc}",
137 'abstract' => "no",
138 'inherits' => "yes",
139 'args' => $arguments };
140
141
142# Phrase delimiter symbols - these should be abstracted out someplace
143
144my $colstart = "COLLECTIONSTART";
145my $colend = "COLLECTIONEND";
146my $doclimit = "DOCUMENTLIMIT";
147my $senlimit = "SENTENCELIMIT";
148my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
149
150
151# Create a new Phind browser based on collect.cfg
152
153sub new {
154 my ($class) = shift (@_);
155 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
156 push(@$classifierslist, $class);
157
158 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
159 push(@{$hashArgOptLists->{"OptList"}},$options);
160
161 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
162
163 if ($self->{'info_only'}) {
164 # don't worry about any options etc
165 return bless $self, $class;
166 }
167
168 # Ensure the Phind generate scripts are in place
169
170 # the suffix binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, especially on Mac Lions (Android too?)
171 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
172
173 my $file1 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
174 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
175 my $src = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "src", "phind", "generate");
176 if (!(-e $file1)) {
177 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
178 exit(1);
179 }
180
181 # these values may have an 'ex.' prefix that needs to be stripped off
182 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
183 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
184
185 # Transfer the value from Auto Parsing to the variable name that was used in previous versions of Greenstone.
186
187 $self->{"indexes"} = $self->{"text"};
188
189 # Further setup
190 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
191 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
192 if (! defined $self->{'builddir'}) {
193 $self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "building");
194 }
195 $self->{'total'} = 0;
196
197 # we set phind to be rtl if language is arabic
198 if ($self->{'language'} eq "ar") {
199 $self->{'textorientation'} = "rtl";
200 }
201 # Clean out the unused keys
202 delete $self->{"text"};
203
204 return bless $self, $class;
205}
206
207
208# Initialise the Phind classifier
209
210sub init {
211 my $self = shift (@_);
212
213 # ensure we have a build directory
214 my $builddir = $self->{'builddir'};
215 die unless (-e "$builddir");
216
217 # create Phind directory
218 my $phnumber = 1;
219 my $phinddir = &FileUtils::filenameConcatenate($builddir, "phind1");
220 while (-e "$phinddir") {
221 $phnumber++;
222 $phinddir = &FileUtils::filenameConcatenate($builddir, "phind$phnumber");
223 }
224 &FileUtils::makeDirectory("$phinddir");
225 $self->{'phinddir'} = $phinddir;
226 $self->{'phindnumber'} = $phnumber;
227
228 push(@removedirs, $phinddir) unless $self->{'untidy'};
229
230 # open filehandles for documents and text
231 my $clausefile = &FileUtils::filenameConcatenate("$phinddir", "clauses");
232 &FileUtils::removeFiles($clausefile) if (-e $clausefile);
233
234 my $txthandle = 'TEXT' . $phnumber;
235 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
236 $self->{'txthandle'} = $txthandle;
237
238 my $docfile = &FileUtils::filenameConcatenate("$phinddir", "docs.txt");
239 &FileUtils::removeFiles($docfile) if (-e $docfile);
240
241 my $dochandle = 'DOC' . $phnumber;
242 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
243 $self->{'dochandle'} = $dochandle;
244
245}
246
247
248# Classify each document.
249#
250# Each document is passed here in turn. The classifier extracts the
251# text of each and stores it in the clauses file. Document details are
252# stored in the docs.txt file.
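#
# As an illustrative sketch (the OID and title below are hypothetical), the
# two files this classifier writes to per document look like:
#
#   docs.txt   <Document>\tHASH0123456789abcdef\tAn Example Title
#   clauses    a DOCUMENTLIMIT line, followed by the tokenised text of the
#              document, one clause per line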
253
254sub classify {
255 my $self = shift (@_);
256 my ($doc_obj) = @_;
257
258 my $verbosity = $self->{'verbosity'};
259 my $top_section = $doc_obj->get_top_section();
260
261 my $titlefield = $self->{'title'};
262
263 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
264 if (!defined($title)) {
265 $title = "";
266 print STDERR "Phind: document has no title\n";
267 }
268 print "process: $title\n" if ($verbosity > 2);
269
270 # Only consider the file if it is in the correct language
271 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
272 my $phrlanguage = $self->{'language'};
273 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
274
275 # record this file
276 $self->{'total'} ++;
277 # what is $file ???
278 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
279
280 # Store document details
281 my $OID = $doc_obj->get_OID();
282 $OID = "NULL" unless defined $OID;
283 my $dochandle = $self->{'dochandle'};
284 print $dochandle "<Document>\t$OID\t$title\n";
285
286 # Store the text occurring in this object
287
288 # output the document delimiter
289 my $txthandle = $self->{'txthandle'};
290 print $txthandle "$doclimit\n";
291
292 # iterate over the required indexes and store their text
293 my $indexes = $self->{'indexes'};
294 my $text = "";
295 my ($part, $level, $field, $section, $data, $dataref);
296
297 foreach $part (split(/,/, $indexes)) {
298
299 # Each field has a level and a data element (e.g. document:Title)
300 ($level, $field) = split(/:/, $part);
301 die unless ($level && $field);
302
303 # Extract the text from every section
304 # (In phind, document:text and section:text are equivalent)
305 if ($field eq "text") {
306 $data = "";
307 $section = $doc_obj->get_top_section();
308 while (defined($section)) {
309 $data .= $doc_obj->get_text($section) . "\n";
310 $section = $doc_obj->get_next_section($section);
311 }
312 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
313 }
314
315 # Extract a metadata field from a document
316 # (If there is more than one element of the given type, get them all.)
317 elsif ($level eq "document") {
318 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
319 foreach $data (@$dataref) {
320 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
321 }
322 }
323
324 # Extract metadata from every section in a document
325 elsif ($level eq "section") {
326 $data = "";
327 $section = $doc_obj->get_top_section();
328 while (defined($section)) {
329 $dataref = $doc_obj->get_metadata($section, $field);
330 $data .= join("\n", @$dataref) . "\n";
331 $section = $doc_obj->get_next_section($section);
332 }
333 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
334 }
335
336 # Some sort of specification which I don't understand
337 else {
338 die "Unknown level ($level) in Phind index ($part)\n";
339 }
340
341 }
342
343 # output the text
344 $text =~ tr/\n//s;
345 print $txthandle "$text";
346}
347
348
349# Construct the classifier from the information already gathered
350#
351# When get_classify_info is called, the clauses and docs.txt files have
352# already been constructed in the Phind directory. This function will
353# translate them into compressed, indexed MGPP files that can be read by
354# the phindcgi script. It will also register our classifier so that it
355# shows up in the navigation bar.
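#
# In outline (a summary of the steps below, not new behaviour), the data flow is:
#   clauses -> extract_vocabulary() -> clauses.vocab, clauses.stats, clauses.numbers
#   the suffix program (reading the phind directory) -> phrases
#   phrases -> renumber_phrases() -> phrases.2, phrases.3, pdata.txt, pword.txt
#   pdata.txt, pword.txt, docs.txt -> mgpp_* tools -> compressed, indexed
#   phrase, word-level and document databases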
356
357sub get_classify_info {
358 my $self = shift (@_);
359 my ($gli) = @_;
360
361 close $self->{'dochandle'};
362 close $self->{'txthandle'};
363 my $verbosity = $self->{'verbosity'};
364 my $out = $self->{'outhandle'};
365 my $phinddir = $self->{'phinddir'};
366
367 my $osextra = "";
368 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
369 $osextra = " -d /";
370 }
371
372 print STDERR "</Stage>\n" if $gli;
373
374 if ($verbosity) {
375 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
376 print $out "*** in ", $self->{'phinddir'}, "\n";
377 }
378
379 print STDERR "<Stage name='Phind'>\n" if $gli;
380
381 # Construct phind indexes
382 my $suffixmode = $self->{'suffixmode'};
383 my $min_occurs = $self->{'min_occurs'};
384 my ($command, $status);
385
386 # Generate the vocabulary, symbol statistics, and numbers file
387 # from the clauses file
388 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
389 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
390 &extract_vocabulary($self);
391
392 # Use the suffix program to generate the phind/phrases file
393 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
394 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
395 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
396
397 # check that we generated some files. It's not necessarily an error if
398 # we didn't (execute() would have quit on error), but we can't go on.
399 my $phrasesfile=&FileUtils::filenameConcatenate($self->{'phinddir'}, 'phrases');
400 if (! -r $phrasesfile) {
401 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
402 print $out "\nNo phrases found for Phind classifier!\n";
403 return;
404 }
405
406 # Create the phrase file and put phrase numbers in phind/phrases
407 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
408 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
409 &renumber_phrases($self);
410
411 print $out "\nCreating phrase databases\n";
412 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
413 my $mg_input = &FileUtils::filenameConcatenate($phinddir, "pdata.txt");
414 my $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pdata");
415
416 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
417 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
418 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
419
420 # create the mg index of words
421 print $out "\nCreating word-level search indexes\n";
422 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
423 $mg_input = &FileUtils::filenameConcatenate($phinddir, "pword.txt");
424 $mg_stem = &FileUtils::filenameConcatenate($phinddir, "pword");
425
426 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
427 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
428 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
429 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
430 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
431 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
432
433 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
434 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
435 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
436
437 # create the mg document information database
438 print $out "\nCreating document information databases\n";
439 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
440 $mg_input = &FileUtils::filenameConcatenate($phinddir, "docs.txt");
441 $mg_stem = &FileUtils::filenameConcatenate($phinddir, "docs");
442
443 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
444 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
445 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
446
447 my $parameters = "phindnumber=$self->{'phindnumber'}";
448 if (defined ($self->{'textorientation'})) {
449 $parameters .= ";textorientation=$self->{'textorientation'}";
450 }
451 # Return the information about the classifier that we'll later want to
452 # use to create macros when the Phind classifier document is displayed.
453 my %classifyinfo = ('thistype'=>'Invisible',
454 'childtype'=>'Phind',
455 'Title'=>$self->{'buttonname'},
456 'parameters'=>$parameters,
457 'contains'=>[]);
458
459 my $collection = $self->{'collection'};
460 my $url = "library?a=p&p=phind&c=$collection";
461 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
462
463 return \%classifyinfo;
464}
465
466
467
468sub convert_gml_to_tokens {
469
470 my ($language_exp, $text) = @_;
471
472 # escape any magic words... - jrm21
473 foreach my $delim (@delimiters) {
474 my $replacement=lc($delim);
475 my $num= $text=~ s/$delim/$replacement/g;
476 if (!$num) {$num=0;}
477 }
478
479 if ($language_exp =~ /^en$/) {
480 return &convert_gml_to_tokens_EN($text);
481 }
482
483 if ($language_exp =~ /zh/) {
484 return &convert_gml_to_tokens_ZH($text);
485 }
486
487 $_ = $text;
488
489 # 1. remove GML tags
490
491 # Remove everything that is in a tag
492 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
493 s/\s*<br>\s*/ LINEBREAK /isgo;
494 s/<[^>]*>/ /sgo;
495
496 # Now we have the text, but it may contain HTML
497 # elements coded as &gt; etc. Remove these tags.
498 s/&amp;/&/sgo;
499 s/&lt;/</sgo;
500 s/&gt;/>/sgo;
501 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
502 s/\s*<br>\s*/ LINEBREAK /isgo;
503 s/<[^>]*>/ /sgo;
504
505 # replace <p> and <br> placeholders with the clause break symbol (\n)
506 s/\s+/ /gso;
507 s/PARAGRAPHBREAK/\n/sgo;
508 s/LINEBREAK/\n/sgo;
509
510
511 # 2. Split the remaining text into space-delimited tokens
512
513 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
514 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
515
516 # Split text at word boundaries
517 s/\b/ /go;
518
519 # 3. Convert the remaining text to "clause format"
520
521 # Insert newline if the end of a sentence is detected
522 # (delimiter is: "[\.\?\!]\s")
523 # s/\s*[\.\?\!]\s+/\n/go;
524
525 # remove unnecessary punctuation and replace with clause break symbol (\n)
526 # the following very nicely removes all non-alphanumeric characters. too bad if you are not using English...
527 #s/[^\w ]/\n/go;
528 # replace punct with new lines - is this what we want??
529 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
530 # then remove other punct with space
531 s/[\'\`\\\_]/ /go;
532
533 # remove extraneous whitespace
534 s/ +/ /sgo;
535 s/^\s+//mgo;
536 s/\s*$/\n/mgo;
537
538 # remove lines that contain one word or less
539 s/^\S*$//mgo;
540 s/^\s*$//mgo;
541 tr/\n//s;
542
543 return $_;
544}
545
546# a Chinese version
547sub convert_gml_to_tokens_ZH {
548
549 $_ = shift @_;
550
551 # Replace all whitespace with a simple space
552 s/\s+/ /gs;
553 # Remove everything that is in a tag
554 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
555 s/\s*<br>\s*/ LINEBREAK /isg;
556 s/<[^>]*>/ /sg;
557
558 # Now we have the text, but it may contain HTML
559 # elements coded as &gt; etc. Remove these tags.
560 s/&lt;/</sg;
561 s/&gt;/>/sg;
562
563 s/\s+/ /sg;
564 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
565 s/\s*<br>\s*/ LINEBREAK /isg;
566 s/<[^>]*>/ /sg;
567
568 # remove &amp; and other miscellaneous markup tags
569 s/&amp;/&/sg;
570 s/&lt;/</sg;
571 s/&gt;/>/sg;
572 s/&amp;/&/sg;
573
574 # replace <p> and <br> placeholders with newlines
575 s/PARAGRAPHBREAK/\n/sg;
576 s/LINEBREAK/\n/sg;
577
578
579# print STDERR "text:$_\n";
580 return $_;
581}
582
583# A version of convert_gml_to_tokens that is fine-tuned to the English language.
584
585sub convert_gml_to_tokens_EN {
586 $_ = shift @_;
587
588 # FIRST, remove GML tags
589
590 # Replace all whitespace with a simple space
591 s/\s+/ /gs;
592
593 # Remove everything that is in a tag
594 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
595 s/\s*<br>\s*/ LINEBREAK /isg;
596 s/<[^>]*>/ /sg;
597
598 # Now we have the text, but it may contain HTML
599 # elements coded as &gt; etc. Remove these tags.
600 s/&lt;/</sg;
601 s/&gt;/>/sg;
602
603 s/\s+/ /sg;
604 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
605 s/\s*<br>\s*/ LINEBREAK /isg;
606 s/<[^>]*>/ /sg;
607
608 # remove &amp; and other miscellaneous markup tags
609 s/&amp;/&/sg;
610 s/&lt;/</sg;
611 s/&gt;/>/sg;
612 s/&amp;/&/sg;
613
614 # replace <p> and <br> placeholders with newlines
615 s/PARAGRAPHBREAK/\n/sg;
616 s/LINEBREAK/\n/sg;
617
618
619 # Exceptional punctuation
620 #
621 # We make special cases of some punctuation
622
623 # remove any apostrophe that indicates omitted letters
624 s/(\w+)\'(\w*\s)/ $1$2 /g;
625
626 # remove the period that appears in a person's initials
627 s/\s([A-Z])\./ $1 /g;
628
629 # replace hyphens in hyphenated words and names with a space
630 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
631
632 # Convert the remaining text to "clause format".
633 # This means removing all excess punctuation and garbage text,
634 # normalising valid punctuation to full stops and commas,
635 # then putting one clause on each line.
636
637 # Insert newline when the end of a sentence is detected
638 # (delimiter is: "[\.\?\!]\s")
639 s/\s*[\.\?\!]\s+/\n/g;
640
641 # split numbers after four digits
642 s/(\d\d\d\d)/$1 /g;
643
644 # split words after 32 characters
645
646 # squash repeated punctuation
647 tr/A-Za-z0-9 //cs;
648
649 # save email addresses
650 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
651
652 # normalise clause breaks (mostly punctuation symbols) to commas
653 s/[^A-Za-z0-9 \n]+/ , /g;
654
655 # Remove repeated commas, and replace with newline
656 s/\s*,[, ]+/\n/g;
657
658 # remove extra whitespace
659 s/ +/ /sg;
660 s/^\s+//mg;
661 s/\s*$/\n/mg;
662
663 # remove lines that contain one word or less
664 s/^\w*$//mg;
665 s/^\s*$//mg;
666 tr/\n//s;
667
668 return $_;
669
670}
671
672
673
674# Execute a system command
675
676sub execute {
677 my ($command, $verbosity, $outhandle) = @_;
678 print $outhandle "Executing: $command\n" if ($verbosity > 2);
679 $! = 0;
680 my $status = system($command);
681 if ($status != 0) {
682 print STDERR "Phind - Error executing '$command': $!\n";
683 exit($status); # this causes the build to fail...
684 }
685}
686
687
688# Generate the vocabulary, symbol statistics, and numbers file from the
689# clauses file. This is legacy code, so it is a bit messy and probably won't
690# run under Windows.
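#
# The clauses.stats file written below records the symbol-number ranges for
# each class of token; as a purely illustrative example (the counts are
# hypothetical), a build with the four delimiters and 120 stopwords that
# actually occur in the text would start with:
#   first_delimiter 1
#   last_delimiter 4
#   first_stopword 5
#   last_stopword 124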
691
692sub extract_vocabulary {
693 my ($self) = @_;
694
695 my $verbosity = $self->{'verbosity'};
696 my $out = $self->{'outhandle'};
697
698 my $collectiondir = $self->{'collectiondir'};
699 my $phinddir = $self->{'phinddir'};
700
701 my $language_exp = $self->{'language'};
702
703 my ($w, $l, $line, $word);
704
705 my ($first_delimiter, $last_delimiter,
706 $first_stopword, $last_stopword,
707 $first_extractword, $last_extractword,
708 $first_contentword, $last_contentword,
709 $phrasedelimiter);
710
711 my $thesaurus = $self->{'thesaurus'};
712 my ($thesaurus_links, $thesaurus_terms,
713 %thesaurus, $first_thesaurusword, $last_thesaurusword);
714
715 my %symbol;
716 my (%freq);
717
718 print $out "Calculating vocabulary\n" if ($verbosity > 1);
719
720 # Read and store the stopwords
721 my $stopdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
722 my $stopword_files = ();
723 my ($language, $language_dir, $file, $file_name);
724 my %stopwords;
725
726 # Examine each directory in the stopword directory
727 opendir(STOPDIR, $stopdir);
728 foreach $language (readdir STOPDIR) {
729
730 # Ignore entries that do not match the classifier's language
731 next unless ($language =~ /$language_exp/);
732 $language_dir = &FileUtils::filenameConcatenate($stopdir, $language);
733 next unless (-d "$language_dir");
734
735 opendir(LANGDIR, $language_dir);
736 foreach $file (readdir LANGDIR) {
737
738 # Ignore entries that are not stopword files
739 next unless ($file =~ /sw$/);
740 $file_name = &FileUtils::filenameConcatenate($language_dir, $file);
741 next unless (-f "$file_name");
742
743 # Read the stopwords
744 open(STOPFILE, "<$file_name");
745 while (<STOPFILE>) {
746 s/^\s+//;
747 s/\s.*//;
748 $word = $_;
749 $l = lc($word);
750 $stopwords{$l} = $word;
751 }
752 close STOPFILE;
753
754 }
755 closedir LANGDIR;
756 }
757 closedir STOPDIR;
758
759 # Read thesaurus information
760 if ($thesaurus) {
761
762 # link file exists
763 $thesaurus_links = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.lnk");
764 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
765
766 # ensure term file exists in the correct language
767 if ($language_exp =~ /^([a-z][a-z])/) {
768 $language = $1;
769 } else {
770 $language = 'en';
771 }
772 $thesaurus_terms = &FileUtils::filenameConcatenate($collectiondir, "etc", "$thesaurus.$language");
773 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
774
775
776 # Read the thesaurus terms
777 open(TH, "<$thesaurus_terms");
778 while(<TH>) {
779 s/^\d+ //;
780 s/\(.*\)//;
781 foreach $w (split(/\s+/, $_)) {
782 $thesaurus{lc($w)} = $w;
783 }
784 }
785 close TH;
786 }
787
788 # Read the words in the text and count occurrences
789 open(TXT, "<$phinddir/clauses");
790
791 my @words;
792 while(<TXT>) {
793 $line = $_;
794 next unless ($line =~ /./);
795
796 @words = split(/\s+/, $line);
797 foreach $w (@words) {
798 $l = lc($w);
799 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
800 $freq{$w}++;
801 }
802 $freq{$senlimit}++;
803 }
804
805 close TXT;
806
807 # Calculate the "best" form of each word
808 my (%bestform, %totalfreq, %bestfreq);
809
810 foreach $w (sort (keys %freq)) {
811 $l = lc($w);
812
813 # totalfreq is the number of times a term appears in any form
814 $totalfreq{$l} += $freq{$w};
815
816 if (defined $stopwords{$l}) {
817 $bestform{$l} = $stopwords{$l};
818
819 } elsif (defined $thesaurus{$l}) {
820 $bestform{$l} = $thesaurus{$l};
821
822 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
823 $bestfreq{$l} = $freq{$w};
824 $bestform{$l} = $w;
825 }
826 }
827 undef %freq;
828 undef %bestfreq;
829
830
831 # Assign symbol numbers to tokens
832 my $nextsymbol = 1;
833 my (@vocab);
834
835 # Delimiters
836 $first_delimiter = 1;
837
838 foreach $word (@delimiters) {
839
840# $word = lc($word); # jrm21
841 $word = uc($word);
842 $bestform{$word} = $word;
843 $vocab[$nextsymbol] = $word;
844 $symbol{$word} = $nextsymbol;
845 $nextsymbol++;
846 }
847 $last_delimiter = $nextsymbol - 1;
848 # Stopwords
849 $first_stopword = $nextsymbol;
850
851 foreach my $word (sort keys %stopwords) {
852 # don't include stopword unless it occurs in the text
853 $word = lc($word);
854 next unless ($totalfreq{$word});
855 next if ($symbol{$word});
856
857 $vocab[$nextsymbol] = $word;
858 $symbol{$word} = $nextsymbol;
859 $nextsymbol++;
860 }
861 $last_stopword = $nextsymbol - 1;
862 $first_contentword = $nextsymbol;
863
864 # Thesaurus terms
865 if ($thesaurus) {
866 $first_thesaurusword = $nextsymbol;
867
868 foreach my $word (sort keys %thesaurus) {
869
870 $word = lc($word);
871 next if ($symbol{$word});
872 $bestform{$word} = $thesaurus{$word};
873
874 $vocab[$nextsymbol] = $word;
875 $symbol{$word} = $nextsymbol;
876 $nextsymbol++;
877
878 }
879 $last_thesaurusword = $nextsymbol - 1;
880 }
881
882 # Other content words
883 $first_extractword = $nextsymbol;
884
885 foreach my $word (sort (keys %bestform)) {
886
887 next if ($symbol{$word});
888
889 $vocab[$nextsymbol] = $word;
890 $symbol{$word} = $nextsymbol;
891 $nextsymbol++;
892 }
893 $last_extractword = $nextsymbol - 1;
894 $last_contentword = $nextsymbol - 1;
895
896 # Output the words
897 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
898 open(VOC, ">$phinddir/clauses.vocab");
899
900 for (my $i = 1; $i < $nextsymbol; $i++) {
901 $w = $vocab[$i];
902
903 print VOC "$bestform{$w}\n";
904 $totalfreq{$w} = 0 unless ($totalfreq{$w});
905 }
906 close VOC;
907
908
909 # Create statistics file
910 # Output statistics about the vocabulary
911 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
912 &FileUtils::removeFiles("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
913
914 open(STAT, ">$phinddir/clauses.stats")
915 || die "Cannot open $phinddir/clauses.stats: $!";
916
917 print STAT "first_delimiter $first_delimiter\n";
918 print STAT "last_delimiter $last_delimiter\n";
919 print STAT "first_stopword $first_stopword\n";
920 print STAT "last_stopword $last_stopword\n";
921 if ($thesaurus) {
922 print STAT "first_thesaurusword $first_thesaurusword\n";
923 print STAT "last_thesaurusword $last_thesaurusword\n";
924 }
925 print STAT "first_extractword $first_extractword\n";
926 print STAT "last_extractword $last_extractword\n";
927 print STAT "first_contentword $first_contentword\n";
928 print STAT "last_contentword $last_contentword\n";
929 print STAT "first_symbol $first_delimiter\n";
930 print STAT "last_symbol $last_contentword\n";
931 print STAT "first_word $first_stopword\n";
932 print STAT "last_word $last_contentword\n";
933 close STAT;
934
935 undef @vocab;
936
937
938 # Create numbers file
939 # Save text as symbol numbers
940 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
941
942 open(TXT, "<$phinddir/clauses");
943 open(NUM, ">$phinddir/clauses.numbers");
944
945## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
946## print NUM "$symbol{lc($colstart)}\n"; # jrm21
947 $phrasedelimiter = $symbol{$senlimit};
948 print NUM "$symbol{$colstart}\n";
949
950 # translate each clause into a list of symbol numbers
951 while(<TXT>) {
952
953 # split sentence into a list of tokens
954 $line = $_;
955 next unless ($line =~ /./);
956 @words = split(/\s+/, $line);
957
958 # output one token at a time
959 foreach $word (@words) {
960# don't lower-case special delimiters - jrm21
961 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
962 $word = lc($word);
963 }
964 print NUM "$symbol{$word}\n";
965 }
966
967 # output phrase delimiter
968 print NUM "$phrasedelimiter\n";
969 }
970
971 close TXT;
972# print NUM "$symbol{lc($colend)}\n";# jrm21
973 print NUM "$symbol{$colend}\n";
974 close NUM;
975
976 # Save thesaurus data in one convenient file
977 if ($thesaurus) {
978
979 my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
980
981
982 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
983 if ($verbosity > 1);
984
985 # Read the thesaurus terms
986 my ($num, $text, %thes_symbols);
987
988 open(TH, "<$thesaurus_terms");
989 while(<TH>) {
990 chomp;
991 @words = split(/\s+/, $_);
992 $num = shift @words;
993 $text = "";
994
995 # translate words into symbol numbers
996 foreach $word (@words) {
997 $word = lc($word);
998 if ($symbol{$word}) {
999 $text .= "s$symbol{$word} ";
1000 } elsif ($verbosity) {
1001 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
1002 }
1003 }
1004 $text =~ s/ $//;
1005 $thes_symbols{$num} = $text;
1006 }
1007 close TH;
1008
1009 # Read the thesaurus links and write the corresponding data
1010 open(TH, "<$thesaurus_links");
1011 open(THOUT, ">$thesaurusfile");
1012
1013 while(<TH>) {
1014 chomp;
1015 ($num, $text) = split(/:/, $_);
1016
1017 if (defined($thes_symbols{$num})) {
1018 print THOUT "$num:$thes_symbols{$num}:$text\n";
1019 } else {
1020 print THOUT "$num:untranslated:$text\n";
1021 }
1022 }
1023 close TH;
1024 close THOUT;
1025 }
1026
1027
1028
1029
1030}
1031
1032
1033# renumber_phrases
1034#
1035# Prepare the phrases file to be input to mgpp. The biggest problem is
1036# reconciling the phrase identifiers used by the suffix program (which
1037# we'll call suffix-id numbers) with the numbers used in the thesaurus
1038# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
1039# Phind-id numbers must be sorted by frequency of occurrence.
1040#
1041# Start creating a set of phind-id numbers from the sorted suffix-id
1042# numbers and (if required) the thesaurus-id numbers. Then add any other
1043# phrases occurring in the thesaurus.
1044#
1045# The last thing we have to do is restore the vocabulary information to the
1046# phrase file so that the phrases are stored as words, not as symbol
1047# numbers.
1048
1049# The original phrases file looks something like this:
1050# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1051# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1052# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1053# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
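#
# Reading the colon-separated fields of the first line above in the order they
# are parsed by renumber_suffix_data() below: suffix-id (159396-1), phrase
# symbols (s5175), total frequency (4), expansion count (1), expansions as
# comma-separated suffix-ids (116149-2), document count (3), and documents as
# semicolon-separated entries of the form dNNN, optionally followed by
# ",frequency" (d2240,2;d2253;d2254).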
1054
1055
1056sub renumber_phrases {
1057 my ($self) = @_;
1058
1059 renumber_suffix_data($self);
1060 renumber_thesaurus_data($self);
1061 restore_vocabulary_data($self);
1062
1063}
1064
1065
1066
1067# renumber_suffix_data
1068#
1069# Translate phrases file to phrases.2 using phind keys instead
1070# of suffix keys and sorting the expansion data.
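#
# Based on the print statements at the end of this subroutine, each phrases.2
# line has the form
#   phindid:symbols:tf:countexp:countdocs:exp1,exp2,...,:doc1;doc2;...;
# so the first example line above might become (the phind-ids shown are
# hypothetical):
#   1:s5175:4:1:3:7,:2240,2;2253;2254;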
1071
1072sub renumber_suffix_data {
1073 my ($self) = @_;
1074
1075 my $verbosity = $self->{'verbosity'};
1076 my $out = $self->{'outhandle'};
1077 print $out "Translate phrases: suffix-ids become phind-id's\n"
1078 if ($verbosity);
1079
1080 my $phinddir = $self->{'phinddir'};
1081 my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases');
1082 my $outfile = &FileUtils::filenameConcatenate($phinddir, 'phrases.2');
1083
1084 # Read the phrase file. Calculate initial set of phind-id
1085 # numbers and store (suffixid -> frequency) relation.
1086
1087 my %suffixtophind;
1088 my @totalfrequency;
1089 my (@fields, $suffixid);
1090 my $nextphind = 1;
1091
1092 open(IN, "<$infile");
1093 while(<IN>) {
1094
1095 chomp;
1096 @fields = split(/:/, $_);
1097
1098 # get next suffixid and phindid
1099 $suffixid = shift @fields;
1100 $suffixtophind{$suffixid} = $nextphind;
1101
1102 # store total frequency
1103 shift @fields;
1104 $totalfrequency[$nextphind] = shift @fields;
1105
1106 $nextphind++;
1107 }
1108 close IN;
1109
1110
1111 # Translate phrases file to phrases.2. Use phind keys (not suffix
1112# keys), sort expansion and document occurrence data in order of
1113# descending frequency.
1114 open(IN, "<$infile");
1115 open(OUT, ">$outfile");
1116
1117 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1118 my (@documwents, @newexp, $k, $n);
1119 my $linenumber = 0;
1120
1121 while(<IN>) {
1122
1123 # read the line
1124 chomp;
1125 @fields = split(/:/, $_);
1126
1127 # get a phrase number for this line
1128 $suffixid = shift @fields;
1129 die unless (defined($suffixtophind{$suffixid}));
1130 $phindid = $suffixtophind{$suffixid};
1131
1132 # get the symbols in the phrase
1133 $text = shift @fields;
1134
1135 # output status information
1136 $linenumber++;
1137 if ($verbosity > 2) {
1138 if ($linenumber % 1000 == 0) {
1139 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1140 }
1141 # what are $num and $key??
1142 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1143 }
1144
1145 # get the phrase frequency
1146 $tf = shift @fields;
1147
1148 # get the number of expansions
1149 $countexp = shift @fields;
1150
1151 # get the expansions, convert them into phind-id numbers, and sort them
1152 $expansions = shift @fields;
1153 @newexp = ();
1154 foreach $k (split(/,/, $expansions)) {
1155 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1156 $n = $suffixtophind{$k};
1157 push @newexp, $n;
1158 }
1159 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1160
1161 # get the number of documents
1162 $countdocs = shift @fields;
1163
1164 # get the documents and sort them
1165 $documents = shift @fields;
1166 $documents =~ s/d//g;
1167 my @documents = split(/;/, $documents);
1168 @documents = sort by_doc_frequency @documents;
1169
1170 # output the phrase data
1171 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1172 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1173
1174 }
1175
1176 close IN;
1177 close OUT;
1178}
1179
1180
1181# renumber_thesaurus_data
1182#
1183# Translate phrases.2 to phrases.3, adding thesaurus data if available.
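#
# When thesaurus data is present, each existing phrases.2 line gains two extra
# fields, ":linkcounter:relations", and thesaurus-only phrases are appended in
# the form
#   phindid:symbols:0:0:0:::linkcounter:relations
# (see the print statements at the end of this subroutine).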
1184
1185sub renumber_thesaurus_data {
1186 my ($self) = @_;
1187
1188 my $out = $self->{'outhandle'};
1189 my $verbosity = $self->{'verbosity'};
1190 my $thesaurus = $self->{'thesaurus'};
1191
1192 my $phinddir = $self->{'phinddir'};
1193 my $infile = &FileUtils::filenameConcatenate($phinddir, "phrases.2");
1194 my $outfile = &FileUtils::filenameConcatenate($phinddir, "phrases.3");
1195
1196
1197 # If no thesaurus is defined, simply move the phrases file.
1198 if (!$thesaurus) {
1199 print $out "Translate phrases.2: no thesaurus data\n"
1200 if ($verbosity);
1201 &FileUtils::moveFiles($infile, $outfile);
1202 return;
1203 }
1204
1205 print $out "Translate phrases.2: add thesaurus data\n"
1206 if ($verbosity);
1207
1208 # 1.
1209 # Read thesaurus file and store (symbols->thesaurusid) mapping
1210 my $thesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.numbers");
1211 my %symbolstothesid;
1212 my (@fields, $thesid, $symbols);
1213
1214 open(TH, "<$thesaurusfile");
1215
1216 while (<TH>) {
1217
1218 chomp;
1219 @fields = split(/:/, $_);
1220
1221 # get id and text
1222 $thesid = shift @fields;
1223 $symbols = shift @fields;
1224 $symbolstothesid{$symbols} = $thesid;
1225 }
1226 close TH;
1227
1228 # 2.
1229 # Read phrases file to find thesaurus entries that already
1230 # have a phindid. Store their phind-ids for later translation,
1231 # and store their frequency for later sorting.
1232 my %thesaurustophindid;
1233 my %phindidtofrequency;
1234 my ($phindid, $freq);
1235
1236 open(IN, "<$infile");
1237
1238 while(<IN>) {
1239
1240 chomp;
1241 @fields = split(/:/, $_);
1242
1243 # phindid and symbols for this line
1244 $phindid = shift @fields;
1245 $symbols = shift @fields;
1246 $freq = shift @fields;
1247
1248 # do we have a thesaurus id corresponding to this phrase?
1249 if (defined($symbolstothesid{$symbols})) {
1250 $thesid = $symbolstothesid{$symbols};
1251 $thesaurustophindid{$thesid} = $phindid;
1252 $phindidtofrequency{$phindid} = $freq;
1253 }
1254 }
1255 close IN;
1256
1257 undef %symbolstothesid;
1258
1259 # 3.
1260 # Create phind-id numbers for remaining thesaurus entries,
1261 # and note that their frequency is 0 for later sorting.
1262 my $nextphindid = $phindid + 1;
1263
1264 open(TH, "<$thesaurusfile");
1265 while(<TH>) {
1266
1267 chomp;
1268 @fields = split(/:/, $_);
1269
1270 # read thesaurus-id and ensure it has a corresponding phind-id
1271 $thesid = shift @fields;
1272 if (!defined($thesaurustophindid{$thesid})) {
1273 $thesaurustophindid{$thesid} = $nextphindid;
1274 $phindidtofrequency{$nextphindid} = 0;
1275 $nextphindid++;
1276 }
1277 }
1278 close TH;
1279
1280 # 4.
1281 # Translate thesaurus file, replacing thesaurus-id numbers with
1282 # phind-id numbers.
1283 my $newthesaurusfile = &FileUtils::filenameConcatenate($phinddir, "$thesaurus.phindid");
1284 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1285 my (@links, $linkid, %linkidtotype, $newrelation);
1286
1287 open(TH, "<$thesaurusfile");
1288 open(TO, ">$newthesaurusfile");
1289 while(<TH>) {
1290
1291 chomp;
1292 @fields = split(/:/, $_);
1293
1294 # phindid and symbols for this line
1295 ($thesid, $symbols, $relations) = @fields;
1296
1297 die unless ($thesid && $symbols);
1298 die unless $thesaurustophindid{$thesid};
1299 $phindid = $thesaurustophindid{$thesid};
1300
1301 # convert each part of the relation string to use phind-id numbers
1302 # at the same time, we want to sort the list by frequency.
1303 undef %linkidtotype;
1304
1305 foreach $linktext (split(/;/, $relations)) {
1306 @linkdata = split(/,/, $linktext);
1307
1308 # remember the linktype (e.g. BT, NT)
1309 $linktype = shift @linkdata;
1310
1311 # store the type of each link
1312 foreach $thesid (@linkdata) {
1313 die unless (defined($thesaurustophindid{$thesid}));
1314 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1315 }
1316 }
1317
1318 # sort the list of links, first by frequency, then by type.
1319 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1320 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1321 $linkcounter = (scalar @links);
1322
1323 # create a string describing the link information
1324 $linktype = $linkidtotype{$links[0]};
1325 $newrelation = $linktype;
1326 foreach $linkid (@links) {
1327 if ($linkidtotype{$linkid} ne $linktype) {
1328 $linktype = $linkidtotype{$linkid};
1329 $newrelation .= ";" . $linktype;
1330 }
1331 $newrelation .= "," . $linkid;
1332 }
1333 $newrelation .= ";";
1334
1335
1336 # output the new line
1337 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1338 }
1339 close TH;
1340 close TO;
1341
1342 undef %thesaurustophindid;
1343 undef %linkidtotype;
1344 undef %phindidtofrequency;
1345
1346 # 5.
1347 # Read thesaurus data (in phind-id format) into memory
1348 my %thesaurusdata;
1349
1350 open(TH, "<$newthesaurusfile");
1351 while(<TH>) {
1352 chomp;
1353 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1354 die unless ($phindid && $symbols);
1355 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1356 }
1357 close TH;
1358
1359 # 6.
1360 # Add thesaurus data to phrases file
1361 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1362 my (@documwents, @newexp, $k, $n);
1363 my $linenumber = 0;
1364
1365 open(IN, "<$infile");
1366 open(OUT, ">$outfile");
1367
1368 # Update existing phrases
1369 while(<IN>) {
1370
1371 chomp;
1372 @fields = split(/:/, $_);
1373
1374 # get data for this line
1375 $phindid = shift @fields;
1376
1377 # output the phrase data, with thesaurus information
1378 print OUT "$phindid:", join(":", @fields);
1379
1380 # add thesaurus data
1381 if (defined($thesaurusdata{$phindid})) {
1382 @fields = split(/:/, $thesaurusdata{$phindid});
1383 shift @fields;
1384 $linkcounter = shift @fields;
1385 $relations = shift @fields;
1386
1387 print OUT ":$linkcounter:$relations";
1388 $thesaurusdata{$phindid} = "";
1389 }
1390 print OUT "\n";
1391 }
1392 close IN;
1393
1394 # Add phrases that aren't already in the file
1395 foreach $phindid (sort numerically keys %thesaurusdata) {
1396 next unless ($thesaurusdata{$phindid});
1397
1398 @fields = split(/:/, $thesaurusdata{$phindid});
1399 $symbols = shift @fields;
1400 $linkcounter = shift @fields;
1401 $relations = shift @fields;
1402
1403 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1404 }
1405 close OUT;
1406
1407}
1408
1409# restore_vocabulary_data
1410#
1411# Read phrases.3 and restore vocabulary information. Then write
1412# this data to the MGPP input files (pword.txt and pdata.txt) and
1413# (if requested) to the saved phrases file.
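#
# Broadly, for each phrase pdata.txt receives a "<Document>" marker followed by
# the phrase record with its sNNN symbol numbers replaced by vocabulary words,
# while pword.txt receives "<Document>" followed by the phrase text when the
# phrase is a single word, and an empty entry for multi-word phrases.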
1414
1415sub restore_vocabulary_data {
1416 my ($self) = @_;
1417
1418 my $out = $self->{'outhandle'};
1419 my $verbosity = $self->{'verbosity'};
1420 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1421
1422 my $phinddir = $self->{'phinddir'};
1423 my $infile = &FileUtils::filenameConcatenate($phinddir, 'phrases.3');
1424 my $vocabfile = &FileUtils::filenameConcatenate($phinddir, 'clauses.vocab');
1425 my $datafile = &FileUtils::filenameConcatenate($phinddir, 'pdata.txt');
1426 my $wordfile = &FileUtils::filenameConcatenate($phinddir, 'pword.txt');
1427
1428 my $savephrases = $self->{'savephrases'};
1429
1430 # 1.
1431 # Read the vocabulary file
1432 open(V, "<$vocabfile")
1433 || die "Cannot open $vocabfile: $!";
1434 my @symbol;
1435 my $i = 1;
1436 while(<V>) {
1437 chomp;
1438 $symbol[$i++] = $_;
1439 }
1440 close V;
1441
1442 # 2.
1443 # Translate phrases.3 to MGPP input files
1444 my ($key, $text, $word, $isThesaurus, $line);
1445 my @fields;
1446 my $linenumber = 0;
1447
1448 open(IN, "<$infile");
1449 open(DATA, ">$datafile");
1450 open(WORD, ">$wordfile");
1451
1452 # Save the phrases in a separate text file
1453 if ($savephrases) {
1454 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1455 open(SAVE, ">$savephrases");
1456 }
1457
1458 while(<IN>) {
1459
1460 # read the line
1461 chomp;
1462 $line = $_;
1463 @fields = split(/:/, $line);
1464
1465 # get a phrase number for this line
1466 $key = shift @fields;
1467
1468 # restore the text of the phrase
1469 $text = shift @fields;
1470 $text =~ s/s(\d+)/$symbol[$1]/g;
1471 if ($text =~ / /) {
1472 $word = "";
1473 } elsif ($text ne 'untranslated') {
1474 $word = $text;
1475 }
1476
1477 # output the phrase data
1478 print DATA "<Document>";
1479 print DATA "$key:$text:", join(":", @fields), ":\n";
1480
1481 # output the word index search data
1482 print WORD "<Document>$word\n";
1483
1484 # output the phrases to a text file
1485 if ($savephrases) {
1486 if ((scalar @fields) == 7) {
1487 $isThesaurus = 1;
1488 } else {
1489 $isThesaurus = 0;
1490 }
1491 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1492 }
1493 }
1494 close IN;
1495 close WORD;
1496 close DATA;
1497 close SAVE if ($savephrases);
1498
1499}
1500
1501
1502
1503# sort routines used to renumber phrases
1504
1505sub numerically { $a <=> $b }
1506
1507sub by_doc_frequency {
1508 my $fa = 1;
1509 if ($a =~ /,/) {
1510 $fa = $a;
1511 $fa =~ s/\d+,//;
1512 }
1513 my $fb = 1;
1514 if ($b =~ /,/) {
1515 $fb = $b;
1516 $fb =~ s/\d+,//;
1517 }
1518
1519 return ($fb <=> $fa);
1520}
1521
15221;