source: main/trunk/greenstone2/perllib/classify/Phind.pm@ 23116

Last change on this file since 23116 was 23116, checked in by kjdon, 14 years ago

for incremental build, classifiers are not really done incrementally. Previously, we reconstructed all the docs from the database, and classified them, then processed any new/edited/deleted docs, updating the classifier as necessary. Now, we process all new/updated docs, then reconstruct the docs from the database, but only classify those not changed/deleted. This means that we are only ever adding docs to a classifier, never updating or deleting. I have removed edit_mode and all code handling deleting stuff from the classifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 40.9 KB
RevLine 
[3472]1###########################################################################
2#
3# Phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The Phind clasifier plugin.
30# Type "classinfo.pl Phind" at the command line for a summary.
31
32package Phind;
33
[17209]34use BaseClassifier;
[3472]35use util;
36use ghtml;
37use unicode;
38
[10253]39use strict;
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
[3472]42my @removedirs = ();
43
44my %wanted_index_files = ('td'=>1,
45 't'=>1,
46 'ti'=>1,
47 'tl'=>1,
48 'tsd'=>1,
49 'idb'=>1,
50 'ib1'=>1,
51 'ib2'=>1,
52 'ib3'=>1,
53 'i'=>1,
54 'il'=>1,
55 'w'=>1,
56 'wa'=>1);
57
58sub BEGIN {
[17209]59 @Phind::ISA = ('BaseClassifier');
[3472]60}
61
62sub END {
63
64 # Tidy up stray files - we do this here as there's some weird problem
65 # preventing us from doing it in the get_classify_info() function (on
66 # windows at least) where the close() appears to fail on txthandle and
67 # dochandle, thus preventing us from deleting those files
68
69 foreach my $dir (@removedirs) {
70 if (-d $dir && opendir (DIR, $dir)) {
71 my @files = readdir DIR;
72 closedir DIR;
73
[10253]74 foreach my $file (@files) {
[3472]75 next if $file =~ /^\.\.?$/;
76 my ($suffix) = $file =~ /\.([^\.]+)$/;
77 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
78 # delete it!
79 &util::rm (&util::filename_cat ($dir, $file));
80 }
81 }
82 }
83 }
84}
85
[4759]86my $arguments =
87 [ { 'name' => "text",
[4873]88 'desc' => "{Phind.text}",
[3540]89 'type' => "string",
[4759]90 'deft' => "section:Title,section:text",
91 'reqd' => "no" },
92 { 'name' => "title",
[4873]93 'desc' => "{Phind.title}",
[3540]94 'type' => "metadata",
[4759]95 'deft' => "Title",
96 'reqd' => "no" },
[6968]97 { 'name' => "buttonname",
98 'desc' => "{BasClas.buttonname}",
[3540]99 'type' => "string",
[4759]100 'deft' => "Phrase",
101 'reqd' => "no" },
102 { 'name' => "language",
[4873]103 'desc' => "{Phind.language}",
[6989]104 'type' => "string",
[4759]105 'deft' => "en",
106 'reqd' => "no" },
107 { 'name' => "savephrases",
[4873]108 'desc' => "{Phind.savephrases}",
[3540]109 'type' => "string",
[4759]110 'deft' => "",
111 'reqd' => "no" },
112 { 'name' => "suffixmode",
[4873]113 'desc' => "{Phind.suffixmode}",
[3540]114 'type' => "int",
[4759]115 'deft' => "1",
[8362]116 'range' => "0,1",
[4759]117 'reqd' => "no" },
[8362]118 { 'name' => "min_occurs",
119 'desc' => "{Phind.min_occurs}",
120 'type' => "int",
121 'deft' => "2",
122 'range' => "1,",
123 'reqd' => "no" },
[4759]124 { 'name' => "thesaurus",
[4873]125 'desc' => "{Phind.thesaurus}",
[3540]126 'type' => "string",
[4759]127 'deft' => "",
128 'reqd' => "no" },
129 { 'name' => "untidy",
[4873]130 'desc' => "{Phind.untidy}",
[3540]131 'type' => "flag",
[4759]132 'reqd' => "no" } ];
[3540]133
[4759]134my $options = { 'name' => "Phind",
[5645]135 'desc' => "{Phind.desc}",
[6408]136 'abstract' => "no",
137 'inherits' => "yes",
[4759]138 'args' => $arguments };
[3540]139
[3472]140
141# Phrase delimiter symbols - these should be abstracted out someplace
142
143my $colstart = "COLLECTIONSTART";
144my $colend = "COLLECTIONEND";
145my $doclimit = "DOCUMENTLIMIT";
146my $senlimit = "SENTENCELIMIT";
147my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
148
149
150# Create a new Phind browser based on collect.cfg
151
152sub new {
[10218]153 my ($class) = shift (@_);
154 my ($classifierslist,$inputargs,$hashArgOptLists) = @_;
155 push(@$classifierslist, $class);
[3472]156
[17209]157 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
158 push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]159
[17209]160 my $self = new BaseClassifier($classifierslist, $inputargs, $hashArgOptLists);
[6968]161
[10253]162 if ($self->{'info_only'}) {
163 # don't worry about any options etc
164 return bless $self, $class;
165 }
166
[3472]167 # Ensure the Phind generate scripts are in place
168 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
169 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
170 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
171 if (!(-e $file1)) {
172 print STDERR "Phind.pm: ERROR: The Phind \"suffix\" program is not installed.\n\n";
173 exit(1);
174 }
175
[20454]176 # things that may have ex. in them that need to be stripped off
177 $self->{'text'} = $self->strip_ex_from_metadata($self->{'text'});
178 $self->{'title'} = $self->strip_ex_from_metadata($self->{'title'});
179
[10218]180 # Transfer value from Auto Parsing to the variable name that used in previous GreenStone.
[20454]181
[10218]182 $self->{"indexes"} = $self->{"text"};
[3472]183
[10218]184 # Further setup
185 $self->{'collection'} = $ENV{'GSDLCOLLECTION'}; # classifier information
186 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'}; # collection directories
[6982]187 if (! defined $self->{'builddir'}) {
188 $self->{'builddir'} = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
189 }
[10218]190 $self->{'total'} = 0;
191
[22382]192 # we set phind to be rtl if language is arabic
193 if ($self->{'language'} eq "ar") {
194 $self->{'textorientation'} = "rtl";
195 }
[10218]196 # Clean out the unused keys
197 delete $self->{"text"};
[6982]198
[3472]199 return bless $self, $class;
200}
201
202
203# Initialise the Phind classifier
204
205sub init {
206 my $self = shift (@_);
207
208 # ensure we have a build directory
209 my $builddir = $self->{'builddir'};
210 die unless (-e "$builddir");
211
212 # create Phind directory
213 my $phnumber = 1;
214 my $phinddir = &util::filename_cat($builddir, "phind1");
215 while (-e "$phinddir") {
216 $phnumber++;
217 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
218 }
219 &util::mk_dir("$phinddir");
220 $self->{'phinddir'} = $phinddir;
221 $self->{'phindnumber'} = $phnumber;
222
223 push(@removedirs, $phinddir) unless $self->{'untidy'};
224
225 # open filehandles for documents and text
226 my $clausefile = &util::filename_cat("$phinddir", "clauses");
227 &util::rm($clausefile) if (-e $clausefile);
228
229 my $txthandle = 'TEXT' . $phnumber;
230 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
231 $self->{'txthandle'} = $txthandle;
232
233 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
234 &util::rm($docfile) if (-e $docfile);
235
236 my $dochandle = 'DOC' . $phnumber;
237 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
238 $self->{'dochandle'} = $dochandle;
239
240}
241
242
243# Classify each document.
244#
245# Each document is passed here in turn. The classifier extracts the
246# text of each and stores it in the clauses file. Document details are
247# stored in the docs.txt file.
248
249sub classify {
250 my $self = shift (@_);
[23116]251 my ($doc_obj) = @_;
[3472]252
253 my $verbosity = $self->{'verbosity'};
254 my $top_section = $doc_obj->get_top_section();
255
[10218]256 my $titlefield = $self->{'title'};
[3472]257
258 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
[3536]259 if (!defined($title)) {
260 $title = "";
261 print STDERR "Phind: document has no title\n";
262 }
[3472]263 print "process: $title\n" if ($verbosity > 2);
264
265 # Only consider the file if it is in the correct language
266 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
[10218]267 my $phrlanguage = $self->{'language'};
[3472]268 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
[18455]269
[3472]270 # record this file
271 $self->{'total'} ++;
[10253]272 # what is $file ???
273 # print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
[3472]274
275 # Store document details
276 my $OID = $doc_obj->get_OID();
277 $OID = "NULL" unless defined $OID;
278 my $dochandle = $self->{'dochandle'};
279 print $dochandle "<Document>\t$OID\t$title\n";
280
281 # Store the text occuring in this object
282
283 # output the document delimiter
284 my $txthandle = $self->{'txthandle'};
285 print $txthandle "$doclimit\n";
286
[8362]287 # iterate over the required indexes and store their text
[3472]288 my $indexes = $self->{'indexes'};
289 my $text = "";
290 my ($part, $level, $field, $section, $data, $dataref);
291
292 foreach $part (split(/,/, $indexes)) {
293
294 # Each field has a level and a data element ((e.g. document:Title)
295 ($level, $field) = split(/:/, $part);
296 die unless ($level && $field);
297
298 # Extract the text from every section
[3540]299 # (In phind, document:text and section:text are equivalent)
[3472]300 if ($field eq "text") {
301 $data = "";
302 $section = $doc_obj->get_top_section();
303 while (defined($section)) {
304 $data .= $doc_obj->get_text($section) . "\n";
305 $section = $doc_obj->get_next_section($section);
306 }
307 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
308 }
309
310 # Extract a metadata field from a document
311 # (If there is more than one element of the given type, get them all.)
312 elsif ($level eq "document") {
313 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
314 foreach $data (@$dataref) {
315 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
316 }
317 }
318
319 # Extract metadata from every section in a document
320 elsif ($level eq "section") {
321 $data = "";
322 $section = $doc_obj->get_top_section();
323 while (defined($section)) {
324 $dataref = $doc_obj->get_metadata($section, $field);
325 $data .= join("\n", @$dataref) . "\n";
326 $section = $doc_obj->get_next_section($section);
327 }
328 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
329 }
330
331 # Some sort of specification which I don't understand
332 else {
333 die "Unknown level ($level) in Phind index ($part)\n";
334 }
335
336 }
337
338 # output the text
339 $text =~ tr/\n//s;
340 print $txthandle "$text";
341}
342
343
344# Construct the classifier from the information already gathered
345#
346# When get_classify_info is called, the clauses and docs.txt files have
[3540]347# already been constructed in the Phind directory. This function will
[3472]348# translate them into compressed, indexed MGPP files that can be read by
349# the phindcgi script. It will also register our classifier so that it
350# shows up in the navigation bar.
351
352sub get_classify_info {
353 my $self = shift (@_);
[6408]354 my ($gli) = @_;
[3472]355
356 close $self->{'dochandle'};
357 close $self->{'txthandle'};
358 my $verbosity = $self->{'verbosity'};
359 my $out = $self->{'outhandle'};
360 my $phinddir = $self->{'phinddir'};
361
362 my $osextra = "";
363 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
364 $osextra = " -d /";
365 }
366
[6408]367 print STDERR "</Stage>\n" if $gli;
368
[3472]369 if ($verbosity) {
370 print $out "\n*** Phind.pm generating indexes for ", $self->{'indexes'}, "\n";
371 print $out "*** in ", $self->{'phinddir'}, "\n";
372 }
373
[6408]374 print STDERR "<Stage name='Phind'>\n" if $gli;
375
[3540]376 # Construct phind indexes
[3472]377 my $suffixmode = $self->{'suffixmode'};
[8362]378 my $min_occurs = $self->{'min_occurs'};
[3472]379 my ($command, $status);
380
381 # Generate the vocabulary, symbol statistics, and numbers file
382 # from the clauses file
383 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
[6408]384 print STDERR "<Phase name='ExtractingVocab'/>\n" if $gli;
[3472]385 &extract_vocabulary($self);
[3540]386
[3472]387 # Use the suffix program to generate the phind/phrases file
388 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
[6408]389 print STDERR "<Phase name='ExtractingPhrase'/>\n" if $gli;
[8362]390 &execute("suffix \"$phinddir\" $suffixmode $min_occurs $verbosity", $verbosity, $out);
[3472]391
392 # check that we generated some files. It's not necessarily an error if
393 # we didn't (execute() would have quit on error), but we can't go on.
394 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
395 if (! -r $phrasesfile) {
[6408]396 print STDERR "<Warning name='NoPhrasesFound'/>\n" if $gli;
[3472]397 print $out "\nNo phrases found for Phind classifier!\n";
398 return;
[3540]399 }
[3472]400
401 # Create the phrase file and put phrase numbers in phind/phrases
402 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
[6408]403 print STDERR "<Phase name='SortAndRenumber'/>\n" if $gli;
[3472]404 &renumber_phrases($self);
405
406 print $out "\nCreating phrase databases\n";
[6408]407 print STDERR "<Phase name='PhraseDatabases'/>\n" if $gli;
[3472]408 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
409 my $mg_stem = &util::filename_cat($phinddir, "pdata");
410
411 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
412 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
413 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
414
415 # create the mg index of words
416 print $out "\nCreating word-level search indexes\n";
[6408]417 print STDERR "<Phase name='WordLevelIndexes'/>\n" if $gli;
[3472]418 $mg_input = &util::filename_cat($phinddir, "pword.txt");
419 $mg_stem = &util::filename_cat($phinddir, "pword");
420
421 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
422 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
423 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
424 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
425 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
426 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
427
428 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
429 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
430 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
431
432 # create the mg document information database
433 print $out "\nCreating document information databases\n";
[6408]434 print STDERR "<Phase name='DocInfoDatabases'/>\n" if $gli;
[3472]435 $mg_input = &util::filename_cat($phinddir, "docs.txt");
436 $mg_stem = &util::filename_cat($phinddir, "docs");
437
438 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
439 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
440 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
441
[22382]442 my $parameters = "phindnumber=$self->{'phindnumber'}";
443 if (defined ($self->{'textorientation'})) {
444 $parameters .= ";textorientation=$self->{'textorientation'}";
445 }
[3472]446 # Return the information about the classifier that we'll later want to
447 # use to create macros when the Phind classifier document is displayed.
448 my %classifyinfo = ('thistype'=>'Invisible',
449 'childtype'=>'Phind',
450 'Title'=>$self->{'buttonname'},
[22382]451 'parameters'=>$parameters,
[3472]452 'contains'=>[]);
453
454 my $collection = $self->{'collection'};
455 my $url = "library?a=p&p=phind&c=$collection";
456 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
457
458 return \%classifyinfo;
459}
460
461
462
463sub convert_gml_to_tokens {
464
465 my ($language_exp, $text) = @_;
466
467 # escape any magic words... - jrm21
468 foreach my $delim (@delimiters) {
469 my $replacement=lc($delim);
470 my $num= $text=~ s/$delim/$replacement/g;
471 if (!$num) {$num=0;}
472 }
473
[21876]474 if ($language_exp =~ /^en$/) {
[3472]475 return &convert_gml_to_tokens_EN($text);
476 }
[3540]477
[3472]478 if ($language_exp =~ /zh/) {
479 return &convert_gml_to_tokens_ZH($text);
[3540]480 }
[3472]481
482 $_ = $text;
483
484 # 1. remove GML tags
485
486 # Remove everything that is in a tag
487 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
488 s/\s*<br>\s*/ LINEBREAK /isgo;
489 s/<[^>]*>/ /sgo;
490
491 # Now we have the text, but it may contain HTML
492 # elements coded as &gt; etc. Remove these tags.
493 s/&amp;/&/sgo;
494 s/&lt;/</sgo;
495 s/&gt;/>/sgo;
496 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
497 s/\s*<br>\s*/ LINEBREAK /isgo;
498 s/<[^>]*>/ /sgo;
499
500 # replace<p> and <br> placeholders with clause break symbol (\n)
501 s/\s+/ /gso;
502 s/PARAGRAPHBREAK/\n/sgo;
503 s/LINEBREAK/\n/sgo;
504
505
506 # 2. Split the remaining text into space-delimited tokens
507
[3540]508 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
509 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
[3472]510
511 # Split text at word boundaries
512 s/\b/ /go;
[21876]513
[3472]514 # 3. Convert the remaining text to "clause format"
515
516 # Insert newline if the end of a sentence is detected
517 # (delimter is: "[\.\?\!]\s")
518 # s/\s*[\.\?\!]\s+/\n/go;
519
520 # remove unnecessary punctuation and replace with clause break symbol (\n)
[21876]521 # the following very nicely removes all non alphanumeric characters. too bad if you are not using english...
522 #s/[^\w ]/\n/go;
523 # replace punct with new lines - is this what we want??
524 s/\s*[\?\;\:\!\,\.\"\[\]\{\}\(\)]\s*/\n/go; #"
525 # then remove other punct with space
526 s/[\'\`\\\_]/ /go;
[3472]527
528 # remove extraneous whitespace
529 s/ +/ /sgo;
530 s/^\s+//mgo;
531 s/\s*$/\n/mgo;
532
533 # remove lines that contain one word or less
534 s/^\S*$//mgo;
535 s/^\s*$//mgo;
536 tr/\n//s;
537
538 return $_;
539}
540
541# a chinese version
542sub convert_gml_to_tokens_ZH {
543
544 $_ = shift @_;
545
546 # Replace all whitespace with a simple space
547 s/\s+/ /gs;
548 # Remove everything that is in a tag
549 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
550 s/\s*<br>\s*/ LINEBREAK /isg;
551 s/<[^>]*>/ /sg;
552
553 # Now we have the text, but it may contain HTML
554 # elements coded as &gt; etc. Remove these tags.
555 s/&lt;/</sg;
556 s/&gt;/>/sg;
557
558 s/\s+/ /sg;
559 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
560 s/\s*<br>\s*/ LINEBREAK /isg;
561 s/<[^>]*>/ /sg;
562
563 # remove &amp; and other miscellaneous markup tags
564 s/&amp;/&/sg;
565 s/&lt;/</sg;
566 s/&gt;/>/sg;
567 s/&amp;/&/sg;
568
569 # replace<p> and <br> placeholders with carriage returns
570 s/PARAGRAPHBREAK/\n/sg;
571 s/LINEBREAK/\n/sg;
572
573
574# print STDERR "text:$_\n";
575 return $_;
576}
[3540]577
[3472]578# A version of convert_gml_to_tokens that is fine-tuned to the English language.
579
580sub convert_gml_to_tokens_EN {
581 $_ = shift @_;
582
583 # FIRST, remove GML tags
584
585 # Replace all whitespace with a simple space
586 s/\s+/ /gs;
587
588 # Remove everything that is in a tag
589 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
590 s/\s*<br>\s*/ LINEBREAK /isg;
591 s/<[^>]*>/ /sg;
592
593 # Now we have the text, but it may contain HTML
594 # elements coded as &gt; etc. Remove these tags.
595 s/&lt;/</sg;
596 s/&gt;/>/sg;
597
598 s/\s+/ /sg;
599 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
600 s/\s*<br>\s*/ LINEBREAK /isg;
601 s/<[^>]*>/ /sg;
602
603 # remove &amp; and other miscellaneous markup tags
604 s/&amp;/&/sg;
605 s/&lt;/</sg;
606 s/&gt;/>/sg;
607 s/&amp;/&/sg;
608
609 # replace<p> and <br> placeholders with carriage returns
610 s/PARAGRAPHBREAK/\n/sg;
611 s/LINEBREAK/\n/sg;
612
613
614 # Exceptional punctuation
615 #
616 # We make special cases of some punctuation
617
618 # remove any apostrophe that indicates omitted letters
619 s/(\w+)\'(\w*\s)/ $1$2 /g;
620
621 # remove period that appears in a person's initals
622 s/\s([A-Z])\./ $1 /g;
623
624 # replace hyphens in hypheanted words and names with a space
625 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
626
627 # Convert the remaining text to "clause format",
628 # This means removing all excess punctuation and garbage text,
629 # normalising valid punctuation to fullstops and commas,
630 # then putting one cluse on each line.
631
632 # Insert newline when the end of a sentence is detected
633 # (delimter is: "[\.\?\!]\s")
634 s/\s*[\.\?\!]\s+/\n/g;
635
636 # split numbers after four digits
637 s/(\d\d\d\d)/$1 /g;
638
639 # split words after 32 characters
640
641 # squash repeated punctuation
642 tr/A-Za-z0-9 //cs;
643
644 # save email addresses
645 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
646
647 # normalise clause breaks (mostly punctuation symbols) to commas
648 s/[^A-Za-z0-9 \n]+/ , /g;
649
650 # Remove repeated commas, and replace with newline
651 s/\s*,[, ]+/\n/g;
652
653 # remove extra whitespace
654 s/ +/ /sg;
655 s/^\s+//mg;
656 s/\s*$/\n/mg;
657
658 # remove lines that contain one word or less
659 s/^\w*$//mg;
660 s/^\s*$//mg;
661 tr/\n//s;
662
663 return $_;
664
665}
666
667
668
669# Execute a system command
670
671sub execute {
672 my ($command, $verbosity, $outhandle) = @_;
673 print $outhandle "Executing: $command\n" if ($verbosity > 2);
674 $! = 0;
675 my $status = system($command);
676 if ($status != 0) {
677 print STDERR "Phind - Error executing '$command': $!\n";
[3540]678 exit($status); # this causes the build to fail...
[3472]679 }
680}
681
682
683# Generate the vocabulary, symbol statistics, and numbers file from the
684# clauses file. This is legacy code, so is a bit messy and probably wont
685# run under windows.
686
687sub extract_vocabulary {
688 my ($self) = @_;
689
690 my $verbosity = $self->{'verbosity'};
691 my $out = $self->{'outhandle'};
692
693 my $collectiondir = $self->{'collectiondir'};
694 my $phinddir = $self->{'phinddir'};
695
[10218]696 my $language_exp = $self->{'language'};
[3472]697
698 my ($w, $l, $line, $word);
699
700 my ($first_delimiter, $last_delimiter,
701 $first_stopword, $last_stopword,
702 $first_extractword, $last_extractword,
703 $first_contentword, $last_contentword,
704 $phrasedelimiter);
705
706 my $thesaurus = $self->{'thesaurus'};
707 my ($thesaurus_links, $thesaurus_terms,
708 %thesaurus, $first_thesaurusword, $last_thesaurusword);
709
710 my %symbol;
711 my (%freq);
712
713 print $out "Calculating vocabulary\n" if ($verbosity > 1);
714
715 # Read and store the stopwords
716 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
717 my $stopword_files = ();
718 my ($language, $language_dir, $file, $file_name);
719 my %stopwords;
720
721 # Examine each directory in the stopword directory
722 opendir(STOPDIR, $stopdir);
723 foreach $language (readdir STOPDIR) {
724
725 # Ignore entries that do not match the classifier's language
726 next unless ($language =~ /$language_exp/);
727 $language_dir = &util::filename_cat($stopdir, $language);
728 next unless (-d "$language_dir");
729
730 opendir(LANGDIR, $language_dir);
731 foreach $file (readdir LANGDIR) {
732
733 # Ignore entries that are not stopword files
734 next unless ($file =~ /sw$/);
735 $file_name = &util::filename_cat($language_dir, $file);
736 next unless (-f "$file_name");
737
738 # Read the stopwords
739 open(STOPFILE, "<$file_name");
740 while (<STOPFILE>) {
741 s/^\s+//;
742 s/\s.*//;
743 $word = $_;
744 $l = lc($word);
745 $stopwords{$l} = $word;
746 }
747 close STOPFILE;
748
749 }
750 closedir LANGDIR;
751 }
752 closedir STOPDIR;
753
754 # Read thesaurus information
755 if ($thesaurus) {
756
757 # link file exists
758 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
759 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
760
761 # ensure term file exists in the correct language
762 if ($language_exp =~ /^([a-z][a-z])/) {
763 $language = $1;
764 } else {
765 $language = 'en';
766 }
767 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
768 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
769
770
771 # Read the thesaurus terms
772 open(TH, "<$thesaurus_terms");
773 while(<TH>) {
774 s/^\d+ //;
775 s/\(.*\)//;
776 foreach $w (split(/\s+/, $_)) {
777 $thesaurus{lc($w)} = $w;
778 }
779 }
780 close TH;
781 }
782
783 # Read words in the text and count occurences
784 open(TXT, "<$phinddir/clauses");
785
786 my @words;
787 while(<TXT>) {
788 $line = $_;
789 next unless ($line =~ /./);
790
791 @words = split(/\s+/, $line);
792 foreach $w (@words) {
793 $l = lc($w);
794 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
795 $freq{$w}++;
796 }
797 $freq{$senlimit}++;
798 }
799
800 close TXT;
801
802 # Calculate the "best" form of each word
803 my (%bestform, %totalfreq, %bestfreq);
804
805 foreach $w (sort (keys %freq)) {
806 $l = lc($w);
807
808 # totalfreq is the number of times a term appears in any form
809 $totalfreq{$l} += $freq{$w};
810
811 if (defined $stopwords{$l}) {
812 $bestform{$l} = $stopwords{$l};
813
814 } elsif (defined $thesaurus{$l}) {
815 $bestform{$l} = $thesaurus{$l};
816
817 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
818 $bestfreq{$l} = $freq{$w};
819 $bestform{$l} = $w;
820 }
821 }
822 undef %freq;
823 undef %bestfreq;
824
825
826 # Assign symbol numbers to tokens
827 my $nextsymbol = 1;
828 my (@vocab);
829
830 # Delimiters
831 $first_delimiter = 1;
832
833 foreach $word (@delimiters) {
834
835# $word = lc($word); # jrm21
836 $word = uc($word);
837 $bestform{$word} = $word;
838 $vocab[$nextsymbol] = $word;
839 $symbol{$word} = $nextsymbol;
840 $nextsymbol++;
841 }
842 $last_delimiter = $nextsymbol - 1;
843 # Stopwords
844 $first_stopword = $nextsymbol;
845
846 foreach my $word (sort keys %stopwords) {
847 # don't include stopword unless it occurs in the text
848 $word = lc($word);
849 next unless ($totalfreq{$word});
850 next if ($symbol{$word});
851
852 $vocab[$nextsymbol] = $word;
853 $symbol{$word} = $nextsymbol;
854 $nextsymbol++;
855 }
856 $last_stopword = $nextsymbol - 1;
857 $first_contentword = $nextsymbol;
858
859 # Thesaurus terms
860 if ($thesaurus) {
861 $first_thesaurusword = $nextsymbol;
862
863 foreach my $word (sort keys %thesaurus) {
864
865 $word = lc($word);
866 next if ($symbol{$word});
867 $bestform{$word} = $thesaurus{$word};
868
869 $vocab[$nextsymbol] = $word;
870 $symbol{$word} = $nextsymbol;
871 $nextsymbol++;
872
873 }
874 $last_thesaurusword = $nextsymbol - 1;
875 }
876
877 # Other content words
878 $first_extractword = $nextsymbol;
879
880 foreach my $word (sort (keys %bestform)) {
881
882 next if ($symbol{$word});
883
884 $vocab[$nextsymbol] = $word;
885 $symbol{$word} = $nextsymbol;
886 $nextsymbol++;
887 }
888 $last_extractword = $nextsymbol - 1;
889 $last_contentword = $nextsymbol - 1;
890
891 # Outut the words
892 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
893 open(VOC, ">$phinddir/clauses.vocab");
894
895 for (my $i = 1; $i < $nextsymbol; $i++) {
896 $w = $vocab[$i];
897
898 print VOC "$bestform{$w}\n";
899 $totalfreq{$w} = 0 unless ($totalfreq{$w});
900 }
901 close VOC;
902
903
904 # Create statistics file
905 # Output statistics about the vocablary
906 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
907 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
908
909 open(STAT, ">$phinddir/clauses.stats")
910 || die "Cannot open $phinddir/clauses.stats: $!";
911
912 print STAT "first_delimiter $first_delimiter\n";
913 print STAT "last_delimiter $last_delimiter\n";
914 print STAT "first_stopword $first_stopword\n";
915 print STAT "last_stopword $last_stopword\n";
916 if ($thesaurus) {
917 print STAT "first_thesaurusword $first_thesaurusword\n";
918 print STAT "last_thesaurusword $last_thesaurusword\n";
919 }
920 print STAT "first_extractword $first_extractword\n";
921 print STAT "last_extractword $last_extractword\n";
922 print STAT "first_contentword $first_contentword\n";
923 print STAT "last_contentword $last_contentword\n";
924 print STAT "first_symbol $first_delimiter\n";
925 print STAT "last_symbol $last_contentword\n";
926 print STAT "first_word $first_stopword\n";
927 print STAT "last_word $last_contentword\n";
928 close STAT;
929
930 undef @vocab;
931
932
933 # Create numbers file
934 # Save text as symbol numbers
935 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
936
937 open(TXT, "<$phinddir/clauses");
938 open(NUM, ">$phinddir/clauses.numbers");
939
940## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
941## print NUM "$symbol{lc($colstart)}\n"; # jrm21
942 $phrasedelimiter = $symbol{$senlimit};
943 print NUM "$symbol{$colstart}\n";
944
945 # set up the special symbols that delimit documents and sentences
946 while(<TXT>) {
947
948 # split sentence into a list of tokens
949 $line = $_;
950 next unless ($line =~ /./);
951 @words = split(/\s+/, $line);
952
953 # output one token at a time
954 foreach $word (@words) {
955# don't lower-case special delimiters - jrm21
956 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
957 $word = lc($word);
958 }
959 print NUM "$symbol{$word}\n";
960 }
961
962 # output phrase delimiter
963 print NUM "$phrasedelimiter\n";
964 }
965
966 close TXT;
967# print NUM "$symbol{lc($colend)}\n";# jrm21
968 print NUM "$symbol{$colend}\n";
969 close NUM;
970
971 # Save thesaurus data in one convienient file
972 if ($thesaurus) {
973
974 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
975
976
977 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
978 if ($verbosity > 1);
979
980 # Read the thesaurus terms
981 my ($num, $text, %thes_symbols);
982
983 open(TH, "<$thesaurus_terms");
984 while(<TH>) {
985 chomp;
986 @words = split(/\s+/, $_);
987 $num = shift @words;
988 $text = "";
989
990 # translate words into symbol numbers
991 foreach $word (@words) {
992 $word = lc($word);
993 if ($symbol{$word}) {
994 $text .= "s$symbol{$word} ";
995 } elsif ($verbosity) {
996 print $out "Phind: No thesaurus symbol, ignoring \"$word\"\n";
997 }
998 }
999 $text =~ s/ $//;
1000 $thes_symbols{$num} = $text;
1001 }
1002 close TH;
1003
1004 # Read the thesaurus links and write the corresponding data
1005 open(TH, "<$thesaurus_links");
1006 open(THOUT, ">$thesaurusfile");
1007
1008 while(<TH>) {
1009 chomp;
1010 ($num, $text) = split(/:/, $_);
1011
1012 if (defined($thes_symbols{$num})) {
1013 print THOUT "$num:$thes_symbols{$num}:$text\n";
1014 } else {
1015 print THOUT "$num:untranslated:$text\n";
1016 }
1017 }
1018 close TH;
1019 close THOUT;
1020 }
1021
1022
1023
1024
1025}
1026
1027
1028# renumber_phrases
1029#
1030# Prepare the phrases file to be input to mgpp. The biggest problem is
1031# reconciling the phrase identifiers used by the suffix program (which
1032# we'll call suffix-id numbers) with the numbers used in the thesaurus
1033# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
1034# Phind-id numbers must be sorted by frequency of occurance.
1035#
1036# Start creating a set of phind-id numbers from the sorted suffix-id
1037# numbers and (if required) the thesaurus-id numbers. Then add any other
1038# phrases occuring in the thesaurus.
1039#
1040# The last thing we have to do is restore the vocabulary information to the
1041# phrase file so that the phrases are stored as words, not as symbol
1042# numbers.
1043
1044# The original phrases file looks something like this:
1045# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
1046# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
1047# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
1048# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
1049
1050
1051sub renumber_phrases {
1052 my ($self) = @_;
1053
1054 renumber_suffix_data($self);
1055 renumber_thesaurus_data($self);
1056 restore_vocabulary_data($self);
1057
1058}
1059
1060
1061
1062# renumber_suffix_data
1063#
1064# Translate phrases file to phrases.2 using phind keys instead
1065# of suffix keys and sorting the expansion data.
1066
1067sub renumber_suffix_data {
1068 my ($self) = @_;
1069
1070 my $verbosity = $self->{'verbosity'};
1071 my $out = $self->{'outhandle'};
1072 print $out "Translate phrases: suffix-ids become phind-id's\n"
1073 if ($verbosity);
1074
1075 my $phinddir = $self->{'phinddir'};
1076 my $infile = &util::filename_cat($phinddir, 'phrases');
1077 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1078
1079 # Read the phrase file. Calculate initial set of phind-id
1080 # numbers and store (suffixid -> frequency) relation.
1081
1082 my %suffixtophind;
[10253]1083 my @totalfrequency;
[3472]1084 my (@fields, $suffixid);
1085 my $nextphind = 1;
1086
1087 open(IN, "<$infile");
1088 while(<IN>) {
1089
1090 chomp;
1091 @fields = split(/:/, $_);
1092
1093 # get next suffixid and phindid
1094 $suffixid = shift @fields;
1095 $suffixtophind{$suffixid} = $nextphind;
1096
1097 # store total frequency
1098 shift @fields;
1099 $totalfrequency[$nextphind] = shift @fields;
1100
1101 $nextphind++;
1102 }
1103 close IN;
1104
1105
1106 # Translate phrases file to phrases.2. Use phind keys (not suffix
1107 # keys), sort expansion and document occurance data in order of
1108 # descending frequency..
1109 open(IN, "<$infile");
1110 open(OUT, ">$outfile");
1111
1112 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1113 my (@documwents, @newexp, $k, $n);
1114 my $linenumber = 0;
1115
1116 while(<IN>) {
1117
1118 # read the line
1119 chomp;
1120 @fields = split(/:/, $_);
1121
1122 # get a phrase number for this line
1123 $suffixid = shift @fields;
1124 die unless (defined($suffixtophind{$suffixid}));
1125 $phindid = $suffixtophind{$suffixid};
1126
1127 # get the symbols in the phrase
1128 $text = shift @fields;
1129
1130 # output status information
1131 $linenumber++;
1132 if ($verbosity > 2) {
1133 if ($linenumber % 1000 == 0) {
1134 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
[10253]1135 }
1136 # what are $num and $key??
1137 #print $out "$num: $key\t($text)\n" if ($verbosity > 3);
[3472]1138 }
1139
1140 # get the phrase frequency
1141 $tf = shift @fields;
1142
1143 # get the number of expansions
1144 $countexp = shift @fields;
1145
1146 # get the expansions, convert them into phind-id numbers, and sort them
1147 $expansions = shift @fields;
1148 @newexp = ();
1149 foreach $k (split(/,/, $expansions)) {
1150 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1151 $n = $suffixtophind{$k};
1152 push @newexp, $n;
1153 }
1154 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1155
1156 # get the number of documents
1157 $countdocs = shift @fields;
1158
1159 # get the documents and sort them
1160 $documents = shift @fields;
1161 $documents =~ s/d//g;
[10253]1162 my @documents = split(/;/, $documents);
[3472]1163 @documents = sort by_doc_frequency @documents;
1164
1165 # output the phrase data
1166 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1167 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1168
1169 }
1170
1171 close IN;
1172 close OUT;
1173}
1174
1175
1176# renumber_thesaurus_data
1177#
1178# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1179
1180sub renumber_thesaurus_data {
1181 my ($self) = @_;
1182
1183 my $out = $self->{'outhandle'};
1184 my $verbosity = $self->{'verbosity'};
1185 my $thesaurus = $self->{'thesaurus'};
1186
1187 my $phinddir = $self->{'phinddir'};
1188 my $infile = &util::filename_cat($phinddir, "phrases.2");
1189 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1190
1191
1192 # If no thesaurus is defined, simply move the phrases file.
1193 if (!$thesaurus) {
1194 print $out "Translate phrases.2: no thesaurus data\n"
1195 if ($verbosity);
1196 &util::mv($infile, $outfile);
1197 return;
1198 }
1199
1200 print $out "Translate phrases.2: add thesaurus data\n"
1201 if ($verbosity);
1202
1203 # 1.
1204 # Read thesaurus file and store (symbols->thesaurusid) mapping
1205 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1206 my %symbolstothesid;
1207 my (@fields, $thesid, $symbols);
1208
1209 open(TH, "<$thesaurusfile");
1210
1211 while (<TH>) {
1212
1213 chomp;
1214 @fields = split(/:/, $_);
1215
1216 # get id and text
1217 $thesid = shift @fields;
1218 $symbols = shift @fields;
1219 $symbolstothesid{$symbols} = $thesid;
1220 }
1221 close TH;
1222
1223 # 2.
1224 # Read phrases file to find thesaurus entries that already
1225 # have a phindid. Store their phind-ids for later translation,
1226 # and store their frequency for later sorting.
1227 my %thesaurustophindid;
1228 my %phindidtofrequency;
1229 my ($phindid, $freq);
1230
1231 open(IN, "<$infile");
1232
1233 while(<IN>) {
1234
1235 chomp;
1236 @fields = split(/:/, $_);
1237
1238 # phindid and symbols for this line
1239 $phindid = shift @fields;
1240 $symbols = shift @fields;
1241 $freq = shift @fields;
1242
1243 # do we have a thesaurus id corresponding to this phrase?
1244 if (defined($symbolstothesid{$symbols})) {
1245 $thesid = $symbolstothesid{$symbols};
1246 $thesaurustophindid{$thesid} = $phindid;
1247 $phindidtofrequency{$phindid} = $freq;
1248 }
1249 }
1250 close IN;
1251
1252 undef %symbolstothesid;
1253
1254 # 3.
1255 # Create phind-id numbers for remaining thesaurus entries,
1256 # and note that their frequency is 0 for later sorting.
1257 my $nextphindid = $phindid + 1;
1258
1259 open(TH, "<$thesaurusfile");
1260 while(<TH>) {
1261
1262 chomp;
1263 @fields = split(/:/, $_);
1264
1265 # read thesaurus-id and ensure it has a corresponding phind-id
1266 $thesid = shift @fields;
1267 if (!defined($thesaurustophindid{$thesid})) {
1268 $thesaurustophindid{$thesid} = $nextphindid;
1269 $phindidtofrequency{$nextphindid} = 0;
1270 $nextphindid++;
1271 }
1272 }
1273 close TH;
1274
1275 # 4.
1276 # Translate thesaurus file, replacing thesaurus-id numbers with
1277 # phind-id numbers.
1278 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1279 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1280 my (@links, $linkid, %linkidtotype, $newrelation);
1281
1282 open(TH, "<$thesaurusfile");
1283 open(TO, ">$newthesaurusfile");
1284 while(<TH>) {
1285
1286 chomp;
1287 @fields = split(/:/, $_);
1288
1289 # phindid and symbols for this line
1290 ($thesid, $symbols, $relations) = @fields;
1291
1292 die unless ($thesid && $symbols);
1293 die unless $thesaurustophindid{$thesid};
1294 $phindid = $thesaurustophindid{$thesid};
1295
1296 # convert each part of the relation string to use phind-id numbers
1297 # at the same time, we want to sort the list by frequency.
1298 undef %linkidtotype;
1299
1300 foreach $linktext (split(/;/, $relations)) {
1301 @linkdata = split(/,/, $linktext);
1302
1303 # remember the linktype (e.g. BT, NT)
1304 $linktype = shift @linkdata;
1305
1306 # store the type of each link
1307 foreach $thesid (@linkdata) {
1308 die unless (defined($thesaurustophindid{$thesid}));
1309 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1310 }
1311 }
1312
1313 # sort the list of links, first by frequency, then by type.
1314 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1315 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1316 $linkcounter = (scalar @links);
1317
1318 # create a string describing the link information
1319 $linktype = $linkidtotype{$links[0]};
1320 $newrelation = $linktype;
1321 foreach $linkid (@links) {
1322 if ($linkidtotype{$linkid} ne $linktype) {
1323 $linktype = $linkidtotype{$linkid};
1324 $newrelation .= ";" . $linktype;
1325 }
1326 $newrelation .= "," . $linkid;
1327 }
1328 $newrelation .= ";";
1329
1330
1331 # output the new line
1332 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1333 }
1334 close TH;
1335 close TO;
1336
1337 undef %thesaurustophindid;
1338 undef %linkidtotype;
1339 undef %phindidtofrequency;
1340
1341 # 5.
1342 # Read thesaurus data (in phind-id format) into memory
1343 my %thesaurusdata;
1344
1345 open(TH, "<$newthesaurusfile");
1346 while(<TH>) {
1347 chomp;
1348 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1349 die unless ($phindid && $symbols);
1350 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1351 }
1352 close TH;
1353
1354 # 6.
1355 # Add thesaurus data to phrases file
1356 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1357 my (@documwents, @newexp, $k, $n);
1358 my $linenumber = 0;
1359
1360 open(IN, "<$infile");
1361 open(OUT, ">$outfile");
1362
1363 # Update existing phrases
1364 while(<IN>) {
1365
1366 chomp;
1367 @fields = split(/:/, $_);
1368
1369 # get data for this line
1370 $phindid = shift @fields;
1371
1372 # output the phrase data, with thesaurus information
1373 print OUT "$phindid:", join(":", @fields);
1374
1375 # add thesaurus data
1376 if (defined($thesaurusdata{$phindid})) {
1377 @fields = split(/:/, $thesaurusdata{$phindid});
1378 shift @fields;
1379 $linkcounter = shift @fields;
1380 $relations = shift @fields;
1381
1382 print OUT ":$linkcounter:$relations";
1383 $thesaurusdata{$phindid} = "";
1384 }
1385 print OUT "\n";
1386 }
1387 close IN;
1388
1389 # Add phrases that aren't already in the file
1390 foreach $phindid (sort numerically keys %thesaurusdata) {
1391 next unless ($thesaurusdata{$phindid});
1392
1393 @fields = split(/:/, $thesaurusdata{$phindid});
1394 $symbols = shift @fields;
1395 $linkcounter = shift @fields;
1396 $relations = shift @fields;
1397
1398 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1399 }
1400 close OUT;
1401
1402}
1403
1404# restore_vocabulary_data
1405#
1406# Read phrases.3 and restore vocabulary information. Then write
[8362]1407# this data to the MGPP input files (pword.txt and pdata.txt) and
[3472]1408# (if requested) to the saved phrases file.
1409
1410sub restore_vocabulary_data {
1411 my ($self) = @_;
1412
1413 my $out = $self->{'outhandle'};
1414 my $verbosity = $self->{'verbosity'};
1415 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1416
1417 my $phinddir = $self->{'phinddir'};
1418 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1419 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1420 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1421 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1422
1423 my $savephrases = $self->{'savephrases'};
1424
1425 # 1.
1426 # Read the vocabulary file
1427 open(V, "<$vocabfile")
1428 || die "Cannot open $vocabfile: $!";
1429 my @symbol;
1430 my $i = 1;
1431 while(<V>) {
1432 chomp;
1433 $symbol[$i++] = $_;
1434 }
1435 close V;
1436
1437 # 2.
1438 # Translate phrases.3 to MGPP input files
[10253]1439 my ($key, $text, $word, $isThesaurus, $line);
[3472]1440 my @fields;
1441 my $linenumber = 0;
1442
1443 open(IN, "<$infile");
1444 open(DATA, ">$datafile");
1445 open(WORD, ">$wordfile");
1446
1447 # Save the phrases in a separate text file
1448 if ($savephrases) {
1449 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1450 open(SAVE, ">$savephrases");
1451 }
1452
1453 while(<IN>) {
1454
1455 # read the line
1456 chomp;
1457 $line = $_;
1458 @fields = split(/:/, $line);
1459
1460 # get a phrase number for this line
1461 $key = shift @fields;
1462
1463 # restore the text of the phrase
1464 $text = shift @fields;
1465 $text =~ s/s(\d+)/$symbol[$1]/g;
1466 if ($text =~ / /) {
1467 $word = "";
1468 } elsif ($text ne 'untranslated') {
1469 $word = $text;
1470 }
1471
1472 # output the phrase data
1473 print DATA "<Document>";
1474 print DATA "$key:$text:", join(":", @fields), ":\n";
1475
1476 # output the word index search data
1477 print WORD "<Document>$word\n";
1478
1479 # output the phrases to a text file
1480 if ($savephrases) {
1481 if ((scalar @fields) == 7) {
1482 $isThesaurus = 1;
1483 } else {
1484 $isThesaurus = 0;
1485 }
1486 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1487 }
1488 }
1489 close IN;
1490 close WORD;
1491 close DATA;
1492 close SAVE if ($savephrases);
1493
1494}
1495
1496
1497
1498# sort routines used to renumber phrases
1499
1500sub numerically { $a <=> $b }
1501
1502sub by_doc_frequency {
1503 my $fa = 1;
1504 if ($a =~ /,/) {
1505 $fa = $a;
1506 $fa =~ s/\d+,//;
1507 }
1508 my $fb = 1;
1509 if ($b =~ /,/) {
1510 $fb = $b;
1511 $fb =~ s/\d+,//;
1512 }
1513
1514 return ($fb <=> $fa);
1515}
1516
15171;
Note: See TracBrowser for help on using the repository browser.