source: trunk/gsdl/perllib/classify/phind.pm@ 1840

Last change on this file since 1840 was 1840, checked in by paynter, 23 years ago

Changed default suffix size, clean up phrases.3 file

  • Property svn:keywords set to Author Date Id Revision
File size: 35.8 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin
30#
31# options are:
32# button=Name The label for the classifiers button in the
33# navigation bar (defaults to "Phrase").
34# title=Title The metadata field used to describe each document
35# (defaults to "Title").
36# text=fields The text used to build the phrase hierarchy
37# (defaults to "section:Title,section:text").
38# phinddir=directory Location of phind index files
39# verbosity=num Control amount of output
40# untidy=true Do not clean up intermediate files
41# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
42# suffixsize=num Number of symbols available to suffix program
43# savephrases=filename If set, phrase infomation will be stored in filename
44# as text. (By defualt, it is not set.)
45# thesaurus=name Name of a thesaurus stred in phind format in etc dir.
46
47# How a classifier works.
48#
49# For each classifier requested in the collect.cfg file, buildcol.pl creates
50# a new classifier object (such as the one defined in theis file) and later
51# passes each document object to the classifier in turn for classification.
52#
53# Four functions are used:
54#
55# 1. "new" is called before the documents are processed to set up the
56# classifier.
57#
58# 2. "init" is called after buildcol.pl has created the indexes etc but
59# before the documents are classified in order that the classifier might
60# set any variables it requires, etc.
61#
62# 3. "classify" is called once for each document object. The classifier
63# "classifies" each document and updates its local data accordingly.
64#
65# 4. "get_classify_info" is called after every document has been
66# classified. It collates the information about the documents and
67# stores a reference to the classifier so that Greenstone can later
68# display it.
69
70
71package phind;
72
73use BasClas;
74use util;
75
76sub BEGIN {
77 @ISA = ('BasClas');
78}
79
80# Define delimiter symbols - this should be abstracted out someplace
81my $colstart = "COLLECTIONSTART";
82my $colend = "COLLECTIONEND";
83my $doclimit = "DOCUMENTLIMIT";
84my $senlimit = "SENTENCELIMIT";
85my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
86
87
88sub print_usage {
89 print STDERR "
90 usage: classify phind [options]
91
92 options:
93 -title Title to use on web pages
94 -text
95 -title
96 -button
97 -language
98 -savephrases
99 -suffixsize
100 -suffixmode
101 -thesaurus
102 -untidy
103";
104}
105
106# Create a new phind browser based on collect.cfg
107
108sub new {
109 my $class = shift (@_);
110 my $self = new BasClas($class, @_);
111
112 my $out = $self->{'outhandle'};
113
114
115 # Phind installation check
116 # The phind phrase browser is research software and is not installed
117 # by defualt. If the user attepts to use it we warn them that it's a
118 # bit dodgy, then tell them how to install it. If they can do that
119 # and get all the files in place, then we let them proceed.
120
121 print $out "Checking Phind phrase browser requirements...\n";
122
123 # Make sure we're not in windows
124 if ($ENV{'GSDLOS'} =~ /windows/i) {
125 print STDERR "Sorry - Phind currently only works under Unix";
126 exit(1);
127 }
128
129 # Ensure the Phind generate scripts are in place
130 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
131 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
132
133 if (!(-e $file1)) {
134 print STDERR "The phind \"suffix\" program is not installed. ";
135 print STDERR "To install it, change to the directory\n";
136 print STDERR " $src\n";
137 print STDERR "and type \"make install-phind\".\n\n";
138 exit(1);
139 }
140
141 # Ensure the Phind CGI script is in place
142 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
143 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
144
145 if (!(-e $file1)) {
146 print STDERR "The phind CGI program is not installed. ";
147 print STDERR "To install it, change to the directory\n";
148 print STDERR " $src\n";
149 print STDERR "and type \"make install-phind\".\n\n";
150 exit(1);
151 }
152
153 # Ensure the Phind Java applet is in place
154 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
155 $file1 = &util::filename_cat($src, "Phind.class");
156
157 if (!(-e $file1)) {
158 print STDERR "The phind Java classes are not compiled. ";
159 print STDERR "To compile them, change to the directory\n";
160 print STDERR " $src\n";
161 print STDERR "and use your Java compiler to compile Phind.java.\n";
162 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
163 exit(1);
164 }
165
166 # Parse classifier arguments
167 my $builddir = "";
168 my $phinddir = "";
169 if (!parsargv::parse(\@_,
170 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
171 q^title/.*/Title^, \$self->{'titlefield'},
172 q^button/.*/Phrase^, \$self->{'buttonname'},
173 q^language/.*/english^, \$language,
174 q^builddir/.*/^, \$builddir,
175 q^savephrases/\d/0^, \$self->{'savephrases'},
176 q^suffixsize/\d+/100000^, \$self->{'suffixsize'},
177 q^suffixmode/\d/1^, \$self->{'suffixmode'},
178 q^thesaurus/.*/^, \$self->{'thesaurus'},
179 q^untidy^, \$self->{'untidy'},
180 "allow_extra_options")) {
181
182 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
183 &print_usage();
184 die "\n";
185 }
186
187 # classifier information
188 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
189
190 # limit languages
191 $language =~ s/,/\|/g;
192 $self->{'language_exp'} = $language;
193
194 # collection directories
195 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
196 if (!$builddir) {
197 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
198 }
199 $self->{'builddir'} = $builddir;
200 $self->{'phinddir'} = &util::filename_cat($builddir, "phind");
201
202 return bless $self, $class;
203}
204
205
206# Initialise the phind classifier
207
208sub init {
209 my $self = shift (@_);
210
211 # ensure we have a build directory
212 my $builddir = $self->{'builddir'};
213 die unless (-e "$builddir");
214
215 # create phind directory
216 my $phinddir = $self->{'phinddir'};
217 if (-e "$phinddir") {
218 &util::rm_r("$phinddir");
219 }
220 &util::mk_dir("$phinddir");
221
222 # open filehandles for documents and text
223 my $clausefile = &util::filename_cat("$phinddir", "clauses");
224 &util::rm($clausefile) if (-e $clausefile);
225 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
226 $self->{'txthandle'} = TEXT;
227
228 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
229 &util::rm($docfile) if (-e $docfile);
230 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
231 $self->{'dochandle'} = DOCS;
232
233}
234
235# Classify each document.
236#
237# Each document is passed here in turn. The classifier extracts the
238# text of each and stores it in the clauses file. Document details are
239# stored in the docs.txt file.
240
241sub classify {
242 my $self = shift (@_);
243 my ($doc_obj) = @_;
244
245 my $verbosity = $self->{'verbosity'};
246 my $top_section = $doc_obj->get_top_section();
247
248 my $titlefield = $self->{'titlefield'};
249
250 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
251 print "process: $title\n" if ($verbosity > 2);
252
253
254 # only consider english-language files
255 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
256 my $phrlanguage = $self->{'language_exp'};
257 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
258
259 # record this file
260 my $total++;
261 print "file $total: $file\n" if ($self->{'$verbosity'});
262
263
264 # Store document details
265 my $OID = $doc_obj->get_OID();
266 $OID = "NULL" unless defined $OID;
267 my $dochandle = $self->{'dochandle'};
268 print $dochandle "<Document>\t$OID\t$title\n";
269
270 # Store the text occuring in this object
271
272 # output the document delimiter
273 my $txthandle = $self->{'txthandle'};
274 print $txthandle "$doclimit\n";
275
276 # iterarate over the required indexes and store their text
277 my $indexes = $self->{'indexes'};
278 my $text = "";
279 my ($part, $level, $field, $section, $data);
280
281 foreach $part (split(/,/, $indexes)) {
282
283 # Each field has a level and a data element ((e.g. document:Title)
284 ($level, $field) = split(/:/, $part);
285 die unless ($level && $field);
286
287 # Extract the text from every section
288 # (In phind, document:text and section:text are equivalent)
289 if ($field eq "text") {
290 $data = "";
291 $section = $doc_obj->get_top_section();
292 while (defined($section)) {
293 $data .= $doc_obj->get_text($section) . "\n";
294 $section = $doc_obj->get_next_section($section);
295 }
296 $text .= convert_gml_to_tokens($data) . "\n";
297 }
298
299 # Extract a metadata field from a document
300 elsif ($level eq "document") {
301 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
302 $text .= convert_gml_to_tokens($data) . "\n";
303 }
304
305 # Extract metadata from every section in a document
306 elsif ($level eq "section") {
307 $data = "";
308 $section = $doc_obj->get_top_section();
309 while (defined($section)) {
310 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
311 $section = $doc_obj->get_next_section($section);
312 }
313 $text .= convert_gml_to_tokens($data) . "\n";
314 }
315
316 # Some sort of specification which I don't understand
317 else {
318 die "Unknown level ($level) in phind index ($part)\n";
319 }
320
321 }
322
323 # output the text
324 $text =~ tr/\n//s;
325 print $txthandle "$text";
326
327}
328
329
330
331# Construct the classifier from the information already gathered
332#
333# When get_classify_info is called, the clauses and docs.txt files have
334# already been constructed in the phind directory. This function will
335# translate them into compressed, indexed MGPP files that can be read by
336# the phindcgi script. It will also register our classifier so that it
337# shows up in the navigation bar.
338
339sub get_classify_info {
340 my $self = shift (@_);
341
342 my $verbosity = $self->{'verbosity'};
343 my $out = $self->{'outhandle'};
344
345 my $phinddir = $self->{'phinddir'};
346 my $language = "english";
347
348 if ($verbosity) {
349 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
350 }
351
352 # Construct phind indexes
353 my $suffixmode = $self->{'suffixmode'};
354 my $suffixsize = $self->{'suffixsize'};
355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
360 &extract_vocabulary($self);
361
362 # Use the suffix program to generate the phind/phrases file
363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
364 &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity, $out);
365
366 # Create the phrase file and put phrase numbers in phind/phrases
367 print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
368 &renumber_phrases($self);
369
370 # Create the mg phrase database
371 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
372 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
373 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
374
375 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
376 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
377 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
378 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
379
380 print $out "\nCreating phrase databases\n";
381 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
382 my $mg_stem = "pdata";
383
384 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
385 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
386 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
387
388 # create the mg index of words
389 print $out "\nCreating word-level search indexes\n";
390 $mg_input = &util::filename_cat($phinddir, "pword.txt");
391 $mg_stem = "pword";
392
393 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
394 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
395 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
396 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
397 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
398 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
399
400 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
401 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
402 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
403
404 # create the mg document information database
405 print $out "\nCreating document information databases\n";
406 $mg_input = &util::filename_cat($phinddir, "docs.txt");
407 $mg_stem = "docs";
408
409 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
410 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
411 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
412
413
414 # Tidy up stray files
415 if (!$self->{'untidy'}) {
416 print $out "\nCleaning up\n" if ($verbosity > 2);
417 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
418 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
419 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
420 "$phinddir/pdata.txt", "$phinddir/pword.txt");
421 my $outfile = 1;
422 while (-e "$phinddir/outPhrase.$outfile") {
423 &util::rm("$phinddir/outPhrase.$outfile");
424 $outfile++;
425 }
426 }
427
428
429 # Insert the classifier into.... what?
430 my $collection = $self->{'collection'};
431 my $url = "library?a=p&p=phind&c=$collection";
432
433 my %classifyinfo = ('thistype'=>'Invisible',
434 'childtype'=>'Phind',
435 'Title'=>$self->{'buttonname'},
436 'contains'=>[]);
437
438 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
439 return \%classifyinfo;
440}
441
442
443
444sub convert_gml_to_tokens {
445
446 $_ = shift @_;
447
448 # FIRST, remove GML tags
449
450 # Replace all whitespace with a simple space
451 s/\s+/ /gs;
452
453 # Remove everything that is in a tag
454 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
455 s/\s*<br>\s*/ LINEBREAK /isg;
456 s/<[^>]*>/ /sg;
457
458 # Now we have the text, but it may contain HTML
459 # elements coded as &gt; etc. Remove these tags.
460 s/&lt;/</sg;
461 s/&gt;/>/sg;
462
463 s/\s+/ /sg;
464 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
465 s/\s*<br>\s*/ LINEBREAK /isg;
466 s/<[^>]*>/ /sg;
467
468 # remove &amp; and other miscellaneous markup tags
469 s/&amp;/&/sg;
470 s/&lt;/</sg;
471 s/&gt;/>/sg;
472 s/&amp;/&/sg;
473
474 # replace<p> and <br> placeholders with carriage returns
475 s/PARAGRAPHBREAK/\n/sg;
476 s/LINEBREAK/\n/sg;
477
478
479 # Exceptional punctuation
480 #
481 # We make special cases of some punctuation
482
483 # remove any apostrophe that indicates omitted letters
484 s/(\w+)\'(\w*\s)/ $1$2 /g;
485
486 # remove period that appears in a person's initals
487 s/\s([A-Z])\./ $1 /g;
488
489 # replace hyphens in hypheanted words and names with a space
490 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
491
492
493 # Convert the remaining text to "clause format",
494 # This means removing all excess punctuation and garbage text,
495 # normalising valid punctuation to fullstops and commas,
496 # then putting one cluse on each line.
497
498 # Insert newline when the end of a sentence is detected
499 # (delimter is: "[\.\?\!]\s")
500 s/\s*[\.\?\!]\s+/\n/g;
501
502 # split numbers after four digits
503 s/(\d\d\d\d)/$1 /g;
504
505 # split words after 32 characters
506
507 # squash repeated punctuation
508 tr/A-Za-z0-9 //cs;
509
510 # save email addresses
511 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
512
513 # normalise clause breaks (mostly punctuation symbols) to commas
514 s/[^A-Za-z0-9 \n]+/ , /g;
515
516 # Remove repeated commas, and replace with newline
517 s/\s*,[, ]+/\n/g;
518
519 # remove extra whitespace
520 s/ +/ /sg;
521 s/^\s+//mg;
522 s/\s*$/\n/mg;
523
524 # remove lines that contain one word or less
525 s/^\w*$//mg;
526 s/^\s*$//mg;
527 tr/\n//s;
528
529 return $_;
530}
531
532
533# Execute a system command
534
535sub execute {
536 my ($command, $verbosity, $outhandle) = @_;
537 print $outhandle "Executing: $command\n" if ($verbosity > 2);
538 my $status = system($command);
539 if ($status != 0) {
540 print STDERR "phindgen.pl - Error executing $command: $!\n";
541 exit($status);
542 }
543}
544
545
546# Generate the vocabulary, symbol statistics, and numbers file from the
547# clauses file. This is legacy code, so is a bit messy and probably wont
548# run under windows.
549
550sub extract_vocabulary {
551 my ($self) = @_;
552
553 my $verbosity = $self->{'verbosity'};
554 my $out = $self->{'outhandle'};
555
556 my $language = "english"; # $self->{'language'};
557
558 my $collectiondir = $self->{'collectiondir'};
559
560 my $phinddir = $self->{'phinddir'};
561
562 my ($w, $l, $line, $word);
563
564 my ($first_delimiter, $last_delimiter,
565 $first_stopword, $last_stopword,
566 $first_extractword, $last_extractword,
567 $first_contentword, $last_contentword,
568 $phrasedelimiter);
569
570 my $thesaurus = $self->{'thesaurus'};
571 my ($thesaurus_links, $thesaurus_terms,
572 %thesaurus, $first_thesaurusword, $last_thesaurusword);
573
574 my %symbol;
575 my (%freq);
576
577 print $out "Calculating vocabulary\n" if ($verbosity > 1);
578
579 # Read and store the stopwords
580 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
581 my %stopwords;
582 foreach $w (split(/\s+/, $words)) {
583 $l = lc($w);
584 $stopwords{$l} = $w;
585 }
586
587 # Read thesaurus information
588 if ($thesaurus) {
589
590 # Ensure both link and term files exist
591 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
592 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
593 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
594 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
595
596 # Read the thesaurus terms
597 open(TH, "<$thesaurus_terms");
598 while(<TH>) {
599 s/^\d+ //;
600 s/\(.*\)//;
601 foreach $w (split(/\s+/, $_)) {
602 $thesaurus{lc($w)} = $w;
603 }
604 }
605 close TH;
606 }
607
608 # Read words in the text and count occurences
609 open(TXT, "<$phinddir/clauses");
610 my @words;
611
612 while(<TXT>) {
613 $line = $_;
614 next unless ($line =~ /./);
615
616 @words = split(/\s+/, $line);
617 foreach $w (@words) {
618 $l = lc($w);
619 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
620 $freq{$w}++;
621 }
622 $freq{$senlimit}++;
623 }
624
625 # Calculate the "best" form of each word
626 my (%bestform, %totalfreq, %bestfreq);
627
628 foreach $w (sort (keys %freq)) {
629 $l = lc($w);
630
631 # totalfreq is the number of times a term appears in any form
632 $totalfreq{$l} += $freq{$w};
633
634 if (defined $stopwords{$l}) {
635 $bestform{$l} = $stopwords{$l};
636
637 } elsif (defined $thesaurus{$l}) {
638 $bestform{$l} = $thesaurus{$l};
639
640 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
641 $bestfreq{$l} = $freq{$w};
642 $bestform{$l} = $w;
643 }
644 }
645
646 undef %freq;
647 undef %bestfreq;
648
649
650 # Assign symbol numbers to tokens
651 my $nextsymbol = 1;
652 my (@vocab);
653
654 # Delimiters
655 $first_delimiter = 1;
656
657 foreach $word (@delimiters) {
658
659 $word = lc($word);
660 $bestform{$word} = uc($word);
661 $vocab[$nextsymbol] = $word;
662 $symbol{$word} = $nextsymbol;
663 $nextsymbol++;
664 }
665 $last_delimiter = $nextsymbol - 1;
666
667 # Stopwords
668 $first_stopword = $nextsymbol;
669
670 foreach my $word (sort keys %stopwords) {
671
672 # don't incluse stopword unless it occurs in the text
673 $word = lc($word);
674 next unless ($totalfreq{$word});
675 next if ($symbol{$word});
676
677 $vocab[$nextsymbol] = $word;
678 $symbol{$word} = $nextsymbol;
679 $nextsymbol++;
680 }
681 $last_stopword = $nextsymbol - 1;
682 $first_contentword = $nextsymbol;
683
684 # Thesaurus terms
685 if ($thesaurus) {
686 $first_thesaurusword = $nextsymbol;
687
688 foreach my $word (sort keys %thesaurus) {
689
690 $word = lc($word);
691 next if ($symbol{$word});
692 $bestform{$word} = $thesaurus{$word};
693
694 $vocab[$nextsymbol] = $word;
695 $symbol{$word} = $nextsymbol;
696 $nextsymbol++;
697
698 }
699 $last_thesaurusword = $nextsymbol - 1;
700 }
701
702 # Other content words
703 $first_extractword = $nextsymbol;
704
705 foreach my $word (sort (keys %bestform)) {
706
707 next if ($symbol{$word});
708
709 $vocab[$nextsymbol] = $word;
710 $symbol{$word} = $nextsymbol;
711 $nextsymbol++;
712 }
713 $last_extractword = $nextsymbol - 1;
714 $last_contentword = $nextsymbol - 1;
715
716
717 # Outut the words
718 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
719 open(VOC, ">$phinddir/clauses.vocab");
720
721 for (my $i = 1; $i < $nextsymbol; $i++) {
722 $w = $vocab[$i];
723
724 print VOC "$bestform{$w}\n";
725 $totalfreq{$w} = 0 unless ($totalfreq{$w});
726 }
727 close VOC;
728
729
730 # Create statistics file
731 # Output statistics about the vocablary
732 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
733 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
734
735 open(STAT, ">$phinddir/clauses.stats")
736 || die "Cannot open $phinddir/clauses.stats: $!";
737
738 print STAT "first_delimiter $first_delimiter\n";
739 print STAT "last_delimiter $last_delimiter\n";
740 print STAT "first_stopword $first_stopword\n";
741 print STAT "last_stopword $last_stopword\n";
742 if ($thesaurus) {
743 print STAT "first_thesaurusword $first_thesaurusword\n";
744 print STAT "last_thesaurusword $last_thesaurusword\n";
745 }
746 print STAT "first_extractword $first_extractword\n";
747 print STAT "last_extractword $last_extractword\n";
748 print STAT "first_contentword $first_contentword\n";
749 print STAT "last_contentword $last_contentword\n";
750 print STAT "first_symbol $first_delimiter\n";
751 print STAT "last_symbol $last_contentword\n";
752 print STAT "first_word $first_stopword\n";
753 print STAT "last_word $last_contentword\n";
754 close STAT;
755
756 undef @vocab;
757
758
759 # Create numbers file
760 # Save text as symbol numbers
761 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
762
763 open(TXT, "<$phinddir/clauses");
764 open(NUM, ">$phinddir/clauses.numbers");
765
766 $phrasedelimiter = $symbol{lc($senlimit)};
767 print NUM "$symbol{lc($colstart)}\n";
768
769 # set up the special symbols that delimit documents and sentences
770 while(<TXT>) {
771
772 # split sentence into a list of tokens
773 $line = $_;
774 next unless ($line =~ /./);
775 @words = split(/\s+/, $line);
776
777 # output one token at a time
778 foreach $word (@words) {
779 $word = lc($word);
780 print NUM "$symbol{$word}\n";
781 }
782
783 # output phrase delimiter
784 print NUM "$phrasedelimiter\n";
785 }
786
787 print NUM "$symbol{lc($colend)}\n";
788 close NUM;
789
790 # Save thesaurus data in one convienient file
791 if ($thesaurus) {
792
793 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
794
795
796 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
797 if ($verbosity > 1);
798
799 # Read the thesaurus terms
800 my ($num, $text, %thes_symbols);
801
802 open(TH, "<$thesaurus_terms");
803 while(<TH>) {
804 chomp;
805 @words = split(/\s+/, $_);
806 $num = shift @words;
807 $text = "";
808
809 # translate words into symbol numbers
810 foreach $word (@words) {
811 $word = lc($word);
812 if ($symbol{$word}) {
813 $text .= "s$symbol{$word} ";
814 } elsif ($verbosity) {
815 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
816 }
817 }
818 $text =~ s/ $//;
819 $thes_symbols{$num} = $text;
820 }
821 close TH;
822
823 # Read the thesaurus links and write the corresponding data
824 open(TH, "<$thesaurus_links");
825 open(THOUT, ">$thesaurusfile");
826
827 while(<TH>) {
828 chomp;
829 ($num, $text) = split(/:/, $_);
830
831 if (defined($thes_symbols{$num})) {
832 print THOUT "$num:$thes_symbols{$num}:$text\n";
833 } else {
834 print THOUT "$num:untranslated:$text\n";
835 }
836 }
837 close TH;
838 close THOUT;
839 }
840
841
842
843
844}
845
846
847# renumber_phrases
848#
849# Prepare the phrases file to be input to mgpp. The biggest problem is
850# reconciling the phrase identifiers used by the suffix program (which
851# we'll call suffix-id numbers) with the numbers used in the thesaurus
852# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
853# Phind-id numbers must be sorted by frequency of occurance.
854#
855# Start creating a set of phind-id numbers from the sorted suffix-id
856# numbers and (if required) the thesaurus-id numbers. Then add any other
857# phrases occuring in the thesaurus.
858#
859# The last thing we have to do is restore the vocabulary information to the
860# phrase file so that the phrases are stored as words, not as symbol
861# numbers.
862
863# The original phrases file looks something like this:
864# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
865# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
866# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
867# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
868
869
870sub renumber_phrases {
871 my ($self) = @_;
872
873 renumber_suffix_data($self);
874 renumber_thesaurus_data($self);
875 restore_vocabulary_data($self);
876
877}
878
879
880
881# renumber_suffix_data
882#
883# Translate phrases file to phrases.2 using phind keys instead
884# of suffix keys and sorting the expansion data.
885
886sub renumber_suffix_data {
887 my ($self) = @_;
888
889 my $verbosity = $self->{'verbosity'};
890 my $out = $self->{'outhandle'};
891 print $out "Translate phrases: suffix-ids become phind-id's\n"
892 if ($verbosity);
893
894 my $phinddir = $self->{'phinddir'};
895 my $infile = &util::filename_cat($phinddir, 'phrases');
896 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
897
898 # Read the phrase file. Calculate initial set of phind-id
899 # numbers and store (suffixid -> frequency) relation.
900
901 my %suffixtophind;
902 my @phindfrequency;
903 my (@fields, $suffixid);
904 my $nextphind = 1;
905
906 open(IN, "<$infile");
907 while(<IN>) {
908
909 chomp;
910 @fields = split(/:/, $_);
911
912 # get next suffixid and phindid
913 $suffixid = shift @fields;
914 $suffixtophind{$suffixid} = $nextphind;
915
916 # store total frequency
917 shift @fields;
918 $totalfrequency[$nextphind] = shift @fields;
919
920 $nextphind++;
921 }
922 close IN;
923
924
925 # Translate phrases file to phrases.2. Use phind keys (not suffix
926 # keys), sort expansion and document occurance data in order of
927 # descending frequency..
928 open(IN, "<$infile");
929 open(OUT, ">$outfile");
930
931 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
932 my (@documwents, @newexp, $k, $n);
933 my $linenumber = 0;
934
935 while(<IN>) {
936
937 # read the line
938 chomp;
939 @fields = split(/:/, $_);
940
941 # get a phrase number for this line
942 $suffixid = shift @fields;
943 die unless (defined($suffixtophind{$suffixid}));
944 $phindid = $suffixtophind{$suffixid};
945
946 # get the symbols in the phrase
947 $text = shift @fields;
948
949 # output status information
950 $linenumber++;
951 if ($verbosity > 2) {
952 if ($linenumber % 1000 == 0) {
953 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
954 }
955 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
956 }
957
958 # get the phrase frequency
959 $tf = shift @fields;
960
961 # get the number of expansions
962 $countexp = shift @fields;
963
964 # get the expansions, convert them into phind-id numbers, and sort them
965 $expansions = shift @fields;
966 @newexp = ();
967 foreach $k (split(/,/, $expansions)) {
968 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
969 $n = $suffixtophind{$k};
970 push @newexp, $n;
971 }
972 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
973
974 # get the number of documents
975 $countdocs = shift @fields;
976
977 # get the documents and sort them
978 $documents = shift @fields;
979 $documents =~ s/d//g;
980 @documents = split(/;/, $documents);
981 @documents = sort by_doc_frequency @documents;
982
983 # output the phrase data
984 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
985 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
986
987 }
988
989 close IN;
990 close OUT;
991}
992
993
994# renumber_thesaurus_data
995#
996# Translate phrases.2 to phrases.3, adding thesaurus data if available.
997
998sub renumber_thesaurus_data {
999 my ($self) = @_;
1000
1001 my $out = $self->{'outhandle'};
1002 my $verbosity = $self->{'verbosity'};
1003 my $thesaurus = $self->{'thesaurus'};
1004
1005 my $phinddir = $self->{'phinddir'};
1006 my $infile = &util::filename_cat($phinddir, "phrases.2");
1007 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1008
1009
1010 # If no thesaurus is defined, simply move the phrases file.
1011 if (!$thesaurus) {
1012 print $out "Translate phrases.2: no thesaurus data\n"
1013 if ($verbosity);
1014 &util::mv($infile, $outfile);
1015 return;
1016 }
1017
1018 print $out "Translate phrases.2: add thesaurus data\n"
1019 if ($verbosity);
1020
1021 # 1.
1022 # Read thesaurus file and store (symbols->thesaurusid) mapping
1023 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1024 my %symbolstothesid;
1025 my (@fields, $thesid, $symbols);
1026
1027 open(TH, "<$thesaurusfile");
1028
1029 while (<TH>) {
1030
1031 chomp;
1032 @fields = split(/:/, $_);
1033
1034 # get id and text
1035 $thesid = shift @fields;
1036 $symbols = shift @fields;
1037 $symbolstothesid{$symbols} = $thesid;
1038 }
1039 close TH;
1040
1041 # 2.
1042 # Read phrases file to find thesaurus entries that already
1043 # have a phindid. Store their phind-ids for later translation.
1044 my %thesaurustophindid;
1045 my ($phindid);
1046
1047 open(IN, "<$infile");
1048
1049 while(<IN>) {
1050
1051 chomp;
1052 @fields = split(/:/, $_);
1053
1054 # phindid and symbols for this line
1055 $phindid = shift @fields;
1056 $symbols = shift @fields;
1057
1058 # do we have a thesaurus id corresponding to this phrase?
1059 if (defined($symbolstothesid{$symbols})) {
1060 $thesid = $symbolstothesid{$symbols};
1061 $thesaurustophindid{$thesid} = $phindid;
1062 }
1063 }
1064 close IN;
1065
1066 undef %symbolstothesid;
1067
1068 # 3.
1069 # Create phind-id numbers for remaining thesaurus entries
1070 my $nextphindid = $phindid + 1;
1071
1072 open(TH, "<$thesaurusfile");
1073 while(<TH>) {
1074
1075 chomp;
1076 @fields = split(/:/, $_);
1077
1078 # read thesaurus-id and ensure it has a corresponding phind-id
1079 $thesid = shift @fields;
1080 if (!defined($thesaurustophindid{$thesid})) {
1081 $thesaurustophindid{$thesid} = $nextphindid;
1082 $nextphindid++;
1083 }
1084 }
1085 close TH;
1086
1087 # 4.
1088 # Translate thesaurus file, replacing thesaurus-id numbers with
1089 # phind-id numbers.
1090 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1091 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1092
1093 open(TH, "<$thesaurusfile");
1094 open(TO, ">$newthesaurusfile");
1095 while(<TH>) {
1096
1097 chomp;
1098 @fields = split(/:/, $_);
1099
1100 # phindid and symbols for this line
1101 ($thesid, $symbols, $relations) = @fields;
1102
1103 die unless ($thesid && $symbols);
1104 die unless $thesaurustophindid{$thesid};
1105 $phindid = $thesaurustophindid{$thesid};
1106
1107 # convert each part of the relation string to use phind-id numbers
1108 $newrelation = "";
1109 $linkcounter = 0;
1110 foreach $linktext (split(/;/, $relations)) {
1111 @linkdata = split(/,/, $linktext);
1112
1113 # remember the linktype (e.g. BT, NT)
1114 $linktype = shift @linkdata;
1115 $newrelation .= "$linktype,";
1116
1117 # convert the link target identfiers
1118 foreach $link (@linkdata) {
1119 die unless (defined($thesaurustophindid{$link}));
1120 $newrelation .= "$thesaurustophindid{$link},";
1121 $linkcounter++;
1122 }
1123 $newrelation =~ s/\,$//;
1124 $newrelation .= ";";
1125 }
1126 $newrelation .= ":";
1127
1128 print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1129 }
1130 close TH;
1131 close TO;
1132
1133 undef %thesaurustophindid;
1134
1135 # 5.
1136 # Read thesaurus data (in phind-id format) into memory
1137 my %thesaurusdata;
1138
1139 open(TH, "<$newthesaurusfile");
1140 while(<TH>) {
1141 chomp;
1142 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1143 die unless ($phindid && $symbols);
1144 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1145 }
1146
1147 # 6.
1148 # Add thesaurus data to phrases file
1149 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1150 my (@documwents, @newexp, $k, $n);
1151 my $linenumber = 0;
1152
1153 open(IN, "<$infile");
1154 open(OUT, ">$outfile");
1155
1156 # Update existing phrases
1157 while(<IN>) {
1158
1159 chomp;
1160 @fields = split(/:/, $_);
1161
1162 # get data for this line
1163 $phindid = shift @fields;
1164
1165 # output the phrase data, with thesaurus information
1166 print OUT "$phindid:", join(":", @fields);
1167
1168 # add thesaurus data
1169 if (defined($thesaurusdata{$phindid})) {
1170 @fields = split(/:/, $thesaurusdata{$phindid});
1171 shift @fields;
1172 $linkcounter = shift @fields;
1173 $relations = shift @fields;
1174
1175 print OUT ":$linkcounter:$relations";
1176 $thesaurusdata{$phindid} = "";
1177 }
1178 print OUT "\n";
1179 }
1180 close IN;
1181
1182 # Add phrases that aren't already in the file
1183 foreach $phindid (sort numerically keys %thesaurusdata) {
1184 next unless ($thesaurusdata{$phindid});
1185
1186 @fields = split(/:/, $thesaurusdata{$phindid});
1187 $symbols = shift @fields;
1188 $linkcounter = shift @fields;
1189 $relations = shift @fields;
1190
1191 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1192 }
1193 close OUT;
1194
1195}
1196
1197# restore_vocabulary_data
1198#
1199# Read phrases.3 and restore vocabulary information. Then write
1200# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1201# (if requested) to the saved phrases file.
1202
1203sub restore_vocabulary_data {
1204 my ($self) = @_;
1205
1206 my $out = $self->{'outhandle'};
1207 my $verbosity = $self->{'verbosity'};
1208 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1209
1210 my $phinddir = $self->{'phinddir'};
1211 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1212 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1213 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1214 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1215
1216 my $savephrases = $self->{'savephrases'};
1217
1218 # 1.
1219 # Read the vocabulary file
1220 open(V, "<$vocabfile")
1221 || die "Cannot open $vocabfile: $!";
1222 my @symbol;
1223 my $i = 1;
1224 while(<V>) {
1225 chomp;
1226 $symbol[$i++] = $_;
1227 }
1228
1229
1230 # 2.
1231 # Translate phrases.3 to MGPP input files
1232 my ($key, $text, $word);
1233 my @fields;
1234 my $linenumber = 0;
1235
1236 open(IN, "<$infile");
1237 open(DATA, ">$datafile");
1238 open(WORD, ">$wordfile");
1239
1240 # Save the phrases in a separate text file
1241 if ($savephrases) {
1242 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1243 open(SAVE, ">$savephrases");
1244 }
1245
1246 while(<IN>) {
1247
1248 # read the line
1249 chomp;
1250 $line = $_;
1251 @fields = split(/:/, $line);
1252
1253 # get a phrase number for this line
1254 $key = shift @fields;
1255
1256 # restore the text of the phrase
1257 $text = shift @fields;
1258 $text =~ s/s(\d+)/$symbol[$1]/g;
1259 if ($text =~ / /) {
1260 $word = "";
1261 } elsif ($text ne 'untranslated') {
1262 $word = $text;
1263 }
1264
1265 # output the phrase data
1266 print DATA "<Document>";
1267 print DATA "$key:$text:", join(":", @fields), ":\n";
1268
1269 # output the word index search data
1270 print WORD "<Document>$word\n";
1271
1272 # output the phrases to a text file
1273 if ($savephrases) {
1274 print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1275 }
1276 }
1277 close IN;
1278 close WORD;
1279 close DATA;
1280 close SAVE if ($savephrases);
1281
1282}
1283
1284
1285
1286# sort routines used to renumber phrases
1287
1288sub numerically { $a <=> $b }
1289
1290sub by_doc_frequency {
1291 my $fa = 1;
1292 if ($a =~ /,/) {
1293 $fa = $a;
1294 $fa =~ s/\d+,//;
1295 }
1296 my $fb = 1;
1297 if ($b =~ /,/) {
1298 $fb = $b;
1299 $fb =~ s/\d+,//;
1300 }
1301
1302 return ($fb <=> $fa);
1303}
1304
13051;
Note: See TracBrowser for help on using the repository browser.