source: trunk/gsdl/perllib/classify/phind.pm@ 1871

Last change on this file since 1871 was 1871, checked in by paynter, 23 years ago

Use two-letter codes for language names, updated docs.

  • Property svn:keywords set to Author Date Id Revision
File size: 35.8 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin
30#
31# options are:
32# button=Name The label for the classifiers button in the
33# navigation bar (defaults to "Phrase").
34# -title Title The metadata field used to describe each document
35# (defaults to "Title").
36# -text fields The text used to build the phrase hierarchy
37# (defaults to "section:Title,section:text").
38# -phinddir directory Location of phind index files
39# -verbosity num Control amount of output
40# -untidy Do not clean up intermediate files
41# -suffixmode num Mode of suffix program (0 = all phrases, 1 = stopword)
42# -suffixsize num Number of symbols available to suffix program
43# -savephrases filename If set, phrase infomation will be stored in filename
44# as text. (By defualt, it is not set.)
45# -thesaurus name Name of a thesaurus stred in phind format in etc dir.
46
47# How a classifier works.
48#
49# For each classifier requested in the collect.cfg file, buildcol.pl creates
50# a new classifier object (such as the one defined in theis file) and later
51# passes each document object to the classifier in turn for classification.
52#
53# Four functions are used:
54#
55# 1. "new" is called before the documents are processed to set up the
56# classifier.
57#
58# 2. "init" is called after buildcol.pl has created the indexes etc but
59# before the documents are classified in order that the classifier might
60# set any variables it requires, etc.
61#
62# 3. "classify" is called once for each document object. The classifier
63# "classifies" each document and updates its local data accordingly.
64#
65# 4. "get_classify_info" is called after every document has been
66# classified. It collates the information about the documents and
67# stores a reference to the classifier so that Greenstone can later
68# display it.
69
70
71package phind;
72
73use BasClas;
74use util;
75
76sub BEGIN {
77 @ISA = ('BasClas');
78}
79
80# Define delimiter symbols - this should be abstracted out someplace
81my $colstart = "COLLECTIONSTART";
82my $colend = "COLLECTIONEND";
83my $doclimit = "DOCUMENTLIMIT";
84my $senlimit = "SENTENCELIMIT";
85my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
86
87
88sub print_usage {
89 print STDERR "
90 usage: classify phind [options]
91
92 options:
93 -title Title to use on web pages
94 -text
95 -title
96 -button
97 -language
98 -savephrases
99 -suffixsize
100 -suffixmode
101 -thesaurus
102 -untidy
103";
104}
105
106# Create a new phind browser based on collect.cfg
107
108sub new {
109 my $class = shift (@_);
110 my $self = new BasClas($class, @_);
111
112 my $out = $self->{'outhandle'};
113
114
115 # Phind installation check
116 # The phind phrase browser is research software and is not installed
117 # by defualt. If the user attepts to use it we warn them that it's a
118 # bit dodgy, then tell them how to install it. If they can do that
119 # and get all the files in place, then we let them proceed.
120
121 print $out "Checking Phind phrase browser requirements...\n";
122
123 # Make sure we're not in windows
124 if ($ENV{'GSDLOS'} =~ /windows/i) {
125 print STDERR "Sorry - Phind currently only works under Unix";
126 exit(1);
127 }
128
129 # Ensure the Phind generate scripts are in place
130 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
131 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
132
133 if (!(-e $file1)) {
134 print STDERR "The phind \"suffix\" program is not installed. ";
135 print STDERR "To install it, change to the directory\n";
136 print STDERR " $src\n";
137 print STDERR "and type \"make install-phind\".\n\n";
138 exit(1);
139 }
140
141 # Ensure the Phind CGI script is in place
142 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
143 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
144
145 if (!(-e $file1)) {
146 print STDERR "The phind CGI program is not installed. ";
147 print STDERR "To install it, change to the directory\n";
148 print STDERR " $src\n";
149 print STDERR "and type \"make install-phind\".\n\n";
150 exit(1);
151 }
152
153 # Ensure the Phind Java applet is in place
154 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
155 $file1 = &util::filename_cat($src, "Phind.class");
156
157 if (!(-e $file1)) {
158 print STDERR "The phind Java classes are not compiled. ";
159 print STDERR "To compile them, change to the directory\n";
160 print STDERR " $src\n";
161 print STDERR "and use your Java compiler to compile Phind.java.\n";
162 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
163 exit(1);
164 }
165
166 # Parse classifier arguments
167 my $builddir = "";
168 my $phinddir = "";
169 if (!parsargv::parse(\@_,
170 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
171 q^title/.*/Title^, \$self->{'titlefield'},
172 q^button/.*/Phrase^, \$self->{'buttonname'},
173 q^language/.*/en^, \$language,
174 q^builddir/.*/^, \$builddir,
175 q^savephrases/\d/0^, \$self->{'savephrases'},
176 q^suffixsize/\d+/100000^, \$self->{'suffixsize'},
177 q^suffixmode/\d/1^, \$self->{'suffixmode'},
178 q^thesaurus/.*/^, \$self->{'thesaurus'},
179 q^untidy^, \$self->{'untidy'},
180 "allow_extra_options")) {
181
182 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
183 &print_usage();
184 die "\n";
185 }
186
187 # classifier information
188 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
189
190 # limit languages
191 $self->{'language_exp'} = $language;
192
193 # collection directories
194 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
195 if (!$builddir) {
196 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
197 }
198 $self->{'builddir'} = $builddir;
199 $self->{'phinddir'} = &util::filename_cat($builddir, "phind");
200
201 return bless $self, $class;
202}
203
204
205# Initialise the phind classifier
206
207sub init {
208 my $self = shift (@_);
209
210 # ensure we have a build directory
211 my $builddir = $self->{'builddir'};
212 die unless (-e "$builddir");
213
214 # create phind directory
215 my $phinddir = $self->{'phinddir'};
216 if (-e "$phinddir") {
217 &util::rm_r("$phinddir");
218 }
219 &util::mk_dir("$phinddir");
220
221 # open filehandles for documents and text
222 my $clausefile = &util::filename_cat("$phinddir", "clauses");
223 &util::rm($clausefile) if (-e $clausefile);
224 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
225 $self->{'txthandle'} = TEXT;
226
227 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
228 &util::rm($docfile) if (-e $docfile);
229 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
230 $self->{'dochandle'} = DOCS;
231
232}
233
234# Classify each document.
235#
236# Each document is passed here in turn. The classifier extracts the
237# text of each and stores it in the clauses file. Document details are
238# stored in the docs.txt file.
239
240sub classify {
241 my $self = shift (@_);
242 my ($doc_obj) = @_;
243
244 my $verbosity = $self->{'verbosity'};
245 my $top_section = $doc_obj->get_top_section();
246
247 my $titlefield = $self->{'titlefield'};
248
249 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
250 print "process: $title\n" if ($verbosity > 2);
251
252 # only consider english-language files
253 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
254 my $phrlanguage = $self->{'language_exp'};
255 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
256
257 # record this file
258 my $total++;
259 print "file $total: $file\n" if ($self->{'$verbosity'});
260
261
262 # Store document details
263 my $OID = $doc_obj->get_OID();
264 $OID = "NULL" unless defined $OID;
265 my $dochandle = $self->{'dochandle'};
266 print $dochandle "<Document>\t$OID\t$title\n";
267
268 # Store the text occuring in this object
269
270 # output the document delimiter
271 my $txthandle = $self->{'txthandle'};
272 print $txthandle "$doclimit\n";
273
274 # iterarate over the required indexes and store their text
275 my $indexes = $self->{'indexes'};
276 my $text = "";
277 my ($part, $level, $field, $section, $data);
278
279 foreach $part (split(/,/, $indexes)) {
280
281 # Each field has a level and a data element ((e.g. document:Title)
282 ($level, $field) = split(/:/, $part);
283 die unless ($level && $field);
284
285 # Extract the text from every section
286 # (In phind, document:text and section:text are equivalent)
287 if ($field eq "text") {
288 $data = "";
289 $section = $doc_obj->get_top_section();
290 while (defined($section)) {
291 $data .= $doc_obj->get_text($section) . "\n";
292 $section = $doc_obj->get_next_section($section);
293 }
294 $text .= convert_gml_to_tokens($data) . "\n";
295 }
296
297 # Extract a metadata field from a document
298 elsif ($level eq "document") {
299 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
300 $text .= convert_gml_to_tokens($data) . "\n";
301 }
302
303 # Extract metadata from every section in a document
304 elsif ($level eq "section") {
305 $data = "";
306 $section = $doc_obj->get_top_section();
307 while (defined($section)) {
308 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
309 $section = $doc_obj->get_next_section($section);
310 }
311 $text .= convert_gml_to_tokens($data) . "\n";
312 }
313
314 # Some sort of specification which I don't understand
315 else {
316 die "Unknown level ($level) in phind index ($part)\n";
317 }
318
319 }
320
321 # output the text
322 $text =~ tr/\n//s;
323 print $txthandle "$text";
324
325}
326
327
328
329# Construct the classifier from the information already gathered
330#
331# When get_classify_info is called, the clauses and docs.txt files have
332# already been constructed in the phind directory. This function will
333# translate them into compressed, indexed MGPP files that can be read by
334# the phindcgi script. It will also register our classifier so that it
335# shows up in the navigation bar.
336
337sub get_classify_info {
338 my $self = shift (@_);
339
340 my $verbosity = $self->{'verbosity'};
341 my $out = $self->{'outhandle'};
342
343 my $phinddir = $self->{'phinddir'};
344 my $language = "english";
345
346 if ($verbosity) {
347 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
348 }
349
350 # Construct phind indexes
351 my $suffixmode = $self->{'suffixmode'};
352 my $suffixsize = $self->{'suffixsize'};
353 my ($command, $status);
354
355 # Generate the vocabulary, symbol statistics, and numbers file
356 # from the clauses file
357 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
358 &extract_vocabulary($self);
359
360 # Use the suffix program to generate the phind/phrases file
361 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
362 &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity, $out);
363
364 # Create the phrase file and put phrase numbers in phind/phrases
365 print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
366 &renumber_phrases($self);
367
368 # Create the mg phrase database
369 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
370 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
371 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
372
373 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
374 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
375 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
376 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
377
378 print $out "\nCreating phrase databases\n";
379 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
380 my $mg_stem = "pdata";
381
382 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
383 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
384 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
385
386 # create the mg index of words
387 print $out "\nCreating word-level search indexes\n";
388 $mg_input = &util::filename_cat($phinddir, "pword.txt");
389 $mg_stem = "pword";
390
391 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
392 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
393 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
394 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
395 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
396 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
397
398 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
399 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
400 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
401
402 # create the mg document information database
403 print $out "\nCreating document information databases\n";
404 $mg_input = &util::filename_cat($phinddir, "docs.txt");
405 $mg_stem = "docs";
406
407 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
408 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
409 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
410
411
412 # Tidy up stray files
413 if (!$self->{'untidy'}) {
414 print $out "\nCleaning up\n" if ($verbosity > 2);
415 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
416 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
417 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
418 "$phinddir/pdata.txt", "$phinddir/pword.txt");
419 my $outfile = 1;
420 while (-e "$phinddir/outPhrase.$outfile") {
421 &util::rm("$phinddir/outPhrase.$outfile");
422 $outfile++;
423 }
424 }
425
426
427 # Insert the classifier into.... what?
428 my $collection = $self->{'collection'};
429 my $url = "library?a=p&p=phind&c=$collection";
430
431 my %classifyinfo = ('thistype'=>'Invisible',
432 'childtype'=>'Phind',
433 'Title'=>$self->{'buttonname'},
434 'contains'=>[]);
435
436 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
437 return \%classifyinfo;
438}
439
440
441
442sub convert_gml_to_tokens {
443
444 $_ = shift @_;
445
446 # FIRST, remove GML tags
447
448 # Replace all whitespace with a simple space
449 s/\s+/ /gs;
450
451 # Remove everything that is in a tag
452 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
453 s/\s*<br>\s*/ LINEBREAK /isg;
454 s/<[^>]*>/ /sg;
455
456 # Now we have the text, but it may contain HTML
457 # elements coded as &gt; etc. Remove these tags.
458 s/&lt;/</sg;
459 s/&gt;/>/sg;
460
461 s/\s+/ /sg;
462 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
463 s/\s*<br>\s*/ LINEBREAK /isg;
464 s/<[^>]*>/ /sg;
465
466 # remove &amp; and other miscellaneous markup tags
467 s/&amp;/&/sg;
468 s/&lt;/</sg;
469 s/&gt;/>/sg;
470 s/&amp;/&/sg;
471
472 # replace<p> and <br> placeholders with carriage returns
473 s/PARAGRAPHBREAK/\n/sg;
474 s/LINEBREAK/\n/sg;
475
476
477 # Exceptional punctuation
478 #
479 # We make special cases of some punctuation
480
481 # remove any apostrophe that indicates omitted letters
482 s/(\w+)\'(\w*\s)/ $1$2 /g;
483
484 # remove period that appears in a person's initals
485 s/\s([A-Z])\./ $1 /g;
486
487 # replace hyphens in hypheanted words and names with a space
488 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
489
490
491 # Convert the remaining text to "clause format",
492 # This means removing all excess punctuation and garbage text,
493 # normalising valid punctuation to fullstops and commas,
494 # then putting one cluse on each line.
495
496 # Insert newline when the end of a sentence is detected
497 # (delimter is: "[\.\?\!]\s")
498 s/\s*[\.\?\!]\s+/\n/g;
499
500 # split numbers after four digits
501 s/(\d\d\d\d)/$1 /g;
502
503 # split words after 32 characters
504
505 # squash repeated punctuation
506 tr/A-Za-z0-9 //cs;
507
508 # save email addresses
509 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
510
511 # normalise clause breaks (mostly punctuation symbols) to commas
512 s/[^A-Za-z0-9 \n]+/ , /g;
513
514 # Remove repeated commas, and replace with newline
515 s/\s*,[, ]+/\n/g;
516
517 # remove extra whitespace
518 s/ +/ /sg;
519 s/^\s+//mg;
520 s/\s*$/\n/mg;
521
522 # remove lines that contain one word or less
523 s/^\w*$//mg;
524 s/^\s*$//mg;
525 tr/\n//s;
526
527 return $_;
528}
529
530
531# Execute a system command
532
533sub execute {
534 my ($command, $verbosity, $outhandle) = @_;
535 print $outhandle "Executing: $command\n" if ($verbosity > 2);
536 my $status = system($command);
537 if ($status != 0) {
538 print STDERR "phind - Error executing $command: $!\n";
539 exit($status);
540 }
541}
542
543
544# Generate the vocabulary, symbol statistics, and numbers file from the
545# clauses file. This is legacy code, so is a bit messy and probably wont
546# run under windows.
547
548sub extract_vocabulary {
549 my ($self) = @_;
550
551 my $verbosity = $self->{'verbosity'};
552 my $out = $self->{'outhandle'};
553
554 my $language = "english"; # $self->{'language'};
555
556 my $collectiondir = $self->{'collectiondir'};
557
558 my $phinddir = $self->{'phinddir'};
559
560 my ($w, $l, $line, $word);
561
562 my ($first_delimiter, $last_delimiter,
563 $first_stopword, $last_stopword,
564 $first_extractword, $last_extractword,
565 $first_contentword, $last_contentword,
566 $phrasedelimiter);
567
568 my $thesaurus = $self->{'thesaurus'};
569 my ($thesaurus_links, $thesaurus_terms,
570 %thesaurus, $first_thesaurusword, $last_thesaurusword);
571
572 my %symbol;
573 my (%freq);
574
575 print $out "Calculating vocabulary\n" if ($verbosity > 1);
576
577 # Read and store the stopwords
578 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
579 my %stopwords;
580 foreach $w (split(/\s+/, $words)) {
581 $l = lc($w);
582 $stopwords{$l} = $w;
583 }
584
585 # Read thesaurus information
586 if ($thesaurus) {
587
588 # Ensure both link and term files exist
589 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
590 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
591 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
592 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
593
594 # Read the thesaurus terms
595 open(TH, "<$thesaurus_terms");
596 while(<TH>) {
597 s/^\d+ //;
598 s/\(.*\)//;
599 foreach $w (split(/\s+/, $_)) {
600 $thesaurus{lc($w)} = $w;
601 }
602 }
603 close TH;
604 }
605
606 # Read words in the text and count occurences
607 open(TXT, "<$phinddir/clauses");
608 my @words;
609
610 while(<TXT>) {
611 $line = $_;
612 next unless ($line =~ /./);
613
614 @words = split(/\s+/, $line);
615 foreach $w (@words) {
616 $l = lc($w);
617 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
618 $freq{$w}++;
619 }
620 $freq{$senlimit}++;
621 }
622
623 # Calculate the "best" form of each word
624 my (%bestform, %totalfreq, %bestfreq);
625
626 foreach $w (sort (keys %freq)) {
627 $l = lc($w);
628
629 # totalfreq is the number of times a term appears in any form
630 $totalfreq{$l} += $freq{$w};
631
632 if (defined $stopwords{$l}) {
633 $bestform{$l} = $stopwords{$l};
634
635 } elsif (defined $thesaurus{$l}) {
636 $bestform{$l} = $thesaurus{$l};
637
638 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
639 $bestfreq{$l} = $freq{$w};
640 $bestform{$l} = $w;
641 }
642 }
643
644 undef %freq;
645 undef %bestfreq;
646
647
648 # Assign symbol numbers to tokens
649 my $nextsymbol = 1;
650 my (@vocab);
651
652 # Delimiters
653 $first_delimiter = 1;
654
655 foreach $word (@delimiters) {
656
657 $word = lc($word);
658 $bestform{$word} = uc($word);
659 $vocab[$nextsymbol] = $word;
660 $symbol{$word} = $nextsymbol;
661 $nextsymbol++;
662 }
663 $last_delimiter = $nextsymbol - 1;
664
665 # Stopwords
666 $first_stopword = $nextsymbol;
667
668 foreach my $word (sort keys %stopwords) {
669
670 # don't incluse stopword unless it occurs in the text
671 $word = lc($word);
672 next unless ($totalfreq{$word});
673 next if ($symbol{$word});
674
675 $vocab[$nextsymbol] = $word;
676 $symbol{$word} = $nextsymbol;
677 $nextsymbol++;
678 }
679 $last_stopword = $nextsymbol - 1;
680 $first_contentword = $nextsymbol;
681
682 # Thesaurus terms
683 if ($thesaurus) {
684 $first_thesaurusword = $nextsymbol;
685
686 foreach my $word (sort keys %thesaurus) {
687
688 $word = lc($word);
689 next if ($symbol{$word});
690 $bestform{$word} = $thesaurus{$word};
691
692 $vocab[$nextsymbol] = $word;
693 $symbol{$word} = $nextsymbol;
694 $nextsymbol++;
695
696 }
697 $last_thesaurusword = $nextsymbol - 1;
698 }
699
700 # Other content words
701 $first_extractword = $nextsymbol;
702
703 foreach my $word (sort (keys %bestform)) {
704
705 next if ($symbol{$word});
706
707 $vocab[$nextsymbol] = $word;
708 $symbol{$word} = $nextsymbol;
709 $nextsymbol++;
710 }
711 $last_extractword = $nextsymbol - 1;
712 $last_contentword = $nextsymbol - 1;
713
714
715 # Outut the words
716 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
717 open(VOC, ">$phinddir/clauses.vocab");
718
719 for (my $i = 1; $i < $nextsymbol; $i++) {
720 $w = $vocab[$i];
721
722 print VOC "$bestform{$w}\n";
723 $totalfreq{$w} = 0 unless ($totalfreq{$w});
724 }
725 close VOC;
726
727
728 # Create statistics file
729 # Output statistics about the vocablary
730 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
731 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
732
733 open(STAT, ">$phinddir/clauses.stats")
734 || die "Cannot open $phinddir/clauses.stats: $!";
735
736 print STAT "first_delimiter $first_delimiter\n";
737 print STAT "last_delimiter $last_delimiter\n";
738 print STAT "first_stopword $first_stopword\n";
739 print STAT "last_stopword $last_stopword\n";
740 if ($thesaurus) {
741 print STAT "first_thesaurusword $first_thesaurusword\n";
742 print STAT "last_thesaurusword $last_thesaurusword\n";
743 }
744 print STAT "first_extractword $first_extractword\n";
745 print STAT "last_extractword $last_extractword\n";
746 print STAT "first_contentword $first_contentword\n";
747 print STAT "last_contentword $last_contentword\n";
748 print STAT "first_symbol $first_delimiter\n";
749 print STAT "last_symbol $last_contentword\n";
750 print STAT "first_word $first_stopword\n";
751 print STAT "last_word $last_contentword\n";
752 close STAT;
753
754 undef @vocab;
755
756
757 # Create numbers file
758 # Save text as symbol numbers
759 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
760
761 open(TXT, "<$phinddir/clauses");
762 open(NUM, ">$phinddir/clauses.numbers");
763
764 $phrasedelimiter = $symbol{lc($senlimit)};
765 print NUM "$symbol{lc($colstart)}\n";
766
767 # set up the special symbols that delimit documents and sentences
768 while(<TXT>) {
769
770 # split sentence into a list of tokens
771 $line = $_;
772 next unless ($line =~ /./);
773 @words = split(/\s+/, $line);
774
775 # output one token at a time
776 foreach $word (@words) {
777 $word = lc($word);
778 print NUM "$symbol{$word}\n";
779 }
780
781 # output phrase delimiter
782 print NUM "$phrasedelimiter\n";
783 }
784
785 print NUM "$symbol{lc($colend)}\n";
786 close NUM;
787
788 # Save thesaurus data in one convienient file
789 if ($thesaurus) {
790
791 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
792
793
794 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
795 if ($verbosity > 1);
796
797 # Read the thesaurus terms
798 my ($num, $text, %thes_symbols);
799
800 open(TH, "<$thesaurus_terms");
801 while(<TH>) {
802 chomp;
803 @words = split(/\s+/, $_);
804 $num = shift @words;
805 $text = "";
806
807 # translate words into symbol numbers
808 foreach $word (@words) {
809 $word = lc($word);
810 if ($symbol{$word}) {
811 $text .= "s$symbol{$word} ";
812 } elsif ($verbosity) {
813 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
814 }
815 }
816 $text =~ s/ $//;
817 $thes_symbols{$num} = $text;
818 }
819 close TH;
820
821 # Read the thesaurus links and write the corresponding data
822 open(TH, "<$thesaurus_links");
823 open(THOUT, ">$thesaurusfile");
824
825 while(<TH>) {
826 chomp;
827 ($num, $text) = split(/:/, $_);
828
829 if (defined($thes_symbols{$num})) {
830 print THOUT "$num:$thes_symbols{$num}:$text\n";
831 } else {
832 print THOUT "$num:untranslated:$text\n";
833 }
834 }
835 close TH;
836 close THOUT;
837 }
838
839
840
841
842}
843
844
845# renumber_phrases
846#
847# Prepare the phrases file to be input to mgpp. The biggest problem is
848# reconciling the phrase identifiers used by the suffix program (which
849# we'll call suffix-id numbers) with the numbers used in the thesaurus
850# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
851# Phind-id numbers must be sorted by frequency of occurance.
852#
853# Start creating a set of phind-id numbers from the sorted suffix-id
854# numbers and (if required) the thesaurus-id numbers. Then add any other
855# phrases occuring in the thesaurus.
856#
857# The last thing we have to do is restore the vocabulary information to the
858# phrase file so that the phrases are stored as words, not as symbol
859# numbers.
860
861# The original phrases file looks something like this:
862# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
863# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
864# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
865# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
866
867
868sub renumber_phrases {
869 my ($self) = @_;
870
871 renumber_suffix_data($self);
872 renumber_thesaurus_data($self);
873 restore_vocabulary_data($self);
874
875}
876
877
878
879# renumber_suffix_data
880#
881# Translate phrases file to phrases.2 using phind keys instead
882# of suffix keys and sorting the expansion data.
883
884sub renumber_suffix_data {
885 my ($self) = @_;
886
887 my $verbosity = $self->{'verbosity'};
888 my $out = $self->{'outhandle'};
889 print $out "Translate phrases: suffix-ids become phind-id's\n"
890 if ($verbosity);
891
892 my $phinddir = $self->{'phinddir'};
893 my $infile = &util::filename_cat($phinddir, 'phrases');
894 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
895
896 # Read the phrase file. Calculate initial set of phind-id
897 # numbers and store (suffixid -> frequency) relation.
898
899 my %suffixtophind;
900 my @phindfrequency;
901 my (@fields, $suffixid);
902 my $nextphind = 1;
903
904 open(IN, "<$infile");
905 while(<IN>) {
906
907 chomp;
908 @fields = split(/:/, $_);
909
910 # get next suffixid and phindid
911 $suffixid = shift @fields;
912 $suffixtophind{$suffixid} = $nextphind;
913
914 # store total frequency
915 shift @fields;
916 $totalfrequency[$nextphind] = shift @fields;
917
918 $nextphind++;
919 }
920 close IN;
921
922
923 # Translate phrases file to phrases.2. Use phind keys (not suffix
924 # keys), sort expansion and document occurance data in order of
925 # descending frequency..
926 open(IN, "<$infile");
927 open(OUT, ">$outfile");
928
929 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
930 my (@documwents, @newexp, $k, $n);
931 my $linenumber = 0;
932
933 while(<IN>) {
934
935 # read the line
936 chomp;
937 @fields = split(/:/, $_);
938
939 # get a phrase number for this line
940 $suffixid = shift @fields;
941 die unless (defined($suffixtophind{$suffixid}));
942 $phindid = $suffixtophind{$suffixid};
943
944 # get the symbols in the phrase
945 $text = shift @fields;
946
947 # output status information
948 $linenumber++;
949 if ($verbosity > 2) {
950 if ($linenumber % 1000 == 0) {
951 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
952 }
953 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
954 }
955
956 # get the phrase frequency
957 $tf = shift @fields;
958
959 # get the number of expansions
960 $countexp = shift @fields;
961
962 # get the expansions, convert them into phind-id numbers, and sort them
963 $expansions = shift @fields;
964 @newexp = ();
965 foreach $k (split(/,/, $expansions)) {
966 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
967 $n = $suffixtophind{$k};
968 push @newexp, $n;
969 }
970 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
971
972 # get the number of documents
973 $countdocs = shift @fields;
974
975 # get the documents and sort them
976 $documents = shift @fields;
977 $documents =~ s/d//g;
978 @documents = split(/;/, $documents);
979 @documents = sort by_doc_frequency @documents;
980
981 # output the phrase data
982 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
983 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
984
985 }
986
987 close IN;
988 close OUT;
989}
990
991
992# renumber_thesaurus_data
993#
994# Translate phrases.2 to phrases.3, adding thesaurus data if available.
995
996sub renumber_thesaurus_data {
997 my ($self) = @_;
998
999 my $out = $self->{'outhandle'};
1000 my $verbosity = $self->{'verbosity'};
1001 my $thesaurus = $self->{'thesaurus'};
1002
1003 my $phinddir = $self->{'phinddir'};
1004 my $infile = &util::filename_cat($phinddir, "phrases.2");
1005 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1006
1007
1008 # If no thesaurus is defined, simply move the phrases file.
1009 if (!$thesaurus) {
1010 print $out "Translate phrases.2: no thesaurus data\n"
1011 if ($verbosity);
1012 &util::mv($infile, $outfile);
1013 return;
1014 }
1015
1016 print $out "Translate phrases.2: add thesaurus data\n"
1017 if ($verbosity);
1018
1019 # 1.
1020 # Read thesaurus file and store (symbols->thesaurusid) mapping
1021 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1022 my %symbolstothesid;
1023 my (@fields, $thesid, $symbols);
1024
1025 open(TH, "<$thesaurusfile");
1026
1027 while (<TH>) {
1028
1029 chomp;
1030 @fields = split(/:/, $_);
1031
1032 # get id and text
1033 $thesid = shift @fields;
1034 $symbols = shift @fields;
1035 $symbolstothesid{$symbols} = $thesid;
1036 }
1037 close TH;
1038
1039 # 2.
1040 # Read phrases file to find thesaurus entries that already
1041 # have a phindid. Store their phind-ids for later translation.
1042 my %thesaurustophindid;
1043 my ($phindid);
1044
1045 open(IN, "<$infile");
1046
1047 while(<IN>) {
1048
1049 chomp;
1050 @fields = split(/:/, $_);
1051
1052 # phindid and symbols for this line
1053 $phindid = shift @fields;
1054 $symbols = shift @fields;
1055
1056 # do we have a thesaurus id corresponding to this phrase?
1057 if (defined($symbolstothesid{$symbols})) {
1058 $thesid = $symbolstothesid{$symbols};
1059 $thesaurustophindid{$thesid} = $phindid;
1060 }
1061 }
1062 close IN;
1063
1064 undef %symbolstothesid;
1065
1066 # 3.
1067 # Create phind-id numbers for remaining thesaurus entries
1068 my $nextphindid = $phindid + 1;
1069
1070 open(TH, "<$thesaurusfile");
1071 while(<TH>) {
1072
1073 chomp;
1074 @fields = split(/:/, $_);
1075
1076 # read thesaurus-id and ensure it has a corresponding phind-id
1077 $thesid = shift @fields;
1078 if (!defined($thesaurustophindid{$thesid})) {
1079 $thesaurustophindid{$thesid} = $nextphindid;
1080 $nextphindid++;
1081 }
1082 }
1083 close TH;
1084
1085 # 4.
1086 # Translate thesaurus file, replacing thesaurus-id numbers with
1087 # phind-id numbers.
1088 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1089 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1090
1091 open(TH, "<$thesaurusfile");
1092 open(TO, ">$newthesaurusfile");
1093 while(<TH>) {
1094
1095 chomp;
1096 @fields = split(/:/, $_);
1097
1098 # phindid and symbols for this line
1099 ($thesid, $symbols, $relations) = @fields;
1100
1101 die unless ($thesid && $symbols);
1102 die unless $thesaurustophindid{$thesid};
1103 $phindid = $thesaurustophindid{$thesid};
1104
1105 # convert each part of the relation string to use phind-id numbers
1106 $newrelation = "";
1107 $linkcounter = 0;
1108 foreach $linktext (split(/;/, $relations)) {
1109 @linkdata = split(/,/, $linktext);
1110
1111 # remember the linktype (e.g. BT, NT)
1112 $linktype = shift @linkdata;
1113 $newrelation .= "$linktype,";
1114
1115 # convert the link target identfiers
1116 foreach $link (@linkdata) {
1117 die unless (defined($thesaurustophindid{$link}));
1118 $newrelation .= "$thesaurustophindid{$link},";
1119 $linkcounter++;
1120 }
1121 $newrelation =~ s/\,$//;
1122 $newrelation .= ";";
1123 }
1124 $newrelation .= ":";
1125
1126 print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1127 }
1128 close TH;
1129 close TO;
1130
1131 undef %thesaurustophindid;
1132
1133 # 5.
1134 # Read thesaurus data (in phind-id format) into memory
1135 my %thesaurusdata;
1136
1137 open(TH, "<$newthesaurusfile");
1138 while(<TH>) {
1139 chomp;
1140 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1141 die unless ($phindid && $symbols);
1142 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1143 }
1144
1145 # 6.
1146 # Add thesaurus data to phrases file
1147 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1148 my (@documwents, @newexp, $k, $n);
1149 my $linenumber = 0;
1150
1151 open(IN, "<$infile");
1152 open(OUT, ">$outfile");
1153
1154 # Update existing phrases
1155 while(<IN>) {
1156
1157 chomp;
1158 @fields = split(/:/, $_);
1159
1160 # get data for this line
1161 $phindid = shift @fields;
1162
1163 # output the phrase data, with thesaurus information
1164 print OUT "$phindid:", join(":", @fields);
1165
1166 # add thesaurus data
1167 if (defined($thesaurusdata{$phindid})) {
1168 @fields = split(/:/, $thesaurusdata{$phindid});
1169 shift @fields;
1170 $linkcounter = shift @fields;
1171 $relations = shift @fields;
1172
1173 print OUT ":$linkcounter:$relations";
1174 $thesaurusdata{$phindid} = "";
1175 }
1176 print OUT "\n";
1177 }
1178 close IN;
1179
1180 # Add phrases that aren't already in the file
1181 foreach $phindid (sort numerically keys %thesaurusdata) {
1182 next unless ($thesaurusdata{$phindid});
1183
1184 @fields = split(/:/, $thesaurusdata{$phindid});
1185 $symbols = shift @fields;
1186 $linkcounter = shift @fields;
1187 $relations = shift @fields;
1188
1189 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1190 }
1191 close OUT;
1192
1193}
1194
1195# restore_vocabulary_data
1196#
1197# Read phrases.3 and restore vocabulary information. Then write
1198# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1199# (if requested) to the saved phrases file.
1200
1201sub restore_vocabulary_data {
1202 my ($self) = @_;
1203
1204 my $out = $self->{'outhandle'};
1205 my $verbosity = $self->{'verbosity'};
1206 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1207
1208 my $phinddir = $self->{'phinddir'};
1209 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1210 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1211 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1212 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1213
1214 my $savephrases = $self->{'savephrases'};
1215
1216 # 1.
1217 # Read the vocabulary file
1218 open(V, "<$vocabfile")
1219 || die "Cannot open $vocabfile: $!";
1220 my @symbol;
1221 my $i = 1;
1222 while(<V>) {
1223 chomp;
1224 $symbol[$i++] = $_;
1225 }
1226
1227
1228 # 2.
1229 # Translate phrases.3 to MGPP input files
1230 my ($key, $text, $word);
1231 my @fields;
1232 my $linenumber = 0;
1233
1234 open(IN, "<$infile");
1235 open(DATA, ">$datafile");
1236 open(WORD, ">$wordfile");
1237
1238 # Save the phrases in a separate text file
1239 if ($savephrases) {
1240 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1241 open(SAVE, ">$savephrases");
1242 }
1243
1244 while(<IN>) {
1245
1246 # read the line
1247 chomp;
1248 $line = $_;
1249 @fields = split(/:/, $line);
1250
1251 # get a phrase number for this line
1252 $key = shift @fields;
1253
1254 # restore the text of the phrase
1255 $text = shift @fields;
1256 $text =~ s/s(\d+)/$symbol[$1]/g;
1257 if ($text =~ / /) {
1258 $word = "";
1259 } elsif ($text ne 'untranslated') {
1260 $word = $text;
1261 }
1262
1263 # output the phrase data
1264 print DATA "<Document>";
1265 print DATA "$key:$text:", join(":", @fields), ":\n";
1266
1267 # output the word index search data
1268 print WORD "<Document>$word\n";
1269
1270 # output the phrases to a text file
1271 if ($savephrases) {
1272 print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1273 }
1274 }
1275 close IN;
1276 close WORD;
1277 close DATA;
1278 close SAVE if ($savephrases);
1279
1280}
1281
1282
1283
1284# sort routines used to renumber phrases
1285
1286sub numerically { $a <=> $b }
1287
1288sub by_doc_frequency {
1289 my $fa = 1;
1290 if ($a =~ /,/) {
1291 $fa = $a;
1292 $fa =~ s/\d+,//;
1293 }
1294 my $fb = 1;
1295 if ($b =~ /,/) {
1296 $fb = $b;
1297 $fb =~ s/\d+,//;
1298 }
1299
1300 return ($fb <=> $fa);
1301}
1302
13031;
Note: See TracBrowser for help on using the repository browser.