source: trunk/gsdl/perllib/classify/phind.pm@ 2503

Last change on this file since 2503 was 2500, checked in by sjboddie, 23 years ago

Removed test for phindcgi from phind classifier as it is no longer used

  • Property svn:keywords set to Author Date Id Revision
File size: 38.6 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin.
30# Options are dexcribed in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40sub BEGIN {
41 @ISA = ('BasClas');
42}
43
44
45sub print_usage {
46 print STDERR "
47 usage: classify phind [options]
48
49 options:
50 -text Fields The text used to build the phrase hierarchy.
51 (default: 'section:Title,section:text')
52
53 -title Title The metadata field used to describe each document.
54 (default: 'Title')
55
56 -button Name The label for the classifier screen and button in
57 navigation bar.
58 (default: 'Phrase')
59
60 -language Regex Language or languages to use building hierarchy.
61 Languages are identified by two-letter country codes
62 like en (English), es (Spanish), and fr (French).
63 Language is a regular expression, so 'en|fr' (English or
64 French) and '..' (match any language) are valid.
65 (default: 'en'.)
66
67 -savephrases File If set, the phrase infomation will be stored in
68 the given file as text. It is probably a good idea
69 to use an absolute path.
70 (defualt: not set)
71
72 -suffixmode N The smode parameter to the phrase extraction program. A
73 value of 0 means that stopwords are ignored, and of 1
74 means that stopwords are used.
75 (default: 1)
76
77 -thesaurus Name Name of a thesaurus stored in phind format in the
78 collection's etc directory.
79 (default: not set)
80
81 -untidy Don't remove working files.
82
83"; }
84
85
86%wanted_index_files = ('td'=>1,
87 't'=>1,
88 'ti'=>1,
89 'tl'=>1,
90 'tsd'=>1,
91 'idb'=>1,
92 'ib1'=>1,
93 'ib2'=>1,
94 'ib3'=>1,
95 'i'=>1,
96 'il'=>1,
97 'w'=>1,
98 'wa'=>1);
99
100
101
102# Phrase delimiter symbols - these should be abstracted out someplace
103
104my $colstart = "COLLECTIONSTART";
105my $colend = "COLLECTIONEND";
106my $doclimit = "DOCUMENTLIMIT";
107my $senlimit = "SENTENCELIMIT";
108my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
109
110
111# Create a new phind browser based on collect.cfg
112
113sub new {
114 my $class = shift (@_);
115 my $self = new BasClas($class, @_);
116
117 my $out = $self->{'outhandle'};
118
119
120 # Phind installation check
121 # The phind phrase browser is research software and is not installed
122 # by defualt. If the user attepts to use it we warn them that it's a
123 # bit dodgy, then tell them how to install it. If they can do that
124 # and get all the files in place, then we let them proceed.
125
126 print $out "Checking Phind phrase browser requirements...\n";
127
128 # Ensure the Phind generate scripts are in place
129 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
130 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
131 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
132
133 if (!(-e $file1)) {
134 print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
135 exit(1);
136 }
137
138 # Ensure the Phind Java applet is in place
139 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
140 $file1 = &util::filename_cat($src, "Phind.class");
141
142 if (!(-e $file1)) {
143 print STDERR "The phind Java classes are not compiled. ";
144 print STDERR "To compile them, change to the directory\n";
145 print STDERR " $src\n";
146 print STDERR "and use your Java compiler to compile Phind.java.\n";
147 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
148 exit(1);
149 }
150
151 # Parse classifier arguments
152 my $builddir = "";
153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
157 q^language/.*/en^, \$language,
158 q^builddir/.*/^, \$builddir,
159 q^savephrases/\d/0^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
168 }
169
170 # classifier information
171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173 # limit languages
174 $self->{'language_exp'} = $language;
175
176 # collection directories
177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180 }
181 $self->{'builddir'} = $builddir;
182
183 $self->{'total'} = 0;
184
185 return bless $self, $class;
186}
187
188
189# Initialise the phind classifier
190
191sub init {
192 my $self = shift (@_);
193
194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create phind directory
199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
204 }
205 &util::mk_dir("$phinddir");
206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
208
209 # open filehandles for documents and text
210 my $clausefile = &util::filename_cat("$phinddir", "clauses");
211 &util::rm($clausefile) if (-e $clausefile);
212
213 my $txthandle = 'TEXT' . $phnumber;
214 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
215 $self->{'txthandle'} = $txthandle;
216
217 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
218 &util::rm($docfile) if (-e $docfile);
219
220 my $dochandle = 'DOC' . $phnumber;
221 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
222 $self->{'dochandle'} = $dochandle;
223
224}
225
226
227# Classify each document.
228#
229# Each document is passed here in turn. The classifier extracts the
230# text of each and stores it in the clauses file. Document details are
231# stored in the docs.txt file.
232
233sub classify {
234 my $self = shift (@_);
235 my $doc_obj = shift @_;
236
237 my $verbosity = $self->{'verbosity'};
238 my $top_section = $doc_obj->get_top_section();
239
240 my $titlefield = $self->{'titlefield'};
241
242 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
243 print "process: $title\n" if ($verbosity > 2);
244
245 # Only consider the file if it is in the correct language
246 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
247 my $phrlanguage = $self->{'language_exp'};
248 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
249
250 # record this file
251 $self->{'total'} ++;
252 print "file $self->{'total'}: $file\n" if ($self->{'$verbosity'});
253
254
255 # Store document details
256 my $OID = $doc_obj->get_OID();
257 $OID = "NULL" unless defined $OID;
258 my $dochandle = $self->{'dochandle'};
259 print $dochandle "<Document>\t$OID\t$title\n";
260
261 # Store the text occuring in this object
262
263 # output the document delimiter
264 my $txthandle = $self->{'txthandle'};
265 print $txthandle "$doclimit\n";
266
267 # iterarate over the required indexes and store their text
268 my $indexes = $self->{'indexes'};
269 my $text = "";
270 my ($part, $level, $field, $section, $data, $dataref);
271
272 foreach $part (split(/,/, $indexes)) {
273
274 # Each field has a level and a data element ((e.g. document:Title)
275 ($level, $field) = split(/:/, $part);
276 die unless ($level && $field);
277
278 # Extract the text from every section
279 # (In phind, document:text and section:text are equivalent)
280 if ($field eq "text") {
281 $data = "";
282 $section = $doc_obj->get_top_section();
283 while (defined($section)) {
284 $data .= $doc_obj->get_text($section) . "\n";
285 $section = $doc_obj->get_next_section($section);
286 }
287 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
288 }
289
290 # Extract a metadata field from a document
291 # (If ther eis more than one element of the given type, get them all.)
292 elsif ($level eq "document") {
293 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
294 foreach $data (@$dataref) {
295 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
296 }
297 }
298
299 # Extract metadata from every section in a document
300 elsif ($level eq "section") {
301 $data = "";
302 $section = $doc_obj->get_top_section();
303 while (defined($section)) {
304 $dataref = $doc_obj->get_metadata($section, $field);
305 $data .= join("\n", @$dataref) . "\n";
306 $section = $doc_obj->get_next_section($section);
307 }
308 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
309 }
310
311 # Some sort of specification which I don't understand
312 else {
313 die "Unknown level ($level) in phind index ($part)\n";
314 }
315
316 }
317
318 # output the text
319 $text =~ tr/\n//s;
320 print $txthandle "$text";
321}
322
323
324# Construct the classifier from the information already gathered
325#
326# When get_classify_info is called, the clauses and docs.txt files have
327# already been constructed in the phind directory. This function will
328# translate them into compressed, indexed MGPP files that can be read by
329# the phindcgi script. It will also register our classifier so that it
330# shows up in the navigation bar.
331
332sub get_classify_info {
333 my $self = shift (@_);
334
335 close $self->{'dochandle'};
336 close $self->{'txthandle'};
337 my $verbosity = $self->{'verbosity'};
338 my $out = $self->{'outhandle'};
339 my $phinddir = $self->{'phinddir'};
340
341 my $osextra = "";
342 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
343 $osextra = " -d /";
344 }
345
346 if ($verbosity) {
347 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
348 print $out "*** in", $self->{'phinddir'}, "\n";
349 }
350
351 # Construct phind indexes
352 my $suffixmode = $self->{'suffixmode'};
353 my ($command, $status);
354
355 # Generate the vocabulary, symbol statistics, and numbers file
356 # from the clauses file
357 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
358 &extract_vocabulary($self);
359
360 # Use the suffix program to generate the phind/phrases file
361 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
362 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
363
364 # Create the phrase file and put phrase numbers in phind/phrases
365 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
366 &renumber_phrases($self);
367
368 print $out "\nCreating phrase databases\n";
369 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
370 my $mg_stem = &util::filename_cat($phinddir, "pdata");
371
372 &execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
373 &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
374 &execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);
375
376 # create the mg index of words
377 print $out "\nCreating word-level search indexes\n";
378 $mg_input = &util::filename_cat($phinddir, "pword.txt");
379 $mg_stem = &util::filename_cat($phinddir, "pword");
380
381 &execute("mgpp_passes $osextra -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
382 &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
383 &execute("mgpp_perf_hash_build $osextra -f $mg_stem", $verbosity, $out);
384 &execute("mgpp_passes $osextra -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
385 &execute("mgpp_weights_build $osextra -f $mg_stem", $verbosity, $out);
386 &execute("mgpp_invf_dict $osextra -f $mg_stem", $verbosity, $out);
387
388 &execute("mgpp_stem_idx $osextra -f $mg_stem -s 1", $verbosity, $out);
389 &execute("mgpp_stem_idx $osextra -f $mg_stem -s 2", $verbosity, $out);
390 &execute("mgpp_stem_idx $osextra -f $mg_stem -s 3", $verbosity, $out);
391
392 # create the mg document information database
393 print $out "\nCreating document information databases\n";
394 $mg_input = &util::filename_cat($phinddir, "docs.txt");
395 $mg_stem = &util::filename_cat($phinddir, "docs");
396
397 &execute("mgpp_passes $osextra -f $mg_stem -T1 $mg_input", $verbosity, $out);
398 &execute("mgpp_compression_dict $osextra -f $mg_stem", $verbosity, $out);
399 &execute("mgpp_passes $osextra -f $mg_stem -T2 $mg_input", $verbosity, $out);
400
401 # Tidy up stray files
402 if (!$self->{'untidy'}) {
403 print $out "\nCleaning up\n" if ($verbosity > 2);
404 opendir (DIR, $phinddir) || die;
405 my @files = readdir DIR;
406 closedir DIR;
407
408 foreach $file (@files) {
409 next if $file =~ /^\.\.?$/;
410 my ($suffix) = $file =~ /\.([^\.]+)$/;
411 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
412 # delete it!
413 print $out "deleting $file\n"; # if $verbosity > 2;
414 &util::rm (&util::filename_cat ($phinddir, $file));
415 }
416 }
417 }
418
419 # Return the information about the classifier that we'll later want to
420 # use to create macros when the Phind classifier document is displayed.
421 my %classifyinfo = ('thistype'=>'Invisible',
422 'childtype'=>'Phind',
423 'Title'=>$self->{'buttonname'},
424 'parameters'=>"phindnumber=$self->{'phindnumber'}",
425 'contains'=>[]);
426
427 my $collection = $self->{'collection'};
428 my $url = "library?a=p&p=phind&c=$collection";
429 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
430
431 return \%classifyinfo;
432}
433
434
435
436sub convert_gml_to_tokens {
437
438 my ($language_exp, $text) = @_;
439
440 if ($language_exp =~ /en/) {
441 return &convert_gml_to_tokens_EN($text);
442 }
443
444 $_ = $text;
445
446 # 1. remove GML tags
447
448 # Remove everything that is in a tag
449 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
450 s/\s*<br>\s*/ LINEBREAK /isgo;
451 s/<[^>]*>/ /sgo;
452
453 # Now we have the text, but it may contain HTML
454 # elements coded as &gt; etc. Remove these tags.
455 s/&amp;/&/sgo;
456 s/&lt;/</sgo;
457 s/&gt;/>/sgo;
458 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
459 s/\s*<br>\s*/ LINEBREAK /isgo;
460 s/<[^>]*>/ /sgo;
461
462 # replace<p> and <br> placeholders with clause break symbol (\n)
463 s/\s+/ /gso;
464 s/PARAGRAPHBREAK/\n/sgo;
465 s/LINEBREAK/\n/sgo;
466
467
468
469
470 # 2. Split the remaining text into space-delimited tokens
471
472 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
473 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
474
475 # Split text at word boundaries
476 s/\b/ /go;
477
478 # 3. Convert the remaining text to "clause format"
479
480 # Insert newline if the end of a sentence is detected
481 # (delimter is: "[\.\?\!]\s")
482 # s/\s*[\.\?\!]\s+/\n/go;
483
484 # remove unnecessary punctuation and replace with clause break symbol (\n)
485 s/[^\w ]/\n/go;
486
487 # remove extraneous whitespace
488 s/ +/ /sgo;
489 s/^\s+//mgo;
490 s/\s*$/\n/mgo;
491
492 # remove lines that contain one word or less
493 s/^\S*$//mgo;
494 s/^\s*$//mgo;
495 tr/\n//s;
496
497 return $_;
498}
499
500# A version of convert_gml_to_tokens that is fine-tuned to the English language.
501
502sub convert_gml_to_tokens_EN {
503 $_ = shift @_;
504
505 # FIRST, remove GML tags
506
507 # Replace all whitespace with a simple space
508 s/\s+/ /gs;
509
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Remove these tags.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace<p> and <br> placeholders with carriage returns
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536 # Exceptional punctuation
537 #
538 # We make special cases of some punctuation
539
540 # remove any apostrophe that indicates omitted letters
541 s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543 # remove period that appears in a person's initals
544 s/\s([A-Z])\./ $1 /g;
545
546 # replace hyphens in hypheanted words and names with a space
547 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549 # Convert the remaining text to "clause format",
550 # This means removing all excess punctuation and garbage text,
551 # normalising valid punctuation to fullstops and commas,
552 # then putting one cluse on each line.
553
554 # Insert newline when the end of a sentence is detected
555 # (delimter is: "[\.\?\!]\s")
556 s/\s*[\.\?\!]\s+/\n/g;
557
558 # split numbers after four digits
559 s/(\d\d\d\d)/$1 /g;
560
561 # split words after 32 characters
562
563 # squash repeated punctuation
564 tr/A-Za-z0-9 //cs;
565
566 # save email addresses
567 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569 # normalise clause breaks (mostly punctuation symbols) to commas
570 s/[^A-Za-z0-9 \n]+/ , /g;
571
572 # Remove repeated commas, and replace with newline
573 s/\s*,[, ]+/\n/g;
574
575 # remove extra whitespace
576 s/ +/ /sg;
577 s/^\s+//mg;
578 s/\s*$/\n/mg;
579
580 # remove lines that contain one word or less
581 s/^\w*$//mg;
582 s/^\s*$//mg;
583 tr/\n//s;
584
585 return $_;
586
587}
588
589
590
591# Execute a system command
592
593sub execute {
594 my ($command, $verbosity, $outhandle) = @_;
595 print $outhandle "Executing: $command\n" if ($verbosity > 2);
596 $! = 0;
597 my $status = system($command);
598 if ($status != 0) {
599 print STDERR "phind - Error executing '$command': $!\n";
600 exit($status);
601 }
602}
603
604
605# Generate the vocabulary, symbol statistics, and numbers file from the
606# clauses file. This is legacy code, so is a bit messy and probably wont
607# run under windows.
608
609sub extract_vocabulary {
610 my ($self) = @_;
611
612 my $verbosity = $self->{'verbosity'};
613 my $out = $self->{'outhandle'};
614
615 my $collectiondir = $self->{'collectiondir'};
616 my $phinddir = $self->{'phinddir'};
617
618 my $language_exp = $self->{'language_exp'};
619
620 my ($w, $l, $line, $word);
621
622 my ($first_delimiter, $last_delimiter,
623 $first_stopword, $last_stopword,
624 $first_extractword, $last_extractword,
625 $first_contentword, $last_contentword,
626 $phrasedelimiter);
627
628 my $thesaurus = $self->{'thesaurus'};
629 my ($thesaurus_links, $thesaurus_terms,
630 %thesaurus, $first_thesaurusword, $last_thesaurusword);
631
632 my %symbol;
633 my (%freq);
634
635 print $out "Calculating vocabulary\n" if ($verbosity > 1);
636
637 # Read and store the stopwords
638 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
639 my $stopword_files = ();
640 my ($language, $language_dir, $file, $file_name);
641 my %stopwords;
642
643 # Examine each directory in the stopword directory
644 opendir(STOPDIR, $stopdir);
645 foreach $language (readdir STOPDIR) {
646
647 # Ignore entries that do not match the classifier's language
648 next unless ($language =~ /$language_exp/);
649 $language_dir = &util::filename_cat($stopdir, $language);
650 next unless (-d "$language_dir");
651
652 opendir(LANGDIR, $language_dir);
653 foreach $file (readdir LANGDIR) {
654
655 # Ignore entries that are not stopword files
656 next unless ($file =~ /sw$/);
657 $file_name = &util::filename_cat($language_dir, $file);
658 next unless (-f "$file_name");
659
660 # Read the stopwords
661 open(STOPFILE, "<$file_name");
662 while (<STOPFILE>) {
663 s/^\s+//;
664 s/\s.*//;
665 $word = $_;
666 $l = lc($word);
667 $stopwords{$l} = $word;
668 }
669 close STOPFILE;
670
671 }
672 closedir LANGDIR;
673 }
674 closedir STOPDIR;
675
676 # Read thesaurus information
677 if ($thesaurus) {
678
679 # link file exists
680 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
681 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
682
683 # ensure term file exists in the correct language
684 if ($language_exp =~ /^([a-z][a-z])/) {
685 $language = $1;
686 } else {
687 $language = 'en';
688 }
689 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
690 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
691
692
693 # Read the thesaurus terms
694 open(TH, "<$thesaurus_terms");
695 while(<TH>) {
696 s/^\d+ //;
697 s/\(.*\)//;
698 foreach $w (split(/\s+/, $_)) {
699 $thesaurus{lc($w)} = $w;
700 }
701 }
702 close TH;
703 }
704
705 # Read words in the text and count occurences
706 open(TXT, "<$phinddir/clauses");
707 my @words;
708
709 while(<TXT>) {
710 $line = $_;
711 next unless ($line =~ /./);
712
713 @words = split(/\s+/, $line);
714 foreach $w (@words) {
715 $l = lc($w);
716 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
717 $freq{$w}++;
718 }
719 $freq{$senlimit}++;
720 }
721
722 close TXT;
723
724 # Calculate the "best" form of each word
725 my (%bestform, %totalfreq, %bestfreq);
726
727 foreach $w (sort (keys %freq)) {
728 $l = lc($w);
729
730 # totalfreq is the number of times a term appears in any form
731 $totalfreq{$l} += $freq{$w};
732
733 if (defined $stopwords{$l}) {
734 $bestform{$l} = $stopwords{$l};
735
736 } elsif (defined $thesaurus{$l}) {
737 $bestform{$l} = $thesaurus{$l};
738
739 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
740 $bestfreq{$l} = $freq{$w};
741 $bestform{$l} = $w;
742 }
743 }
744
745 undef %freq;
746 undef %bestfreq;
747
748
749 # Assign symbol numbers to tokens
750 my $nextsymbol = 1;
751 my (@vocab);
752
753 # Delimiters
754 $first_delimiter = 1;
755
756 foreach $word (@delimiters) {
757
758 $word = lc($word);
759 $bestform{$word} = uc($word);
760 $vocab[$nextsymbol] = $word;
761 $symbol{$word} = $nextsymbol;
762 $nextsymbol++;
763 }
764 $last_delimiter = $nextsymbol - 1;
765
766 # Stopwords
767 $first_stopword = $nextsymbol;
768
769 foreach my $word (sort keys %stopwords) {
770
771 # don't incluse stopword unless it occurs in the text
772 $word = lc($word);
773 next unless ($totalfreq{$word});
774 next if ($symbol{$word});
775
776 $vocab[$nextsymbol] = $word;
777 $symbol{$word} = $nextsymbol;
778 $nextsymbol++;
779 }
780 $last_stopword = $nextsymbol - 1;
781 $first_contentword = $nextsymbol;
782
783 # Thesaurus terms
784 if ($thesaurus) {
785 $first_thesaurusword = $nextsymbol;
786
787 foreach my $word (sort keys %thesaurus) {
788
789 $word = lc($word);
790 next if ($symbol{$word});
791 $bestform{$word} = $thesaurus{$word};
792
793 $vocab[$nextsymbol] = $word;
794 $symbol{$word} = $nextsymbol;
795 $nextsymbol++;
796
797 }
798 $last_thesaurusword = $nextsymbol - 1;
799 }
800
801 # Other content words
802 $first_extractword = $nextsymbol;
803
804 foreach my $word (sort (keys %bestform)) {
805
806 next if ($symbol{$word});
807
808 $vocab[$nextsymbol] = $word;
809 $symbol{$word} = $nextsymbol;
810 $nextsymbol++;
811 }
812 $last_extractword = $nextsymbol - 1;
813 $last_contentword = $nextsymbol - 1;
814
815
816 # Outut the words
817 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
818 open(VOC, ">$phinddir/clauses.vocab");
819
820 for (my $i = 1; $i < $nextsymbol; $i++) {
821 $w = $vocab[$i];
822
823 print VOC "$bestform{$w}\n";
824 $totalfreq{$w} = 0 unless ($totalfreq{$w});
825 }
826 close VOC;
827
828
829 # Create statistics file
830 # Output statistics about the vocablary
831 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
832 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
833
834 open(STAT, ">$phinddir/clauses.stats")
835 || die "Cannot open $phinddir/clauses.stats: $!";
836
837 print STAT "first_delimiter $first_delimiter\n";
838 print STAT "last_delimiter $last_delimiter\n";
839 print STAT "first_stopword $first_stopword\n";
840 print STAT "last_stopword $last_stopword\n";
841 if ($thesaurus) {
842 print STAT "first_thesaurusword $first_thesaurusword\n";
843 print STAT "last_thesaurusword $last_thesaurusword\n";
844 }
845 print STAT "first_extractword $first_extractword\n";
846 print STAT "last_extractword $last_extractword\n";
847 print STAT "first_contentword $first_contentword\n";
848 print STAT "last_contentword $last_contentword\n";
849 print STAT "first_symbol $first_delimiter\n";
850 print STAT "last_symbol $last_contentword\n";
851 print STAT "first_word $first_stopword\n";
852 print STAT "last_word $last_contentword\n";
853 close STAT;
854
855 undef @vocab;
856
857
858 # Create numbers file
859 # Save text as symbol numbers
860 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
861
862 open(TXT, "<$phinddir/clauses");
863 open(NUM, ">$phinddir/clauses.numbers");
864
865 $phrasedelimiter = $symbol{lc($senlimit)};
866 print NUM "$symbol{lc($colstart)}\n";
867
868 # set up the special symbols that delimit documents and sentences
869 while(<TXT>) {
870
871 # split sentence into a list of tokens
872 $line = $_;
873 next unless ($line =~ /./);
874 @words = split(/\s+/, $line);
875
876 # output one token at a time
877 foreach $word (@words) {
878 $word = lc($word);
879 print NUM "$symbol{$word}\n";
880 }
881
882 # output phrase delimiter
883 print NUM "$phrasedelimiter\n";
884 }
885
886 close TXT;
887 print NUM "$symbol{lc($colend)}\n";
888 close NUM;
889
890 # Save thesaurus data in one convienient file
891 if ($thesaurus) {
892
893 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
894
895
896 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
897 if ($verbosity > 1);
898
899 # Read the thesaurus terms
900 my ($num, $text, %thes_symbols);
901
902 open(TH, "<$thesaurus_terms");
903 while(<TH>) {
904 chomp;
905 @words = split(/\s+/, $_);
906 $num = shift @words;
907 $text = "";
908
909 # translate words into symbol numbers
910 foreach $word (@words) {
911 $word = lc($word);
912 if ($symbol{$word}) {
913 $text .= "s$symbol{$word} ";
914 } elsif ($verbosity) {
915 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
916 }
917 }
918 $text =~ s/ $//;
919 $thes_symbols{$num} = $text;
920 }
921 close TH;
922
923 # Read the thesaurus links and write the corresponding data
924 open(TH, "<$thesaurus_links");
925 open(THOUT, ">$thesaurusfile");
926
927 while(<TH>) {
928 chomp;
929 ($num, $text) = split(/:/, $_);
930
931 if (defined($thes_symbols{$num})) {
932 print THOUT "$num:$thes_symbols{$num}:$text\n";
933 } else {
934 print THOUT "$num:untranslated:$text\n";
935 }
936 }
937 close TH;
938 close THOUT;
939 }
940
941
942
943
944}
945
946
947# renumber_phrases
948#
949# Prepare the phrases file to be input to mgpp. The biggest problem is
950# reconciling the phrase identifiers used by the suffix program (which
951# we'll call suffix-id numbers) with the numbers used in the thesaurus
952# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
953# Phind-id numbers must be sorted by frequency of occurance.
954#
955# Start creating a set of phind-id numbers from the sorted suffix-id
956# numbers and (if required) the thesaurus-id numbers. Then add any other
957# phrases occuring in the thesaurus.
958#
959# The last thing we have to do is restore the vocabulary information to the
960# phrase file so that the phrases are stored as words, not as symbol
961# numbers.
962
963# The original phrases file looks something like this:
964# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
965# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
966# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
967# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
968
969
970sub renumber_phrases {
971 my ($self) = @_;
972
973 renumber_suffix_data($self);
974 renumber_thesaurus_data($self);
975 restore_vocabulary_data($self);
976
977}
978
979
980
981# renumber_suffix_data
982#
983# Translate phrases file to phrases.2 using phind keys instead
984# of suffix keys and sorting the expansion data.
985
986sub renumber_suffix_data {
987 my ($self) = @_;
988
989 my $verbosity = $self->{'verbosity'};
990 my $out = $self->{'outhandle'};
991 print $out "Translate phrases: suffix-ids become phind-id's\n"
992 if ($verbosity);
993
994 my $phinddir = $self->{'phinddir'};
995 my $infile = &util::filename_cat($phinddir, 'phrases');
996 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
997
998 # Read the phrase file. Calculate initial set of phind-id
999 # numbers and store (suffixid -> frequency) relation.
1000
1001 my %suffixtophind;
1002 my @phindfrequency;
1003 my (@fields, $suffixid);
1004 my $nextphind = 1;
1005
1006 open(IN, "<$infile");
1007 while(<IN>) {
1008
1009 chomp;
1010 @fields = split(/:/, $_);
1011
1012 # get next suffixid and phindid
1013 $suffixid = shift @fields;
1014 $suffixtophind{$suffixid} = $nextphind;
1015
1016 # store total frequency
1017 shift @fields;
1018 $totalfrequency[$nextphind] = shift @fields;
1019
1020 $nextphind++;
1021 }
1022 close IN;
1023
1024
1025 # Translate phrases file to phrases.2. Use phind keys (not suffix
1026 # keys), sort expansion and document occurance data in order of
1027 # descending frequency..
1028 open(IN, "<$infile");
1029 open(OUT, ">$outfile");
1030
1031 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1032 my (@documwents, @newexp, $k, $n);
1033 my $linenumber = 0;
1034
1035 while(<IN>) {
1036
1037 # read the line
1038 chomp;
1039 @fields = split(/:/, $_);
1040
1041 # get a phrase number for this line
1042 $suffixid = shift @fields;
1043 die unless (defined($suffixtophind{$suffixid}));
1044 $phindid = $suffixtophind{$suffixid};
1045
1046 # get the symbols in the phrase
1047 $text = shift @fields;
1048
1049 # output status information
1050 $linenumber++;
1051 if ($verbosity > 2) {
1052 if ($linenumber % 1000 == 0) {
1053 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1054 }
1055 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1056 }
1057
1058 # get the phrase frequency
1059 $tf = shift @fields;
1060
1061 # get the number of expansions
1062 $countexp = shift @fields;
1063
1064 # get the expansions, convert them into phind-id numbers, and sort them
1065 $expansions = shift @fields;
1066 @newexp = ();
1067 foreach $k (split(/,/, $expansions)) {
1068 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1069 $n = $suffixtophind{$k};
1070 push @newexp, $n;
1071 }
1072 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1073
1074 # get the number of documents
1075 $countdocs = shift @fields;
1076
1077 # get the documents and sort them
1078 $documents = shift @fields;
1079 $documents =~ s/d//g;
1080 @documents = split(/;/, $documents);
1081 @documents = sort by_doc_frequency @documents;
1082
1083 # output the phrase data
1084 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1085 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1086
1087 }
1088
1089 close IN;
1090 close OUT;
1091}
1092
1093
1094# renumber_thesaurus_data
1095#
1096# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1097
1098sub renumber_thesaurus_data {
1099 my ($self) = @_;
1100
1101 my $out = $self->{'outhandle'};
1102 my $verbosity = $self->{'verbosity'};
1103 my $thesaurus = $self->{'thesaurus'};
1104
1105 my $phinddir = $self->{'phinddir'};
1106 my $infile = &util::filename_cat($phinddir, "phrases.2");
1107 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1108
1109
1110 # If no thesaurus is defined, simply move the phrases file.
1111 if (!$thesaurus) {
1112 print $out "Translate phrases.2: no thesaurus data\n"
1113 if ($verbosity);
1114 &util::mv($infile, $outfile);
1115 return;
1116 }
1117
1118 print $out "Translate phrases.2: add thesaurus data\n"
1119 if ($verbosity);
1120
1121 # 1.
1122 # Read thesaurus file and store (symbols->thesaurusid) mapping
1123 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1124 my %symbolstothesid;
1125 my (@fields, $thesid, $symbols);
1126
1127 open(TH, "<$thesaurusfile");
1128
1129 while (<TH>) {
1130
1131 chomp;
1132 @fields = split(/:/, $_);
1133
1134 # get id and text
1135 $thesid = shift @fields;
1136 $symbols = shift @fields;
1137 $symbolstothesid{$symbols} = $thesid;
1138 }
1139 close TH;
1140
1141 # 2.
1142 # Read phrases file to find thesaurus entries that already
1143 # have a phindid. Store their phind-ids for later translation,
1144 # and store their frequency for later sorting.
1145 my %thesaurustophindid;
1146 my %phindidtofrequency;
1147 my ($phindid, $freq);
1148
1149 open(IN, "<$infile");
1150
1151 while(<IN>) {
1152
1153 chomp;
1154 @fields = split(/:/, $_);
1155
1156 # phindid and symbols for this line
1157 $phindid = shift @fields;
1158 $symbols = shift @fields;
1159 $freq = shift @fields;
1160
1161 # do we have a thesaurus id corresponding to this phrase?
1162 if (defined($symbolstothesid{$symbols})) {
1163 $thesid = $symbolstothesid{$symbols};
1164 $thesaurustophindid{$thesid} = $phindid;
1165 $phindidtofrequency{$phindid} = $freq;
1166 }
1167 }
1168 close IN;
1169
1170 undef %symbolstothesid;
1171
1172 # 3.
1173 # Create phind-id numbers for remaining thesaurus entries,
1174 # and note that their frequency is 0 for later sorting.
1175 my $nextphindid = $phindid + 1;
1176
1177 open(TH, "<$thesaurusfile");
1178 while(<TH>) {
1179
1180 chomp;
1181 @fields = split(/:/, $_);
1182
1183 # read thesaurus-id and ensure it has a corresponding phind-id
1184 $thesid = shift @fields;
1185 if (!defined($thesaurustophindid{$thesid})) {
1186 $thesaurustophindid{$thesid} = $nextphindid;
1187 $phindidtofrequency{$nextphindid} = 0;
1188 $nextphindid++;
1189 }
1190 }
1191 close TH;
1192
1193 # 4.
1194 # Translate thesaurus file, replacing thesaurus-id numbers with
1195 # phind-id numbers.
1196 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1197 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1198 my (@links, $linkid, %linkidtotype, $newrelation);
1199
1200 open(TH, "<$thesaurusfile");
1201 open(TO, ">$newthesaurusfile");
1202 while(<TH>) {
1203
1204 chomp;
1205 @fields = split(/:/, $_);
1206
1207 # phindid and symbols for this line
1208 ($thesid, $symbols, $relations) = @fields;
1209
1210 die unless ($thesid && $symbols);
1211 die unless $thesaurustophindid{$thesid};
1212 $phindid = $thesaurustophindid{$thesid};
1213
1214 # convert each part of the relation string to use phind-id numbers
1215 # at the same time, we want to sort the list by frequency.
1216 undef %linkidtotype;
1217
1218 foreach $linktext (split(/;/, $relations)) {
1219 @linkdata = split(/,/, $linktext);
1220
1221 # remember the linktype (e.g. BT, NT)
1222 $linktype = shift @linkdata;
1223
1224 # store the type of each link
1225 foreach $thesid (@linkdata) {
1226 die unless (defined($thesaurustophindid{$thesid}));
1227 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1228 }
1229 }
1230
1231 # sort the list of links, first by frequency, then by type.
1232 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1233 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1234 $linkcounter = (scalar @links);
1235
1236 # create a string describing the link information
1237 $linktype = $linkidtotype{$links[0]};
1238 $newrelation = $linktype;
1239 foreach $linkid (@links) {
1240 if ($linkidtotype{$linkid} ne $linktype) {
1241 $linktype = $linkidtotype{$linkid};
1242 $newrelation .= ";" . $linktype;
1243 }
1244 $newrelation .= "," . $linkid;
1245 }
1246 $newrelation .= ";";
1247
1248
1249 # output the new line
1250 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1251 }
1252 close TH;
1253 close TO;
1254
1255 undef %thesaurustophindid;
1256 undef %linkidtotype;
1257 undef %phindidtofrequency;
1258
1259 # 5.
1260 # Read thesaurus data (in phind-id format) into memory
1261 my %thesaurusdata;
1262
1263 open(TH, "<$newthesaurusfile");
1264 while(<TH>) {
1265 chomp;
1266 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1267 die unless ($phindid && $symbols);
1268 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1269 }
1270 close TH;
1271
1272 # 6.
1273 # Add thesaurus data to phrases file
1274 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1275 my (@documwents, @newexp, $k, $n);
1276 my $linenumber = 0;
1277
1278 open(IN, "<$infile");
1279 open(OUT, ">$outfile");
1280
1281 # Update existing phrases
1282 while(<IN>) {
1283
1284 chomp;
1285 @fields = split(/:/, $_);
1286
1287 # get data for this line
1288 $phindid = shift @fields;
1289
1290 # output the phrase data, with thesaurus information
1291 print OUT "$phindid:", join(":", @fields);
1292
1293 # add thesaurus data
1294 if (defined($thesaurusdata{$phindid})) {
1295 @fields = split(/:/, $thesaurusdata{$phindid});
1296 shift @fields;
1297 $linkcounter = shift @fields;
1298 $relations = shift @fields;
1299
1300 print OUT ":$linkcounter:$relations";
1301 $thesaurusdata{$phindid} = "";
1302 }
1303 print OUT "\n";
1304 }
1305 close IN;
1306
1307 # Add phrases that aren't already in the file
1308 foreach $phindid (sort numerically keys %thesaurusdata) {
1309 next unless ($thesaurusdata{$phindid});
1310
1311 @fields = split(/:/, $thesaurusdata{$phindid});
1312 $symbols = shift @fields;
1313 $linkcounter = shift @fields;
1314 $relations = shift @fields;
1315
1316 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1317 }
1318 close OUT;
1319
1320}
1321
1322# restore_vocabulary_data
1323#
1324# Read phrases.3 and restore vocabulary information. Then write
1325# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1326# (if requested) to the saved phrases file.
1327
1328sub restore_vocabulary_data {
1329 my ($self) = @_;
1330
1331 my $out = $self->{'outhandle'};
1332 my $verbosity = $self->{'verbosity'};
1333 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1334
1335 my $phinddir = $self->{'phinddir'};
1336 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1337 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1338 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1339 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1340
1341 my $savephrases = $self->{'savephrases'};
1342
1343 # 1.
1344 # Read the vocabulary file
1345 open(V, "<$vocabfile")
1346 || die "Cannot open $vocabfile: $!";
1347 my @symbol;
1348 my $i = 1;
1349 while(<V>) {
1350 chomp;
1351 $symbol[$i++] = $_;
1352 }
1353 close V;
1354
1355 # 2.
1356 # Translate phrases.3 to MGPP input files
1357 my ($key, $text, $word, $isThesaurus);
1358 my @fields;
1359 my $linenumber = 0;
1360
1361 open(IN, "<$infile");
1362 open(DATA, ">$datafile");
1363 open(WORD, ">$wordfile");
1364
1365 # Save the phrases in a separate text file
1366 if ($savephrases) {
1367 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1368 open(SAVE, ">$savephrases");
1369 }
1370
1371 while(<IN>) {
1372
1373 # read the line
1374 chomp;
1375 $line = $_;
1376 @fields = split(/:/, $line);
1377
1378 # get a phrase number for this line
1379 $key = shift @fields;
1380
1381 # restore the text of the phrase
1382 $text = shift @fields;
1383 $text =~ s/s(\d+)/$symbol[$1]/g;
1384 if ($text =~ / /) {
1385 $word = "";
1386 } elsif ($text ne 'untranslated') {
1387 $word = $text;
1388 }
1389
1390 # output the phrase data
1391 print DATA "<Document>";
1392 print DATA "$key:$text:", join(":", @fields), ":\n";
1393
1394 # output the word index search data
1395 print WORD "<Document>$word\n";
1396
1397 # output the phrases to a text file
1398 if ($savephrases) {
1399 if ((scalar @fields) == 7) {
1400 $isThesaurus = 1;
1401 } else {
1402 $isThesaurus = 0;
1403 }
1404 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1405 }
1406 }
1407 close IN;
1408 close WORD;
1409 close DATA;
1410 close SAVE if ($savephrases);
1411
1412}
1413
1414
1415
1416# sort routines used to renumber phrases
1417
1418sub numerically { $a <=> $b }
1419
1420sub by_doc_frequency {
1421 my $fa = 1;
1422 if ($a =~ /,/) {
1423 $fa = $a;
1424 $fa =~ s/\d+,//;
1425 }
1426 my $fb = 1;
1427 if ($b =~ /,/) {
1428 $fb = $b;
1429 $fb =~ s/\d+,//;
1430 }
1431
1432 return ($fb <=> $fa);
1433}
1434
14351;
Note: See TracBrowser for help on using the repository browser.