source: trunk/gsdl/perllib/classify/phind.pm@ 2083

Last change on this file since 2083 was 2083, checked in by paynter, 23 years ago

Fixed a stupid mistake that I know I've fixed before.

  • Property svn:keywords set to Author Date Id Revision
File size: 39.3 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin.
30# Options are dexcribed in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40sub BEGIN {
41 @ISA = ('BasClas');
42}
43
44
45sub print_usage {
46 print STDERR "
47 usage: classify phind [options]
48
49 options:
50 -text Fields The text used to build the phrase hierarchy.
51 (default: 'section:Title,section:text')
52
53 -title Title The metadata field used to describe each document.
54 (default: 'Title')
55
56 -button Name The label for the classifier screen and button in
57 navigation bar.
58 (default: 'Phrase')
59
60 -language Regex Language or languages to use building hierarchy.
61 Languages are identified by two-letter country codes
62 like en (English), es (Spanish), and fr (French).
63 Language is a regular expression, so 'en|fr' (English or
64 French) and '..' (match any language) are valid.
65 (default: 'en'.)
66
67 -savephrases File If set, the phrase infomation will be stored in
68 the given file as text. It is probably a good idea
69 to use an absolute path.
70 (defualt: not set)
71
72 -suffixmode N The smode parameter to the phrase extraction program. A
73 value of 0 means that stopwords are ignored, and of 1
74 means that stopwords are used.
75 (default: 1)
76
77 -thesaurus Name Name of a thesaurus stored in phind format in the
78 collection's etc directory.
79 (default: not set)
80
81 -untidy Don't remove working files.
82
83"; }
84
85
86# Phrase delimiter symbols - these should be abstracted out someplace
87
88my $colstart = "COLLECTIONSTART";
89my $colend = "COLLECTIONEND";
90my $doclimit = "DOCUMENTLIMIT";
91my $senlimit = "SENTENCELIMIT";
92my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
93
94
95# Create a new phind browser based on collect.cfg
96
97sub new {
98 my $class = shift (@_);
99 my $self = new BasClas($class, @_);
100
101 my $out = $self->{'outhandle'};
102
103
104 # Phind installation check
105 # The phind phrase browser is research software and is not installed
106 # by defualt. If the user attepts to use it we warn them that it's a
107 # bit dodgy, then tell them how to install it. If they can do that
108 # and get all the files in place, then we let them proceed.
109
110 print $out "Checking Phind phrase browser requirements...\n";
111
112 # Make sure we're not in windows
113 if ($ENV{'GSDLOS'} =~ /windows/i) {
114 print STDERR "Sorry - Phind currently only works under Unix";
115 exit(1);
116 }
117
118 # Ensure the Phind generate scripts are in place
119 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
120 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
121
122 if (!(-e $file1)) {
123 print STDERR "The phind \"suffix\" program is not installed. ";
124 print STDERR "To install it, change to the directory\n";
125 print STDERR " $src\n";
126 print STDERR "and type \"make install-phind\".\n\n";
127 exit(1);
128 }
129
130 # Ensure the Phind CGI script is in place
131 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
132 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
133
134 if (!(-e $file1)) {
135 print STDERR "The phind CGI program is not installed. ";
136 print STDERR "To install it, change to the directory\n";
137 print STDERR " $src\n";
138 print STDERR "and type \"make install-phind\".\n\n";
139 exit(1);
140 }
141
142 # Ensure the Phind Java applet is in place
143 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
144 $file1 = &util::filename_cat($src, "Phind.class");
145
146 if (!(-e $file1)) {
147 print STDERR "The phind Java classes are not compiled. ";
148 print STDERR "To compile them, change to the directory\n";
149 print STDERR " $src\n";
150 print STDERR "and use your Java compiler to compile Phind.java.\n";
151 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
152 exit(1);
153 }
154
155 # Parse classifier arguments
156 my $builddir = "";
157 if (!parsargv::parse(\@_,
158 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
159 q^title/.*/Title^, \$self->{'titlefield'},
160 q^button/.*/Phrase^, \$self->{'buttonname'},
161 q^language/.*/en^, \$language,
162 q^builddir/.*/^, \$builddir,
163 q^savephrases/\d/0^, \$self->{'savephrases'},
164 q^suffixmode/\d/1^, \$self->{'suffixmode'},
165 q^thesaurus/.*/^, \$self->{'thesaurus'},
166 q^untidy^, \$self->{'untidy'},
167 "allow_extra_options")) {
168
169 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
170 &print_usage();
171 die "\n";
172 }
173
174 # classifier information
175 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
176
177 # limit languages
178 $self->{'language_exp'} = $language;
179
180 # collection directories
181 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
182 if (!$builddir) {
183 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
184 }
185 $self->{'builddir'} = $builddir;
186
187 return bless $self, $class;
188}
189
190
191# Initialise the phind classifier
192
193sub init {
194 my $self = shift (@_);
195
196 # ensure we have a build directory
197 my $builddir = $self->{'builddir'};
198 die unless (-e "$builddir");
199
200 # create phind directory
201 my $phnumber = 1;
202 my $phinddir = &util::filename_cat($builddir, "phind1");
203 while (-e "$phinddir") {
204 $phnumber++;
205 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
206 }
207 &util::mk_dir("$phinddir");
208 $self->{'phinddir'} = $phinddir;
209 $self->{'phindnumber'} = $phnumber;
210
211 # open filehandles for documents and text
212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
213 &util::rm($clausefile) if (-e $clausefile);
214
215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
220 &util::rm($docfile) if (-e $docfile);
221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
225
226}
227
228
229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
234
235sub classify {
236 my $self = shift (@_);
237 my $doc_obj = shift @_;
238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
245 print "process: $title\n" if ($verbosity > 2);
246
247 # Only consider the file if it is in the correct language
248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
253 my $total++;
254 print "file $total: $file\n" if ($self->{'$verbosity'});
255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occuring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterarate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
272 my ($part, $level, $field, $section, $data, $dataref);
273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element ((e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
279
280 # Extract the text from every section
281 # (In phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
290 }
291
292 # Extract a metadata field from a document
293 # (If ther eis more than one element of the given type, get them all.)
294 elsif ($level eq "document") {
295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
296 foreach $data ($@dataref) {
297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
298 }
299 }
300
301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in phind index ($part)\n";
316 }
317
318 }
319
320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
329# already been constructed in the phind directory. This function will
330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
332# shows up in the navigation bar.
333
334sub get_classify_info {
335 my $self = shift (@_);
336
337 my $verbosity = $self->{'verbosity'};
338 my $out = $self->{'outhandle'};
339 my $phinddir = $self->{'phinddir'};
340
341 if ($verbosity) {
342 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
343 print $out "*** in", $self->{'phinddir'}, "\n";
344 }
345
346 # Construct phind indexes
347 my $suffixmode = $self->{'suffixmode'};
348 my ($command, $status);
349
350 # Generate the vocabulary, symbol statistics, and numbers file
351 # from the clauses file
352 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
353 &extract_vocabulary($self);
354
355 # Use the suffix program to generate the phind/phrases file
356 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
357 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
358
359 # Create the phrase file and put phrase numbers in phind/phrases
360 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
361 &renumber_phrases($self);
362
363 # Create the mg phrase database
364 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
365 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
366 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
367
368 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
369 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
370 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
371 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
372
373 print $out "\nCreating phrase databases\n";
374 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
375 my $mg_stem = "pdata";
376
377 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
378 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
379 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
380
381 # create the mg index of words
382 print $out "\nCreating word-level search indexes\n";
383 $mg_input = &util::filename_cat($phinddir, "pword.txt");
384 $mg_stem = "pword";
385
386 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
387 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
388 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
389 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
390 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
391 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
392
393 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
394 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
395 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
396
397 # create the mg document information database
398 print $out "\nCreating document information databases\n";
399 $mg_input = &util::filename_cat($phinddir, "docs.txt");
400 $mg_stem = "docs";
401
402 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
403 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
404 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
405
406
407 # Tidy up stray files
408 if (!$self->{'untidy'}) {
409 print $out "\nCleaning up\n" if ($verbosity > 2);
410 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
411 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
412 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
413 "$phinddir/pdata.txt", "$phinddir/pword.txt");
414 my $outfile = 1;
415 while (-e "$phinddir/outPhrase.$outfile") {
416 &util::rm("$phinddir/outPhrase.$outfile");
417 $outfile++;
418 }
419 }
420
421
422 # Return the information about the classifier that we'll later want to
423 # use to create macros when the Phind classifier document is displayed.
424 my %classifyinfo = ('thistype'=>'Invisible',
425 'childtype'=>'Phind',
426 'Title'=>$self->{'buttonname'},
427 'parameters'=>"phindnumber=$self->{'phindnumber'}",
428 'contains'=>[]);
429
430 my $collection = $self->{'collection'};
431 my $url = "library?a=p&p=phind&c=$collection";
432 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
433
434 return \%classifyinfo;
435}
436
437
438
439sub convert_gml_to_tokens {
440
441 my ($language_exp, $text) = @_;
442
443 if ($language_exp =~ /en/) {
444 return &convert_gml_to_tokens_EN($text);
445 }
446
447 $_ = $text;
448
449 # 1. remove GML tags
450
451 # Remove everything that is in a tag
452 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
453 s/\s*<br>\s*/ LINEBREAK /isgo;
454 s/<[^>]*>/ /sgo;
455
456 # Now we have the text, but it may contain HTML
457 # elements coded as &gt; etc. Remove these tags.
458 s/&amp;/&/sgo;
459 s/&lt;/</sgo;
460 s/&gt;/>/sgo;
461 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
462 s/\s*<br>\s*/ LINEBREAK /isgo;
463 s/<[^>]*>/ /sgo;
464
465 # replace<p> and <br> placeholders with clause break symbol (\n)
466 s/\s+/ /gso;
467 s/PARAGRAPHBREAK/\n/sgo;
468 s/LINEBREAK/\n/sgo;
469
470
471
472
473 # 2. Split the remaining text into space-delimited tokens
474
475 # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
476 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
477
478 # Split text at word boundaries
479 s/\b/ /go;
480
481 # 3. Convert the remaining text to "clause format"
482
483 # Insert newline if the end of a sentence is detected
484 # (delimter is: "[\.\?\!]\s")
485 # s/\s*[\.\?\!]\s+/\n/go;
486
487 # remove unnecessary punctuation and replace with clause break symbol (\n)
488 s/[^\w ]/\n/go;
489
490 # remove extraneous whitespace
491 s/ +/ /sgo;
492 s/^\s+//mgo;
493 s/\s*$/\n/mgo;
494
495 # remove lines that contain one word or less
496 s/^\S*$//mgo;
497 s/^\s*$//mgo;
498 tr/\n//s;
499
500 return $_;
501}
502
503# A version of convert_gml_to_tokens that is fine-tuned to the English language.
504
505sub convert_gml_to_tokens_EN {
506 $_ = shift @_;
507
508 # FIRST, remove GML tags
509
510 # Replace all whitespace with a simple space
511 s/\s+/ /gs;
512
513 # Remove everything that is in a tag
514 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
515 s/\s*<br>\s*/ LINEBREAK /isg;
516 s/<[^>]*>/ /sg;
517
518 # Now we have the text, but it may contain HTML
519 # elements coded as &gt; etc. Remove these tags.
520 s/&lt;/</sg;
521 s/&gt;/>/sg;
522
523 s/\s+/ /sg;
524 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
525 s/\s*<br>\s*/ LINEBREAK /isg;
526 s/<[^>]*>/ /sg;
527
528 # remove &amp; and other miscellaneous markup tags
529 s/&amp;/&/sg;
530 s/&lt;/</sg;
531 s/&gt;/>/sg;
532 s/&amp;/&/sg;
533
534 # replace<p> and <br> placeholders with carriage returns
535 s/PARAGRAPHBREAK/\n/sg;
536 s/LINEBREAK/\n/sg;
537
538
539 # Exceptional punctuation
540 #
541 # We make special cases of some punctuation
542
543 # remove any apostrophe that indicates omitted letters
544 s/(\w+)\'(\w*\s)/ $1$2 /g;
545
546 # remove period that appears in a person's initals
547 s/\s([A-Z])\./ $1 /g;
548
549 # replace hyphens in hypheanted words and names with a space
550 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
551
552 # Convert the remaining text to "clause format",
553 # This means removing all excess punctuation and garbage text,
554 # normalising valid punctuation to fullstops and commas,
555 # then putting one cluse on each line.
556
557 # Insert newline when the end of a sentence is detected
558 # (delimter is: "[\.\?\!]\s")
559 s/\s*[\.\?\!]\s+/\n/g;
560
561 # split numbers after four digits
562 s/(\d\d\d\d)/$1 /g;
563
564 # split words after 32 characters
565
566 # squash repeated punctuation
567 tr/A-Za-z0-9 //cs;
568
569 # save email addresses
570 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
571
572 # normalise clause breaks (mostly punctuation symbols) to commas
573 s/[^A-Za-z0-9 \n]+/ , /g;
574
575 # Remove repeated commas, and replace with newline
576 s/\s*,[, ]+/\n/g;
577
578 # remove extra whitespace
579 s/ +/ /sg;
580 s/^\s+//mg;
581 s/\s*$/\n/mg;
582
583 # remove lines that contain one word or less
584 s/^\w*$//mg;
585 s/^\s*$//mg;
586 tr/\n//s;
587
588 return $_;
589
590}
591
592
593
594# Execute a system command
595
596sub execute {
597 my ($command, $verbosity, $outhandle) = @_;
598 print $outhandle "Executing: $command\n" if ($verbosity > 2);
599 my $status = system($command);
600 if ($status != 0) {
601 print STDERR "phind - Error executing $command: $!\n";
602 exit($status);
603 }
604}
605
606
607# Generate the vocabulary, symbol statistics, and numbers file from the
608# clauses file. This is legacy code, so is a bit messy and probably wont
609# run under windows.
610
611sub extract_vocabulary {
612 my ($self) = @_;
613
614 my $verbosity = $self->{'verbosity'};
615 my $out = $self->{'outhandle'};
616
617 my $collectiondir = $self->{'collectiondir'};
618 my $phinddir = $self->{'phinddir'};
619
620 my $language_exp = $self->{'language_exp'};
621
622 my ($w, $l, $line, $word);
623
624 my ($first_delimiter, $last_delimiter,
625 $first_stopword, $last_stopword,
626 $first_extractword, $last_extractword,
627 $first_contentword, $last_contentword,
628 $phrasedelimiter);
629
630 my $thesaurus = $self->{'thesaurus'};
631 my ($thesaurus_links, $thesaurus_terms,
632 %thesaurus, $first_thesaurusword, $last_thesaurusword);
633
634 my %symbol;
635 my (%freq);
636
637 print $out "Calculating vocabulary\n" if ($verbosity > 1);
638
639 # Read and store the stopwords
640 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
641 my $stopword_files = ();
642 my ($language, $language_dir, $file, $file_name);
643 my %stopwords;
644
645 # Examine each directory in the stopword directory
646 opendir(STOPDIR, $stopdir);
647 foreach $language (readdir STOPDIR) {
648
649 # Ignore entries that do not match the classifier's language
650 next unless ($language =~ /$language_exp/);
651 $language_dir = &util::filename_cat($stopdir, $language);
652 next unless (-d "$language_dir");
653
654 opendir(LANGDIR, $language_dir);
655 foreach $file (readdir LANGDIR) {
656
657 # Ignore entries that are not stopword files
658 next unless ($file =~ /sw$/);
659 $file_name = &util::filename_cat($language_dir, $file);
660 next unless (-f "$file_name");
661
662 # Read the stopwords
663 open(STOPFILE, "<$file_name");
664 while (<STOPFILE>) {
665 s/^\s+//;
666 s/\s.*//;
667 $word = $_;
668 $l = lc($word);
669 $stopwords{$l} = $word;
670 }
671 close STOPFILE;
672
673 }
674 }
675
676 # Read thesaurus information
677 if ($thesaurus) {
678
679 # link file exists
680 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
681 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
682
683 # ensure term file exists in the correct language
684 if ($language_exp =~ /^([a-z][a-z])/) {
685 $language = $1;
686 } else {
687 $language = 'en';
688 }
689 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
690 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
691
692
693 # Read the thesaurus terms
694 open(TH, "<$thesaurus_terms");
695 while(<TH>) {
696 s/^\d+ //;
697 s/\(.*\)//;
698 foreach $w (split(/\s+/, $_)) {
699 $thesaurus{lc($w)} = $w;
700 }
701 }
702 close TH;
703 }
704
705 # Read words in the text and count occurences
706 open(TXT, "<$phinddir/clauses");
707 my @words;
708
709 while(<TXT>) {
710 $line = $_;
711 next unless ($line =~ /./);
712
713 @words = split(/\s+/, $line);
714 foreach $w (@words) {
715 $l = lc($w);
716 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
717 $freq{$w}++;
718 }
719 $freq{$senlimit}++;
720 }
721
722 # Calculate the "best" form of each word
723 my (%bestform, %totalfreq, %bestfreq);
724
725 foreach $w (sort (keys %freq)) {
726 $l = lc($w);
727
728 # totalfreq is the number of times a term appears in any form
729 $totalfreq{$l} += $freq{$w};
730
731 if (defined $stopwords{$l}) {
732 $bestform{$l} = $stopwords{$l};
733
734 } elsif (defined $thesaurus{$l}) {
735 $bestform{$l} = $thesaurus{$l};
736
737 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
738 $bestfreq{$l} = $freq{$w};
739 $bestform{$l} = $w;
740 }
741 }
742
743 undef %freq;
744 undef %bestfreq;
745
746
747 # Assign symbol numbers to tokens
748 my $nextsymbol = 1;
749 my (@vocab);
750
751 # Delimiters
752 $first_delimiter = 1;
753
754 foreach $word (@delimiters) {
755
756 $word = lc($word);
757 $bestform{$word} = uc($word);
758 $vocab[$nextsymbol] = $word;
759 $symbol{$word} = $nextsymbol;
760 $nextsymbol++;
761 }
762 $last_delimiter = $nextsymbol - 1;
763
764 # Stopwords
765 $first_stopword = $nextsymbol;
766
767 foreach my $word (sort keys %stopwords) {
768
769 # don't incluse stopword unless it occurs in the text
770 $word = lc($word);
771 next unless ($totalfreq{$word});
772 next if ($symbol{$word});
773
774 $vocab[$nextsymbol] = $word;
775 $symbol{$word} = $nextsymbol;
776 $nextsymbol++;
777 }
778 $last_stopword = $nextsymbol - 1;
779 $first_contentword = $nextsymbol;
780
781 # Thesaurus terms
782 if ($thesaurus) {
783 $first_thesaurusword = $nextsymbol;
784
785 foreach my $word (sort keys %thesaurus) {
786
787 $word = lc($word);
788 next if ($symbol{$word});
789 $bestform{$word} = $thesaurus{$word};
790
791 $vocab[$nextsymbol] = $word;
792 $symbol{$word} = $nextsymbol;
793 $nextsymbol++;
794
795 }
796 $last_thesaurusword = $nextsymbol - 1;
797 }
798
799 # Other content words
800 $first_extractword = $nextsymbol;
801
802 foreach my $word (sort (keys %bestform)) {
803
804 next if ($symbol{$word});
805
806 $vocab[$nextsymbol] = $word;
807 $symbol{$word} = $nextsymbol;
808 $nextsymbol++;
809 }
810 $last_extractword = $nextsymbol - 1;
811 $last_contentword = $nextsymbol - 1;
812
813
814 # Outut the words
815 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
816 open(VOC, ">$phinddir/clauses.vocab");
817
818 for (my $i = 1; $i < $nextsymbol; $i++) {
819 $w = $vocab[$i];
820
821 print VOC "$bestform{$w}\n";
822 $totalfreq{$w} = 0 unless ($totalfreq{$w});
823 }
824 close VOC;
825
826
827 # Create statistics file
828 # Output statistics about the vocablary
829 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
830 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
831
832 open(STAT, ">$phinddir/clauses.stats")
833 || die "Cannot open $phinddir/clauses.stats: $!";
834
835 print STAT "first_delimiter $first_delimiter\n";
836 print STAT "last_delimiter $last_delimiter\n";
837 print STAT "first_stopword $first_stopword\n";
838 print STAT "last_stopword $last_stopword\n";
839 if ($thesaurus) {
840 print STAT "first_thesaurusword $first_thesaurusword\n";
841 print STAT "last_thesaurusword $last_thesaurusword\n";
842 }
843 print STAT "first_extractword $first_extractword\n";
844 print STAT "last_extractword $last_extractword\n";
845 print STAT "first_contentword $first_contentword\n";
846 print STAT "last_contentword $last_contentword\n";
847 print STAT "first_symbol $first_delimiter\n";
848 print STAT "last_symbol $last_contentword\n";
849 print STAT "first_word $first_stopword\n";
850 print STAT "last_word $last_contentword\n";
851 close STAT;
852
853 undef @vocab;
854
855
856 # Create numbers file
857 # Save text as symbol numbers
858 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
859
860 open(TXT, "<$phinddir/clauses");
861 open(NUM, ">$phinddir/clauses.numbers");
862
863 $phrasedelimiter = $symbol{lc($senlimit)};
864 print NUM "$symbol{lc($colstart)}\n";
865
866 # set up the special symbols that delimit documents and sentences
867 while(<TXT>) {
868
869 # split sentence into a list of tokens
870 $line = $_;
871 next unless ($line =~ /./);
872 @words = split(/\s+/, $line);
873
874 # output one token at a time
875 foreach $word (@words) {
876 $word = lc($word);
877 print NUM "$symbol{$word}\n";
878 }
879
880 # output phrase delimiter
881 print NUM "$phrasedelimiter\n";
882 }
883
884 print NUM "$symbol{lc($colend)}\n";
885 close NUM;
886
887 # Save thesaurus data in one convienient file
888 if ($thesaurus) {
889
890 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
891
892
893 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
894 if ($verbosity > 1);
895
896 # Read the thesaurus terms
897 my ($num, $text, %thes_symbols);
898
899 open(TH, "<$thesaurus_terms");
900 while(<TH>) {
901 chomp;
902 @words = split(/\s+/, $_);
903 $num = shift @words;
904 $text = "";
905
906 # translate words into symbol numbers
907 foreach $word (@words) {
908 $word = lc($word);
909 if ($symbol{$word}) {
910 $text .= "s$symbol{$word} ";
911 } elsif ($verbosity) {
912 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
913 }
914 }
915 $text =~ s/ $//;
916 $thes_symbols{$num} = $text;
917 }
918 close TH;
919
920 # Read the thesaurus links and write the corresponding data
921 open(TH, "<$thesaurus_links");
922 open(THOUT, ">$thesaurusfile");
923
924 while(<TH>) {
925 chomp;
926 ($num, $text) = split(/:/, $_);
927
928 if (defined($thes_symbols{$num})) {
929 print THOUT "$num:$thes_symbols{$num}:$text\n";
930 } else {
931 print THOUT "$num:untranslated:$text\n";
932 }
933 }
934 close TH;
935 close THOUT;
936 }
937
938
939
940
941}
942
943
944# renumber_phrases
945#
946# Prepare the phrases file to be input to mgpp. The biggest problem is
947# reconciling the phrase identifiers used by the suffix program (which
948# we'll call suffix-id numbers) with the numbers used in the thesaurus
949# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
950# Phind-id numbers must be sorted by frequency of occurance.
951#
952# Start creating a set of phind-id numbers from the sorted suffix-id
953# numbers and (if required) the thesaurus-id numbers. Then add any other
954# phrases occuring in the thesaurus.
955#
956# The last thing we have to do is restore the vocabulary information to the
957# phrase file so that the phrases are stored as words, not as symbol
958# numbers.
959
960# The original phrases file looks something like this:
961# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
962# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
963# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
964# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
965
966
967sub renumber_phrases {
968 my ($self) = @_;
969
970 renumber_suffix_data($self);
971 renumber_thesaurus_data($self);
972 restore_vocabulary_data($self);
973
974}
975
976
977
978# renumber_suffix_data
979#
980# Translate phrases file to phrases.2 using phind keys instead
981# of suffix keys and sorting the expansion data.
982
983sub renumber_suffix_data {
984 my ($self) = @_;
985
986 my $verbosity = $self->{'verbosity'};
987 my $out = $self->{'outhandle'};
988 print $out "Translate phrases: suffix-ids become phind-id's\n"
989 if ($verbosity);
990
991 my $phinddir = $self->{'phinddir'};
992 my $infile = &util::filename_cat($phinddir, 'phrases');
993 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
994
995 # Read the phrase file. Calculate initial set of phind-id
996 # numbers and store (suffixid -> frequency) relation.
997
998 my %suffixtophind;
999 my @phindfrequency;
1000 my (@fields, $suffixid);
1001 my $nextphind = 1;
1002
1003 open(IN, "<$infile");
1004 while(<IN>) {
1005
1006 chomp;
1007 @fields = split(/:/, $_);
1008
1009 # get next suffixid and phindid
1010 $suffixid = shift @fields;
1011 $suffixtophind{$suffixid} = $nextphind;
1012
1013 # store total frequency
1014 shift @fields;
1015 $totalfrequency[$nextphind] = shift @fields;
1016
1017 $nextphind++;
1018 }
1019 close IN;
1020
1021
1022 # Translate phrases file to phrases.2. Use phind keys (not suffix
1023 # keys), sort expansion and document occurance data in order of
1024 # descending frequency..
1025 open(IN, "<$infile");
1026 open(OUT, ">$outfile");
1027
1028 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1029 my (@documwents, @newexp, $k, $n);
1030 my $linenumber = 0;
1031
1032 while(<IN>) {
1033
1034 # read the line
1035 chomp;
1036 @fields = split(/:/, $_);
1037
1038 # get a phrase number for this line
1039 $suffixid = shift @fields;
1040 die unless (defined($suffixtophind{$suffixid}));
1041 $phindid = $suffixtophind{$suffixid};
1042
1043 # get the symbols in the phrase
1044 $text = shift @fields;
1045
1046 # output status information
1047 $linenumber++;
1048 if ($verbosity > 2) {
1049 if ($linenumber % 1000 == 0) {
1050 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1051 }
1052 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1053 }
1054
1055 # get the phrase frequency
1056 $tf = shift @fields;
1057
1058 # get the number of expansions
1059 $countexp = shift @fields;
1060
1061 # get the expansions, convert them into phind-id numbers, and sort them
1062 $expansions = shift @fields;
1063 @newexp = ();
1064 foreach $k (split(/,/, $expansions)) {
1065 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1066 $n = $suffixtophind{$k};
1067 push @newexp, $n;
1068 }
1069 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1070
1071 # get the number of documents
1072 $countdocs = shift @fields;
1073
1074 # get the documents and sort them
1075 $documents = shift @fields;
1076 $documents =~ s/d//g;
1077 @documents = split(/;/, $documents);
1078 @documents = sort by_doc_frequency @documents;
1079
1080 # output the phrase data
1081 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1082 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1083
1084 }
1085
1086 close IN;
1087 close OUT;
1088}
1089
1090
1091# renumber_thesaurus_data
1092#
1093# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1094
1095sub renumber_thesaurus_data {
1096 my ($self) = @_;
1097
1098 my $out = $self->{'outhandle'};
1099 my $verbosity = $self->{'verbosity'};
1100 my $thesaurus = $self->{'thesaurus'};
1101
1102 my $phinddir = $self->{'phinddir'};
1103 my $infile = &util::filename_cat($phinddir, "phrases.2");
1104 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1105
1106
1107 # If no thesaurus is defined, simply move the phrases file.
1108 if (!$thesaurus) {
1109 print $out "Translate phrases.2: no thesaurus data\n"
1110 if ($verbosity);
1111 &util::mv($infile, $outfile);
1112 return;
1113 }
1114
1115 print $out "Translate phrases.2: add thesaurus data\n"
1116 if ($verbosity);
1117
1118 # 1.
1119 # Read thesaurus file and store (symbols->thesaurusid) mapping
1120 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1121 my %symbolstothesid;
1122 my (@fields, $thesid, $symbols);
1123
1124 open(TH, "<$thesaurusfile");
1125
1126 while (<TH>) {
1127
1128 chomp;
1129 @fields = split(/:/, $_);
1130
1131 # get id and text
1132 $thesid = shift @fields;
1133 $symbols = shift @fields;
1134 $symbolstothesid{$symbols} = $thesid;
1135 }
1136 close TH;
1137
1138 # 2.
1139 # Read phrases file to find thesaurus entries that already
1140 # have a phindid. Store their phind-ids for later translation,
1141 # and store their frequency for later sorting.
1142 my %thesaurustophindid;
1143 my %phindidtofrequency;
1144 my ($phindid, $freq);
1145
1146 open(IN, "<$infile");
1147
1148 while(<IN>) {
1149
1150 chomp;
1151 @fields = split(/:/, $_);
1152
1153 # phindid and symbols for this line
1154 $phindid = shift @fields;
1155 $symbols = shift @fields;
1156 $freq = shift @fields;
1157
1158 # do we have a thesaurus id corresponding to this phrase?
1159 if (defined($symbolstothesid{$symbols})) {
1160 $thesid = $symbolstothesid{$symbols};
1161 $thesaurustophindid{$thesid} = $phindid;
1162 $phindidtofrequency{$phindid} = $freq;
1163 }
1164 }
1165 close IN;
1166
1167 undef %symbolstothesid;
1168
1169 # 3.
1170 # Create phind-id numbers for remaining thesaurus entries,
1171 # and note that their frequency is 0 for later sorting.
1172 my $nextphindid = $phindid + 1;
1173
1174 open(TH, "<$thesaurusfile");
1175 while(<TH>) {
1176
1177 chomp;
1178 @fields = split(/:/, $_);
1179
1180 # read thesaurus-id and ensure it has a corresponding phind-id
1181 $thesid = shift @fields;
1182 if (!defined($thesaurustophindid{$thesid})) {
1183 $thesaurustophindid{$thesid} = $nextphindid;
1184 $phindidtofrequency{$nextphindid} = 0;
1185 $nextphindid++;
1186 }
1187 }
1188 close TH;
1189
1190 # 4.
1191 # Translate thesaurus file, replacing thesaurus-id numbers with
1192 # phind-id numbers.
1193 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1194 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1195 my (@links, $linkid, %linkidtotype, $newrelation);
1196
1197 open(TH, "<$thesaurusfile");
1198 open(TO, ">$newthesaurusfile");
1199 while(<TH>) {
1200
1201 chomp;
1202 @fields = split(/:/, $_);
1203
1204 # phindid and symbols for this line
1205 ($thesid, $symbols, $relations) = @fields;
1206
1207 die unless ($thesid && $symbols);
1208 die unless $thesaurustophindid{$thesid};
1209 $phindid = $thesaurustophindid{$thesid};
1210
1211 # convert each part of the relation string to use phind-id numbers
1212 # at the same time, we want to sort the list by frequency.
1213 undef %linkidtotype;
1214
1215 foreach $linktext (split(/;/, $relations)) {
1216 @linkdata = split(/,/, $linktext);
1217
1218 # remember the linktype (e.g. BT, NT)
1219 $linktype = shift @linkdata;
1220
1221 # store the type of each link
1222 foreach $thesid (@linkdata) {
1223 die unless (defined($thesaurustophindid{$thesid}));
1224 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1225 }
1226 }
1227
1228 # sort the list of links, first by frequency, then by type.
1229 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1230 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1231 $linkcounter = (scalar @links);
1232
1233 # create a string describing the link information
1234 $linktype = $linkidtotype{$links[0]};
1235 $newrelation = $linktype;
1236 foreach $linkid (@links) {
1237 if ($linkidtotype{$linkid} ne $linktype) {
1238 $linktype = $linkidtotype{$linkid};
1239 $newrelation .= ";" . $linktype;
1240 }
1241 $newrelation .= "," . $linkid;
1242 }
1243 $newrelation .= ";";
1244
1245
1246 # output the new line
1247 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1248 }
1249 close TH;
1250 close TO;
1251
1252 undef %thesaurustophindid;
1253 undef %linkidtotype;
1254 undef %phindidtofrequency;
1255
1256 # 5.
1257 # Read thesaurus data (in phind-id format) into memory
1258 my %thesaurusdata;
1259
1260 open(TH, "<$newthesaurusfile");
1261 while(<TH>) {
1262 chomp;
1263 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1264 die unless ($phindid && $symbols);
1265 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1266 }
1267
1268 # 6.
1269 # Add thesaurus data to phrases file
1270 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1271 my (@documwents, @newexp, $k, $n);
1272 my $linenumber = 0;
1273
1274 open(IN, "<$infile");
1275 open(OUT, ">$outfile");
1276
1277 # Update existing phrases
1278 while(<IN>) {
1279
1280 chomp;
1281 @fields = split(/:/, $_);
1282
1283 # get data for this line
1284 $phindid = shift @fields;
1285
1286 # output the phrase data, with thesaurus information
1287 print OUT "$phindid:", join(":", @fields);
1288
1289 # add thesaurus data
1290 if (defined($thesaurusdata{$phindid})) {
1291 @fields = split(/:/, $thesaurusdata{$phindid});
1292 shift @fields;
1293 $linkcounter = shift @fields;
1294 $relations = shift @fields;
1295
1296 print OUT ":$linkcounter:$relations";
1297 $thesaurusdata{$phindid} = "";
1298 }
1299 print OUT "\n";
1300 }
1301 close IN;
1302
1303 # Add phrases that aren't already in the file
1304 foreach $phindid (sort numerically keys %thesaurusdata) {
1305 next unless ($thesaurusdata{$phindid});
1306
1307 @fields = split(/:/, $thesaurusdata{$phindid});
1308 $symbols = shift @fields;
1309 $linkcounter = shift @fields;
1310 $relations = shift @fields;
1311
1312 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1313 }
1314 close OUT;
1315
1316}
1317
1318# restore_vocabulary_data
1319#
1320# Read phrases.3 and restore vocabulary information. Then write
1321# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1322# (if requested) to the saved phrases file.
1323
1324sub restore_vocabulary_data {
1325 my ($self) = @_;
1326
1327 my $out = $self->{'outhandle'};
1328 my $verbosity = $self->{'verbosity'};
1329 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1330
1331 my $phinddir = $self->{'phinddir'};
1332 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1333 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1334 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1335 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1336
1337 my $savephrases = $self->{'savephrases'};
1338
1339 # 1.
1340 # Read the vocabulary file
1341 open(V, "<$vocabfile")
1342 || die "Cannot open $vocabfile: $!";
1343 my @symbol;
1344 my $i = 1;
1345 while(<V>) {
1346 chomp;
1347 $symbol[$i++] = $_;
1348 }
1349
1350
1351 # 2.
1352 # Translate phrases.3 to MGPP input files
1353 my ($key, $text, $word, $isThesaurus);
1354 my @fields;
1355 my $linenumber = 0;
1356
1357 open(IN, "<$infile");
1358 open(DATA, ">$datafile");
1359 open(WORD, ">$wordfile");
1360
1361 # Save the phrases in a separate text file
1362 if ($savephrases) {
1363 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1364 open(SAVE, ">$savephrases");
1365 }
1366
1367 while(<IN>) {
1368
1369 # read the line
1370 chomp;
1371 $line = $_;
1372 @fields = split(/:/, $line);
1373
1374 # get a phrase number for this line
1375 $key = shift @fields;
1376
1377 # restore the text of the phrase
1378 $text = shift @fields;
1379 $text =~ s/s(\d+)/$symbol[$1]/g;
1380 if ($text =~ / /) {
1381 $word = "";
1382 } elsif ($text ne 'untranslated') {
1383 $word = $text;
1384 }
1385
1386 # output the phrase data
1387 print DATA "<Document>";
1388 print DATA "$key:$text:", join(":", @fields), ":\n";
1389
1390 # output the word index search data
1391 print WORD "<Document>$word\n";
1392
1393 # output the phrases to a text file
1394 if ($savephrases) {
1395 if ((scalar @fields) == 7) {
1396 $isThesaurus = 1;
1397 } else {
1398 $isThesaurus = 0;
1399 }
1400 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1401 }
1402 }
1403 close IN;
1404 close WORD;
1405 close DATA;
1406 close SAVE if ($savephrases);
1407
1408}
1409
1410
1411
1412# sort routines used to renumber phrases
1413
1414sub numerically { $a <=> $b }
1415
1416sub by_doc_frequency {
1417 my $fa = 1;
1418 if ($a =~ /,/) {
1419 $fa = $a;
1420 $fa =~ s/\d+,//;
1421 }
1422 my $fb = 1;
1423 if ($b =~ /,/) {
1424 $fb = $b;
1425 $fb =~ s/\d+,//;
1426 }
1427
1428 return ($fb <=> $fa);
1429}
1430
14311;
Note: See TracBrowser for help on using the repository browser.