source: trunk/gsdl/perllib/classify/phind.pm@ 3400

Last change on this file since 3400 was 3244, checked in by jrm21, 22 years ago

we no longer exit with an error if the suffix program failed to create
necessary files. If suffix returns non-zero we will still exit, but exiting
means the build failed.

  • Property svn:keywords set to Author Date Id Revision
File size: 38.8 KB
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind classifier plugin.
30# Options are described in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40my @removedirs = ();
41
42my %wanted_index_files = ('td'=>1,
43 't'=>1,
44 'ti'=>1,
45 'tl'=>1,
46 'tsd'=>1,
47 'idb'=>1,
48 'ib1'=>1,
49 'ib2'=>1,
50 'ib3'=>1,
51 'i'=>1,
52 'il'=>1,
53 'w'=>1,
54 'wa'=>1);
55
56sub BEGIN {
57 @ISA = ('BasClas');
58}
59
60sub END {
61
62 # Tidy up stray files - we do this here as there's some weird problem
63 # preventing us from doing it in the get_classify_info() function (on
64 # windows at least) where the close() appears to fail on txthandle and
65 # dochandle, thus preventing us from deleting those files
66
67 foreach my $dir (@removedirs) {
68 if (-d $dir && opendir (DIR, $dir)) {
69 my @files = readdir DIR;
70 closedir DIR;
71
72 foreach $file (@files) {
73 next if $file =~ /^\.\.?$/;
74 my ($suffix) = $file =~ /\.([^\.]+)$/;
75 if (!defined $suffix || !defined $wanted_index_files{$suffix}) {
76 # delete it!
77 &util::rm (&util::filename_cat ($dir, $file));
78 }
79 }
80 }
81 }
82}
83
84sub print_usage {
85 print STDERR "
86 usage: classify phind [options]
87
88 options:
89 -text Fields The text used to build the phrase hierarchy.
90 (default: 'section:Title,section:text')
91
92 -title Title The metadata field used to describe each document.
93 (default: 'Title')
94
95 -button Name The label for the classifier screen and button in
96 the navigation bar.
97 (default: 'Phrase')
98
99 -language Regex Language or languages to use when building the hierarchy.
100 Languages are identified by two-letter language codes
101 like en (English), es (Spanish), and fr (French).
102 Language is a regular expression, so 'en|fr' (English or
103 French) and '..' (match any language) are valid.
104 (default: 'en')
105
106 -savephrases File If set, the phrase information will be stored in
107 the given file as text. It is probably a good idea
108 to use an absolute path.
109 (default: not set)
110
111 -suffixmode N The smode parameter to the phrase extraction program. A
112 value of 0 means that stopwords are ignored, and a value of 1
113 means that stopwords are used.
114 (default: 1)
115
116 -thesaurus Name Name of a thesaurus stored in phind format in the
117 collection's etc directory.
118 (default: not set)
119
120 -untidy Don't remove working files.
121
122"; }
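# As a rough illustration only (not taken from any real collection), a
# collect.cfg entry using the options above might look like:
#
#   classify phind -text section:Title,section:text -title Title -button Phrase -language en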
123
124# Phrase delimiter symbols - these should be abstracted out someplace
125
126my $colstart = "COLLECTIONSTART";
127my $colend = "COLLECTIONEND";
128my $doclimit = "DOCUMENTLIMIT";
129my $senlimit = "SENTENCELIMIT";
130my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
131
132
133# Create a new phind browser based on collect.cfg
134
135sub new {
136 my $class = shift (@_);
137 my $self = new BasClas($class, @_);
138
139 my $out = $self->{'outhandle'};
140
141 # Ensure the Phind generate scripts are in place
142 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
143 $file1 .= ".exe" if $ENV{'GSDLOS'} =~ /^windows$/;
144 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
145
146 if (!(-e $file1)) {
147 print STDERR "phind.pm: ERROR: The phind \"suffix\" program is not installed.\n\n";
148 exit(1);
149 }
150
151 # Parse classifier arguments
152 my $builddir = "";
153 if (!parsargv::parse(\@_,
154 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
155 q^title/.*/Title^, \$self->{'titlefield'},
156 q^button/.*/Phrase^, \$self->{'buttonname'},
157 q^language/.*/en^, \$language,
158 q^builddir/.*/^, \$builddir,
159 q^savephrases/.*/^, \$self->{'savephrases'},
160 q^suffixmode/\d/1^, \$self->{'suffixmode'},
161 q^thesaurus/.*/^, \$self->{'thesaurus'},
162 q^untidy^, \$self->{'untidy'},
163 "allow_extra_options")) {
164
165 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
166 &print_usage();
167 die "\n";
168 }
169
170 # classifier information
171 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
172
173 # limit languages
174 $self->{'language_exp'} = $language;
175
176 # collection directories
177 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
178 if (!$builddir) {
179 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
180 }
181 $self->{'builddir'} = $builddir;
182
183 $self->{'total'} = 0;
184
185 return bless $self, $class;
186}
187
188
189# Initialise the phind classifier
190
191sub init {
192 my $self = shift (@_);
193
194 # ensure we have a build directory
195 my $builddir = $self->{'builddir'};
196 die unless (-e "$builddir");
197
198 # create phind directory
199 my $phnumber = 1;
200 my $phinddir = &util::filename_cat($builddir, "phind1");
201 while (-e "$phinddir") {
202 $phnumber++;
203 $phinddir = &util::filename_cat($builddir, "phind$phnumber");
204 }
205 &util::mk_dir("$phinddir");
206 $self->{'phinddir'} = $phinddir;
207 $self->{'phindnumber'} = $phnumber;
208
209 push(@removedirs, $phinddir) unless $self->{'untidy'};
210
211 # open filehandles for documents and text
212 my $clausefile = &util::filename_cat("$phinddir", "clauses");
213 &util::rm($clausefile) if (-e $clausefile);
214
215 my $txthandle = 'TEXT' . $phnumber;
216 open($txthandle, ">$clausefile") || die "Cannot open $clausefile: $!";
217 $self->{'txthandle'} = $txthandle;
218
219 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
220 &util::rm($docfile) if (-e $docfile);
221
222 my $dochandle = 'DOC' . $phnumber;
223 open($dochandle, ">$docfile") || die "Cannot open $docfile: $!";
224 $self->{'dochandle'} = $dochandle;
225
226}
227
228
229# Classify each document.
230#
231# Each document is passed here in turn. The classifier extracts the
232# text of each and stores it in the clauses file. Document details are
233# stored in the docs.txt file.
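#
# As a rough sketch (the OID and title are invented), each docs.txt line has
# the tab-separated form
#   <Document>\tHASHa1b2c3d4\tSome Document Title
# while the clauses file holds the extracted text one clause per line, with
# each document preceded by the DOCUMENTLIMIT delimiter.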
234
235sub classify {
236 my $self = shift (@_);
237 my $doc_obj = shift @_;
238
239 my $verbosity = $self->{'verbosity'};
240 my $top_section = $doc_obj->get_top_section();
241
242 my $titlefield = $self->{'titlefield'};
243
244 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
245 print "process: $title\n" if ($verbosity > 2);
246
247 # Only consider the file if it is in the correct language
248 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
249 my $phrlanguage = $self->{'language_exp'};
250 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
251
252 # record this file
253 $self->{'total'} ++;
254 print "file $self->{'total'}: $title\n" if ($verbosity);
255
256
257 # Store document details
258 my $OID = $doc_obj->get_OID();
259 $OID = "NULL" unless defined $OID;
260 my $dochandle = $self->{'dochandle'};
261 print $dochandle "<Document>\t$OID\t$title\n";
262
263 # Store the text occurring in this object
264
265 # output the document delimiter
266 my $txthandle = $self->{'txthandle'};
267 print $txthandle "$doclimit\n";
268
269 # iterate over the required indexes and store their text
270 my $indexes = $self->{'indexes'};
271 my $text = "";
272 my ($part, $level, $field, $section, $data, $dataref);
273
274 foreach $part (split(/,/, $indexes)) {
275
276 # Each field has a level and a data element (e.g. document:Title)
277 ($level, $field) = split(/:/, $part);
278 die unless ($level && $field);
279
280 # Extract the text from every section
281 # (In phind, document:text and section:text are equivalent)
282 if ($field eq "text") {
283 $data = "";
284 $section = $doc_obj->get_top_section();
285 while (defined($section)) {
286 $data .= $doc_obj->get_text($section) . "\n";
287 $section = $doc_obj->get_next_section($section);
288 }
289 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
290 }
291
292 # Extract a metadata field from a document
293 # (If there is more than one element of the given type, get them all.)
294 elsif ($level eq "document") {
295 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
296 foreach $data (@$dataref) {
297 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
298 }
299 }
300
301 # Extract metadata from every section in a document
302 elsif ($level eq "section") {
303 $data = "";
304 $section = $doc_obj->get_top_section();
305 while (defined($section)) {
306 $dataref = $doc_obj->get_metadata($section, $field);
307 $data .= join("\n", @$dataref) . "\n";
308 $section = $doc_obj->get_next_section($section);
309 }
310 $text .= convert_gml_to_tokens($phrlanguage, $data) . "\n";
311 }
312
313 # Some sort of specification which I don't understand
314 else {
315 die "Unknown level ($level) in phind index ($part)\n";
316 }
317
318 }
319
320 # output the text
321 $text =~ tr/\n//s;
322 print $txthandle "$text";
323}
324
325
326# Construct the classifier from the information already gathered
327#
328# When get_classify_info is called, the clauses and docs.txt files have
329# already been constructed in the phind directory. This function will
330# translate them into compressed, indexed MGPP files that can be read by
331# the phindcgi script. It will also register our classifier so that it
332# shows up in the navigation bar.
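#
# Concretely, the mgpp_* commands below build three MGPP databases in the
# phind directory: "pdata" (the phrase data), "pword" (a word-level index
# into the phrases) and "docs" (document OIDs and titles).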
333
334sub get_classify_info {
335 my $self = shift (@_);
336
337 close $self->{'dochandle'};
338 close $self->{'txthandle'};
339 my $verbosity = $self->{'verbosity'};
340 my $out = $self->{'outhandle'};
341 my $phinddir = $self->{'phinddir'};
342
343 my $osextra = "";
344 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
345 $osextra = " -d /";
346 }
347
348 if ($verbosity) {
349 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
350 print $out "*** in ", $self->{'phinddir'}, "\n";
351 }
352
353 # Construct phind indexes
354 my $suffixmode = $self->{'suffixmode'};
355 my ($command, $status);
356
357 # Generate the vocabulary, symbol statistics, and numbers file
358 # from the clauses file
359 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
360 &extract_vocabulary($self);
361
362 # Use the suffix program to generate the phind/phrases file
363 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
364 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out);
365
366
367 # check that we generated some files. It's not necessarily an error if
368 # we didn't (execute() would have quit on error), but we can't go on.
369 my $phrasesfile=&util::filename_cat($self->{'phinddir'}, 'phrases');
370 if (! -r $phrasesfile) {
371 print $out "\nNo phrases found for phind classifier!\n";
372 return;
373 }
374
375 # Create the phrase file and put phrase numbers in phind/phrases
376 print $out "\nSorting and renumbering phrases for input to mgpp\n" if $verbosity;
377 &renumber_phrases($self);
378
379 print $out "\nCreating phrase databases\n";
380 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
381 my $mg_stem = &util::filename_cat($phinddir, "pdata");
382
383 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
384 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
385 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
386
387 # create the mg index of words
388 print $out "\nCreating word-level search indexes\n";
389 $mg_input = &util::filename_cat($phinddir, "pword.txt");
390 $mg_stem = &util::filename_cat($phinddir, "pword");
391
392 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 -I1 \"$mg_input\"", $verbosity, $out);
393 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
394 &execute("mgpp_perf_hash_build $osextra -f \"$mg_stem\"", $verbosity, $out);
395 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 -I2 \"$mg_input\"", $verbosity, $out);
396 &execute("mgpp_weights_build $osextra -f \"$mg_stem\"", $verbosity, $out);
397 &execute("mgpp_invf_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
398
399 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 1", $verbosity, $out);
400 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 2", $verbosity, $out);
401 &execute("mgpp_stem_idx $osextra -f \"$mg_stem\" -s 3", $verbosity, $out);
402
403 # create the mg document information database
404 print $out "\nCreating document information databases\n";
405 $mg_input = &util::filename_cat($phinddir, "docs.txt");
406 $mg_stem = &util::filename_cat($phinddir, "docs");
407
408 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T1 \"$mg_input\"", $verbosity, $out);
409 &execute("mgpp_compression_dict $osextra -f \"$mg_stem\"", $verbosity, $out);
410 &execute("mgpp_passes $osextra -f \"$mg_stem\" -T2 \"$mg_input\"", $verbosity, $out);
411
412 # Return the information about the classifier that we'll later want to
413 # use to create macros when the Phind classifier document is displayed.
414 my %classifyinfo = ('thistype'=>'Invisible',
415 'childtype'=>'Phind',
416 'Title'=>$self->{'buttonname'},
417 'parameters'=>"phindnumber=$self->{'phindnumber'}",
418 'contains'=>[]);
419
420 my $collection = $self->{'collection'};
421 my $url = "library?a=p&p=phind&c=$collection";
422 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
423
424 return \%classifyinfo;
425}
426
427
428
429sub convert_gml_to_tokens {
430
431 my ($language_exp, $text) = @_;
432
433 # escape any magic words... - jrm21
434 foreach my $delim (@delimiters) {
435 my $replacement=lc($delim);
436 my $num= $text=~ s/$delim/$replacement/g;
437 if (!$num) {$num=0;}
438 }
439
440 if ($language_exp =~ /en/) {
441 return &convert_gml_to_tokens_EN($text);
442 }
443
444 $_ = $text;
445
446 # 1. remove GML tags
447
448 # Remove everything that is in a tag
449 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
450 s/\s*<br>\s*/ LINEBREAK /isgo;
451 s/<[^>]*>/ /sgo;
452
453 # Now we have the text, but it may contain HTML
454 # elements coded as &gt; etc. Decode these entities.
455 s/&amp;/&/sgo;
456 s/&lt;/</sgo;
457 s/&gt;/>/sgo;
458 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
459 s/\s*<br>\s*/ LINEBREAK /isgo;
460 s/<[^>]*>/ /sgo;
461
462 # replace <p> and <br> placeholders with clause break symbol (\n)
463 s/\s+/ /gso;
464 s/PARAGRAPHBREAK/\n/sgo;
465 s/LINEBREAK/\n/sgo;
466
467
468
469
470 # 2. Split the remaining text into space-delimited tokens
471
472 # Convert entities to their UTF8 equivalents
473 s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
474
475 # Split text at word boundaries
476 s/\b/ /go;
477
478 # 3. Convert the remaining text to "clause format"
479
480 # Insert newline if the end of a sentence is detected
481 # (delimiter is: "[\.\?\!]\s")
482 # s/\s*[\.\?\!]\s+/\n/go;
483
484 # remove unnecessary punctuation and replace with clause break symbol (\n)
485 s/[^\w ]/\n/go;
486
487 # remove extraneous whitespace
488 s/ +/ /sgo;
489 s/^\s+//mgo;
490 s/\s*$/\n/mgo;
491
492 # remove lines that contain one word or less
493 s/^\S*$//mgo;
494 s/^\s*$//mgo;
495 tr/\n//s;
496
497 return $_;
498}
499
500# A version of convert_gml_to_tokens that is fine-tuned to the English language.
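#
# As a rough, invented example, the fragment
#   <p>The cat sat on the mat. It purred!</p>
# would come out as something close to two clauses, one per line:
#   The cat sat on the mat
#   It purred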
501
502sub convert_gml_to_tokens_EN {
503 $_ = shift @_;
504
505 # FIRST, remove GML tags
506
507 # Replace all whitespace with a simple space
508 s/\s+/ /gs;
509
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Decode these entities.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace <p> and <br> placeholders with newlines
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536 # Exceptional punctuation
537 #
538 # We make special cases of some punctuation
539
540 # remove any apostrophe that indicates omitted letters
541 s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543 # remove periods that appear in a person's initials
544 s/\s([A-Z])\./ $1 /g;
545
546 # replace hyphens in hyphenated words and names with a space
547 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549 # Convert the remaining text to "clause format".
550 # This means removing all excess punctuation and garbage text,
551 # normalising valid punctuation to full stops and commas,
552 # then putting one clause on each line.
553
554 # Insert newline when the end of a sentence is detected
555 # (delimiter is: "[\.\?\!]\s")
556 s/\s*[\.\?\!]\s+/\n/g;
557
558 # split numbers after four digits
559 s/(\d\d\d\d)/$1 /g;
560
561 # split words after 32 characters
562
563 # squash repeated punctuation
564 tr/A-Za-z0-9 //cs;
565
566 # save email addresses
567 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569 # normalise clause breaks (mostly punctuation symbols) to commas
570 s/[^A-Za-z0-9 \n]+/ , /g;
571
572 # Remove repeated commas, and replace with newline
573 s/\s*,[, ]+/\n/g;
574
575 # remove extra whitespace
576 s/ +/ /sg;
577 s/^\s+//mg;
578 s/\s*$/\n/mg;
579
580 # remove lines that contain one word or less
581 s/^\w*$//mg;
582 s/^\s*$//mg;
583 tr/\n//s;
584
585 return $_;
586
587}
588
589
590
591# Execute a system command
592
593sub execute {
594 my ($command, $verbosity, $outhandle) = @_;
595 print $outhandle "Executing: $command\n" if ($verbosity > 2);
596 $! = 0;
597 my $status = system($command);
598 if ($status != 0) {
599 print STDERR "phind - Error executing '$command': $!\n";
600 exit($status); # this causes the build to fail...
601 }
602}
603
604
605# Generate the vocabulary, symbol statistics, and numbers file from the
606# clauses file. This is legacy code, so it is a bit messy and probably won't
607# run under Windows.
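#
# Roughly speaking, it writes three files into the phind working directory:
#   clauses.vocab    one "best form" word per line; the line number is the
#                    word's symbol number
#   clauses.stats    the symbol-number ranges (delimiters, stopwords,
#                    thesaurus terms, content words)
#   clauses.numbers  the clauses file rewritten with each token replaced by
#                    its symbol number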
608
609sub extract_vocabulary {
610 my ($self) = @_;
611
612 my $verbosity = $self->{'verbosity'};
613 my $out = $self->{'outhandle'};
614
615 my $collectiondir = $self->{'collectiondir'};
616 my $phinddir = $self->{'phinddir'};
617
618 my $language_exp = $self->{'language_exp'};
619
620 my ($w, $l, $line, $word);
621
622 my ($first_delimiter, $last_delimiter,
623 $first_stopword, $last_stopword,
624 $first_extractword, $last_extractword,
625 $first_contentword, $last_contentword,
626 $phrasedelimiter);
627
628 my $thesaurus = $self->{'thesaurus'};
629 my ($thesaurus_links, $thesaurus_terms,
630 %thesaurus, $first_thesaurusword, $last_thesaurusword);
631
632 my %symbol;
633 my (%freq);
634
635 print $out "Calculating vocabulary\n" if ($verbosity > 1);
636
637 # Read and store the stopwords
638 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword");
639 my $stopword_files = ();
640 my ($language, $language_dir, $file, $file_name);
641 my %stopwords;
642
643 # Examine each directory in the stopword directory
644 opendir(STOPDIR, $stopdir);
645 foreach $language (readdir STOPDIR) {
646
647 # Ignore entries that do not match the classifier's language
648 next unless ($language =~ /$language_exp/);
649 $language_dir = &util::filename_cat($stopdir, $language);
650 next unless (-d "$language_dir");
651
652 opendir(LANGDIR, $language_dir);
653 foreach $file (readdir LANGDIR) {
654
655 # Ignore entries that are not stopword files
656 next unless ($file =~ /sw$/);
657 $file_name = &util::filename_cat($language_dir, $file);
658 next unless (-f "$file_name");
659
660 # Read the stopwords
661 open(STOPFILE, "<$file_name");
662 while (<STOPFILE>) {
663 s/^\s+//;
664 s/\s.*//;
665 $word = $_;
666 $l = lc($word);
667 $stopwords{$l} = $word;
668 }
669 close STOPFILE;
670
671 }
672 closedir LANGDIR;
673 }
674 closedir STOPDIR;
675
676 # Read thesaurus information
677 if ($thesaurus) {
678
679 # link file exists
680 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
681 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
682
683 # ensure term file exists in the correct language
684 if ($language_exp =~ /^([a-z][a-z])/) {
685 $language = $1;
686 } else {
687 $language = 'en';
688 }
689 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
690 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
691
692
693 # Read the thesaurus terms
694 open(TH, "<$thesaurus_terms");
695 while(<TH>) {
696 s/^\d+ //;
697 s/\(.*\)//;
698 foreach $w (split(/\s+/, $_)) {
699 $thesaurus{lc($w)} = $w;
700 }
701 }
702 close TH;
703 }
704
705 # Read words in the text and count occurrences
706 open(TXT, "<$phinddir/clauses");
707
708 my @words;
709 while(<TXT>) {
710 $line = $_;
711 next unless ($line =~ /./);
712
713 @words = split(/\s+/, $line);
714 foreach $w (@words) {
715 $l = lc($w);
716 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
717 $freq{$w}++;
718 }
719 $freq{$senlimit}++;
720 }
721
722 close TXT;
723
724 # Calculate the "best" form of each word
725 my (%bestform, %totalfreq, %bestfreq);
726
727 foreach $w (sort (keys %freq)) {
728 $l = lc($w);
729
730 # totalfreq is the number of times a term appears in any form
731 $totalfreq{$l} += $freq{$w};
732
733 if (defined $stopwords{$l}) {
734 $bestform{$l} = $stopwords{$l};
735
736 } elsif (defined $thesaurus{$l}) {
737 $bestform{$l} = $thesaurus{$l};
738
739 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
740 $bestfreq{$l} = $freq{$w};
741 $bestform{$l} = $w;
742 }
743 }
744 undef %freq;
745 undef %bestfreq;
746
747
748 # Assign symbol numbers to tokens
749 my $nextsymbol = 1;
750 my (@vocab);
751
752 # Delimiters
753 $first_delimiter = 1;
754
755 foreach $word (@delimiters) {
756
757# $word = lc($word); # jrm21
758 $word = uc($word);
759 $bestform{$word} = $word;
760 $vocab[$nextsymbol] = $word;
761 $symbol{$word} = $nextsymbol;
762 $nextsymbol++;
763 }
764 $last_delimiter = $nextsymbol - 1;
765 # Stopwords
766 $first_stopword = $nextsymbol;
767
768 foreach my $word (sort keys %stopwords) {
769 # don't include stopword unless it occurs in the text
770 $word = lc($word);
771 next unless ($totalfreq{$word});
772 next if ($symbol{$word});
773
774 $vocab[$nextsymbol] = $word;
775 $symbol{$word} = $nextsymbol;
776 $nextsymbol++;
777 }
778 $last_stopword = $nextsymbol - 1;
779 $first_contentword = $nextsymbol;
780
781 # Thesaurus terms
782 if ($thesaurus) {
783 $first_thesaurusword = $nextsymbol;
784
785 foreach my $word (sort keys %thesaurus) {
786
787 $word = lc($word);
788 next if ($symbol{$word});
789 $bestform{$word} = $thesaurus{$word};
790
791 $vocab[$nextsymbol] = $word;
792 $symbol{$word} = $nextsymbol;
793 $nextsymbol++;
794
795 }
796 $last_thesaurusword = $nextsymbol - 1;
797 }
798
799 # Other content words
800 $first_extractword = $nextsymbol;
801
802 foreach my $word (sort (keys %bestform)) {
803
804 next if ($symbol{$word});
805
806 $vocab[$nextsymbol] = $word;
807 $symbol{$word} = $nextsymbol;
808 $nextsymbol++;
809 }
810 $last_extractword = $nextsymbol - 1;
811 $last_contentword = $nextsymbol - 1;
812
813 # Output the words
814 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
815 open(VOC, ">$phinddir/clauses.vocab");
816
817 for (my $i = 1; $i < $nextsymbol; $i++) {
818 $w = $vocab[$i];
819
820 print VOC "$bestform{$w}\n";
821 $totalfreq{$w} = 0 unless ($totalfreq{$w});
822 }
823 close VOC;
824
825
826 # Create statistics file
827 # Output statistics about the vocabulary
828 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
829 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
830
831 open(STAT, ">$phinddir/clauses.stats")
832 || die "Cannot open $phinddir/clauses.stats: $!";
833
834 print STAT "first_delimiter $first_delimiter\n";
835 print STAT "last_delimiter $last_delimiter\n";
836 print STAT "first_stopword $first_stopword\n";
837 print STAT "last_stopword $last_stopword\n";
838 if ($thesaurus) {
839 print STAT "first_thesaurusword $first_thesaurusword\n";
840 print STAT "last_thesaurusword $last_thesaurusword\n";
841 }
842 print STAT "first_extractword $first_extractword\n";
843 print STAT "last_extractword $last_extractword\n";
844 print STAT "first_contentword $first_contentword\n";
845 print STAT "last_contentword $last_contentword\n";
846 print STAT "first_symbol $first_delimiter\n";
847 print STAT "last_symbol $last_contentword\n";
848 print STAT "first_word $first_stopword\n";
849 print STAT "last_word $last_contentword\n";
850 close STAT;
851
852 undef @vocab;
853
854
855 # Create numbers file
856 # Save text as symbol numbers
857 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
858
859 open(TXT, "<$phinddir/clauses");
860 open(NUM, ">$phinddir/clauses.numbers");
861
862## $phrasedelimiter = $symbol{lc($senlimit)}; # jrm21
863## print NUM "$symbol{lc($colstart)}\n"; # jrm21
864 $phrasedelimiter = $symbol{$senlimit};
865 print NUM "$symbol{$colstart}\n";
866
867 # set up the special symbols that delimit documents and sentences
868 while(<TXT>) {
869
870 # split sentence into a list of tokens
871 $line = $_;
872 next unless ($line =~ /./);
873 @words = split(/\s+/, $line);
874
875 # output one token at a time
876 foreach $word (@words) {
877# don't lower-case special delimiters - jrm21
878 if (!map {if ($word eq $_) {1} else {()}} @delimiters) {
879 $word = lc($word);
880 }
881 print NUM "$symbol{$word}\n";
882 }
883
884 # output phrase delimiter
885 print NUM "$phrasedelimiter\n";
886 }
887
888 close TXT;
889# print NUM "$symbol{lc($colend)}\n";# jrm21
890 print NUM "$symbol{$colend}\n";
891 close NUM;
892
893 # Save thesaurus data in one convenient file
894 if ($thesaurus) {
895
896 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
897
898
899 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
900 if ($verbosity > 1);
901
902 # Read the thesaurus terms
903 my ($num, $text, %thes_symbols);
904
905 open(TH, "<$thesaurus_terms");
906 while(<TH>) {
907 chomp;
908 @words = split(/\s+/, $_);
909 $num = shift @words;
910 $text = "";
911
912 # translate words into symbol numbers
913 foreach $word (@words) {
914 $word = lc($word);
915 if ($symbol{$word}) {
916 $text .= "s$symbol{$word} ";
917 } elsif ($verbosity) {
918 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
919 }
920 }
921 $text =~ s/ $//;
922 $thes_symbols{$num} = $text;
923 }
924 close TH;
925
926 # Read the thesaurus links and write the corresponding data
927 open(TH, "<$thesaurus_links");
928 open(THOUT, ">$thesaurusfile");
929
930 while(<TH>) {
931 chomp;
932 ($num, $text) = split(/:/, $_);
933
934 if (defined($thes_symbols{$num})) {
935 print THOUT "$num:$thes_symbols{$num}:$text\n";
936 } else {
937 print THOUT "$num:untranslated:$text\n";
938 }
939 }
940 close TH;
941 close THOUT;
942 }
943
944
945
946
947}
948
949
950# renumber_phrases
951#
952# Prepare the phrases file to be input to mgpp. The biggest problem is
953# reconciling the phrase identifiers used by the suffix program (which
954# we'll call suffix-id numbers) with the numbers used in the thesaurus
955# (thesaurus-id) to create a common set of phind-id numbers (phind-id).
956# Phind-id numbers must be sorted by frequency of occurrence.
957#
958# Start creating a set of phind-id numbers from the sorted suffix-id
959# numbers and (if required) the thesaurus-id numbers. Then add any other
960# phrases occurring in the thesaurus.
961#
962# The last thing we have to do is restore the vocabulary information to the
963# phrase file so that the phrases are stored as words, not as symbol
964# numbers.
965
966# The original phrases file looks something like this:
967# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
968# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
969# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
970# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
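#
# Reading renumber_suffix_data below, the fields on each line are, in order:
# suffix-id, the phrase as symbol numbers, total frequency, expansion count,
# the expansions (as suffix-ids), document count, and the documents (with a
# per-document frequency after a comma, e.g. "d2240,2").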
971
972
973sub renumber_phrases {
974 my ($self) = @_;
975
976 renumber_suffix_data($self);
977 renumber_thesaurus_data($self);
978 restore_vocabulary_data($self);
979
980}
981
982
983
984# renumber_suffix_data
985#
986# Translate phrases file to phrases.2 using phind keys instead
987# of suffix keys and sorting the expansion data.
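#
# As a rough before/after sketch (the phind-id assignments are invented): if
# suffix-id 159396-1 is assigned phind-id 1 and its expansion 116149-2 gets
# phind-id 7, the first sample line in the renumber_phrases comment above
# would become roughly
#   1:s5175:4:1:3:7,:2240,2;2253;2254;
# in phrases.2.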
988
989sub renumber_suffix_data {
990 my ($self) = @_;
991
992 my $verbosity = $self->{'verbosity'};
993 my $out = $self->{'outhandle'};
994 print $out "Translate phrases: suffix-ids become phind-ids\n"
995 if ($verbosity);
996
997 my $phinddir = $self->{'phinddir'};
998 my $infile = &util::filename_cat($phinddir, 'phrases');
999 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
1000
1001 # Read the phrase file. Calculate initial set of phind-id
1002 # numbers and store (suffixid -> frequency) relation.
1003
1004 my %suffixtophind;
1005 my @totalfrequency;
1006 my (@fields, $suffixid);
1007 my $nextphind = 1;
1008
1009 open(IN, "<$infile");
1010 while(<IN>) {
1011
1012 chomp;
1013 @fields = split(/:/, $_);
1014
1015 # get next suffixid and phindid
1016 $suffixid = shift @fields;
1017 $suffixtophind{$suffixid} = $nextphind;
1018
1019 # store total frequency
1020 shift @fields;
1021 $totalfrequency[$nextphind] = shift @fields;
1022
1023 $nextphind++;
1024 }
1025 close IN;
1026
1027
1028 # Translate phrases file to phrases.2. Use phind keys (not suffix
1029# keys), sort expansion and document occurrence data in order of
1030# descending frequency.
1031 open(IN, "<$infile");
1032 open(OUT, ">$outfile");
1033
1034 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1035 my (@documents, @newexp, $k, $n);
1036 my $linenumber = 0;
1037
1038 while(<IN>) {
1039
1040 # read the line
1041 chomp;
1042 @fields = split(/:/, $_);
1043
1044 # get a phrase number for this line
1045 $suffixid = shift @fields;
1046 die unless (defined($suffixtophind{$suffixid}));
1047 $phindid = $suffixtophind{$suffixid};
1048
1049 # get the symbols in the phrase
1050 $text = shift @fields;
1051
1052 # output status information
1053 $linenumber++;
1054 if ($verbosity > 2) {
1055 if ($linenumber % 1000 == 0) {
1056 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1057 }
1058 print $out "$linenumber: $phindid\t($text)\n" if ($verbosity > 3);
1059 }
1060
1061 # get the phrase frequency
1062 $tf = shift @fields;
1063
1064 # get the number of expansions
1065 $countexp = shift @fields;
1066
1067 # get the expansions, convert them into phind-id numbers, and sort them
1068 $expansions = shift @fields;
1069 @newexp = ();
1070 foreach $k (split(/,/, $expansions)) {
1071 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1072 $n = $suffixtophind{$k};
1073 push @newexp, $n;
1074 }
1075 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1076
1077 # get the number of documents
1078 $countdocs = shift @fields;
1079
1080 # get the documents and sort them
1081 $documents = shift @fields;
1082 $documents =~ s/d//g;
1083 @documents = split(/;/, $documents);
1084 @documents = sort by_doc_frequency @documents;
1085
1086 # output the phrase data
1087 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1088 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1089
1090 }
1091
1092 close IN;
1093 close OUT;
1094}
1095
1096
1097# renumber_thesaurus_data
1098#
1099# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1100
1101sub renumber_thesaurus_data {
1102 my ($self) = @_;
1103
1104 my $out = $self->{'outhandle'};
1105 my $verbosity = $self->{'verbosity'};
1106 my $thesaurus = $self->{'thesaurus'};
1107
1108 my $phinddir = $self->{'phinddir'};
1109 my $infile = &util::filename_cat($phinddir, "phrases.2");
1110 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1111
1112
1113 # If no thesaurus is defined, simply move the phrases file.
1114 if (!$thesaurus) {
1115 print $out "Translate phrases.2: no thesaurus data\n"
1116 if ($verbosity);
1117 &util::mv($infile, $outfile);
1118 return;
1119 }
1120
1121 print $out "Translate phrases.2: add thesaurus data\n"
1122 if ($verbosity);
1123
1124 # 1.
1125 # Read thesaurus file and store (symbols->thesaurusid) mapping
1126 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1127 my %symbolstothesid;
1128 my (@fields, $thesid, $symbols);
1129
1130 open(TH, "<$thesaurusfile");
1131
1132 while (<TH>) {
1133
1134 chomp;
1135 @fields = split(/:/, $_);
1136
1137 # get id and text
1138 $thesid = shift @fields;
1139 $symbols = shift @fields;
1140 $symbolstothesid{$symbols} = $thesid;
1141 }
1142 close TH;
1143
1144 # 2.
1145 # Read phrases file to find thesaurus entries that already
1146 # have a phindid. Store their phind-ids for later translation,
1147 # and store their frequency for later sorting.
1148 my %thesaurustophindid;
1149 my %phindidtofrequency;
1150 my ($phindid, $freq);
1151
1152 open(IN, "<$infile");
1153
1154 while(<IN>) {
1155
1156 chomp;
1157 @fields = split(/:/, $_);
1158
1159 # phindid and symbols for this line
1160 $phindid = shift @fields;
1161 $symbols = shift @fields;
1162 $freq = shift @fields;
1163
1164 # do we have a thesaurus id corresponding to this phrase?
1165 if (defined($symbolstothesid{$symbols})) {
1166 $thesid = $symbolstothesid{$symbols};
1167 $thesaurustophindid{$thesid} = $phindid;
1168 $phindidtofrequency{$phindid} = $freq;
1169 }
1170 }
1171 close IN;
1172
1173 undef %symbolstothesid;
1174
1175 # 3.
1176 # Create phind-id numbers for remaining thesaurus entries,
1177 # and note that their frequency is 0 for later sorting.
1178 my $nextphindid = $phindid + 1;
1179
1180 open(TH, "<$thesaurusfile");
1181 while(<TH>) {
1182
1183 chomp;
1184 @fields = split(/:/, $_);
1185
1186 # read thesaurus-id and ensure it has a corresponding phind-id
1187 $thesid = shift @fields;
1188 if (!defined($thesaurustophindid{$thesid})) {
1189 $thesaurustophindid{$thesid} = $nextphindid;
1190 $phindidtofrequency{$nextphindid} = 0;
1191 $nextphindid++;
1192 }
1193 }
1194 close TH;
1195
1196 # 4.
1197 # Translate thesaurus file, replacing thesaurus-id numbers with
1198 # phind-id numbers.
1199 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1200 my ($relations, $linkcounter, $linktext, $linktype, @linkdata);
1201 my (@links, $linkid, %linkidtotype, $newrelation);
1202
1203 open(TH, "<$thesaurusfile");
1204 open(TO, ">$newthesaurusfile");
1205 while(<TH>) {
1206
1207 chomp;
1208 @fields = split(/:/, $_);
1209
1210 # phindid and symbols for this line
1211 ($thesid, $symbols, $relations) = @fields;
1212
1213 die unless ($thesid && $symbols);
1214 die unless $thesaurustophindid{$thesid};
1215 $phindid = $thesaurustophindid{$thesid};
1216
1217 # convert each part of the relation string to use phind-id numbers
1218 # at the same time, we want to sort the list by frequency.
1219 undef %linkidtotype;
1220
1221 foreach $linktext (split(/;/, $relations)) {
1222 @linkdata = split(/,/, $linktext);
1223
1224 # remember the linktype (e.g. BT, NT)
1225 $linktype = shift @linkdata;
1226
1227 # store the type of each link
1228 foreach $thesid (@linkdata) {
1229 die unless (defined($thesaurustophindid{$thesid}));
1230 $linkidtotype{$thesaurustophindid{$thesid}} = $linktype;
1231 }
1232 }
1233
1234 # sort the list of links, first by frequency, then by type.
1235 @links = sort { ($phindidtofrequency{$b} <=> $phindidtofrequency{$a})
1236 or ($linkidtotype{$a} cmp $linkidtotype{$b}) } (keys %linkidtotype);
1237 $linkcounter = (scalar @links);
1238
1239 # create a string describing the link information
1240 $linktype = $linkidtotype{$links[0]};
1241 $newrelation = $linktype;
1242 foreach $linkid (@links) {
1243 if ($linkidtotype{$linkid} ne $linktype) {
1244 $linktype = $linkidtotype{$linkid};
1245 $newrelation .= ";" . $linktype;
1246 }
1247 $newrelation .= "," . $linkid;
1248 }
1249 $newrelation .= ";";
1250
1251
1252 # output the new line
1253 print TO "$phindid:$symbols:$linkcounter:$newrelation:\n";
1254 }
1255 close TH;
1256 close TO;
1257
1258 undef %thesaurustophindid;
1259 undef %linkidtotype;
1260 undef %phindidtofrequency;
1261
1262 # 5.
1263 # Read thesaurus data (in phind-id format) into memory
1264 my %thesaurusdata;
1265
1266 open(TH, "<$newthesaurusfile");
1267 while(<TH>) {
1268 chomp;
1269 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1270 die unless ($phindid && $symbols);
1271 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1272 }
1273 close TH;
1274
1275 # 6.
1276 # Add thesaurus data to phrases file
1277 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1278 my (@documents, @newexp, $k, $n);
1279 my $linenumber = 0;
1280
1281 open(IN, "<$infile");
1282 open(OUT, ">$outfile");
1283
1284 # Update existing phrases
1285 while(<IN>) {
1286
1287 chomp;
1288 @fields = split(/:/, $_);
1289
1290 # get data for this line
1291 $phindid = shift @fields;
1292
1293 # output the phrase data, with thesaurus information
1294 print OUT "$phindid:", join(":", @fields);
1295
1296 # add thesaurus data
1297 if (defined($thesaurusdata{$phindid})) {
1298 @fields = split(/:/, $thesaurusdata{$phindid});
1299 shift @fields;
1300 $linkcounter = shift @fields;
1301 $relations = shift @fields;
1302
1303 print OUT ":$linkcounter:$relations";
1304 $thesaurusdata{$phindid} = "";
1305 }
1306 print OUT "\n";
1307 }
1308 close IN;
1309
1310 # Add phrases that aren't already in the file
1311 foreach $phindid (sort numerically keys %thesaurusdata) {
1312 next unless ($thesaurusdata{$phindid});
1313
1314 @fields = split(/:/, $thesaurusdata{$phindid});
1315 $symbols = shift @fields;
1316 $linkcounter = shift @fields;
1317 $relations = shift @fields;
1318
1319 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1320 }
1321 close OUT;
1322
1323}
1324
1325# restore_vocabulary_data
1326#
1327# Read phrases.3 and restore vocabulary information. Then write
1328# this data to the MGPP input files (pword.txt and pdata.txt) and
1329# (if requested) to the saved phrases file.
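#
# As a rough sketch (invented values), a single-word phrase might come out as
#   pdata.txt:  <Document>42:library:<remaining fields>
#   pword.txt:  <Document>library
# where the s<number> symbol codes from phrases.3 have been replaced with the
# corresponding words from clauses.vocab.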
1330
1331sub restore_vocabulary_data {
1332 my ($self) = @_;
1333
1334 my $out = $self->{'outhandle'};
1335 my $verbosity = $self->{'verbosity'};
1336 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1337
1338 my $phinddir = $self->{'phinddir'};
1339 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1340 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1341 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1342 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1343
1344 my $savephrases = $self->{'savephrases'};
1345
1346 # 1.
1347 # Read the vocabulary file
1348 open(V, "<$vocabfile")
1349 || die "Cannot open $vocabfile: $!";
1350 my @symbol;
1351 my $i = 1;
1352 while(<V>) {
1353 chomp;
1354 $symbol[$i++] = $_;
1355 }
1356 close V;
1357
1358 # 2.
1359 # Translate phrases.3 to MGPP input files
1360 my ($key, $text, $word, $isThesaurus);
1361 my @fields;
1362 my $linenumber = 0;
1363
1364 open(IN, "<$infile");
1365 open(DATA, ">$datafile");
1366 open(WORD, ">$wordfile");
1367
1368 # Save the phrases in a separate text file
1369 if ($savephrases) {
1370 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1371 open(SAVE, ">$savephrases");
1372 }
1373
1374 while(<IN>) {
1375
1376 # read the line
1377 chomp;
1378 $line = $_;
1379 @fields = split(/:/, $line);
1380
1381 # get a phrase number for this line
1382 $key = shift @fields;
1383
1384 # restore the text of the phrase
1385 $text = shift @fields;
1386 $text =~ s/s(\d+)/$symbol[$1]/g;
1387 if ($text =~ / /) {
1388 $word = "";
1389 } elsif ($text ne 'untranslated') {
1390 $word = $text;
1391 }
1392
1393 # output the phrase data
1394 print DATA "<Document>";
1395 print DATA "$key:$text:", join(":", @fields), ":\n";
1396
1397 # output the word index search data
1398 print WORD "<Document>$word\n";
1399
1400 # output the phrases to a text file
1401 if ($savephrases) {
1402 if ((scalar @fields) == 7) {
1403 $isThesaurus = 1;
1404 } else {
1405 $isThesaurus = 0;
1406 }
1407 print SAVE $fields[0], "\t", $fields[2], "\t$isThesaurus\t$text\n";
1408 }
1409 }
1410 close IN;
1411 close WORD;
1412 close DATA;
1413 close SAVE if ($savephrases);
1414
1415}
1416
1417
1418
1419# sort routines used to renumber phrases
1420
1421sub numerically { $a <=> $b }
1422
1423sub by_doc_frequency {
1424 my $fa = 1;
1425 if ($a =~ /,/) {
1426 $fa = $a;
1427 $fa =~ s/\d+,//;
1428 }
1429 my $fb = 1;
1430 if ($b =~ /,/) {
1431 $fb = $b;
1432 $fb =~ s/\d+,//;
1433 }
1434
1435 return ($fb <=> $fa);
1436}
1437
14381;
Note: See TracBrowser for help on using the repository browser.