source: trunk/gsdl/perllib/classify/phind.pm@ 1897

Last change on this file since 1897 was 1897, checked in by paynter, 23 years ago

Convert_gml_into_tokens function a little more language tolerant,
and the thesaurus appriate to the classifier's language is used
when multiple languages are available.

  • Property svn:keywords set to Author Date Id Revision
File size: 37.9 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin.
30# Options are dexcribed in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37use ghtml;
38use unicode;
39
40sub BEGIN {
41 @ISA = ('BasClas');
42}
43
44
45sub print_usage {
46 print STDERR "
47 usage: classify phind [options]
48
49 options:
50 -text Fields The text used to build the phrase hierarchy.
51 (default: 'section:Title,section:text')
52
53 -title Title The metadata field used to describe each document.
54 (default: 'Title')
55
56 -button Name The label for the classifier screen and button in
57 navigation bar.
58 (default: 'Phrase')
59
60 -language Regex Language or languages to use building hierarchy.
61 Languages are identified by two-letter country codes
62 like en (English), es (Spanish), and fr (French).
63 Language is a regular expression, so 'en|fr' (English or
64 French) and '..' (match any language) are valid.
65 (default: 'en'.)
66
67 -savephrases File If set, the phrase infomation will be stored in
68 the given file as text. It is probably a good idea
69 to use an absolute path.
70 (defualt: not set)
71
72 -suffixmode N The smode parameter to the phrase extraction program. A
73 value of 0 means that stopwords are ignored, and of 1
74 means that stopwords are used.
75 (default: 1)
76
77 -thesaurus Name Name of a thesaurus stored in phind format in the
78 collection's etc directory.
79 (default: not set)
80
81 -untidy Don't remove working files.
82
83"; }
84
85
86# Phrase delimiter symbols - these should be abstracted out someplace
87
88my $colstart = "COLLECTIONSTART";
89my $colend = "COLLECTIONEND";
90my $doclimit = "DOCUMENTLIMIT";
91my $senlimit = "SENTENCELIMIT";
92my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
93
94
95# Create a new phind browser based on collect.cfg
96
97sub new {
98 my $class = shift (@_);
99 my $self = new BasClas($class, @_);
100
101 my $out = $self->{'outhandle'};
102
103
104 # Phind installation check
105 # The phind phrase browser is research software and is not installed
106 # by defualt. If the user attepts to use it we warn them that it's a
107 # bit dodgy, then tell them how to install it. If they can do that
108 # and get all the files in place, then we let them proceed.
109
110 print $out "Checking Phind phrase browser requirements...\n";
111
112 # Make sure we're not in windows
113 if ($ENV{'GSDLOS'} =~ /windows/i) {
114 print STDERR "Sorry - Phind currently only works under Unix";
115 exit(1);
116 }
117
118 # Ensure the Phind generate scripts are in place
119 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
120 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
121
122 if (!(-e $file1)) {
123 print STDERR "The phind \"suffix\" program is not installed. ";
124 print STDERR "To install it, change to the directory\n";
125 print STDERR " $src\n";
126 print STDERR "and type \"make install-phind\".\n\n";
127 exit(1);
128 }
129
130 # Ensure the Phind CGI script is in place
131 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
132 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
133
134 if (!(-e $file1)) {
135 print STDERR "The phind CGI program is not installed. ";
136 print STDERR "To install it, change to the directory\n";
137 print STDERR " $src\n";
138 print STDERR "and type \"make install-phind\".\n\n";
139 exit(1);
140 }
141
142 # Ensure the Phind Java applet is in place
143 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
144 $file1 = &util::filename_cat($src, "Phind.class");
145
146 if (!(-e $file1)) {
147 print STDERR "The phind Java classes are not compiled. ";
148 print STDERR "To compile them, change to the directory\n";
149 print STDERR " $src\n";
150 print STDERR "and use your Java compiler to compile Phind.java.\n";
151 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
152 exit(1);
153 }
154
155 # Parse classifier arguments
156 my $builddir = "";
157 my $phinddir = "";
158 if (!parsargv::parse(\@_,
159 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
160 q^title/.*/Title^, \$self->{'titlefield'},
161 q^button/.*/Phrase^, \$self->{'buttonname'},
162 q^language/.*/en^, \$language,
163 q^builddir/.*/^, \$builddir,
164 q^savephrases/\d/0^, \$self->{'savephrases'},
165 q^suffixmode/\d/1^, \$self->{'suffixmode'},
166 q^thesaurus/.*/^, \$self->{'thesaurus'},
167 q^untidy^, \$self->{'untidy'},
168 "allow_extra_options")) {
169
170 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
171 &print_usage();
172 die "\n";
173 }
174
175 # classifier information
176 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
177
178 # limit languages
179 $self->{'language_exp'} = $language;
180
181 # collection directories
182 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
183 if (!$builddir) {
184 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
185 }
186 $self->{'builddir'} = $builddir;
187 $self->{'phinddir'} = &util::filename_cat($builddir, "phind");
188
189 return bless $self, $class;
190}
191
192
193# Initialise the phind classifier
194
195sub init {
196 my $self = shift (@_);
197
198 # ensure we have a build directory
199 my $builddir = $self->{'builddir'};
200 die unless (-e "$builddir");
201
202 # create phind directory
203 my $phinddir = $self->{'phinddir'};
204 if (-e "$phinddir") {
205 &util::rm_r("$phinddir");
206 }
207 &util::mk_dir("$phinddir");
208
209 # open filehandles for documents and text
210 my $clausefile = &util::filename_cat("$phinddir", "clauses");
211 &util::rm($clausefile) if (-e $clausefile);
212 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
213 $self->{'txthandle'} = TEXT;
214
215 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
216 &util::rm($docfile) if (-e $docfile);
217 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
218 $self->{'dochandle'} = DOCS;
219
220}
221
222
223# Classify each document.
224#
225# Each document is passed here in turn. The classifier extracts the
226# text of each and stores it in the clauses file. Document details are
227# stored in the docs.txt file.
228
229sub classify {
230 my $self = shift (@_);
231 my ($doc_obj) = @_;
232
233 my $verbosity = $self->{'verbosity'};
234 my $top_section = $doc_obj->get_top_section();
235
236 my $titlefield = $self->{'titlefield'};
237
238 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
239 print "process: $title\n" if ($verbosity > 2);
240
241 # Only consider the file if it is in the correct language
242 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
243 my $phrlanguage = $self->{'language_exp'};
244 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
245
246 # record this file
247 my $total++;
248 print "file $total: $file\n" if ($self->{'$verbosity'});
249
250
251 # Store document details
252 my $OID = $doc_obj->get_OID();
253 $OID = "NULL" unless defined $OID;
254 my $dochandle = $self->{'dochandle'};
255 print $dochandle "<Document>\t$OID\t$title\n";
256
257 # Store the text occuring in this object
258
259 # output the document delimiter
260 my $txthandle = $self->{'txthandle'};
261 print $txthandle "$doclimit\n";
262
263 # iterarate over the required indexes and store their text
264 my $indexes = $self->{'indexes'};
265 my $text = "";
266 my ($part, $level, $field, $section, $data, $dataref);
267
268 foreach $part (split(/,/, $indexes)) {
269
270 # Each field has a level and a data element ((e.g. document:Title)
271 ($level, $field) = split(/:/, $part);
272 die unless ($level && $field);
273
274 # Extract the text from every section
275 # (In phind, document:text and section:text are equivalent)
276 if ($field eq "text") {
277 $data = "";
278 $section = $doc_obj->get_top_section();
279 while (defined($section)) {
280 $data .= $doc_obj->get_text($section) . "\n";
281 $section = $doc_obj->get_next_section($section);
282 }
283 $text .= convert_gml_to_tokens($data) . "\n";
284 }
285
286 # Extract a metadata field from a document
287 # (If ther eis more than one element of the given type, get them all.)
288 elsif ($level eq "document") {
289 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
290 foreach $data ($$dataref) {
291 $text .= convert_gml_to_tokens($data) . "\n";
292 }
293 }
294
295 # Extract metadata from every section in a document
296 elsif ($level eq "section") {
297 $data = "";
298 $section = $doc_obj->get_top_section();
299 while (defined($section)) {
300 $dataref .= $doc_obj->get_metadata($section, $field);
301 $data .= join("\n", $$dataref) . "\n";
302 $section = $doc_obj->get_next_section($section);
303 }
304 $text .= convert_gml_to_tokens($data) . "\n";
305 }
306
307 # Some sort of specification which I don't understand
308 else {
309 die "Unknown level ($level) in phind index ($part)\n";
310 }
311
312 }
313
314 # output the text
315 $text =~ tr/\n//s;
316 print $txthandle "$text";
317}
318
319
320# Construct the classifier from the information already gathered
321#
322# When get_classify_info is called, the clauses and docs.txt files have
323# already been constructed in the phind directory. This function will
324# translate them into compressed, indexed MGPP files that can be read by
325# the phindcgi script. It will also register our classifier so that it
326# shows up in the navigation bar.
327
328sub get_classify_info {
329 my $self = shift (@_);
330
331 my $verbosity = $self->{'verbosity'};
332 my $out = $self->{'outhandle'};
333 my $phinddir = $self->{'phinddir'};
334
335 if ($verbosity) {
336 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
337 }
338
339 # Construct phind indexes
340 my $suffixmode = $self->{'suffixmode'};
341 my ($command, $status);
342
343 # Generate the vocabulary, symbol statistics, and numbers file
344 # from the clauses file
345 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
346 &extract_vocabulary($self);
347
348 # Use the suffix program to generate the phind/phrases file
349 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
350 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
351
352 # Create the phrase file and put phrase numbers in phind/phrases
353 print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
354 &renumber_phrases($self);
355
356 # Create the mg phrase database
357 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
358 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
359 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
360
361 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
362 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
363 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
364 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
365
366 print $out "\nCreating phrase databases\n";
367 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
368 my $mg_stem = "pdata";
369
370 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
371 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
372 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
373
374 # create the mg index of words
375 print $out "\nCreating word-level search indexes\n";
376 $mg_input = &util::filename_cat($phinddir, "pword.txt");
377 $mg_stem = "pword";
378
379 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
380 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
381 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
382 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
383 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
384 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
385
386 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
387 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
388 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
389
390 # create the mg document information database
391 print $out "\nCreating document information databases\n";
392 $mg_input = &util::filename_cat($phinddir, "docs.txt");
393 $mg_stem = "docs";
394
395 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
396 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
397 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
398
399
400 # Tidy up stray files
401 if (!$self->{'untidy'}) {
402 print $out "\nCleaning up\n" if ($verbosity > 2);
403 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
404 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
405 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
406 "$phinddir/pdata.txt", "$phinddir/pword.txt");
407 my $outfile = 1;
408 while (-e "$phinddir/outPhrase.$outfile") {
409 &util::rm("$phinddir/outPhrase.$outfile");
410 $outfile++;
411 }
412 }
413
414
415 # Insert the classifier into.... what?
416 my $collection = $self->{'collection'};
417 my $url = "library?a=p&p=phind&c=$collection";
418
419 my %classifyinfo = ('thistype'=>'Invisible',
420 'childtype'=>'Phind',
421 'Title'=>$self->{'buttonname'},
422 'contains'=>[]);
423
424 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
425 return \%classifyinfo;
426}
427
428
429
430sub convert_gml_to_tokens {
431
432 my ($text) = @_;
433 my $language_exp = $self->{'language_exp'};
434
435 if ($language_exp =~ /en/) {
436 return convert_gml_to_tokens_EN($text);
437 }
438
439 # FIRST, remove GML tags
440 $_ = $text;
441
442 # Replace all whitespace with a simple space
443 s/\s+/ /gso;
444
445 # Remove everything that is in a tag
446 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
447 s/\s*<br>\s*/ LINEBREAK /isgo;
448 s/<[^>]*>/ /sgo;
449
450 # Now we have the text, but it may contain HTML
451 # elements coded as &gt; etc. Remove these tags.
452 s/&lt;/</sgo;
453 s/&gt;/>/sgo;
454
455 s/\s+/ /sgo;
456 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
457 s/\s*<br>\s*/ LINEBREAK /isgo;
458 s/<[^>]*>/ /sgo;
459
460 # remove &amp; and other miscellaneous markup tags
461 s/&amp;/&/sgo;
462 s/&lt;/</sgo;
463 s/&gt;/>/sgo;
464 s/&amp;/&/sgo;
465
466 # replace<p> and <br> placeholders with carriage returns
467 s/PARAGRAPHBREAK/\n/sgo;
468 s/LINEBREAK/\n/sgo;
469
470
471 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,0))/gse;
472
473
474 # Convert the remaining text to "clause format",
475
476 # This means removing all excess punctuation and garbage text,
477 # normalising valid punctuation to fullstops and commas,
478 # then putting one clause on each line.
479
480 # Insert newline when the end of a sentence is detected
481 # (delimter is: "[\.\?\!]\s")
482 s/\s*[\.\?\!]\s+/\n/go;
483
484 # split numbers after four digits
485 s/(\d\d\d\d)/$1 /go;
486
487 # remove extra whitespace
488 s/ +/ /sgo;
489 s/^\s+//mgo;
490 s/\s*$/\n/mgo;
491
492 # remove lines that contain one word or less
493 s/^\S*$//mgo;
494 s/^\s*$//mgo;
495 tr/\n//s;
496
497 return $_;
498}
499
500# A version of convert_gml_to_tokens that is fine-tuned to the English language.
501
502sub convert_gml_to_tokens_EN {
503 $_ = shift @_;
504
505 # FIRST, remove GML tags
506
507 # Replace all whitespace with a simple space
508 s/\s+/ /gs;
509
510 # Remove everything that is in a tag
511 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
512 s/\s*<br>\s*/ LINEBREAK /isg;
513 s/<[^>]*>/ /sg;
514
515 # Now we have the text, but it may contain HTML
516 # elements coded as &gt; etc. Remove these tags.
517 s/&lt;/</sg;
518 s/&gt;/>/sg;
519
520 s/\s+/ /sg;
521 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
522 s/\s*<br>\s*/ LINEBREAK /isg;
523 s/<[^>]*>/ /sg;
524
525 # remove &amp; and other miscellaneous markup tags
526 s/&amp;/&/sg;
527 s/&lt;/</sg;
528 s/&gt;/>/sg;
529 s/&amp;/&/sg;
530
531 # replace<p> and <br> placeholders with carriage returns
532 s/PARAGRAPHBREAK/\n/sg;
533 s/LINEBREAK/\n/sg;
534
535
536 # Exceptional punctuation
537 #
538 # We make special cases of some punctuation
539
540 # remove any apostrophe that indicates omitted letters
541 s/(\w+)\'(\w*\s)/ $1$2 /g;
542
543 # remove period that appears in a person's initals
544 s/\s([A-Z])\./ $1 /g;
545
546 # replace hyphens in hypheanted words and names with a space
547 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
548
549 # Convert the remaining text to "clause format",
550 # This means removing all excess punctuation and garbage text,
551 # normalising valid punctuation to fullstops and commas,
552 # then putting one cluse on each line.
553
554 # Insert newline when the end of a sentence is detected
555 # (delimter is: "[\.\?\!]\s")
556 s/\s*[\.\?\!]\s+/\n/g;
557
558 # split numbers after four digits
559 s/(\d\d\d\d)/$1 /g;
560
561 # split words after 32 characters
562
563 # squash repeated punctuation
564 tr/A-Za-z0-9 //cs;
565
566 # save email addresses
567 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
568
569 # normalise clause breaks (mostly punctuation symbols) to commas
570 s/[^A-Za-z0-9 \n]+/ , /g;
571
572 # Remove repeated commas, and replace with newline
573 s/\s*,[, ]+/\n/g;
574
575 # remove extra whitespace
576 s/ +/ /sg;
577 s/^\s+//mg;
578 s/\s*$/\n/mg;
579
580 # remove lines that contain one word or less
581 s/^\w*$//mg;
582 s/^\s*$//mg;
583 tr/\n//s;
584
585 return $_;
586
587}
588
589
590
591# Execute a system command
592
593sub execute {
594 my ($command, $verbosity, $outhandle) = @_;
595 print $outhandle "Executing: $command\n" if ($verbosity > 2);
596 my $status = system($command);
597 if ($status != 0) {
598 print STDERR "phind - Error executing $command: $!\n";
599 exit($status);
600 }
601}
602
603
604# Generate the vocabulary, symbol statistics, and numbers file from the
605# clauses file. This is legacy code, so is a bit messy and probably wont
606# run under windows.
607
608sub extract_vocabulary {
609 my ($self) = @_;
610
611 my $verbosity = $self->{'verbosity'};
612 my $out = $self->{'outhandle'};
613
614 my $collectiondir = $self->{'collectiondir'};
615 my $phinddir = $self->{'phinddir'};
616
617 my $language_exp = $self->{'language_exp'};
618
619 my ($w, $l, $line, $word);
620
621 my ($first_delimiter, $last_delimiter,
622 $first_stopword, $last_stopword,
623 $first_extractword, $last_extractword,
624 $first_contentword, $last_contentword,
625 $phrasedelimiter);
626
627 my $thesaurus = $self->{'thesaurus'};
628 my ($thesaurus_links, $thesaurus_terms,
629 %thesaurus, $first_thesaurusword, $last_thesaurusword);
630
631 my %symbol;
632 my (%freq);
633
634 print $out "Calculating vocabulary\n" if ($verbosity > 1);
635
636 # Read and store the stopwords
637 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
638 my $stopword_files = ();
639 my ($language, $language_dir, $file, $file_name);
640 my %stopwords;
641
642 # Examine each directory in the stopword directory
643 opendir(STOPDIR, $stopdir);
644 foreach $language (readdir STOPDIR) {
645
646 # Ignore entries that do not match the classifier's language
647 next unless ($language =~ /$language_exp/);
648 $language_dir = &util::filename_cat($stopdir, $language);
649 next unless (-d "$language_dir");
650
651 opendir(LANGDIR, $language_dir);
652 foreach $file (readdir LANGDIR) {
653
654 # Ignore entries that are not stopword files
655 next unless ($file =~ /sw$/);
656 $file_name = &util::filename_cat($language_dir, $file);
657 next unless (-f "$file_name");
658
659 # Read the stopwords
660 open(STOPFILE, "<$file_name");
661 while (<STOPFILE>) {
662 s/^\s+//;
663 s/\s.*//;
664 $word = $_;
665 $l = lc($word);
666 $stopwords{$l} = $word;
667 }
668 close STOPFILE;
669
670 }
671 }
672
673 # Read thesaurus information
674 if ($thesaurus) {
675
676 # link file exists
677 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
678 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
679
680 # ensure term file exists in the correct language
681 if ($language_exp =~ /^([a-z][a-z])/) {
682 $language = $1;
683 } else {
684 $language = 'en';
685 }
686 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.$language");
687 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
688
689
690 # Read the thesaurus terms
691 open(TH, "<$thesaurus_terms");
692 while(<TH>) {
693 s/^\d+ //;
694 s/\(.*\)//;
695 foreach $w (split(/\s+/, $_)) {
696 $thesaurus{lc($w)} = $w;
697 }
698 }
699 close TH;
700 }
701
702 # Read words in the text and count occurences
703 open(TXT, "<$phinddir/clauses");
704 my @words;
705
706 while(<TXT>) {
707 $line = $_;
708 next unless ($line =~ /./);
709
710 @words = split(/\s+/, $line);
711 foreach $w (@words) {
712 $l = lc($w);
713 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
714 $freq{$w}++;
715 }
716 $freq{$senlimit}++;
717 }
718
719 # Calculate the "best" form of each word
720 my (%bestform, %totalfreq, %bestfreq);
721
722 foreach $w (sort (keys %freq)) {
723 $l = lc($w);
724
725 # totalfreq is the number of times a term appears in any form
726 $totalfreq{$l} += $freq{$w};
727
728 if (defined $stopwords{$l}) {
729 $bestform{$l} = $stopwords{$l};
730
731 } elsif (defined $thesaurus{$l}) {
732 $bestform{$l} = $thesaurus{$l};
733
734 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
735 $bestfreq{$l} = $freq{$w};
736 $bestform{$l} = $w;
737 }
738 }
739
740 undef %freq;
741 undef %bestfreq;
742
743
744 # Assign symbol numbers to tokens
745 my $nextsymbol = 1;
746 my (@vocab);
747
748 # Delimiters
749 $first_delimiter = 1;
750
751 foreach $word (@delimiters) {
752
753 $word = lc($word);
754 $bestform{$word} = uc($word);
755 $vocab[$nextsymbol] = $word;
756 $symbol{$word} = $nextsymbol;
757 $nextsymbol++;
758 }
759 $last_delimiter = $nextsymbol - 1;
760
761 # Stopwords
762 $first_stopword = $nextsymbol;
763
764 foreach my $word (sort keys %stopwords) {
765
766 # don't incluse stopword unless it occurs in the text
767 $word = lc($word);
768 next unless ($totalfreq{$word});
769 next if ($symbol{$word});
770
771 $vocab[$nextsymbol] = $word;
772 $symbol{$word} = $nextsymbol;
773 $nextsymbol++;
774 }
775 $last_stopword = $nextsymbol - 1;
776 $first_contentword = $nextsymbol;
777
778 # Thesaurus terms
779 if ($thesaurus) {
780 $first_thesaurusword = $nextsymbol;
781
782 foreach my $word (sort keys %thesaurus) {
783
784 $word = lc($word);
785 next if ($symbol{$word});
786 $bestform{$word} = $thesaurus{$word};
787
788 $vocab[$nextsymbol] = $word;
789 $symbol{$word} = $nextsymbol;
790 $nextsymbol++;
791
792 }
793 $last_thesaurusword = $nextsymbol - 1;
794 }
795
796 # Other content words
797 $first_extractword = $nextsymbol;
798
799 foreach my $word (sort (keys %bestform)) {
800
801 next if ($symbol{$word});
802
803 $vocab[$nextsymbol] = $word;
804 $symbol{$word} = $nextsymbol;
805 $nextsymbol++;
806 }
807 $last_extractword = $nextsymbol - 1;
808 $last_contentword = $nextsymbol - 1;
809
810
811 # Outut the words
812 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
813 open(VOC, ">$phinddir/clauses.vocab");
814
815 for (my $i = 1; $i < $nextsymbol; $i++) {
816 $w = $vocab[$i];
817
818 print VOC "$bestform{$w}\n";
819 $totalfreq{$w} = 0 unless ($totalfreq{$w});
820 }
821 close VOC;
822
823
824 # Create statistics file
825 # Output statistics about the vocablary
826 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
827 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
828
829 open(STAT, ">$phinddir/clauses.stats")
830 || die "Cannot open $phinddir/clauses.stats: $!";
831
832 print STAT "first_delimiter $first_delimiter\n";
833 print STAT "last_delimiter $last_delimiter\n";
834 print STAT "first_stopword $first_stopword\n";
835 print STAT "last_stopword $last_stopword\n";
836 if ($thesaurus) {
837 print STAT "first_thesaurusword $first_thesaurusword\n";
838 print STAT "last_thesaurusword $last_thesaurusword\n";
839 }
840 print STAT "first_extractword $first_extractword\n";
841 print STAT "last_extractword $last_extractword\n";
842 print STAT "first_contentword $first_contentword\n";
843 print STAT "last_contentword $last_contentword\n";
844 print STAT "first_symbol $first_delimiter\n";
845 print STAT "last_symbol $last_contentword\n";
846 print STAT "first_word $first_stopword\n";
847 print STAT "last_word $last_contentword\n";
848 close STAT;
849
850 undef @vocab;
851
852
853 # Create numbers file
854 # Save text as symbol numbers
855 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
856
857 open(TXT, "<$phinddir/clauses");
858 open(NUM, ">$phinddir/clauses.numbers");
859
860 $phrasedelimiter = $symbol{lc($senlimit)};
861 print NUM "$symbol{lc($colstart)}\n";
862
863 # set up the special symbols that delimit documents and sentences
864 while(<TXT>) {
865
866 # split sentence into a list of tokens
867 $line = $_;
868 next unless ($line =~ /./);
869 @words = split(/\s+/, $line);
870
871 # output one token at a time
872 foreach $word (@words) {
873 $word = lc($word);
874 print NUM "$symbol{$word}\n";
875 }
876
877 # output phrase delimiter
878 print NUM "$phrasedelimiter\n";
879 }
880
881 print NUM "$symbol{lc($colend)}\n";
882 close NUM;
883
884 # Save thesaurus data in one convienient file
885 if ($thesaurus) {
886
887 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
888
889
890 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
891 if ($verbosity > 1);
892
893 # Read the thesaurus terms
894 my ($num, $text, %thes_symbols);
895
896 open(TH, "<$thesaurus_terms");
897 while(<TH>) {
898 chomp;
899 @words = split(/\s+/, $_);
900 $num = shift @words;
901 $text = "";
902
903 # translate words into symbol numbers
904 foreach $word (@words) {
905 $word = lc($word);
906 if ($symbol{$word}) {
907 $text .= "s$symbol{$word} ";
908 } elsif ($verbosity) {
909 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
910 }
911 }
912 $text =~ s/ $//;
913 $thes_symbols{$num} = $text;
914 }
915 close TH;
916
917 # Read the thesaurus links and write the corresponding data
918 open(TH, "<$thesaurus_links");
919 open(THOUT, ">$thesaurusfile");
920
921 while(<TH>) {
922 chomp;
923 ($num, $text) = split(/:/, $_);
924
925 if (defined($thes_symbols{$num})) {
926 print THOUT "$num:$thes_symbols{$num}:$text\n";
927 } else {
928 print THOUT "$num:untranslated:$text\n";
929 }
930 }
931 close TH;
932 close THOUT;
933 }
934
935
936
937
938}
939
940
941# renumber_phrases
942#
943# Prepare the phrases file to be input to mgpp. The biggest problem is
944# reconciling the phrase identifiers used by the suffix program (which
945# we'll call suffix-id numbers) with the numbers used in the thesaurus
946# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
947# Phind-id numbers must be sorted by frequency of occurance.
948#
949# Start creating a set of phind-id numbers from the sorted suffix-id
950# numbers and (if required) the thesaurus-id numbers. Then add any other
951# phrases occuring in the thesaurus.
952#
953# The last thing we have to do is restore the vocabulary information to the
954# phrase file so that the phrases are stored as words, not as symbol
955# numbers.
956
957# The original phrases file looks something like this:
958# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
959# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
960# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
961# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
962
963
964sub renumber_phrases {
965 my ($self) = @_;
966
967 renumber_suffix_data($self);
968 renumber_thesaurus_data($self);
969 restore_vocabulary_data($self);
970
971}
972
973
974
975# renumber_suffix_data
976#
977# Translate phrases file to phrases.2 using phind keys instead
978# of suffix keys and sorting the expansion data.
979
980sub renumber_suffix_data {
981 my ($self) = @_;
982
983 my $verbosity = $self->{'verbosity'};
984 my $out = $self->{'outhandle'};
985 print $out "Translate phrases: suffix-ids become phind-id's\n"
986 if ($verbosity);
987
988 my $phinddir = $self->{'phinddir'};
989 my $infile = &util::filename_cat($phinddir, 'phrases');
990 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
991
992 # Read the phrase file. Calculate initial set of phind-id
993 # numbers and store (suffixid -> frequency) relation.
994
995 my %suffixtophind;
996 my @phindfrequency;
997 my (@fields, $suffixid);
998 my $nextphind = 1;
999
1000 open(IN, "<$infile");
1001 while(<IN>) {
1002
1003 chomp;
1004 @fields = split(/:/, $_);
1005
1006 # get next suffixid and phindid
1007 $suffixid = shift @fields;
1008 $suffixtophind{$suffixid} = $nextphind;
1009
1010 # store total frequency
1011 shift @fields;
1012 $totalfrequency[$nextphind] = shift @fields;
1013
1014 $nextphind++;
1015 }
1016 close IN;
1017
1018
1019 # Translate phrases file to phrases.2. Use phind keys (not suffix
1020 # keys), sort expansion and document occurance data in order of
1021 # descending frequency..
1022 open(IN, "<$infile");
1023 open(OUT, ">$outfile");
1024
1025 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
1026 my (@documwents, @newexp, $k, $n);
1027 my $linenumber = 0;
1028
1029 while(<IN>) {
1030
1031 # read the line
1032 chomp;
1033 @fields = split(/:/, $_);
1034
1035 # get a phrase number for this line
1036 $suffixid = shift @fields;
1037 die unless (defined($suffixtophind{$suffixid}));
1038 $phindid = $suffixtophind{$suffixid};
1039
1040 # get the symbols in the phrase
1041 $text = shift @fields;
1042
1043 # output status information
1044 $linenumber++;
1045 if ($verbosity > 2) {
1046 if ($linenumber % 1000 == 0) {
1047 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
1048 }
1049 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
1050 }
1051
1052 # get the phrase frequency
1053 $tf = shift @fields;
1054
1055 # get the number of expansions
1056 $countexp = shift @fields;
1057
1058 # get the expansions, convert them into phind-id numbers, and sort them
1059 $expansions = shift @fields;
1060 @newexp = ();
1061 foreach $k (split(/,/, $expansions)) {
1062 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1063 $n = $suffixtophind{$k};
1064 push @newexp, $n;
1065 }
1066 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1067
1068 # get the number of documents
1069 $countdocs = shift @fields;
1070
1071 # get the documents and sort them
1072 $documents = shift @fields;
1073 $documents =~ s/d//g;
1074 @documents = split(/;/, $documents);
1075 @documents = sort by_doc_frequency @documents;
1076
1077 # output the phrase data
1078 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1079 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1080
1081 }
1082
1083 close IN;
1084 close OUT;
1085}
1086
1087
1088# renumber_thesaurus_data
1089#
1090# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1091
1092sub renumber_thesaurus_data {
1093 my ($self) = @_;
1094
1095 my $out = $self->{'outhandle'};
1096 my $verbosity = $self->{'verbosity'};
1097 my $thesaurus = $self->{'thesaurus'};
1098
1099 my $phinddir = $self->{'phinddir'};
1100 my $infile = &util::filename_cat($phinddir, "phrases.2");
1101 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1102
1103
1104 # If no thesaurus is defined, simply move the phrases file.
1105 if (!$thesaurus) {
1106 print $out "Translate phrases.2: no thesaurus data\n"
1107 if ($verbosity);
1108 &util::mv($infile, $outfile);
1109 return;
1110 }
1111
1112 print $out "Translate phrases.2: add thesaurus data\n"
1113 if ($verbosity);
1114
1115 # 1.
1116 # Read thesaurus file and store (symbols->thesaurusid) mapping
1117 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1118 my %symbolstothesid;
1119 my (@fields, $thesid, $symbols);
1120
1121 open(TH, "<$thesaurusfile");
1122
1123 while (<TH>) {
1124
1125 chomp;
1126 @fields = split(/:/, $_);
1127
1128 # get id and text
1129 $thesid = shift @fields;
1130 $symbols = shift @fields;
1131 $symbolstothesid{$symbols} = $thesid;
1132 }
1133 close TH;
1134
1135 # 2.
1136 # Read phrases file to find thesaurus entries that already
1137 # have a phindid. Store their phind-ids for later translation.
1138 my %thesaurustophindid;
1139 my ($phindid);
1140
1141 open(IN, "<$infile");
1142
1143 while(<IN>) {
1144
1145 chomp;
1146 @fields = split(/:/, $_);
1147
1148 # phindid and symbols for this line
1149 $phindid = shift @fields;
1150 $symbols = shift @fields;
1151
1152 # do we have a thesaurus id corresponding to this phrase?
1153 if (defined($symbolstothesid{$symbols})) {
1154 $thesid = $symbolstothesid{$symbols};
1155 $thesaurustophindid{$thesid} = $phindid;
1156 }
1157 }
1158 close IN;
1159
1160 undef %symbolstothesid;
1161
1162 # 3.
1163 # Create phind-id numbers for remaining thesaurus entries
1164 my $nextphindid = $phindid + 1;
1165
1166 open(TH, "<$thesaurusfile");
1167 while(<TH>) {
1168
1169 chomp;
1170 @fields = split(/:/, $_);
1171
1172 # read thesaurus-id and ensure it has a corresponding phind-id
1173 $thesid = shift @fields;
1174 if (!defined($thesaurustophindid{$thesid})) {
1175 $thesaurustophindid{$thesid} = $nextphindid;
1176 $nextphindid++;
1177 }
1178 }
1179 close TH;
1180
1181 # 4.
1182 # Translate thesaurus file, replacing thesaurus-id numbers with
1183 # phind-id numbers.
1184 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1185 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1186
1187 open(TH, "<$thesaurusfile");
1188 open(TO, ">$newthesaurusfile");
1189 while(<TH>) {
1190
1191 chomp;
1192 @fields = split(/:/, $_);
1193
1194 # phindid and symbols for this line
1195 ($thesid, $symbols, $relations) = @fields;
1196
1197 die unless ($thesid && $symbols);
1198 die unless $thesaurustophindid{$thesid};
1199 $phindid = $thesaurustophindid{$thesid};
1200
1201 # convert each part of the relation string to use phind-id numbers
1202 $newrelation = "";
1203 $linkcounter = 0;
1204 foreach $linktext (split(/;/, $relations)) {
1205 @linkdata = split(/,/, $linktext);
1206
1207 # remember the linktype (e.g. BT, NT)
1208 $linktype = shift @linkdata;
1209 $newrelation .= "$linktype,";
1210
1211 # convert the link target identfiers
1212 foreach $link (@linkdata) {
1213 die unless (defined($thesaurustophindid{$link}));
1214 $newrelation .= "$thesaurustophindid{$link},";
1215 $linkcounter++;
1216 }
1217 $newrelation =~ s/\,$//;
1218 $newrelation .= ";";
1219 }
1220 $newrelation .= ":";
1221
1222 print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1223 }
1224 close TH;
1225 close TO;
1226
1227 undef %thesaurustophindid;
1228
1229 # 5.
1230 # Read thesaurus data (in phind-id format) into memory
1231 my %thesaurusdata;
1232
1233 open(TH, "<$newthesaurusfile");
1234 while(<TH>) {
1235 chomp;
1236 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1237 die unless ($phindid && $symbols);
1238 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1239 }
1240
1241 # 6.
1242 # Add thesaurus data to phrases file
1243 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1244 my (@documwents, @newexp, $k, $n);
1245 my $linenumber = 0;
1246
1247 open(IN, "<$infile");
1248 open(OUT, ">$outfile");
1249
1250 # Update existing phrases
1251 while(<IN>) {
1252
1253 chomp;
1254 @fields = split(/:/, $_);
1255
1256 # get data for this line
1257 $phindid = shift @fields;
1258
1259 # output the phrase data, with thesaurus information
1260 print OUT "$phindid:", join(":", @fields);
1261
1262 # add thesaurus data
1263 if (defined($thesaurusdata{$phindid})) {
1264 @fields = split(/:/, $thesaurusdata{$phindid});
1265 shift @fields;
1266 $linkcounter = shift @fields;
1267 $relations = shift @fields;
1268
1269 print OUT ":$linkcounter:$relations";
1270 $thesaurusdata{$phindid} = "";
1271 }
1272 print OUT "\n";
1273 }
1274 close IN;
1275
1276 # Add phrases that aren't already in the file
1277 foreach $phindid (sort numerically keys %thesaurusdata) {
1278 next unless ($thesaurusdata{$phindid});
1279
1280 @fields = split(/:/, $thesaurusdata{$phindid});
1281 $symbols = shift @fields;
1282 $linkcounter = shift @fields;
1283 $relations = shift @fields;
1284
1285 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1286 }
1287 close OUT;
1288
1289}
1290
1291# restore_vocabulary_data
1292#
1293# Read phrases.3 and restore vocabulary information. Then write
1294# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1295# (if requested) to the saved phrases file.
1296
1297sub restore_vocabulary_data {
1298 my ($self) = @_;
1299
1300 my $out = $self->{'outhandle'};
1301 my $verbosity = $self->{'verbosity'};
1302 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1303
1304 my $phinddir = $self->{'phinddir'};
1305 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1306 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1307 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1308 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1309
1310 my $savephrases = $self->{'savephrases'};
1311
1312 # 1.
1313 # Read the vocabulary file
1314 open(V, "<$vocabfile")
1315 || die "Cannot open $vocabfile: $!";
1316 my @symbol;
1317 my $i = 1;
1318 while(<V>) {
1319 chomp;
1320 $symbol[$i++] = $_;
1321 }
1322
1323
1324 # 2.
1325 # Translate phrases.3 to MGPP input files
1326 my ($key, $text, $word);
1327 my @fields;
1328 my $linenumber = 0;
1329
1330 open(IN, "<$infile");
1331 open(DATA, ">$datafile");
1332 open(WORD, ">$wordfile");
1333
1334 # Save the phrases in a separate text file
1335 if ($savephrases) {
1336 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1337 open(SAVE, ">$savephrases");
1338 }
1339
1340 while(<IN>) {
1341
1342 # read the line
1343 chomp;
1344 $line = $_;
1345 @fields = split(/:/, $line);
1346
1347 # get a phrase number for this line
1348 $key = shift @fields;
1349
1350 # restore the text of the phrase
1351 $text = shift @fields;
1352 $text =~ s/s(\d+)/$symbol[$1]/g;
1353 if ($text =~ / /) {
1354 $word = "";
1355 } elsif ($text ne 'untranslated') {
1356 $word = $text;
1357 }
1358
1359 # output the phrase data
1360 print DATA "<Document>";
1361 print DATA "$key:$text:", join(":", @fields), ":\n";
1362
1363 # output the word index search data
1364 print WORD "<Document>$word\n";
1365
1366 # output the phrases to a text file
1367 if ($savephrases) {
1368 print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1369 }
1370 }
1371 close IN;
1372 close WORD;
1373 close DATA;
1374 close SAVE if ($savephrases);
1375
1376}
1377
1378
1379
1380# sort routines used to renumber phrases
1381
1382sub numerically { $a <=> $b }
1383
1384sub by_doc_frequency {
1385 my $fa = 1;
1386 if ($a =~ /,/) {
1387 $fa = $a;
1388 $fa =~ s/\d+,//;
1389 }
1390 my $fb = 1;
1391 if ($b =~ /,/) {
1392 $fb = $b;
1393 $fb =~ s/\d+,//;
1394 }
1395
1396 return ($fb <=> $fa);
1397}
1398
13991;
Note: See TracBrowser for help on using the repository browser.