source: trunk/gsdl/perllib/classify/phind.pm@ 1890

Last change on this file since 1890 was 1890, checked in by paynter, 23 years ago

When multiple metadata fields have multiple values, get them all.
Initial (poor) support for multiple languages (will have to replace).
Some documentation removed.

  • Property svn:keywords set to Author Date Id Revision
File size: 36.2 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin.
30# Options are dexcribed in the print_usage function.
31# Type "classinfo.pl phind" at the command line for a summary.
32
33package phind;
34
35use BasClas;
36use util;
37
38
39sub BEGIN {
40 @ISA = ('BasClas');
41}
42
43
44sub print_usage {
45 print STDERR "
46 usage: classify phind [options]
47
48 options:
49 -text Fields The text used to build the phrase hierarchy.
50 (default: 'section:Title,section:text')
51
52 -title Title The metadata field used to describe each document.
53 (default: 'Title')
54
55 -button Name The label for the classifier screen and button in
56 navigation bar.
57 (default: 'Phrase')
58
59 -language Regex Language or languages to use building hierarchy.
60 Languages are identified by two-letter country codes
61 like en (English), es (Spanish), and fr (French).
62 Language is a regular expression, so 'en|fr' (English or
63 French) and '..' (match any language) are valid.
64 (default: 'en'.)
65
66 -savephrases File If set, the phrase infomation will be stored in
67 the given file as text. It is probably a good idea
68 to use an absolute path.
69 (defualt: not set)
70
71 -suffixmode N The smode parameter to the phrase extraction program. A
72 value of 0 means that stopwords are ignored, and of 1
73 means that stopwords are used.
74 (default: 1)
75
76 -thesaurus Name Name of a thesaurus stored in phind format in the
77 collection's etc directory.
78 (default: not set)
79
80 -untidy Don't remove working files.
81
82"; }
83
84
85# Phrase delimiter symbols - these should be abstracted out someplace
86
87my $colstart = "COLLECTIONSTART";
88my $colend = "COLLECTIONEND";
89my $doclimit = "DOCUMENTLIMIT";
90my $senlimit = "SENTENCELIMIT";
91my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
92
93
94# Create a new phind browser based on collect.cfg
95
96sub new {
97 my $class = shift (@_);
98 my $self = new BasClas($class, @_);
99
100 my $out = $self->{'outhandle'};
101
102
103 # Phind installation check
104 # The phind phrase browser is research software and is not installed
105 # by defualt. If the user attepts to use it we warn them that it's a
106 # bit dodgy, then tell them how to install it. If they can do that
107 # and get all the files in place, then we let them proceed.
108
109 print $out "Checking Phind phrase browser requirements...\n";
110
111 # Make sure we're not in windows
112 if ($ENV{'GSDLOS'} =~ /windows/i) {
113 print STDERR "Sorry - Phind currently only works under Unix";
114 exit(1);
115 }
116
117 # Ensure the Phind generate scripts are in place
118 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
119 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
120
121 if (!(-e $file1)) {
122 print STDERR "The phind \"suffix\" program is not installed. ";
123 print STDERR "To install it, change to the directory\n";
124 print STDERR " $src\n";
125 print STDERR "and type \"make install-phind\".\n\n";
126 exit(1);
127 }
128
129 # Ensure the Phind CGI script is in place
130 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
131 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
132
133 if (!(-e $file1)) {
134 print STDERR "The phind CGI program is not installed. ";
135 print STDERR "To install it, change to the directory\n";
136 print STDERR " $src\n";
137 print STDERR "and type \"make install-phind\".\n\n";
138 exit(1);
139 }
140
141 # Ensure the Phind Java applet is in place
142 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
143 $file1 = &util::filename_cat($src, "Phind.class");
144
145 if (!(-e $file1)) {
146 print STDERR "The phind Java classes are not compiled. ";
147 print STDERR "To compile them, change to the directory\n";
148 print STDERR " $src\n";
149 print STDERR "and use your Java compiler to compile Phind.java.\n";
150 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
151 exit(1);
152 }
153
154 # Parse classifier arguments
155 my $builddir = "";
156 my $phinddir = "";
157 if (!parsargv::parse(\@_,
158 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
159 q^title/.*/Title^, \$self->{'titlefield'},
160 q^button/.*/Phrase^, \$self->{'buttonname'},
161 q^language/.*/en^, \$language,
162 q^builddir/.*/^, \$builddir,
163 q^savephrases/\d/0^, \$self->{'savephrases'},
164 q^suffixmode/\d/1^, \$self->{'suffixmode'},
165 q^thesaurus/.*/^, \$self->{'thesaurus'},
166 q^untidy^, \$self->{'untidy'},
167 "allow_extra_options")) {
168
169 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
170 &print_usage();
171 die "\n";
172 }
173
174 # classifier information
175 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
176
177 # limit languages
178 $self->{'language_exp'} = $language;
179
180 # collection directories
181 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
182 if (!$builddir) {
183 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
184 }
185 $self->{'builddir'} = $builddir;
186 $self->{'phinddir'} = &util::filename_cat($builddir, "phind");
187
188 return bless $self, $class;
189}
190
191
192# Initialise the phind classifier
193
194sub init {
195 my $self = shift (@_);
196
197 # ensure we have a build directory
198 my $builddir = $self->{'builddir'};
199 die unless (-e "$builddir");
200
201 # create phind directory
202 my $phinddir = $self->{'phinddir'};
203 if (-e "$phinddir") {
204 &util::rm_r("$phinddir");
205 }
206 &util::mk_dir("$phinddir");
207
208 # open filehandles for documents and text
209 my $clausefile = &util::filename_cat("$phinddir", "clauses");
210 &util::rm($clausefile) if (-e $clausefile);
211 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
212 $self->{'txthandle'} = TEXT;
213
214 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
215 &util::rm($docfile) if (-e $docfile);
216 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
217 $self->{'dochandle'} = DOCS;
218
219}
220
221
222# Classify each document.
223#
224# Each document is passed here in turn. The classifier extracts the
225# text of each and stores it in the clauses file. Document details are
226# stored in the docs.txt file.
227
228sub classify {
229 my $self = shift (@_);
230 my ($doc_obj) = @_;
231
232 my $verbosity = $self->{'verbosity'};
233 my $top_section = $doc_obj->get_top_section();
234
235 my $titlefield = $self->{'titlefield'};
236
237 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
238 print "process: $title\n" if ($verbosity > 2);
239
240 # Only consider the file if it is in the correct language
241 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
242 my $phrlanguage = $self->{'language_exp'};
243 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
244
245 # record this file
246 my $total++;
247 print "file $total: $file\n" if ($self->{'$verbosity'});
248
249
250 # Store document details
251 my $OID = $doc_obj->get_OID();
252 $OID = "NULL" unless defined $OID;
253 my $dochandle = $self->{'dochandle'};
254 print $dochandle "<Document>\t$OID\t$title\n";
255
256 # Store the text occuring in this object
257
258 # output the document delimiter
259 my $txthandle = $self->{'txthandle'};
260 print $txthandle "$doclimit\n";
261
262 # iterarate over the required indexes and store their text
263 my $indexes = $self->{'indexes'};
264 my $text = "";
265 my ($part, $level, $field, $section, $data, $dataref);
266
267 foreach $part (split(/,/, $indexes)) {
268
269 # Each field has a level and a data element ((e.g. document:Title)
270 ($level, $field) = split(/:/, $part);
271 die unless ($level && $field);
272
273 # Extract the text from every section
274 # (In phind, document:text and section:text are equivalent)
275 if ($field eq "text") {
276 $data = "";
277 $section = $doc_obj->get_top_section();
278 while (defined($section)) {
279 $data .= $doc_obj->get_text($section) . "\n";
280 $section = $doc_obj->get_next_section($section);
281 }
282 $text .= convert_gml_to_tokens($data) . "\n";
283 }
284
285 # Extract a metadata field from a document
286 # (If ther eis more than one element of the given type, get them all.)
287 elsif ($level eq "document") {
288 $dataref = $doc_obj->get_metadata($doc_obj->get_top_section(), $field);
289 foreach $data ($$dataref) {
290 $text .= convert_gml_to_tokens($data) . "\n";
291 }
292 }
293
294 # Extract metadata from every section in a document
295 elsif ($level eq "section") {
296 $data = "";
297 $section = $doc_obj->get_top_section();
298 while (defined($section)) {
299 $dataref .= $doc_obj->get_metadata($section, $field);
300 $data .= join("\n", $$dataref) . "\n";
301 $section = $doc_obj->get_next_section($section);
302 }
303 $text .= convert_gml_to_tokens($data) . "\n";
304 }
305
306 # Some sort of specification which I don't understand
307 else {
308 die "Unknown level ($level) in phind index ($part)\n";
309 }
310
311 }
312
313 # output the text
314 $text =~ tr/\n//s;
315 print $txthandle "$text";
316}
317
318
319# Construct the classifier from the information already gathered
320#
321# When get_classify_info is called, the clauses and docs.txt files have
322# already been constructed in the phind directory. This function will
323# translate them into compressed, indexed MGPP files that can be read by
324# the phindcgi script. It will also register our classifier so that it
325# shows up in the navigation bar.
326
327sub get_classify_info {
328 my $self = shift (@_);
329
330 my $verbosity = $self->{'verbosity'};
331 my $out = $self->{'outhandle'};
332 my $phinddir = $self->{'phinddir'};
333
334 if ($verbosity) {
335 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
336 }
337
338 # Construct phind indexes
339 my $suffixmode = $self->{'suffixmode'};
340 my ($command, $status);
341
342 # Generate the vocabulary, symbol statistics, and numbers file
343 # from the clauses file
344 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
345 &extract_vocabulary($self);
346
347 # Use the suffix program to generate the phind/phrases file
348 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
349 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
350
351 # Create the phrase file and put phrase numbers in phind/phrases
352 print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
353 &renumber_phrases($self);
354
355 # Create the mg phrase database
356 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
357 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
358 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
359
360 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
361 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
362 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
363 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
364
365 print $out "\nCreating phrase databases\n";
366 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
367 my $mg_stem = "pdata";
368
369 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
370 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
371 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
372
373 # create the mg index of words
374 print $out "\nCreating word-level search indexes\n";
375 $mg_input = &util::filename_cat($phinddir, "pword.txt");
376 $mg_stem = "pword";
377
378 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
379 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
380 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
381 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
382 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
383 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
384
385 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
386 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
387 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
388
389 # create the mg document information database
390 print $out "\nCreating document information databases\n";
391 $mg_input = &util::filename_cat($phinddir, "docs.txt");
392 $mg_stem = "docs";
393
394 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
395 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
396 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
397
398
399 # Tidy up stray files
400 if (!$self->{'untidy'}) {
401 print $out "\nCleaning up\n" if ($verbosity > 2);
402 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
403 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
404 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
405 "$phinddir/pdata.txt", "$phinddir/pword.txt");
406 my $outfile = 1;
407 while (-e "$phinddir/outPhrase.$outfile") {
408 &util::rm("$phinddir/outPhrase.$outfile");
409 $outfile++;
410 }
411 }
412
413
414 # Insert the classifier into.... what?
415 my $collection = $self->{'collection'};
416 my $url = "library?a=p&p=phind&c=$collection";
417
418 my %classifyinfo = ('thistype'=>'Invisible',
419 'childtype'=>'Phind',
420 'Title'=>$self->{'buttonname'},
421 'contains'=>[]);
422
423 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
424 return \%classifyinfo;
425}
426
427
428
429sub convert_gml_to_tokens {
430
431 my ($text) = @_;
432 my $language_exp = $self->{'language_exp'};
433
434 # FIRST, remove GML tags
435 $_ = $text;
436
437 # Replace all whitespace with a simple space
438 s/\s+/ /gso;
439
440 # Remove everything that is in a tag
441 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
442 s/\s*<br>\s*/ LINEBREAK /isgo;
443 s/<[^>]*>/ /sgo;
444
445 # Now we have the text, but it may contain HTML
446 # elements coded as &gt; etc. Remove these tags.
447 s/&lt;/</sgo;
448 s/&gt;/>/sgo;
449
450 s/\s+/ /sgo;
451 s/\s*<p>\s*/ PARAGRAPHBREAK /isgo;
452 s/\s*<br>\s*/ LINEBREAK /isgo;
453 s/<[^>]*>/ /sgo;
454
455 # remove &amp; and other miscellaneous markup tags
456 s/&amp;/&/sgo;
457 s/&lt;/</sgo;
458 s/&gt;/>/sgo;
459 s/&amp;/&/sgo;
460
461 # replace<p> and <br> placeholders with carriage returns
462 s/PARAGRAPHBREAK/\n/sgo;
463 s/LINEBREAK/\n/sgo;
464
465 $text = $_;
466
467
468 # Language-specific word-cleanup
469
470 # English
471 if ($language_exp =~ /en/) {
472
473 # remove any apostrophe that indicates omitted letters
474 $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;
475
476 # remove period that appears in a person's initals
477 $text =~ s/\s([A-Z])\./ $1 /g;
478
479 # replace hyphens in hyphenated words and names with a space
480 $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
481
482 }
483
484
485 # Convert the remaining text to "clause format",
486
487 # This means removing all excess punctuation and garbage text,
488 # normalising valid punctuation to fullstops and commas,
489 # then putting one clause on each line.
490
491 # Insert newline when the end of a sentence is detected
492 # (delimter is: "[\.\?\!]\s")
493 $text =~ s/\s*[\.\?\!]\s+/\n/go;
494
495
496 # Language-specific clause clean-up
497
498 # English
499 if ($language_exp =~ /en/) {
500
501 # split numbers after four digits
502 $text =~ s/(\d\d\d\d)/$1 /g;
503
504 # split words after 32 characters
505
506 # squash repeated punctuation
507 $text =~ tr/A-Za-z0-9 //cs;
508
509 # normalise clause breaks (mostly punctuation symbols) to commas
510 $text =~ s/[^A-Za-z0-9 \n]+/ , /g;
511
512 # Remove repeated commas, and replace with newline
513 $text =~ s/\s*,[, ]+/\n/g;
514 }
515
516 # remove extra whitespace
517 $_ = $text;
518 s/ +/ /sgo;
519 s/^\s+//mgo;
520 s/\s*$/\n/mgo;
521
522 # remove lines that contain one word or less
523 s/^\S*$//mgo;
524 s/^\s*$//mgo;
525 tr/\n//s;
526
527 return $_;
528}
529
530
531# Execute a system command
532
533sub execute {
534 my ($command, $verbosity, $outhandle) = @_;
535 print $outhandle "Executing: $command\n" if ($verbosity > 2);
536 my $status = system($command);
537 if ($status != 0) {
538 print STDERR "phind - Error executing $command: $!\n";
539 exit($status);
540 }
541}
542
543
544# Generate the vocabulary, symbol statistics, and numbers file from the
545# clauses file. This is legacy code, so is a bit messy and probably wont
546# run under windows.
547
548sub extract_vocabulary {
549 my ($self) = @_;
550
551 my $verbosity = $self->{'verbosity'};
552 my $out = $self->{'outhandle'};
553
554 my $collectiondir = $self->{'collectiondir'};
555 my $phinddir = $self->{'phinddir'};
556
557 my $language_exp = $self->{'language_exp'};
558
559 my ($w, $l, $line, $word);
560
561 my ($first_delimiter, $last_delimiter,
562 $first_stopword, $last_stopword,
563 $first_extractword, $last_extractword,
564 $first_contentword, $last_contentword,
565 $phrasedelimiter);
566
567 my $thesaurus = $self->{'thesaurus'};
568 my ($thesaurus_links, $thesaurus_terms,
569 %thesaurus, $first_thesaurusword, $last_thesaurusword);
570
571 my %symbol;
572 my (%freq);
573
574 print $out "Calculating vocabulary\n" if ($verbosity > 1);
575
576 # Read and store the stopwords
577 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
578 my $stopword_files = ();
579 my ($language, $language_dir, $file, $file_name);
580 my %stopwords;
581
582 # Examine each directory in the stopword directory
583 opendir(STOPDIR, $stopdir);
584 foreach $language (readdir STOPDIR) {
585
586 # Ignore entries that do not match the classifier's language
587 next unless ($language =~ /$language_exp/);
588 $language_dir = &util::filename_cat($stopdir, $language);
589 next unless (-d "$language_dir");
590
591 opendir(LANGDIR, $language_dir);
592 foreach $file (readdir LANGDIR) {
593
594 # Ignore entries that are not stopword files
595 next unless ($file =~ /sw$/);
596 $file_name = &util::filename_cat($language_dir, $file);
597 next unless (-f "$file_name");
598
599 # Read the stopwords
600 open(STOPFILE, "<$file_name");
601 while (<STOPFILE>) {
602 s/^\s+//;
603 s/\s.*//;
604 $word = $_;
605 $l = lc($word);
606 $stopwords{$l} = $word;
607 }
608 close STOPFILE;
609
610 }
611 }
612
613 # Read thesaurus information
614 if ($thesaurus) {
615
616 # Ensure both link and term files exist
617 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
618 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
619 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
620 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
621
622 # Read the thesaurus terms
623 open(TH, "<$thesaurus_terms");
624 while(<TH>) {
625 s/^\d+ //;
626 s/\(.*\)//;
627 foreach $w (split(/\s+/, $_)) {
628 $thesaurus{lc($w)} = $w;
629 }
630 }
631 close TH;
632 }
633
634 # Read words in the text and count occurences
635 open(TXT, "<$phinddir/clauses");
636 my @words;
637
638 while(<TXT>) {
639 $line = $_;
640 next unless ($line =~ /./);
641
642 @words = split(/\s+/, $line);
643 foreach $w (@words) {
644 $l = lc($w);
645 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
646 $freq{$w}++;
647 }
648 $freq{$senlimit}++;
649 }
650
651 # Calculate the "best" form of each word
652 my (%bestform, %totalfreq, %bestfreq);
653
654 foreach $w (sort (keys %freq)) {
655 $l = lc($w);
656
657 # totalfreq is the number of times a term appears in any form
658 $totalfreq{$l} += $freq{$w};
659
660 if (defined $stopwords{$l}) {
661 $bestform{$l} = $stopwords{$l};
662
663 } elsif (defined $thesaurus{$l}) {
664 $bestform{$l} = $thesaurus{$l};
665
666 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
667 $bestfreq{$l} = $freq{$w};
668 $bestform{$l} = $w;
669 }
670 }
671
672 undef %freq;
673 undef %bestfreq;
674
675
676 # Assign symbol numbers to tokens
677 my $nextsymbol = 1;
678 my (@vocab);
679
680 # Delimiters
681 $first_delimiter = 1;
682
683 foreach $word (@delimiters) {
684
685 $word = lc($word);
686 $bestform{$word} = uc($word);
687 $vocab[$nextsymbol] = $word;
688 $symbol{$word} = $nextsymbol;
689 $nextsymbol++;
690 }
691 $last_delimiter = $nextsymbol - 1;
692
693 # Stopwords
694 $first_stopword = $nextsymbol;
695
696 foreach my $word (sort keys %stopwords) {
697
698 # don't incluse stopword unless it occurs in the text
699 $word = lc($word);
700 next unless ($totalfreq{$word});
701 next if ($symbol{$word});
702
703 $vocab[$nextsymbol] = $word;
704 $symbol{$word} = $nextsymbol;
705 $nextsymbol++;
706 }
707 $last_stopword = $nextsymbol - 1;
708 $first_contentword = $nextsymbol;
709
710 # Thesaurus terms
711 if ($thesaurus) {
712 $first_thesaurusword = $nextsymbol;
713
714 foreach my $word (sort keys %thesaurus) {
715
716 $word = lc($word);
717 next if ($symbol{$word});
718 $bestform{$word} = $thesaurus{$word};
719
720 $vocab[$nextsymbol] = $word;
721 $symbol{$word} = $nextsymbol;
722 $nextsymbol++;
723
724 }
725 $last_thesaurusword = $nextsymbol - 1;
726 }
727
728 # Other content words
729 $first_extractword = $nextsymbol;
730
731 foreach my $word (sort (keys %bestform)) {
732
733 next if ($symbol{$word});
734
735 $vocab[$nextsymbol] = $word;
736 $symbol{$word} = $nextsymbol;
737 $nextsymbol++;
738 }
739 $last_extractword = $nextsymbol - 1;
740 $last_contentword = $nextsymbol - 1;
741
742
743 # Outut the words
744 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
745 open(VOC, ">$phinddir/clauses.vocab");
746
747 for (my $i = 1; $i < $nextsymbol; $i++) {
748 $w = $vocab[$i];
749
750 print VOC "$bestform{$w}\n";
751 $totalfreq{$w} = 0 unless ($totalfreq{$w});
752 }
753 close VOC;
754
755
756 # Create statistics file
757 # Output statistics about the vocablary
758 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
759 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
760
761 open(STAT, ">$phinddir/clauses.stats")
762 || die "Cannot open $phinddir/clauses.stats: $!";
763
764 print STAT "first_delimiter $first_delimiter\n";
765 print STAT "last_delimiter $last_delimiter\n";
766 print STAT "first_stopword $first_stopword\n";
767 print STAT "last_stopword $last_stopword\n";
768 if ($thesaurus) {
769 print STAT "first_thesaurusword $first_thesaurusword\n";
770 print STAT "last_thesaurusword $last_thesaurusword\n";
771 }
772 print STAT "first_extractword $first_extractword\n";
773 print STAT "last_extractword $last_extractword\n";
774 print STAT "first_contentword $first_contentword\n";
775 print STAT "last_contentword $last_contentword\n";
776 print STAT "first_symbol $first_delimiter\n";
777 print STAT "last_symbol $last_contentword\n";
778 print STAT "first_word $first_stopword\n";
779 print STAT "last_word $last_contentword\n";
780 close STAT;
781
782 undef @vocab;
783
784
785 # Create numbers file
786 # Save text as symbol numbers
787 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
788
789 open(TXT, "<$phinddir/clauses");
790 open(NUM, ">$phinddir/clauses.numbers");
791
792 $phrasedelimiter = $symbol{lc($senlimit)};
793 print NUM "$symbol{lc($colstart)}\n";
794
795 # set up the special symbols that delimit documents and sentences
796 while(<TXT>) {
797
798 # split sentence into a list of tokens
799 $line = $_;
800 next unless ($line =~ /./);
801 @words = split(/\s+/, $line);
802
803 # output one token at a time
804 foreach $word (@words) {
805 $word = lc($word);
806 print NUM "$symbol{$word}\n";
807 }
808
809 # output phrase delimiter
810 print NUM "$phrasedelimiter\n";
811 }
812
813 print NUM "$symbol{lc($colend)}\n";
814 close NUM;
815
816 # Save thesaurus data in one convienient file
817 if ($thesaurus) {
818
819 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
820
821
822 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
823 if ($verbosity > 1);
824
825 # Read the thesaurus terms
826 my ($num, $text, %thes_symbols);
827
828 open(TH, "<$thesaurus_terms");
829 while(<TH>) {
830 chomp;
831 @words = split(/\s+/, $_);
832 $num = shift @words;
833 $text = "";
834
835 # translate words into symbol numbers
836 foreach $word (@words) {
837 $word = lc($word);
838 if ($symbol{$word}) {
839 $text .= "s$symbol{$word} ";
840 } elsif ($verbosity) {
841 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
842 }
843 }
844 $text =~ s/ $//;
845 $thes_symbols{$num} = $text;
846 }
847 close TH;
848
849 # Read the thesaurus links and write the corresponding data
850 open(TH, "<$thesaurus_links");
851 open(THOUT, ">$thesaurusfile");
852
853 while(<TH>) {
854 chomp;
855 ($num, $text) = split(/:/, $_);
856
857 if (defined($thes_symbols{$num})) {
858 print THOUT "$num:$thes_symbols{$num}:$text\n";
859 } else {
860 print THOUT "$num:untranslated:$text\n";
861 }
862 }
863 close TH;
864 close THOUT;
865 }
866
867
868
869
870}
871
872
873# renumber_phrases
874#
875# Prepare the phrases file to be input to mgpp. The biggest problem is
876# reconciling the phrase identifiers used by the suffix program (which
877# we'll call suffix-id numbers) with the numbers used in the thesaurus
878# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
879# Phind-id numbers must be sorted by frequency of occurance.
880#
881# Start creating a set of phind-id numbers from the sorted suffix-id
882# numbers and (if required) the thesaurus-id numbers. Then add any other
883# phrases occuring in the thesaurus.
884#
885# The last thing we have to do is restore the vocabulary information to the
886# phrase file so that the phrases are stored as words, not as symbol
887# numbers.
888
889# The original phrases file looks something like this:
890# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
891# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
892# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
893# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
894
895
896sub renumber_phrases {
897 my ($self) = @_;
898
899 renumber_suffix_data($self);
900 renumber_thesaurus_data($self);
901 restore_vocabulary_data($self);
902
903}
904
905
906
907# renumber_suffix_data
908#
909# Translate phrases file to phrases.2 using phind keys instead
910# of suffix keys and sorting the expansion data.
911
912sub renumber_suffix_data {
913 my ($self) = @_;
914
915 my $verbosity = $self->{'verbosity'};
916 my $out = $self->{'outhandle'};
917 print $out "Translate phrases: suffix-ids become phind-id's\n"
918 if ($verbosity);
919
920 my $phinddir = $self->{'phinddir'};
921 my $infile = &util::filename_cat($phinddir, 'phrases');
922 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
923
924 # Read the phrase file. Calculate initial set of phind-id
925 # numbers and store (suffixid -> frequency) relation.
926
927 my %suffixtophind;
928 my @phindfrequency;
929 my (@fields, $suffixid);
930 my $nextphind = 1;
931
932 open(IN, "<$infile");
933 while(<IN>) {
934
935 chomp;
936 @fields = split(/:/, $_);
937
938 # get next suffixid and phindid
939 $suffixid = shift @fields;
940 $suffixtophind{$suffixid} = $nextphind;
941
942 # store total frequency
943 shift @fields;
944 $totalfrequency[$nextphind] = shift @fields;
945
946 $nextphind++;
947 }
948 close IN;
949
950
951 # Translate phrases file to phrases.2. Use phind keys (not suffix
952 # keys), sort expansion and document occurance data in order of
953 # descending frequency..
954 open(IN, "<$infile");
955 open(OUT, ">$outfile");
956
957 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
958 my (@documwents, @newexp, $k, $n);
959 my $linenumber = 0;
960
961 while(<IN>) {
962
963 # read the line
964 chomp;
965 @fields = split(/:/, $_);
966
967 # get a phrase number for this line
968 $suffixid = shift @fields;
969 die unless (defined($suffixtophind{$suffixid}));
970 $phindid = $suffixtophind{$suffixid};
971
972 # get the symbols in the phrase
973 $text = shift @fields;
974
975 # output status information
976 $linenumber++;
977 if ($verbosity > 2) {
978 if ($linenumber % 1000 == 0) {
979 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
980 }
981 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
982 }
983
984 # get the phrase frequency
985 $tf = shift @fields;
986
987 # get the number of expansions
988 $countexp = shift @fields;
989
990 # get the expansions, convert them into phind-id numbers, and sort them
991 $expansions = shift @fields;
992 @newexp = ();
993 foreach $k (split(/,/, $expansions)) {
994 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
995 $n = $suffixtophind{$k};
996 push @newexp, $n;
997 }
998 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
999
1000 # get the number of documents
1001 $countdocs = shift @fields;
1002
1003 # get the documents and sort them
1004 $documents = shift @fields;
1005 $documents =~ s/d//g;
1006 @documents = split(/;/, $documents);
1007 @documents = sort by_doc_frequency @documents;
1008
1009 # output the phrase data
1010 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1011 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1012
1013 }
1014
1015 close IN;
1016 close OUT;
1017}
1018
1019
1020# renumber_thesaurus_data
1021#
1022# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1023
1024sub renumber_thesaurus_data {
1025 my ($self) = @_;
1026
1027 my $out = $self->{'outhandle'};
1028 my $verbosity = $self->{'verbosity'};
1029 my $thesaurus = $self->{'thesaurus'};
1030
1031 my $phinddir = $self->{'phinddir'};
1032 my $infile = &util::filename_cat($phinddir, "phrases.2");
1033 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1034
1035
1036 # If no thesaurus is defined, simply move the phrases file.
1037 if (!$thesaurus) {
1038 print $out "Translate phrases.2: no thesaurus data\n"
1039 if ($verbosity);
1040 &util::mv($infile, $outfile);
1041 return;
1042 }
1043
1044 print $out "Translate phrases.2: add thesaurus data\n"
1045 if ($verbosity);
1046
1047 # 1.
1048 # Read thesaurus file and store (symbols->thesaurusid) mapping
1049 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1050 my %symbolstothesid;
1051 my (@fields, $thesid, $symbols);
1052
1053 open(TH, "<$thesaurusfile");
1054
1055 while (<TH>) {
1056
1057 chomp;
1058 @fields = split(/:/, $_);
1059
1060 # get id and text
1061 $thesid = shift @fields;
1062 $symbols = shift @fields;
1063 $symbolstothesid{$symbols} = $thesid;
1064 }
1065 close TH;
1066
1067 # 2.
1068 # Read phrases file to find thesaurus entries that already
1069 # have a phindid. Store their phind-ids for later translation.
1070 my %thesaurustophindid;
1071 my ($phindid);
1072
1073 open(IN, "<$infile");
1074
1075 while(<IN>) {
1076
1077 chomp;
1078 @fields = split(/:/, $_);
1079
1080 # phindid and symbols for this line
1081 $phindid = shift @fields;
1082 $symbols = shift @fields;
1083
1084 # do we have a thesaurus id corresponding to this phrase?
1085 if (defined($symbolstothesid{$symbols})) {
1086 $thesid = $symbolstothesid{$symbols};
1087 $thesaurustophindid{$thesid} = $phindid;
1088 }
1089 }
1090 close IN;
1091
1092 undef %symbolstothesid;
1093
1094 # 3.
1095 # Create phind-id numbers for remaining thesaurus entries
1096 my $nextphindid = $phindid + 1;
1097
1098 open(TH, "<$thesaurusfile");
1099 while(<TH>) {
1100
1101 chomp;
1102 @fields = split(/:/, $_);
1103
1104 # read thesaurus-id and ensure it has a corresponding phind-id
1105 $thesid = shift @fields;
1106 if (!defined($thesaurustophindid{$thesid})) {
1107 $thesaurustophindid{$thesid} = $nextphindid;
1108 $nextphindid++;
1109 }
1110 }
1111 close TH;
1112
1113 # 4.
1114 # Translate thesaurus file, replacing thesaurus-id numbers with
1115 # phind-id numbers.
1116 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1117 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1118
1119 open(TH, "<$thesaurusfile");
1120 open(TO, ">$newthesaurusfile");
1121 while(<TH>) {
1122
1123 chomp;
1124 @fields = split(/:/, $_);
1125
1126 # phindid and symbols for this line
1127 ($thesid, $symbols, $relations) = @fields;
1128
1129 die unless ($thesid && $symbols);
1130 die unless $thesaurustophindid{$thesid};
1131 $phindid = $thesaurustophindid{$thesid};
1132
1133 # convert each part of the relation string to use phind-id numbers
1134 $newrelation = "";
1135 $linkcounter = 0;
1136 foreach $linktext (split(/;/, $relations)) {
1137 @linkdata = split(/,/, $linktext);
1138
1139 # remember the linktype (e.g. BT, NT)
1140 $linktype = shift @linkdata;
1141 $newrelation .= "$linktype,";
1142
1143 # convert the link target identfiers
1144 foreach $link (@linkdata) {
1145 die unless (defined($thesaurustophindid{$link}));
1146 $newrelation .= "$thesaurustophindid{$link},";
1147 $linkcounter++;
1148 }
1149 $newrelation =~ s/\,$//;
1150 $newrelation .= ";";
1151 }
1152 $newrelation .= ":";
1153
1154 print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1155 }
1156 close TH;
1157 close TO;
1158
1159 undef %thesaurustophindid;
1160
1161 # 5.
1162 # Read thesaurus data (in phind-id format) into memory
1163 my %thesaurusdata;
1164
1165 open(TH, "<$newthesaurusfile");
1166 while(<TH>) {
1167 chomp;
1168 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1169 die unless ($phindid && $symbols);
1170 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1171 }
1172
1173 # 6.
1174 # Add thesaurus data to phrases file
1175 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1176 my (@documwents, @newexp, $k, $n);
1177 my $linenumber = 0;
1178
1179 open(IN, "<$infile");
1180 open(OUT, ">$outfile");
1181
1182 # Update existing phrases
1183 while(<IN>) {
1184
1185 chomp;
1186 @fields = split(/:/, $_);
1187
1188 # get data for this line
1189 $phindid = shift @fields;
1190
1191 # output the phrase data, with thesaurus information
1192 print OUT "$phindid:", join(":", @fields);
1193
1194 # add thesaurus data
1195 if (defined($thesaurusdata{$phindid})) {
1196 @fields = split(/:/, $thesaurusdata{$phindid});
1197 shift @fields;
1198 $linkcounter = shift @fields;
1199 $relations = shift @fields;
1200
1201 print OUT ":$linkcounter:$relations";
1202 $thesaurusdata{$phindid} = "";
1203 }
1204 print OUT "\n";
1205 }
1206 close IN;
1207
1208 # Add phrases that aren't already in the file
1209 foreach $phindid (sort numerically keys %thesaurusdata) {
1210 next unless ($thesaurusdata{$phindid});
1211
1212 @fields = split(/:/, $thesaurusdata{$phindid});
1213 $symbols = shift @fields;
1214 $linkcounter = shift @fields;
1215 $relations = shift @fields;
1216
1217 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1218 }
1219 close OUT;
1220
1221}
1222
1223# restore_vocabulary_data
1224#
1225# Read phrases.3 and restore vocabulary information. Then write
1226# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1227# (if requested) to the saved phrases file.
1228
1229sub restore_vocabulary_data {
1230 my ($self) = @_;
1231
1232 my $out = $self->{'outhandle'};
1233 my $verbosity = $self->{'verbosity'};
1234 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1235
1236 my $phinddir = $self->{'phinddir'};
1237 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1238 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1239 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1240 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1241
1242 my $savephrases = $self->{'savephrases'};
1243
1244 # 1.
1245 # Read the vocabulary file
1246 open(V, "<$vocabfile")
1247 || die "Cannot open $vocabfile: $!";
1248 my @symbol;
1249 my $i = 1;
1250 while(<V>) {
1251 chomp;
1252 $symbol[$i++] = $_;
1253 }
1254
1255
1256 # 2.
1257 # Translate phrases.3 to MGPP input files
1258 my ($key, $text, $word);
1259 my @fields;
1260 my $linenumber = 0;
1261
1262 open(IN, "<$infile");
1263 open(DATA, ">$datafile");
1264 open(WORD, ">$wordfile");
1265
1266 # Save the phrases in a separate text file
1267 if ($savephrases) {
1268 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1269 open(SAVE, ">$savephrases");
1270 }
1271
1272 while(<IN>) {
1273
1274 # read the line
1275 chomp;
1276 $line = $_;
1277 @fields = split(/:/, $line);
1278
1279 # get a phrase number for this line
1280 $key = shift @fields;
1281
1282 # restore the text of the phrase
1283 $text = shift @fields;
1284 $text =~ s/s(\d+)/$symbol[$1]/g;
1285 if ($text =~ / /) {
1286 $word = "";
1287 } elsif ($text ne 'untranslated') {
1288 $word = $text;
1289 }
1290
1291 # output the phrase data
1292 print DATA "<Document>";
1293 print DATA "$key:$text:", join(":", @fields), ":\n";
1294
1295 # output the word index search data
1296 print WORD "<Document>$word\n";
1297
1298 # output the phrases to a text file
1299 if ($savephrases) {
1300 print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1301 }
1302 }
1303 close IN;
1304 close WORD;
1305 close DATA;
1306 close SAVE if ($savephrases);
1307
1308}
1309
1310
1311
1312# sort routines used to renumber phrases
1313
1314sub numerically { $a <=> $b }
1315
1316sub by_doc_frequency {
1317 my $fa = 1;
1318 if ($a =~ /,/) {
1319 $fa = $a;
1320 $fa =~ s/\d+,//;
1321 }
1322 my $fb = 1;
1323 if ($b =~ /,/) {
1324 $fb = $b;
1325 $fb =~ s/\d+,//;
1326 }
1327
1328 return ($fb <=> $fa);
1329}
1330
13311;
Note: See TracBrowser for help on using the repository browser.