source: trunk/gsdl/perllib/classify/phind.pm@ 1883

Last change on this file since 1883 was 1883, checked in by paynter, 23 years ago

Supports new parameters of suffix program and new stopword file locations.
Filtering based on language now works. print_usage updated.

  • Property svn:keywords set to Author Date Id Revision
File size: 37.6 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin
30#
31# options are:
32# -button Name The label for the classifiers button in the
33# navigation bar (defaults to "Phrase").
34# -title Title The metadata field used to describe each document
35# (defaults to "Title").
36# -text fields The text used to build the phrase hierarchy
37# (defaults to "section:Title,section:text").
38# -phinddir directory Location of phind index files
39# -verbosity num Control amount of output
40# -untidy Do not clean up intermediate files
41# -suffixmode num Mode of suffix program (0 = all phrases, 1 = stopword)
42# -savephrases filename If set, phrase infomation will be stored in filename
43# as text. (By defualt, it is not set.)
44# -thesaurus name Name of a thesaurus stred in phind format in etc dir.
45
46# How a classifier works.
47#
48# For each classifier requested in the collect.cfg file, buildcol.pl creates
49# a new classifier object (such as the one defined in theis file) and later
50# passes each document object to the classifier in turn for classification.
51#
52# Four functions are used:
53#
54# 1. "new" is called before the documents are processed to set up the
55# classifier.
56#
57# 2. "init" is called after buildcol.pl has created the indexes etc but
58# before the documents are classified in order that the classifier might
59# set any variables it requires, etc.
60#
61# 3. "classify" is called once for each document object. The classifier
62# "classifies" each document and updates its local data accordingly.
63#
64# 4. "get_classify_info" is called after every document has been
65# classified. It collates the information about the documents and
66# stores a reference to the classifier so that Greenstone can later
67# display it.
68
69
70package phind;
71
72use BasClas;
73use util;
74
75sub BEGIN {
76 @ISA = ('BasClas');
77}
78
79# Define delimiter symbols - this should be abstracted out someplace
80my $colstart = "COLLECTIONSTART";
81my $colend = "COLLECTIONEND";
82my $doclimit = "DOCUMENTLIMIT";
83my $senlimit = "SENTENCELIMIT";
84my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
85
86
87sub print_usage {
88 print STDERR "
89 usage: classify phind [options]
90
91 options:
92 -text Fields The text used to build the phrase hierarchy.
93 (default: 'section:Title,section:text')
94
95 -title Title The metadata field used to describe each document.
96 (default: 'Title')
97
98 -button Name The label for the classifier screen and button in
99 navigation bar.
100 (default: 'Phrase')
101
102 -language Regex Language or languages to use building hierarchy.
103 Languages are identified by two-letter country codes
104 like en (English), es (Spanish), and fr (French).
105 Language is a regular expression, so 'en|fr' (English or
106 French) and '..' (match any language) are valid.
107 (default: 'en'.)
108
109 -savephrases File If set, the phrase infomation will be stored in
110 the given file as text. It is probably a good idea
111 to use an absolute path.
112 (defualt: not set)
113
114 -suffixmode N The smode parameter to the phrase extraction program. A
115 value of 0 means that stopwords are ignored, and of 1
116 means that stopwords are used.
117 (default: 1)
118
119 -thesaurus Name Name of a thesaurus stored in phind format in the
120 collection's etc directory.
121 (default: not set)
122
123 -untidy Don't remove working files.
124
125"; }
126
127# Create a new phind browser based on collect.cfg
128
129sub new {
130 my $class = shift (@_);
131 my $self = new BasClas($class, @_);
132
133 my $out = $self->{'outhandle'};
134
135
136 # Phind installation check
137 # The phind phrase browser is research software and is not installed
138 # by defualt. If the user attepts to use it we warn them that it's a
139 # bit dodgy, then tell them how to install it. If they can do that
140 # and get all the files in place, then we let them proceed.
141
142 print $out "Checking Phind phrase browser requirements...\n";
143
144 # Make sure we're not in windows
145 if ($ENV{'GSDLOS'} =~ /windows/i) {
146 print STDERR "Sorry - Phind currently only works under Unix";
147 exit(1);
148 }
149
150 # Ensure the Phind generate scripts are in place
151 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
152 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
153
154 if (!(-e $file1)) {
155 print STDERR "The phind \"suffix\" program is not installed. ";
156 print STDERR "To install it, change to the directory\n";
157 print STDERR " $src\n";
158 print STDERR "and type \"make install-phind\".\n\n";
159 exit(1);
160 }
161
162 # Ensure the Phind CGI script is in place
163 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
164 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
165
166 if (!(-e $file1)) {
167 print STDERR "The phind CGI program is not installed. ";
168 print STDERR "To install it, change to the directory\n";
169 print STDERR " $src\n";
170 print STDERR "and type \"make install-phind\".\n\n";
171 exit(1);
172 }
173
174 # Ensure the Phind Java applet is in place
175 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
176 $file1 = &util::filename_cat($src, "Phind.class");
177
178 if (!(-e $file1)) {
179 print STDERR "The phind Java classes are not compiled. ";
180 print STDERR "To compile them, change to the directory\n";
181 print STDERR " $src\n";
182 print STDERR "and use your Java compiler to compile Phind.java.\n";
183 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
184 exit(1);
185 }
186
187 # Parse classifier arguments
188 my $builddir = "";
189 my $phinddir = "";
190 if (!parsargv::parse(\@_,
191 q^text/.*/section:Title,section:text^, \$self->{'indexes'},
192 q^title/.*/Title^, \$self->{'titlefield'},
193 q^button/.*/Phrase^, \$self->{'buttonname'},
194 q^language/.*/en^, \$language,
195 q^builddir/.*/^, \$builddir,
196 q^savephrases/\d/0^, \$self->{'savephrases'},
197 q^suffixmode/\d/1^, \$self->{'suffixmode'},
198 q^thesaurus/.*/^, \$self->{'thesaurus'},
199 q^untidy^, \$self->{'untidy'},
200 "allow_extra_options")) {
201
202 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n";
203 &print_usage();
204 die "\n";
205 }
206
207 # classifier information
208 $self->{'collection'} = $ENV{'GSDLCOLLECTION'};
209
210 # limit languages
211 $self->{'language_exp'} = $language;
212
213 # collection directories
214 $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
215 if (!$builddir) {
216 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
217 }
218 $self->{'builddir'} = $builddir;
219 $self->{'phinddir'} = &util::filename_cat($builddir, "phind");
220
221 return bless $self, $class;
222}
223
224
225# Initialise the phind classifier
226
227sub init {
228 my $self = shift (@_);
229
230 # ensure we have a build directory
231 my $builddir = $self->{'builddir'};
232 die unless (-e "$builddir");
233
234 # create phind directory
235 my $phinddir = $self->{'phinddir'};
236 if (-e "$phinddir") {
237 &util::rm_r("$phinddir");
238 }
239 &util::mk_dir("$phinddir");
240
241 # open filehandles for documents and text
242 my $clausefile = &util::filename_cat("$phinddir", "clauses");
243 &util::rm($clausefile) if (-e $clausefile);
244 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
245 $self->{'txthandle'} = TEXT;
246
247 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
248 &util::rm($docfile) if (-e $docfile);
249 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
250 $self->{'dochandle'} = DOCS;
251
252}
253
254# Classify each document.
255#
256# Each document is passed here in turn. The classifier extracts the
257# text of each and stores it in the clauses file. Document details are
258# stored in the docs.txt file.
259
260sub classify {
261 my $self = shift (@_);
262 my ($doc_obj) = @_;
263
264 my $verbosity = $self->{'verbosity'};
265 my $top_section = $doc_obj->get_top_section();
266
267 my $titlefield = $self->{'titlefield'};
268
269 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
270 print "process: $title\n" if ($verbosity > 2);
271
272 # Only consider the file if it is in the correct language
273 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
274 my $phrlanguage = $self->{'language_exp'};
275 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
276
277 # record this file
278 my $total++;
279 print "file $total: $file\n" if ($self->{'$verbosity'});
280
281
282 # Store document details
283 my $OID = $doc_obj->get_OID();
284 $OID = "NULL" unless defined $OID;
285 my $dochandle = $self->{'dochandle'};
286 print $dochandle "<Document>\t$OID\t$title\n";
287
288 # Store the text occuring in this object
289
290 # output the document delimiter
291 my $txthandle = $self->{'txthandle'};
292 print $txthandle "$doclimit\n";
293
294 # iterarate over the required indexes and store their text
295 my $indexes = $self->{'indexes'};
296 my $text = "";
297 my ($part, $level, $field, $section, $data);
298
299 foreach $part (split(/,/, $indexes)) {
300
301 # Each field has a level and a data element ((e.g. document:Title)
302 ($level, $field) = split(/:/, $part);
303 die unless ($level && $field);
304
305 # Extract the text from every section
306 # (In phind, document:text and section:text are equivalent)
307 if ($field eq "text") {
308 $data = "";
309 $section = $doc_obj->get_top_section();
310 while (defined($section)) {
311 $data .= $doc_obj->get_text($section) . "\n";
312 $section = $doc_obj->get_next_section($section);
313 }
314 $text .= convert_gml_to_tokens($data) . "\n";
315 }
316
317 # Extract a metadata field from a document
318 elsif ($level eq "document") {
319 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
320 $text .= convert_gml_to_tokens($data) . "\n";
321 }
322
323 # Extract metadata from every section in a document
324 elsif ($level eq "section") {
325 $data = "";
326 $section = $doc_obj->get_top_section();
327 while (defined($section)) {
328 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
329 $section = $doc_obj->get_next_section($section);
330 }
331 $text .= convert_gml_to_tokens($data) . "\n";
332 }
333
334 # Some sort of specification which I don't understand
335 else {
336 die "Unknown level ($level) in phind index ($part)\n";
337 }
338
339 }
340
341 # output the text
342 $text =~ tr/\n//s;
343 print $txthandle "$text";
344
345}
346
347
348
349# Construct the classifier from the information already gathered
350#
351# When get_classify_info is called, the clauses and docs.txt files have
352# already been constructed in the phind directory. This function will
353# translate them into compressed, indexed MGPP files that can be read by
354# the phindcgi script. It will also register our classifier so that it
355# shows up in the navigation bar.
356
357sub get_classify_info {
358 my $self = shift (@_);
359
360 my $verbosity = $self->{'verbosity'};
361 my $out = $self->{'outhandle'};
362 my $phinddir = $self->{'phinddir'};
363
364 if ($verbosity) {
365 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
366 }
367
368 # Construct phind indexes
369 my $suffixmode = $self->{'suffixmode'};
370 my ($command, $status);
371
372 # Generate the vocabulary, symbol statistics, and numbers file
373 # from the clauses file
374 print $out "\nExtracting vocabulary and statistics\n" if $verbosity;
375 &extract_vocabulary($self);
376
377 # Use the suffix program to generate the phind/phrases file
378 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
379 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out);
380
381 # Create the phrase file and put phrase numbers in phind/phrases
382 print $out "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
383 &renumber_phrases($self);
384
385 # Create the mg phrase database
386 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
387 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
388 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
389
390 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
391 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
392 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
393 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
394
395 print $out "\nCreating phrase databases\n";
396 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
397 my $mg_stem = "pdata";
398
399 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
400 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
401 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
402
403 # create the mg index of words
404 print $out "\nCreating word-level search indexes\n";
405 $mg_input = &util::filename_cat($phinddir, "pword.txt");
406 $mg_stem = "pword";
407
408 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity, $out);
409 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
410 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity, $out);
411 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity, $out);
412 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity, $out);
413 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity, $out);
414
415 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity, $out);
416 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity, $out);
417 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity, $out);
418
419 # create the mg document information database
420 print $out "\nCreating document information databases\n";
421 $mg_input = &util::filename_cat($phinddir, "docs.txt");
422 $mg_stem = "docs";
423
424 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity, $out);
425 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity, $out);
426 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity, $out);
427
428
429 # Tidy up stray files
430 if (!$self->{'untidy'}) {
431 print $out "\nCleaning up\n" if ($verbosity > 2);
432 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
433 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
434 "$phinddir/phrases", "$phinddir/phrases.3", "$phinddir/docs.txt",
435 "$phinddir/pdata.txt", "$phinddir/pword.txt");
436 my $outfile = 1;
437 while (-e "$phinddir/outPhrase.$outfile") {
438 &util::rm("$phinddir/outPhrase.$outfile");
439 $outfile++;
440 }
441 }
442
443
444 # Insert the classifier into.... what?
445 my $collection = $self->{'collection'};
446 my $url = "library?a=p&p=phind&c=$collection";
447
448 my %classifyinfo = ('thistype'=>'Invisible',
449 'childtype'=>'Phind',
450 'Title'=>$self->{'buttonname'},
451 'contains'=>[]);
452
453 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
454 return \%classifyinfo;
455}
456
457
458
459sub convert_gml_to_tokens {
460
461 $_ = shift @_;
462
463 # FIRST, remove GML tags
464
465 # Replace all whitespace with a simple space
466 s/\s+/ /gs;
467
468 # Remove everything that is in a tag
469 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
470 s/\s*<br>\s*/ LINEBREAK /isg;
471 s/<[^>]*>/ /sg;
472
473 # Now we have the text, but it may contain HTML
474 # elements coded as &gt; etc. Remove these tags.
475 s/&lt;/</sg;
476 s/&gt;/>/sg;
477
478 s/\s+/ /sg;
479 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
480 s/\s*<br>\s*/ LINEBREAK /isg;
481 s/<[^>]*>/ /sg;
482
483 # remove &amp; and other miscellaneous markup tags
484 s/&amp;/&/sg;
485 s/&lt;/</sg;
486 s/&gt;/>/sg;
487 s/&amp;/&/sg;
488
489 # replace<p> and <br> placeholders with carriage returns
490 s/PARAGRAPHBREAK/\n/sg;
491 s/LINEBREAK/\n/sg;
492
493
494 # Exceptional punctuation
495 #
496 # We make special cases of some punctuation
497
498 # remove any apostrophe that indicates omitted letters
499 s/(\w+)\'(\w*\s)/ $1$2 /g;
500
501 # remove period that appears in a person's initals
502 s/\s([A-Z])\./ $1 /g;
503
504 # replace hyphens in hypheanted words and names with a space
505 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
506
507
508 # Convert the remaining text to "clause format",
509 # This means removing all excess punctuation and garbage text,
510 # normalising valid punctuation to fullstops and commas,
511 # then putting one cluse on each line.
512
513 # Insert newline when the end of a sentence is detected
514 # (delimter is: "[\.\?\!]\s")
515 s/\s*[\.\?\!]\s+/\n/g;
516
517 # split numbers after four digits
518 s/(\d\d\d\d)/$1 /g;
519
520 # split words after 32 characters
521
522 # squash repeated punctuation
523 tr/A-Za-z0-9 //cs;
524
525 # save email addresses
526 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
527
528 # normalise clause breaks (mostly punctuation symbols) to commas
529 s/[^A-Za-z0-9 \n]+/ , /g;
530
531 # Remove repeated commas, and replace with newline
532 s/\s*,[, ]+/\n/g;
533
534 # remove extra whitespace
535 s/ +/ /sg;
536 s/^\s+//mg;
537 s/\s*$/\n/mg;
538
539 # remove lines that contain one word or less
540 s/^\w*$//mg;
541 s/^\s*$//mg;
542 tr/\n//s;
543
544 return $_;
545}
546
547
548# Execute a system command
549
550sub execute {
551 my ($command, $verbosity, $outhandle) = @_;
552 print $outhandle "Executing: $command\n" if ($verbosity > 2);
553 my $status = system($command);
554 if ($status != 0) {
555 print STDERR "phind - Error executing $command: $!\n";
556 exit($status);
557 }
558}
559
560
561# Generate the vocabulary, symbol statistics, and numbers file from the
562# clauses file. This is legacy code, so is a bit messy and probably wont
563# run under windows.
564
565sub extract_vocabulary {
566 my ($self) = @_;
567
568 my $verbosity = $self->{'verbosity'};
569 my $out = $self->{'outhandle'};
570
571 my $collectiondir = $self->{'collectiondir'};
572 my $phinddir = $self->{'phinddir'};
573
574 my $language_exp = $self->{'language_exp'};
575
576 my ($w, $l, $line, $word);
577
578 my ($first_delimiter, $last_delimiter,
579 $first_stopword, $last_stopword,
580 $first_extractword, $last_extractword,
581 $first_contentword, $last_contentword,
582 $phrasedelimiter);
583
584 my $thesaurus = $self->{'thesaurus'};
585 my ($thesaurus_links, $thesaurus_terms,
586 %thesaurus, $first_thesaurusword, $last_thesaurusword);
587
588 my %symbol;
589 my (%freq);
590
591 print $out "Calculating vocabulary\n" if ($verbosity > 1);
592
593 # Read and store the stopwords
594 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "stopwords");
595 my $stopword_files = ();
596 my ($language, $language_dir, $file, $file_name);
597 my %stopwords;
598
599 # Examine each directory in the stopword directory
600 opendir(STOPDIR, $stopdir);
601 foreach $language (readdir STOPDIR) {
602
603 # Ignore entries that do not match the classifier's language
604 next unless ($language =~ /$language_exp/);
605 $language_dir = &util::filename_cat($stopdir, $language);
606 next unless (-d "$language_dir");
607
608 opendir(LANGDIR, $language_dir);
609 foreach $file (readdir LANGDIR) {
610
611 # Ignore entries that are not stopword files
612 next unless ($file =~ /sw$/);
613 $file_name = &util::filename_cat($language_dir, $file);
614 next unless (-f "$file_name");
615
616 # Read the stopwords
617 open(STOPFILE, "<$file_name");
618 while (<STOPFILE>) {
619 s/^\s+//;
620 s/\s.*//;
621 $word = $_;
622 $l = lc($word);
623 $stopwords{$l} = $word;
624 }
625 close STOPFILE;
626
627 }
628 }
629
630 # Read thesaurus information
631 if ($thesaurus) {
632
633 # Ensure both link and term files exist
634 $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
635 die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
636 $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
637 die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
638
639 # Read the thesaurus terms
640 open(TH, "<$thesaurus_terms");
641 while(<TH>) {
642 s/^\d+ //;
643 s/\(.*\)//;
644 foreach $w (split(/\s+/, $_)) {
645 $thesaurus{lc($w)} = $w;
646 }
647 }
648 close TH;
649 }
650
651 # Read words in the text and count occurences
652 open(TXT, "<$phinddir/clauses");
653 my @words;
654
655 while(<TXT>) {
656 $line = $_;
657 next unless ($line =~ /./);
658
659 @words = split(/\s+/, $line);
660 foreach $w (@words) {
661 $l = lc($w);
662 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
663 $freq{$w}++;
664 }
665 $freq{$senlimit}++;
666 }
667
668 # Calculate the "best" form of each word
669 my (%bestform, %totalfreq, %bestfreq);
670
671 foreach $w (sort (keys %freq)) {
672 $l = lc($w);
673
674 # totalfreq is the number of times a term appears in any form
675 $totalfreq{$l} += $freq{$w};
676
677 if (defined $stopwords{$l}) {
678 $bestform{$l} = $stopwords{$l};
679
680 } elsif (defined $thesaurus{$l}) {
681 $bestform{$l} = $thesaurus{$l};
682
683 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
684 $bestfreq{$l} = $freq{$w};
685 $bestform{$l} = $w;
686 }
687 }
688
689 undef %freq;
690 undef %bestfreq;
691
692
693 # Assign symbol numbers to tokens
694 my $nextsymbol = 1;
695 my (@vocab);
696
697 # Delimiters
698 $first_delimiter = 1;
699
700 foreach $word (@delimiters) {
701
702 $word = lc($word);
703 $bestform{$word} = uc($word);
704 $vocab[$nextsymbol] = $word;
705 $symbol{$word} = $nextsymbol;
706 $nextsymbol++;
707 }
708 $last_delimiter = $nextsymbol - 1;
709
710 # Stopwords
711 $first_stopword = $nextsymbol;
712
713 foreach my $word (sort keys %stopwords) {
714
715 # don't incluse stopword unless it occurs in the text
716 $word = lc($word);
717 next unless ($totalfreq{$word});
718 next if ($symbol{$word});
719
720 $vocab[$nextsymbol] = $word;
721 $symbol{$word} = $nextsymbol;
722 $nextsymbol++;
723 }
724 $last_stopword = $nextsymbol - 1;
725 $first_contentword = $nextsymbol;
726
727 # Thesaurus terms
728 if ($thesaurus) {
729 $first_thesaurusword = $nextsymbol;
730
731 foreach my $word (sort keys %thesaurus) {
732
733 $word = lc($word);
734 next if ($symbol{$word});
735 $bestform{$word} = $thesaurus{$word};
736
737 $vocab[$nextsymbol] = $word;
738 $symbol{$word} = $nextsymbol;
739 $nextsymbol++;
740
741 }
742 $last_thesaurusword = $nextsymbol - 1;
743 }
744
745 # Other content words
746 $first_extractword = $nextsymbol;
747
748 foreach my $word (sort (keys %bestform)) {
749
750 next if ($symbol{$word});
751
752 $vocab[$nextsymbol] = $word;
753 $symbol{$word} = $nextsymbol;
754 $nextsymbol++;
755 }
756 $last_extractword = $nextsymbol - 1;
757 $last_contentword = $nextsymbol - 1;
758
759
760 # Outut the words
761 print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
762 open(VOC, ">$phinddir/clauses.vocab");
763
764 for (my $i = 1; $i < $nextsymbol; $i++) {
765 $w = $vocab[$i];
766
767 print VOC "$bestform{$w}\n";
768 $totalfreq{$w} = 0 unless ($totalfreq{$w});
769 }
770 close VOC;
771
772
773 # Create statistics file
774 # Output statistics about the vocablary
775 print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
776 &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
777
778 open(STAT, ">$phinddir/clauses.stats")
779 || die "Cannot open $phinddir/clauses.stats: $!";
780
781 print STAT "first_delimiter $first_delimiter\n";
782 print STAT "last_delimiter $last_delimiter\n";
783 print STAT "first_stopword $first_stopword\n";
784 print STAT "last_stopword $last_stopword\n";
785 if ($thesaurus) {
786 print STAT "first_thesaurusword $first_thesaurusword\n";
787 print STAT "last_thesaurusword $last_thesaurusword\n";
788 }
789 print STAT "first_extractword $first_extractword\n";
790 print STAT "last_extractword $last_extractword\n";
791 print STAT "first_contentword $first_contentword\n";
792 print STAT "last_contentword $last_contentword\n";
793 print STAT "first_symbol $first_delimiter\n";
794 print STAT "last_symbol $last_contentword\n";
795 print STAT "first_word $first_stopword\n";
796 print STAT "last_word $last_contentword\n";
797 close STAT;
798
799 undef @vocab;
800
801
802 # Create numbers file
803 # Save text as symbol numbers
804 print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
805
806 open(TXT, "<$phinddir/clauses");
807 open(NUM, ">$phinddir/clauses.numbers");
808
809 $phrasedelimiter = $symbol{lc($senlimit)};
810 print NUM "$symbol{lc($colstart)}\n";
811
812 # set up the special symbols that delimit documents and sentences
813 while(<TXT>) {
814
815 # split sentence into a list of tokens
816 $line = $_;
817 next unless ($line =~ /./);
818 @words = split(/\s+/, $line);
819
820 # output one token at a time
821 foreach $word (@words) {
822 $word = lc($word);
823 print NUM "$symbol{$word}\n";
824 }
825
826 # output phrase delimiter
827 print NUM "$phrasedelimiter\n";
828 }
829
830 print NUM "$symbol{lc($colend)}\n";
831 close NUM;
832
833 # Save thesaurus data in one convienient file
834 if ($thesaurus) {
835
836 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
837
838
839 print $out "Saving thesaurus as numbers in $thesaurusfile\n"
840 if ($verbosity > 1);
841
842 # Read the thesaurus terms
843 my ($num, $text, %thes_symbols);
844
845 open(TH, "<$thesaurus_terms");
846 while(<TH>) {
847 chomp;
848 @words = split(/\s+/, $_);
849 $num = shift @words;
850 $text = "";
851
852 # translate words into symbol numbers
853 foreach $word (@words) {
854 $word = lc($word);
855 if ($symbol{$word}) {
856 $text .= "s$symbol{$word} ";
857 } elsif ($verbosity) {
858 print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
859 }
860 }
861 $text =~ s/ $//;
862 $thes_symbols{$num} = $text;
863 }
864 close TH;
865
866 # Read the thesaurus links and write the corresponding data
867 open(TH, "<$thesaurus_links");
868 open(THOUT, ">$thesaurusfile");
869
870 while(<TH>) {
871 chomp;
872 ($num, $text) = split(/:/, $_);
873
874 if (defined($thes_symbols{$num})) {
875 print THOUT "$num:$thes_symbols{$num}:$text\n";
876 } else {
877 print THOUT "$num:untranslated:$text\n";
878 }
879 }
880 close TH;
881 close THOUT;
882 }
883
884
885
886
887}
888
889
890# renumber_phrases
891#
892# Prepare the phrases file to be input to mgpp. The biggest problem is
893# reconciling the phrase identifiers used by the suffix program (which
894# we'll call suffix-id numbers) with the numbers used in the thesaurus
895# (theesaurus-id) to create a ciommon set of phind id numbers (phind-id).
896# Phind-id numbers must be sorted by frequency of occurance.
897#
898# Start creating a set of phind-id numbers from the sorted suffix-id
899# numbers and (if required) the thesaurus-id numbers. Then add any other
900# phrases occuring in the thesaurus.
901#
902# The last thing we have to do is restore the vocabulary information to the
903# phrase file so that the phrases are stored as words, not as symbol
904# numbers.
905
906# The original phrases file looks something like this:
907# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
908# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
909# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
910# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
911
912
913sub renumber_phrases {
914 my ($self) = @_;
915
916 renumber_suffix_data($self);
917 renumber_thesaurus_data($self);
918 restore_vocabulary_data($self);
919
920}
921
922
923
924# renumber_suffix_data
925#
926# Translate phrases file to phrases.2 using phind keys instead
927# of suffix keys and sorting the expansion data.
928
929sub renumber_suffix_data {
930 my ($self) = @_;
931
932 my $verbosity = $self->{'verbosity'};
933 my $out = $self->{'outhandle'};
934 print $out "Translate phrases: suffix-ids become phind-id's\n"
935 if ($verbosity);
936
937 my $phinddir = $self->{'phinddir'};
938 my $infile = &util::filename_cat($phinddir, 'phrases');
939 my $outfile = &util::filename_cat($phinddir, 'phrases.2');
940
941 # Read the phrase file. Calculate initial set of phind-id
942 # numbers and store (suffixid -> frequency) relation.
943
944 my %suffixtophind;
945 my @phindfrequency;
946 my (@fields, $suffixid);
947 my $nextphind = 1;
948
949 open(IN, "<$infile");
950 while(<IN>) {
951
952 chomp;
953 @fields = split(/:/, $_);
954
955 # get next suffixid and phindid
956 $suffixid = shift @fields;
957 $suffixtophind{$suffixid} = $nextphind;
958
959 # store total frequency
960 shift @fields;
961 $totalfrequency[$nextphind] = shift @fields;
962
963 $nextphind++;
964 }
965 close IN;
966
967
968 # Translate phrases file to phrases.2. Use phind keys (not suffix
969 # keys), sort expansion and document occurance data in order of
970 # descending frequency..
971 open(IN, "<$infile");
972 open(OUT, ">$outfile");
973
974 my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
975 my (@documwents, @newexp, $k, $n);
976 my $linenumber = 0;
977
978 while(<IN>) {
979
980 # read the line
981 chomp;
982 @fields = split(/:/, $_);
983
984 # get a phrase number for this line
985 $suffixid = shift @fields;
986 die unless (defined($suffixtophind{$suffixid}));
987 $phindid = $suffixtophind{$suffixid};
988
989 # get the symbols in the phrase
990 $text = shift @fields;
991
992 # output status information
993 $linenumber++;
994 if ($verbosity > 2) {
995 if ($linenumber % 1000 == 0) {
996 print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
997 }
998 print $out "$num: $key\t($text)\n" if ($verbosity > 3);
999 }
1000
1001 # get the phrase frequency
1002 $tf = shift @fields;
1003
1004 # get the number of expansions
1005 $countexp = shift @fields;
1006
1007 # get the expansions, convert them into phind-id numbers, and sort them
1008 $expansions = shift @fields;
1009 @newexp = ();
1010 foreach $k (split(/,/, $expansions)) {
1011 die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
1012 $n = $suffixtophind{$k};
1013 push @newexp, $n;
1014 }
1015 @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
1016
1017 # get the number of documents
1018 $countdocs = shift @fields;
1019
1020 # get the documents and sort them
1021 $documents = shift @fields;
1022 $documents =~ s/d//g;
1023 @documents = split(/;/, $documents);
1024 @documents = sort by_doc_frequency @documents;
1025
1026 # output the phrase data
1027 print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
1028 print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
1029
1030 }
1031
1032 close IN;
1033 close OUT;
1034}
1035
1036
1037# renumber_thesaurus_data
1038#
1039# Translate phrases.2 to phrases.3, adding thesaurus data if available.
1040
1041sub renumber_thesaurus_data {
1042 my ($self) = @_;
1043
1044 my $out = $self->{'outhandle'};
1045 my $verbosity = $self->{'verbosity'};
1046 my $thesaurus = $self->{'thesaurus'};
1047
1048 my $phinddir = $self->{'phinddir'};
1049 my $infile = &util::filename_cat($phinddir, "phrases.2");
1050 my $outfile = &util::filename_cat($phinddir, "phrases.3");
1051
1052
1053 # If no thesaurus is defined, simply move the phrases file.
1054 if (!$thesaurus) {
1055 print $out "Translate phrases.2: no thesaurus data\n"
1056 if ($verbosity);
1057 &util::mv($infile, $outfile);
1058 return;
1059 }
1060
1061 print $out "Translate phrases.2: add thesaurus data\n"
1062 if ($verbosity);
1063
1064 # 1.
1065 # Read thesaurus file and store (symbols->thesaurusid) mapping
1066 my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
1067 my %symbolstothesid;
1068 my (@fields, $thesid, $symbols);
1069
1070 open(TH, "<$thesaurusfile");
1071
1072 while (<TH>) {
1073
1074 chomp;
1075 @fields = split(/:/, $_);
1076
1077 # get id and text
1078 $thesid = shift @fields;
1079 $symbols = shift @fields;
1080 $symbolstothesid{$symbols} = $thesid;
1081 }
1082 close TH;
1083
1084 # 2.
1085 # Read phrases file to find thesaurus entries that already
1086 # have a phindid. Store their phind-ids for later translation.
1087 my %thesaurustophindid;
1088 my ($phindid);
1089
1090 open(IN, "<$infile");
1091
1092 while(<IN>) {
1093
1094 chomp;
1095 @fields = split(/:/, $_);
1096
1097 # phindid and symbols for this line
1098 $phindid = shift @fields;
1099 $symbols = shift @fields;
1100
1101 # do we have a thesaurus id corresponding to this phrase?
1102 if (defined($symbolstothesid{$symbols})) {
1103 $thesid = $symbolstothesid{$symbols};
1104 $thesaurustophindid{$thesid} = $phindid;
1105 }
1106 }
1107 close IN;
1108
1109 undef %symbolstothesid;
1110
1111 # 3.
1112 # Create phind-id numbers for remaining thesaurus entries
1113 my $nextphindid = $phindid + 1;
1114
1115 open(TH, "<$thesaurusfile");
1116 while(<TH>) {
1117
1118 chomp;
1119 @fields = split(/:/, $_);
1120
1121 # read thesaurus-id and ensure it has a corresponding phind-id
1122 $thesid = shift @fields;
1123 if (!defined($thesaurustophindid{$thesid})) {
1124 $thesaurustophindid{$thesid} = $nextphindid;
1125 $nextphindid++;
1126 }
1127 }
1128 close TH;
1129
1130 # 4.
1131 # Translate thesaurus file, replacing thesaurus-id numbers with
1132 # phind-id numbers.
1133 my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
1134 my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
1135
1136 open(TH, "<$thesaurusfile");
1137 open(TO, ">$newthesaurusfile");
1138 while(<TH>) {
1139
1140 chomp;
1141 @fields = split(/:/, $_);
1142
1143 # phindid and symbols for this line
1144 ($thesid, $symbols, $relations) = @fields;
1145
1146 die unless ($thesid && $symbols);
1147 die unless $thesaurustophindid{$thesid};
1148 $phindid = $thesaurustophindid{$thesid};
1149
1150 # convert each part of the relation string to use phind-id numbers
1151 $newrelation = "";
1152 $linkcounter = 0;
1153 foreach $linktext (split(/;/, $relations)) {
1154 @linkdata = split(/,/, $linktext);
1155
1156 # remember the linktype (e.g. BT, NT)
1157 $linktype = shift @linkdata;
1158 $newrelation .= "$linktype,";
1159
1160 # convert the link target identfiers
1161 foreach $link (@linkdata) {
1162 die unless (defined($thesaurustophindid{$link}));
1163 $newrelation .= "$thesaurustophindid{$link},";
1164 $linkcounter++;
1165 }
1166 $newrelation =~ s/\,$//;
1167 $newrelation .= ";";
1168 }
1169 $newrelation .= ":";
1170
1171 print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
1172 }
1173 close TH;
1174 close TO;
1175
1176 undef %thesaurustophindid;
1177
1178 # 5.
1179 # Read thesaurus data (in phind-id format) into memory
1180 my %thesaurusdata;
1181
1182 open(TH, "<$newthesaurusfile");
1183 while(<TH>) {
1184 chomp;
1185 ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
1186 die unless ($phindid && $symbols);
1187 $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
1188 }
1189
1190 # 6.
1191 # Add thesaurus data to phrases file
1192 my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
1193 my (@documwents, @newexp, $k, $n);
1194 my $linenumber = 0;
1195
1196 open(IN, "<$infile");
1197 open(OUT, ">$outfile");
1198
1199 # Update existing phrases
1200 while(<IN>) {
1201
1202 chomp;
1203 @fields = split(/:/, $_);
1204
1205 # get data for this line
1206 $phindid = shift @fields;
1207
1208 # output the phrase data, with thesaurus information
1209 print OUT "$phindid:", join(":", @fields);
1210
1211 # add thesaurus data
1212 if (defined($thesaurusdata{$phindid})) {
1213 @fields = split(/:/, $thesaurusdata{$phindid});
1214 shift @fields;
1215 $linkcounter = shift @fields;
1216 $relations = shift @fields;
1217
1218 print OUT ":$linkcounter:$relations";
1219 $thesaurusdata{$phindid} = "";
1220 }
1221 print OUT "\n";
1222 }
1223 close IN;
1224
1225 # Add phrases that aren't already in the file
1226 foreach $phindid (sort numerically keys %thesaurusdata) {
1227 next unless ($thesaurusdata{$phindid});
1228
1229 @fields = split(/:/, $thesaurusdata{$phindid});
1230 $symbols = shift @fields;
1231 $linkcounter = shift @fields;
1232 $relations = shift @fields;
1233
1234 print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
1235 }
1236 close OUT;
1237
1238}
1239
1240# restore_vocabulary_data
1241#
1242# Read phrases.3 and restore vocabulary information. Then write
1243# this data to the MGPP input files (pwrod.txt and pdata.txt) and
1244# (if requested) to the saved phrases file.
1245
1246sub restore_vocabulary_data {
1247 my ($self) = @_;
1248
1249 my $out = $self->{'outhandle'};
1250 my $verbosity = $self->{'verbosity'};
1251 print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
1252
1253 my $phinddir = $self->{'phinddir'};
1254 my $infile = &util::filename_cat($phinddir, 'phrases.3');
1255 my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
1256 my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
1257 my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
1258
1259 my $savephrases = $self->{'savephrases'};
1260
1261 # 1.
1262 # Read the vocabulary file
1263 open(V, "<$vocabfile")
1264 || die "Cannot open $vocabfile: $!";
1265 my @symbol;
1266 my $i = 1;
1267 while(<V>) {
1268 chomp;
1269 $symbol[$i++] = $_;
1270 }
1271
1272
1273 # 2.
1274 # Translate phrases.3 to MGPP input files
1275 my ($key, $text, $word);
1276 my @fields;
1277 my $linenumber = 0;
1278
1279 open(IN, "<$infile");
1280 open(DATA, ">$datafile");
1281 open(WORD, ">$wordfile");
1282
1283 # Save the phrases in a separate text file
1284 if ($savephrases) {
1285 print $out "Saving phrases in $savephrases\n" if ($verbosity);
1286 open(SAVE, ">$savephrases");
1287 }
1288
1289 while(<IN>) {
1290
1291 # read the line
1292 chomp;
1293 $line = $_;
1294 @fields = split(/:/, $line);
1295
1296 # get a phrase number for this line
1297 $key = shift @fields;
1298
1299 # restore the text of the phrase
1300 $text = shift @fields;
1301 $text =~ s/s(\d+)/$symbol[$1]/g;
1302 if ($text =~ / /) {
1303 $word = "";
1304 } elsif ($text ne 'untranslated') {
1305 $word = $text;
1306 }
1307
1308 # output the phrase data
1309 print DATA "<Document>";
1310 print DATA "$key:$text:", join(":", @fields), ":\n";
1311
1312 # output the word index search data
1313 print WORD "<Document>$word\n";
1314
1315 # output the phrases to a text file
1316 if ($savephrases) {
1317 print SAVE $fields[0], "\t", $fields[2], "\t", "$text\n";
1318 }
1319 }
1320 close IN;
1321 close WORD;
1322 close DATA;
1323 close SAVE if ($savephrases);
1324
1325}
1326
1327
1328
1329# sort routines used to renumber phrases
1330
1331sub numerically { $a <=> $b }
1332
1333sub by_doc_frequency {
1334 my $fa = 1;
1335 if ($a =~ /,/) {
1336 $fa = $a;
1337 $fa =~ s/\d+,//;
1338 }
1339 my $fb = 1;
1340 if ($b =~ /,/) {
1341 $fb = $b;
1342 $fb =~ s/\d+,//;
1343 }
1344
1345 return ($fb <=> $fa);
1346}
1347
13481;
Note: See TracBrowser for help on using the repository browser.