source: trunk/gsdl/perllib/classify/phind.pm@ 1646

Last change on this file since 1646 was 1646, checked in by paynter, 24 years ago

Arguments for setting suffix program parameters.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.0 KB
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind classifier plugin
30#
31# options are (example collect.cfg usage is sketched below):
32# title=Title The title field for this classification
33# text=fields The text used to build the phrase hierarchy
34# phindexdir=directory Location of phind index files
35# verbosity=num Control amount of output
36# untidy=true Do not clean up intermediate files
37# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
38# suffixsize=num Number of symbols available to suffix program
39
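#
# For illustration only (the option values here are invented, not taken
# from a shipped collection), a collect.cfg entry using these options
# might look something like:
#
#   classify phind title=Topic text=section:Title,section:text suffixmode=1
#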
40
41# How a classifier works.
42#
43# When a classifier is requested in the collect.cfg file, buildcol creates a
44# new classifier object (such as the one defined in this file) and later
45# passes each document object to the classifier in turn. Four functions are
46# used (a rough calling sequence is sketched below):
47#
48# 1. "new" is called before the documents are processed to set up the
49# classifier.
50#
51# 2. "init" is called after buildcol.pl has created the indexes etc but
52# before the documents are classified in order that the classifier might
53# set any variables it requires, etc.
54#
55# 3. "classify" is called once for each document object. The classifier
56# "classifies" each document and updates its local data accordingly.
57#
58# 4. "get_classify_info" is called after every document has been
59# classified. It collates the information about the documents and
60# stores a reference to the classifier so that Greenstone can later
61# display it.
62
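#
# As a rough sketch only (this is not code taken from buildcol.pl; @options,
# $out and @document_objects are placeholders), the calling sequence looks
# something like:
#
#   my @options = ("title=Topic", "text=section:text", $out);
#   my $classifier = new phind(@options);
#   $classifier->init();
#   foreach my $doc_obj (@document_objects) {
#       $classifier->classify($doc_obj);
#   }
#   my $info = $classifier->get_classify_info();
#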
63
64package phind;
65
66use BasClas;
67use util;
68
69sub BEGIN {
70 @ISA = ('BasClas');
71}
72
73# Define delimiter symbols - this should be abstracted out someplace
74my $colstart = "COLLECTIONSTART";
75my $colend = "COLLECTIONEND";
76my $doclimit = "DOCUMENTLIMIT";
77my $senlimit = "SENTENCELIMIT";
78my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
79
80# Create a new phind browser based on the options in collect.cfg
81
82sub new {
83 my ($class, @options) = @_;
84 my $self = new BasClas ($class, @_);
85 my $out = pop @options;
86
87 # Phind installation check
88 # The phind phrase browser is research software and is not installed
89 # by default. If the user attempts to use it we warn them that it's a
90 # bit dodgy, then tell them how to install it. If they can do that
91 # and get all the files in place, then we let them proceed.
92
93 print $out "The Phind classifier for Greenstone.\n";
94 print $out "Checking the phind phrase browser requirements...\n";
95
96 # Make sure we are not running under Windows
97 if ($ENV{'GSDLOS'} =~ /windows/i) {
98 print STDERR "Phind currently only works under Unix\n";
99 exit(1);
100 }
101
102 # Ensure the Phind generate scripts are in place
103 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
104 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
105
106 if (!(-e $file1)) {
107 print STDERR "The phind \"suffix\" program is not installed. ";
108 print STDERR "To install it, change to the directory\n";
109 print STDERR " $src\n";
110 print STDERR "and type \"make install-phind\".\n\n";
111 exit(1);
112 }
113
114 # Ensure the Phind CGI script is in place
115 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
116 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
117
118 if (!(-e $file1)) {
119 print STDERR "The phind CGI program is not installed. ";
120 print STDERR "To install it, change to the directory\n";
121 print STDERR " $src\n";
122 print STDERR "and type \"make install-phind\".\n\n";
123 exit(1);
124 }
125
126 # Ensure the Phind Java applet is in place
127 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
128 $file1 = &util::filename_cat($src, "Phind.class");
129
130 if (!(-e $file1)) {
131 print STDERR "The phind Java classes are not compiled. ";
132 print STDERR "To compile them, change to the directory\n";
133 print STDERR " $src\n";
134 print STDERR "and use your Java compiler to compile Phind.java.\n";
135 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
136 exit(1);
137 }
138
139
140 # The installation appears OK - set up the classifier
141 my $collection = $ENV{'GSDLCOLLECTION'};
142 my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"phindex");
143 my $language = "english";
144
145 my $title = "Topic";
146 my $indexes = "section:Title,section:text";
147
148 my $suffixmode = 1;
149 my $suffixsize = 40000000;
150
151 my $verbosity = 2;
152 my $untidy = 0;
153
154 # parse the options
155 foreach $option (@options) {
156
157 if ($option =~ /^text=(.*)$/i) {
158 $indexes = $1;
159 } elsif ($option =~ /^title=(.*)$/i) {
160 $title = $1;
161 } elsif ($option =~ /^phindexdir=(.*)$/i) {
162 $phindexdir = $1;
163 } elsif ($option =~ /^suffixsize=(.*)$/i) {
164 $suffixsize = $1;
165 } elsif ($option =~ /^suffixmode=(.*)$/i) {
166 $suffixmode = $1;
167 } elsif ($option =~ /^verbosity=(.*)$/i) {
168 $verbosity = $1;
169 } elsif ($option =~ /^untidy/i) {
170 $untidy = 1;
171 }
172 }
173
174
175 $self->{'collection'} = $collection;
176 $self->{'title'} = $title;
177 $self->{'indexes'} = $indexes;
178
179 $self->{'suffixmode'} = $suffixmode;
180 $self->{'suffixsize'} = $suffixsize;
181
182 $self->{'verbosity'} = $verbosity;
183 $self->{'untidy'} = $untidy;
184
185 # limit languages
186 $language =~ s/,/\|/g;
187 $self->{'language_exp'} = $language;
188 $self->{'delimiter'} = $delimiter;
189
190 # reset phindex directory
191 if (-e "$phindexdir") {
192 &util::rm_r("$phindexdir");
193 }
194 &util::mk_dir("$phindexdir");
195 $self->{'phindexdir'} = $phindexdir;
196
197 return bless $self, $class;
198}
199
200
201# Initialise the phind classifier
202
203sub init {
204 my $self = shift (@_);
205
206 # open filehandles for documents and text
207 my $phindexdir = $self->{'phindexdir'};
208
209 my $clausefile = &util::filename_cat("$phindexdir", "clauses");
210 &util::rm($clausefile) if (-e $clausefile);
211 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
212 $self->{'txthandle'} = TEXT;
213
214 my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
215 &util::rm($docfile) if (-e $docfile);
216 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
217 $self->{'dochandle'} = DOCS;
218
219}
220
221# Classify each document.
222#
223# Each document is passed here in turn. The classifier extracts the
224# text of each and stores it in the clauses file. Document details are
225# stored in the docs.txt file.
226
227sub classify {
228 my $self = shift (@_);
229 my ($doc_obj) = @_;
230
231 my $verbosity = $self->{'verbosity'};
232 my $top_section = $doc_obj->get_top_section();
233
234 my $title = $doc_obj->get_metadata_element ($top_section, "Title");
235 print "process: $title\n" if ($verbosity > 2);
236
237
238 # only consider english-language files
239 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
240 my $phrlanguage = $self->{'language_exp'};
241 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
242
243 # record this file
244 $self->{'total'}++;
245 print "file ", $self->{'total'}, ": $title\n" if ($self->{'verbosity'});
246
247
248 # Store document details
249 my $OID = $doc_obj->get_OID();
250 $OID = "NULL" unless defined $OID;
251 my $dochandle = $self->{'dochandle'};
252 print $dochandle "<Document>\t$OID\t$title\n";
253
254 # Store the text occurring in this object
255
256 # output the document delimiter
257 my $txthandle = $self->{'txthandle'};
258 print $txthandle "$doclimit\n";
259
260 # iterate over the required indexes and store their text
261 my $indexes = $self->{'indexes'};
262 my $text = "";
263 my ($part, $level, $field, $section, $data);
264
265 foreach $part (split(/,/, $indexes)) {
266
267 # Each field has a level and a data element (e.g. document:Title)
268 ($level, $field) = split(/:/, $part);
269 die unless ($level && $field);
270
271 # Extract the text from every section
272 # (In phind, document:text and section:text are equivalent)
273 if ($field eq "text") {
274 $data = "";
275 $section = $doc_obj->get_top_section();
276 while (defined($section)) {
277 $data .= $doc_obj->get_text($section) . "\n";
278 $section = $doc_obj->get_next_section($section);
279 }
280 $text .= convert_gml_to_tokens($data) . "\n";
281 }
282
283 # Extract a metadata field from a document
284 elsif ($level eq "document") {
285 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
286 $text .= convert_gml_to_tokens($data) . "\n";
287 }
288
289 # Extract metadata from every section in a document
290 elsif ($level eq "section") {
291 $data = "";
292 $section = $doc_obj->get_top_section();
293 while (defined($section)) {
294 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
295 $section = $doc_obj->get_next_section($section);
296 }
297 $text .= convert_gml_to_tokens($data) . "\n";
298 }
299
300 # Some sort of specification which I don't understand
301 else {
302 die "Unknown level ($level) in phind index ($part)\n";
303 }
304
305 }
306
307 # output the text
308 $text =~ tr/\n//s;
309 print $txthandle "$text";
310
311}
312
313
314
315# Construct the classifier from the information already gathered
316#
317# When get_classify_info is called, the clauses and docs.txt files have
318# already been constructed in the phindex directory. This function will
319# translate them into compressed, indexed MGPP files that can be read by
320# the phindcgi script. It will also register our classifier so that it
321 # shows up in the navigation bar.
322
323sub get_classify_info {
324 my $self = shift (@_);
325
326 my $verbosity = $self->{'verbosity'};
327 my $phindexdir = $self->{'phindexdir'};
328 my $language = "english";
329
330 if ($verbosity) {
331 print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
332 }
333
334 # Construct phind indexes
335 my $suffixmode = $self->{'suffixmode'};
336 my $suffixsize = $self->{'suffixsize'};
337 my ($command, $status);
338
339 # Generate the vocabulary, symbol statistics, and numbers file
340 # from the clauses file
341 print "\nExtracting vocabulary and statistics\n" if $verbosity;
342 &extract_vocabulary($phindexdir, $language, $verbosity);
343
344 # Use the suffix program to generate the phindex/phrases file
345 print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
346 &execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);
347
348 # Create the phrase file and put phrase numbers in phindex/phrases
349 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
350 &renumber_phrases("$phindexdir", $verbosity);
351
352 # Create the mg phrase database
353 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
354 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
355 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
356
357 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
358 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
359 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
360 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
361
362 print "\nCreating phrase databases\n";
363 my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
364 my $mg_stem = "pdata";
365
366 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
367 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
368 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
369
370 # create the mg index of words
371 print "\nCreating word-level search indexes\n";
372 $mg_input = &util::filename_cat($phindexdir, "pword.txt");
373 $mg_stem = "pword";
374
375 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
376 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
377 &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
378 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
379 &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
380 &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
381
382 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
383 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
384 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
385
386 # create the mg document information database
387 print "\nCreating document information databases\n";
388 $mg_input = &util::filename_cat($phindexdir, "docs.txt");
389 $mg_stem = "docs";
390
391 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
392 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
393 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
394
395
396 # Tidy up stray files
397 if (!$self->{'untidy'}) {
398 print "\nCleaning up\n" if ($verbosity > 2);
399 &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
400 "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
401 "$phindexdir/phrases", "$phindexdir/docs.txt",
402 "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
403 my $outfile = 1;
404 while (-e "$phindexdir/outPhrase.$outfile") {
405 &util::rm("$phindexdir/outPhrase.$outfile");
406 $outfile++;
407 }
408 }
409
410
411 # Register the classifier so that Greenstone can display it
412 my $collection = $self->{'collection'};
413 my $url = "library?a=p&p=phind&c=$collection";
414
415 my %classifyinfo = ('thistype'=>'Invisible',
416 'childtype'=>'Phind',
417 'Title'=>$self->{'title'},
418 'contains'=>[]);
419
420 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
421 return \%classifyinfo;
422}
423
424
425
426sub convert_gml_to_tokens {
427
428 $_ = shift @_;
429
430 # FIRST, remove GML tags
431
432 # Replace all whitespace with a simple space
433 s/\s+/ /gs;
434
435 # Remove everything that is in a tag
436 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
437 s/\s*<br>\s*/ LINEBREAK /isg;
438 s/<[^>]*>/ /sg;
439
440 # Now we have the text, but it may contain HTML
441 # elements coded as &gt; etc. Remove these tags.
442 s/&lt;/</sg;
443 s/&gt;/>/sg;
444
445 s/\s+/ /sg;
446 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
447 s/\s*<br>\s*/ LINEBREAK /isg;
448 s/<[^>]*>/ /sg;
449
450 # remove &amp; and other miscellaneous markup tags
451 s/&amp;/&/sg;
452 s/&lt;/</sg;
453 s/&gt;/>/sg;
454 s/&amp;/&/sg;
455
456 # replace <p> and <br> placeholders with newlines
457 s/PARAGRAPHBREAK/\n/sg;
458 s/LINEBREAK/\n/sg;
459
460
461 # Exceptional punctuation
462 #
463 # We make special cases of some punctuation
464
465 # remove any apostrophe that indicates omitted letters
466 s/(\w+)\'(\w*\s)/ $1$2 /g;
467
468 # remove period that appears in a person's initials
469 s/\s([A-Z])\./ $1 /g;
470
471 # replace hyphens in hyphenated words and names with a space
472 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
473
474
475 # Convert the remaining text to "clause format".
476 # This means removing all excess punctuation and garbage text,
477 # normalising valid punctuation to fullstops and commas,
478 # then putting one clause on each line.
479
480 # Insert newline when the end of a sentence is detected
481 # (delimiter is: "[\.\?\!]\s")
482 s/\s*[\.\?\!]\s+/\n/g;
483
484 # split numbers after four digits
485 s/(\d\d\d\d)/$1 /g;
486
487 # split words after 32 characters
488
489 # squash repeated punctuation
490 tr/A-Za-z0-9 //cs;
491
492 # save email addresses
493 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
494
495 # normalise clause breaks (mostly punctuation symbols) to commas
496 s/[^A-Za-z0-9 \n]+/ , /g;
497
498 # Remove repeated commas, and replace with newline
499 s/\s*,[, ]+/\n/g;
500
501 # remove extra whitespace
502 s/ +/ /sg;
503 s/^\s+//mg;
504 s/\s*$/\n/mg;
505
506 # remove lines that contain one word or less
507 s/^\w*$//mg;
508 s/^\s*$//mg;
509 tr/\n//s;
510
511 return $_;
512}
513
514
515# Execute a system command
516
517sub execute {
518 my ($command, $verbosity) = @_;
519 print "Executing: $command\n" if ($verbosity > 2);
520 my $status = system($command);
521 if ($status != 0) {
522 print STDERR "phind.pm - Error executing $command: $!\n";
523 exit($status);
524 }
525}
526
527
528# Generate the vocabulary, symbol statistics, and numbers file from the
529 # clauses file. This is legacy code, so is a bit messy and probably won't
530 # run under Windows.
531
532sub extract_vocabulary {
533 my ($phindex_dir, $language, $verbosity) = @_;
534
535 my ($w, $l, $line, $word);
536
537 my ($first_delimiter, $last_delimiter,
538 $first_stopword, $last_stopword,
539 $first_extractword, $last_extractword,
540 $first_contentword, $last_contentword,
541 $phrasedelimiter);
542
543 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
544
545
546 my %symbol;
547 my (%freq);
548
549 print "Calculating vocabulary\n" if ($verbosity > 1);
550
551 # Read and store the stopwords
552 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
553 my %stopwords;
554 foreach my $w (split(/\s+/, $words)) {
555 $l = lc($w);
556 $stopwords{$l} = $w;
557 }
558
559 # Read and store the thesaurus terms
560 $use_thesaurus = 0;
561 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
562 "$language", "agrovoc.lex");
563 if (-e "$lex_file") {
564 open(TH, "<$lex_file");
565 while(<TH>) {
566 s/^\d+ //;
567 s/\(.*\)//;
568 foreach my $w (split(/\s+/, $_)) {
569 $thesaurus{lc($w)} = $w;
570 }
571 }
572 close TH;
573 $use_thesaurus = 1;
574 }
575
576 # Read words in the text and count occurrences
577 open(TXT, "<$phindex_dir/clauses");
578 my @words;
579
580 while(<TXT>) {
581 $line = $_;
582 next unless ($line =~ /./);
583
584 @words = split(/\s+/, $line);
585 foreach $w (@words) {
586 $l = lc($w);
587 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
588 $freq{$w}++;
589 }
590 $freq{$senlimit}++;
591 }
592
593 # Calculate the "best" form of each word
594 my (%bestform, %totalfreq, %bestfreq);
595
596 foreach $w (sort (keys %freq)) {
597 $l = lc($w);
598
599 # totalfreq is the number of times a term appears in any form
600 $totalfreq{$l} += $freq{$w};
601
602 if (defined $stopwords{$l}) {
603 $bestform{$l} = $stopwords{$l};
604
605 } elsif (defined $thesaurus{$l}) {
606 $bestform{$l} = $thesaurus{$l};
607
608 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
609 $bestfreq{$l} = $freq{$w};
610 $bestform{$l} = $w;
611 }
612 }
613
614 undef %freq;
615 undef %bestfreq;
616
617
618 # Assign symbol numbers to tokens
619 my $nextsymbol = 1;
620 my (@vocab);
621
622 # Delimiters
623 $first_delimiter = 1;
624
625 foreach $word (@delimiters) {
626
627 $word = lc($word);
628 $bestform{$word} = uc($word);
629 $vocab[$nextsymbol] = $word;
630 $symbol{$word} = $nextsymbol;
631 $nextsymbol++;
632 }
633 $last_delimiter = $nextsymbol - 1;
634
635 # Stopwords
636 $first_stopword = $nextsymbol;
637
638 foreach my $word (sort keys %stopwords) {
639
640 # don't include a stopword unless it occurs in the text
641 $word = lc($word);
642 next unless ($totalfreq{$word});
643 next if ($symbol{$word});
644
645 $vocab[$nextsymbol] = $word;
646 $symbol{$word} = $nextsymbol;
647 $nextsymbol++;
648 }
649 $last_stopword = $nextsymbol - 1;
650 $first_contentword = $nextsymbol;
651
652 # Thesaurus terms
653 if ($use_thesaurus) {
654 $first_thesaurusword = $nextsymbol;
655
656 foreach my $word (sort keys %thesaurus) {
657
658 $word = lc($word);
659 next if ($symbol{$word});
660 $bestform{$word} = $thesaurus{$word};
661
662 $vocab[$nextsymbol] = $word;
663 $symbol{$word} = $nextsymbol;
664 $nextsymbol++;
665
666 }
667 $last_thesaurusword = $nextsymbol - 1;
668 }
669
670 # Other content words
671 $first_extractword = $nextsymbol;
672
673 foreach my $word (sort (keys %bestform)) {
674
675 next if ($symbol{$word});
676
677 $vocab[$nextsymbol] = $word;
678 $symbol{$word} = $nextsymbol;
679 $nextsymbol++;
680 }
681 $last_extractword = $nextsymbol - 1;
682 $last_contentword = $nextsymbol - 1;
683
684
685 # Output the words
686 print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
687 open(VOC, ">$phindex_dir/clauses.vocab");
688
689 for (my $i = 1; $i < $nextsymbol; $i++) {
690 $w = $vocab[$i];
691
692 print VOC "$bestform{$w}\n";
693 $totalfreq{$w} = 0 unless ($totalfreq{$w});
694 }
695 close VOC;
696
697
698 # Output statistics about the vocabulary
699 print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
700 &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
701 open(STAT, ">$phindex_dir/clauses.stats")
702 || die "Cannot open $phindex_dir/clauses.stats: $!";
703
704 print STAT "first_delimiter $first_delimiter\n";
705 print STAT "last_delimiter $last_delimiter\n";
706 print STAT "first_stopword $first_stopword\n";
707 print STAT "last_stopword $last_stopword\n";
708 if ($use_thesaurus) {
709 print STAT "first_thesaurusword $first_thesaurusword\n";
710 print STAT "last_thesaurusword $last_thesaurusword\n";
711 }
712 print STAT "first_extractword $first_extractword\n";
713 print STAT "last_extractword $last_extractword\n";
714 print STAT "first_contentword $first_contentword\n";
715 print STAT "last_contentword $last_contentword\n";
716 print STAT "first_symbol $first_delimiter\n";
717 print STAT "last_symbol $last_contentword\n";
718 print STAT "first_word $first_stopword\n";
719 print STAT "last_word $last_contentword\n";
720 close STAT;
721
722 undef @vocab;
723
724
725 # Save text as symbol numbers
726 print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
727
728 open(TXT, "<$phindex_dir/clauses");
729 open(NUM, ">$phindex_dir/clauses.numbers");
730
731 $phrasedelimiter = $symbol{lc($senlimit)};
732 print NUM "$symbol{lc($colstart)}\n";
733
734 # set up the special symbols that delimit documents and sentences
735 while(<TXT>) {
736
737 # split sentence into a list of tokens
738 $line = $_;
739 next unless ($line =~ /./);
740 @words = split(/\s+/, $line);
741
742 # output one token at a time
743 foreach $word (@words) {
744 $word = lc($word);
745 print NUM "$symbol{$word}\n";
746 }
747
748 # output phrase delimiter
749 print NUM "$phrasedelimiter\n";
750 }
751
752 print NUM "$symbol{lc($colend)}\n";
753
754}
755
756
757# Prepare the phrases file to be input to mgpp.
758# This means renumbering the phrases in order of decreasing frequency.
759
760
761# This is legacy code, and a little ugly, and may be unix-specific
762# (particularly the sort command).
763
764sub renumber_phrases {
765 my ($phindex_dir, $verbosity) = @_;
766
767 # Sort the phrases into order of decreasing frequency
768 # This means the expansions will be sorted correctly later on.
769 print "Sorting phrases into freq order\n" if ($verbosity);
770 system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
771
772 my @symbol;
773
774 # Read the vocabulary
775 print "Reading the vocabulary\n" if ($verbosity);
776 open(V, "<$phindex_dir/clauses.vocab")
777 || die "Cannot open $phindex_dir/clauses.vocab: $!";
778
779 my $i = 1;
780 while(<V>) {
781 chomp;
782 $symbol[$i++] = $_;
783 }
784
785 # Create file for phrase data
786 #
787 # The phrases file looks something like this
788 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
789 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
790 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
791 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
792
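# Reading from the parsing code below, the colon-separated fields of the
# first example line above appear to be:
#
#   159396-1              a unique phrase identifier
#   s5175                 the phrase text, as symbol numbers into clauses.vocab
#   4                     the phrase frequency
#   1                     the number of expansions
#   116149-2              the expansions (comma-separated phrase identifiers)
#   3                     the number of documents
#   d2240,2;d2253;d2254   the documents (semicolon-separated; ",2" appears to
#                         be a within-document frequency)
#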
793 # The first field on each line is a unique phrase identifier.
794 # We need to calculate phrase numbers for each phrase
795 print "Calculate phrase numbers\n" if ($verbosity);
796
797 my %phrasenumber;
798 my $nextphrase = 1;
799 my ($line);
800
801 open(IN, "<$phindex_dir/phrases");
802 while(<IN>) {
803
804 # read the line
805 chomp;
806 $line = $_;
807
808 # we're only interested in the first field
809 $line =~ s/:.*//;
810
811 # get a phrase number for this line
812 $phrasenumber{$line} = $nextphrase;
813 $nextphrase++;
814 }
815
816
817 # Now we create a new phrase file using phrase numbers, not the old IDs.
818 print "Format phrase data for MGPP\n" if ($verbosity);
819
820 open(IN, "<$phindex_dir/phrases");
821 open(DATA, ">$phindex_dir/pdata.txt");
822 open(IDX, ">$phindex_dir/pword.txt");
823
824 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
825 my @fields;
826 my @documents;
827 my (@newexp, $k, $n);
828
829 my $linenumber = 0;
830
831 while(<IN>) {
832
833 # read the line
834 chomp;
835 $line = $_;
836 @fields = split(/:/, $line);
837
838 # get a phrase number for this line
839 $key = shift @fields;
840 die unless (defined($phrasenumber{$key}));
841 $num = $phrasenumber{$key};
842
843 # get the text of the phrase
844 $text = shift @fields;
845 $text =~ s/s(\d+)/$symbol[$1]/g;
846 if ($text =~ / /) {
847 $word = "";
848 } else {
849 $word = $text;
850 }
851
852 $linenumber++;
853 if ($linenumber % 1000 == 0) {
854 print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
855 }
856 print "$num: $key\t($text)\n" if ($verbosity > 3);
857
858 # get the phrase frequency
859 $tf = shift @fields;
860
861 # get the number of expansions
862 $countexp = shift @fields;
863
864 # get the expansions and convert them into phrase numbers
865 $expansions = shift @fields;
866 @newexp = ();
867 foreach $k (split(/,/, $expansions)) {
868 die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
869 $n = $phrasenumber{$k};
870 push @newexp, $n;
871 }
872 @newexp = sort numerically @newexp;
873
874 # get the number of documents
875 $countdocs = shift @fields;
876
877 # get the documents
878 $documents = shift @fields;
879 $documents =~ s/d//g;
880 @documents = split(/;/, $documents);
881 @documents = sort by_frequency @documents;
882
883 # output the phrase data
884 print DATA "<Document>";
885 print DATA "$num:$text:$tf:$countexp:$countdocs:";
886 print DATA join(",", @newexp), ":", join(";", @documents), "\n";
887
888 # output the word index search data
889 print IDX "<Document>$word\n";
890
891
892 }
893}
894
895# sort routines used to renumber phrases
896
897sub numerically { $a <=> $b }
898
899sub by_frequency {
900 my $fa = 1;
901 if ($a =~ /,/) {
902 $fa = $a;
903 $fa =~ s/\d+,//;
904 }
905 my $fb = 1;
906 if ($b =~ /,/) {
907 $fb = $b;
908 $fb =~ s/\d+,//;
909 }
910
911 return ($fb <=> $fa);
912}
913
914
9151;