source: trunk/gsdl/perllib/classify/phind.pm@ 1645

Last change on this file since 1645 was 1645, checked in by paynter, 24 years ago

Output less verbose & more consistent with buildcol.pl

  • Property svn:keywords set to Author Date Id Revision
File size: 25.5 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind classifier plugin
30#
31# options are:
32# title=Title The title field for this classification
33# text=fields The text used to build the phrase hierarchy
34# untidy=true Do not clean up intermediate files
35# phindexdir=directory Location of phind index files
36# verbosity=num Control amount of output
37
38
39# How a classifier works.
40#
41# When a classifier is requested in the collect.cfg file, buildcol creates a
42# new classifier object (such as the one defined in this file) and later
43# passes each document object to the classifier in turn. Four functions are
44# used:
45#
46# 1. "new" is called before the documents are processed to set up the
47# classifier.
48#
49# 2. "init" is called after buildcol.pl has created the indexes etc but
50# before the documents are classified in order that the classifier might
51# set any variables it requires, etc.
52#
53# 3. "classify" is called once for each document object. The classifier
54# "classifies" each document and updates its local data accordingly.
55#
56# 4. "get_classify_info" is called after every document has been
57# classified. It collates the information about the documents and
58# stores a reference to the classifier so that Greenstone can later
59# display it.
60
61
package phind;

use BasClas;
use util;

# Inherit from the base classifier class.
sub BEGIN {
    @ISA = ('BasClas');
}

# Delimiter symbols marking collection, document, and sentence
# boundaries in the clauses file - this should be abstracted out someplace.
my ($colstart, $colend, $doclimit, $senlimit) =
    ('COLLECTIONSTART', 'COLLECTIONEND', 'DOCUMENTLIMIT', 'SENTENCELIMIT');
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
77
# Create a new phind browser based on the options in collect.cfg.
#
# Verifies that the phind components (the "suffix" program, the phindcgi
# script, and the Phind Java applet) are installed before setting up the
# classifier state; exits with installation instructions if any is missing.
# Recognised options: title=, text=, phindexdir=, verbosity=, untidy.
sub new {
    my ($class, @options) = @_;
    # Direct method call instead of the old indirect-object syntax
    # ("new BasClas ..."), which parses unpredictably.
    my $self = BasClas->new($class, @_);
    my $out = pop @options;

    # Phind installation check
    #
    # The phind phrase browser is research software and is not installed
    # by default.  If the user attempts to use it we warn them that it's
    # a bit dodgy, then tell them how to install it.  If they can do that
    # and get all the files in place, then we let them proceed.

    print $out "The Phind classifier for Greenstone.\n";
    print $out "Checking the phind phrase browser requirements...\n";

    # Make sure we're not in windows
    if ($ENV{'GSDLOS'} =~ /windows/i) {
        print STDERR "Phind currently only works under Unix";
        exit(1);
    }

    # Ensure the Phind generate scripts are in place
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");

    if (!(-e $file1)) {
        print STDERR "The phind \"suffix\" program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR "   $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind CGI script is in place
    $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");

    if (!(-e $file1)) {
        print STDERR "The phind CGI program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR "   $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind Java applet is in place
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
    $file1 = &util::filename_cat($src, "Phind.class");

    if (!(-e $file1)) {
        print STDERR "The phind Java classes are not compiled. ";
        print STDERR "To compile them, change to the directory\n";
        print STDERR "   $src\n";
        print STDERR "and use your Java compiler to compile Phind.java.\n";
        print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
        exit(1);
    }


    # The installation appears OK - set up the classifier defaults
    my $collection = $ENV{'GSDLCOLLECTION'};
    my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    my $language = "english";

    my $title = "Topic";
    my $indexes = "section:Title,section:text";

    my $verbosity = 2;
    my $untidy = 0;

    # parse the options ($option is now lexically scoped instead of a
    # package global under "no strict")
    foreach my $option (@options) {

        if ($option =~ /^text=(.*)$/i) {
            $indexes = $1;
        } elsif ($option =~ /^title=(.*)$/i) {
            $title = $1;
        } elsif ($option =~ /^phindexdir=(.*)$/i) {
            $phindexdir = $1;
        } elsif ($option =~ /^verbosity=(.*)$/i) {
            $verbosity = $1;
        } elsif ($option =~ /^untidy/i) {
            $untidy = 1;
        }
    }


    $self->{'collection'} = $collection;
    $self->{'title'} = $title;
    $self->{'indexes'} = $indexes;
    $self->{'verbosity'} = $verbosity;
    $self->{'untidy'} = $untidy;

    # limit languages (turn a comma-separated list into a regexp alternation)
    $language =~ s/,/\|/g;
    $self->{'language_exp'} = $language;
    # NOTE(review): $delimiter was never declared or assigned anywhere in
    # the original code, so this key has always been undef; keep it undef
    # explicitly for backward compatibility rather than silently removing it.
    $self->{'delimiter'} = undef;

    # reset phindex directory
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");
    $self->{'phindexdir'} = $phindexdir;

    return bless $self, $class;
}
186
187
# Initialise the phind classifier.
#
# Opens the "clauses" and "docs.txt" output files in the phindex
# directory and stores their filehandles on the classifier object
# ('txthandle' and 'dochandle') for use by classify().
sub init {
    my $self = shift (@_);

    # open filehandles for documents and text
    my $phindexdir = $self->{'phindexdir'};

    my $clausefile = &util::filename_cat("$phindexdir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);
    # Lexical filehandles and three-argument open replace the global
    # bareword handles (TEXT/DOCS) and 2-arg open used previously.
    open(my $txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);
    open(my $dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;

}
207
# Classify each document.
#
# Each document object is passed here in turn.  The classifier extracts
# the text of each (according to the 'indexes' setting) and appends it to
# the clauses file.  Document details (OID and Title) are appended to the
# docs.txt file.  Non-matching-language documents are skipped.
sub classify {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $title = $doc_obj->get_metadata_element ($top_section, "Title");
    print "process: $title\n" if ($verbosity > 2);


    # only consider english-language files
    my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    # BUG FIX: the original used "my $total++" (a fresh lexical each call,
    # so always 1), looked up the literal hash key '$verbosity' (never
    # set, so the message never printed), and interpolated an undefined
    # $file variable.  Keep a running count on the object instead.
    $self->{'document_count'}++;
    print "file $self->{'document_count'}: $title\n" if ($verbosity > 2);


    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    print $dochandle "<Document>\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract a metadata field from a document
        elsif ($level eq "document") {
            $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }

    }

    # output the text, squashing runs of newlines down to one
    $text =~ tr/\n//s;
    print $txthandle "$text";

}
299
300
301
# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the phindex directory.  This function will
# translate them into compressed, indexed MGPP files that can be read by
# the phindcgi script.  It will also register our classifier so that it
# shows up in the navigation bar.
sub get_classify_info {
    my $self = shift (@_);

    my $verbosity = $self->{'verbosity'};
    my $phindexdir = $self->{'phindexdir'};
    my $language = "english";
    # BUG FIX: $untidy was read in the clean-up block below but never
    # initialised, so the "untidy" option was silently ignored and the
    # intermediate files were always deleted.
    my $untidy = $self->{'untidy'};

    if ($verbosity) {
        print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
    }

    # Construct phind indexes
    my $mode = 1;
    my $symbol_limit = 40000000;
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Create the mg phrase database
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);


    # Tidy up stray files, unless the user asked us to keep them
    if (!$untidy) {
        print "\nCleaning up\n" if ($verbosity > 2);
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt",
                  "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }


    # Register the classifier so the receptionist can display it
    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";

    my %classifyinfo = ('thistype'=>'Invisible',
                        'childtype'=>'Phind',
                        'Title'=>$self->{'title'},
                        'contains'=>[]);

    push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
    return \%classifyinfo;
}
410
411
412
# Convert a GML/HTML marked-up string into phind "clause format".
#
# Takes the raw text of a document or section, strips markup and most
# punctuation, and returns text with one clause per line, suitable for
# the clauses file consumed by the suffix program.
#
# NOTE(review): the substitutions below are order-dependent; do not
# reorder them.
sub convert_gml_to_tokens {

    $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Preserve paragraph/line breaks as placeholders, then remove
    # everything that is in a tag
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML elements coded as
    # &gt; etc.  Decode those entities...
    s/&lt;/</sg;
    s/&gt;/>/sg;

    # ...and make a second pass, since decoding may have revealed more
    # markup (entity-encoded <p>, <br> and other tags)
    s/\s+/ /sg;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup entities
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters (e.g. don't)
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;


    # Convert the remaining text to "clause format".
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to fullstops and commas,
    # then putting one clause on each line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters
    # NOTE(review): no code implements this comment; the step was
    # apparently never written (or was removed).

    # squash runs of non-alphanumeric characters down to one character
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less, then squash the
    # resulting runs of blank lines
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;
}
500
501
# Execute a system command, terminating the program if it fails.
#
# Parameters: $command   - the shell command to run
#             $verbosity - echo the command first when > 2
#
# BUG FIX: the original called exit($status) with the raw 16-bit status
# from system(); a value of 256 (child exited 1) is truncated by the OS
# to exit code 0, so a failing child could look like success.  It also
# reported $!, which is not meaningful after a non-zero system() return,
# and misattributed the error to "phindgen.pl".
sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if ($verbosity > 2);
    my $status = system($command);
    if ($status != 0) {
        print STDERR "phind.pm - Error executing $command (status $status)\n";
        exit(1);
    }
}
513
514
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under windows (it shells out to find/xargs).
#
# Parameters: $phindex_dir - directory holding the clauses file
#             $language    - subdirectory of etc/phind to read word lists from
#             $verbosity   - progress-reporting level
#
# Reads $phindex_dir/clauses and writes:
#   clauses.vocab   - one "best form" token per line, indexed by symbol number
#   clauses.stats   - symbol-number ranges for each token class
#   clauses.numbers - the text re-expressed as symbol numbers
sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    # Symbol-number range markers recorded in the stats file.
    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token -> symbol number; %freq counts token occurrences.
    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (unix-specific: find | xargs)
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<$lex_file");
        while(<TH>) {
            s/^\d+ //;      # strip the leading term number
            s/\(.*\)//;     # strip any parenthesised qualifier
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phindex_dir/clauses");
    my @words;

    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # stopwords and thesaurus terms are counted case-insensitively
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        # every clause line implies one sentence delimiter
        $freq{$senlimit}++;
    }

    # Calculate the "best" form of each word: the stopword or thesaurus
    # spelling wins outright; otherwise the most frequent capitalisation.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens, class by class: delimiters first,
    # then stopwords, thesaurus terms, and finally other content words.
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;


    # Output the words, one best-form per line, in symbol-number order
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phindex_dir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Output statistics about the vocabulary
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Save text as symbol numbers
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phindex_dir/clauses");
    open(NUM, ">$phindex_dir/clauses.numbers");

    # set up the special symbols that delimit documents and sentences
    $phrasedelimiter = $symbol{lc($senlimit)};
    print NUM "$symbol{lc($colstart)}\n";

    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    print NUM "$symbol{lc($colend)}\n";

}
742
743
# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.
#
# Parameters: $phindex_dir - directory holding the phrases file
#             $verbosity   - progress-reporting level
#
# Reads $phindex_dir/phrases (produced by the suffix program) and writes
# pdata.txt (phrase records) and pword.txt (word index) for MGPP.
#
# This is legacy code, and a little ugly, and may be unix-specific
# (particularly the sort command).
sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of increasing frequency
    # This means the expansions will be sorted correctly later on.
    print "Sorting phrases into freq order\n" if ($verbosity);
    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");

    # @symbol maps symbol number -> vocabulary word
    my @symbol;

    # Read the vocabulary
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    my $i = 1;
    while(<V>) {
        chomp;
        $symbol[$i++] = $_;
    }

    # Create file for phrase data
    #
    # The phrases file looks something like this
    #     159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    #     159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    #     159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    #     159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

    # The first field on each line is a unique phrase identifier.
    # We need to calculate phrase numbers for each phrase
    # (line order after the sort above determines the phrase number).
    print "Calculate phrase numbers\n" if ($verbosity);

    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);

    open(IN, "<$phindex_dir/phrases");
    while(<IN>) {

        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }


    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);

    open(IN, "<$phindex_dir/phrases");
    open(DATA, ">$phindex_dir/pdata.txt");
    open(IDX, ">$phindex_dir/pword.txt");

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);

    my $linenumber = 0;

    while(<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase (expand s<digits> tokens into words)
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        # single-word phrases are also indexed in the word file
        if ($text =~ / /) {
            $word = "";
        } else {
            $word = $text;
        }

        # progress reporting
        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 3);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents, sorted by within-document frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "<Document>";
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "<Document>$word\n";


    }
}
881
882# sort routines used to renumber phrases
883
# Ascending numeric comparator for sort().
sub numerically { return $a <=> $b; }
885
# Comparator for sort(): orders document entries by descending frequency.
# Each entry is either "docnum" (implicit frequency 1) or
# "docnum,frequency"; the frequency is whatever follows the leading
# digits-and-comma prefix.
sub by_frequency {
    my $freq_a = 1;
    my $freq_b = 1;
    if (index($a, ',') >= 0) {
        ($freq_a = $a) =~ s/\d+,//;
    }
    if (index($b, ',') >= 0) {
        ($freq_b = $b) =~ s/\d+,//;
    }

    return ($freq_b <=> $freq_a);
}
900
901
9021;
Note: See TracBrowser for help on using the repository browser.