source: main/tags/2.30/gsdl/perllib/classify/phind.pm@ 23841

Last change on this file since 23841 was 1808, checked in by paynter, 24 years ago

Option to save the phind phrases to a text file.

  • Property svn:keywords set to Author Date Id Revision
File size: 27.6 KB
Line 
1###########################################################################
2#
3# phind.pm -- the Phind classifier
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright (C) 2000 New Zealand Digital Library Project
7#
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# The phind clasifier plugin
30#
31# options are:
32# button=Name The label for the classifiers button in the
33# navigation bar (defaults to "Phrase").
34# title=Title The metadata field used to describe each document
35# (defaults to "Title").
36# text=fields The text used to build the phrase hierarchy
37# (defaults to "section:Title,section:text").
38# phinddir=directory Location of phind index files
39# verbosity=num Control amount of output
40# untidy=true Do not clean up intermediate files
41# suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword)
42# suffixsize=num Number of symbols available to suffix program
43# savephrases=filename If set, phrase infomation will be stored in filename
44# as text. (By defualt, it is not set.)
45
46# How a classifier works.
47#
48# For each classifier requested in the collect.cfg file, buildcol.pl creates
49# a new classifier object (such as the one defined in theis file) and later
50# passes each document object to the classifier in turn for classification.
51#
52# Four functions are used:
53#
54# 1. "new" is called before the documents are processed to set up the
55# classifier.
56#
57# 2. "init" is called after buildcol.pl has created the indexes etc but
58# before the documents are classified in order that the classifier might
59# set any variables it requires, etc.
60#
61# 3. "classify" is called once for each document object. The classifier
62# "classifies" each document and updates its local data accordingly.
63#
64# 4. "get_classify_info" is called after every document has been
65# classified. It collates the information about the documents and
66# stores a reference to the classifier so that Greenstone can later
67# display it.
68
69
70package phind;
71
72use BasClas;
73use util;
74
75sub BEGIN {
76 @ISA = ('BasClas');
77}
78
79# Define delimiter symbols - this should be abstracted out someplace
80my $colstart = "COLLECTIONSTART";
81my $colend = "COLLECTIONEND";
82my $doclimit = "DOCUMENTLIMIT";
83my $senlimit = "SENTENCELIMIT";
84my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
85
86# Create a new phind browser based on the options in collect.cfg
87
88sub new {
89 my ($class, @options) = @_;
90 my $self = new BasClas ($class, @_);
91 my $out = pop @options;
92
93 # Phind installation check
94 # The phind phrase browser is research software and is not installed
95 # by defualt. If the user attepts to use it we warn them that it's a
96 # bit dodgy, then tell them how to install it. If they can do that
97 # and get all the files in place, then we let them proceed.
98
99 print $out "The Phind classifier for Greenstone.\n";
100 print $out "Checking the phind phrase browser requirements...\n";
101
102 # Make sure we're not in windows
103 if ($ENV{'GSDLOS'} =~ /windows/i) {
104 print STDERR "Phind currently only works under Unix";
105 exit(1);
106 }
107
108 # Ensure the Phind generate scripts are in place
109 my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
110 my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
111
112 if (!(-e $file1)) {
113 print STDERR "The phind \"suffix\" program is not installed. ";
114 print STDERR "To install it, change to the directory\n";
115 print STDERR " $src\n";
116 print STDERR "and type \"make install-phind\".\n\n";
117 exit(1);
118 }
119
120 # Ensure the Phind CGI script is in place
121 $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
122 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
123
124 if (!(-e $file1)) {
125 print STDERR "The phind CGI program is not installed. ";
126 print STDERR "To install it, change to the directory\n";
127 print STDERR " $src\n";
128 print STDERR "and type \"make install-phind\".\n\n";
129 exit(1);
130 }
131
132 # Ensure the Phind Java applet is in place
133 $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
134 $file1 = &util::filename_cat($src, "Phind.class");
135
136 if (!(-e $file1)) {
137 print STDERR "The phind Java classes are not compiled. ";
138 print STDERR "To compile them, change to the directory\n";
139 print STDERR " $src\n";
140 print STDERR "and use your Java compiler to compile Phind.java.\n";
141 print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
142 exit(1);
143 }
144
145 # The installation appears OK - set up the classifier
146 my $collection = $ENV{'GSDLCOLLECTION'};
147 my $language = "english";
148
149 my $button = "Phrase";
150 my $title = "Title";
151 my $indexes = "section:Title,section:text";
152
153 my $builddir = "";
154 my $phinddir = "";
155
156 my $suffixmode = 1;
157 my $suffixsize = 40000000;
158 my $savephrases = "";
159
160 my $verbosity = 2;
161 my $untidy = 0;
162
163 # parse the options
164 foreach $option (@options) {
165
166 print STDERR "option: $option\n";
167
168 if ($option =~ /^text=(.*)$/i) {
169 $indexes = $1;
170 } elsif ($option =~ /^title=(.*)$/i) {
171 $title = $1;
172 } elsif ($option =~ /^button=(.*)$/i) {
173 $button = $1;
174 } elsif ($option =~ /^builddir=(.*)$/i) {
175 $builddir = $1;
176 } elsif ($option =~ /^phinddir=(.*)$/i) {
177 $phinddir = $1;
178 } elsif ($option =~ /^savephrases=(.*)$/i) {
179 $savephrases = $1;
180 } elsif ($option =~ /^suffixsize=(.*)$/i) {
181 $suffixsize = $1;
182 } elsif ($option =~ /^suffixmode=(.*)$/i) {
183 $suffixmode = $1;
184 } elsif ($option =~ /^untidy/i) {
185 $untidy = 1;
186 } elsif ($option =~ /^verbosity=(.*)$/i) {
187 $verbosity = $1;
188 }
189 }
190
191 # classifier information
192 $self->{'collection'} = $collection;
193 $self->{'titlefield'} = $title;
194 $self->{'buttonname'} = $button;
195 $self->{'indexes'} = $indexes;
196
197 # phrase extraction options
198 $self->{'suffixmode'} = $suffixmode;
199 $self->{'suffixsize'} = $suffixsize;
200 $self->{'savephrases'} = $savephrases if ($savephrases);
201
202 # limit languages
203 $language =~ s/,/\|/g;
204 $self->{'language_exp'} = $language;
205 $self->{'delimiter'} = $delimiter;
206
207 # build directory
208 if (!$builddir) {
209 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building");
210 }
211 $self->{'builddir'} = $builddir;
212
213 # phind directory
214 if (!$phinddir) {
215 $phinddir = &util::filename_cat($builddir, "phind");
216 }
217 $self->{'phinddir'} = $phinddir;
218
219 # debugging levels
220 $self->{'verbosity'} = $verbosity;
221 $self->{'untidy'} = $untidy;
222
223 return bless $self, $class;
224}
225
226
227# Initialise the phind classifier
228
229sub init {
230 my $self = shift (@_);
231
232 # ensure we have a build directory
233 my $builddir = $self->{'builddir'};
234 die unless (-e "$builddir");
235
236 # create phind directory
237 my $phinddir = $self->{'phinddir'};
238 if (-e "$phinddir") {
239 &util::rm_r("$phinddir");
240 }
241 &util::mk_dir("$phinddir");
242
243 # open filehandles for documents and text
244 my $clausefile = &util::filename_cat("$phinddir", "clauses");
245 &util::rm($clausefile) if (-e $clausefile);
246 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!";
247 $self->{'txthandle'} = TEXT;
248
249 my $docfile = &util::filename_cat("$phinddir", "docs.txt");
250 &util::rm($docfile) if (-e $docfile);
251 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!";
252 $self->{'dochandle'} = DOCS;
253
254}
255
256# Classify each document.
257#
258# Each document is passed here in turn. The classifier extracts the
259# text of each and stores it in the clauses file. Document details are
260# stored in the docs.txt file.
261
262sub classify {
263 my $self = shift (@_);
264 my ($doc_obj) = @_;
265
266 my $verbosity = $self->{'verbosity'};
267 my $top_section = $doc_obj->get_top_section();
268
269 my $titlefield = $self->{'titlefield'};
270
271 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield);
272 print "process: $title\n" if ($verbosity > 2);
273
274
275 # only consider english-language files
276 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
277 my $phrlanguage = $self->{'language_exp'};
278 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
279
280 # record this file
281 my $total++;
282 print "file $total: $file\n" if ($self->{'$verbosity'});
283
284
285 # Store document details
286 my $OID = $doc_obj->get_OID();
287 $OID = "NULL" unless defined $OID;
288 my $dochandle = $self->{'dochandle'};
289 print $dochandle "<Document>\t$OID\t$title\n";
290
291 # Store the text occuring in this object
292
293 # output the document delimiter
294 my $txthandle = $self->{'txthandle'};
295 print $txthandle "$doclimit\n";
296
297 # iterarate over the required indexes and store their text
298 my $indexes = $self->{'indexes'};
299 my $text = "";
300 my ($part, $level, $field, $section, $data);
301
302 foreach $part (split(/,/, $indexes)) {
303
304 # Each field has a level and a data element ((e.g. document:Title)
305 ($level, $field) = split(/:/, $part);
306 die unless ($level && $field);
307
308 # Extract the text from every section
309 # (In phind, document:text and section:text are equivalent)
310 if ($field eq "text") {
311 $data = "";
312 $section = $doc_obj->get_top_section();
313 while (defined($section)) {
314 $data .= $doc_obj->get_text($section) . "\n";
315 $section = $doc_obj->get_next_section($section);
316 }
317 $text .= convert_gml_to_tokens($data) . "\n";
318 }
319
320 # Extract a metadata field from a document
321 elsif ($level eq "document") {
322 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
323 $text .= convert_gml_to_tokens($data) . "\n";
324 }
325
326 # Extract metadata from every section in a document
327 elsif ($level eq "section") {
328 $data = "";
329 $section = $doc_obj->get_top_section();
330 while (defined($section)) {
331 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
332 $section = $doc_obj->get_next_section($section);
333 }
334 $text .= convert_gml_to_tokens($data) . "\n";
335 }
336
337 # Some sort of specification which I don't understand
338 else {
339 die "Unknown level ($level) in phind index ($part)\n";
340 }
341
342 }
343
344 # output the text
345 $text =~ tr/\n//s;
346 print $txthandle "$text";
347
348}
349
350
351
352# Construct the classifier from the information already gathered
353#
354# When get_classify_info is called, the clauses and docs.txt files have
355# already been constructed in the phind directory. This function will
356# translate them into compressed, indexed MGPP files that can be read by
357# the phindcgi script. It will also register our classifier so that it
358# shows up in the navigation bar.
359
360sub get_classify_info {
361 my $self = shift (@_);
362
363 my $verbosity = $self->{'verbosity'};
364 my $phinddir = $self->{'phinddir'};
365 my $language = "english";
366
367 if ($verbosity) {
368 print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
369 }
370
371 # Construct phind indexes
372 my $suffixmode = $self->{'suffixmode'};
373 my $suffixsize = $self->{'suffixsize'};
374 my ($command, $status);
375
376 # Generate the vocabulary, symbol statistics, and numbers file
377 # from the clauses file
378 print "\nExtracting vocabulary and statistics\n" if $verbosity;
379 &extract_vocabulary($phinddir, $language, $verbosity);
380
381 # Use the suffix program to generate the phind/phrases file
382 print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
383 &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity);
384
385 # Create the phrase file and put phrase numbers in phind/phrases
386 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
387 &renumber_phrases($self);
388
389 # Create the mg phrase database
390 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
391 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
392 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
393
394 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
395 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
396 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
397 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
398
399 print "\nCreating phrase databases\n";
400 my $mg_input = &util::filename_cat($phinddir, "pdata.txt");
401 my $mg_stem = "pdata";
402
403 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
404 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
405 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
406
407 # create the mg index of words
408 print "\nCreating word-level search indexes\n";
409 $mg_input = &util::filename_cat($phinddir, "pword.txt");
410 $mg_stem = "pword";
411
412 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
413 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
414 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity);
415 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
416 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity);
417 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity);
418
419 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity);
420 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity);
421 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity);
422
423 # create the mg document information database
424 print "\nCreating document information databases\n";
425 $mg_input = &util::filename_cat($phinddir, "docs.txt");
426 $mg_stem = "docs";
427
428 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity);
429 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity);
430 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity);
431
432
433 # Tidy up stray files
434 if (!$self->{'untidy'}) {
435 print "\nCleaning up\n" if ($verbosity > 2);
436 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers",
437 "$phinddir/clauses.vocab", "$phinddir/clauses.stats",
438 "$phinddir/phrases", "$phinddir/docs.txt",
439 "$phinddir/pdata.txt", "$phinddir/pword.txt");
440 my $outfile = 1;
441 while (-e "$phinddir/outPhrase.$outfile") {
442 &util::rm("$phinddir/outPhrase.$outfile");
443 $outfile++;
444 }
445 }
446
447
448 # Insert the classifier into.... what?
449 my $collection = $self->{'collection'};
450 my $url = "library?a=p&p=phind&c=$collection";
451
452 my %classifyinfo = ('thistype'=>'Invisible',
453 'childtype'=>'Phind',
454 'Title'=>$self->{'buttonname'},
455 'contains'=>[]);
456
457 push (@{$classifyinfo{'contains'}}, {'OID'=>$url});
458 return \%classifyinfo;
459}
460
461
462
463sub convert_gml_to_tokens {
464
465 $_ = shift @_;
466
467 # FIRST, remove GML tags
468
469 # Replace all whitespace with a simple space
470 s/\s+/ /gs;
471
472 # Remove everything that is in a tag
473 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
474 s/\s*<br>\s*/ LINEBREAK /isg;
475 s/<[^>]*>/ /sg;
476
477 # Now we have the text, but it may contain HTML
478 # elements coded as &gt; etc. Remove these tags.
479 s/&lt;/</sg;
480 s/&gt;/>/sg;
481
482 s/\s+/ /sg;
483 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
484 s/\s*<br>\s*/ LINEBREAK /isg;
485 s/<[^>]*>/ /sg;
486
487 # remove &amp; and other miscellaneous markup tags
488 s/&amp;/&/sg;
489 s/&lt;/</sg;
490 s/&gt;/>/sg;
491 s/&amp;/&/sg;
492
493 # replace<p> and <br> placeholders with carriage returns
494 s/PARAGRAPHBREAK/\n/sg;
495 s/LINEBREAK/\n/sg;
496
497
498 # Exceptional punctuation
499 #
500 # We make special cases of some punctuation
501
502 # remove any apostrophe that indicates omitted letters
503 s/(\w+)\'(\w*\s)/ $1$2 /g;
504
505 # remove period that appears in a person's initals
506 s/\s([A-Z])\./ $1 /g;
507
508 # replace hyphens in hypheanted words and names with a space
509 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
510
511
512 # Convert the remaining text to "clause format",
513 # This means removing all excess punctuation and garbage text,
514 # normalising valid punctuation to fullstops and commas,
515 # then putting one cluse on each line.
516
517 # Insert newline when the end of a sentence is detected
518 # (delimter is: "[\.\?\!]\s")
519 s/\s*[\.\?\!]\s+/\n/g;
520
521 # split numbers after four digits
522 s/(\d\d\d\d)/$1 /g;
523
524 # split words after 32 characters
525
526 # squash repeated punctuation
527 tr/A-Za-z0-9 //cs;
528
529 # save email addresses
530 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
531
532 # normalise clause breaks (mostly punctuation symbols) to commas
533 s/[^A-Za-z0-9 \n]+/ , /g;
534
535 # Remove repeated commas, and replace with newline
536 s/\s*,[, ]+/\n/g;
537
538 # remove extra whitespace
539 s/ +/ /sg;
540 s/^\s+//mg;
541 s/\s*$/\n/mg;
542
543 # remove lines that contain one word or less
544 s/^\w*$//mg;
545 s/^\s*$//mg;
546 tr/\n//s;
547
548 return $_;
549}
550
551
552# Execute a system command
553
554sub execute {
555 my ($command, $verbosity) = @_;
556 print "Executing: $command\n" if ($verbosity > 2);
557 my $status = system($command);
558 if ($status != 0) {
559 print STDERR "phindgen.pl - Error executing $command: $!\n";
560 exit($status);
561 }
562}
563
564
565# Generate the vocabulary, symbol statistics, and numbers file from the
566# clauses file. This is legacy code, so is a bit messy and probably wont
567# run under windows.
568
569sub extract_vocabulary {
570 my ($phind_dir, $language, $verbosity) = @_;
571
572 my ($w, $l, $line, $word);
573
574 my ($first_delimiter, $last_delimiter,
575 $first_stopword, $last_stopword,
576 $first_extractword, $last_extractword,
577 $first_contentword, $last_contentword,
578 $phrasedelimiter);
579
580 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
581
582
583 my %symbol;
584 my (%freq);
585
586 print "Calculating vocabulary\n" if ($verbosity > 1);
587
588 # Read and store the stopwords
589 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
590 my %stopwords;
591 foreach my $w (split(/\s+/, $words)) {
592 $l = lc($w);
593 $stopwords{$l} = $w;
594 }
595
596 # Read and store the thesaurus terms
597 $use_thesaurus = 0;
598 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
599 "$language", "agrovoc.lex");
600 if (-e "$lex_file") {
601 open(TH, "<$lex_file");
602 while(<TH>) {
603 s/^\d+ //;
604 s/\(.*\)//;
605 foreach my $w (split(/\s+/, $_)) {
606 $thesaurus{lc($w)} = $w;
607 }
608 }
609 close TH;
610 $use_thesaurus = 1;
611 }
612
613 # Read words in the text and count occurences
614 open(TXT, "<$phind_dir/clauses");
615 my @words;
616
617 while(<TXT>) {
618 $line = $_;
619 next unless ($line =~ /./);
620
621 @words = split(/\s+/, $line);
622 foreach $w (@words) {
623 $l = lc($w);
624 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
625 $freq{$w}++;
626 }
627 $freq{$senlimit}++;
628 }
629
630 # Calculate the "best" form of each word
631 my (%bestform, %totalfreq, %bestfreq);
632
633 foreach $w (sort (keys %freq)) {
634 $l = lc($w);
635
636 # totalfreq is the number of times a term appears in any form
637 $totalfreq{$l} += $freq{$w};
638
639 if (defined $stopwords{$l}) {
640 $bestform{$l} = $stopwords{$l};
641
642 } elsif (defined $thesaurus{$l}) {
643 $bestform{$l} = $thesaurus{$l};
644
645 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
646 $bestfreq{$l} = $freq{$w};
647 $bestform{$l} = $w;
648 }
649 }
650
651 undef %freq;
652 undef %bestfreq;
653
654
655 # Assign symbol numbers to tokens
656 my $nextsymbol = 1;
657 my (@vocab);
658
659 # Delimiters
660 $first_delimiter = 1;
661
662 foreach $word (@delimiters) {
663
664 $word = lc($word);
665 $bestform{$word} = uc($word);
666 $vocab[$nextsymbol] = $word;
667 $symbol{$word} = $nextsymbol;
668 $nextsymbol++;
669 }
670 $last_delimiter = $nextsymbol - 1;
671
672 # Stopwords
673 $first_stopword = $nextsymbol;
674
675 foreach my $word (sort keys %stopwords) {
676
677 # don't incluse stopword unless it occurs in the text
678 $word = lc($word);
679 next unless ($totalfreq{$word});
680 next if ($symbol{$word});
681
682 $vocab[$nextsymbol] = $word;
683 $symbol{$word} = $nextsymbol;
684 $nextsymbol++;
685 }
686 $last_stopword = $nextsymbol - 1;
687 $first_contentword = $nextsymbol;
688
689 # Thesaurus terms
690 if ($use_thesaurus) {
691 $first_thesaurusword = $nextsymbol;
692
693 foreach my $word (sort keys %thesaurus) {
694
695 $word = lc($word);
696 next if ($symbol{$word});
697 $bestform{$word} = $thesaurus{$word};
698
699 $vocab[$nextsymbol] = $word;
700 $symbol{$word} = $nextsymbol;
701 $nextsymbol++;
702
703 }
704 $last_thesaurusword = $nextsymbol - 1;
705 }
706
707 # Other content words
708 $first_extractword = $nextsymbol;
709
710 foreach my $word (sort (keys %bestform)) {
711
712 next if ($symbol{$word});
713
714 $vocab[$nextsymbol] = $word;
715 $symbol{$word} = $nextsymbol;
716 $nextsymbol++;
717 }
718 $last_extractword = $nextsymbol - 1;
719 $last_contentword = $nextsymbol - 1;
720
721
722 # Outut the words
723 print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);
724 open(VOC, ">$phind_dir/clauses.vocab");
725
726 for (my $i = 1; $i < $nextsymbol; $i++) {
727 $w = $vocab[$i];
728
729 print VOC "$bestform{$w}\n";
730 $totalfreq{$w} = 0 unless ($totalfreq{$w});
731 }
732 close VOC;
733
734
735 # Output statistics about the vocablary
736 print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1);
737 &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats");
738 open(STAT, ">$phind_dir/clauses.stats")
739 || die "Cannot open $phind_dir/clauses.stats: $!";
740
741 print STAT "first_delimiter $first_delimiter\n";
742 print STAT "last_delimiter $last_delimiter\n";
743 print STAT "first_stopword $first_stopword\n";
744 print STAT "last_stopword $last_stopword\n";
745 if ($use_thesaurus) {
746 print STAT "first_thesaurusword $first_thesaurusword\n";
747 print STAT "last_thesaurusword $last_thesaurusword\n";
748 }
749 print STAT "first_extractword $first_extractword\n";
750 print STAT "last_extractword $last_extractword\n";
751 print STAT "first_contentword $first_contentword\n";
752 print STAT "last_contentword $last_contentword\n";
753 print STAT "first_symbol $first_delimiter\n";
754 print STAT "last_symbol $last_contentword\n";
755 print STAT "first_word $first_stopword\n";
756 print STAT "last_word $last_contentword\n";
757 close STAT;
758
759 undef @vocab;
760
761
762 # Save text as symbol numbers
763 print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);
764
765 open(TXT, "<$phind_dir/clauses");
766 open(NUM, ">$phind_dir/clauses.numbers");
767
768 $phrasedelimiter = $symbol{lc($senlimit)};
769 print NUM "$symbol{lc($colstart)}\n";
770
771 # set up the special symbols that delimit documents and sentences
772 while(<TXT>) {
773
774 # split sentence into a list of tokens
775 $line = $_;
776 next unless ($line =~ /./);
777 @words = split(/\s+/, $line);
778
779 # output one token at a time
780 foreach $word (@words) {
781 $word = lc($word);
782 print NUM "$symbol{$word}\n";
783 }
784
785 # output phrase delimiter
786 print NUM "$phrasedelimiter\n";
787 }
788
789 print NUM "$symbol{lc($colend)}\n";
790
791}
792
793
794# Prepare the phrases file to be input to mgpp.
795# This means renumbering the phrases in order of decreasing frequency.
796
797
798# This is legacy code, and a little ugly, and may be unix-specific
799# (particularly the sort command).
800
801sub renumber_phrases {
802 my $self = shift (@_);
803
804 my $verbosity = $self->{'verbosity'};
805 my $phind_dir = $self->{'phinddir'};
806
807 my $savephrases = 0;
808 $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'}));
809
810
811
812 # Sort the phrases into order of increasing frequency
813 # This means the expansions will be sorted correctly later on.
814 print "Sorting phrases into freq order\n" if ($verbosity);
815 system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
816
817 # Read the vocabulary
818 my @symbol;
819 print "Reading the vocabulary\n" if ($verbosity);
820 open(V, "<$phind_dir/clauses.vocab")
821 || die "Cannot open $phind_dir/clauses.vocab: $!";
822
823 my $i = 1;
824 while(<V>) {
825 chomp;
826 $symbol[$i++] = $_;
827 }
828
829 # Create file for phrase data
830 #
831 # The phrases file looks something like this
832 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
833 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
834 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
835 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
836
837 # The first field on each line is a unique phrase identifier.
838 # We need to calculate phrase numbers for each phrase
839 print "Calculate phrase numbers\n" if ($verbosity);
840
841 my %phrasenumber;
842 my $nextphrase = 1;
843 my ($line);
844
845 open(IN, "<$phind_dir/phrases");
846 while(<IN>) {
847
848 # read the line
849 chomp;
850 $line = $_;
851
852 # we're only interested in the first field
853 $line =~ s/:.*//;
854
855 # get a phrase number for this line
856 $phrasenumber{$line} = $nextphrase;
857 $nextphrase++;
858 }
859
860
861 # Now we create a new phrase file using phrase numbers, not the old IDs.
862 print "Format phrase data for MGPP\n" if ($verbosity);
863
864 # Open the basic files
865 open(IN, "<$phind_dir/phrases");
866 open(DATA, ">$phind_dir/pdata.txt");
867 open(IDX, ">$phind_dir/pword.txt");
868
869 # We may want to save the phrases in a separate text file
870 if ($savephrases) {
871 print "Saving phrases in $savephrases\n" if ($verbosity);
872 open(SAVE, ">$savephrases");
873 }
874
875 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
876 my @fields;
877 my @documents;
878 my (@newexp, $k, $n);
879
880 my $linenumber = 0;
881
882 while(<IN>) {
883
884 # read the line
885 chomp;
886 $line = $_;
887 @fields = split(/:/, $line);
888
889 # get a phrase number for this line
890 $key = shift @fields;
891 die unless (defined($phrasenumber{$key}));
892 $num = $phrasenumber{$key};
893
894 # get the text of the phrase
895 $text = shift @fields;
896 $text =~ s/s(\d+)/$symbol[$1]/g;
897 if ($text =~ / /) {
898 $word = "";
899 } else {
900 $word = $text;
901 }
902
903 $linenumber++;
904 if ($linenumber % 1000 == 0) {
905 print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
906 }
907 print "$num: $key\t($text)\n" if ($verbosity > 3);
908
909 # get the phrase frequency
910 $tf = shift @fields;
911
912 # get the number of expansions
913 $countexp = shift @fields;
914
915 # get the expansions and convert them into phrase numbers
916 $expansions = shift @fields;
917 @newexp = ();
918 foreach $k (split(/,/, $expansions)) {
919 die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
920 $n = $phrasenumber{$k};
921 push @newexp, $n;
922 }
923 @newexp = sort numerically @newexp;
924
925 # get the number of documents
926 $countdocs = shift @fields;
927
928 # get the documents
929 $documents = shift @fields;
930 $documents =~ s/d//g;
931 @documents = split(/;/, $documents);
932 @documents = sort by_frequency @documents;
933
934 # output the phrase data
935 print DATA "<Document>";
936 print DATA "$num:$text:$tf:$countexp:$countdocs:";
937 print DATA join(",", @newexp), ":", join(";", @documents), "\n";
938
939 # output the word index search data
940 print IDX "<Document>$word\n";
941
942 # output the phrases to a text file
943 if ($savephrases) {
944 print SAVE "$tf\t$countdocs\t$text\n";
945 }
946
947 }
948
949 close SAVE if ($savephrases);
950}
951
952# sort routines used to renumber phrases
953
954sub numerically { $a <=> $b }
955
956sub by_frequency {
957 my $fa = 1;
958 if ($a =~ /,/) {
959 $fa = $a;
960 $fa =~ s/\d+,//;
961 }
962 my $fb = 1;
963 if ($b =~ /,/) {
964 $fb = $b;
965 $fb =~ s/\d+,//;
966 }
967
968 return ($fb <=> $fa);
969}
970
971
9721;
Note: See TracBrowser for help on using the repository browser.