source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1591

Last change on this file since 1591 was 1591, checked in by paynter, 24 years ago

Having a thesurus is now optional.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1#! /usr/bin/perl -w
2
3###########################################################################
4#
5# phindgen.pl -- produce Phind index files for a GSDL collection.
6#
7# Copyright (C) 2000 Gordon W. Paynter
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30# This program generates phind phrase browsing indexes for a Greenstone
31# digital library collection.
32#
33# The GML files that have been imported to the archives directory are
34# read, and then the phind indexes are created in the phindex directory.
35#
36# This version od Phind uses suffix arrays to extract phrases.
37
38
39BEGIN {
40 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
41 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
42 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
43 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
44}
45
46
47use strict;
48use colcfg;
49use util;
50use parsargv;
51use plugin;
52use phproc;
53
54
55sub print_usage {
56 print STDERR "\n usage: $0 [options] collection-name\n\n";
57 print STDERR " options:\n";
58 print STDERR " -verbosity number 0=none, 3=lots\n";
59 print STDERR " -archivedir directory Where the converted material ends up\n";
60 print STDERR " -phindexdir directory Where to place the phind indexes\n";
61 print STDERR " -languages names The languages to consider in the browser\n";
62 print STDERR " -maxdocs number Maximum number of documents to process\n";
63 print STDERR " -untidy Leave working files in phindex directory\n";
64 print STDERR "\n";
65}
66
67
68# Define delimiter symbols - this should be abstracted out someplace
69my $colstart = "COLLECTIONSTART";
70my $colend = "COLLECTIONEND";
71my $doclimit = "DOCUMENTLIMIT";
72my $senlimit = "SENTENCELIMIT";
73my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
74
75&main ();
76
77
78sub main {
79
80 my ($verbosity, $archivedir, $phindexdir,
81 $phind, $language, $maxdocs, $untidy,
82 $collection, $configfilename, $collectcfg);
83
84 # Parse command-line arguments and get the collection name
85 if (!parsargv::parse(\@ARGV,
86 'verbosity/\d+/1', \$verbosity,
87 'archivedir/.*/', \$archivedir,
88 'phindexdir/.*/', \$phindexdir,
89 'languages/\w+/english', \$language,
90 'maxdocs/^\-?\d+/-1', \$maxdocs,
91 'untidy', \$untidy )) {
92 &print_usage();
93 die "\n";
94 }
95 $collection = &util::use_collection(@ARGV);
96 if ($collection eq "") {
97 &print_usage();
98 die "\n";
99 }
100
101 # Read the collection configuration file
102 $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
103 if (-e $configfilename) {
104 $collectcfg = &colcfg::read_collect_cfg ($configfilename);
105 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
106 $archivedir = $collectcfg->{'archivedir'};
107 }
108 if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
109 $phindexdir = $collectcfg->{'phindexdir'};
110 }
111 if (defined $collectcfg->{'phind'}) {
112 $phind = $collectcfg->{'phind'};
113 }
114 } else {
115 die "Couldn't find the configuration file $configfilename\n";
116 }
117
118 # Calculate default archive and phindex directories
119 if ($archivedir eq "") {
120 $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
121 }
122 if ($phindexdir eq "") {
123 $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
124 }
125
126 # Make sure theuser has in fact requested phind indexes
127 if (!defined($phind)) {
128 print "No phind information in $configfilename\n";
129 exit;
130 }
131
132
133 # Construct phind indexes
134 my $mode = 1;
135 my $symbol_limit = 40000000;
136 my ($command, $status);
137
138 # Make a clean phindex directory
139 if (-e "$phindexdir") {
140 &util::rm_r("$phindexdir");
141 }
142 &util::mk_dir("$phindexdir");
143
144 # Read the archives directory and build the clauses file
145 print "\nReading archive directory\n" if $verbosity;
146 &build_clauses($archivedir, $phindexdir, $language, $verbosity, $maxdocs);
147
148 # Generate the vocabulary, symbol statistics, and numbers file
149 # from the clauses file
150 print "\nExtracting vocabulary and statistics\n" if $verbosity;
151 &extract_vocabulary($phindexdir, $language, $verbosity);
152
153 # Use the suffix program to generate the phindex/phrases file
154 $command = "suffix $phindexdir $symbol_limit $mode";
155 print "\nExecuting: $command\n" if $verbosity;
156 $status = system($command);
157 if ($status != 0) {
158 print STDERR "phindgen.pl - Error executing $command: $!\n";
159 exit($status);
160 }
161
162 # Create the phrase file and put phrase numbers in phindex/phrases
163 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
164 &renumber_phrases("$phindexdir", $verbosity);
165
166 # Create the mg phrase database
167 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
168 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
169 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
170 my $mg_input = &util::filename_cat($phindexdir, "mg-p.txt");
171
172 $command = "$mg_passes -d $phindexdir -f phrase -T1 $mg_input";
173 print "\nExecuting: $command\n" if $verbosity;
174 $status = system($command);
175 if ($status != 0) {
176 print STDERR "phindgen.pl - Error executing $command: $!\n";
177 exit($status);
178 }
179
180 $command = "$mg_compression_dict -d $phindexdir -f phrase";
181 print "\nExecuting: $command\n" if $verbosity;
182 $status = system($command);
183 if ($status != 0) {
184 print STDERR "phindgen.pl - Error executing $command: $!\n";
185 exit($status);
186 }
187
188 $command = "$mg_passes -d $phindexdir -f phrase -T2 $mg_input";
189 print "\nExecuting: $command\n" if $verbosity;
190 $status = system($command);
191 if ($status != 0) {
192 print STDERR "phindgen.pl - Error executing $command: $!\n";
193 exit($status);
194 }
195
196 # Tidy up stray files
197 if (!$untidy) {
198 print "\nCleaning up\n" if $verbosity;
199 &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
200 "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
201 "$phindexdir/phrases", "$phindexdir/mg-p.txt");
202 my $outfile = 1;
203 while (-e "$phindexdir/outPhrase.$outfile") {
204 &util::rm("$phindexdir/outPhrase.$outfile");
205 $outfile++;
206 }
207 }
208}
209
210
211
212
213
214
215
216# Get the text for the collection.
217# Save the document names and identifiers as we go.
218
219sub build_clauses {
220 my ($archive_dir, $phindex_dir, $language, $verbosity, $maxdocs) = @_;
221
222 # create a "pluginfo" for ArcPlug and RecPlug
223 my $plugins = [["GMLPlug"], ["ArcPlug"]];
224 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
225 die "No plugins were loaded." if (scalar(@$pluginfo) == 0);
226
227
228 # create a phind document processor object to process the documents
229 my $processor = new phproc ($archive_dir, $phindex_dir, $language,
230 $doclimit, $verbosity, "STDOUT");
231
232 # process the archives directory
233 &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
234}
235
236
237
238# Generate the vocabulary, symbol statistics, and numbers file from the
239# clauses file. This is legacy code, so is a bit messy and probably wont
240# run under windows.
241
242sub extract_vocabulary {
243 my ($phindex_dir, $language, $verbosity) = @_;
244
245 my ($w, $l, $line, $word);
246
247 my ($first_delimiter, $last_delimiter,
248 $first_stopword, $last_stopword,
249 $first_extractword, $last_extractword,
250 $first_contentword, $last_contentword,
251 $phrasedelimiter);
252
253 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
254
255
256 my %symbol;
257 my (%freq);
258
259 print "Calculating vocabulary\n" if ($verbosity > 1);
260
261 # Read and store the stopwords
262 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
263 my %stopwords;
264 foreach my $w (split(/\s+/, $words)) {
265 $l = lc($w);
266 $stopwords{$l} = $w;
267 }
268
269 # Read and store the thesaurus terms
270 $use_thesaurus = 0;
271 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
272 "$language", "agrovoc.lex");
273 if (-e "$lex_file") {
274 open(TH, "<$lex_file");
275 while(<TH>) {
276 s/^\d+ //;
277 s/\(.*\)//;
278 foreach my $w (split(/\s+/, $_)) {
279 $thesaurus{lc($w)} = $w;
280 }
281 }
282 close TH;
283 $use_thesaurus = 1;
284 }
285
286 # Read words in the text and count occurences
287 open(TXT, "<$phindex_dir/clauses");
288 my @words;
289
290 while(<TXT>) {
291 $line = $_;
292 next unless ($line =~ /./);
293
294 @words = split(/\s+/, $line);
295 foreach $w (@words) {
296 $l = lc($w);
297 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
298 $freq{$w}++;
299 }
300 $freq{$senlimit}++;
301 }
302
303 # Calculate the "best" form of each word
304 my (%bestform, %totalfreq, %bestfreq);
305
306 foreach $w (sort (keys %freq)) {
307 $l = lc($w);
308
309 # totalfreq is the number of times a term appears in any form
310 $totalfreq{$l} += $freq{$w};
311
312 if (defined $stopwords{$l}) {
313 $bestform{$l} = $stopwords{$l};
314
315 } elsif (defined $thesaurus{$l}) {
316 $bestform{$l} = $thesaurus{$l};
317
318 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
319 $bestfreq{$l} = $freq{$w};
320 $bestform{$l} = $w;
321 }
322 }
323
324 undef %freq;
325 undef %bestfreq;
326
327
328 # Assign symbol numbers to tokens
329 my $nextsymbol = 1;
330 my (@vocab);
331
332 # Delimiters
333 $first_delimiter = 1;
334
335 foreach $word (@delimiters) {
336
337 $word = lc($word);
338 $bestform{$word} = uc($word);
339 $vocab[$nextsymbol] = $word;
340 $symbol{$word} = $nextsymbol;
341 $nextsymbol++;
342 }
343 $last_delimiter = $nextsymbol - 1;
344
345 # Stopwords
346 $first_stopword = $nextsymbol;
347
348 foreach my $word (sort keys %stopwords) {
349
350 # don't incluse stopword unless it occurs in the text
351 $word = lc($word);
352 next unless ($totalfreq{$word});
353 next if ($symbol{$word});
354
355 $vocab[$nextsymbol] = $word;
356 $symbol{$word} = $nextsymbol;
357 $nextsymbol++;
358 }
359 $last_stopword = $nextsymbol - 1;
360 $first_contentword = $nextsymbol;
361
362 # Thesaurus terms
363 if ($use_thesaurus) {
364 $first_thesaurusword = $nextsymbol;
365
366 foreach my $word (sort keys %thesaurus) {
367
368 $word = lc($word);
369 next if ($symbol{$word});
370 $bestform{$word} = $thesaurus{$word};
371
372 $vocab[$nextsymbol] = $word;
373 $symbol{$word} = $nextsymbol;
374 $nextsymbol++;
375
376 }
377 $last_thesaurusword = $nextsymbol - 1;
378 }
379
380 # Other content words
381 $first_extractword = $nextsymbol;
382
383 foreach my $word (sort (keys %bestform)) {
384
385 next if ($symbol{$word});
386
387 $vocab[$nextsymbol] = $word;
388 $symbol{$word} = $nextsymbol;
389 $nextsymbol++;
390 }
391 $last_extractword = $nextsymbol - 1;
392 $last_contentword = $nextsymbol - 1;
393
394
395 # Outut the words
396 print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
397 open(VOC, ">$phindex_dir/clauses.vocab");
398
399 for (my $i = 1; $i < $nextsymbol; $i++) {
400 $w = $vocab[$i];
401
402 print VOC "$bestform{$w}\n";
403 $totalfreq{$w} = 0 unless ($totalfreq{$w});
404 }
405 close VOC;
406
407
408 # Output statistics about the vocablary
409 print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
410 &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
411 open(STAT, ">$phindex_dir/clauses.stats")
412 || die "Cannot open $phindex_dir/clauses.stats: $!";
413
414 print STAT "first_delimiter $first_delimiter\n";
415 print STAT "last_delimiter $last_delimiter\n";
416 print STAT "first_stopword $first_stopword\n";
417 print STAT "last_stopword $last_stopword\n";
418 if ($use_thesaurus) {
419 print STAT "first_thesaurusword $first_thesaurusword\n";
420 print STAT "last_thesaurusword $last_thesaurusword\n";
421 }
422 print STAT "first_extractword $first_extractword\n";
423 print STAT "last_extractword $last_extractword\n";
424 print STAT "first_contentword $first_contentword\n";
425 print STAT "last_contentword $last_contentword\n";
426 print STAT "first_symbol $first_delimiter\n";
427 print STAT "last_symbol $last_contentword\n";
428 print STAT "first_word $first_stopword\n";
429 print STAT "last_word $last_contentword\n";
430 close STAT;
431
432 undef @vocab;
433
434
435 # Save text as symbol numbers
436 print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
437
438 open(TXT, "<$phindex_dir/clauses");
439 open(NUM, ">$phindex_dir/clauses.numbers");
440
441 $phrasedelimiter = $symbol{lc($senlimit)};
442 print NUM "$symbol{lc($colstart)}\n";
443
444 # set up the special symbols that delimit documents and sentences
445 while(<TXT>) {
446
447 # split sentence into a list of tokens
448 $line = $_;
449 next unless ($line =~ /./);
450 @words = split(/\s+/, $line);
451
452 # output one token at a time
453 foreach $word (@words) {
454 $word = lc($word);
455 print NUM "$symbol{$word}\n";
456 }
457
458 # output phrase delimiter
459 print NUM "$phrasedelimiter\n";
460 }
461
462 print NUM "$symbol{lc($colend)}\n";
463
464}
465
466
467# Prepare the phrases file to be input to mgpp.
468# This means renumbering the phrases in order of decreasing frequency.
469
470
471# This is legacy code, and a little ugly, and may be unix-specific
472# (particularly the sort command).
473
474sub renumber_phrases {
475 my ($phindex_dir, $verbosity) = @_;
476
477 # Sort the phrases into order of increasing frequency
478 print "Sorting phrases into freq order\n" if ($verbosity);
479 system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
480
481 my @symbol;
482
483 # Read the vocabulary
484 print "Reading the vocabulary\n" if ($verbosity);
485 open(V, "<$phindex_dir/clauses.vocab")
486 || die "Cannot open $phindex_dir/clauses.vocab: $!";
487
488 my $i = 1;
489 while(<V>) {
490 chomp;
491 $symbol[$i++] = $_;
492 }
493
494 # Create file for phrase data
495 #
496 # The phrases file looks something like this
497 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
498 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
499 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
500 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
501
502
503 print "Calculate phrase numbers\n" if ($verbosity);
504
505 open(IN, "<$phindex_dir/phrases");
506
507 my %phrasenumber;
508 my $nextphrase = 1;
509
510 my ($line, $num);
511
512 while(<IN>) {
513
514 # read the line
515 chomp;
516 $line = $_;
517
518 # we're only interested in ther first field
519 $line =~ s/:.*//;
520
521 # get a phrase number for this line
522 $num = $nextphrase;
523 $phrasenumber{$line} = $num;
524 $nextphrase++;
525 }
526
527 # Extract the phrase data
528 print "Create phrase file and frequency file\n" if ($verbosity);
529
530 open(IN, "<$phindex_dir/phrases");
531 open(DATA, ">$phindex_dir/mg-p.txt");
532
533 my ($key, $tf, $countexp, $expansions, $countdocs, $documents, $text);
534 my @fields;
535 my @documents;
536 my (@newexp, $k, $n);
537
538 my $linenumber = 0;
539
540 while(<IN>) {
541
542 # read the line
543 chomp;
544 $line = $_;
545 @fields = split(/:/, $line);
546
547 # output the MG document tag
548 print DATA "<Document>";
549
550 # get a phrase number for this line
551 $key = shift @fields;
552 die unless (defined($phrasenumber{$key}));
553 $num = $phrasenumber{$key};
554 print DATA "$num:";
555
556 # get the text of the phrase
557 $text = shift @fields;
558 $text =~ s/s(\d+)/$symbol[$1]/g;
559 print DATA "$text:";
560
561 $linenumber++;
562 if ($linenumber % 1000 == 0) {
563 print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 1);
564 }
565 print "$num: $key\t($text)\n" if ($verbosity > 1);
566
567 # get the phrase frequency
568 $tf = shift @fields;
569 print DATA "$tf:";
570
571 # get the number of expansions
572 $countexp = shift @fields;
573 print DATA "$countexp:";
574
575 # get the expansions and convert them into phrase numbers
576 $expansions = shift @fields;
577 @newexp = ();
578 foreach $k (split(/,/, $expansions)) {
579 die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
580 $n = $phrasenumber{$k};
581 push @newexp, $n;
582 }
583 print DATA join(",", (sort numerically @newexp)), ":";
584
585 # get the number of documents
586 $countdocs = shift @fields;
587 print DATA "$countdocs:";
588
589 # get the documents
590 $documents = shift @fields;
591 $documents =~ s/d//g;
592 @documents = split(/;/, $documents);
593
594 print DATA join(";", (sort by_frequency @documents));
595
596 print DATA "\n";
597
598 }
599}
600
601# sort routines used to renumber phrases
602
603sub numerically { $a <=> $b }
604
605sub by_frequency {
606 my $fa = 1;
607 if ($a =~ /,/) {
608 $fa = $a;
609 $fa =~ s/\d+,//;
610 }
611 my $fb = 1;
612 if ($b =~ /,/) {
613 $fb = $b;
614 $fb =~ s/\d+,//;
615 }
616
617 return ($fb <=> $fa);
618}
619
Note: See TracBrowser for help on using the repository browser.