source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1613

Last change on this file since 1613 was 1613, checked in by paynter, 24 years ago

A little less verbose by default.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
#! /usr/bin/perl -w

###########################################################################
#
# phindgen.pl -- produce Phind index files for a GSDL collection.
#
# Copyright (C) 2000 Gordon W. Paynter
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program generates phind phrase browsing indexes for a Greenstone
# digital library collection.
#
# The GML files that have been imported to the archives directory are
# read, and then the phind indexes are created in the phindex directory.
#
# This version of Phind uses suffix arrays to extract phrases.

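# A typical invocation (the collection name here is only an example):
#
#   perl phindgen.pl -verbosity 2 mycollection
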

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
}


use strict;
use colcfg;
use util;
use parsargv;
use plugin;
use phproc;


sub print_usage {
    print STDERR "\n usage: $0 [options] collection-name\n\n";
    print STDERR " options:\n";
    print STDERR "  -verbosity number    0=none, 3=lots\n";
    print STDERR "  -archivedir directory  Where the converted material ends up\n";
    print STDERR "  -phindexdir directory  Where to place the phind indexes\n";
    print STDERR "  -languages names     The languages to consider in the browser\n";
    print STDERR "  -maxdocs number      Maximum number of documents to process\n";
    print STDERR "  -untidy              Leave working files in phindex directory\n";
    print STDERR "\n";
}


# Define delimiter symbols - this should be abstracted out someplace
my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);

&main ();


sub main {

    my ($verbosity, $archivedir, $phindexdir,
        $phindcfg, $language, $maxdocs, $untidy,
        $collection, $configfilename, $collectcfg);

    # Parse command-line arguments and get the collection name
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/1', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'phindexdir/.*/', \$phindexdir,
                         'languages/\w+/english', \$language,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'untidy', \$untidy )) {
        &print_usage();
        die "\n";
    }
    $collection = &util::use_collection(@ARGV);
    if ($collection eq "") {
        &print_usage();
        die "\n";
    }

    # Read the collection configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
            $phindexdir = $collectcfg->{'phindexdir'};
        }
        if (defined $collectcfg->{'phind'}) {
            $phindcfg = $collectcfg->{'phind'};
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # Calculate default archive and phindex directories
    if ($archivedir eq "") {
        $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    if ($phindexdir eq "") {
        $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    }

    # Make sure the user has in fact requested phind indexes
    if (!defined($phindcfg)) {
        print "No phind information in $configfilename\n";
        exit;
    }


    # Construct phind indexes
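    # $mode and $symbol_limit are passed straight to the external "suffix"
    # program below; $symbol_limit presumably caps the number of symbols the
    # suffix-array builder will handle (this is an assumption - the values
    # are simply forwarded unchanged).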
    my $mode = 1;
    my $symbol_limit = 40000000;
    my ($command, $status);

    # Make a clean phindex directory
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");

    # Read the archives directory and build the clauses file
    print "\nReading archive directory\n" if $verbosity;
    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Create the mg phrase database
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words
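    # In addition to the -T text passes, the word index needs the -I passes
    # to build the inverted file; the perfect hash, weights, inverted-file
    # dictionary and stem indexes are then built on top of it. (The -s 1/2/3
    # calls appear to build the casefolded/stemmed variants of the index,
    # though that detail is not spelled out here.)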
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);


    # Tidy up stray files
    if (!$untidy) {
        print "\nCleaning up\n" if $verbosity;
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }
}

# Execute a system command

sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if $verbosity;
    my $status = system($command);
    if ($status != 0) {
        print STDERR "phindgen.pl - Error executing $command: $!\n";
        exit($status);
    }
}




# Get the text for the collection.
# Save the document names and identifiers as we go.

sub build_clauses {
    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;

    # create a "pluginfo" with the plugins used to read the archives
    # (GMLPlug and ArcPlug)
    my $plugins = [["GMLPlug"], ["ArcPlug"]];
    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
    die "No plugins were loaded." if (scalar(@$pluginfo) == 0);


    # create a phind document processor object to process the documents
    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
                                $doclimit, $verbosity, "STDOUT");

    # process the archives directory
    &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
}



# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably won't
# run under Windows.

sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);


    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<$lex_file");
        while(<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

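    # Each line of the clauses file holds one clause as whitespace-separated
    # tokens (the delimiter symbols defined at the top of this file may also
    # appear as tokens); this is inferred from the way the file is read below.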
    # Read words in the text and count occurrences
    open(TXT, "<$phindex_dir/clauses");
    my @words;

    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }

    # Calculate the "best" form of each word
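    # (the "best" form of a stopword or thesaurus term is the form given in
    # the stopword/thesaurus file; for any other word it is its most frequent
    # surface form, e.g. "FAO" rather than "fao" if the upper-case form occurs
    # more often - the example words are illustrative only)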
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
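    # Symbols are numbered in blocks: the delimiters first, then stopwords,
    # then thesaurus terms (if any), then the remaining content words; the
    # block boundaries are recorded in the clauses.stats file below.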
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include a stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;


    # Output the words
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phindex_dir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Output statistics about the vocabulary
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Save text as symbol numbers
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phindex_dir/clauses");
    open(NUM, ">$phindex_dir/clauses.numbers");

    # set up the special symbols that delimit documents and sentences
    $phrasedelimiter = $symbol{lc($senlimit)};
    print NUM "$symbol{lc($colstart)}\n";

    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    print NUM "$symbol{lc($colend)}\n";

}


# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.


# This is legacy code, and a little ugly, and may be Unix-specific
# (particularly the sort command).

sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of decreasing frequency.
    # This means the expansions will be sorted correctly later on.
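    # (the old-style sort syntax "+2" skips the first two ':'-separated
    # fields, so the reverse numeric sort is on the third field - the
    # phrase frequency)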
    print "Sorting phrases into freq order\n" if ($verbosity);
    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");

    my @symbol;

    # Read the vocabulary
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    my $i = 1;
    while(<V>) {
        chomp;
        $symbol[$i++] = $_;
    }

    # Create file for phrase data
    #
    # The phrases file looks something like this
    # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

    # The first field on each line is a unique phrase identifier.
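    # Judging from the parsing code below, the remaining ':'-separated fields
    # are: the phrase as "s<symbol>" numbers, its frequency, the number of
    # expansions, the comma-separated expansion ids, the number of documents,
    # and the ';'-separated document list ("d<docnum>[,freq]").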
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);

    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);

    open(IN, "<$phindex_dir/phrases");
    while(<IN>) {

        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }


    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);

    open(IN, "<$phindex_dir/phrases");
    open(DATA, ">$phindex_dir/pdata.txt");
    open(IDX, ">$phindex_dir/pword.txt");

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);

    my $linenumber = 0;

    while(<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        if ($text =~ / /) {
            $word = "";
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 3);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "<Document>";
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "<Document>$word\n";


    }
}

# sort routines used to renumber phrases

sub numerically { $a <=> $b }

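# by_frequency sorts document entries into decreasing order of the frequency
# given after the comma: once the leading "d" has been stripped each entry
# looks like "docnum,freq", or just "docnum", which is treated as frequency 1.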
sub by_frequency {
    my $fa = 1;
    if ($a =~ /,/) {
        $fa = $a;
        $fa =~ s/\d+,//;
    }
    my $fb = 1;
    if ($b =~ /,/) {
        $fb = $b;
        $fb =~ s/\d+,//;
    }

    return ($fb <=> $fa);
}