source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1604

Last change on this file since 1604 was 1604, checked in by paynter, 24 years ago

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1#! /usr/bin/perl -w
2
3###########################################################################
4#
5# phindgen.pl -- produce Phind index files for a GSDL collection.
6#
7# Copyright (C) 2000 Gordon W. Paynter
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30# This program generates phind phrase browsing indexes for a Greenstone
31# digital library collection.
32#
33# The GML files that have been imported to the archives directory are
34# read, and then the phind indexes are created in the phindex directory.
35#
36# This version of Phind uses suffix arrays to extract phrases.
37
38
# Sanity-check the Greenstone environment and extend the module search
# path.  This must happen in a BEGIN block so that @INC is updated
# before the 'use colcfg' / 'use plugin' / 'use phproc' lines below are
# compiled.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
}
45
46
47use strict;
48use colcfg;
49use util;
50use parsargv;
51use plugin;
52use phproc;
53
54
# Print a usage summary for this script to STDERR.
sub print_usage {
    print STDERR <<"END_USAGE";

 usage: $0 [options] collection-name

 options:
 -verbosity number 0=none, 3=lots
 -archivedir directory Where the converted material ends up
 -phindexdir directory Where to place the phind indexes
 -languages names The languages to consider in the browser
 -maxdocs number Maximum number of documents to process
 -untidy Leave working files in phindex directory

END_USAGE
}
66
67
# Define delimiter symbols - this should be abstracted out someplace
# These sentinel tokens mark collection, document, and sentence
# boundaries in the clauses token stream; extract_vocabulary() assigns
# them the first symbol numbers (note that it also lowercases
# @delimiters in place via foreach aliasing).
my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);

&main ();
76
77
# Main entry point.  Parses options, reads the collection configuration,
# then builds the phind indexes:
#   1. build the clauses file from the archives (build_clauses)
#   2. extract vocabulary, statistics, and numbers (extract_vocabulary)
#   3. run the external "suffix" program to extract phrases
#   4. renumber phrases for mgpp input (renumber_phrases)
#   5. build three MGPP databases: phrase data (pdata), word-level
#      search (pword), and document information (docs)
sub main {

    my ($verbosity, $archivedir, $phindexdir,
        $phindcfg, $language, $maxdocs, $untidy,
        $collection, $configfilename, $collectcfg);

    # Parse command-line arguments and get the collection name
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/1', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'phindexdir/.*/', \$phindexdir,
                         'languages/\w+/english', \$language,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'untidy', \$untidy )) {
        &print_usage();
        die "\n";
    }
    $collection = &util::use_collection(@ARGV);
    if ($collection eq "") {
        &print_usage();
        die "\n";
    }

    # Read the collection configuration file.  Values given on the
    # command line take precedence over the configuration file's
    # archivedir/phindexdir settings.
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
            $phindexdir = $collectcfg->{'phindexdir'};
        }
        if (defined $collectcfg->{'phind'}) {
            $phindcfg = $collectcfg->{'phind'};
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # Calculate default archive and phindex directories
    if ($archivedir eq "") {
        $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    if ($phindexdir eq "") {
        $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    }

    # Make sure the user has in fact requested phind indexes
    if (!defined($phindcfg)) {
        print "No phind information in $configfilename\n";
        exit;
    }


    # Construct phind indexes
    my $mode = 1;                  # passed to the suffix program below
    my $symbol_limit = 40000000;   # passed to the suffix program below
                                   # (presumably a symbol-count cap -- see suffix)
    my ($command, $status);        # NOTE(review): declared but never used here

    # Make a clean phindex directory (an existing one is deleted!)
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");

    # Read the archives directory and build the clauses file
    print "\nReading archive directory\n" if $verbosity;
    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Paths to the MGPP text tools used below
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    # Create the mg phrase database: two mg_passes runs (-T1 then -T2)
    # around mg_compression_dict
    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words: as above but with the -I passes,
    # plus hash, weights, dictionary, and three stemmed indexes
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);


    # Tidy up stray files (unless -untidy was given)
    if (!$untidy) {
        print "\nCleaning up\n" if $verbosity;
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }
}
217
# Execute a system command, echoing it first when $verbosity is true.
# Terminates the whole script if the command fails.
#
# system() returns the raw wait status: the child's exit code lives in
# the high byte, so it must be shifted right 8 bits before being passed
# to exit().  The original code called exit($status), which truncates
# modulo 256 -- a child exiting with code 1 yields status 256, and
# exit(256) becomes exit(0), falsely reporting success to the caller.
# $! is only meaningful when system() itself failed to launch the
# command (status -1), so it is reported only in that case.
sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if $verbosity;
    my $status = system($command);
    if ($status != 0) {
        if ($status == -1) {
            # The command could not be started at all
            print STDERR "phindgen.pl - Error executing $command: $!\n";
            exit(1);
        }
        print STDERR "phindgen.pl - Error executing $command: exit status "
            . ($status >> 8) . "\n";
        # Propagate the child's exit code, but never exit 0 on failure
        exit(($status >> 8) || 1);
    }
}
229
230
231
232
233
# Get the text for the collection.
# Save the document names and identifiers as we go.
#
# Feeds every document in $archive_dir through the GML/Arc plugins to a
# phproc processor object; phproc presumably writes the clauses and
# document files into $phindex_dir (see phproc.pm -- not visible here).
# $maxdocs limits the number of documents processed.

sub build_clauses {
    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;

    # create a "pluginfo" for GMLPlug and ArcPlug
    my $plugins = [["GMLPlug"], ["ArcPlug"]];
    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
    die "No plugins were loaded." if (scalar(@$pluginfo) == 0);


    # create a phind document processor object to process the documents
    # ($doclimit is the file-level document-boundary token)
    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
                                $doclimit, $verbosity, "STDOUT");

    # process the archives directory
    &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
}
253
254
255
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under Windows (it shells out to find/xargs).
#
# Reads:   $phindex_dir/clauses (whitespace-separated tokens, one
#              clause per line), the language's stopword (*.sw) files,
#              and an optional agrovoc.lex thesaurus file.
# Writes:  $phindex_dir/clauses.vocab   - best form of each symbol
#          $phindex_dir/clauses.stats   - symbol number ranges
#          $phindex_dir/clauses.numbers - the text as symbol numbers
#
# (Fixed: the opens were previously unchecked 2-arg opens, so a missing
# clauses file silently produced empty output files, and the write
# handles were never closed so buffered write errors went undetected.)

sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token => symbol number; %freq counts token forms
    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (unix-specific: shells out to find/xargs)
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<", "$lex_file")
            || die "Cannot open $lex_file: $!";
        while(<TH>) {
            s/^\d+ //;     # strip the leading term number
            s/\(.*\)//;    # strip any parenthesised qualifier
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

    # Read words in the text and count occurrences
    open(TXT, "<", "$phindex_dir/clauses")
        || die "Cannot open $phindex_dir/clauses: $!";
    my @words;

    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # stopwords and thesaurus terms are counted under their
            # lowercase form
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }
    close TXT;

    # Calculate the "best" form of each word: the canonical form for
    # stopwords and thesaurus terms, otherwise the most frequent
    # capitalisation seen in the text.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters get the first symbol numbers.  (Note: the foreach
    # aliases $word to the elements, so it lowercases the file-level
    # @delimiters array in place.)
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;


    # Output the words, one best-form per line, in symbol-number order
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">", "$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC
        or die "Cannot close $phindex_dir/clauses.vocab: $!";


    # Output statistics about the vocabulary (symbol number ranges used
    # by the suffix program and phindcgi)
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">", "$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT
        or die "Cannot close $phindex_dir/clauses.stats: $!";

    undef @vocab;


    # Save text as symbol numbers, one symbol per line
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<", "$phindex_dir/clauses")
        || die "Cannot open $phindex_dir/clauses: $!";
    open(NUM, ">", "$phindex_dir/clauses.numbers")
        || die "Cannot open $phindex_dir/clauses.numbers: $!";

    $phrasedelimiter = $symbol{lc($senlimit)};
    print NUM "$symbol{lc($colstart)}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    print NUM "$symbol{lc($colend)}\n";
    close TXT;
    close NUM
        or die "Cannot close $phindex_dir/clauses.numbers: $!";
}
483
484
# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.
#
# This is legacy code, and a little ugly, and may be unix-specific
# (particularly the sort command).
#
# Reads:   $phindex_dir/phrases and $phindex_dir/clauses.vocab
# Writes:  $phindex_dir/pdata.txt (phrase database input) and
#          $phindex_dir/pword.txt (word-search database input)
#
# (Fixed: the sort's exit status and all opens were previously
# unchecked, and the two write handles were never closed, so failures
# silently produced wrong or truncated output.)

sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of decreasing frequency (-rn on the
    # third ':'-separated field).  This means the expansions will be
    # sorted correctly later on.
    # NOTE(review): "+2" is the obsolete sort key syntax; modern sort
    # implementations want "-k 3,3" -- confirm on the target platform.
    print "Sorting phrases into freq order\n" if ($verbosity);
    my $sortstatus = system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
    if ($sortstatus != 0) {
        die "phindgen.pl - could not sort $phindex_dir/phrases\n";
    }

    my @symbol;

    # Read the vocabulary: @symbol maps symbol number => word text
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<", "$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    my $i = 1;
    while(<V>) {
        chomp;
        $symbol[$i++] = $_;
    }
    close V;

    # The phrases file looks something like this
    #   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    #   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    # i.e. the ':'-separated fields are: phrase ID, symbols, frequency,
    # expansion count, expansion IDs, document count, documents.

    # The first field on each line is a unique phrase identifier.
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);

    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);

    open(IN, "<", "$phindex_dir/phrases")
        || die "Cannot open $phindex_dir/phrases: $!";
    while(<IN>) {

        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }
    close IN;


    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);

    open(IN, "<", "$phindex_dir/phrases")
        || die "Cannot open $phindex_dir/phrases: $!";
    open(DATA, ">", "$phindex_dir/pdata.txt")
        || die "Cannot open $phindex_dir/pdata.txt: $!";
    open(IDX, ">", "$phindex_dir/pword.txt")
        || die "Cannot open $phindex_dir/pword.txt: $!";

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);

    my $linenumber = 0;

    while(<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die "ERROR - no phrase number for: $key\n"
            unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase, replacing each sNNN symbol with
        # its word from the vocabulary
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        if ($text =~ / /) {
            $word = "";    # multi-word phrases get no word-search entry
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 1);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 1);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents, stripping the "d" prefixes and sorting by
        # within-document frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "<Document>";
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "<Document>$word\n";
    }
    close IN;
    close DATA
        or die "Cannot close $phindex_dir/pdata.txt: $!";
    close IDX
        or die "Cannot close $phindex_dir/pword.txt: $!";
}
622
623# sort routines used to renumber phrases
624
# Ascending numeric comparator for sort().
sub numerically {
    return $a <=> $b;
}
626
# Comparator for document entries of the form "docnum,freq" (entries
# without a ",freq" part default to frequency 1); sorts by descending
# frequency.
sub by_frequency {
    my $freq_a = 1;
    my $freq_b = 1;

    # When a frequency is present, copy the entry and strip the
    # "docnum," prefix, leaving just the frequency.
    ($freq_a = $a) =~ s/\d+,// if ($a =~ /,/);
    ($freq_b = $b) =~ s/\d+,// if ($b =~ /,/);

    return ($freq_b <=> $freq_a);
}
641
Note: See TracBrowser for help on using the repository browser.