source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1631

Last change on this file since 1631 was 1630, checked in by paynter, 24 years ago

Tidy up pword.txt and pdata.txt files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.4 KB
Line 
1#! /usr/bin/perl -w
2
3###########################################################################
4#
5# phindgen.pl -- produce Phind index files for a GSDL collection.
6#
7# Copyright 2000 Gordon W. Paynter
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# This program generates phind phrase browsing indexes for a Greenstone
32# digital library collection.
33#
34# The GML files that have been imported to the archives directory are
35# read, and then the phind indexes are created in the phindex directory.
36#
37# This version od Phind uses suffix arrays to extract phrases.
38
39
40BEGIN {
41 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
42 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
43 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
44 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
45}
46
47
48use strict;
49use colcfg;
50use util;
51use parsargv;
52use plugin;
53use phproc;
54
55
56sub print_usage {
57 print STDERR "\n usage: $0 [options] collection-name\n\n";
58 print STDERR " options:\n";
59 print STDERR " -verbosity number 0=none, 3=lots\n";
60 print STDERR " -archivedir directory Where the converted material ends up\n";
61 print STDERR " -phindexdir directory Where to place the phind indexes\n";
62 print STDERR " -languages names The languages to consider in the browser\n";
63 print STDERR " -maxdocs number Maximum number of documents to process\n";
64 print STDERR " -untidy Leave working files in phindex directory\n";
65 print STDERR "\n";
66}
67
68
69# Define delimiter symbols - this should be abstracted out someplace
70my $colstart = "COLLECTIONSTART";
71my $colend = "COLLECTIONEND";
72my $doclimit = "DOCUMENTLIMIT";
73my $senlimit = "SENTENCELIMIT";
74my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
75
76&main ();
77
78
79sub main {
80
81 my ($verbosity, $archivedir, $phindexdir,
82 $phindcfg, $language, $maxdocs, $untidy,
83 $collection, $configfilename, $collectcfg);
84
85 # Parse command-line arguments and get the collection name
86 if (!parsargv::parse(\@ARGV,
87 'verbosity/\d+/1', \$verbosity,
88 'archivedir/.*/', \$archivedir,
89 'phindexdir/.*/', \$phindexdir,
90 'languages/\w+/english', \$language,
91 'maxdocs/^\-?\d+/-1', \$maxdocs,
92 'untidy', \$untidy )) {
93 &print_usage();
94 die "\n";
95 }
96 $collection = &util::use_collection(@ARGV);
97 if ($collection eq "") {
98 &print_usage();
99 die "\n";
100 }
101
102 # Read the collection configuration file
103 $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
104 if (-e $configfilename) {
105 $collectcfg = &colcfg::read_collect_cfg ($configfilename);
106 if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
107 $archivedir = $collectcfg->{'archivedir'};
108 }
109 if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
110 $phindexdir = $collectcfg->{'phindexdir'};
111 }
112 if (defined $collectcfg->{'phind'}) {
113 $phindcfg = $collectcfg->{'phind'};
114 }
115 } else {
116 die "Couldn't find the configuration file $configfilename\n";
117 }
118
119 # Calculate default archive and phindex directories
120 if ($archivedir eq "") {
121 $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
122 }
123 if ($phindexdir eq "") {
124 $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
125 }
126
127 # Make sure theuser has in fact requested phind indexes
128 if (!defined($phindcfg)) {
129 print "No phind information in $configfilename\n";
130 exit;
131 }
132
133
134 # Construct phind indexes
135 my $mode = 1;
136 my $symbol_limit = 40000000;
137 my ($command, $status);
138
139 # Make a clean phindex directory
140 if (-e "$phindexdir") {
141 &util::rm_r("$phindexdir");
142 }
143 &util::mk_dir("$phindexdir");
144
145 # Read the archives directory and build the clauses file
146 print "\nReading archive directory\n" if $verbosity;
147 &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);
148
149 # Generate the vocabulary, symbol statistics, and numbers file
150 # from the clauses file
151 print "\nExtracting vocabulary and statistics\n" if $verbosity;
152 &extract_vocabulary($phindexdir, $language, $verbosity);
153
154 # Use the suffix program to generate the phindex/phrases file
155 &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);
156
157 # Create the phrase file and put phrase numbers in phindex/phrases
158 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
159 &renumber_phrases("$phindexdir", $verbosity);
160
161 # Create the mg phrase database
162 my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
163 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
164 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
165
166 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
167 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
168 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
169 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
170
171 print "\nCreating phrase databases\n";
172 my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
173 my $mg_stem = "pdata";
174
175 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
176 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
177 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
178
179 # create the mg index of words
180 print "\nCreating word-level search indexes\n";
181 $mg_input = &util::filename_cat($phindexdir, "pword.txt");
182 $mg_stem = "pword";
183
184 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
185 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
186 &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
187 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
188 &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
189 &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
190
191 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
192 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
193 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
194
195 # create the mg document information database
196 print "\nCreating document information databases\n";
197 $mg_input = &util::filename_cat($phindexdir, "docs.txt");
198 $mg_stem = "docs";
199
200 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
201 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
202 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
203
204
205 # Tidy up stray files
206 if (!$untidy) {
207 print "\nCleaning up\n" if $verbosity;
208 &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
209 "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
210 "$phindexdir/phrases", "$phindexdir/docs.txt",
211 "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
212 my $outfile = 1;
213 while (-e "$phindexdir/outPhrase.$outfile") {
214 &util::rm("$phindexdir/outPhrase.$outfile");
215 $outfile++;
216 }
217 }
218}
219
220# Execute a system command
221
222sub execute {
223 my ($command, $verbosity) = @_;
224 print "Executing: $command\n" if $verbosity;
225 my $status = system($command);
226 if ($status != 0) {
227 print STDERR "phindgen.pl - Error executing $command: $!\n";
228 exit($status);
229 }
230}
231
232
233
234
235
236# Get the text for the collection.
237# Save the document names and identifiers as we go.
238
239sub build_clauses {
240 my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;
241
242 # create a "pluginfo" for ArcPlug and RecPlug
243 my $plugins = [["GMLPlug"], ["ArcPlug"]];
244 my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
245 die "No plugins were loaded." if (scalar(@$pluginfo) == 0);
246
247
248 # create a phind document processor object to process the documents
249 my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
250 $doclimit, $verbosity, "STDOUT");
251
252 # process the archives directory
253 &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
254}
255
256
257
258# Generate the vocabulary, symbol statistics, and numbers file from the
259# clauses file. This is legacy code, so is a bit messy and probably wont
260# run under windows.
261
262sub extract_vocabulary {
263 my ($phindex_dir, $language, $verbosity) = @_;
264
265 my ($w, $l, $line, $word);
266
267 my ($first_delimiter, $last_delimiter,
268 $first_stopword, $last_stopword,
269 $first_extractword, $last_extractword,
270 $first_contentword, $last_contentword,
271 $phrasedelimiter);
272
273 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
274
275
276 my %symbol;
277 my (%freq);
278
279 print "Calculating vocabulary\n" if ($verbosity > 1);
280
281 # Read and store the stopwords
282 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
283 my %stopwords;
284 foreach my $w (split(/\s+/, $words)) {
285 $l = lc($w);
286 $stopwords{$l} = $w;
287 }
288
289 # Read and store the thesaurus terms
290 $use_thesaurus = 0;
291 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
292 "$language", "agrovoc.lex");
293 if (-e "$lex_file") {
294 open(TH, "<$lex_file");
295 while(<TH>) {
296 s/^\d+ //;
297 s/\(.*\)//;
298 foreach my $w (split(/\s+/, $_)) {
299 $thesaurus{lc($w)} = $w;
300 }
301 }
302 close TH;
303 $use_thesaurus = 1;
304 }
305
306 # Read words in the text and count occurences
307 open(TXT, "<$phindex_dir/clauses");
308 my @words;
309
310 while(<TXT>) {
311 $line = $_;
312 next unless ($line =~ /./);
313
314 @words = split(/\s+/, $line);
315 foreach $w (@words) {
316 $l = lc($w);
317 $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
318 $freq{$w}++;
319 }
320 $freq{$senlimit}++;
321 }
322
323 # Calculate the "best" form of each word
324 my (%bestform, %totalfreq, %bestfreq);
325
326 foreach $w (sort (keys %freq)) {
327 $l = lc($w);
328
329 # totalfreq is the number of times a term appears in any form
330 $totalfreq{$l} += $freq{$w};
331
332 if (defined $stopwords{$l}) {
333 $bestform{$l} = $stopwords{$l};
334
335 } elsif (defined $thesaurus{$l}) {
336 $bestform{$l} = $thesaurus{$l};
337
338 } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
339 $bestfreq{$l} = $freq{$w};
340 $bestform{$l} = $w;
341 }
342 }
343
344 undef %freq;
345 undef %bestfreq;
346
347
348 # Assign symbol numbers to tokens
349 my $nextsymbol = 1;
350 my (@vocab);
351
352 # Delimiters
353 $first_delimiter = 1;
354
355 foreach $word (@delimiters) {
356
357 $word = lc($word);
358 $bestform{$word} = uc($word);
359 $vocab[$nextsymbol] = $word;
360 $symbol{$word} = $nextsymbol;
361 $nextsymbol++;
362 }
363 $last_delimiter = $nextsymbol - 1;
364
365 # Stopwords
366 $first_stopword = $nextsymbol;
367
368 foreach my $word (sort keys %stopwords) {
369
370 # don't incluse stopword unless it occurs in the text
371 $word = lc($word);
372 next unless ($totalfreq{$word});
373 next if ($symbol{$word});
374
375 $vocab[$nextsymbol] = $word;
376 $symbol{$word} = $nextsymbol;
377 $nextsymbol++;
378 }
379 $last_stopword = $nextsymbol - 1;
380 $first_contentword = $nextsymbol;
381
382 # Thesaurus terms
383 if ($use_thesaurus) {
384 $first_thesaurusword = $nextsymbol;
385
386 foreach my $word (sort keys %thesaurus) {
387
388 $word = lc($word);
389 next if ($symbol{$word});
390 $bestform{$word} = $thesaurus{$word};
391
392 $vocab[$nextsymbol] = $word;
393 $symbol{$word} = $nextsymbol;
394 $nextsymbol++;
395
396 }
397 $last_thesaurusword = $nextsymbol - 1;
398 }
399
400 # Other content words
401 $first_extractword = $nextsymbol;
402
403 foreach my $word (sort (keys %bestform)) {
404
405 next if ($symbol{$word});
406
407 $vocab[$nextsymbol] = $word;
408 $symbol{$word} = $nextsymbol;
409 $nextsymbol++;
410 }
411 $last_extractword = $nextsymbol - 1;
412 $last_contentword = $nextsymbol - 1;
413
414
415 # Outut the words
416 print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
417 open(VOC, ">$phindex_dir/clauses.vocab");
418
419 for (my $i = 1; $i < $nextsymbol; $i++) {
420 $w = $vocab[$i];
421
422 print VOC "$bestform{$w}\n";
423 $totalfreq{$w} = 0 unless ($totalfreq{$w});
424 }
425 close VOC;
426
427
428 # Output statistics about the vocablary
429 print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
430 &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
431 open(STAT, ">$phindex_dir/clauses.stats")
432 || die "Cannot open $phindex_dir/clauses.stats: $!";
433
434 print STAT "first_delimiter $first_delimiter\n";
435 print STAT "last_delimiter $last_delimiter\n";
436 print STAT "first_stopword $first_stopword\n";
437 print STAT "last_stopword $last_stopword\n";
438 if ($use_thesaurus) {
439 print STAT "first_thesaurusword $first_thesaurusword\n";
440 print STAT "last_thesaurusword $last_thesaurusword\n";
441 }
442 print STAT "first_extractword $first_extractword\n";
443 print STAT "last_extractword $last_extractword\n";
444 print STAT "first_contentword $first_contentword\n";
445 print STAT "last_contentword $last_contentword\n";
446 print STAT "first_symbol $first_delimiter\n";
447 print STAT "last_symbol $last_contentword\n";
448 print STAT "first_word $first_stopword\n";
449 print STAT "last_word $last_contentword\n";
450 close STAT;
451
452 undef @vocab;
453
454
455 # Save text as symbol numbers
456 print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
457
458 open(TXT, "<$phindex_dir/clauses");
459 open(NUM, ">$phindex_dir/clauses.numbers");
460
461 $phrasedelimiter = $symbol{lc($senlimit)};
462 print NUM "$symbol{lc($colstart)}\n";
463
464 # set up the special symbols that delimit documents and sentences
465 while(<TXT>) {
466
467 # split sentence into a list of tokens
468 $line = $_;
469 next unless ($line =~ /./);
470 @words = split(/\s+/, $line);
471
472 # output one token at a time
473 foreach $word (@words) {
474 $word = lc($word);
475 print NUM "$symbol{$word}\n";
476 }
477
478 # output phrase delimiter
479 print NUM "$phrasedelimiter\n";
480 }
481
482 print NUM "$symbol{lc($colend)}\n";
483
484}
485
486
487# Prepare the phrases file to be input to mgpp.
488# This means renumbering the phrases in order of decreasing frequency.
489
490
491# This is legacy code, and a little ugly, and may be unix-specific
492# (particularly the sort command).
493
494sub renumber_phrases {
495 my ($phindex_dir, $verbosity) = @_;
496
497 # Sort the phrases into order of increasing frequency
498 # This means the expansions will be sorted correctly later on.
499 print "Sorting phrases into freq order\n" if ($verbosity);
500 system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
501
502 my @symbol;
503
504 # Read the vocabulary
505 print "Reading the vocabulary\n" if ($verbosity);
506 open(V, "<$phindex_dir/clauses.vocab")
507 || die "Cannot open $phindex_dir/clauses.vocab: $!";
508
509 my $i = 1;
510 while(<V>) {
511 chomp;
512 $symbol[$i++] = $_;
513 }
514
515 # Create file for phrase data
516 #
517 # The phrases file looks something like this
518 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
519 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
520 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
521 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
522
523 # The first field on each line is a unique phrase identifier.
524 # We need to calculate phrase numbers for each phrase
525 print "Calculate phrase numbers\n" if ($verbosity);
526
527 my %phrasenumber;
528 my $nextphrase = 1;
529 my ($line);
530
531 open(IN, "<$phindex_dir/phrases");
532 while(<IN>) {
533
534 # read the line
535 chomp;
536 $line = $_;
537
538 # we're only interested in the first field
539 $line =~ s/:.*//;
540
541 # get a phrase number for this line
542 $phrasenumber{$line} = $nextphrase;
543 $nextphrase++;
544 }
545
546
547 # Now we create a new phrase file using phrase numbers, not the old IDs.
548 print "Format phrase data for MGPP\n" if ($verbosity);
549
550 open(IN, "<$phindex_dir/phrases");
551 open(DATA, ">$phindex_dir/pdata.txt");
552 open(IDX, ">$phindex_dir/pword.txt");
553
554 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
555 my @fields;
556 my @documents;
557 my (@newexp, $k, $n);
558
559 my $linenumber = 0;
560
561 while(<IN>) {
562
563 # read the line
564 chomp;
565 $line = $_;
566 @fields = split(/:/, $line);
567
568 # get a phrase number for this line
569 $key = shift @fields;
570 die unless (defined($phrasenumber{$key}));
571 $num = $phrasenumber{$key};
572
573 # get the text of the phrase
574 $text = shift @fields;
575 $text =~ s/s(\d+)/$symbol[$1]/g;
576 if ($text =~ / /) {
577 $word = "";
578 } else {
579 $word = $text;
580 }
581
582 $linenumber++;
583 if ($linenumber % 1000 == 0) {
584 print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
585 }
586 print "$num: $key\t($text)\n" if ($verbosity > 3);
587
588 # get the phrase frequency
589 $tf = shift @fields;
590
591 # get the number of expansions
592 $countexp = shift @fields;
593
594 # get the expansions and convert them into phrase numbers
595 $expansions = shift @fields;
596 @newexp = ();
597 foreach $k (split(/,/, $expansions)) {
598 die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
599 $n = $phrasenumber{$k};
600 push @newexp, $n;
601 }
602 @newexp = sort numerically @newexp;
603
604 # get the number of documents
605 $countdocs = shift @fields;
606
607 # get the documents
608 $documents = shift @fields;
609 $documents =~ s/d//g;
610 @documents = split(/;/, $documents);
611 @documents = sort by_frequency @documents;
612
613 # output the phrase data
614 print DATA "<Document>";
615 print DATA "$num:$text:$tf:$countexp:$countdocs:";
616 print DATA join(",", @newexp), ":", join(";", @documents), "\n";
617
618 # output the word index search data
619 print IDX "<Document>$word\n";
620
621
622 }
623}
624
625# sort routines used to renumber phrases
626
627sub numerically { $a <=> $b }
628
629sub by_frequency {
630 my $fa = 1;
631 if ($a =~ /,/) {
632 $fa = $a;
633 $fa =~ s/\d+,//;
634 }
635 my $fb = 1;
636 if ($b =~ /,/) {
637 $fb = $b;
638 $fb =~ s/\d+,//;
639 }
640
641 return ($fb <=> $fa);
642}
643
Note: See TracBrowser for help on using the repository browser.