Context Navigation

source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1631

Last change on this file since 1631 was 1630, checked in by paynter, 24 years ago
Tidy up pword.txt and pdata.txt files.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.4 KB

Line
1	#! /usr/bin/perl -w
2
3	###########################################################################
4	#
5	# phindgen.pl -- produce Phind index files for a GSDL collection.
6	#
7	# Copyright 2000 Gordon W. Paynter
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# This program generates phind phrase browsing indexes for a Greenstone
32	# digital library collection.
33	#
34	# The GML files that have been imported to the archives directory are
35	# read, and then the phind indexes are created in the phindex directory.
36	#
37	# This version od Phind uses suffix arrays to extract phrases.
38
39
40	BEGIN {
41	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
42	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
43	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
44	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
45	}
46
47
48	use strict;
49	use colcfg;
50	use util;
51	use parsargv;
52	use plugin;
53	use phproc;
54
55
56	sub print_usage {
57	print STDERR "\n usage: $0 [options] collection-name\n\n";
58	print STDERR " options:\n";
59	print STDERR " -verbosity number 0=none, 3=lots\n";
60	print STDERR " -archivedir directory Where the converted material ends up\n";
61	print STDERR " -phindexdir directory Where to place the phind indexes\n";
62	print STDERR " -languages names The languages to consider in the browser\n";
63	print STDERR " -maxdocs number Maximum number of documents to process\n";
64	print STDERR " -untidy Leave working files in phindex directory\n";
65	print STDERR "\n";
66	}
67
68
69	# Define delimiter symbols - this should be abstracted out someplace
70	my $colstart = "COLLECTIONSTART";
71	my $colend = "COLLECTIONEND";
72	my $doclimit = "DOCUMENTLIMIT";
73	my $senlimit = "SENTENCELIMIT";
74	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
75
76	&main ();
77
78
79	sub main {
80
81	my ($verbosity, $archivedir, $phindexdir,
82	$phindcfg, $language, $maxdocs, $untidy,
83	$collection, $configfilename, $collectcfg);
84
85	# Parse command-line arguments and get the collection name
86	if (!parsargv::parse(\@ARGV,
87	'verbosity/\d+/1', \$verbosity,
88	'archivedir/.*/', \$archivedir,
89	'phindexdir/.*/', \$phindexdir,
90	'languages/\w+/english', \$language,
91	'maxdocs/^\-?\d+/-1', \$maxdocs,
92	'untidy', \$untidy )) {
93	&print_usage();
94	die "\n";
95	}
96	$collection = &util::use_collection(@ARGV);
97	if ($collection eq "") {
98	&print_usage();
99	die "\n";
100	}
101
102	# Read the collection configuration file
103	$configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
104	if (-e $configfilename) {
105	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
106	if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
107	$archivedir = $collectcfg->{'archivedir'};
108	}
109	if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
110	$phindexdir = $collectcfg->{'phindexdir'};
111	}
112	if (defined $collectcfg->{'phind'}) {
113	$phindcfg = $collectcfg->{'phind'};
114	}
115	} else {
116	die "Couldn't find the configuration file $configfilename\n";
117	}
118
119	# Calculate default archive and phindex directories
120	if ($archivedir eq "") {
121	$archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
122	}
123	if ($phindexdir eq "") {
124	$phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
125	}
126
127	# Make sure theuser has in fact requested phind indexes
128	if (!defined($phindcfg)) {
129	print "No phind information in $configfilename\n";
130	exit;
131	}
132
133
134	# Construct phind indexes
135	my $mode = 1;
136	my $symbol_limit = 40000000;
137	my ($command, $status);
138
139	# Make a clean phindex directory
140	if (-e "$phindexdir") {
141	&util::rm_r("$phindexdir");
142	}
143	&util::mk_dir("$phindexdir");
144
145	# Read the archives directory and build the clauses file
146	print "\nReading archive directory\n" if $verbosity;
147	&build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);
148
149	# Generate the vocabulary, symbol statistics, and numbers file
150	# from the clauses file
151	print "\nExtracting vocabulary and statistics\n" if $verbosity;
152	&extract_vocabulary($phindexdir, $language, $verbosity);
153
154	# Use the suffix program to generate the phindex/phrases file
155	&execute("suffix $phindexdir $symbol_limit $mode", $verbosity);
156
157	# Create the phrase file and put phrase numbers in phindex/phrases
158	print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
159	&renumber_phrases("$phindexdir", $verbosity);
160
161	# Create the mg phrase database
162	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
163	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
164	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
165
166	my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
167	my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
168	my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
169	my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");
170
171	print "\nCreating phrase databases\n";
172	my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
173	my $mg_stem = "pdata";
174
175	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
176	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
177	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
178
179	# create the mg index of words
180	print "\nCreating word-level search indexes\n";
181	$mg_input = &util::filename_cat($phindexdir, "pword.txt");
182	$mg_stem = "pword";
183
184	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
185	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
186	&execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
187	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
188	&execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
189	&execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);
190
191	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
192	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
193	&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);
194
195	# create the mg document information database
196	print "\nCreating document information databases\n";
197	$mg_input = &util::filename_cat($phindexdir, "docs.txt");
198	$mg_stem = "docs";
199
200	&execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
201	&execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
202	&execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);
203
204
205	# Tidy up stray files
206	if (!$untidy) {
207	print "\nCleaning up\n" if $verbosity;
208	&util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
209	"$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
210	"$phindexdir/phrases", "$phindexdir/docs.txt",
211	"$phindexdir/pdata.txt", "$phindexdir/pword.txt");
212	my $outfile = 1;
213	while (-e "$phindexdir/outPhrase.$outfile") {
214	&util::rm("$phindexdir/outPhrase.$outfile");
215	$outfile++;
216	}
217	}
218	}
219
220	# Execute a system command
221
222	sub execute {
223	my ($command, $verbosity) = @_;
224	print "Executing: $command\n" if $verbosity;
225	my $status = system($command);
226	if ($status != 0) {
227	print STDERR "phindgen.pl - Error executing $command: $!\n";
228	exit($status);
229	}
230	}
231
232
233
234
235
236	# Get the text for the collection.
237	# Save the document names and identifiers as we go.
238
239	sub build_clauses {
240	my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;
241
242	# create a "pluginfo" for ArcPlug and RecPlug
243	my $plugins = [["GMLPlug"], ["ArcPlug"]];
244	my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
245	die "No plugins were loaded." if (scalar(@$pluginfo) == 0);
246
247
248	# create a phind document processor object to process the documents
249	my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
250	$doclimit, $verbosity, "STDOUT");
251
252	# process the archives directory
253	&plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
254	}
255
256
257
258	# Generate the vocabulary, symbol statistics, and numbers file from the
259	# clauses file. This is legacy code, so is a bit messy and probably wont
260	# run under windows.
261
262	sub extract_vocabulary {
263	my ($phindex_dir, $language, $verbosity) = @_;
264
265	my ($w, $l, $line, $word);
266
267	my ($first_delimiter, $last_delimiter,
268	$first_stopword, $last_stopword,
269	$first_extractword, $last_extractword,
270	$first_contentword, $last_contentword,
271	$phrasedelimiter);
272
273	my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
274
275
276	my %symbol;
277	my (%freq);
278
279	print "Calculating vocabulary\n" if ($verbosity > 1);
280
281	# Read and store the stopwords
282	my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" \| xargs cat`;
283	my %stopwords;
284	foreach my $w (split(/\s+/, $words)) {
285	$l = lc($w);
286	$stopwords{$l} = $w;
287	}
288
289	# Read and store the thesaurus terms
290	$use_thesaurus = 0;
291	my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
292	"$language", "agrovoc.lex");
293	if (-e "$lex_file") {
294	open(TH, "<$lex_file");
295	while(<TH>) {
296	s/^\d+ //;
297	s/\(.*\)//;
298	foreach my $w (split(/\s+/, $_)) {
299	$thesaurus{lc($w)} = $w;
300	}
301	}
302	close TH;
303	$use_thesaurus = 1;
304	}
305
306	# Read words in the text and count occurences
307	open(TXT, "<$phindex_dir/clauses");
308	my @words;
309
310	while(<TXT>) {
311	$line = $_;
312	next unless ($line =~ /./);
313
314	@words = split(/\s+/, $line);
315	foreach $w (@words) {
316	$l = lc($w);
317	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
318	$freq{$w}++;
319	}
320	$freq{$senlimit}++;
321	}
322
323	# Calculate the "best" form of each word
324	my (%bestform, %totalfreq, %bestfreq);
325
326	foreach $w (sort (keys %freq)) {
327	$l = lc($w);
328
329	# totalfreq is the number of times a term appears in any form
330	$totalfreq{$l} += $freq{$w};
331
332	if (defined $stopwords{$l}) {
333	$bestform{$l} = $stopwords{$l};
334
335	} elsif (defined $thesaurus{$l}) {
336	$bestform{$l} = $thesaurus{$l};
337
338	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
339	$bestfreq{$l} = $freq{$w};
340	$bestform{$l} = $w;
341	}
342	}
343
344	undef %freq;
345	undef %bestfreq;
346
347
348	# Assign symbol numbers to tokens
349	my $nextsymbol = 1;
350	my (@vocab);
351
352	# Delimiters
353	$first_delimiter = 1;
354
355	foreach $word (@delimiters) {
356
357	$word = lc($word);
358	$bestform{$word} = uc($word);
359	$vocab[$nextsymbol] = $word;
360	$symbol{$word} = $nextsymbol;
361	$nextsymbol++;
362	}
363	$last_delimiter = $nextsymbol - 1;
364
365	# Stopwords
366	$first_stopword = $nextsymbol;
367
368	foreach my $word (sort keys %stopwords) {
369
370	# don't incluse stopword unless it occurs in the text
371	$word = lc($word);
372	next unless ($totalfreq{$word});
373	next if ($symbol{$word});
374
375	$vocab[$nextsymbol] = $word;
376	$symbol{$word} = $nextsymbol;
377	$nextsymbol++;
378	}
379	$last_stopword = $nextsymbol - 1;
380	$first_contentword = $nextsymbol;
381
382	# Thesaurus terms
383	if ($use_thesaurus) {
384	$first_thesaurusword = $nextsymbol;
385
386	foreach my $word (sort keys %thesaurus) {
387
388	$word = lc($word);
389	next if ($symbol{$word});
390	$bestform{$word} = $thesaurus{$word};
391
392	$vocab[$nextsymbol] = $word;
393	$symbol{$word} = $nextsymbol;
394	$nextsymbol++;
395
396	}
397	$last_thesaurusword = $nextsymbol - 1;
398	}
399
400	# Other content words
401	$first_extractword = $nextsymbol;
402
403	foreach my $word (sort (keys %bestform)) {
404
405	next if ($symbol{$word});
406
407	$vocab[$nextsymbol] = $word;
408	$symbol{$word} = $nextsymbol;
409	$nextsymbol++;
410	}
411	$last_extractword = $nextsymbol - 1;
412	$last_contentword = $nextsymbol - 1;
413
414
415	# Outut the words
416	print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
417	open(VOC, ">$phindex_dir/clauses.vocab");
418
419	for (my $i = 1; $i < $nextsymbol; $i++) {
420	$w = $vocab[$i];
421
422	print VOC "$bestform{$w}\n";
423	$totalfreq{$w} = 0 unless ($totalfreq{$w});
424	}
425	close VOC;
426
427
428	# Output statistics about the vocablary
429	print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
430	&util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
431	open(STAT, ">$phindex_dir/clauses.stats")
432	\|\| die "Cannot open $phindex_dir/clauses.stats: $!";
433
434	print STAT "first_delimiter $first_delimiter\n";
435	print STAT "last_delimiter $last_delimiter\n";
436	print STAT "first_stopword $first_stopword\n";
437	print STAT "last_stopword $last_stopword\n";
438	if ($use_thesaurus) {
439	print STAT "first_thesaurusword $first_thesaurusword\n";
440	print STAT "last_thesaurusword $last_thesaurusword\n";
441	}
442	print STAT "first_extractword $first_extractword\n";
443	print STAT "last_extractword $last_extractword\n";
444	print STAT "first_contentword $first_contentword\n";
445	print STAT "last_contentword $last_contentword\n";
446	print STAT "first_symbol $first_delimiter\n";
447	print STAT "last_symbol $last_contentword\n";
448	print STAT "first_word $first_stopword\n";
449	print STAT "last_word $last_contentword\n";
450	close STAT;
451
452	undef @vocab;
453
454
455	# Save text as symbol numbers
456	print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
457
458	open(TXT, "<$phindex_dir/clauses");
459	open(NUM, ">$phindex_dir/clauses.numbers");
460
461	$phrasedelimiter = $symbol{lc($senlimit)};
462	print NUM "$symbol{lc($colstart)}\n";
463
464	# set up the special symbols that delimit documents and sentences
465	while(<TXT>) {
466
467	# split sentence into a list of tokens
468	$line = $_;
469	next unless ($line =~ /./);
470	@words = split(/\s+/, $line);
471
472	# output one token at a time
473	foreach $word (@words) {
474	$word = lc($word);
475	print NUM "$symbol{$word}\n";
476	}
477
478	# output phrase delimiter
479	print NUM "$phrasedelimiter\n";
480	}
481
482	print NUM "$symbol{lc($colend)}\n";
483
484	}
485
486
487	# Prepare the phrases file to be input to mgpp.
488	# This means renumbering the phrases in order of decreasing frequency.
489
490
491	# This is legacy code, and a little ugly, and may be unix-specific
492	# (particularly the sort command).
493
494	sub renumber_phrases {
495	my ($phindex_dir, $verbosity) = @_;
496
497	# Sort the phrases into order of increasing frequency
498	# This means the expansions will be sorted correctly later on.
499	print "Sorting phrases into freq order\n" if ($verbosity);
500	system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
501
502	my @symbol;
503
504	# Read the vocabulary
505	print "Reading the vocabulary\n" if ($verbosity);
506	open(V, "<$phindex_dir/clauses.vocab")
507	\|\| die "Cannot open $phindex_dir/clauses.vocab: $!";
508
509	my $i = 1;
510	while(<V>) {
511	chomp;
512	$symbol[$i++] = $_;
513	}
514
515	# Create file for phrase data
516	#
517	# The phrases file looks something like this
518	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
519	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
520	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
521	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
522
523	# The first field on each line is a unique phrase identifier.
524	# We need to calculate phrase numbers for each phrase
525	print "Calculate phrase numbers\n" if ($verbosity);
526
527	my %phrasenumber;
528	my $nextphrase = 1;
529	my ($line);
530
531	open(IN, "<$phindex_dir/phrases");
532	while(<IN>) {
533
534	# read the line
535	chomp;
536	$line = $_;
537
538	# we're only interested in the first field
539	$line =~ s/:.*//;
540
541	# get a phrase number for this line
542	$phrasenumber{$line} = $nextphrase;
543	$nextphrase++;
544	}
545
546
547	# Now we create a new phrase file using phrase numbers, not the old IDs.
548	print "Format phrase data for MGPP\n" if ($verbosity);
549
550	open(IN, "<$phindex_dir/phrases");
551	open(DATA, ">$phindex_dir/pdata.txt");
552	open(IDX, ">$phindex_dir/pword.txt");
553
554	my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
555	my @fields;
556	my @documents;
557	my (@newexp, $k, $n);
558
559	my $linenumber = 0;
560
561	while(<IN>) {
562
563	# read the line
564	chomp;
565	$line = $_;
566	@fields = split(/:/, $line);
567
568	# get a phrase number for this line
569	$key = shift @fields;
570	die unless (defined($phrasenumber{$key}));
571	$num = $phrasenumber{$key};
572
573	# get the text of the phrase
574	$text = shift @fields;
575	$text =~ s/s(\d+)/$symbol[$1]/g;
576	if ($text =~ / /) {
577	$word = "";
578	} else {
579	$word = $text;
580	}
581
582	$linenumber++;
583	if ($linenumber % 1000 == 0) {
584	print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
585	}
586	print "$num: $key\t($text)\n" if ($verbosity > 3);
587
588	# get the phrase frequency
589	$tf = shift @fields;
590
591	# get the number of expansions
592	$countexp = shift @fields;
593
594	# get the expansions and convert them into phrase numbers
595	$expansions = shift @fields;
596	@newexp = ();
597	foreach $k (split(/,/, $expansions)) {
598	die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
599	$n = $phrasenumber{$k};
600	push @newexp, $n;
601	}
602	@newexp = sort numerically @newexp;
603
604	# get the number of documents
605	$countdocs = shift @fields;
606
607	# get the documents
608	$documents = shift @fields;
609	$documents =~ s/d//g;
610	@documents = split(/;/, $documents);
611	@documents = sort by_frequency @documents;
612
613	# output the phrase data
614	print DATA "<Document>";
615	print DATA "$num:$text:$tf:$countexp:$countdocs:";
616	print DATA join(",", @newexp), ":", join(";", @documents), "\n";
617
618	# output the word index search data
619	print IDX "<Document>$word\n";
620
621
622	}
623	}
624
625	# sort routines used to renumber phrases
626
627	sub numerically { $a <=> $b }
628
629	sub by_frequency {
630	my $fa = 1;
631	if ($a =~ /,/) {
632	$fa = $a;
633	$fa =~ s/\d+,//;
634	}
635	my $fb = 1;
636	if ($b =~ /,/) {
637	$fb = $b;
638	$fb =~ s/\d+,//;
639	}
640
641	return ($fb <=> $fa);
642	}
643

Note: See TracBrowser for help on using the repository browser.

Download in other formats: