source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1613

Last change on this file since 1613 was 1613, checked in by paynter, 24 years ago

A little less verbose by default.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
#! /usr/bin/perl -w

###########################################################################
#
# phindgen.pl -- produce Phind index files for a GSDL collection.
#
# Copyright (C) 2000 Gordon W. Paynter
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program generates phind phrase browsing indexes for a Greenstone
# digital library collection.
#
# The GML files that have been imported to the archives directory are
# read, and then the phind indexes are created in the phindex directory.
#
# This version of Phind uses suffix arrays to extract phrases.

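# A typical invocation (the collection name here is only an example):
#
#   perl phindgen.pl -verbosity 2 mycollection
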

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
}


use strict;
use colcfg;
use util;
use parsargv;
use plugin;
use phproc;


sub print_usage {
    print STDERR "\n usage: $0 [options] collection-name\n\n";
    print STDERR " options:\n";
    print STDERR "  -verbosity number    0=none, 3=lots\n";
    print STDERR "  -archivedir directory  Where the converted material ends up\n";
    print STDERR "  -phindexdir directory  Where to place the phind indexes\n";
    print STDERR "  -languages names     The languages to consider in the browser\n";
    print STDERR "  -maxdocs number      Maximum number of documents to process\n";
    print STDERR "  -untidy              Leave working files in phindex directory\n";
    print STDERR "\n";
}


# Define delimiter symbols - this should be abstracted out someplace
my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);

&main ();


sub main {

    my ($verbosity, $archivedir, $phindexdir,
        $phindcfg, $language, $maxdocs, $untidy,
        $collection, $configfilename, $collectcfg);

    # Parse command-line arguments and get the collection name
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/1', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'phindexdir/.*/', \$phindexdir,
                         'languages/\w+/english', \$language,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'untidy', \$untidy )) {
        &print_usage();
        die "\n";
    }
    $collection = &util::use_collection(@ARGV);
    if ($collection eq "") {
        &print_usage();
        die "\n";
    }

    # Read the collection configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
            $phindexdir = $collectcfg->{'phindexdir'};
        }
        if (defined $collectcfg->{'phind'}) {
            $phindcfg = $collectcfg->{'phind'};
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # Calculate default archive and phindex directories
    if ($archivedir eq "") {
        $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    if ($phindexdir eq "") {
        $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    }

    # Make sure the user has in fact requested phind indexes
    if (!defined($phindcfg)) {
        print "No phind information in $configfilename\n";
        exit;
    }


    # Construct phind indexes
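    # $mode and $symbol_limit are passed straight to the external "suffix"
    # program below; $symbol_limit presumably caps the number of symbols the
    # suffix-array builder will handle (this is an assumption - the values
    # are simply forwarded unchanged).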
    my $mode = 1;
    my $symbol_limit = 40000000;
    my ($command, $status);

    # Make a clean phindex directory
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");

    # Read the archives directory and build the clauses file
    print "\nReading archive directory\n" if $verbosity;
    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Create the mg phrase database
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words
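    # In addition to the -T text passes, the word index needs the -I passes
    # to build the inverted file; the perfect hash, weights, inverted-file
    # dictionary and stem indexes are then built on top of it. (The -s 1/2/3
    # calls appear to build the casefolded/stemmed variants of the index,
    # though that detail is not spelled out here.)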
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);


    # Tidy up stray files
    if (!$untidy) {
        print "\nCleaning up\n" if $verbosity;
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }
}

# Execute a system command

sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if $verbosity;
    my $status = system($command);
    if ($status != 0) {
        print STDERR "phindgen.pl - Error executing $command: $!\n";
        exit($status);
    }
}




# Get the text for the collection.
# Save the document names and identifiers as we go.

sub build_clauses {
    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;

    # create a "pluginfo" with the plugins used to read the archives
    # (GMLPlug and ArcPlug)
    my $plugins = [["GMLPlug"], ["ArcPlug"]];
    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
    die "No plugins were loaded." if (scalar(@$pluginfo) == 0);


    # create a phind document processor object to process the documents
    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
                                $doclimit, $verbosity, "STDOUT");

    # process the archives directory
    &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
}



# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file. This is legacy code, so is a bit messy and probably won't
# run under Windows.

sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);


    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<$lex_file");
        while(<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

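    # Each line of the clauses file holds one clause as whitespace-separated
    # tokens (the delimiter symbols defined at the top of this file may also
    # appear as tokens); this is inferred from the way the file is read below.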
    # Read words in the text and count occurrences
    open(TXT, "<$phindex_dir/clauses");
    my @words;

    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }

    # Calculate the "best" form of each word
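    # (the "best" form of a stopword or thesaurus term is the form given in
    # the stopword/thesaurus file; for any other word it is its most frequent
    # surface form, e.g. "FAO" rather than "fao" if the upper-case form occurs
    # more often - the example words are illustrative only)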
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
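    # Symbols are numbered in blocks: the delimiters first, then stopwords,
    # then thesaurus terms (if any), then the remaining content words; the
    # block boundaries are recorded in the clauses.stats file below.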
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include a stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;


    # Output the words
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phindex_dir/clauses.vocab");

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;


    # Output statistics about the vocabulary
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;


    # Save text as symbol numbers
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<$phindex_dir/clauses");
    open(NUM, ">$phindex_dir/clauses.numbers");

    # set up the special symbols that delimit documents and sentences
    $phrasedelimiter = $symbol{lc($senlimit)};
    print NUM "$symbol{lc($colstart)}\n";

    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    print NUM "$symbol{lc($colend)}\n";

}


# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.


# This is legacy code, and a little ugly, and may be Unix-specific
# (particularly the sort command).

sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of decreasing frequency.
    # This means the expansions will be sorted correctly later on.
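    # (the old-style sort syntax "+2" skips the first two ':'-separated
    # fields, so the reverse numeric sort is on the third field - the
    # phrase frequency)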
    print "Sorting phrases into freq order\n" if ($verbosity);
    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");

    my @symbol;

    # Read the vocabulary
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    my $i = 1;
    while(<V>) {
        chomp;
        $symbol[$i++] = $_;
    }

    # Create file for phrase data
    #
    # The phrases file looks something like this
    # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

    # The first field on each line is a unique phrase identifier.
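    # Judging from the parsing code below, the remaining ':'-separated fields
    # are: the phrase as "s<symbol>" numbers, its frequency, the number of
    # expansions, the comma-separated expansion ids, the number of documents,
    # and the ';'-separated document list ("d<docnum>[,freq]").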
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);

    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);

    open(IN, "<$phindex_dir/phrases");
    while(<IN>) {

        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }


    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);

    open(IN, "<$phindex_dir/phrases");
    open(DATA, ">$phindex_dir/pdata.txt");
    open(IDX, ">$phindex_dir/pword.txt");

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);

    my $linenumber = 0;

    while(<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        if ($text =~ / /) {
            $word = "";
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 3);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "<Document>";
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "<Document>$word\n";


    }
}

# sort routines used to renumber phrases

sub numerically { $a <=> $b }

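# by_frequency sorts document entries into decreasing order of the frequency
# given after the comma: once the leading "d" has been stripped each entry
# looks like "docnum,freq", or just "docnum", which is treated as frequency 1.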
sub by_frequency {
    my $fa = 1;
    if ($a =~ /,/) {
        $fa = $a;
        $fa =~ s/\d+,//;
    }
    my $fb = 1;
    if ($b =~ /,/) {
        $fb = $b;
        $fb =~ s/\d+,//;
    }

    return ($fb <=> $fa);
}