source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1604

Last change on this file since 1604 was 1604, checked in by paynter, 24 years ago

Numerous improvements for use with the new phindcgi script. The main ones
are that three MGPP databases are now created (document data, phrase data,
and word search) and that the data extracted from each document is set
explicitly in the collection configuration file (usually it will be
something like document:text or section:Title).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1#! /usr/bin/perl -w
2
3###########################################################################
4#
5# phindgen.pl -- produce Phind index files for a GSDL collection.
6#
7# Copyright (C) 2000 Gordon W. Paynter
8#
9# A component of the Greenstone digital library software
10# from the New Zealand Digital Library Project at the
11# University of Waikato, New Zealand.
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29
30# This program generates phind phrase browsing indexes for a Greenstone
31# digital library collection.
32#
33# The GML files that have been imported to the archives directory are
34# read, and then the phind indexes are created in the phindex directory.
35#
36# This version of Phind uses suffix arrays to extract phrases.
37
38
# Sanity-check the Greenstone environment and extend the module search
# path.  This must happen in a BEGIN block so that @INC is updated
# before the 'use colcfg' / 'use plugin' / 'use phproc' lines below are
# compiled.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
}
45
46
47use strict;
48use colcfg;
49use util;
50use parsargv;
51use plugin;
52use phproc;
53
54
# Print a usage summary for this script to STDERR.
sub print_usage {
    print STDERR <<"END_USAGE";

 usage: $0 [options] collection-name

 options:
 -verbosity number 0=none, 3=lots
 -archivedir directory Where the converted material ends up
 -phindexdir directory Where to place the phind indexes
 -languages names The languages to consider in the browser
 -maxdocs number Maximum number of documents to process
 -untidy Leave working files in phindex directory

END_USAGE
}
66
67
# Define delimiter symbols - this should be abstracted out someplace
# These sentinel tokens mark collection, document, and sentence
# boundaries in the clauses token stream; extract_vocabulary() assigns
# them the first symbol numbers (note that it also lowercases
# @delimiters in place via foreach aliasing).
my $colstart = "COLLECTIONSTART";
my $colend = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);

&main ();
76
77
# Main entry point.  Parses options, reads the collection configuration,
# then builds the phind indexes:
#   1. build the clauses file from the archives (build_clauses)
#   2. extract vocabulary, statistics, and numbers (extract_vocabulary)
#   3. run the external "suffix" program to extract phrases
#   4. renumber phrases for mgpp input (renumber_phrases)
#   5. build three MGPP databases: phrase data (pdata), word-level
#      search (pword), and document information (docs)
sub main {

    my ($verbosity, $archivedir, $phindexdir,
        $phindcfg, $language, $maxdocs, $untidy,
        $collection, $configfilename, $collectcfg);

    # Parse command-line arguments and get the collection name
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/1', \$verbosity,
                         'archivedir/.*/', \$archivedir,
                         'phindexdir/.*/', \$phindexdir,
                         'languages/\w+/english', \$language,
                         'maxdocs/^\-?\d+/-1', \$maxdocs,
                         'untidy', \$untidy )) {
        &print_usage();
        die "\n";
    }
    $collection = &util::use_collection(@ARGV);
    if ($collection eq "") {
        &print_usage();
        die "\n";
    }

    # Read the collection configuration file.  Values given on the
    # command line take precedence over the configuration file's
    # archivedir/phindexdir settings.
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
        if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
            $archivedir = $collectcfg->{'archivedir'};
        }
        if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
            $phindexdir = $collectcfg->{'phindexdir'};
        }
        if (defined $collectcfg->{'phind'}) {
            $phindcfg = $collectcfg->{'phind'};
        }
    } else {
        die "Couldn't find the configuration file $configfilename\n";
    }

    # Calculate default archive and phindex directories
    if ($archivedir eq "") {
        $archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    if ($phindexdir eq "") {
        $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    }

    # Make sure the user has in fact requested phind indexes
    if (!defined($phindcfg)) {
        print "No phind information in $configfilename\n";
        exit;
    }


    # Construct phind indexes
    my $mode = 1;                  # passed to the suffix program below
    my $symbol_limit = 40000000;   # passed to the suffix program below
                                   # (presumably a symbol-count cap -- see suffix)
    my ($command, $status);        # NOTE(review): declared but never used here

    # Make a clean phindex directory (an existing one is deleted!)
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");

    # Read the archives directory and build the clauses file
    print "\nReading archive directory\n" if $verbosity;
    &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    &execute("suffix $phindexdir $symbol_limit $mode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Paths to the MGPP text tools used below
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");

    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    # Create the mg phrase database: two mg_passes runs (-T1 then -T2)
    # around mg_compression_dict
    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words: as above but with the -I passes,
    # plus hash, weights, dictionary, and three stemmed indexes
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);


    # Tidy up stray files (unless -untidy was given)
    if (!$untidy) {
        print "\nCleaning up\n" if $verbosity;
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }
}
217
# Execute a system command, echoing it first when $verbosity is true.
# Terminates the whole script if the command fails.
#
# system() returns the raw wait status: the child's exit code lives in
# the high byte, so it must be shifted right 8 bits before being passed
# to exit().  The original code called exit($status), which truncates
# modulo 256 -- a child exiting with code 1 yields status 256, and
# exit(256) becomes exit(0), falsely reporting success to the caller.
# $! is only meaningful when system() itself failed to launch the
# command (status -1), so it is reported only in that case.
sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if $verbosity;
    my $status = system($command);
    if ($status != 0) {
        if ($status == -1) {
            # The command could not be started at all
            print STDERR "phindgen.pl - Error executing $command: $!\n";
            exit(1);
        }
        print STDERR "phindgen.pl - Error executing $command: exit status "
            . ($status >> 8) . "\n";
        # Propagate the child's exit code, but never exit 0 on failure
        exit(($status >> 8) || 1);
    }
}
229
230
231
232
233
# Get the text for the collection.
# Save the document names and identifiers as we go.
#
# Feeds every document in $archive_dir through the GML/Arc plugins to a
# phproc processor object; phproc presumably writes the clauses and
# document files into $phindex_dir (see phproc.pm -- not visible here).
# $maxdocs limits the number of documents processed.

sub build_clauses {
    my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_;

    # create a "pluginfo" for GMLPlug and ArcPlug
    my $plugins = [["GMLPlug"], ["ArcPlug"]];
    my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
    die "No plugins were loaded." if (scalar(@$pluginfo) == 0);


    # create a phind document processor object to process the documents
    # ($doclimit is the file-level document-boundary token)
    my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language,
                                $doclimit, $verbosity, "STDOUT");

    # process the archives directory
    &plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
}
253
254
255
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under Windows (it shells out to find/xargs).
#
# Reads:   $phindex_dir/clauses (whitespace-separated tokens, one
#              clause per line), the language's stopword (*.sw) files,
#              and an optional agrovoc.lex thesaurus file.
# Writes:  $phindex_dir/clauses.vocab   - best form of each symbol
#          $phindex_dir/clauses.stats   - symbol number ranges
#          $phindex_dir/clauses.numbers - the text as symbol numbers
#
# (Fixed: the opens were previously unchecked 2-arg opens, so a missing
# clauses file silently produced empty output files, and the write
# handles were never closed so buffered write errors went undetected.)

sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);

    # %symbol maps token => symbol number; %freq counts token forms
    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (unix-specific: shells out to find/xargs)
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<", "$lex_file")
            || die "Cannot open $lex_file: $!";
        while(<TH>) {
            s/^\d+ //;     # strip the leading term number
            s/\(.*\)//;    # strip any parenthesised qualifier
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

    # Read words in the text and count occurrences
    open(TXT, "<", "$phindex_dir/clauses")
        || die "Cannot open $phindex_dir/clauses: $!";
    my @words;

    while(<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            # stopwords and thesaurus terms are counted under their
            # lowercase form
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }
    close TXT;

    # Calculate the "best" form of each word: the canonical form for
    # stopwords and thesaurus terms, otherwise the most frequent
    # capitalisation seen in the text.
    my (%bestform, %totalfreq, %bestfreq);

    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};

        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};

        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }

    undef %freq;
    undef %bestfreq;


    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters get the first symbol numbers.  (Note: the foreach
    # aliases $word to the elements, so it lowercases the file-level
    # @delimiters array in place.)
    $first_delimiter = 1;

    foreach $word (@delimiters) {

        $word = lc($word);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;

    foreach my $word (sort keys %stopwords) {

        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;

        foreach my $word (sort keys %thesaurus) {

            $word = lc($word);
            next if ($symbol{$word});
            $bestform{$word} = $thesaurus{$word};

            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;

        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;

    foreach my $word (sort (keys %bestform)) {

        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;


    # Output the words, one best-form per line, in symbol-number order
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">", "$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];

        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC
        or die "Cannot close $phindex_dir/clauses.vocab: $!";


    # Output statistics about the vocabulary (symbol number ranges used
    # by the suffix program and phindcgi)
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">", "$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";

    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT
        or die "Cannot close $phindex_dir/clauses.stats: $!";

    undef @vocab;


    # Save text as symbol numbers, one symbol per line
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);

    open(TXT, "<", "$phindex_dir/clauses")
        || die "Cannot open $phindex_dir/clauses: $!";
    open(NUM, ">", "$phindex_dir/clauses.numbers")
        || die "Cannot open $phindex_dir/clauses.numbers: $!";

    $phrasedelimiter = $symbol{lc($senlimit)};
    print NUM "$symbol{lc($colstart)}\n";

    # set up the special symbols that delimit documents and sentences
    while(<TXT>) {

        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }

    print NUM "$symbol{lc($colend)}\n";
    close TXT;
    close NUM
        or die "Cannot close $phindex_dir/clauses.numbers: $!";
}
483
484
# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.
#
# This is legacy code, and a little ugly, and may be unix-specific
# (particularly the sort command).
#
# Reads:   $phindex_dir/phrases and $phindex_dir/clauses.vocab
# Writes:  $phindex_dir/pdata.txt (phrase database input) and
#          $phindex_dir/pword.txt (word-search database input)
#
# (Fixed: the sort's exit status and all opens were previously
# unchecked, and the two write handles were never closed, so failures
# silently produced wrong or truncated output.)

sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of decreasing frequency (-rn on the
    # third ':'-separated field).  This means the expansions will be
    # sorted correctly later on.
    # NOTE(review): "+2" is the obsolete sort key syntax; modern sort
    # implementations want "-k 3,3" -- confirm on the target platform.
    print "Sorting phrases into freq order\n" if ($verbosity);
    my $sortstatus = system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
    if ($sortstatus != 0) {
        die "phindgen.pl - could not sort $phindex_dir/phrases\n";
    }

    my @symbol;

    # Read the vocabulary: @symbol maps symbol number => word text
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<", "$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";

    my $i = 1;
    while(<V>) {
        chomp;
        $symbol[$i++] = $_;
    }
    close V;

    # The phrases file looks something like this
    #   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    #   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    # i.e. the ':'-separated fields are: phrase ID, symbols, frequency,
    # expansion count, expansion IDs, document count, documents.

    # The first field on each line is a unique phrase identifier.
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);

    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);

    open(IN, "<", "$phindex_dir/phrases")
        || die "Cannot open $phindex_dir/phrases: $!";
    while(<IN>) {

        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }
    close IN;


    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);

    open(IN, "<", "$phindex_dir/phrases")
        || die "Cannot open $phindex_dir/phrases: $!";
    open(DATA, ">", "$phindex_dir/pdata.txt")
        || die "Cannot open $phindex_dir/pdata.txt: $!";
    open(IDX, ">", "$phindex_dir/pword.txt")
        || die "Cannot open $phindex_dir/pword.txt: $!";

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);

    my $linenumber = 0;

    while(<IN>) {

        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die "ERROR - no phrase number for: $key\n"
            unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase, replacing each sNNN symbol with
        # its word from the vocabulary
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        if ($text =~ / /) {
            $word = "";    # multi-word phrases get no word-search entry
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 1);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 1);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents, stripping the "d" prefixes and sorting by
        # within-document frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "<Document>";
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "<Document>$word\n";
    }
    close IN;
    close DATA
        or die "Cannot close $phindex_dir/pdata.txt: $!";
    close IDX
        or die "Cannot close $phindex_dir/pword.txt: $!";
}
622
623# sort routines used to renumber phrases
624
# Ascending numeric comparator for sort().
sub numerically {
    return $a <=> $b;
}
626
# Comparator for document entries of the form "docnum,freq" (entries
# without a ",freq" part default to frequency 1); sorts by descending
# frequency.
sub by_frequency {
    my $freq_a = 1;
    my $freq_b = 1;

    # When a frequency is present, copy the entry and strip the
    # "docnum," prefix, leaving just the frequency.
    ($freq_a = $a) =~ s/\d+,// if ($a =~ /,/);
    ($freq_b = $b) =~ s/\d+,// if ($b =~ /,/);

    return ($freq_b <=> $freq_a);
}
641
Note: See TracBrowser for help on using the repository browser.