Context Navigation

source: trunk/gsdl/src/phind/generate/phindgen.pl@ 1591

Last change on this file since 1591 was 1591, checked in by paynter, 24 years ago
Having a thesurus is now optional.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 16.7 KB

Line
1	#! /usr/bin/perl -w
2
3	###########################################################################
4	#
5	# phindgen.pl -- produce Phind index files for a GSDL collection.
6	#
7	# Copyright (C) 2000 Gordon W. Paynter
8	#
9	# A component of the Greenstone digital library software
10	# from the New Zealand Digital Library Project at the
11	# University of Waikato, New Zealand.
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29
30	# This program generates phind phrase browsing indexes for a Greenstone
31	# digital library collection.
32	#
33	# The GML files that have been imported to the archives directory are
34	# read, and then the phind indexes are created in the phindex directory.
35	#
36	# This version od Phind uses suffix arrays to extract phrases.
37
38
39	BEGIN {
40	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
41	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
42	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
43	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
44	}
45
46
47	use strict;
48	use colcfg;
49	use util;
50	use parsargv;
51	use plugin;
52	use phproc;
53
54
55	sub print_usage {
56	print STDERR "\n usage: $0 [options] collection-name\n\n";
57	print STDERR " options:\n";
58	print STDERR " -verbosity number 0=none, 3=lots\n";
59	print STDERR " -archivedir directory Where the converted material ends up\n";
60	print STDERR " -phindexdir directory Where to place the phind indexes\n";
61	print STDERR " -languages names The languages to consider in the browser\n";
62	print STDERR " -maxdocs number Maximum number of documents to process\n";
63	print STDERR " -untidy Leave working files in phindex directory\n";
64	print STDERR "\n";
65	}
66
67
68	# Define delimiter symbols - this should be abstracted out someplace
69	my $colstart = "COLLECTIONSTART";
70	my $colend = "COLLECTIONEND";
71	my $doclimit = "DOCUMENTLIMIT";
72	my $senlimit = "SENTENCELIMIT";
73	my @delimiters = ($colstart, $colend, $doclimit, $senlimit);
74
75	&main ();
76
77
78	sub main {
79
80	my ($verbosity, $archivedir, $phindexdir,
81	$phind, $language, $maxdocs, $untidy,
82	$collection, $configfilename, $collectcfg);
83
84	# Parse command-line arguments and get the collection name
85	if (!parsargv::parse(\@ARGV,
86	'verbosity/\d+/1', \$verbosity,
87	'archivedir/.*/', \$archivedir,
88	'phindexdir/.*/', \$phindexdir,
89	'languages/\w+/english', \$language,
90	'maxdocs/^\-?\d+/-1', \$maxdocs,
91	'untidy', \$untidy )) {
92	&print_usage();
93	die "\n";
94	}
95	$collection = &util::use_collection(@ARGV);
96	if ($collection eq "") {
97	&print_usage();
98	die "\n";
99	}
100
101	# Read the collection configuration file
102	$configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
103	if (-e $configfilename) {
104	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
105	if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
106	$archivedir = $collectcfg->{'archivedir'};
107	}
108	if (defined $collectcfg->{'phindexdir'} && $phindexdir eq "") {
109	$phindexdir = $collectcfg->{'phindexdir'};
110	}
111	if (defined $collectcfg->{'phind'}) {
112	$phind = $collectcfg->{'phind'};
113	}
114	} else {
115	die "Couldn't find the configuration file $configfilename\n";
116	}
117
118	# Calculate default archive and phindex directories
119	if ($archivedir eq "") {
120	$archivedir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "archives");
121	}
122	if ($phindexdir eq "") {
123	$phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
124	}
125
126	# Make sure theuser has in fact requested phind indexes
127	if (!defined($phind)) {
128	print "No phind information in $configfilename\n";
129	exit;
130	}
131
132
133	# Construct phind indexes
134	my $mode = 1;
135	my $symbol_limit = 40000000;
136	my ($command, $status);
137
138	# Make a clean phindex directory
139	if (-e "$phindexdir") {
140	&util::rm_r("$phindexdir");
141	}
142	&util::mk_dir("$phindexdir");
143
144	# Read the archives directory and build the clauses file
145	print "\nReading archive directory\n" if $verbosity;
146	&build_clauses($archivedir, $phindexdir, $language, $verbosity, $maxdocs);
147
148	# Generate the vocabulary, symbol statistics, and numbers file
149	# from the clauses file
150	print "\nExtracting vocabulary and statistics\n" if $verbosity;
151	&extract_vocabulary($phindexdir, $language, $verbosity);
152
153	# Use the suffix program to generate the phindex/phrases file
154	$command = "suffix $phindexdir $symbol_limit $mode";
155	print "\nExecuting: $command\n" if $verbosity;
156	$status = system($command);
157	if ($status != 0) {
158	print STDERR "phindgen.pl - Error executing $command: $!\n";
159	exit($status);
160	}
161
162	# Create the phrase file and put phrase numbers in phindex/phrases
163	print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
164	&renumber_phrases("$phindexdir", $verbosity);
165
166	# Create the mg phrase database
167	my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
168	my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
169	my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
170	my $mg_input = &util::filename_cat($phindexdir, "mg-p.txt");
171
172	$command = "$mg_passes -d $phindexdir -f phrase -T1 $mg_input";
173	print "\nExecuting: $command\n" if $verbosity;
174	$status = system($command);
175	if ($status != 0) {
176	print STDERR "phindgen.pl - Error executing $command: $!\n";
177	exit($status);
178	}
179
180	$command = "$mg_compression_dict -d $phindexdir -f phrase";
181	print "\nExecuting: $command\n" if $verbosity;
182	$status = system($command);
183	if ($status != 0) {
184	print STDERR "phindgen.pl - Error executing $command: $!\n";
185	exit($status);
186	}
187
188	$command = "$mg_passes -d $phindexdir -f phrase -T2 $mg_input";
189	print "\nExecuting: $command\n" if $verbosity;
190	$status = system($command);
191	if ($status != 0) {
192	print STDERR "phindgen.pl - Error executing $command: $!\n";
193	exit($status);
194	}
195
196	# Tidy up stray files
197	if (!$untidy) {
198	print "\nCleaning up\n" if $verbosity;
199	&util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
200	"$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
201	"$phindexdir/phrases", "$phindexdir/mg-p.txt");
202	my $outfile = 1;
203	while (-e "$phindexdir/outPhrase.$outfile") {
204	&util::rm("$phindexdir/outPhrase.$outfile");
205	$outfile++;
206	}
207	}
208	}
209
210
211
212
213
214
215
216	# Get the text for the collection.
217	# Save the document names and identifiers as we go.
218
219	sub build_clauses {
220	my ($archive_dir, $phindex_dir, $language, $verbosity, $maxdocs) = @_;
221
222	# create a "pluginfo" for ArcPlug and RecPlug
223	my $plugins = [["GMLPlug"], ["ArcPlug"]];
224	my $pluginfo = &plugin::load_plugins ($plugins, $verbosity);
225	die "No plugins were loaded." if (scalar(@$pluginfo) == 0);
226
227
228	# create a phind document processor object to process the documents
229	my $processor = new phproc ($archive_dir, $phindex_dir, $language,
230	$doclimit, $verbosity, "STDOUT");
231
232	# process the archives directory
233	&plugin::read ($pluginfo, $archive_dir, "", {}, $processor, $maxdocs);
234	}
235
236
237
238	# Generate the vocabulary, symbol statistics, and numbers file from the
239	# clauses file. This is legacy code, so is a bit messy and probably wont
240	# run under windows.
241
242	sub extract_vocabulary {
243	my ($phindex_dir, $language, $verbosity) = @_;
244
245	my ($w, $l, $line, $word);
246
247	my ($first_delimiter, $last_delimiter,
248	$first_stopword, $last_stopword,
249	$first_extractword, $last_extractword,
250	$first_contentword, $last_contentword,
251	$phrasedelimiter);
252
253	my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
254
255
256	my %symbol;
257	my (%freq);
258
259	print "Calculating vocabulary\n" if ($verbosity > 1);
260
261	# Read and store the stopwords
262	my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" \| xargs cat`;
263	my %stopwords;
264	foreach my $w (split(/\s+/, $words)) {
265	$l = lc($w);
266	$stopwords{$l} = $w;
267	}
268
269	# Read and store the thesaurus terms
270	$use_thesaurus = 0;
271	my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
272	"$language", "agrovoc.lex");
273	if (-e "$lex_file") {
274	open(TH, "<$lex_file");
275	while(<TH>) {
276	s/^\d+ //;
277	s/\(.*\)//;
278	foreach my $w (split(/\s+/, $_)) {
279	$thesaurus{lc($w)} = $w;
280	}
281	}
282	close TH;
283	$use_thesaurus = 1;
284	}
285
286	# Read words in the text and count occurences
287	open(TXT, "<$phindex_dir/clauses");
288	my @words;
289
290	while(<TXT>) {
291	$line = $_;
292	next unless ($line =~ /./);
293
294	@words = split(/\s+/, $line);
295	foreach $w (@words) {
296	$l = lc($w);
297	$w = $l if ((defined $stopwords{$l}) \|\| (defined $thesaurus{$l}));
298	$freq{$w}++;
299	}
300	$freq{$senlimit}++;
301	}
302
303	# Calculate the "best" form of each word
304	my (%bestform, %totalfreq, %bestfreq);
305
306	foreach $w (sort (keys %freq)) {
307	$l = lc($w);
308
309	# totalfreq is the number of times a term appears in any form
310	$totalfreq{$l} += $freq{$w};
311
312	if (defined $stopwords{$l}) {
313	$bestform{$l} = $stopwords{$l};
314
315	} elsif (defined $thesaurus{$l}) {
316	$bestform{$l} = $thesaurus{$l};
317
318	} elsif (!$bestform{$l} \|\| ($freq{$w} > $bestfreq{$l})) {
319	$bestfreq{$l} = $freq{$w};
320	$bestform{$l} = $w;
321	}
322	}
323
324	undef %freq;
325	undef %bestfreq;
326
327
328	# Assign symbol numbers to tokens
329	my $nextsymbol = 1;
330	my (@vocab);
331
332	# Delimiters
333	$first_delimiter = 1;
334
335	foreach $word (@delimiters) {
336
337	$word = lc($word);
338	$bestform{$word} = uc($word);
339	$vocab[$nextsymbol] = $word;
340	$symbol{$word} = $nextsymbol;
341	$nextsymbol++;
342	}
343	$last_delimiter = $nextsymbol - 1;
344
345	# Stopwords
346	$first_stopword = $nextsymbol;
347
348	foreach my $word (sort keys %stopwords) {
349
350	# don't incluse stopword unless it occurs in the text
351	$word = lc($word);
352	next unless ($totalfreq{$word});
353	next if ($symbol{$word});
354
355	$vocab[$nextsymbol] = $word;
356	$symbol{$word} = $nextsymbol;
357	$nextsymbol++;
358	}
359	$last_stopword = $nextsymbol - 1;
360	$first_contentword = $nextsymbol;
361
362	# Thesaurus terms
363	if ($use_thesaurus) {
364	$first_thesaurusword = $nextsymbol;
365
366	foreach my $word (sort keys %thesaurus) {
367
368	$word = lc($word);
369	next if ($symbol{$word});
370	$bestform{$word} = $thesaurus{$word};
371
372	$vocab[$nextsymbol] = $word;
373	$symbol{$word} = $nextsymbol;
374	$nextsymbol++;
375
376	}
377	$last_thesaurusword = $nextsymbol - 1;
378	}
379
380	# Other content words
381	$first_extractword = $nextsymbol;
382
383	foreach my $word (sort (keys %bestform)) {
384
385	next if ($symbol{$word});
386
387	$vocab[$nextsymbol] = $word;
388	$symbol{$word} = $nextsymbol;
389	$nextsymbol++;
390	}
391	$last_extractword = $nextsymbol - 1;
392	$last_contentword = $nextsymbol - 1;
393
394
395	# Outut the words
396	print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
397	open(VOC, ">$phindex_dir/clauses.vocab");
398
399	for (my $i = 1; $i < $nextsymbol; $i++) {
400	$w = $vocab[$i];
401
402	print VOC "$bestform{$w}\n";
403	$totalfreq{$w} = 0 unless ($totalfreq{$w});
404	}
405	close VOC;
406
407
408	# Output statistics about the vocablary
409	print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
410	&util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
411	open(STAT, ">$phindex_dir/clauses.stats")
412	\|\| die "Cannot open $phindex_dir/clauses.stats: $!";
413
414	print STAT "first_delimiter $first_delimiter\n";
415	print STAT "last_delimiter $last_delimiter\n";
416	print STAT "first_stopword $first_stopword\n";
417	print STAT "last_stopword $last_stopword\n";
418	if ($use_thesaurus) {
419	print STAT "first_thesaurusword $first_thesaurusword\n";
420	print STAT "last_thesaurusword $last_thesaurusword\n";
421	}
422	print STAT "first_extractword $first_extractword\n";
423	print STAT "last_extractword $last_extractword\n";
424	print STAT "first_contentword $first_contentword\n";
425	print STAT "last_contentword $last_contentword\n";
426	print STAT "first_symbol $first_delimiter\n";
427	print STAT "last_symbol $last_contentword\n";
428	print STAT "first_word $first_stopword\n";
429	print STAT "last_word $last_contentword\n";
430	close STAT;
431
432	undef @vocab;
433
434
435	# Save text as symbol numbers
436	print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
437
438	open(TXT, "<$phindex_dir/clauses");
439	open(NUM, ">$phindex_dir/clauses.numbers");
440
441	$phrasedelimiter = $symbol{lc($senlimit)};
442	print NUM "$symbol{lc($colstart)}\n";
443
444	# set up the special symbols that delimit documents and sentences
445	while(<TXT>) {
446
447	# split sentence into a list of tokens
448	$line = $_;
449	next unless ($line =~ /./);
450	@words = split(/\s+/, $line);
451
452	# output one token at a time
453	foreach $word (@words) {
454	$word = lc($word);
455	print NUM "$symbol{$word}\n";
456	}
457
458	# output phrase delimiter
459	print NUM "$phrasedelimiter\n";
460	}
461
462	print NUM "$symbol{lc($colend)}\n";
463
464	}
465
466
467	# Prepare the phrases file to be input to mgpp.
468	# This means renumbering the phrases in order of decreasing frequency.
469
470
471	# This is legacy code, and a little ugly, and may be unix-specific
472	# (particularly the sort command).
473
474	sub renumber_phrases {
475	my ($phindex_dir, $verbosity) = @_;
476
477	# Sort the phrases into order of increasing frequency
478	print "Sorting phrases into freq order\n" if ($verbosity);
479	system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");
480
481	my @symbol;
482
483	# Read the vocabulary
484	print "Reading the vocabulary\n" if ($verbosity);
485	open(V, "<$phindex_dir/clauses.vocab")
486	\|\| die "Cannot open $phindex_dir/clauses.vocab: $!";
487
488	my $i = 1;
489	while(<V>) {
490	chomp;
491	$symbol[$i++] = $_;
492	}
493
494	# Create file for phrase data
495	#
496	# The phrases file looks something like this
497	# 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
498	# 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
499	# 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
500	# 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
501
502
503	print "Calculate phrase numbers\n" if ($verbosity);
504
505	open(IN, "<$phindex_dir/phrases");
506
507	my %phrasenumber;
508	my $nextphrase = 1;
509
510	my ($line, $num);
511
512	while(<IN>) {
513
514	# read the line
515	chomp;
516	$line = $_;
517
518	# we're only interested in ther first field
519	$line =~ s/:.*//;
520
521	# get a phrase number for this line
522	$num = $nextphrase;
523	$phrasenumber{$line} = $num;
524	$nextphrase++;
525	}
526
527	# Extract the phrase data
528	print "Create phrase file and frequency file\n" if ($verbosity);
529
530	open(IN, "<$phindex_dir/phrases");
531	open(DATA, ">$phindex_dir/mg-p.txt");
532
533	my ($key, $tf, $countexp, $expansions, $countdocs, $documents, $text);
534	my @fields;
535	my @documents;
536	my (@newexp, $k, $n);
537
538	my $linenumber = 0;
539
540	while(<IN>) {
541
542	# read the line
543	chomp;
544	$line = $_;
545	@fields = split(/:/, $line);
546
547	# output the MG document tag
548	print DATA "<Document>";
549
550	# get a phrase number for this line
551	$key = shift @fields;
552	die unless (defined($phrasenumber{$key}));
553	$num = $phrasenumber{$key};
554	print DATA "$num:";
555
556	# get the text of the phrase
557	$text = shift @fields;
558	$text =~ s/s(\d+)/$symbol[$1]/g;
559	print DATA "$text:";
560
561	$linenumber++;
562	if ($linenumber % 1000 == 0) {
563	print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 1);
564	}
565	print "$num: $key\t($text)\n" if ($verbosity > 1);
566
567	# get the phrase frequency
568	$tf = shift @fields;
569	print DATA "$tf:";
570
571	# get the number of expansions
572	$countexp = shift @fields;
573	print DATA "$countexp:";
574
575	# get the expansions and convert them into phrase numbers
576	$expansions = shift @fields;
577	@newexp = ();
578	foreach $k (split(/,/, $expansions)) {
579	die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
580	$n = $phrasenumber{$k};
581	push @newexp, $n;
582	}
583	print DATA join(",", (sort numerically @newexp)), ":";
584
585	# get the number of documents
586	$countdocs = shift @fields;
587	print DATA "$countdocs:";
588
589	# get the documents
590	$documents = shift @fields;
591	$documents =~ s/d//g;
592	@documents = split(/;/, $documents);
593
594	print DATA join(";", (sort by_frequency @documents));
595
596	print DATA "\n";
597
598	}
599	}
600
601	# sort routines used to renumber phrases
602
603	sub numerically { $a <=> $b }
604
605	sub by_frequency {
606	my $fa = 1;
607	if ($a =~ /,/) {
608	$fa = $a;
609	$fa =~ s/\d+,//;
610	}
611	my $fb = 1;
612	if ($b =~ /,/) {
613	$fb = $b;
614	$fb =~ s/\d+,//;
615	}
616
617	return ($fb <=> $fa);
618	}
619

Note: See TracBrowser for help on using the repository browser.

Download in other formats: