Context Navigation

source: trunk/gsdl/bin/script/unindex.pl@ 1970

Last change on this file since 1970 was 1970, checked in by sjboddie, 23 years ago
Added more usage information to all perl programs and removed a few programs that are no longer useful.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.8 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# unindex.pl --
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# this program will decompress all the text from a built index
29	# and return it to gml format - may be slightly broken at present
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
34	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
35	$FileHandle = 'FH000';
36	}
37
38	use doc;
39	use docsave;
40	use util;
41	use parsargv;
42	use GDBM_File;
43	use FileHandle;
44	use English;
45
46	select STDERR; $\| = 1;
47	select STDOUT; $\| = 1;
48
49
50	# globals
51	$collection = "";
52	$index = "";
53	$textdir = "";
54	$classinfo = {};
55	$mgread = ++$FileHandle;
56	$mgwrite = ++$FileHandle;
57
58
59
60	sub print_usage {
61	print STDERR "\n";
62	print STDERR "unindex.pl: Attempts to generate gml files from a built Greenstone index.\n\n";
63	print STDERR " usage: $0 [options]\n\n";
64	print STDERR " options:\n";
65	print STDERR " -verbosity number 0=none, 3=lots\n";
66	print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
67	print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n";
68	print STDERR " -removeold Will remove the old contents of the archives\n";
69	print STDERR " directory -- use with care\n\n";
70	}
71
72	&main ();
73
74	sub main {
75	if (!parsargv::parse(\@ARGV,
76	'verbosity/\d+/2', \$verbosity,
77	'indexdir/.*/index', \$indexdir,
78	'archivedir/.*/archives', \$archivedir,
79	'removeold', \$removeold)) {
80	&print_usage();
81	die "\n";
82	}
83
84	die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
85	$indexdir =~ s/\/$//;
86	if (-d $archivedir) {
87	if ($removeold) {
88	print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
89	print STDERR " in preparation for the import\n";
90	sleep(5); # just in case...
91	&util::rm_r ($archivedir);
92	}
93	} else {
94	&util::mk_all_dir ($archivedir);
95	}
96
97	my $gdbmfile = &get_gdbmfile ($indexdir);
98	&set_index ();
99	&get_classinfo ($gdbmfile);
100	&openmg ();
101
102	# read the archive information file
103	my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
104	my $archive_info = new arcinfo ();
105	$archive_info->load_info ($archive_info_filename);
106
107	# create a docsave object to process the documents
108	my $processor = new docsave ("", $archive_info, $verbosity);
109	$processor->setarchivedir ($archivedir);
110
111	# create a document object for the classification file
112	my $clsudoc_obj = new doc($gdbmfile, "classification");
113	$clsudoc_obj->set_OID ("CLSU");
114
115	my ($doc_obj);
116	foreach $classification (keys %$classinfo) {
117	my $count = 0;
118	foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
119	print STDERR "\n$classification.$section" if $verbosity >= 2;
120	if ($classification =~ /^B$/i) {
121	last if $count > 10;
122	# create a new document
123	$doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'},
124	"indexed_doc");
125	$doc_obj->set_OID();
126	my $cursection = $doc_obj->get_top_section();
127	&add_section_content ($doc_obj, $cursection,
128	$classinfo->{$classification}->{$section});
129	&recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section},
130	$doc_obj->get_end_child($cursection), 0);
131	$processor->process($doc_obj);
132	} else {
133	my $classifier = &int_classification ($classification);
134	$clsudoc_obj->create_named_section("$classifier.$section");
135	&add_section_content ($clsudoc_obj, "$classifier.$section",
136	$classinfo->{$classification}->{$section});
137	&recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section},
138	"", "$classifier.$section");
139	}
140	$count ++;
141	}
142	}
143	$processor->process($clsudoc_obj);
144	&closemg();
145
146	# write out the archive information file
147	$archive_info->save_info($archive_info_filename);
148	}
149
150	# returns the path to the gdbm info database - also
151	# sets the $collection and $textdir global variable
152	sub get_gdbmfile {
153	my ($indexdir) = @_;
154	my ($gdbmfile);
155
156	opendir (DIR, $indexdir) \|\| die "Couldn't open directory $indexdir\n\n";
157	my @conts = readdir DIR;
158	close DIR;
159
160	foreach $file (@conts) {
161	if ($file =~ /text$/) {
162	$textdir = $file;
163	last;
164	}
165	}
166	die "No text directory found in $indexdir\n\n"
167	unless defined $textdir && $textdir =~ /text$/;
168
169	$gdbmfile = &util::filename_cat ($indexdir, $textdir);
170
171	opendir (DIR, $gdbmfile) \|\| die "Couldn't open directory $gdbmfile\n\n";
172	@conts = readdir DIR;
173	close DIR;
174
175	foreach $file (@conts) {
176	if ($file =~ /^(.*?)\.(?:ldb\|bdb)$/) {
177	$collection = $1;
178	$gdbmfile = &util::filename_cat ($gdbmfile, $file);
179	last;
180	}
181	}
182
183	if (defined $collection && $collection =~ /\w/) {
184	$textdir = &util::filename_cat ($textdir, $collection);
185	} else {
186	die "collection global wasn't set\n";
187	}
188	return $gdbmfile if (-e $gdbmfile);
189	die "Couldn't find gdbm info database in $indexdir\n\n";
190	}
191
192	sub get_classinfo {
193	my ($gdbmfile) = @_;
194	my ($class);
195
196	open (DB2TXT, "db2txt $gdbmfile \|") \|\| die "couldn't open pipe to db2txt\n";
197
198	print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
199	my $count = 0;
200
201	$/ = '-' x 70;
202	my $entry = "";
203	while (defined ($entry = <DB2TXT>)) {
204	next unless $entry =~ /\w/;
205	$entry =~ s/\n+/\\n/g;
206	my ($key, $value) = $entry =~ /\[([^\]])\](.)/;
207
208	print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);
209
210	next if ($key !~ /\./); # ignore the docnums and top level of classifications
211
212	die "Badly formatted key $key in $gdbmfile\n\n"
213	unless (($class, $section) = $key =~ /^(.)\.(.*)$/);
214	$classinfo->{$class} = {} unless defined $classinfo->{$class};
215	my @sections = split /\./, $section;
216
217	my $hashptr = $classinfo->{$class};
218	foreach $sec (@sections) {
219	$hashptr->{$sec} = {} unless defined $hashptr->{$sec};
220	$hashptr = $hashptr->{$sec};
221	}
222	&get_metadata ($key, \$value, $hashptr);
223
224	$count ++;
225	}
226
227	$/ = "\n";
228	print STDERR "\n" if $verbosity >= 2;
229	}
230
231	sub get_metadata {
232	my ($key, $gdb_str_ref, $hashsection) = @_;
233
234	my ($title) = $$gdb_str_ref =~ /(?:<t>\|<title>)(.*?)(?:\\n\|$)/i;
235	my ($docnum) = $$gdb_str_ref =~ /(?:<d>\|<docnum>)(.*?)(?:\\n\|$)/i;
236	my ($jobnum) = $$gdb_str_ref =~ /(?:<j>\|<jobnumber>)(.*?)(?:\\n\|$)/i;
237	my ($classifications) = $$gdb_str_ref =~ /(?:<x>\|<classification>)(.*?)(?:\\n\|$)/i;
238	my $hastext = 1;
239	$hastext = 0 if ($$gdb_str_ref =~ /(?:<c>\|<contains>)(.*?)(?:\\n\|$)/i);
240	my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n\|$)/i;
241	my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n\|$)/i;
242
243	# just in case there are empty classifications
244	if ($hastext && !defined $docnum) {
245	print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
246	if ($verbosity >= 2) {
247	my $valuestr = $$gdb_str_ref;
248	$valuestr =~ s/\\n/\n/g;
249	print STDERR "$valuestr\n";
250	}
251	$hastext = 0;
252	}
253
254	# shove metadata in 9999999 to keep it numeric and prevent
255	# sorting from being a pain. Watch out for documents with
256	# 9999999 subsections ;-)
257	if (defined $hashsection->{'9999999'}) {
258	print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
259	} else {
260	$hashsection->{'9999999'}->{'Title'} = $title if defined $title;
261	$hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
262	$hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
263	$hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
264	$hashsection->{'9999999'}->{'hastext'} = $hastext;
265	$hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
266	$hashsection->{'9999999'}->{'Date'} = $date if defined $date;
267	}
268	}
269
270	sub recurse_classinfo {
271	my ($doc_obj, $hashsection, $cursection, $class) = @_;
272	foreach $section (sort numerically keys %$hashsection) {
273	next if $section == 9999999;
274	if ($class) {
275	$doc_obj->create_named_section("$class.$section");
276	&add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
277	&recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
278	} else {
279	$cursection =
280	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
281	&add_section_content ($doc_obj, $cursection, $hashsection->{$section});
282	&recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
283	}
284	}
285	}
286
287	sub add_section_content {
288	my ($doc_obj, $cursection, $hashsection) = @_;
289
290	$doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
291	if defined $hashsection->{'9999999'}->{'Title'};
292	$doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
293	if defined $hashsection->{'9999999'}->{'Creator'};
294	$doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
295	if defined $hashsection->{'9999999'}->{'Date'};
296
297	if (defined $hashsection->{'9999999'}->{'classifications'}) {
298	my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
299	map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
300	}
301
302	$doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
303	if ($hashsection->{'9999999'}->{'hastext'});
304	}
305
306	sub set_index {
307	# check that $collection has been set
308	die "collection global was not set\n"
309	unless defined $collection && $collection =~ /\w/;
310
311	# find an index (just use first non-text directory we come across in $indexdir)
312	opendir (INDEXDIR, $indexdir) \|\| die "couldn't open directory $indexdir\n";
313	my @indexes = readdir INDEXDIR;
314	close INDEXDIR;
315	foreach $i (@indexes) {
316	next if $i =~ /text$/i \|\| $i =~ /\./ ;
317	$index = &util::filename_cat ($i, $collection);
318	last;
319	}
320	}
321
322	sub get_text {
323	my ($docnum) = @_;
324
325	print STDERR "." if $verbosity >= 2;
326	&mgcommand ($docnum);
327
328	<$mgread>; # eat the document separator
329
330	my $text = "";
331	my $line = "";
332
333	while (defined ($line = <$mgread>))
334	{
335	last if $line =~ /^<\/mg>/;
336	$text .= $line;
337	}
338
339	# Read in the last statement, which should be:
340	# "dd documents retrieved."
341	<$mgread>;
342
343	return $text;
344	}
345
346	sub numerically {$a <=> $b;}
347
348	# converts leading letter of a classification into its ascii equivalent
349	# i.e C.2.4 becomes 67.2.4
350	sub int_classification {
351	my ($classification) = @_;
352	my $c = ord($classification);
353	$classification =~ s/^./$c/;
354
355	return $classification;
356	}
357
358
359	# mg stuff
360
361	sub openmg {
362
363	die "Unable to start mgquery." unless
364	openpipe($mgread, $mgwrite,
365	"mgquery -d $indexdir -f $index -t $textdir");
366
367	$mgwrite->autoflush();
368
369	&mgcommand('.set expert true');
370	&mgcommand('.set terminator "</mg>\n"');
371	&mgcommand('.set mode text');
372	&mgcommand('.set query docnums');
373	&mgcommand('.set term_freq off');
374	}
375
376	sub closemg {
377	&mgcommand (".quit");
378	close($mgread);
379	close($mgwrite);
380	}
381
382	sub mgcommand {
383	my ($command) = @_;
384
385	return if $command =~ /^\s*$/;
386	print $mgwrite "$command\n";
387
388	# eat up the command executed which is echoed
389	<$mgread>;
390	}
391
392	# openpipe(READ, WRITE, CMD)
393	#
394	# Like open2, except CMD's stderr is also redirected.
395	#
396	sub openpipe
397	{
398	my ($read, $write, $cmd) = @_;
399	my ($child_read, $child_write);
400
401	$child_read = ++$FileHandle;
402	$child_write = ++$FileHandle;
403
404	pipe($read, $child_write) \|\| die "Failed pipe($read, $child_write): $!";
405	pipe($child_read, $write) \|\| die "Failed pipe($child_read, $write): $!";
406	my $pid;
407
408	if (($pid = fork) < 0) {
409	die "Failed fork: $!";
410	} elsif ($pid == 0) {
411	close($read);
412	close($write);
413	open(STDIN, "<&$child_read");
414	open(STDOUT, ">&$child_write");
415	open(STDERR, ">&$child_write");
416	exec($cmd);
417	die "Failed exec $cmd: $!";
418	}
419
420	close($child_read);
421	close($child_write);
422
423	$write->autoflush();
424	$read->autoflush();
425
426	return 1;
427	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: