Context Navigation

source: trunk/gsdl/bin/script/unindex.pl@ 1666

Last change on this file since 1666 was 1666, checked in by sjboddie, 23 years ago
another script with old #! line
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# unindex.pl --
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# this program will decompress all the text from a built index
29	# and return it to gml format - may be slightly broken at present
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
34	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
35	$FileHandle = 'FH000';
36	}
37
38	use doc;
39	use docsave;
40	use util;
41	use parsargv;
42	use GDBM_File;
43	use FileHandle;
44	use English;
45
46	select STDERR; $\| = 1;
47	select STDOUT; $\| = 1;
48
49
50	# globals
51	$collection = "";
52	$index = "";
53	$textdir = "";
54	$classinfo = {};
55	$mgread = ++$FileHandle;
56	$mgwrite = ++$FileHandle;
57
58
59
60	sub print_usage {
61	print STDERR "\n usage: $0 [options]\n\n";
62	print STDERR " options:\n";
63	print STDERR " -verbosity number 0=none, 3=lots\n";
64	print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
65	print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n";
66	print STDERR " -removeold Will remove the old contents of the archives\n";
67	print STDERR " directory -- use with care\n\n";
68	}
69
70	&main ();
71
72	sub main {
73	if (!parsargv::parse(\@ARGV,
74	'verbosity/\d+/2', \$verbosity,
75	'indexdir/.*/index', \$indexdir,
76	'archivedir/.*/archives', \$archivedir,
77	'removeold', \$removeold)) {
78	&print_usage();
79	die "\n";
80	}
81
82	die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
83	$indexdir =~ s/\/$//;
84	if (-d $archivedir) {
85	if ($removeold) {
86	print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
87	print STDERR " in preparation for the import\n";
88	sleep(5); # just in case...
89	&util::rm_r ($archivedir);
90	}
91	} else {
92	&util::mk_all_dir ($archivedir);
93	}
94
95	my $gdbmfile = &get_gdbmfile ($indexdir);
96	&set_index ();
97	&get_classinfo ($gdbmfile);
98	&openmg ();
99
100	# read the archive information file
101	my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
102	my $archive_info = new arcinfo ();
103	$archive_info->load_info ($archive_info_filename);
104
105	# create a docsave object to process the documents
106	my $processor = new docsave ("", $archive_info, $verbosity);
107	$processor->setarchivedir ($archivedir);
108
109	# create a document object for the classification file
110	my $clsudoc_obj = new doc($gdbmfile, "classification");
111	$clsudoc_obj->set_OID ("CLSU");
112
113	my ($doc_obj);
114	foreach $classification (keys %$classinfo) {
115	my $count = 0;
116	foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
117	print STDERR "\n$classification.$section" if $verbosity >= 2;
118	if ($classification =~ /^B$/i) {
119	last if $count > 10;
120	# create a new document
121	$doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'},
122	"indexed_doc");
123	$doc_obj->set_OID();
124	my $cursection = $doc_obj->get_top_section();
125	&add_section_content ($doc_obj, $cursection,
126	$classinfo->{$classification}->{$section});
127	&recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section},
128	$doc_obj->get_end_child($cursection), 0);
129	$processor->process($doc_obj);
130	} else {
131	my $classifier = &int_classification ($classification);
132	$clsudoc_obj->create_named_section("$classifier.$section");
133	&add_section_content ($clsudoc_obj, "$classifier.$section",
134	$classinfo->{$classification}->{$section});
135	&recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section},
136	"", "$classifier.$section");
137	}
138	$count ++;
139	}
140	}
141	$processor->process($clsudoc_obj);
142	&closemg();
143
144	# write out the archive information file
145	$archive_info->save_info($archive_info_filename);
146	}
147
148	# returns the path to the gdbm info database - also
149	# sets the $collection and $textdir global variable
150	sub get_gdbmfile {
151	my ($indexdir) = @_;
152	my ($gdbmfile);
153
154	opendir (DIR, $indexdir) \|\| die "Couldn't open directory $indexdir\n\n";
155	my @conts = readdir DIR;
156	close DIR;
157
158	foreach $file (@conts) {
159	if ($file =~ /text$/) {
160	$textdir = $file;
161	last;
162	}
163	}
164	die "No text directory found in $indexdir\n\n"
165	unless defined $textdir && $textdir =~ /text$/;
166
167	$gdbmfile = &util::filename_cat ($indexdir, $textdir);
168
169	opendir (DIR, $gdbmfile) \|\| die "Couldn't open directory $gdbmfile\n\n";
170	@conts = readdir DIR;
171	close DIR;
172
173	foreach $file (@conts) {
174	if ($file =~ /^(.*?)\.(?:ldb\|bdb)$/) {
175	$collection = $1;
176	$gdbmfile = &util::filename_cat ($gdbmfile, $file);
177	last;
178	}
179	}
180
181	if (defined $collection && $collection =~ /\w/) {
182	$textdir = &util::filename_cat ($textdir, $collection);
183	} else {
184	die "collection global wasn't set\n";
185	}
186	return $gdbmfile if (-e $gdbmfile);
187	die "Couldn't find gdbm info database in $indexdir\n\n";
188	}
189
190	sub get_classinfo {
191	my ($gdbmfile) = @_;
192	my ($class);
193
194	open (DB2TXT, "db2txt $gdbmfile \|") \|\| die "couldn't open pipe to db2txt\n";
195
196	print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
197	my $count = 0;
198
199	$/ = '-' x 70;
200	my $entry = "";
201	while (defined ($entry = <DB2TXT>)) {
202	next unless $entry =~ /\w/;
203	$entry =~ s/\n+/\\n/g;
204	my ($key, $value) = $entry =~ /\[([^\]])\](.)/;
205
206	print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);
207
208	next if ($key !~ /\./); # ignore the docnums and top level of classifications
209
210	die "Badly formatted key $key in $gdbmfile\n\n"
211	unless (($class, $section) = $key =~ /^(.)\.(.*)$/);
212	$classinfo->{$class} = {} unless defined $classinfo->{$class};
213	my @sections = split /\./, $section;
214
215	my $hashptr = $classinfo->{$class};
216	foreach $sec (@sections) {
217	$hashptr->{$sec} = {} unless defined $hashptr->{$sec};
218	$hashptr = $hashptr->{$sec};
219	}
220	&get_metadata ($key, \$value, $hashptr);
221
222	$count ++;
223	}
224
225	$/ = "\n";
226	print STDERR "\n" if $verbosity >= 2;
227	}
228
229	sub get_metadata {
230	my ($key, $gdb_str_ref, $hashsection) = @_;
231
232	my ($title) = $$gdb_str_ref =~ /(?:<t>\|<title>)(.*?)(?:\\n\|$)/i;
233	my ($docnum) = $$gdb_str_ref =~ /(?:<d>\|<docnum>)(.*?)(?:\\n\|$)/i;
234	my ($jobnum) = $$gdb_str_ref =~ /(?:<j>\|<jobnumber>)(.*?)(?:\\n\|$)/i;
235	my ($classifications) = $$gdb_str_ref =~ /(?:<x>\|<classification>)(.*?)(?:\\n\|$)/i;
236	my $hastext = 1;
237	$hastext = 0 if ($$gdb_str_ref =~ /(?:<c>\|<contains>)(.*?)(?:\\n\|$)/i);
238	my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n\|$)/i;
239	my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n\|$)/i;
240
241	# just in case there are empty classifications
242	if ($hastext && !defined $docnum) {
243	print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
244	if ($verbosity >= 2) {
245	my $valuestr = $$gdb_str_ref;
246	$valuestr =~ s/\\n/\n/g;
247	print STDERR "$valuestr\n";
248	}
249	$hastext = 0;
250	}
251
252	# shove metadata in 9999999 to keep it numeric and prevent
253	# sorting from being a pain. Watch out for documents with
254	# 9999999 subsections ;-)
255	if (defined $hashsection->{'9999999'}) {
256	print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
257	} else {
258	$hashsection->{'9999999'}->{'Title'} = $title if defined $title;
259	$hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
260	$hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
261	$hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
262	$hashsection->{'9999999'}->{'hastext'} = $hastext;
263	$hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
264	$hashsection->{'9999999'}->{'Date'} = $date if defined $date;
265	}
266	}
267
268	sub recurse_classinfo {
269	my ($doc_obj, $hashsection, $cursection, $class) = @_;
270	foreach $section (sort numerically keys %$hashsection) {
271	next if $section == 9999999;
272	if ($class) {
273	$doc_obj->create_named_section("$class.$section");
274	&add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
275	&recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
276	} else {
277	$cursection =
278	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
279	&add_section_content ($doc_obj, $cursection, $hashsection->{$section});
280	&recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
281	}
282	}
283	}
284
285	sub add_section_content {
286	my ($doc_obj, $cursection, $hashsection) = @_;
287
288	$doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
289	if defined $hashsection->{'9999999'}->{'Title'};
290	$doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
291	if defined $hashsection->{'9999999'}->{'Creator'};
292	$doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
293	if defined $hashsection->{'9999999'}->{'Date'};
294
295	if (defined $hashsection->{'9999999'}->{'classifications'}) {
296	my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
297	map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
298	}
299
300	$doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
301	if ($hashsection->{'9999999'}->{'hastext'});
302	}
303
304	sub set_index {
305	# check that $collection has been set
306	die "collection global was not set\n"
307	unless defined $collection && $collection =~ /\w/;
308
309	# find an index (just use first non-text directory we come across in $indexdir)
310	opendir (INDEXDIR, $indexdir) \|\| die "couldn't open directory $indexdir\n";
311	my @indexes = readdir INDEXDIR;
312	close INDEXDIR;
313	foreach $i (@indexes) {
314	next if $i =~ /text$/i \|\| $i =~ /\./ ;
315	$index = &util::filename_cat ($i, $collection);
316	last;
317	}
318	}
319
320	sub get_text {
321	my ($docnum) = @_;
322
323	print STDERR "." if $verbosity >= 2;
324	&mgcommand ($docnum);
325
326	<$mgread>; # eat the document separator
327
328	my $text = "";
329	my $line = "";
330
331	while (defined ($line = <$mgread>))
332	{
333	last if $line =~ /^<\/mg>/;
334	$text .= $line;
335	}
336
337	# Read in the last statement, which should be:
338	# "dd documents retrieved."
339	<$mgread>;
340
341	return $text;
342	}
343
344	sub numerically {$a <=> $b;}
345
346	# converts leading letter of a classification into its ascii equivalent
347	# i.e C.2.4 becomes 67.2.4
348	sub int_classification {
349	my ($classification) = @_;
350	my $c = ord($classification);
351	$classification =~ s/^./$c/;
352
353	return $classification;
354	}
355
356
357	# mg stuff
358
359	sub openmg {
360
361	die "Unable to start mgquery." unless
362	openpipe($mgread, $mgwrite,
363	"mgquery -d $indexdir -f $index -t $textdir");
364
365	$mgwrite->autoflush();
366
367	&mgcommand('.set expert true');
368	&mgcommand('.set terminator "</mg>\n"');
369	&mgcommand('.set mode text');
370	&mgcommand('.set query docnums');
371	&mgcommand('.set term_freq off');
372	}
373
374	sub closemg {
375	&mgcommand (".quit");
376	close($mgread);
377	close($mgwrite);
378	}
379
380	sub mgcommand {
381	my ($command) = @_;
382
383	return if $command =~ /^\s*$/;
384	print $mgwrite "$command\n";
385
386	# eat up the command executed which is echoed
387	<$mgread>;
388	}
389
390	# openpipe(READ, WRITE, CMD)
391	#
392	# Like open2, except CMD's stderr is also redirected.
393	#
394	sub openpipe
395	{
396	my ($read, $write, $cmd) = @_;
397	my ($child_read, $child_write);
398
399	$child_read = ++$FileHandle;
400	$child_write = ++$FileHandle;
401
402	pipe($read, $child_write) \|\| die "Failed pipe($read, $child_write): $!";
403	pipe($child_read, $write) \|\| die "Failed pipe($child_read, $write): $!";
404	my $pid;
405
406	if (($pid = fork) < 0) {
407	die "Failed fork: $!";
408	} elsif ($pid == 0) {
409	close($read);
410	close($write);
411	open(STDIN, "<&$child_read");
412	open(STDOUT, ">&$child_write");
413	open(STDERR, ">&$child_write");
414	exec($cmd);
415	die "Failed exec $cmd: $!";
416	}
417
418	close($child_read);
419	close($child_write);
420
421	$write->autoflush();
422	$read->autoflush();
423
424	return 1;
425	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: