source: trunk/gsdl/bin/script/unindex.pl@ 1970

Last change on this file since 1970 was 1970, checked in by sjboddie, 23 years ago

Added more usage information to all perl programs and removed a few
programs that are no longer useful.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# unindex.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# this program will decompress all the text from a built index
29# and return it to gml format - may be slightly broken at present
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
34 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
35 $FileHandle = 'FH000';
36}
37
38use doc;
39use docsave;
40use util;
41use parsargv;
42use GDBM_File;
43use FileHandle;
44use English;
45
46select STDERR; $| = 1;
47select STDOUT; $| = 1;
48
49
50# globals
51$collection = "";
52$index = "";
53$textdir = "";
54$classinfo = {};
55$mgread = ++$FileHandle;
56$mgwrite = ++$FileHandle;
57
58
59
60sub print_usage {
61 print STDERR "\n";
62 print STDERR "unindex.pl: Attempts to generate gml files from a built Greenstone index.\n\n";
63 print STDERR " usage: $0 [options]\n\n";
64 print STDERR " options:\n";
65 print STDERR " -verbosity number 0=none, 3=lots\n";
66 print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
67 print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n";
68 print STDERR " -removeold Will remove the old contents of the archives\n";
69 print STDERR " directory -- use with care\n\n";
70}
71
72&main ();
73
74sub main {
75 if (!parsargv::parse(\@ARGV,
76 'verbosity/\d+/2', \$verbosity,
77 'indexdir/.*/index', \$indexdir,
78 'archivedir/.*/archives', \$archivedir,
79 'removeold', \$removeold)) {
80 &print_usage();
81 die "\n";
82 }
83
84 die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
85 $indexdir =~ s/\/$//;
86 if (-d $archivedir) {
87 if ($removeold) {
88 print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
89 print STDERR " in preparation for the import\n";
90 sleep(5); # just in case...
91 &util::rm_r ($archivedir);
92 }
93 } else {
94 &util::mk_all_dir ($archivedir);
95 }
96
97 my $gdbmfile = &get_gdbmfile ($indexdir);
98 &set_index ();
99 &get_classinfo ($gdbmfile);
100 &openmg ();
101
102 # read the archive information file
103 my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
104 my $archive_info = new arcinfo ();
105 $archive_info->load_info ($archive_info_filename);
106
107 # create a docsave object to process the documents
108 my $processor = new docsave ("", $archive_info, $verbosity);
109 $processor->setarchivedir ($archivedir);
110
111 # create a document object for the classification file
112 my $clsudoc_obj = new doc($gdbmfile, "classification");
113 $clsudoc_obj->set_OID ("CLSU");
114
115 my ($doc_obj);
116 foreach $classification (keys %$classinfo) {
117 my $count = 0;
118 foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
119 print STDERR "\n$classification.$section" if $verbosity >= 2;
120 if ($classification =~ /^B$/i) {
121 last if $count > 10;
122 # create a new document
123 $doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'},
124 "indexed_doc");
125 $doc_obj->set_OID();
126 my $cursection = $doc_obj->get_top_section();
127 &add_section_content ($doc_obj, $cursection,
128 $classinfo->{$classification}->{$section});
129 &recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section},
130 $doc_obj->get_end_child($cursection), 0);
131 $processor->process($doc_obj);
132 } else {
133 my $classifier = &int_classification ($classification);
134 $clsudoc_obj->create_named_section("$classifier.$section");
135 &add_section_content ($clsudoc_obj, "$classifier.$section",
136 $classinfo->{$classification}->{$section});
137 &recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section},
138 "", "$classifier.$section");
139 }
140 $count ++;
141 }
142 }
143 $processor->process($clsudoc_obj);
144 &closemg();
145
146 # write out the archive information file
147 $archive_info->save_info($archive_info_filename);
148}
149
150# returns the path to the gdbm info database - also
151# sets the $collection and $textdir global variable
152sub get_gdbmfile {
153 my ($indexdir) = @_;
154 my ($gdbmfile);
155
156 opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
157 my @conts = readdir DIR;
158 close DIR;
159
160 foreach $file (@conts) {
161 if ($file =~ /text$/) {
162 $textdir = $file;
163 last;
164 }
165 }
166 die "No text directory found in $indexdir\n\n"
167 unless defined $textdir && $textdir =~ /text$/;
168
169 $gdbmfile = &util::filename_cat ($indexdir, $textdir);
170
171 opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
172 @conts = readdir DIR;
173 close DIR;
174
175 foreach $file (@conts) {
176 if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
177 $collection = $1;
178 $gdbmfile = &util::filename_cat ($gdbmfile, $file);
179 last;
180 }
181 }
182
183 if (defined $collection && $collection =~ /\w/) {
184 $textdir = &util::filename_cat ($textdir, $collection);
185 } else {
186 die "collection global wasn't set\n";
187 }
188 return $gdbmfile if (-e $gdbmfile);
189 die "Couldn't find gdbm info database in $indexdir\n\n";
190}
191
192sub get_classinfo {
193 my ($gdbmfile) = @_;
194 my ($class);
195
196 open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
197
198 print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
199 my $count = 0;
200
201 $/ = '-' x 70;
202 my $entry = "";
203 while (defined ($entry = <DB2TXT>)) {
204 next unless $entry =~ /\w/;
205 $entry =~ s/\n+/\\n/g;
206 my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;
207
208 print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);
209
210 next if ($key !~ /\./); # ignore the docnums and top level of classifications
211
212 die "Badly formatted key $key in $gdbmfile\n\n"
213 unless (($class, $section) = $key =~ /^(.)\.(.*)$/);
214 $classinfo->{$class} = {} unless defined $classinfo->{$class};
215 my @sections = split /\./, $section;
216
217 my $hashptr = $classinfo->{$class};
218 foreach $sec (@sections) {
219 $hashptr->{$sec} = {} unless defined $hashptr->{$sec};
220 $hashptr = $hashptr->{$sec};
221 }
222 &get_metadata ($key, \$value, $hashptr);
223
224 $count ++;
225 }
226
227 $/ = "\n";
228 print STDERR "\n" if $verbosity >= 2;
229}
230
231sub get_metadata {
232 my ($key, $gdb_str_ref, $hashsection) = @_;
233
234 my ($title) = $$gdb_str_ref =~ /(?:<t>|<title>)(.*?)(?:\\n|$)/i;
235 my ($docnum) = $$gdb_str_ref =~ /(?:<d>|<docnum>)(.*?)(?:\\n|$)/i;
236 my ($jobnum) = $$gdb_str_ref =~ /(?:<j>|<jobnumber>)(.*?)(?:\\n|$)/i;
237 my ($classifications) = $$gdb_str_ref =~ /(?:<x>|<classification>)(.*?)(?:\\n|$)/i;
238 my $hastext = 1;
239 $hastext = 0 if ($$gdb_str_ref =~ /(?:<c>|<contains>)(.*?)(?:\\n|$)/i);
240 my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n|$)/i;
241 my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n|$)/i;
242
243 # just in case there are empty classifications
244 if ($hastext && !defined $docnum) {
245 print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
246 if ($verbosity >= 2) {
247 my $valuestr = $$gdb_str_ref;
248 $valuestr =~ s/\\n/\n/g;
249 print STDERR "$valuestr\n";
250 }
251 $hastext = 0;
252 }
253
254 # shove metadata in 9999999 to keep it numeric and prevent
255 # sorting from being a pain. Watch out for documents with
256 # 9999999 subsections ;-)
257 if (defined $hashsection->{'9999999'}) {
258 print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
259 } else {
260 $hashsection->{'9999999'}->{'Title'} = $title if defined $title;
261 $hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
262 $hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
263 $hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
264 $hashsection->{'9999999'}->{'hastext'} = $hastext;
265 $hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
266 $hashsection->{'9999999'}->{'Date'} = $date if defined $date;
267 }
268}
269
270sub recurse_classinfo {
271 my ($doc_obj, $hashsection, $cursection, $class) = @_;
272 foreach $section (sort numerically keys %$hashsection) {
273 next if $section == 9999999;
274 if ($class) {
275 $doc_obj->create_named_section("$class.$section");
276 &add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
277 &recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
278 } else {
279 $cursection =
280 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
281 &add_section_content ($doc_obj, $cursection, $hashsection->{$section});
282 &recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
283 }
284 }
285}
286
287sub add_section_content {
288 my ($doc_obj, $cursection, $hashsection) = @_;
289
290 $doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
291 if defined $hashsection->{'9999999'}->{'Title'};
292 $doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
293 if defined $hashsection->{'9999999'}->{'Creator'};
294 $doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
295 if defined $hashsection->{'9999999'}->{'Date'};
296
297 if (defined $hashsection->{'9999999'}->{'classifications'}) {
298 my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
299 map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
300 }
301
302 $doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
303 if ($hashsection->{'9999999'}->{'hastext'});
304}
305
306sub set_index {
307 # check that $collection has been set
308 die "collection global was not set\n"
309 unless defined $collection && $collection =~ /\w/;
310
311 # find an index (just use first non-text directory we come across in $indexdir)
312 opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
313 my @indexes = readdir INDEXDIR;
314 close INDEXDIR;
315 foreach $i (@indexes) {
316 next if $i =~ /text$/i || $i =~ /\./ ;
317 $index = &util::filename_cat ($i, $collection);
318 last;
319 }
320}
321
322sub get_text {
323 my ($docnum) = @_;
324
325 print STDERR "." if $verbosity >= 2;
326 &mgcommand ($docnum);
327
328 <$mgread>; # eat the document separator
329
330 my $text = "";
331 my $line = "";
332
333 while (defined ($line = <$mgread>))
334 {
335 last if $line =~ /^<\/mg>/;
336 $text .= $line;
337 }
338
339 # Read in the last statement, which should be:
340 # "dd documents retrieved."
341 <$mgread>;
342
343 return $text;
344}
345
346sub numerically {$a <=> $b;}
347
348# converts leading letter of a classification into its ascii equivalent
349# i.e C.2.4 becomes 67.2.4
350sub int_classification {
351 my ($classification) = @_;
352 my $c = ord($classification);
353 $classification =~ s/^./$c/;
354
355 return $classification;
356}
357
358
359# mg stuff
360
361sub openmg {
362
363 die "Unable to start mgquery." unless
364 openpipe($mgread, $mgwrite,
365 "mgquery -d $indexdir -f $index -t $textdir");
366
367 $mgwrite->autoflush();
368
369 &mgcommand('.set expert true');
370 &mgcommand('.set terminator "</mg>\n"');
371 &mgcommand('.set mode text');
372 &mgcommand('.set query docnums');
373 &mgcommand('.set term_freq off');
374}
375
376sub closemg {
377 &mgcommand (".quit");
378 close($mgread);
379 close($mgwrite);
380}
381
382sub mgcommand {
383 my ($command) = @_;
384
385 return if $command =~ /^\s*$/;
386 print $mgwrite "$command\n";
387
388 # eat up the command executed which is echoed
389 <$mgread>;
390}
391
392# openpipe(READ, WRITE, CMD)
393#
394# Like open2, except CMD's stderr is also redirected.
395#
396sub openpipe
397{
398 my ($read, $write, $cmd) = @_;
399 my ($child_read, $child_write);
400
401 $child_read = ++$FileHandle;
402 $child_write = ++$FileHandle;
403
404 pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
405 pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
406 my $pid;
407
408 if (($pid = fork) < 0) {
409 die "Failed fork: $!";
410 } elsif ($pid == 0) {
411 close($read);
412 close($write);
413 open(STDIN, "<&$child_read");
414 open(STDOUT, ">&$child_write");
415 open(STDERR, ">&$child_write");
416 exec($cmd);
417 die "Failed exec $cmd: $!";
418 }
419
420 close($child_read);
421 close($child_write);
422
423 $write->autoflush();
424 $read->autoflush();
425
426 return 1;
427}
Note: See TracBrowser for help on using the repository browser.