source: trunk/gsdl/bin/script/unindex.pl@ 1666

Last change on this file since 1666 was 1666, checked in by sjboddie, 23 years ago

another script with old #! line

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# unindex.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# this program will decompress all the text from a built index
29# and return it to gml format - may be slightly broken at present
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
34 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
35 $FileHandle = 'FH000';
36}
37
38use doc;
39use docsave;
40use util;
41use parsargv;
42use GDBM_File;
43use FileHandle;
44use English;
45
46select STDERR; $| = 1;
47select STDOUT; $| = 1;
48
49
50# globals
51$collection = "";
52$index = "";
53$textdir = "";
54$classinfo = {};
55$mgread = ++$FileHandle;
56$mgwrite = ++$FileHandle;
57
58
59
60sub print_usage {
61 print STDERR "\n usage: $0 [options]\n\n";
62 print STDERR " options:\n";
63 print STDERR " -verbosity number 0=none, 3=lots\n";
64 print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
65 print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n";
66 print STDERR " -removeold Will remove the old contents of the archives\n";
67 print STDERR " directory -- use with care\n\n";
68}
69
70&main ();
71
72sub main {
73 if (!parsargv::parse(\@ARGV,
74 'verbosity/\d+/2', \$verbosity,
75 'indexdir/.*/index', \$indexdir,
76 'archivedir/.*/archives', \$archivedir,
77 'removeold', \$removeold)) {
78 &print_usage();
79 die "\n";
80 }
81
82 die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
83 $indexdir =~ s/\/$//;
84 if (-d $archivedir) {
85 if ($removeold) {
86 print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
87 print STDERR " in preparation for the import\n";
88 sleep(5); # just in case...
89 &util::rm_r ($archivedir);
90 }
91 } else {
92 &util::mk_all_dir ($archivedir);
93 }
94
95 my $gdbmfile = &get_gdbmfile ($indexdir);
96 &set_index ();
97 &get_classinfo ($gdbmfile);
98 &openmg ();
99
100 # read the archive information file
101 my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
102 my $archive_info = new arcinfo ();
103 $archive_info->load_info ($archive_info_filename);
104
105 # create a docsave object to process the documents
106 my $processor = new docsave ("", $archive_info, $verbosity);
107 $processor->setarchivedir ($archivedir);
108
109 # create a document object for the classification file
110 my $clsudoc_obj = new doc($gdbmfile, "classification");
111 $clsudoc_obj->set_OID ("CLSU");
112
113 my ($doc_obj);
114 foreach $classification (keys %$classinfo) {
115 my $count = 0;
116 foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
117 print STDERR "\n$classification.$section" if $verbosity >= 2;
118 if ($classification =~ /^B$/i) {
119 last if $count > 10;
120 # create a new document
121 $doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'},
122 "indexed_doc");
123 $doc_obj->set_OID();
124 my $cursection = $doc_obj->get_top_section();
125 &add_section_content ($doc_obj, $cursection,
126 $classinfo->{$classification}->{$section});
127 &recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section},
128 $doc_obj->get_end_child($cursection), 0);
129 $processor->process($doc_obj);
130 } else {
131 my $classifier = &int_classification ($classification);
132 $clsudoc_obj->create_named_section("$classifier.$section");
133 &add_section_content ($clsudoc_obj, "$classifier.$section",
134 $classinfo->{$classification}->{$section});
135 &recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section},
136 "", "$classifier.$section");
137 }
138 $count ++;
139 }
140 }
141 $processor->process($clsudoc_obj);
142 &closemg();
143
144 # write out the archive information file
145 $archive_info->save_info($archive_info_filename);
146}
147
148# returns the path to the gdbm info database - also
149# sets the $collection and $textdir global variable
150sub get_gdbmfile {
151 my ($indexdir) = @_;
152 my ($gdbmfile);
153
154 opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
155 my @conts = readdir DIR;
156 close DIR;
157
158 foreach $file (@conts) {
159 if ($file =~ /text$/) {
160 $textdir = $file;
161 last;
162 }
163 }
164 die "No text directory found in $indexdir\n\n"
165 unless defined $textdir && $textdir =~ /text$/;
166
167 $gdbmfile = &util::filename_cat ($indexdir, $textdir);
168
169 opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
170 @conts = readdir DIR;
171 close DIR;
172
173 foreach $file (@conts) {
174 if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
175 $collection = $1;
176 $gdbmfile = &util::filename_cat ($gdbmfile, $file);
177 last;
178 }
179 }
180
181 if (defined $collection && $collection =~ /\w/) {
182 $textdir = &util::filename_cat ($textdir, $collection);
183 } else {
184 die "collection global wasn't set\n";
185 }
186 return $gdbmfile if (-e $gdbmfile);
187 die "Couldn't find gdbm info database in $indexdir\n\n";
188}
189
190sub get_classinfo {
191 my ($gdbmfile) = @_;
192 my ($class);
193
194 open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
195
196 print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
197 my $count = 0;
198
199 $/ = '-' x 70;
200 my $entry = "";
201 while (defined ($entry = <DB2TXT>)) {
202 next unless $entry =~ /\w/;
203 $entry =~ s/\n+/\\n/g;
204 my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;
205
206 print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);
207
208 next if ($key !~ /\./); # ignore the docnums and top level of classifications
209
210 die "Badly formatted key $key in $gdbmfile\n\n"
211 unless (($class, $section) = $key =~ /^(.)\.(.*)$/);
212 $classinfo->{$class} = {} unless defined $classinfo->{$class};
213 my @sections = split /\./, $section;
214
215 my $hashptr = $classinfo->{$class};
216 foreach $sec (@sections) {
217 $hashptr->{$sec} = {} unless defined $hashptr->{$sec};
218 $hashptr = $hashptr->{$sec};
219 }
220 &get_metadata ($key, \$value, $hashptr);
221
222 $count ++;
223 }
224
225 $/ = "\n";
226 print STDERR "\n" if $verbosity >= 2;
227}
228
229sub get_metadata {
230 my ($key, $gdb_str_ref, $hashsection) = @_;
231
232 my ($title) = $$gdb_str_ref =~ /(?:<t>|<title>)(.*?)(?:\\n|$)/i;
233 my ($docnum) = $$gdb_str_ref =~ /(?:<d>|<docnum>)(.*?)(?:\\n|$)/i;
234 my ($jobnum) = $$gdb_str_ref =~ /(?:<j>|<jobnumber>)(.*?)(?:\\n|$)/i;
235 my ($classifications) = $$gdb_str_ref =~ /(?:<x>|<classification>)(.*?)(?:\\n|$)/i;
236 my $hastext = 1;
237 $hastext = 0 if ($$gdb_str_ref =~ /(?:<c>|<contains>)(.*?)(?:\\n|$)/i);
238 my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n|$)/i;
239 my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n|$)/i;
240
241 # just in case there are empty classifications
242 if ($hastext && !defined $docnum) {
243 print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
244 if ($verbosity >= 2) {
245 my $valuestr = $$gdb_str_ref;
246 $valuestr =~ s/\\n/\n/g;
247 print STDERR "$valuestr\n";
248 }
249 $hastext = 0;
250 }
251
252 # shove metadata in 9999999 to keep it numeric and prevent
253 # sorting from being a pain. Watch out for documents with
254 # 9999999 subsections ;-)
255 if (defined $hashsection->{'9999999'}) {
256 print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
257 } else {
258 $hashsection->{'9999999'}->{'Title'} = $title if defined $title;
259 $hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
260 $hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
261 $hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
262 $hashsection->{'9999999'}->{'hastext'} = $hastext;
263 $hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
264 $hashsection->{'9999999'}->{'Date'} = $date if defined $date;
265 }
266}
267
268sub recurse_classinfo {
269 my ($doc_obj, $hashsection, $cursection, $class) = @_;
270 foreach $section (sort numerically keys %$hashsection) {
271 next if $section == 9999999;
272 if ($class) {
273 $doc_obj->create_named_section("$class.$section");
274 &add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
275 &recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
276 } else {
277 $cursection =
278 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
279 &add_section_content ($doc_obj, $cursection, $hashsection->{$section});
280 &recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
281 }
282 }
283}
284
285sub add_section_content {
286 my ($doc_obj, $cursection, $hashsection) = @_;
287
288 $doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
289 if defined $hashsection->{'9999999'}->{'Title'};
290 $doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
291 if defined $hashsection->{'9999999'}->{'Creator'};
292 $doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
293 if defined $hashsection->{'9999999'}->{'Date'};
294
295 if (defined $hashsection->{'9999999'}->{'classifications'}) {
296 my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
297 map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
298 }
299
300 $doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
301 if ($hashsection->{'9999999'}->{'hastext'});
302}
303
304sub set_index {
305 # check that $collection has been set
306 die "collection global was not set\n"
307 unless defined $collection && $collection =~ /\w/;
308
309 # find an index (just use first non-text directory we come across in $indexdir)
310 opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
311 my @indexes = readdir INDEXDIR;
312 close INDEXDIR;
313 foreach $i (@indexes) {
314 next if $i =~ /text$/i || $i =~ /\./ ;
315 $index = &util::filename_cat ($i, $collection);
316 last;
317 }
318}
319
320sub get_text {
321 my ($docnum) = @_;
322
323 print STDERR "." if $verbosity >= 2;
324 &mgcommand ($docnum);
325
326 <$mgread>; # eat the document separator
327
328 my $text = "";
329 my $line = "";
330
331 while (defined ($line = <$mgread>))
332 {
333 last if $line =~ /^<\/mg>/;
334 $text .= $line;
335 }
336
337 # Read in the last statement, which should be:
338 # "dd documents retrieved."
339 <$mgread>;
340
341 return $text;
342}
343
344sub numerically {$a <=> $b;}
345
346# converts leading letter of a classification into its ascii equivalent
347# i.e C.2.4 becomes 67.2.4
348sub int_classification {
349 my ($classification) = @_;
350 my $c = ord($classification);
351 $classification =~ s/^./$c/;
352
353 return $classification;
354}
355
356
357# mg stuff
358
359sub openmg {
360
361 die "Unable to start mgquery." unless
362 openpipe($mgread, $mgwrite,
363 "mgquery -d $indexdir -f $index -t $textdir");
364
365 $mgwrite->autoflush();
366
367 &mgcommand('.set expert true');
368 &mgcommand('.set terminator "</mg>\n"');
369 &mgcommand('.set mode text');
370 &mgcommand('.set query docnums');
371 &mgcommand('.set term_freq off');
372}
373
374sub closemg {
375 &mgcommand (".quit");
376 close($mgread);
377 close($mgwrite);
378}
379
380sub mgcommand {
381 my ($command) = @_;
382
383 return if $command =~ /^\s*$/;
384 print $mgwrite "$command\n";
385
386 # eat up the command executed which is echoed
387 <$mgread>;
388}
389
390# openpipe(READ, WRITE, CMD)
391#
392# Like open2, except CMD's stderr is also redirected.
393#
394sub openpipe
395{
396 my ($read, $write, $cmd) = @_;
397 my ($child_read, $child_write);
398
399 $child_read = ++$FileHandle;
400 $child_write = ++$FileHandle;
401
402 pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
403 pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
404 my $pid;
405
406 if (($pid = fork) < 0) {
407 die "Failed fork: $!";
408 } elsif ($pid == 0) {
409 close($read);
410 close($write);
411 open(STDIN, "<&$child_read");
412 open(STDOUT, ">&$child_write");
413 open(STDERR, ">&$child_write");
414 exec($cmd);
415 die "Failed exec $cmd: $!";
416 }
417
418 close($child_read);
419 close($child_write);
420
421 $write->autoflush();
422 $read->autoflush();
423
424 return 1;
425}
Note: See TracBrowser for help on using the repository browser.