1 | #!/usr/local/bin/perl5 -w
|
---|
2 |
|
---|
3 | ###########################################################################
|
---|
4 | #
|
---|
5 | # unindex.pl --
|
---|
6 | # A component of the Greenstone digital library software
|
---|
7 | # from the New Zealand Digital Library Project at the
|
---|
8 | # University of Waikato, New Zealand.
|
---|
9 | #
|
---|
10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
11 | #
|
---|
12 | # This program is free software; you can redistribute it and/or modify
|
---|
13 | # it under the terms of the GNU General Public License as published by
|
---|
14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
15 | # (at your option) any later version.
|
---|
16 | #
|
---|
17 | # This program is distributed in the hope that it will be useful,
|
---|
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | # GNU General Public License for more details.
|
---|
21 | #
|
---|
22 | # You should have received a copy of the GNU General Public License
|
---|
23 | # along with this program; if not, write to the Free Software
|
---|
24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | #
|
---|
26 | ###########################################################################
|
---|
27 |
|
---|
28 | # this program will decompress all the text from a built index
|
---|
29 | # and return it to gml format - may be slightly broken at present
|
---|
30 |
|
---|
31 | BEGIN {
|
---|
32 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
33 | die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
|
---|
34 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
35 | $FileHandle = 'FH000';
|
---|
36 | }
|
---|
37 |
|
---|
38 | use doc;
|
---|
39 | use docsave;
|
---|
40 | use util;
|
---|
41 | use parsargv;
|
---|
42 | use GDBM_File;
|
---|
43 | use FileHandle;
|
---|
44 | use English;
|
---|
45 |
|
---|
46 | select STDERR; $| = 1;
|
---|
47 | select STDOUT; $| = 1;
|
---|
48 |
|
---|
49 |
|
---|
50 | # globals
|
---|
51 | $collection = "";
|
---|
52 | $index = "";
|
---|
53 | $textdir = "";
|
---|
54 | $classinfo = {};
|
---|
55 | $mgread = ++$FileHandle;
|
---|
56 | $mgwrite = ++$FileHandle;
|
---|
57 |
|
---|
58 |
|
---|
59 |
|
---|
60 | sub print_usage {
|
---|
61 | print STDERR "\n usage: $0 [options]\n\n";
|
---|
62 | print STDERR " options:\n";
|
---|
63 | print STDERR " -verbosity number 0=none, 3=lots\n";
|
---|
64 | print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
|
---|
65 | print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n";
|
---|
66 | print STDERR " -removeold Will remove the old contents of the archives\n";
|
---|
67 | print STDERR " directory -- use with care\n\n";
|
---|
68 | }
|
---|
69 |
|
---|
70 | &main ();
|
---|
71 |
|
---|
72 | sub main {
|
---|
73 | if (!parsargv::parse(\@ARGV,
|
---|
74 | 'verbosity/\d+/2', \$verbosity,
|
---|
75 | 'indexdir/.*/index', \$indexdir,
|
---|
76 | 'archivedir/.*/archives', \$archivedir,
|
---|
77 | 'removeold', \$removeold)) {
|
---|
78 | &print_usage();
|
---|
79 | die "\n";
|
---|
80 | }
|
---|
81 |
|
---|
82 | die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
|
---|
83 | $indexdir =~ s/\/$//;
|
---|
84 | if (-d $archivedir) {
|
---|
85 | if ($removeold) {
|
---|
86 | print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
|
---|
87 | print STDERR " in preparation for the import\n";
|
---|
88 | sleep(5); # just in case...
|
---|
89 | &util::rm_r ($archivedir);
|
---|
90 | }
|
---|
91 | } else {
|
---|
92 | &util::mk_all_dir ($archivedir);
|
---|
93 | }
|
---|
94 |
|
---|
95 | my $gdbmfile = &get_gdbmfile ($indexdir);
|
---|
96 | &set_index ();
|
---|
97 | &get_classinfo ($gdbmfile);
|
---|
98 | &openmg ();
|
---|
99 |
|
---|
100 | # read the archive information file
|
---|
101 | my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
|
---|
102 | my $archive_info = new arcinfo ();
|
---|
103 | $archive_info->load_info ($archive_info_filename);
|
---|
104 |
|
---|
105 | # create a docsave object to process the documents
|
---|
106 | my $processor = new docsave ("", $archive_info, $verbosity);
|
---|
107 | $processor->setarchivedir ($archivedir);
|
---|
108 |
|
---|
109 | # create a document object for the classification file
|
---|
110 | my $clsudoc_obj = new doc($gdbmfile, "classification");
|
---|
111 | $clsudoc_obj->set_OID ("CLSU");
|
---|
112 |
|
---|
113 | my ($doc_obj);
|
---|
114 | foreach $classification (keys %$classinfo) {
|
---|
115 | my $count = 0;
|
---|
116 | foreach $section (sort numerically keys %{$classinfo->{$classification}}) {
|
---|
117 | print STDERR "\n$classification.$section" if $verbosity >= 2;
|
---|
118 | if ($classification =~ /^B$/i) {
|
---|
119 | last if $count > 10;
|
---|
120 | # create a new document
|
---|
121 | $doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'},
|
---|
122 | "indexed_doc");
|
---|
123 | $doc_obj->set_OID();
|
---|
124 | my $cursection = $doc_obj->get_top_section();
|
---|
125 | &add_section_content ($doc_obj, $cursection,
|
---|
126 | $classinfo->{$classification}->{$section});
|
---|
127 | &recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section},
|
---|
128 | $doc_obj->get_end_child($cursection), 0);
|
---|
129 | $processor->process($doc_obj);
|
---|
130 | } else {
|
---|
131 | my $classifier = &int_classification ($classification);
|
---|
132 | $clsudoc_obj->create_named_section("$classifier.$section");
|
---|
133 | &add_section_content ($clsudoc_obj, "$classifier.$section",
|
---|
134 | $classinfo->{$classification}->{$section});
|
---|
135 | &recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section},
|
---|
136 | "", "$classifier.$section");
|
---|
137 | }
|
---|
138 | $count ++;
|
---|
139 | }
|
---|
140 | }
|
---|
141 | $processor->process($clsudoc_obj);
|
---|
142 | &closemg();
|
---|
143 |
|
---|
144 | # write out the archive information file
|
---|
145 | $archive_info->save_info($archive_info_filename);
|
---|
146 | }
|
---|
147 |
|
---|
148 | # returns the path to the gdbm info database - also
|
---|
149 | # sets the $collection and $textdir global variable
|
---|
150 | sub get_gdbmfile {
|
---|
151 | my ($indexdir) = @_;
|
---|
152 | my ($gdbmfile);
|
---|
153 |
|
---|
154 | opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
|
---|
155 | my @conts = readdir DIR;
|
---|
156 | close DIR;
|
---|
157 |
|
---|
158 | foreach $file (@conts) {
|
---|
159 | if ($file =~ /text$/) {
|
---|
160 | $textdir = $file;
|
---|
161 | last;
|
---|
162 | }
|
---|
163 | }
|
---|
164 | die "No text directory found in $indexdir\n\n"
|
---|
165 | unless defined $textdir && $textdir =~ /text$/;
|
---|
166 |
|
---|
167 | $gdbmfile = &util::filename_cat ($indexdir, $textdir);
|
---|
168 |
|
---|
169 | opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
|
---|
170 | @conts = readdir DIR;
|
---|
171 | close DIR;
|
---|
172 |
|
---|
173 | foreach $file (@conts) {
|
---|
174 | if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
|
---|
175 | $collection = $1;
|
---|
176 | $gdbmfile = &util::filename_cat ($gdbmfile, $file);
|
---|
177 | last;
|
---|
178 | }
|
---|
179 | }
|
---|
180 |
|
---|
181 | if (defined $collection && $collection =~ /\w/) {
|
---|
182 | $textdir = &util::filename_cat ($textdir, $collection);
|
---|
183 | } else {
|
---|
184 | die "collection global wasn't set\n";
|
---|
185 | }
|
---|
186 | return $gdbmfile if (-e $gdbmfile);
|
---|
187 | die "Couldn't find gdbm info database in $indexdir\n\n";
|
---|
188 | }
|
---|
189 |
|
---|
190 | sub get_classinfo {
|
---|
191 | my ($gdbmfile) = @_;
|
---|
192 | my ($class);
|
---|
193 |
|
---|
194 | open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
|
---|
195 |
|
---|
196 | print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2;
|
---|
197 | my $count = 0;
|
---|
198 |
|
---|
199 | $/ = '-' x 70;
|
---|
200 | my $entry = "";
|
---|
201 | while (defined ($entry = <DB2TXT>)) {
|
---|
202 | next unless $entry =~ /\w/;
|
---|
203 | $entry =~ s/\n+/\\n/g;
|
---|
204 | my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;
|
---|
205 |
|
---|
206 | print STDERR "." if (($count % 100) == 99 && $verbosity >= 2);
|
---|
207 |
|
---|
208 | next if ($key !~ /\./); # ignore the docnums and top level of classifications
|
---|
209 |
|
---|
210 | die "Badly formatted key $key in $gdbmfile\n\n"
|
---|
211 | unless (($class, $section) = $key =~ /^(.)\.(.*)$/);
|
---|
212 | $classinfo->{$class} = {} unless defined $classinfo->{$class};
|
---|
213 | my @sections = split /\./, $section;
|
---|
214 |
|
---|
215 | my $hashptr = $classinfo->{$class};
|
---|
216 | foreach $sec (@sections) {
|
---|
217 | $hashptr->{$sec} = {} unless defined $hashptr->{$sec};
|
---|
218 | $hashptr = $hashptr->{$sec};
|
---|
219 | }
|
---|
220 | &get_metadata ($key, \$value, $hashptr);
|
---|
221 |
|
---|
222 | $count ++;
|
---|
223 | }
|
---|
224 |
|
---|
225 | $/ = "\n";
|
---|
226 | print STDERR "\n" if $verbosity >= 2;
|
---|
227 | }
|
---|
228 |
|
---|
229 | sub get_metadata {
|
---|
230 | my ($key, $gdb_str_ref, $hashsection) = @_;
|
---|
231 |
|
---|
232 | my ($title) = $$gdb_str_ref =~ /(?:<t>|<title>)(.*?)(?:\\n|$)/i;
|
---|
233 | my ($docnum) = $$gdb_str_ref =~ /(?:<d>|<docnum>)(.*?)(?:\\n|$)/i;
|
---|
234 | my ($jobnum) = $$gdb_str_ref =~ /(?:<j>|<jobnumber>)(.*?)(?:\\n|$)/i;
|
---|
235 | my ($classifications) = $$gdb_str_ref =~ /(?:<x>|<classification>)(.*?)(?:\\n|$)/i;
|
---|
236 | my $hastext = 1;
|
---|
237 | $hastext = 0 if ($$gdb_str_ref =~ /(?:<c>|<contains>)(.*?)(?:\\n|$)/i);
|
---|
238 | my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n|$)/i;
|
---|
239 | my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n|$)/i;
|
---|
240 |
|
---|
241 | # just in case there are empty classifications
|
---|
242 | if ($hastext && !defined $docnum) {
|
---|
243 | print STDERR "\nwarning: $key entry has no contents\n" if $verbosity;
|
---|
244 | if ($verbosity >= 2) {
|
---|
245 | my $valuestr = $$gdb_str_ref;
|
---|
246 | $valuestr =~ s/\\n/\n/g;
|
---|
247 | print STDERR "$valuestr\n";
|
---|
248 | }
|
---|
249 | $hastext = 0;
|
---|
250 | }
|
---|
251 |
|
---|
252 | # shove metadata in 9999999 to keep it numeric and prevent
|
---|
253 | # sorting from being a pain. Watch out for documents with
|
---|
254 | # 9999999 subsections ;-)
|
---|
255 | if (defined $hashsection->{'9999999'}) {
|
---|
256 | print STDERR "\nwarning: $key appears multiple times\n" if $verbosity;
|
---|
257 | } else {
|
---|
258 | $hashsection->{'9999999'}->{'Title'} = $title if defined $title;
|
---|
259 | $hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum;
|
---|
260 | $hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum;
|
---|
261 | $hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications;
|
---|
262 | $hashsection->{'9999999'}->{'hastext'} = $hastext;
|
---|
263 | $hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator;
|
---|
264 | $hashsection->{'9999999'}->{'Date'} = $date if defined $date;
|
---|
265 | }
|
---|
266 | }
|
---|
267 |
|
---|
268 | sub recurse_classinfo {
|
---|
269 | my ($doc_obj, $hashsection, $cursection, $class) = @_;
|
---|
270 | foreach $section (sort numerically keys %$hashsection) {
|
---|
271 | next if $section == 9999999;
|
---|
272 | if ($class) {
|
---|
273 | $doc_obj->create_named_section("$class.$section");
|
---|
274 | &add_section_content ($doc_obj, "$class.$section", $hashsection->{$section});
|
---|
275 | &recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section");
|
---|
276 | } else {
|
---|
277 | $cursection =
|
---|
278 | $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection)));
|
---|
279 | &add_section_content ($doc_obj, $cursection, $hashsection->{$section});
|
---|
280 | &recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection));
|
---|
281 | }
|
---|
282 | }
|
---|
283 | }
|
---|
284 |
|
---|
285 | sub add_section_content {
|
---|
286 | my ($doc_obj, $cursection, $hashsection) = @_;
|
---|
287 |
|
---|
288 | $doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'})
|
---|
289 | if defined $hashsection->{'9999999'}->{'Title'};
|
---|
290 | $doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'})
|
---|
291 | if defined $hashsection->{'9999999'}->{'Creator'};
|
---|
292 | $doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'})
|
---|
293 | if defined $hashsection->{'9999999'}->{'Date'};
|
---|
294 |
|
---|
295 | if (defined $hashsection->{'9999999'}->{'classifications'}) {
|
---|
296 | my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'};
|
---|
297 | map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications;
|
---|
298 | }
|
---|
299 |
|
---|
300 | $doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'}))
|
---|
301 | if ($hashsection->{'9999999'}->{'hastext'});
|
---|
302 | }
|
---|
303 |
|
---|
304 | sub set_index {
|
---|
305 | # check that $collection has been set
|
---|
306 | die "collection global was not set\n"
|
---|
307 | unless defined $collection && $collection =~ /\w/;
|
---|
308 |
|
---|
309 | # find an index (just use first non-text directory we come across in $indexdir)
|
---|
310 | opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
|
---|
311 | my @indexes = readdir INDEXDIR;
|
---|
312 | close INDEXDIR;
|
---|
313 | foreach $i (@indexes) {
|
---|
314 | next if $i =~ /text$/i || $i =~ /\./ ;
|
---|
315 | $index = &util::filename_cat ($i, $collection);
|
---|
316 | last;
|
---|
317 | }
|
---|
318 | }
|
---|
319 |
|
---|
320 | sub get_text {
|
---|
321 | my ($docnum) = @_;
|
---|
322 |
|
---|
323 | print STDERR "." if $verbosity >= 2;
|
---|
324 | &mgcommand ($docnum);
|
---|
325 |
|
---|
326 | <$mgread>; # eat the document separator
|
---|
327 |
|
---|
328 | my $text = "";
|
---|
329 | my $line = "";
|
---|
330 |
|
---|
331 | while (defined ($line = <$mgread>))
|
---|
332 | {
|
---|
333 | last if $line =~ /^<\/mg>/;
|
---|
334 | $text .= $line;
|
---|
335 | }
|
---|
336 |
|
---|
337 | # Read in the last statement, which should be:
|
---|
338 | # "dd documents retrieved."
|
---|
339 | <$mgread>;
|
---|
340 |
|
---|
341 | return $text;
|
---|
342 | }
|
---|
343 |
|
---|
344 | sub numerically {$a <=> $b;}
|
---|
345 |
|
---|
346 | # converts leading letter of a classification into its ascii equivalent
|
---|
347 | # i.e C.2.4 becomes 67.2.4
|
---|
348 | sub int_classification {
|
---|
349 | my ($classification) = @_;
|
---|
350 | my $c = ord($classification);
|
---|
351 | $classification =~ s/^./$c/;
|
---|
352 |
|
---|
353 | return $classification;
|
---|
354 | }
|
---|
355 |
|
---|
356 |
|
---|
357 | # mg stuff
|
---|
358 |
|
---|
359 | sub openmg {
|
---|
360 |
|
---|
361 | die "Unable to start mgquery." unless
|
---|
362 | openpipe($mgread, $mgwrite,
|
---|
363 | "mgquery -d $indexdir -f $index -t $textdir");
|
---|
364 |
|
---|
365 | $mgwrite->autoflush();
|
---|
366 |
|
---|
367 | &mgcommand('.set expert true');
|
---|
368 | &mgcommand('.set terminator "</mg>\n"');
|
---|
369 | &mgcommand('.set mode text');
|
---|
370 | &mgcommand('.set query docnums');
|
---|
371 | &mgcommand('.set term_freq off');
|
---|
372 | }
|
---|
373 |
|
---|
374 | sub closemg {
|
---|
375 | &mgcommand (".quit");
|
---|
376 | close($mgread);
|
---|
377 | close($mgwrite);
|
---|
378 | }
|
---|
379 |
|
---|
380 | sub mgcommand {
|
---|
381 | my ($command) = @_;
|
---|
382 |
|
---|
383 | return if $command =~ /^\s*$/;
|
---|
384 | print $mgwrite "$command\n";
|
---|
385 |
|
---|
386 | # eat up the command executed which is echoed
|
---|
387 | <$mgread>;
|
---|
388 | }
|
---|
389 |
|
---|
390 | # openpipe(READ, WRITE, CMD)
|
---|
391 | #
|
---|
392 | # Like open2, except CMD's stderr is also redirected.
|
---|
393 | #
|
---|
394 | sub openpipe
|
---|
395 | {
|
---|
396 | my ($read, $write, $cmd) = @_;
|
---|
397 | my ($child_read, $child_write);
|
---|
398 |
|
---|
399 | $child_read = ++$FileHandle;
|
---|
400 | $child_write = ++$FileHandle;
|
---|
401 |
|
---|
402 | pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
|
---|
403 | pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
|
---|
404 | my $pid;
|
---|
405 |
|
---|
406 | if (($pid = fork) < 0) {
|
---|
407 | die "Failed fork: $!";
|
---|
408 | } elsif ($pid == 0) {
|
---|
409 | close($read);
|
---|
410 | close($write);
|
---|
411 | open(STDIN, "<&$child_read");
|
---|
412 | open(STDOUT, ">&$child_write");
|
---|
413 | open(STDERR, ">&$child_write");
|
---|
414 | exec($cmd);
|
---|
415 | die "Failed exec $cmd: $!";
|
---|
416 | }
|
---|
417 |
|
---|
418 | close($child_read);
|
---|
419 | close($child_write);
|
---|
420 |
|
---|
421 | $write->autoflush();
|
---|
422 | $read->autoflush();
|
---|
423 |
|
---|
424 | return 1;
|
---|
425 | }
|
---|