#!/usr/local/bin/perl5 -w ########################################################################### # # unindex.pl -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # this program will decompress all the text from a built index # and return it to gml format - may be slightly broken at present BEGIN { die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'}; unshift (@INC, "$ENV{'GSDLHOME'}/perllib"); $FileHandle = 'FH000'; } use doc; use docsave; use util; use parsargv; use GDBM_File; use FileHandle; use English; select STDERR; $| = 1; select STDOUT; $| = 1; # globals $collection = ""; $index = ""; $textdir = ""; $classinfo = {}; $mgread = ++$FileHandle; $mgwrite = ++$FileHandle; sub print_usage { print STDERR "\n usage: $0 [options]\n\n"; print STDERR " options:\n"; print STDERR " -verbosity number 0=none, 3=lots\n"; print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n"; print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives\n"; print STDERR " -removeold Will remove the old contents of the archives\n"; print STDERR " directory -- use with care\n\n"; } &main (); sub main { if (!parsargv::parse(\@ARGV, 'verbosity/\d+/2', \$verbosity, 'indexdir/.*/index', \$indexdir, 'archivedir/.*/archives', \$archivedir, 'removeold', \$removeold)) { &print_usage(); die "\n"; } die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir); $indexdir =~ s/\/$//; if (-d $archivedir) { if ($removeold) { print STDERR "Warning - removing current contents of the archives directory $archivedir\n"; print STDERR " in preparation for the import\n"; sleep(5); # just in case... &util::rm_r ($archivedir); } } else { &util::mk_all_dir ($archivedir); } my $gdbmfile = &get_gdbmfile ($indexdir); &set_index (); &get_classinfo ($gdbmfile); &openmg (); # read the archive information file my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf"); my $archive_info = new arcinfo (); $archive_info->load_info ($archive_info_filename); # create a docsave object to process the documents my $processor = new docsave ("", $archive_info, $verbosity); $processor->setarchivedir ($archivedir); # create a document object for the classification file my $clsudoc_obj = new doc($gdbmfile, "classification"); $clsudoc_obj->set_OID ("CLSU"); my ($doc_obj); foreach $classification (keys %$classinfo) { my $count = 0; foreach $section (sort numerically keys %{$classinfo->{$classification}}) { print STDERR "\n$classification.$section" if $verbosity >= 2; if ($classification =~ /^B$/i) { last if $count > 10; # create a new document $doc_obj = new doc ($classinfo->{$classification}->{$section}->{'9999999'}->{'jobnum'}, "indexed_doc"); $doc_obj->set_OID(); my $cursection = $doc_obj->get_top_section(); &add_section_content ($doc_obj, $cursection, $classinfo->{$classification}->{$section}); &recurse_classinfo ($doc_obj, $classinfo->{$classification}->{$section}, $doc_obj->get_end_child($cursection), 0); $processor->process($doc_obj); } else { my $classifier = &int_classification ($classification); $clsudoc_obj->create_named_section("$classifier.$section"); &add_section_content ($clsudoc_obj, "$classifier.$section", $classinfo->{$classification}->{$section}); &recurse_classinfo ($clsudoc_obj, $classinfo->{$classification}->{$section}, "", "$classifier.$section"); } $count ++; } } $processor->process($clsudoc_obj); &closemg(); # write out the archive information file $archive_info->save_info($archive_info_filename); } # returns the path to the gdbm info database - also # sets the $collection and $textdir global variable sub get_gdbmfile { my ($indexdir) = @_; my ($gdbmfile); opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n"; my @conts = readdir DIR; close DIR; foreach $file (@conts) { if ($file =~ /text$/) { $textdir = $file; last; } } die "No text directory found in $indexdir\n\n" unless defined $textdir && $textdir =~ /text$/; $gdbmfile = &util::filename_cat ($indexdir, $textdir); opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n"; @conts = readdir DIR; close DIR; foreach $file (@conts) { if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) { $collection = $1; $gdbmfile = &util::filename_cat ($gdbmfile, $file); last; } } if (defined $collection && $collection =~ /\w/) { $textdir = &util::filename_cat ($textdir, $collection); } else { die "collection global wasn't set\n"; } return $gdbmfile if (-e $gdbmfile); die "Couldn't find gdbm info database in $indexdir\n\n"; } sub get_classinfo { my ($gdbmfile) = @_; my ($class); open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n"; print STDERR "Generating classification table from $gdbmfile\n" if $verbosity >= 2; my $count = 0; $/ = '-' x 70; my $entry = ""; while (defined ($entry = )) { next unless $entry =~ /\w/; $entry =~ s/\n+/\\n/g; my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/; print STDERR "." if (($count % 100) == 99 && $verbosity >= 2); next if ($key !~ /\./); # ignore the docnums and top level of classifications die "Badly formatted key $key in $gdbmfile\n\n" unless (($class, $section) = $key =~ /^(.)\.(.*)$/); $classinfo->{$class} = {} unless defined $classinfo->{$class}; my @sections = split /\./, $section; my $hashptr = $classinfo->{$class}; foreach $sec (@sections) { $hashptr->{$sec} = {} unless defined $hashptr->{$sec}; $hashptr = $hashptr->{$sec}; } &get_metadata ($key, \$value, $hashptr); $count ++; } $/ = "\n"; print STDERR "\n" if $verbosity >= 2; } sub get_metadata { my ($key, $gdb_str_ref, $hashsection) = @_; my ($title) = $$gdb_str_ref =~ /(?:|)(.*?)(?:\\n|$)/i; my ($docnum) = $$gdb_str_ref =~ /(?:<d>|<docnum>)(.*?)(?:\\n|$)/i; my ($jobnum) = $$gdb_str_ref =~ /(?:<j>|<jobnumber>)(.*?)(?:\\n|$)/i; my ($classifications) = $$gdb_str_ref =~ /(?:<x>|<classification>)(.*?)(?:\\n|$)/i; my $hastext = 1; $hastext = 0 if ($$gdb_str_ref =~ /(?:<c>|<contains>)(.*?)(?:\\n|$)/i); my ($creator) = $$gdb_str_ref =~ /<a>(.*?)(?:\\n|$)/i; my ($date) = $$gdb_str_ref =~ / <i>(.*?)(?:\\n|$)/i; # just in case there are empty classifications if ($hastext && !defined $docnum) { print STDERR "\nwarning: $key entry has no contents\n" if $verbosity; if ($verbosity >= 2) { my $valuestr = $$gdb_str_ref; $valuestr =~ s/\\n/\n/g; print STDERR "$valuestr\n"; } $hastext = 0; } # shove metadata in 9999999 to keep it numeric and prevent # sorting from being a pain. Watch out for documents with # 9999999 subsections ;-) if (defined $hashsection->{'9999999'}) { print STDERR "\nwarning: $key appears multiple times\n" if $verbosity; } else { $hashsection->{'9999999'}->{'Title'} = $title if defined $title; $hashsection->{'9999999'}->{'docnum'} = $docnum if defined $docnum; $hashsection->{'9999999'}->{'jobnum'} = $jobnum if defined $jobnum; $hashsection->{'9999999'}->{'classifications'} = $classifications if defined $classifications; $hashsection->{'9999999'}->{'hastext'} = $hastext; $hashsection->{'9999999'}->{'Creator'} = $creator if defined $creator; $hashsection->{'9999999'}->{'Date'} = $date if defined $date; } } sub recurse_classinfo { my ($doc_obj, $hashsection, $cursection, $class) = @_; foreach $section (sort numerically keys %$hashsection) { next if $section == 9999999; if ($class) { $doc_obj->create_named_section("$class.$section"); &add_section_content ($doc_obj, "$class.$section", $hashsection->{$section}); &recurse_classinfo ($doc_obj, $hashsection->{$section}, "", "$class.$section"); } else { $cursection = $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_parent_section($cursection))); &add_section_content ($doc_obj, $cursection, $hashsection->{$section}); &recurse_classinfo ($doc_obj, $hashsection->{$section}, $doc_obj->get_end_child($cursection)); } } } sub add_section_content { my ($doc_obj, $cursection, $hashsection) = @_; $doc_obj->add_metadata ($cursection, "Title", $hashsection->{'9999999'}->{'Title'}) if defined $hashsection->{'9999999'}->{'Title'}; $doc_obj->add_metadata ($cursection, "Creator", $hashsection->{'9999999'}->{'Creator'}) if defined $hashsection->{'9999999'}->{'Creator'}; $doc_obj->add_metadata ($cursection, "Date", $hashsection->{'9999999'}->{'Date'}) if defined $hashsection->{'9999999'}->{'Date'}; if (defined $hashsection->{'9999999'}->{'classifications'}) { my @classifications = split /:/, $hashsection->{'9999999'}->{'classifications'}; map {$doc_obj->add_metadata ($cursection, 'Subject', $_); } @classifications; } $doc_obj->add_text ($cursection, &get_text ($hashsection->{'9999999'}->{'docnum'})) if ($hashsection->{'9999999'}->{'hastext'}); } sub set_index { # check that $collection has been set die "collection global was not set\n" unless defined $collection && $collection =~ /\w/; # find an index (just use first non-text directory we come across in $indexdir) opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n"; my @indexes = readdir INDEXDIR; close INDEXDIR; foreach $i (@indexes) { next if $i =~ /text$/i || $i =~ /\./ ; $index = &util::filename_cat ($i, $collection); last; } } sub get_text { my ($docnum) = @_; print STDERR "." if $verbosity >= 2; &mgcommand ($docnum); <$mgread>; # eat the document separator my $text = ""; my $line = ""; while (defined ($line = <$mgread>)) { last if $line =~ /^<\/mg>/; $text .= $line; } # Read in the last statement, which should be: # "dd documents retrieved." <$mgread>; return $text; } sub numerically {$a <=> $b;} # converts leading letter of a classification into its ascii equivalent # i.e C.2.4 becomes 67.2.4 sub int_classification { my ($classification) = @_; my $c = ord($classification); $classification =~ s/^./$c/; return $classification; } # mg stuff sub openmg { die "Unable to start mgquery." unless openpipe($mgread, $mgwrite, "mgquery -d $indexdir -f $index -t $textdir"); $mgwrite->autoflush(); &mgcommand('.set expert true'); &mgcommand('.set terminator "</mg>\n"'); &mgcommand('.set mode text'); &mgcommand('.set query docnums'); &mgcommand('.set term_freq off'); } sub closemg { &mgcommand (".quit"); close($mgread); close($mgwrite); } sub mgcommand { my ($command) = @_; return if $command =~ /^\s*$/; print $mgwrite "$command\n"; # eat up the command executed which is echoed <$mgread>; } # openpipe(READ, WRITE, CMD) # # Like open2, except CMD's stderr is also redirected. # sub openpipe { my ($read, $write, $cmd) = @_; my ($child_read, $child_write); $child_read = ++$FileHandle; $child_write = ++$FileHandle; pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!"; pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!"; my $pid; if (($pid = fork) < 0) { die "Failed fork: $!"; } elsif ($pid == 0) { close($read); close($write); open(STDIN, "<&$child_read"); open(STDOUT, ">&$child_write"); open(STDERR, ">&$child_write"); exec($cmd); die "Failed exec $cmd: $!"; } close($child_read); close($child_write); $write->autoflush(); $read->autoflush(); return 1; }