#!/usr/bin/perl -w

###########################################################################
#
# unbuildv1.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# this program will decompress all the text from a built index
# and return it to gml format - this version of the program is designed to
# work on collections built with the first version of the greenstone software
# it extracts the browse classifications from the gdbm file, and recreates
# them. It also creates a default collect.cfg file.

# To run, this program needs the old archive directory (called archives),
# the index directory with the text and one other index, not paragraphs!! -
# use one of the section indexes. Also needs the build.cfg file (in the
# index dir) - used in creating a collect.cfg file. If build.cfg is not
# present, the parts of the collect.cfg file relating to indexes will not
# be entered.

# Stefan altered this script slightly August 2002 to use
# $GSDLHOME/bin/$GSDLOS/mgquery_old instead of the old mgquery in
# nzdl-1.2. mgquery_old is a statically linked binary compiled from
# mg-1.3d. It was compiled with SHORT_SUFFIX set so doesn't need the index
# file extensions to be altered.
# Also changed the command line options a little.
BEGIN {
    # Both environment variables must point at a Greenstone installation.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    # Seed for generating unique bareword filehandle names (FH001, FH002, ...).
    $FileHandle = 'FH000';
}

use doc;
use util;
use parsargv;
use FileHandle;
use English;
use cfgread;
use unicode;
use plugout;

# Unbuffer both output streams so progress dots and warnings interleave sanely.
select STDERR; $| = 1;
select STDOUT; $| = 1;

# globals
$collection = "";          # the collection name
$index = "";               # the selected index (like stt/unu)
$textdir = "";             # the textdir (like text/unu)
$toplevelinfo = [];        # list of top-level document OIDs
%infodb = ();              # in-memory copy of the GDBM file
$classifyinfo = [];        # list of classifications
$doc_classif_info = {};    # hash of OID -> classifications it belongs to
$collect_cfg = {};         # data for the configuration file
$mgread = ++$FileHandle;   # filehandle names for the mgquery pipe
$mgwrite = ++$FileHandle;

# Prints the command-line usage summary to STDERR.
sub print_usage {
    print STDERR "\n  usage: $0 [options]\n\n";
    print STDERR "  options:\n";
    print STDERR "   -verbosity number      0=none, 3=lots\n";
    print STDERR "   -old directory         The directory in which the old collection lives\n";
    print STDERR "   -new directory         The directory in which to put the new collection\n";
}

main();

# Drives the whole conversion: reads the v1 index/gdbm data, writes a new
# archives directory in GML form plus hierarchy files and a collect.cfg.
sub main {
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'old/.*/', \$old,
                         'new/.*/', \$new)) {
        &print_usage();
        die "\n";
    }

    die "$old does not exist\n\n" unless -d $old;

    # Source (v1) collection layout.
    my $old_indexdir = &util::filename_cat($old, "index");
    my $old_archivedir = &util::filename_cat($old, "archives");

    # Destination (rebuilt) collection layout.
    my $new_archivedir = &util::filename_cat($new, "archives");
    my $new_etcdir = &util::filename_cat($new, "etc");
    &util::mk_all_dir($new_archivedir) unless -e $new_archivedir;
    &util::mk_all_dir($new_etcdir) unless -d $new_etcdir;

    my $gdbmfile = &get_gdbmfile($old_indexdir); # sets $collection and $textdir
    &set_index($old_indexdir);                   # sets $index (just chooses one index)

    my $buildcfgfile = &util::filename_cat($old_indexdir, "build.cfg");
    my $colcfgfile = &util::filename_cat($new_etcdir, "collect.cfg");

    # Don't need this, but giving it a value stops an annoying
    # undefined-value warning.
    $ENV{'GSDLCOLLECTDIR'} = 'tmp';

    &add_default_cfg();
    &add_index_cfg($buildcfgfile);

    # Work out all the classifications from the gdbm file; which
    # classifications each doc belongs to ends up in $doc_classif_info.
    &get_classifyinfo($gdbmfile);  # fills @$classifyinfo
    &get_toplevel_OID($gdbmfile);  # fills @$toplevelinfo

    # Read the ldb file into %infodb.
    &read_gdbm($gdbmfile);

    # Write the files specifying the hierarchy of subjects, titles etc.
    foreach my $classify (@$classifyinfo) {
        &make_info_file($new_etcdir, $classify);
    }

    # Write out the collect.cfg.
    &output_cfg_file($colcfgfile);

    &openmg($old_indexdir);

    # The archive information file for the rebuilt collection.
    my $archive_info_filename = &util::filename_cat ($new_archivedir, "archives.inf");
    my $archive_info = arcinfo->new();

    my $opts = [];
    push @$opts, ("-output_info", $archive_info);

    $processor = &plugout::load_plugout("GreenstoneXMLPlugout", $opts);
    $processor->setoutputdir ($new_archivedir);

    my ($doc_obj, $hashref, $children);
    print STDERR "processing documents now\n" if $verbosity >= 2;
    foreach $oid (@$toplevelinfo) {
        $value = $infodb{$oid};
        $hashref = {};
        $children = [];
        &get_metadata($value, $hashref);
        $doc_obj = new doc ();
        $doc_obj->set_OID($oid);
        # Old dir for this doc, where its images are stored.
        my ($olddir) = $hashref->{'archivedir'};
        $top = $doc_obj->get_top_section();
        &add_section_content ($doc_obj, $top, $hashref, $olddir);
        &add_classification_metadata($oid, $doc_obj, $top);
        &add_cover_image($doc_obj, $old_archivedir, $olddir);
        &get_children($hashref, $children);
        &recurse_sections($doc_obj, $children, $oid, $top, $olddir)
            if defined $children;

        # Change OID to the new (hash-based) format before output.
        $doc_obj->delete_OID();
        $doc_obj->set_OID();
        $processor->process($doc_obj);
    }
    print STDERR "\n";

    &closemg();

    # Write out the archive information file.
    $archive_info->save_info($archive_info_filename);
}

# Returns the path to the gdbm info database.  Also sets the $collection
# and $textdir global variables as a side effect.
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    # Find the text directory (a name ending in "text") inside the index dir.
    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    closedir DIR;   # was close(); dirhandles need closedir

    foreach my $file (@conts) {
        if ($file =~ /text$/) {
            $textdir = $file;
            last;
        }
    }
    die "No text directory found in $indexdir\n\n"
        unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    # Find the database file (<collection>.ldb or <collection>.bdb) inside it.
    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    closedir DIR;   # was close(); dirhandles need closedir

    foreach my $file (@conts) {
        if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
            $collection = $1;
            $gdbmfile = &util::filename_cat ($gdbmfile, $file);
            last;
        }
    }

    if (defined $collection && $collection =~ /\w/) {
        $textdir = &util::filename_cat ($textdir, $collection);
    } else {
        die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}


# Scans the gdbm dump for top-level document OIDs (keys like "B.12") and
# pushes them onto the global @$toplevelinfo list.
# This function will probably need to be changed for any different file format.
sub get_toplevel_OID {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all top level sections from $gdbmfile\n" if $verbosity >= 2;

    # db2txt separates records with a line of 70 dashes.
    $/ = '-' x 70;
    while (defined (my $entry = <DB2TXT>)) {
        next unless $entry =~ /\w/;   # ignore blank entries
        $entry =~ s/\n+/\\n/g;        # collapse newlines to literal "\n"
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        # Assumes a top level OID looks like "B.12".
        next unless $key =~ /B\.\d*$/;

        push (@$toplevelinfo, $key);
    }

    $/ = "\n";
}


# Parses one gdbm entry (a string of "<key>value" lines) into %$hashref.
# Old-style single-letter keys are mapped to their long names (t -> Title,
# d -> docnum, o -> archivedir, c -> contains); "p" and "j" entries are
# dropped.  Also sets $hashref->{'hastxt'} to 0 when the entry has a
# "contains" list (i.e. it is an internal node with no text of its own).
sub get_metadata {

    my ($gdb_str_ref, $hashref) = @_;
    my @entries = split(/\n/, $gdb_str_ref);

    my $hastext = 1;
    foreach my $entry (@entries) {

        my ($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/);

        # Lines not in "<key>value" form used to leave $key undefined,
        # producing warnings and a bogus '' hash key - skip them.
        next unless defined $key;

        next if ($key eq "p");
        next if ($key eq "j");

        $key = "Title" if $key eq "t";
        $key = "docnum" if $key eq "d";
        if ($key eq "o") {
            $key = "archivedir";
            # Doc dir names are upper case in the file, lower case on disk.
            $value =~ s/HASH/hash/;
        }
        if ($key eq "c") {
            $key = "contains";
            $hastext = 0;
        }

        $$hashref{$key} .= $value;
    }

    $$hashref{'hastxt'} = $hastext;
}


# Takes a hashref containing the metadata for a gdbm entry and extracts the
# children's final numbers from the 'contains' entry.  Assumes the format
# is like "B.14.1:B.24.2:B.24.3" and pushes "1", "2", "3" onto @$children.
# Leaves @$children empty when there is no 'contains' entry.
sub get_children {
    my ($hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    return unless defined $childs;   # no children: caller's list stays empty

    # was a package global (@xchildren); now lexical
    my @child_ids = split /\:/, $childs;
    foreach my $kid (@child_ids) {
        # Keep only the last dot-separated component (up to a literal "\n").
        my ($child) = $kid =~ /\.([^\.]*)(?:\\n|$)/;
        push (@$children, $child);
    }
}


# Takes a hashref containing the metadata for a gdbm entry and splits the
# 'contains' entry (format "B.14.1:B.24.2:B.24.3") into the FULL child
# names, e.g. ("B.14.1", "B.24.2", ...), stored into @$children.
# Used for the classification hierarchy.
sub get_whole_children {

    my ($hashref, $children) = @_;

    my $contained = $hashref->{'contains'};
    if (defined $contained) {
        @$children = split /\:/, $contained;
    }
    # otherwise @$children is left as the caller passed it (empty list)
}

# Recursively builds the section tree of $doc_obj: for each child number,
# creates the named section, attaches its metadata/text, then descends.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $olddir) = @_;

    foreach my $child (sort numerically @$children) {
        $doc_obj->create_named_section("$parentsection.$child");
        my $value = $infodb{"$parentoid.$child"};
        my $hashref = {};
        &get_metadata($value, $hashref);          # child's metadata
        my $newchildren = [];
        &get_children($hashref, $newchildren);    # child's children
        # Add content for the current section.
        &add_section_content($doc_obj, "$parentsection.$child", $hashref, $olddir);
        # Then process the grandchildren, if any.
        &recurse_sections($doc_obj, $newchildren, "$parentoid.$child",
                          "$parentsection.$child", $olddir)
            if defined $newchildren;
    }
}


# Attaches the metadata, text and referenced images of one gdbm entry to
# section $cursection of $doc_obj.
sub add_section_content {
    my ($doc_obj, $cursection, $hashref, $olddir) = @_;

    foreach my $meta_key (keys %$hashref) {
        # These entries are structural, not metadata to be stored.
        next if $meta_key =~ /(contains|docnum|hastxt|doctype|archivedir|classifytype)/i;
        foreach my $item (split /@/, $hashref->{$meta_key}) {
            $doc_obj->add_metadata ($cursection, $meta_key, $item);
        }
    }

    my ($hastext) = $hashref->{'hastxt'} =~ /(0|1)/;
    return unless $hastext;

    # Pull the section text out of the mg text index by document number.
    my ($docnum) = $hashref->{'docnum'} =~ /(\d*)/;
    my $text = &get_text($docnum);
    $doc_obj->add_text ($cursection, $text);

    # Associate every image the text refers to.
    my (@images) = $text =~ /<img.*?src=\"([^\"]*)\"[^>]*>/g;
    foreach my $img (@images) {
        my ($assoc_file) = $img =~ /([^\/\\]*\..*)$/;   # the image's file name
        $img =~ s/_httpcollection_/$old/;  # _httpcollection_ -> old collection path
        $olddir =~ s/HASH/hash/;
        $img =~ s/_thisOID_/$olddir/;      # _thisOID_ -> old archivedir name
        # Doc names and paths are upper case in the file, lower case on disk.
        $img = lc $img;
        if (-e $img) {
            $doc_obj->associate_file($img, $assoc_file);
        } else {
            print STDERR "WARNING: $img does not exist\n";
        }
    }
}

# Copies the classification memberships recorded for $oid (in the global
# $doc_classif_info) onto section $cursection of $doc_obj as metadata.
sub add_classification_metadata {

    my ($oid, $doc_obj, $cursection) = @_;

    return unless defined $doc_classif_info->{$oid};

    my $classif = $doc_classif_info->{$oid};
    foreach my $classifier_name (keys %$classif) {
        foreach my $id (@{$classif->{$classifier_name}}) {
            $doc_obj->add_metadata ($cursection, $classifier_name, $id);
        }
    }
}
# Picks up the cover image "cover.jpg" from the old archives directory
# and associates it with the document.
sub add_cover_image {
    my ($doc_obj, $archivedir, $olddir) = @_;

    my $assoc_file = "cover.jpg";
    my $img = &util::filename_cat($archivedir, $olddir, $assoc_file);

    if (-e $img) {
        $doc_obj->associate_file($img, $assoc_file);
    } else {
        print STDERR "WARNING: couldn't find $img\n";
    }
}


# Chooses an index to use and stores it in the $index global: the first
# directory in $indexdir that is not the text directory, not a dotted name,
# and not a paragraph index (those start with "p").
sub set_index {
    my ($indexdir) = @_;

    # $collection must already have been set (by get_gdbmfile).
    die "collection global was not set\n"
        unless defined $collection && $collection =~ /\w/;

    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    closedir INDEXDIR;   # was close(); dirhandles need closedir
    foreach my $i (@indexes) {
        next if $i =~ /text$/i || $i =~ /\./ || $i =~ /^p/i;
        $index = &util::filename_cat ($i, $collection);
        last;
    }
}


#########################################################################

################ functions involving mg ################################

# Retrieves the decompressed text of document $docnum via the mgquery pipe.
sub get_text {
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>;   # eat the document separator

    my $text = "";
    while (my $line = <$mgread>) {
        # The terminator string is configured in openmg().
        last if $line =~ /^<\/mg>/;
        $text .= $line;
    }

    # Read in the last statement, which should be:
    # "dd documents retrieved."
    <$mgread>;

    return $text;
}


# Numeric sort comparator for use as "sort numerically @list".
sub numerically { $a <=> $b }

# mg stuff

# Starts an mgquery_old child talking to the chosen index and configures
# it for raw text retrieval by document number.
sub openmg {
    my ($indexdir) = @_;

    &openpipe($mgread, $mgwrite, "mgquery_old -d $indexdir -f $index -t $textdir")
        or die "Unable to start mgquery.";

    $mgwrite->autoflush();
    &mgcommand('.set expert true');
    &mgcommand('.set terminator "</mg>\n"');
    &mgcommand('.set mode text');
    &mgcommand('.set query docnums');
    &mgcommand('.set term_freq off');
}


# Tells the mgquery child to quit and closes both pipe ends.
sub closemg {
    &mgcommand (".quit");
    close($mgread);
    close($mgwrite);
}


# Sends one command line to the mgquery child and consumes its echo.
sub mgcommand {
    my ($command) = @_;

    # Ignore blank / whitespace-only commands.
    return if $command =~ /^\s*$/;
    print $mgwrite "$command\n";

    # mgquery echoes the command it executed; eat that line.
    <$mgread>;
}


# openpipe(READ, WRITE, CMD)
#
# Like open2, except CMD's stderr is also redirected to the write side.
# READ and WRITE are bareword filehandle names generated from $FileHandle.
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    # Generate two fresh filehandle names for the child's pipe ends.
    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";

    # Perl's fork returns undef on failure (not a negative value as in C).
    # The old "($pid = fork) < 0" test never fired, so a failed fork fell
    # through to the "child" branch and exec'd in the parent.
    my $pid = fork;
    if (!defined $pid) {
        die "Failed fork: $!";
    } elsif ($pid == 0) {
        # Child: wire the pipe ends to stdin/stdout/stderr, then exec.
        close($read);
        close($write);
        open(STDIN, "<&$child_read");
        open(STDOUT, ">&$child_write");
        open(STDERR, ">&$child_write");
        exec($cmd);
        die "Failed exec $cmd: $!";
    }

    # Parent: close the child's ends and unbuffer our side of the pipes.
    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}


######################################################################

############# functions to do with the classification stuff ##########

# Scans the gdbm dump for classification OIDs (single capital letters like
# "C" or "T") and pushes them onto the global @$classifyinfo list.
sub get_classifyinfo {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all classification sections from $gdbmfile\n" ;

    # db2txt separates records with a line of 70 dashes.
    $/ = '-' x 70;
    while (defined (my $entry = <DB2TXT>)) {
        next unless $entry =~ /\w/;   # ignore blank entries
        $entry =~ s/\n+/\\n/g;        # collapse newlines to literal "\n"
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        # Assumes a classification OID is a single capital letter (C, T, ...).
        next unless $key =~ /^[A-Z]$/;

        push (@$classifyinfo, $key);
    }

    $/ = "\n";
}


# Creates the classification file needed for the Hierarchy classifier
# (used for subjects, titles, orgs etc) and registers it in the
# collect_cfg hash.
sub make_info_file {
    my ($etcdir, $classifier) = @_;

    my $info_file = &util::filename_cat($etcdir, "CL.${classifier}.txt");
    my $classifier_name = "CL.$classifier";
    print STDERR "classification $classifier will be called CL.$classifier\n";

    open (OUTDOC, ">$info_file" ) || die "couldn't open file $info_file\n";

    my $hashref = {};
    &get_metadata($infodb{$classifier}, $hashref);
    my $children = [];
    &get_whole_children($hashref, $children);
    foreach my $child (@$children) {
        &process_entry(OUTDOC, $classifier_name, $child);
    }

    close OUTDOC;

    &add_classify_cfg($classifier, $classifier_name, $info_file);
}


# Writes one classification node to the hierarchy file, records which
# documents belong to it, and recurses into sub-classifications.
sub process_entry {
    my ($handle, $classifier_name, $classify_id) = @_;

    my $hashref = {};
    &get_metadata($infodb{$classify_id}, $hashref);

    &add_line($handle, $classify_id, $hashref->{'Title'});

    my $children = [];
    &get_whole_children($hashref, $children);
    foreach my $child (@$children) {
        if (&is_document($child)) {
            # Leaf: a document - record its classification membership.
            &add_doc_metadata($child, $classifier_name, $classify_id);
        } else {
            # Interior node: another classification level.
            &process_entry($handle, $classifier_name, $child);
        }
    }
}


# Records in the $doc_classif_info global that document $doc_id belongs to
# classification $classifier_id under classifier $classifier_name.
sub add_doc_metadata {
    my ($doc_id, $classifier_name, $classifier_id) = @_;

    $doc_classif_info->{$doc_id} = {}
        unless defined $doc_classif_info->{$doc_id};
    $doc_classif_info->{$doc_id}->{$classifier_name} = []
        unless defined $doc_classif_info->{$doc_id}->{$classifier_name};
    push (@{$doc_classif_info->{$doc_id}->{$classifier_name}}, $classifier_id);
}


# Writes one hierarchy-file line: "<id>\t<number>\t\"<title>\"".
sub add_line {
    my ($handle, $classify_id, $title) = @_;
    $title = &unicode::ascii2utf8(\$title);
    # Strip the single-letter prefix (e.g. "C.") to get the bare number.
    my ($num) = $classify_id =~ /^[A-Z]\.(.*)$/;
    print $handle "$classify_id\t$num\t\"$title\"\n";
}


# True iff $oid names a document (like "B.12") rather than a classification.
sub is_document {
    my ($oid) = @_;
    return ($oid =~ /^B\.\d/) ? 1 : 0;
}


########################################################################

########## stuff for producing collect.cfg file ###########################

# Fills the $collect_cfg global with default configuration values
# (creator/maintainer from the current user, plugin list, display formats
# and collection metadata).
sub add_default_cfg {

    # Backticks return "" (not undef) when the command produces no output,
    # so the old "unless defined" tests never fired; test for real content.
    $username = `whoami`;
    $username = `logname` unless defined $username && $username =~ /\S/;
    $username = "a_user" unless defined $username && $username =~ /\S/;
    $username =~ s/\n//;
    $collect_cfg->{'creator'} = "$username\@cs.waikato.ac.nz";
    $collect_cfg->{'maintainer'} = "$username\@cs.waikato.ac.nz";
    $collect_cfg->{'public'} = "true";

    $collect_cfg->{'plugin'} = [];
    push (@{$collect_cfg->{'plugin'}}, ["GreenstoneXMLPlugin"]);
    push (@{$collect_cfg->{'plugin'}}, ["ArchivesInfPlugin"]);
    push (@{$collect_cfg->{'plugin'}}, ["DirectoryPlugin"]);

    $collect_cfg->{'format'} = {};
    $collect_cfg->{'format'}->{'DocumentImages'} = "true";
    $collect_cfg->{'format'}->{'DocumentText'} =
        "\"<h3>[Title]</h3>\\\\n\\\\n<p>[Text]\"";
    $collect_cfg->{'format'}->{'SearchVList'} =
        "\"<td valign=top>[link][icon][/link]</td><td>{If}{[parent(All': '):Title],[parent(All': '):Title]:}[link][Title][/link]</td>\"";

    $collect_cfg->{'collectionmeta'} = {};
    $collect_cfg->{'collectionmeta'}->{'collectionname'} = "\"$collection\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollection'} = "\"_httpprefix_/collect/$collection/images/$collection.gif\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollectionsmall'} = "\"_httpprefix_/collect/$collection/images/${collection}sm.gif\"";
    $collect_cfg->{'collectionmeta'}->{'collectionextra'} = "\"This is a collection rebuilt from CDROM.\"";

}


# Reads the indexmap entries from the old build.cfg and mirrors them into
# $collect_cfg (indexes list, default index and collectionmeta names).
# If the build.cfg is missing, only an empty indexes list is recorded.
sub add_index_cfg {
    my ($buildfile) = @_;

    my $data = {};
    $collect_cfg->{'indexes'} = [];
    if (!-e $buildfile) {
        print STDERR "Couldn't read $buildfile, could not add index data to configuration file\n";
        return;
    }

    $data = &cfgread::read_cfg_file($buildfile, '^(this)$', '^(indexmap)$');
    foreach my $mapping (@{$data->{'indexmap'}}) {
        # Each mapping looks like "<index spec>-><abbreviation>".
        my ($thisindex, $abbrev) = split (/\-\>/, $mapping);
        push (@{$collect_cfg->{'indexes'}}, $thisindex);
        # The first index seen becomes the default.
        $collect_cfg->{'defaultindex'} = $thisindex
            unless defined $collect_cfg->{'defaultindex'};
        my $name = &get_index_name($thisindex);
        $collect_cfg->{'collectionmeta'}->{".$thisindex"} = "\"$name\"";
    }
}


# Maps an index specifier from build.cfg to a human-readable name.
# Order matters: "paragraph" is checked before the section/text pattern.
sub get_index_name {
    my ($thisindex) = @_;
    my @rules = (
        [ qr/paragraph/,     "paragraphs" ],
        [ qr/section.*text/, "chapters"   ],
        [ qr/Title/,         "titles"     ],
    );
    foreach my $rule (@rules) {
        return $rule->[1] if $thisindex =~ $rule->[0];
    }
    return "other";
}


# Adds a Hierarchy classifier entry for one classification to $collect_cfg.
# $classify is the single-letter OID, $metadata the classifier name
# (e.g. "CL.T"), $file the hierarchy file written by make_info_file.
sub add_classify_cfg {
    my ($classify, $metadata, $file) = @_;
    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    # Map the single-letter classification OID to a button name.
    my %button_for = (
        'H' => "Howto",
        'C' => "Subject",
        'O' => "Organization",
        'T' => "Title",
    );
    my $title = $button_for{$classify};
    $title = "Other" unless defined $title;

    my ($filename) = $file =~ /\/([^\/]*)$/;
    my $entry = "Hierarchy -hfile $filename -metadata $metadata -buttonname $title -sort Title";
    $entry .= " -hlist_at_top" if $title eq "Title";
    push (@{$collect_cfg->{'classify'}}, [$entry]);
}


# Writes the accumulated $collect_cfg data out as a collect.cfg file,
# grouping keys by how cfgread serialises them (scalars, lists, maps).
sub output_cfg_file {
    my ($collfile) = @_;
    &cfgread::write_cfg_file($collfile, $collect_cfg,
                             '^(creator|maintainer|public|beta|defaultindex)$',
                             '^(indexes)$', '^(format|collectionmeta)$',
                             '^(plugin|classify)$');
}


# Loads the whole gdbm database into the %infodb global by parsing db2txt
# output: "[key]" lines start a record, a line of 70 dashes ends it, and
# everything in between is the record's value.
sub read_gdbm {
    my ($filename) = @_;

    # NOTE(review): 2-arg open interpolates $filename into a shell command;
    # assumes the path contains no shell metacharacters beyond spaces.
    open (PIPEIN, "db2txt \"$filename\" |") || die "couldn't open pipe from db2txt\n";
    my $key = "";
    my $value = "";
    while (my $line = <PIPEIN>) {
        if ($line =~ /^\[([^\]]+)\]$/) {
            $key = $1;
        } elsif ($line =~ /^-{70}$/) {
            # End of record: commit and reset.
            $infodb{$key} = $value;
            $value = "";
            $key = "";
        } else {
            $value .= $line;
        }
    }
    close PIPEIN;
}