#!/usr/bin/perl -w

###########################################################################
#
# unbuildv2.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This program will decompress all the text from a built index
# and return it to GML format --
# this is based on the format current in Nov 1999 (version two).

# Stefan updated unbuildv1.pl in August 2002 but unbuildv2.pl was not
# updated. It probably needs some work done before using.

# Katherine updated this but apparently it still doesn't work.

BEGIN {
    # Fail early if the Greenstone environment is not set up, and make the
    # Greenstone perllib modules findable before the "use" statements below
    # are compiled.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # Seed for generating unique symbolic filehandle names ('FH001',
    # 'FH002', ...) via Perl's magical string auto-increment; used below
    # for the mgquery pipe handles and in openpipe().
    $FileHandle = 'FH000';
}
|
---|
| 43 |
|
---|
[15706] | 44 | use dbutil;
|
---|
[3324] | 45 | use doc;
|
---|
| 46 | use util;
|
---|
| 47 | use parsargv;
|
---|
| 48 | use GDBM_File;
|
---|
| 49 | use FileHandle;
|
---|
| 50 | use English;
|
---|
| 51 | use cfgread;
|
---|
| 52 | use unicode;
|
---|
[12355] | 53 | use plugout;
|
---|
[3324] | 54 |
|
---|
# Unbuffer STDERR and STDOUT so progress output interleaves correctly with
# output from the mgquery child process.
select STDERR; $| = 1;
select STDOUT; $| = 1;


# globals (this script predates "use strict"; these are package variables)
$collection = "";        # the collection name
$index = "";             # the selected index (like stt/unu)
$textdir = "";           # the textdir (like text/unu)
$toplevelinfo = [];      # list of document OIDs
%infodb = ();            # hash into GDBM file
$classifyinfo = [];      # list of classifications
$doc_classif_info = {};  # hash of OIDs -> classifications they belong to
$collect_cfg = {};       # data for the configuration file

# Two symbolic filehandle names ('FH001', 'FH002') for the bidirectional
# pipe to mgquery; $FileHandle itself is initialised in the BEGIN block.
$mgread = ++$FileHandle;
$mgwrite = ++$FileHandle;
|
---|
| 71 |
|
---|
| 72 |
|
---|
| 73 |
|
---|
# Print command-line help for this script to STDERR.
sub print_usage {
    print STDERR "\n usage: $0 [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -verbosity number 0=none, 3=lots\n";
    print STDERR " -indexdir directory The index to be decompressed (defaults to ./index)\n";
    # Fix: the original message was missing the closing parenthesis after
    # "./archives.new".
    print STDERR " -archivedir directory Where the converted material ends up (defaults to ./archives.new)\n";
    print STDERR " -removeold Will remove the old contents of the archives\n";
    print STDERR " directory -- use with care\n\n";
}
|
---|
| 83 |
|
---|
# Entry point.  (Was "&main ();" -- the leading "&" call syntax is an
# anti-pattern; with parentheses it changes nothing here, so drop it.)
main();
|
---|
| 85 |
|
---|
# Drives the whole un-build: parses options, reconstructs a collect.cfg,
# rebuilds the classification hfiles, then walks every top-level document
# in the GDBM info database, re-inflating its text from the mg text index
# and handing the resulting doc object to a GreenstoneArchive plugout.
sub main {
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'indexdir/.*/index', \$indexdir,
                         'archivedir/.*/archives.new', \$archivedir,
                         'removeold', \$removeold)) {
        &print_usage();
        die "\n";
    }

    die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
    $indexdir =~ s/\/$//;   # normalise away a trailing slash

    if (-d $archivedir) {
        if ($removeold) {
            print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
            print STDERR " in preparation for the import\n";
            sleep(5); # just in case...
            &util::rm_r ($archivedir);
        }
    } else {
        &util::mk_all_dir ($archivedir);
    }

    $etcdir = "./etc";
    if (!(-d $etcdir)) {
        &util::mk_all_dir ($etcdir);
    }

    my $gdbmfile = &get_gdbmfile ($indexdir); # sets $collection and $textdir
    &set_index (); # sets $index (just chooses one index)

    my $buildcfgfile = &util::filename_cat($indexdir, "build.cfg");
    my $colcfgfile = &util::filename_cat($etcdir, "collect.cfg");

    &add_default_cfg();
    &add_index_cfg($buildcfgfile);

    # work out all the classifications from the gdbm file; info for each doc
    # (ie which classifications they belong to) is kept in $doc_classif_info
    &get_classifyinfo ($gdbmfile); # puts a list of classifications into $classifyinfo

    &get_toplevel_OID ($gdbmfile); # puts the top level document OIDs into $toplevelinfo

    # read the ldb file into %infodb
    &dbutil::read_infodb_file("gdbm", $gdbmfile, \%infodb);

    # this makes the files specifying the hierarchy of subjects, titles etc
    foreach $classify (@$classifyinfo) {
        &make_info_file($classify);
    }

    # write out the collect.cfg
    &output_cfg_file($colcfgfile);

    &openmg ();

    # read the archive information file
    my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
    # (was indirect-object "new arcinfo ()"; arrow syntax is unambiguous)
    my $archive_info = arcinfo->new ();

    my $opts = [];
    push @$opts,("-output_info",$archive_info);

    $processor = &plugout::load_plugout("GAPlugout",$opts);
    # Fix: was $new_archivedir, a variable never assigned anywhere in this
    # script -- the plugout output directory was always undef.
    $processor->setoutputdir ($archivedir);

    my ($doc_obj, $hashref, $children);
    print STDERR "processing documents now\n" if $verbosity >=2;
    foreach $oid (@$toplevelinfo) {
        $value = $infodb{$oid};
        $hashref={};
        $children = [];
        &get_metadata($value, $hashref);
        $doc_obj = doc->new ();
        $doc_obj->set_OID($oid);
        my ($olddir) = $hashref->{'archivedir'}; # old dir for this doc, where images are stored
        $top = $doc_obj->get_top_section();
        &add_section_content ($doc_obj, $top, $hashref, $olddir);
        &add_classification_metadata($oid, $doc_obj, $top);
        &add_cover_image($doc_obj, $olddir);
        &get_children($hashref, $children);
        &recurse_sections($doc_obj, $children, $oid, $top, $olddir);
        $processor->process($doc_obj);
    }
    print STDERR "\n";

    &closemg();

    # write out the archive information file
    $archive_info->save_info($archive_info_filename);
}
|
---|
| 186 |
|
---|
| 187 | # returns the path to the gdbm info database - also
|
---|
| 188 | # sets the $collection and $textdir global variable
|
---|
# Returns the path to the gdbm info database inside $indexdir.
# Side effects: sets the $collection and $textdir globals.
# Dies if no text directory or no .ldb/.bdb database can be found.
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    # Fix: was "close DIR" -- close() does not close a handle opened with
    # opendir(); closedir() is required.
    closedir DIR;

    # the text directory is the entry whose name ends in "text"
    foreach $file (@conts) {
        if ($file =~ /text$/) {
            $textdir = $file;
            last;
        }
    }
    die "No text directory found in $indexdir\n\n"
        unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    closedir DIR;   # Fix: same closedir issue as above

    # the database file is "<collection>.ldb" (or .bdb); its basename is
    # the collection name
    foreach $file (@conts) {
        if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
            $collection = $1;
            $gdbmfile = &util::filename_cat ($gdbmfile, $file);
            last;
        }
    }

    if (defined $collection && $collection =~ /\w/) {
        $textdir = &util::filename_cat ($textdir, $collection);
    } else {
        die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}
|
---|
| 228 |
|
---|
| 229 |
|
---|
# Collects the OIDs of all top-level documents from the gdbm database
# (via the external db2txt tool) into the $toplevelinfo global.
sub get_toplevel_OID {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all top level sections from $gdbmfile\n" if $verbosity >= 2;

    # db2txt separates records with a line of 70 dashes
    $/ = '-' x 70;
    my $entry = "";
    while (defined ($entry = <DB2TXT>)) {
        next unless $entry =~ /\w/;  # ignore blank entries
        $entry =~ s/\n+/\\n/g;       # replace multiple \n with single \n
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        next if ($key =~ /\./);          # ignore any lower level entries
        next if ($key =~ /^CL/);         # ignore classification entries
        next if ($value =~ /<section>/); # ignore docnum->OID entries
        next if ($value =~ /<docoid>/);  # ignore strange s133->OID entries
        next if ($key !~ /\d/);          # ignore collection, browse entries

        push( @$toplevelinfo, $key);
    }

    # Fix: the db2txt pipe was never closed, leaking a filehandle (and a
    # zombie child process) per call.
    close DB2TXT;

    $/ = "\n";  # restore the default input record separator
}
|
---|
| 257 |
|
---|
| 258 | # gets all the metadata from a gdbm file entry, and puts it into a hashref
|
---|
# Parses one gdbm record (a string of "<key>value" lines) into $hashref.
# Repeated keys have their values joined with '@', matching the separator
# that add_section_content() later splits on.
sub get_metadata {
    my ($gdb_str_ref, $hashref) = @_;

    foreach $meta_line (split(/\n/, $gdb_str_ref)) {
        my ($key, $value) = ($meta_line =~ /^<([^>]*)>(.*?)$/);
        $$hashref{$key} .= '@' if defined $$hashref{$key};
        $$hashref{$key} .= $value;
    }
}
|
---|
| 272 |
|
---|
| 273 | #takes a hashref containing the metadata for a gdbmfile entry, and extracts
|
---|
| 274 | #the childrens numbers (from the 'contains' entry).
|
---|
| 275 | #assumes format is ".1;".2;".3
|
---|
# Extracts the child section numbers from a record's 'contains' entry into
# the caller's @$children array.  Assumes the format ".1;".2;".3 .
sub get_children {
    my ($hashref, $children) = @_;

    $childs = $hashref->{'contains'};
    if (defined ($childs)) {
        $childs =~ s/\@$//;   # remove trailing @
        $childs =~ s/^\"\.//; # remove initial ".
        @$children = split /\;\"\./, $childs;
    }
    else {
        # Fix: the original did "$children = []", which only rebinds the
        # local alias and leaves the caller's array untouched; clear the
        # caller's array instead.
        @$children = ();
    }
}
|
---|
| 290 |
|
---|
| 291 | #takes a hashref containing the metadata for a gdbmfile entry, and extracts
|
---|
| 292 | #the childrens numbers (from the 'contains' entry).
|
---|
| 293 | #assumes format is ".1;".2;".3
|
---|
| 294 | #returns a list with the full child name ie HASH0123...ac.1 HASH0123...ac.2
|
---|
| 295 | #etc
|
---|
| 296 | #used for classification stuff
|
---|
# Like get_children(), but returns fully-qualified child OIDs, e.g.
# HASH0123...ac.1, HASH0123...ac.2 (the leading '"' in each 'contains'
# item is shorthand for the parent OID).  Used for classification nodes.
sub get_whole_children {
    my ($parentoid, $hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    my @items;
    if (defined ($childs)) {
        $childs =~ s/\@$//;           # remove trailing @
        @items = split /\;/, $childs; # split on ;
        foreach $item (@items) {
            $item =~ s/^\"/$parentoid/; # replace " with parentoid
            push (@$children, "$item");
        }
    }
    else {
        # Fix: was "$children = []", which rebinds the local alias without
        # clearing the caller's array.
        @$children = ();
    }
}
|
---|
[3324] | 315 |
|
---|
[7588] | 316 |
|
---|
# Walks the section tree below one document node: for each child (in
# numeric order) creates a named section in $doc_obj, adds its metadata
# and text, then recurses into the child's own children.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $olddir) = @_;

    foreach $child (sort numerically @$children) {
        $doc_obj->create_named_section("$parentsection.$child");
        # gdbm record for this subsection
        my $value = $infodb{"$parentoid.$child"};
        my $hashref={};
        &get_metadata($value, $hashref); # get child's metadata
        my $newchildren = [];
        &get_children($hashref, $newchildren); # get child's children
        # add content for current section
        &add_section_content($doc_obj, "$parentsection.$child", $hashref, $olddir);
        # process all the children if there are any
        # (NOTE(review): $newchildren is always defined here -- the guard
        # only matters because an empty list simply ends the recursion)
        &recurse_sections($doc_obj, $newchildren, "$parentoid.$child", "$parentsection.$child", $olddir)
            if (defined ($newchildren));
    }
}
|
---|
| 336 |
|
---|
# Adds one section's metadata and (if present) its decompressed text to
# $doc_obj.  Image references inside the text are rewritten to
# _httpdocimg_/<name> and the original files associated with the document.
sub add_section_content {
    my ($doc_obj, $cursection, $hashref, $olddir) = @_;

    foreach $key (keys %$hashref) {
        # don't need to store these structural entries as metadata
        next if $key =~ /(contains|docnum|hastxt|doctype|archivedir|classifytype)/i;
        # multiple values for a key were joined with '@' by get_metadata()
        my @items = split /@/, $hashref->{$key};
        map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;
    }
    # docnum is the mg document number used to fetch the text;
    # hastxt is "0" or "1" (assumed -- the regexes just grab the first match)
    my ($docnum)= $hashref->{'docnum'} =~ /(\d*)/;
    my ($hastext) =$hashref->{'hastxt'} =~ /(0|1)/;

    my $images=[];
    if ($hastext) {
        my $text = &get_text($docnum);

        # rewrite each <img src="..."> to _httpdocimg_/<basename>, while
        # collecting the original paths in @$images via get_img()
        $text =~ s/(<img.*?src=\")([^\"]*)(\"[^>]*>)/
            $1.&get_img($2,$images).$3/sgei;

        $doc_obj->add_text ($cursection, $text);

        if (scalar(@$images)>0) {
            foreach $img (@$images) {
                my ($assoc_file) = $img =~ /([^\/\\]*\..*)$/; # the name of the image
                $img =~ s/_httpcollection_/\./;  # replace _httpcollection_ with .
                $img =~ s/_thisOID_/$olddir/;    # replace _thisOID_ with old archivedir name

                $doc_obj->associate_file($img, $assoc_file);
            }
        }
    }
}
|
---|
| 376 |
|
---|
| 377 |
|
---|
| 378 |
|
---|
# Records the original image path in @$images and returns the rewritten
# in-text URL "_httpdocimg_/<basename>".
sub get_img {
    my ($path, $images) = @_;

    push (@$images, $path);
    # basename: everything after the last / or \ that contains a dot
    my ($imgname) = $path =~ /([^\/\\]*\..*)$/;
    return "_httpdocimg_/" . $imgname;
}
|
---|
| 387 |
|
---|
| 388 |
|
---|
# Copies any classification metadata recorded for $oid (by
# add_doc_metadata) onto the document object's given section.
sub add_classification_metadata {
    my ($oid, $doc_obj, $cursection) = @_;

    my $classif = $doc_classif_info->{$oid};
    return unless defined $classif;

    foreach $meta_name (keys %$classif) {
        foreach $meta_value (@{$classif->{$meta_name}}) {
            $doc_obj->add_metadata ($cursection, $meta_name, $meta_value);
        }
    }
}
|
---|
| 403 |
|
---|
| 404 | # picks up the cover image "cover.jpg" from the old archives directory.
|
---|
# Picks up the cover image "cover.jpg" from the old archives directory and
# associates it with the document, when it exists.
sub add_cover_image {
    my ($doc_obj, $olddir) = @_;

    $assoc_file = "cover.jpg";
    $img = "archives/$olddir/$assoc_file";

    # only associate the file when it actually exists on disk
    $doc_obj->associate_file($img, $assoc_file) if -e $img;
}
|
---|
| 416 |
|
---|
| 417 |
|
---|
| 418 |
|
---|
# Sets the $index global: just the first entry in $indexdir that is not
# the text directory, the assoc directory, or a dotted name.
# Requires $collection to have been set by get_gdbmfile().
sub set_index {
    # check that $collection has been set
    die "collection global was not set\n"
        unless defined $collection && $collection =~ /\w/;

    # find an index (just use first non-text directory we come across in $indexdir)
    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    # Fix: was "close INDEXDIR" -- close() does not close a handle opened
    # with opendir(); closedir() is required.
    closedir INDEXDIR;

    foreach $i (@indexes) {
        next if $i =~ /text$/i || $i =~ /\./ || $i =~ /assoc$/i;
        $index = &util::filename_cat ($i, $collection);
        last;
    }
}
|
---|
| 434 |
|
---|
| 435 |
|
---|
| 436 | #########################################################################
|
---|
| 437 |
|
---|
| 438 | ################ functions involving mg ################################
|
---|
| 439 |
|
---|
# Fetches the full text of mg document $docnum through the mgquery pipe
# opened by openmg().  Relies on the ".set terminator" configured there:
# output runs until a line starting with "</mg>".
sub get_text {
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>; # eat the document separator

    my $text = "";
    my $line = "";

    while (defined ($line = <$mgread>))
    {
        last if $line =~ /^<\/mg>/;   # terminator set in openmg()
        $text .= $line;
    }

    # Read in the last statement, which should be:
    # "dd documents retrieved."
    <$mgread>;

    return $text;
}
|
---|
| 463 |
|
---|
# Numeric sort comparator for use as "sort numerically @list".
sub numerically { return $a <=> $b; }
|
---|
| 465 |
|
---|
| 466 |
|
---|
| 467 |
|
---|
# Launches an mgquery child over a bidirectional pipe ($mgread/$mgwrite)
# and configures it for machine-readable text retrieval by docnum, with
# "</mg>" as the record terminator and all statistics output disabled.
sub openmg {

    #print STDERR "index: $index\n";

    die "Unable to start mgquery." unless
        &openpipe($mgread, $mgwrite,
                  "mgquery -d $indexdir -f $index -t $textdir");

    $mgwrite->autoflush();

    &mgcommand('.set expert true');
    &mgcommand('.set terminator "</mg>\n"');
    &mgcommand('.set mode text');
    &mgcommand('.set query docnums');
    &mgcommand('.set term_freq off');
    &mgcommand('.set briefstats off');
    &mgcommand('.set memstats off');
    &mgcommand('.set sizestats off');
    &mgcommand('.set timestats off');
}
|
---|
| 488 |
|
---|
# Asks the mgquery child to exit, then shuts down both ends of the pipe.
sub closemg {
    &mgcommand (".quit");
    close($mgread);
    close($mgwrite);
}
|
---|
| 494 |
|
---|
# Sends one command line to the mgquery child and consumes the echo that
# mgquery prints for every command it executes.
sub mgcommand {
    my ($command) = @_;

    # ignore empty / whitespace-only commands
    return unless $command =~ /\S/;

    print $mgwrite "$command\n";

    # eat up the command executed which is echoed
    <$mgread>;
}
|
---|
| 505 |
|
---|
| 506 | # openpipe(READ, WRITE, CMD)
|
---|
| 507 | #
|
---|
| 508 | # Like open2, except CMD's stderr is also redirected.
|
---|
| 509 | #
|
---|
# openpipe(READ, WRITE, CMD)
#
# Like open2, except CMD's stderr is also redirected to the write side.
# READ/WRITE are symbolic filehandle names; returns 1 on success, dies on
# pipe/fork/exec failure.
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
    my $pid;

    # Fix: Perl's fork returns undef on failure, never a negative value
    # (unlike C), so the original "< 0" test could not detect a failed fork.
    if (!defined($pid = fork)) {
        die "Failed fork: $!";
    } elsif ($pid == 0) {
        # child: wire the pipe ends to stdin/stdout/stderr and exec CMD
        close($read);
        close($write);
        open(STDIN, "<&$child_read");
        open(STDOUT, ">&$child_write");
        open(STDERR, ">&$child_write");
        exec($cmd);
        die "Failed exec $cmd: $!";
    }

    # parent: close the child's ends so EOF propagates correctly
    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}
|
---|
| 542 |
|
---|
| 543 |
|
---|
| 544 |
|
---|
| 545 |
|
---|
| 546 | ######################################################################
|
---|
| 547 |
|
---|
| 548 | ############# functions to do with the classification stuff ##########
|
---|
| 549 |
|
---|
| 550 | #returns the top level classification oids
|
---|
# Collects the top-level classification OIDs (CL1, CL2, ...) from the gdbm
# database (via db2txt) into the $classifyinfo global.
sub get_classifyinfo {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all classification sections from $gdbmfile\n" ;

    # db2txt separates records with a line of 70 dashes
    $/ = '-' x 70;
    my $entry = "";
    while (defined ($entry = <DB2TXT>)) {
        next unless $entry =~ /\w/; # ignore blank entries
        $entry =~ s/\n+/\\n/g;      # replace multiple \n with single \n
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        # assumes classification OID is like CL1 or CL2 etc
        # NOTE(review): this pattern misses CL10 and above -- confirm no
        # collection ever has more than nine classifiers before relying on it
        next unless ($key =~/^CL\d$/);

        push( @$classifyinfo, $key);
    }

    # Fix: the db2txt pipe was never closed, leaking a filehandle (and a
    # zombie child process) per call.
    close DB2TXT;

    $/ = "\n";  # restore the default input record separator
}
|
---|
| 576 |
|
---|
| 577 | #this creates the classification files needed for the hierarchy classifier
|
---|
| 578 | #used for subjects, titles, orgs etc
|
---|
| 579 | #also adds in entries to the collect_cfg hash
|
---|
# Creates the classification file needed by the Hierarchy classifier for
# one top-level classification node (subjects, titles, orgs etc) and adds
# the matching "classify" entry to $collect_cfg.  If none of the node's
# children is itself a classifier node, an AZList classifier is configured
# instead and no hfile is written.
sub make_info_file {

    my ($classifier) = @_;
    my $info_file = "";
    my $entry = $infodb{$classifier};

    my $hashref = {};
    &get_metadata($entry, $hashref);

    my $classifier_name = "CL".$hashref->{'Title'}; # like CLSubject
    $classifier_name =~ s/\@$//; # remove trailing @

    # check children - if there is a classifier node at this level,
    # use a hierarchy, otherwise use an AZList.

    my $children=[];
    my $hierarchy = 0;
    &get_whole_children($classifier, $hashref, $children); # returns a list of the child ids
    foreach $child(@$children) {
        if(not &is_document($child)) {
            $hierarchy = 1;
            last;
        }
    }

    if (!$hierarchy) {
        &add_classify_cfg_list($classifier_name);
    }else { # there is a hierarchy so create a file
        $info_file = "./etc/$classifier_name.txt";

        print STDERR "classification $classifier will be called $classifier_name\n";

        open (OUTDOC, "> $info_file" ) || die "couldn't open file $info_file\n";

        # one line per hierarchy node, written recursively by process_entry()
        foreach $child (@$children) {
            &process_entry(OUTDOC, $classifier_name, $child);
        }

        close OUTDOC;

        &add_classify_cfg($classifier, $classifier_name, $info_file);
    }
}
|
---|
| 623 |
|
---|
| 624 |
|
---|
# Writes one hierarchy-file line for classification node $classify_id,
# then recurses: document children get their classification recorded via
# add_doc_metadata(), classifier children are processed recursively.
sub process_entry {
    my ($handle, $classifier_name, $classify_id) = @_;
    my $value = $infodb{$classify_id};

    my $hashref={};
    &get_metadata($value, $hashref);
    my $title = $hashref->{'Title'};
    $title =~ s/\@$//; # remove trailing @
    &add_line($handle, $classify_id, $title);

    my $children = [];
    &get_whole_children($classify_id, $hashref, $children);
    foreach $child (@$children) {
        if (&is_document($child)) {
            # leaf: remember that this document belongs to this node
            &add_doc_metadata($child, $classifier_name, $title);
        }else {
            &process_entry($handle, $classifier_name, $child);
        }
    }

}
|
---|
| 646 |
|
---|
| 647 |
|
---|
| 648 |
|
---|
# Records, in the $doc_classif_info global, that document $doc_id belongs
# to classification node $classifier_id under $classifier_name; consumed
# later by add_classification_metadata().
sub add_doc_metadata {
    my ($doc_id, $classifier_name, $classifier_id) = @_;

    my $per_doc = ($doc_classif_info->{$doc_id} ||= {});
    my $values  = ($per_doc->{$classifier_name} ||= []);
    push (@$values, $classifier_id);
}
|
---|
| 662 |
|
---|
| 663 |
|
---|
| 664 |
|
---|
| 665 |
|
---|
# Writes one hierarchy-file line: "Title"<tab>position<tab>"Title",
# where position is the classify id with the leading "CLn." stripped
# (e.g. CL1.2.3 -> 2.3).
sub add_line {
    my ($handle, $classify_id, $title) = @_;

    $title = &unicode::ascii2utf8(\$title);
    my ($num) = $classify_id =~ /^CL\d\.(.*)$/;

    print $handle "\"$title\"\t$num\t\"$title\"\n";
}
|
---|
| 677 |
|
---|
# True iff $oid names a document (HASH...) rather than a classification
# node (CL1, CL1.2, ...).
sub is_document {
    my ($oid) = @_;
    return ($oid =~ /^HASH/i) ? 1 : 0;
}
|
---|
| 683 |
|
---|
| 684 | ########################################################################
|
---|
| 685 |
|
---|
| 686 | ########## stuff for producing collect.cfg file ###########################
|
---|
| 687 |
|
---|
# Fills $collect_cfg with the boilerplate collect.cfg entries: creator /
# maintainer (from the current user), plugin list, format strings and
# collection metadata.
sub add_default_cfg {

    $username=`whoami`;
    # Fix: backticks return a defined (possibly empty) string when the
    # command merely fails, so the original "unless defined" fallbacks were
    # dead code; fall back whenever we got no usable name.
    $username=`logname` unless defined $username && $username =~ /\S/;
    $username="a_user" unless defined $username && $username =~ /\S/;
    $username =~ s/\n//;
    $collect_cfg->{'creator'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'maintainer'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'public'}="true";
    $collect_cfg->{'beta'}="true";

    $collect_cfg->{'plugin'}=[];
    push (@{$collect_cfg->{'plugin'}}, ["GAPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["ArcPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["RecPlug"]);

    $collect_cfg->{'format'}={};
    $collect_cfg->{'format'}->{'DocumentImages'}="true";
    $collect_cfg->{'format'}->{'DocumentText'} =
        "\"<h3>[Title]</h3>\\\\n\\\\n<p>[Text]\"";
    $collect_cfg->{'format'}->{'SearchVList'} =
        "\"<td valign=top>[link][icon][/link]</td><td>{If}{[parent(All': '):Title],[parent(All': '):Title]:}[link][Title][/link]</td>\"";

    $collect_cfg->{'collectionmeta'}={};
    $collect_cfg->{'collectionmeta'}->{'collectionname'}="\"$collection\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollection'}="\"_httpprefix_/collect/$collection/images/$collection.gif\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollectionsmall'}="\"_httpprefix_/collect/$collection/images/${collection}sm.gif\"";
    $collect_cfg->{'collectionmeta'}->{'collectionextra'} = "\"This is a collection rebuilt from CDROM.\"";

}
|
---|
| 718 |
|
---|
# Reads the indexmap from build.cfg and mirrors it into $collect_cfg:
# the index list, the default index, and a human-readable collectionmeta
# name for each index.
sub add_index_cfg {
    my ($buildfile) = @_;

    my $data={};
    $collect_cfg->{'indexes'}=[];
    if (-e $buildfile) {
        $data=&cfgread::read_cfg_file($buildfile, '^(this)$', '^(indexmap)$');
        foreach $mapping (@{$data->{'indexmap'}}) {
            # entries look like "section:text->stt"
            ($thisindex, $abbrev)= split ("\-\>", $mapping);
            push (@{$collect_cfg->{'indexes'}}, $thisindex);
            # first index listed becomes the default
            $collect_cfg->{'defaultindex'} = $thisindex
                unless defined $collect_cfg->{'defaultindex'};
            $name=&get_index_name($thisindex);
            $thisindex=".$thisindex";
            $collect_cfg->{'collectionmeta'}->{$thisindex} = "\"$name\"";
        }
    }
    else {
        print STDERR "Couldn't read $buildfile, could not add index data to configuration file\n";
    }
}
|
---|
| 741 |
|
---|
# Maps a build.cfg index specifier to a human-readable name for the
# collectionmeta section.  Order matters: first matching pattern wins.
sub get_index_name {
    my ($thisindex) = @_;

    my @name_map = (
        [ qr/paragraph/     , "paragraphs" ],
        [ qr/section.*text/ , "chapters"   ],
        [ qr/Title/         , "titles"     ],
    );
    foreach my $pair (@name_map) {
        return $pair->[1] if $thisindex =~ $pair->[0];
    }
    return "other";
}
|
---|
| 749 |
|
---|
# Adds a Hierarchy classifier entry to $collect_cfg for the given
# classification, using $file as its hfile.
sub add_classify_cfg {
    my ($classify, $classifier_name, $file) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    my ($title) = $classifier_name =~ /^CL(.*)$/;  # e.g. CLSubject -> Subject
    my ($filename) = $file =~ /\/([^\/]*)$/;       # basename of the hfile
    push (@{$collect_cfg->{'classify'}},
          ["Hierarchy -hfile $filename -metadata $classifier_name -buttonname $title -sort Title"]);
}
|
---|
| 762 |
|
---|
# Adds an AZList classifier entry to $collect_cfg (used when a
# classification has no sub-hierarchy).
sub add_classify_cfg_list {
    my ($classifier) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};
    my ($title) = $classifier =~ /^CL(.*)$/;   # e.g. CLTitle -> Title
    push (@{$collect_cfg->{'classify'}},
          ["AZList -metadata $classifier -buttonname $title"]);
}
|
---|
| 771 |
|
---|
# Writes $collect_cfg out as a collect.cfg file.  An existing file is
# never clobbered: the new configuration goes to "<file>.new" instead.
sub output_cfg_file {
    my ($collfile) = @_;

    $collfile .= ".new" if -e $collfile;   # collect.cfg already exists

    &cfgread::write_cfg_file($collfile, $collect_cfg,
                             '^(creator|maintainer|public|beta|defaultindex)$',
                             '^(indexes)$', '^(format|collectionmeta)$',
                             '^(plugin|classify)$');
}
|
---|
| 787 |
|
---|
| 788 |
|
---|