#!/usr/bin/perl -w

###########################################################################
#
# unbuildv2.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This program decompresses all the text from a built index and returns
# it to GML format.  It is based on the index format current in
# Nov 1999 (version two).

# Stefan updated unbuildv1.pl in August 2002, but unbuildv2.pl was not
# updated.  It probably needs some work done before using.

# Katherine updated this, but apparently it still doesn't work.
BEGIN {
    # Fail fast if the Greenstone environment is not configured.
    die "GSDLHOME not set\n" if !defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n"   if !defined $ENV{'GSDLOS'};

    # Make the Greenstone Perl library modules findable.
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # Seed for generating unique bareword filehandle names (see openpipe).
    $FileHandle = 'FH000';
}
|
---|
43 |
|
---|
44 | use unbuildutil;
|
---|
45 | use doc;
|
---|
46 | use docsave;
|
---|
47 | use util;
|
---|
48 | use parsargv;
|
---|
49 | use GDBM_File;
|
---|
50 | use FileHandle;
|
---|
51 | use English;
|
---|
52 | use cfgread;
|
---|
53 | use unicode;
|
---|
54 |
|
---|
# Turn on autoflush for both STDERR and STDOUT so progress output
# appears immediately; leaves STDOUT as the selected handle, as before.
foreach my $handle (*STDERR, *STDOUT) {
    select $handle;
    $| = 1;
}
|
---|
57 |
|
---|
58 |
|
---|
# globals
$collection = "";        # the collection name (set by get_gdbmfile)
$index = "";             # the selected index (like stt/unu), set by set_index
$textdir = "";           # the textdir (like text/unu), set by get_gdbmfile
$toplevelinfo = [];      # list of top-level document OIDs (filled by get_toplevel_OID)
%infodb = ();            # in-memory copy of the GDBM info database (see unbuildutil::read_gdbm)
$classifyinfo = [];      # list of classification OIDs (CL1, CL2, ...)
$doc_classif_info = {};  # hash of OID -> classifications the document belongs to
$collect_cfg = {};       # data for the collect.cfg configuration file

# pre-allocate two bareword filehandle names for the mgquery pipes;
# openpipe keeps incrementing $FileHandle for its own pipe ends
$mgread = ++$FileHandle;
$mgwrite = ++$FileHandle;
|
---|
71 |
|
---|
72 |
|
---|
73 |
|
---|
# Print a brief usage/options summary for this script to STDERR.
sub print_usage {
    print STDERR "\n usage: $0 [options]\n\n";
    print STDERR " options:\n";
    print STDERR "   -verbosity number 0=none, 3=lots\n";
    print STDERR "   -indexdir directory The index to be decompressed (defaults to ./index)\n";
    print STDERR "   -archivedir directory Where the converted material ends up (defaults to ./archives.new\n";
    print STDERR "   -removeold Will remove the old contents of the archives\n";
    print STDERR "                 directory -- use with care\n\n";
}
|
---|
83 |
|
---|
# Kick off processing as soon as the script has compiled.
&main();
|
---|
85 |
|
---|
# Main driver: parses the command-line options, rebuilds a collect.cfg
# and the classification hfiles from the built index's GDBM database,
# then decompresses every document out of the mg text index back into
# GML archive files via a docsave processor.
sub main {
    # Parse options into the global option variables; on failure show
    # usage and exit.
    if (!parsargv::parse(\@ARGV,
			 'verbosity/\d+/2', \$verbosity,
			 'indexdir/.*/index', \$indexdir,
			 'archivedir/.*/archives.new', \$archivedir,
			 'removeold', \$removeold)) {
	&print_usage();
	die "\n";
    }

    die "indexdir $indexdir does not exist\n\n" unless (-d $indexdir);
    $indexdir =~ s/\/$//;    # strip a trailing slash for consistent paths
    if (-d $archivedir) {
	if ($removeold) {
	    print STDERR "Warning - removing current contents of the archives directory $archivedir\n";
	    print STDERR "          in preparation for the import\n";
	    sleep(5); # just in case...
	    &util::rm_r ($archivedir);
	}
    } else {
	&util::mk_all_dir ($archivedir);
    }

    # The etc directory receives collect.cfg and the classifier hfiles.
    $etcdir = "./etc";
    if (!(-d $etcdir)) {
	&util::mk_all_dir ($etcdir);
    }

    my $gdbmfile = &get_gdbmfile ($indexdir); #sets $collection and $textdir
    &set_index (); # sets $index (just chooses one index)

    my $buildcfgfile = &util::filename_cat($indexdir, "build.cfg");
    my $colcfgfile = &util::filename_cat($etcdir, "collect.cfg");

    # Build up the global $collect_cfg hash before writing it out.
    &add_default_cfg();
    &add_index_cfg($buildcfgfile);

    #work out all the classifications from the gdbm file, info for each doc
    #(ie which classifications they belong to, are kept in $doc_classif_info
    &get_classifyinfo ($gdbmfile); #puts a list of classifications into classifyinfo

    &get_toplevel_OID ($gdbmfile); # puts a list of the top level document OIDs into $toplevelinfo

    #tie (%infodb, "GDBM_File", $gdbmfile, 1, 0);

    #read ldb file into %infodb
    &unbuildutil::read_gdbm($gdbmfile, \%infodb);

    #this makes the files specifying the hierarchy of subjects, titles etc
    foreach $classify (@$classifyinfo) {
	&make_info_file($classify);
    }

    #write out the collect.cfg
    &output_cfg_file($colcfgfile);

    # Start the mgquery subprocess used by get_text for decompression.
    &openmg ();

    # read the archive information file
    my $archive_info_filename = &util::filename_cat ($archivedir, "archives.inf");
    my $archive_info = new arcinfo ();

    # create a docsave object to process the documents
    my $processor = new docsave ("", $archive_info, $verbosity);
    $processor->setarchivedir ($archivedir);

    # Rebuild each top-level document: metadata, text, classifications,
    # cover image, and its full section tree, then hand it to docsave.
    my ($doc_obj, $hashref, $children);
    print STDERR "processing documents now\n" if $verbosity >=2;
    foreach $oid (@$toplevelinfo) {
	$value = $infodb{$oid};
	$hashref={};
	$children = [];
	&get_metadata($value, $hashref);
	$doc_obj = new doc ();
	$doc_obj->set_OID($oid);
	my ($olddir) = $hashref->{'archivedir'}; # old dir for this doc, where images are stored
	$top = $doc_obj->get_top_section();
	&add_section_content ($doc_obj, $top, $hashref, $olddir);
	&add_classification_metadata($oid, $doc_obj, $top);
	&add_cover_image($doc_obj, $olddir);
	&get_children($hashref, $children);
	&recurse_sections($doc_obj, $children, $oid, $top, $olddir);
	$processor->process($doc_obj);
    }
    print STDERR "\n";

    &closemg();

    # write out the archive information file
    $archive_info->save_info($archive_info_filename);
}
|
---|
184 |
|
---|
185 | # returns the path to the gdbm info database - also
|
---|
186 | # sets the $collection and $textdir global variable
|
---|
# Locate the GDBM info database underneath $indexdir and return its
# path.  Side effects: sets the globals $collection (from the database
# file's basename) and $textdir (relative text directory).
# Dies if no text directory or database file can be found.
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    # Find the "text" directory inside the index directory.
    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    closedir DIR;   # bug fix: was close DIR; opendir handles need closedir

    foreach $file (@conts) {
	if ($file =~ /text$/) {
	    $textdir = $file;
	    last;
	}
    }
    die "No text directory found in $indexdir\n\n"
	unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    # Find the .ldb/.bdb database file inside the text directory; its
    # basename is the collection name.
    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    closedir DIR;   # bug fix: was close DIR

    foreach $file (@conts) {
	if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
	    $collection = $1;
	    $gdbmfile = &util::filename_cat ($gdbmfile, $file);
	    last;
	}
    }

    if (defined $collection && $collection =~ /\w/) {
	$textdir = &util::filename_cat ($textdir, $collection);
    } else {
	die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}
|
---|
226 |
|
---|
227 |
|
---|
# Collect the OIDs of all top-level documents from the GDBM database
# (read via the external db2txt utility) into the global @$toplevelinfo.
sub get_toplevel_OID {
    my ($gdbmfile) = @_;

    # List-form pipe open avoids shell interpretation of $gdbmfile.
    open (DB2TXT, '-|', 'db2txt', $gdbmfile) || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all top level sections from $gdbmfile\n" if $verbosity >= 2;

    # db2txt separates records with a line of 70 dashes; localize the
    # record separator so it is restored even if we die mid-loop
    # (the original assigned $/ globally and reset it by hand).
    local $/ = '-' x 70;
    while (defined (my $entry = <DB2TXT>)) {
	next unless $entry =~ /\w/;  # ignore blank entries
	$entry =~ s/\n+/\\n/g;       # collapse newlines so the key/value regex can match
	my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

	next if ($key =~ /\./);          # ignore any lower level entries
	next if ($key =~ /^CL/);         # ignore classification entries
	next if ($value =~ /<section>/); # ignore docnum->OID entries
	next if ($value =~ /<docoid>/);  # ignore strange s133->OID entries
	next if ($key !~ /\d/);          # ignore collection, browse entries

	push (@$toplevelinfo, $key);
    }
    close DB2TXT;

    #print STDERR "toplevel sections are: ", join ("\n", @$toplevelinfo);
    #print STDERR "\n";
}
|
---|
255 |
|
---|
256 | # gets all the metadata from a gdbm file entry, and puts it into a hashref
|
---|
# Parse one GDBM record string of "<key>value" lines into the supplied
# hash ref.  Values for a repeated key are accumulated into a single
# '@'-separated string.
sub get_metadata {
    my ($record_str, $meta) = @_;

    foreach my $line (split /\n/, $record_str) {
	my ($key, $value) = $line =~ /^<([^>]*)>(.*?)$/;
	if (defined $meta->{$key}) {
	    $meta->{$key} .= '@' . $value;
	} else {
	    $meta->{$key} .= $value;
	}
    }
}
|
---|
270 |
|
---|
271 | #takes a hashref containing the metadata for a gdbmfile entry, and extracts
|
---|
272 | #the childrens numbers (from the 'contains' entry).
|
---|
273 | #assumes format is ".1;".2;".3
|
---|
# Extract the child section numbers of a record from its 'contains'
# metadata (format: ".1;".2;".3, possibly with a trailing @) into the
# caller-supplied array ref @$children.
sub get_children {
    my ($hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    if (defined ($childs)) {
	$childs =~ s/\@$//;   # remove trailing @
	$childs =~ s/^\"\.//; # remove initial ".
	@$children = split /\;\"\./, $childs;
    }
    else {
	# Bug fix: the original assigned a fresh [] to its lexical copy
	# of the ref, which never reached the caller; clear in place.
	@$children = ();
    }
}
|
---|
288 |
|
---|
289 | #takes a hashref containing the metadata for a gdbmfile entry, and extracts
|
---|
290 | #the childrens numbers (from the 'contains' entry).
|
---|
291 | #assumes format is ".1;".2;".3
|
---|
292 | #returns a list with the full child name ie HASH0123...ac.1 HASH0123...ac.2
|
---|
293 | #etc
|
---|
294 | #used for classification stuff
|
---|
# Like get_children, but returns fully-qualified child OIDs:
# the 'contains' entry ".1;".2;".3 becomes <parentoid>.1, <parentoid>.2,
# <parentoid>.3 appended to @$children.  Used for classification nodes.
sub get_whole_children {
    my ($parentoid, $hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    my @items;
    if (defined ($childs)) {
	$childs =~ s/\@$//;           # remove trailing @
	@items = split /\;/, $childs; # split on ;
	foreach my $item (@items) {
	    $item =~ s/^\"/$parentoid/; # replace " with parentoid
	    push (@$children, $item);
	}
    }
    else {
	# Bug fix: the original assigned [] to its own copy of the ref,
	# which had no effect on the caller; clear the array in place.
	@$children = ();
    }
}
|
---|
313 |
|
---|
314 |
|
---|
# Recursively rebuild the section tree of a document.  For each child
# number, creates the named section, restores its metadata and content
# from the global %infodb, then descends into its own children.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $olddir) = @_;

    foreach my $childnum (sort numerically @$children) {
	my $childoid     = "$parentoid.$childnum";
	my $childsection = "$parentsection.$childnum";

	$doc_obj->create_named_section($childsection);

	# fetch the child's metadata from the in-memory database
	my $meta = {};
	&get_metadata($infodb{$childoid}, $meta);

	my $grandchildren = [];
	&get_children($meta, $grandchildren);

	# add content for the current section
	&add_section_content($doc_obj, $childsection, $meta, $olddir);

	# process all the children if there are any (the ref is always
	# defined; guard kept for parity with the original)
	&recurse_sections($doc_obj, $grandchildren, $childoid, $childsection, $olddir)
	    if defined $grandchildren;
    }
}
|
---|
334 |
|
---|
# Restore one section of a document: copy its metadata onto the doc
# object, pull its decompressed text back out of mg, rewrite embedded
# image references to _httpdocimg_/ URLs, and associate the image files.
sub add_section_content {
    my ($doc_obj, $cursection, $hashref, $olddir) = @_;

    foreach $key (keys %$hashref) {
	#dont need to store these metadata
	next if $key =~ /(contains|docnum|hastxt|doctype|archivedir|classifytype)/i;
	# multiple values for one key are stored '@'-separated (see get_metadata)
	my @items = split /@/, $hashref->{$key};
	map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;

    }
    # docnum is the mg document number used to retrieve the text;
    # hastxt is a 0/1 flag
    my ($docnum)= $hashref->{'docnum'} =~ /(\d*)/;
    my ($hastext) =$hashref->{'hastxt'} =~ /(0|1)/;

    my $images=[];
    if ($hastext) {
	my $text = &get_text($docnum);

	#my (@images) = $text =~ /<img.*?src=\"([^\"]*)\"[^>]*>/g;

	# in text replace path to image with _httpdocimg_/blah.gif
	#while ($text =~ s/(<img.*?src=\")([^\"]*)(\"[^>]*>)/
	#   $1.&get_img($2, \@images).$3/sgei) {
	# get_img records each original path in @$images and returns the
	# replacement URL (the /e modifier evaluates it as code)
	$text =~ s/(<img.*?src=\")([^\"]*)(\"[^>]*>)/
	    $1.&get_img($2,$images).$3/sgei;

	$doc_obj->add_text ($cursection, $text);

	if (scalar(@$images)>0) {

	    foreach $img (@$images) {
		my ($assoc_file) = $img =~ /([^\/\\]*\..*)$/; #the name of the image
		$img =~ s/_httpcollection_/\./; #replace _httpcollection_ with .
		$img =~ s/_thisOID_/$olddir/; #replace _thisOID_ with old archivedir name

		$doc_obj->associate_file($img, $assoc_file);
	    }
	}
    }
}
|
---|
374 |
|
---|
375 |
|
---|
376 |
|
---|
# Record an image path for later file association and return the
# rewritten reference (_httpdocimg_/<basename>) to substitute into the
# document text.
sub get_img {
    my ($path, $images) = @_;

    push (@$images, $path);

    my ($basename) = $path =~ /([^\/\\]*\..*)$/;
    return "_httpdocimg_/" . $basename;
}
|
---|
385 |
|
---|
386 |
|
---|
# Copy any classification metadata recorded for $oid (in the global
# $doc_classif_info, filled by add_doc_metadata) onto the given section
# of the document object.  A no-op for documents with no classifications.
sub add_classification_metadata {
    my ($oid, $doc_obj, $cursection) = @_;

    my $classif = $doc_classif_info->{$oid};
    return unless defined $classif;

    while (my ($name, $values) = each %$classif) {
	$doc_obj->add_metadata($cursection, $name, $_) foreach @$values;
    }
}
|
---|
401 |
|
---|
402 | # picks up the cover image "cover.jpg" from the old archives directory.
|
---|
# Pick up the cover image "cover.jpg" from the old archives directory
# and associate it with the document, if the file exists.
sub add_cover_image {
    my ($doc_obj, $olddir) = @_;

    my $cover = "cover.jpg";
    my $path  = "archives/$olddir/$cover";

    $doc_obj->associate_file($path, $cover) if -e $path;
}
|
---|
414 |
|
---|
415 |
|
---|
416 |
|
---|
# Choose an index for text retrieval: the first entry in $indexdir that
# is not the text directory, an assoc directory, or a dotted name.
# Sets the global $index; requires the global $collection to be set
# already (by get_gdbmfile).
sub set_index {
    # check that $collection has been set
    die "collection global was not set\n"
	unless defined $collection && $collection =~ /\w/;

    # find an index (just use first non-text directory we come across in $indexdir)
    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    closedir INDEXDIR;   # bug fix: was close INDEXDIR; opendir handles need closedir

    foreach my $entry (@indexes) {
	next if $entry =~ /text$/i || $entry =~ /\./ || $entry =~ /assoc$/i;
	$index = &util::filename_cat ($entry, $collection);
	last;
    }
}
|
---|
432 |
|
---|
433 |
|
---|
434 | #########################################################################
|
---|
435 |
|
---|
436 | ################ functions involving mg ################################
|
---|
437 |
|
---|
# Retrieve the decompressed text of one document from the running
# mgquery subprocess.  Sends the mg document number as a query and
# reads everything up to the "</mg>" terminator configured in openmg.
sub get_text {
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>; # eat the document separator

    my $text = "";
    my $line = "";

    # accumulate lines until the terminator set by
    # '.set terminator "</mg>\n"' appears
    while (defined ($line = <$mgread>))
    {
	last if $line =~ /^<\/mg>/;
	$text .= $line;
    }

    # Read in the last statement, which should be:
    #     "dd documents retrieved."
    <$mgread>;

    return $text;
}
|
---|
461 |
|
---|
# Numeric comparison routine for sort (ascending order).
sub numerically { return $a <=> $b; }
|
---|
463 |
|
---|
464 |
|
---|
465 |
|
---|
# Start an mgquery subprocess over the chosen index and put it into the
# non-interactive "expert" text-retrieval mode this script expects.
# Uses the global handles $mgread/$mgwrite and globals $indexdir,
# $index and $textdir.
sub openmg {

    #print STDERR "index: $index\n";

    die "Unable to start mgquery." unless
	&openpipe($mgread, $mgwrite,
		  "mgquery -d $indexdir -f $index -t $textdir");

    $mgwrite->autoflush();

    # configure mgquery for plain-text document retrieval with all
    # statistics output disabled
    foreach my $setting ('.set expert true',
			 '.set terminator "</mg>\n"',
			 '.set mode text',
			 '.set query docnums',
			 '.set term_freq off',
			 '.set briefstats off',
			 '.set memstats off',
			 '.set sizestats off',
			 '.set timestats off') {
	&mgcommand($setting);
    }
}
|
---|
486 |
|
---|
# Tell the mgquery subprocess to quit and close both pipe handles.
sub closemg {
    &mgcommand ('.quit');
    close ($_) for ($mgread, $mgwrite);
}
|
---|
492 |
|
---|
# Send one command line to the mgquery subprocess and discard the
# echoed copy of the command.  Blank/whitespace-only commands are
# silently ignored.
sub mgcommand {
    my ($command) = @_;

    return unless $command =~ /\S/;
    #print STDERR "command: $command\n";
    print $mgwrite "$command\n";

    # mgquery echoes each command it executes; swallow that line
    <$mgread>;
}
|
---|
503 |
|
---|
# openpipe(READ, WRITE, CMD)
#
# Like open2, except CMD's stderr is also redirected to the write side
# of the pipe.  READ/WRITE may be filehandle names or globs; the child
# ends use generated names from the global $FileHandle counter.
# Returns 1 on success; dies on pipe or fork failure.
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
    my $pid;

    # Bug fix: fork returns undef on failure (not a negative value, as
    # the original tested); with the old test a failed fork fell into
    # the $pid == 0 branch and exec'd CMD in the parent.
    if (!defined ($pid = fork)) {
	die "Failed fork: $!";
    } elsif ($pid == 0) {
	# Child: wire the pipe ends onto stdin/stdout/stderr and exec.
	close($read);
	close($write);
	open(STDIN, "<&$child_read");
	open(STDOUT, ">&$child_write");
	open(STDERR, ">&$child_write");
	exec($cmd);
	die "Failed exec $cmd: $!";
    }

    # Parent: close the child's pipe ends and enable autoflush on ours.
    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}
|
---|
540 |
|
---|
541 |
|
---|
542 |
|
---|
543 |
|
---|
544 | ######################################################################
|
---|
545 |
|
---|
546 | ############# functions to do with the classification stuff ##########
|
---|
547 |
|
---|
548 | #returns the top level classification oids
|
---|
# Find the top-level classification OIDs (CL1, CL2, ...) in the GDBM
# database (read via the external db2txt utility) and append them to
# the global @$classifyinfo.
sub get_classifyinfo {
    my ($gdbmfile) = @_;

    # List-form pipe open avoids shell interpretation of $gdbmfile.
    open (DB2TXT, '-|', 'db2txt', $gdbmfile) || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all classification sections from $gdbmfile\n" ;

    # db2txt separates records with a line of 70 dashes; localize $/ so
    # it is restored even if the loop dies part way through (the
    # original assigned it globally and reset it by hand).
    local $/ = '-' x 70;
    while (defined (my $entry = <DB2TXT>)) {
	next unless $entry =~ /\w/; # ignore blank entries
	$entry =~ s/\n+/\\n/g;      # collapse newlines so the key regex can match
	my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

	next unless ($key =~/^CL\d$/); # assumes classification OID is like
				       # CL1 or CL2 etc

	push (@$classifyinfo, $key);
    }
    close DB2TXT;

    #print STDERR "classifications are: ", join(", ", @$classifyinfo);
    #print STDERR "\n";
}
|
---|
574 |
|
---|
575 | #this creates the classification files needed for the hierarchy classifier
|
---|
576 | #used for subjects, titles, orgs etc
|
---|
577 | #also adds in entries to the collect_cfg hash
|
---|
# Creates the classification (hfile) file needed for the Hierarchy
# classifier for one classification (subjects, titles, orgs etc), and
# adds a matching classifier entry to the collect_cfg hash.  If the
# classification has no sub-hierarchy (all children are documents) an
# AZList classifier is configured instead and no file is written.
sub make_info_file {

    my ($classifier) = @_;
    my $info_file = "";
    my $entry = $infodb{$classifier};

    my $hashref = {};
    &get_metadata($entry, $hashref);

    my $classifier_name = "CL".$hashref->{'Title'}; #like CLSubject
    $classifier_name =~ s/\@$//; #remove trailing @

    # check children - if there is a classifier node at this level,
    # use a hierarchy, otherwise use an AZList.

    my $children=[];
    my $hierarchy = 0;
    &get_whole_children($classifier, $hashref, $children); #returns a list of the child ids
    foreach $child(@$children) {
	if(not &is_document($child)) {
	    $hierarchy = 1;
	    last;
	}
    }

    if (!$hierarchy) {
	&add_classify_cfg_list($classifier_name);
    }else { #there is a hierarchy so create a file
	$info_file = "./etc/$classifier_name.txt";

	print STDERR "classification $classifier will be called $classifier_name\n";

	open (OUTDOC, "> $info_file" ) || die "couldn't open file $info_file\n";

	# recursively write every node of this classification into the hfile
	foreach $child (@$children) {
	    &process_entry(OUTDOC, $classifier_name, $child);
	}

	close OUTDOC;

	&add_classify_cfg($classifier, $classifier_name, $info_file);
    }
}
|
---|
621 |
|
---|
622 |
|
---|
# Write one classification node to the hierarchy file and recurse into
# its children.  Document children are not written to the file; they
# are recorded (via add_doc_metadata) as classification metadata for
# the document instead.
sub process_entry {
    my ($handle, $classifier_name, $classify_id) = @_;

    my $meta = {};
    &get_metadata($infodb{$classify_id}, $meta);

    my $title = $meta->{'Title'};
    $title =~ s/\@$//;   # strip the trailing metadata separator
    &add_line($handle, $classify_id, $title);

    my $children = [];
    &get_whole_children($classify_id, $meta, $children);
    foreach my $child (@$children) {
	if (&is_document($child)) {
	    &add_doc_metadata($child, $classifier_name, $title);
	} else {
	    &process_entry($handle, $classifier_name, $child);
	}
    }
}
|
---|
644 |
|
---|
645 |
|
---|
646 |
|
---|
# Record, in the global $doc_classif_info, that document $doc_id
# belongs to classification node $classifier_id under $classifier_name.
sub add_doc_metadata {
    my ($doc_id, $classifier_name, $classifier_id) = @_;

    # Perl autovivifies the intermediate hash and array levels here,
    # so no explicit initialisation is needed.
    push (@{ $doc_classif_info->{$doc_id}{$classifier_name} }, $classifier_id);
}
|
---|
660 |
|
---|
661 |
|
---|
662 |
|
---|
663 |
|
---|
# Write one hierarchy-file line for a classification node, in the form:
#   "title" <TAB> node-number <TAB> "title"
sub add_line {
    my ($handle, $classify_id, $title) = @_;

    $title = &unicode::ascii2utf8(\$title);

    # strip the leading "CLn." so only the node number remains
    my ($num) = $classify_id =~ /^CL\d\.(.*)$/;

    print $handle "\"$title\"\t$num\t\"$title\"\n";
}
|
---|
675 |
|
---|
# True (1) when the OID names a document (HASH-prefixed,
# case-insensitive); 0 for classification and other OIDs.
sub is_document {
    my ($oid) = @_;
    return ($oid =~ /^HASH/i) ? 1 : 0;
}
|
---|
681 |
|
---|
682 | ########################################################################
|
---|
683 |
|
---|
684 | ########## stuff for producing collect.cfg file ###########################
|
---|
685 |
|
---|
# Populate the global $collect_cfg with sensible defaults for a rebuilt
# collection: ownership, standard plugins, format strings and
# collection-level metadata (uses the global $collection name).
sub add_default_cfg {

    # Bug fix: backticks return a defined string (possibly empty) even
    # when the command fails, so the original `unless defined` fallback
    # chain could never fire.  Test for an actual word character.
    $username=`whoami`;
    $username=`logname` unless defined $username && $username =~ /\w/;
    $username="a_user" unless defined $username && $username =~ /\w/;
    $username =~ s/\n//;
    $collect_cfg->{'creator'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'maintainer'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'public'}="true";
    $collect_cfg->{'beta'}="true";

    # standard plugin pipeline for a rebuilt (GML archive) collection
    $collect_cfg->{'plugin'}=[];
    push (@{$collect_cfg->{'plugin'}}, ["GAPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["ArcPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["RecPlug"]);

    $collect_cfg->{'format'}={};
    $collect_cfg->{'format'}->{'DocumentImages'}="true";
    $collect_cfg->{'format'}->{'DocumentText'} =
	"\"<h3>[Title]</h3>\\\\n\\\\n<p>[Text]\"";
    $collect_cfg->{'format'}->{'SearchVList'} =
	"\"<td valign=top>[link][icon][/link]</td><td>{If}{[parent(All': '):Title],[parent(All': '):Title]:}[link][Title][/link]</td>\"";

    $collect_cfg->{'collectionmeta'}={};
    $collect_cfg->{'collectionmeta'}->{'collectionname'}="\"$collection\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollection'}="\"_httpprefix_/collect/$collection/images/$collection.gif\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollectionsmall'}="\"_httpprefix_/collect/$collection/images/${collection}sm.gif\"";
    $collect_cfg->{'collectionmeta'}->{'collectionextra'} = "\"This is a collection rebuilt from CDROM.\"";

}
|
---|
716 |
|
---|
# Read the index map from build.cfg and record each index (plus a
# human-readable collectionmeta name for it) in the global $collect_cfg.
# The first index listed becomes the default index.
sub add_index_cfg {
    my ($buildfile) = @_;

    $collect_cfg->{'indexes'} = [];
    unless (-e $buildfile) {
	print STDERR "Couldn't read $buildfile, could not add index data to configuration file\n";
	return;
    }

    my $data = &cfgread::read_cfg_file($buildfile, '^(this)$', '^(indexmap)$');
    foreach my $mapping (@{$data->{'indexmap'}}) {
	# entries look like "index->abbreviation"
	my ($idx, $abbrev) = split ('->', $mapping);

	push (@{$collect_cfg->{'indexes'}}, $idx);

	$collect_cfg->{'defaultindex'} = $idx
	    unless defined $collect_cfg->{'defaultindex'};

	my $name = &get_index_name($idx);
	$collect_cfg->{'collectionmeta'}->{".$idx"} = "\"$name\"";
    }
}
|
---|
739 |
|
---|
# Map an index specifier to a human-readable collectionmeta name.
sub get_index_name {
    my ($thisindex) = @_;

    # first matching pattern wins; anything unrecognised is "other"
    my @name_for = (
	[ qr/paragraph/     => "paragraphs" ],
	[ qr/section.*text/ => "chapters"   ],
	[ qr/Title/         => "titles"     ],
    );
    foreach my $pair (@name_for) {
	return $pair->[1] if $thisindex =~ $pair->[0];
    }
    return "other";
}
|
---|
747 |
|
---|
# Add a Hierarchy classifier entry (backed by the hfile just written by
# make_info_file) to the global $collect_cfg.
sub add_classify_cfg {
    my ($classify, $classifier_name, $file) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    my ($title)    = $classifier_name =~ /^CL(.*)$/;
    my ($filename) = $file =~ /\/([^\/]*)$/;
    push (@{$collect_cfg->{'classify'}},
	  ["Hierarchy -hfile $filename -metadata $classifier_name -buttonname $title -sort Title"]);
}
|
---|
760 |
|
---|
# Add an AZList classifier entry for a flat (non-hierarchical)
# classification to the global $collect_cfg.
sub add_classify_cfg_list {
    my ($classifier) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    my ($title) = $classifier =~ /^CL(.*)$/;
    push (@{$collect_cfg->{'classify'}},
	  ["AZList -metadata $classifier -buttonname $title"]);
}
|
---|
769 |
|
---|
# Write the accumulated $collect_cfg out as a collect.cfg file.
# An existing file is left untouched; the new one gets a ".new" suffix.
sub output_cfg_file {
    my ($collfile) = @_;

    $collfile .= ".new" if -e $collfile;   # collect.cfg already exists

    &cfgread::write_cfg_file($collfile, $collect_cfg,
			     '^(creator|maintainer|public|beta|defaultindex)$',
			     '^(indexes)$', '^(format|collectionmeta)$',
			     '^(plugin|classify)$');
}
|
---|
785 |
|
---|
786 |
|
---|