1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | ###########################################################################
|
---|
4 | #
|
---|
5 | # unbuildv1.pl --
|
---|
6 | # A component of the Greenstone digital library software
|
---|
7 | # from the New Zealand Digital Library Project at the
|
---|
8 | # University of Waikato, New Zealand.
|
---|
9 | #
|
---|
10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
11 | #
|
---|
12 | # This program is free software; you can redistribute it and/or modify
|
---|
13 | # it under the terms of the GNU General Public License as published by
|
---|
14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
15 | # (at your option) any later version.
|
---|
16 | #
|
---|
17 | # This program is distributed in the hope that it will be useful,
|
---|
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | # GNU General Public License for more details.
|
---|
21 | #
|
---|
22 | # You should have received a copy of the GNU General Public License
|
---|
23 | # along with this program; if not, write to the Free Software
|
---|
24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | #
|
---|
26 | ###########################################################################
|
---|
27 |
|
---|
28 | # this program will decompress all the text from a built index
|
---|
29 | # and return it to gml format - this version of the program is designed to
|
---|
30 | # work on collections built with the first version of the greenstone software
|
---|
31 | # it extracts the browse classifications from the gdbm file, and recreates
|
---|
32 | # them. It also creates a default collect.cfg file.
|
---|
33 |
|
---|
34 | # To run, this program needs the old archive directory (called archives),
|
---|
35 | # the index directory with the text and one other index, not paragraphs!! -
|
---|
36 | # use one of the section indexes. Also needs the build.cfg file (in the
|
---|
37 | # index dir) - used in creating a collect.cfg file. If build.cfg is not
|
---|
38 | # present, the parts of the collect.cfg file relating to indexes will not
|
---|
39 | # be entered.
|
---|
40 |
|
---|
41 | # Stefan altered this script slightly August 2002 to use
|
---|
42 | # $GSDLHOME/bin/$GSDLOS/mgquery_old instead of the old mgquery in
|
---|
43 | # nzdl-1.2. mgquery_old is a statically linked binary compiled from
|
---|
44 | # mg-1.3d. It was compiled with SHORT_SUFFIX set so doesn't need the index
|
---|
45 | # file extensions to be altered.
|
---|
46 | # Also changed the command line options a little.
|
---|
47 |
|
---|
# Runs at compile time: check the Greenstone environment variables are
# set, make the Greenstone perl library visible on @INC, and seed the
# counter used to generate unique bareword filehandle names (string
# increment; see openpipe()).
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    $FileHandle = 'FH000';
}
|
---|
54 |
|
---|
55 | use doc;
|
---|
56 | use util;
|
---|
57 | use parsargv;
|
---|
58 | use FileHandle;
|
---|
59 | use English;
|
---|
60 | use cfgread;
|
---|
61 | use unicode;
|
---|
62 | use plugout;
|
---|
# unbuffer STDERR and STDOUT so progress output appears immediately
select STDERR; $| = 1;
select STDOUT; $| = 1;


# globals
$collection = ""; # the collection name
$index = ""; # the selected index (like stt/unu)
$textdir = ""; # the textdir (like text/unu)
$toplevelinfo = []; #list of document OIDs
%infodb = (); #hash into GDBM file
$classifyinfo = []; # list of classifications
$doc_classif_info = {}; # hash of OIDs->classifications they belong to
$collect_cfg = {}; #data for the configuration file
# string-increment the BEGIN-block counter to get two unique bareword
# handle names for the mgquery pipe (see openmg()/openpipe())
$mgread = ++$FileHandle;
$mgwrite = ++$FileHandle;
|
---|
78 |
|
---|
# Print a brief usage summary for this script to STDERR.
sub print_usage {
    my $usage =
	  "\n usage: $0 [options]\n\n"
	. " options:\n"
	. " -verbosity number 0=none, 3=lots\n"
	. " -old directory The directory in which the old collection lives\n"
	. " -new directory The directory in which to put the new collection\n";
    print STDERR $usage;
}
|
---|
86 |
|
---|
&main (); # top-level entry point
|
---|
88 |
|
---|
# Drive the whole conversion: parse arguments, recreate the directory
# layout, harvest classifications and documents from the old GDBM file,
# rebuild each document (text from mg, metadata from the database) and
# write the new archives plus a default collect.cfg.
sub main {
    if (!parsargv::parse(\@ARGV,
			 'verbosity/\d+/2', \$verbosity,
			 'old/.*/', \$old,
			 'new/.*/', \$new)) {
	&print_usage();
	die "\n";
    }

    die "$old does not exist\n\n" unless (-d $old);
    my $old_indexdir = &util::filename_cat($old, "index");
    my $old_archivedir = &util::filename_cat($old, "archives");

    # make sure the target collection's archives/ and etc/ dirs exist
    my $new_archivedir = &util::filename_cat($new, "archives");
    my $new_etcdir = &util::filename_cat($new, "etc");
    if (!-e $new_archivedir) {
	&util::mk_all_dir($new_archivedir);
    }
    if (!(-d $new_etcdir)) {
	&util::mk_all_dir($new_etcdir);
    }

    my $gdbmfile = &get_gdbmfile($old_indexdir); # sets $collection and $textdir
    &set_index($old_indexdir); # sets $index (just chooses one index)

    my $buildcfgfile = &util::filename_cat($old_indexdir, "build.cfg");
    my $colcfgfile = &util::filename_cat($new_etcdir, "collect.cfg");

    # don't need this but giving it a value stops an annoying undefined
    # value warning
    $ENV{'GSDLCOLLECTDIR'} = 'tmp';

    &add_default_cfg();
    &add_index_cfg($buildcfgfile);

    # work out all the classifications from the gdbm file; info for each doc
    # (ie which classifications they belong to) is kept in $doc_classif_info
    &get_classifyinfo($gdbmfile); #puts a list of classifications into classifyinfo
    &get_toplevel_OID($gdbmfile); # puts a list of the top level document OIDs into $toplevelinfo

    # read ldb file into %infodb
    &read_gdbm($gdbmfile);

    # this makes the files specifying the hierarchy of subjects, titles etc
    foreach my $classify (@$classifyinfo) {
	&make_info_file($new_etcdir, $classify);
    }

    # write out the collect.cfg
    &output_cfg_file($colcfgfile);

    # start the mgquery child used to pull document text back out
    &openmg($old_indexdir);

    # read the archive information file
    my $archive_info_filename = &util::filename_cat ($new_archivedir, "archives.inf");
    my $archive_info = new arcinfo ();

    my $opts = [];
    push @$opts,("-output_info",$archive_info);

    $processor = &plugout::load_plugout("GreenstoneXMLPlugout",$opts);
    $processor->setoutputdir ($new_archivedir);

    # rebuild each top-level document and, recursively, its sections
    my ($doc_obj, $hashref, $children);
    print STDERR "processing documents now\n" if $verbosity >=2;
    foreach $oid (@$toplevelinfo) {
	$value = $infodb{$oid};
	$hashref={};
	$children = [];
	&get_metadata($value, $hashref);
	$doc_obj = new doc ();
	$doc_obj->set_OID($oid);
	my ($olddir) = $hashref->{'archivedir'}; # old dir for this doc, where images are stored
	$top = $doc_obj->get_top_section();
	&add_section_content ($doc_obj, $top, $hashref, $olddir);
	&add_classification_metadata($oid, $doc_obj, $top);
	&add_cover_image($doc_obj, $old_archivedir, $olddir);
	&get_children($hashref, $children);
	&recurse_sections($doc_obj, $children, $oid, $top, $olddir) if (defined ($children));

	# change OID to new format
	$doc_obj->delete_OID();
	$doc_obj->set_OID();
	$processor->process($doc_obj);
	# last;
    }
    print STDERR "\n";

    &closemg();

    # write out the archive information file
    $archive_info->save_info($archive_info_filename);


}
|
---|
184 |
|
---|
185 | # returns the path to the gdbm info database - also
|
---|
186 | # sets the $collection and $textdir global variable
|
---|
# Locate the GDBM info database file within the old collection's index
# directory.  Side effects: sets the $collection and $textdir globals.
# Returns the full path to the .ldb/.bdb file; dies if none is found.
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    # find the text directory (e.g. "text") inside the index directory
    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    closedir DIR;   # was close() - directory handles need closedir()

    foreach my $file (@conts) {
	if ($file =~ /text$/) {
	    $textdir = $file;
	    last;
	}
    }
    die "No text directory found in $indexdir\n\n"
	unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    # the database file inside the text dir is named <collection>.ldb (or .bdb)
    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    closedir DIR;   # was close() - directory handles need closedir()

    foreach my $file (@conts) {
	if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
	    $collection = $1;
	    $gdbmfile = &util::filename_cat ($gdbmfile, $file);
	    last;
	}
    }

    if (defined $collection && $collection =~ /\w/) {
	$textdir = &util::filename_cat ($textdir, $collection);
    } else {
	die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}
|
---|
226 |
|
---|
227 |
|
---|
228 | #this function will probably need to be changed for any different file formats
|
---|
# Collect the OIDs of all top level documents (keys like "B.12") from the
# GDBM file into the global @$toplevelinfo list.
sub get_toplevel_OID {
    my ($gdbmfile) = @_;

    # list-form open: the filename is not passed through the shell
    open (DB2TXT, '-|', 'db2txt', $gdbmfile) || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all top level sections from $gdbmfile\n" if $verbosity >= 2;

    # db2txt separates records with a line of 70 dashes; read one record
    # at a time.  local() restores $/ automatically when the sub returns,
    # even on die - the old code assigned/restored it by hand.
    local $/ = '-' x 70;
    my $entry = "";
    while (defined ($entry = <DB2TXT>)) {
	next unless $entry =~ /\w/; #ignore blank entries
	$entry =~ s/\n+/\\n/g; # replace multiple \n with single \n
	my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

	next unless ($key =~ /B\.\d*$/); # assumes top level OID is like B.12

	push( @$toplevelinfo, $key);
    }
    close DB2TXT; # the old code leaked this pipe handle
}
|
---|
251 |
|
---|
252 | # gets all the metadata from a gdbm file entry, and puts it into a hashref
|
---|
253 | #this has changed for old style formats
|
---|
# Parse one GDBM entry (newline-separated "<key>value" lines) into
# $hashref, translating the old one-letter keys to their long names
# (t -> Title, d -> docnum, o -> archivedir, c -> contains).  Repeated
# keys are concatenated.  Also sets $hashref->{'hastxt'}: 0 when the
# entry has a 'contains' line (an internal node), 1 otherwise.
sub get_metadata {
    my ($gdb_str_ref, $hashref) = @_;
    my @entries = split(/\n/, $gdb_str_ref);

    my $hastext = 1;
    foreach my $entry (@entries) {
	my ($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/);

	# skip lines that are not "<key>value" records (blank lines, noise);
	# previously these caused uninitialized-value warnings and appended
	# junk under an empty-string key
	next unless defined $key;

	next if ($key eq "p");
	next if ($key eq "j");

	$key = "Title" if $key eq "t";
	$key = "docnum" if $key eq "d";
	if ($key eq "o") {
	    $key = "archivedir";
	    $value =~ s/HASH/hash/;
	}
	if ($key eq "c") {
	    $key = "contains";
	    $hastext = 0;
	}

	$$hashref{$key} .= $value;
    }

    $$hashref{'hastxt'} = $hastext;
}
|
---|
285 |
|
---|
286 | # takes a hashref containing the metadata for a gdbmfile entry, and
|
---|
287 | # extracts the childrens numbers (from the 'contains' entry). assumes
|
---|
288 | # format is like B.14.1:B.24.2:B.24.3 returns a list like 1 2 3 4
|
---|
# Extract child section numbers from a section's 'contains' metadata.
# 'contains' looks like "B.14.1:B.24.2:B.24.3"; the trailing number of
# each entry (1, 2, 3, ...) is pushed onto @$children.  With no
# 'contains' entry, @$children is emptied.
sub get_children {
    my ($hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    if (defined ($childs)) {
	my @xchildren = split /\:/, $childs;
	foreach my $kid (@xchildren) {
	    # take the last dot-separated component; entries may carry a
	    # literal "\n" left over from record munging
	    my ($child) = $kid =~ /\.([^\.]*)(?:\\n|$)/;
	    push (@$children, $child);
	}
	# print STDERR "children are: ", @$children;
    }
    else {
	# clear the caller's array in place; the original reassigned the
	# local alias ($children = []) which had no effect on the caller
	@$children = ();
    }
}
|
---|
305 |
|
---|
306 | #takes a hashref containing the metadata for a gdbmfile entry, and extracts
|
---|
307 | #the childrens numbers (from the 'contains' entry).
|
---|
308 | #assumes format is like B.14.1:B.24.2:B.24.3
|
---|
309 | #returns a list with the full child name ie B.14.1 B.14.2 etc
|
---|
310 | #used for classification stuff
|
---|
# Split a section's 'contains' metadata ("B.14.1:B.24.2:...") into the
# full child OIDs and store them in @$children.  With no 'contains'
# entry, @$children is emptied.  Used for the classification hierarchy.
sub get_whole_children {
    my ($hashref, $children) = @_;

    my $childs = $hashref->{'contains'};
    if (defined ($childs)) {
	@$children = split /\:/, $childs;
	#print STDERR "children are: ", @$children;
    }
    else {
	# clear the caller's array in place; the original reassigned the
	# local alias ($children = []) which had no effect on the caller
	@$children = ();
    }
}
|
---|
324 |
|
---|
# Recursively add each child section (and its descendants) of
# $parentsection to $doc_obj.  @$children holds bare child numbers
# (1, 2, 3...); the child's OID is "$parentoid.$child" and its
# metadata/text record comes from the global %infodb.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $olddir) = @_;

    foreach $child (sort numerically @$children) {
	$doc_obj->create_named_section("$parentsection.$child");
	my $value = $infodb{"$parentoid.$child"};
	my $hashref={};
	&get_metadata($value, $hashref); # get childs metadata
	my $newchildren = [];
	&get_children($hashref, $newchildren); # get childs children
	# add content for current section
	&add_section_content($doc_obj, "$parentsection.$child", $hashref, $olddir);
	# process all the children if there are any
	&recurse_sections($doc_obj, $newchildren, "$parentoid.$child", "$parentsection.$child", $olddir)
	    if (defined ($newchildren));
    }
}
|
---|
342 |
|
---|
# Add one section's metadata and (when hastxt is set) its text to
# $doc_obj.  Metadata values are '@'-separated multi-values; structural
# keys (contains, docnum, ...) are skipped.  Any <img src="..."> in the
# text is resolved against the old collection ($old global, plus the
# doc's old archivedir $olddir) and associated with the document.
sub add_section_content {
    my ($doc_obj, $cursection, $hashref, $olddir) = @_;

    foreach $key (keys %$hashref) {
	# dont need to store these metadata
	next if $key =~ /(contains|docnum|hastxt|doctype|archivedir|classifytype)/i;
	my @items = split /@/, $hashref->{$key};
	map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;

    }

    my ($hastext) = $hashref->{'hastxt'} =~ /(0|1)/;
    if ($hastext) {

	# fetch this section's text from the mg text database
	my ($docnum)= $hashref->{'docnum'} =~ /(\d*)/;
	my $text = &get_text($docnum);
	$doc_obj->add_text ($cursection, $text);

	# find images referenced by the text so they can be carried over
	my (@images) = $text =~ /<img.*?src=\"([^\"]*)\"[^>]*>/g;

	if (scalar(@images)>0) {

	    foreach $img (@images) {
		my ($assoc_file) = $img =~ /([^\/\\]*\..*)$/; #the name of the image
		$img =~ s/_httpcollection_/$old/; #replace _httpcollection_ with path to old collection
		$olddir =~ s/HASH/hash/;
		$img =~ s/_thisOID_/$olddir/; #replace _thisOID_ with old archivedir name #the path to the image
		$img = lc $img; # doc names and path upper case in file, lower
		                # case in directory
		if (-e $img) {
		    $doc_obj->associate_file($img, $assoc_file);
		} else {
		    print STDERR "WARNING: $img does not exist\n";
		}
	    }

	}
    }
}
|
---|
382 |
|
---|
383 |
|
---|
# Attach any classification metadata recorded for $oid (in the global
# $doc_classif_info map built by add_doc_metadata()) to the given
# section of the document object.
sub add_classification_metadata {
    my ($oid, $doc_obj, $cursection) = @_;

    return unless defined $doc_classif_info->{$oid};

    my $classif = $doc_classif_info->{$oid};
    foreach my $meta_name (keys %$classif) {
	foreach my $meta_value (@{$classif->{$meta_name}}) {
	    $doc_obj->add_metadata ($cursection, $meta_name, $meta_value);
	}
    }
}
|
---|
# Pick up the cover image "cover.jpg" from the old archives directory
# and associate it with the document object, warning if it is missing.
sub add_cover_image {
    my ($doc_obj, $archivedir, $olddir) = @_;

    my $cover_name = "cover.jpg";
    my $cover_path = &util::filename_cat($archivedir, $olddir, $cover_name);

    unless (-e $cover_path) {
	print STDERR "WARNING: couldn't find $cover_path\n";
	return;
    }
    $doc_obj->associate_file($cover_path, $cover_name);
}
|
---|
410 |
|
---|
# Choose one of the old collection's indexes (the first non-text,
# non-paragraph subdirectory of $indexdir) and record it in the $index
# global as "<dir>/<collection>".  Requires the $collection global to
# have been set by get_gdbmfile() already.
sub set_index {
    my ($indexdir) = @_;

    # check that $collection has been set
    die "collection global was not set\n"
	unless defined $collection && $collection =~ /\w/;

    # find an index (just use first non-text and non-paragraph directory we
    # come across in $indexdir)
    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    closedir INDEXDIR;   # was close() - directory handles need closedir()
    foreach my $i (@indexes) {
	next if $i =~ /text$/i || $i =~ /\./ || $i =~ /^p/i;
	$index = &util::filename_cat ($i, $collection);
	last;
    }
}
|
---|
429 |
|
---|
430 |
|
---|
431 | #########################################################################
|
---|
432 |
|
---|
433 | ################ functions involving mg ################################
|
---|
# Retrieve the text of document $docnum from the mgquery child via the
# $mgread/$mgwrite pipe opened in openmg().  Relies on the "</mg>"
# terminator configured there to detect the end of the document.
sub get_text {
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>; # eat the document separator

    my $text = "";
    my $line = "";

    # accumulate lines until the terminator set in openmg() appears
    while (defined ($line = <$mgread>))
    {
	last if $line =~ /^<\/mg>/;
	$text .= $line;
    }

    # Read in the last statement, which should be:
    # "dd documents retrieved."
    <$mgread>;

    return $text;
}
|
---|
457 |
|
---|
458 |
|
---|
# sort() comparator: ascending numeric order.
sub numerically { return $a <=> $b; }
|
---|
460 |
|
---|
461 |
|
---|
462 |
|
---|
463 | # mg stuff
|
---|
464 |
|
---|
# Start an mgquery_old child process talking over the bidirectional
# $mgread/$mgwrite pipe and configure it for raw text retrieval by
# document number, with "</mg>" as the record terminator (see get_text()).
sub openmg {
    my ($indexdir) = @_;

    die "Unable to start mgquery." unless
	&openpipe($mgread, $mgwrite, "mgquery_old -d $indexdir -f $index -t $textdir");

    $mgwrite->autoflush();
    &mgcommand('.set expert true');
    &mgcommand('.set terminator "</mg>\n"');
    &mgcommand('.set mode text');
    &mgcommand('.set query docnums');
    &mgcommand('.set term_freq off');
}
|
---|
478 |
|
---|
# Shut the mgquery child down: send it the quit command, then close
# both ends of the pipe.
sub closemg {
    &mgcommand (".quit");
    close $mgread;
    close $mgwrite;
}
|
---|
484 |
|
---|
# Send one command line to the mgquery child over $mgwrite and consume
# the echoed command from $mgread so replies stay in sync.
sub mgcommand {
    my ($command) = @_;

    return if $command =~ /^\s*$/; # whitespace only - nothing to send
    #print STDERR "command: $command\n";
    print $mgwrite "$command\n";

    # eat up the command executed which is echoed
    <$mgread>;
}
|
---|
495 |
|
---|
# openpipe(READ, WRITE, CMD)
#
# Like open2, except CMD's stderr is also redirected (to the same pipe
# as its stdout).  READ and WRITE are bareword handle names generated
# from the $FileHandle counter.  Returns 1 on success, dies on failure.
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    # generate two fresh bareword handle names for the child's pipe ends
    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";
    my $pid;

    if (($pid = fork) < 0) {
	die "Failed fork: $!";
    } elsif ($pid == 0) {
	# child: wire stdin/stdout/stderr to the pipes and exec the command
	close($read);
	close($write);
	open(STDIN, "<&$child_read");
	open(STDOUT, ">&$child_write");
	open(STDERR, ">&$child_write");
	exec($cmd);
	die "Failed exec $cmd: $!";
    }

    # parent: close the child's ends and unbuffer ours
    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}
|
---|
532 |
|
---|
533 |
|
---|
534 | ######################################################################
|
---|
535 |
|
---|
############# functions to do with the classification stuff ##########
|
---|
# Collect the OIDs of all top level classification nodes (single capital
# letters such as "C" or "T") from the GDBM file into the global
# @$classifyinfo list.
sub get_classifyinfo {
    my ($gdbmfile) = @_;

    # list-form open: the filename is not passed through the shell
    open (DB2TXT, '-|', 'db2txt', $gdbmfile) || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all classification sections from $gdbmfile\n" ;

    # db2txt separates records with a line of 70 dashes; read one record
    # at a time.  local() restores $/ automatically when the sub returns,
    # even on die - the old code assigned/restored it by hand.
    local $/ = '-' x 70;
    my $entry = "";
    while (defined ($entry = <DB2TXT>)) {
	next unless $entry =~ /\w/; #ignore blank entries
	$entry =~ s/\n+/\\n/g; # replace multiple \n with single \n
	my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

	next unless ($key =~/^[A-Z]$/); # classification OIDs are like C or T

	push( @$classifyinfo, $key);
    }
    close DB2TXT; # the old code leaked this pipe handle
}
|
---|
559 |
|
---|
# This creates the classification file (etc/CL.<classifier>.txt) needed
# for the hierarchy classifier - used for subjects, titles, orgs etc.
# Also adds the classifier's entry to the collect_cfg hash.
sub make_info_file {
    my ($etcdir, $classifier) = @_;

    my $info_file = &util::filename_cat($etcdir, "CL.${classifier}.txt");
    my $classifier_name = "CL.$classifier";
    print STDERR "classification $classifier will be called CL.$classifier\n";

    # three-arg open with a lexical handle; the old code used a two-arg
    # open onto a global bareword handle
    open (my $outdoc, '>', $info_file) || die "couldn't open file $info_file\n";

    my $entry = $infodb{$classifier};
    #print STDERR "entry = $entry\n";
    my $hashref = {};
    &get_metadata($entry, $hashref);
    my $children = [];
    &get_whole_children($hashref, $children);
    # write one hierarchy line per child, recursing into sub-nodes
    foreach my $child (@$children) {
	&process_entry($outdoc, $classifier_name, $child);
    }

    close $outdoc;

    &add_classify_cfg($classifier, $classifier_name, $info_file);
}
|
---|
588 |
|
---|
589 |
|
---|
# Write the hierarchy-file line for classification node $classify_id
# (metadata looked up in the global %infodb) and walk its children:
# document children are recorded in $doc_classif_info via
# add_doc_metadata(), classification children are processed recursively.
sub process_entry {
    my ($handle, $classifier_name, $classify_id) = @_;

    my $value = $infodb{$classify_id};
    my $hashref={};
    &get_metadata($value, $hashref);
    my $title = $hashref->{'Title'};

    &add_line($handle, $classify_id, $title);

    my $children = [];
    &get_whole_children($hashref, $children);
    foreach $child (@$children) {
	if (&is_document($child)) {
	    &add_doc_metadata($child, $classifier_name, $classify_id);
	}
	else {
	    &process_entry($handle, $classifier_name, $child);
	}
    }
}
|
---|
611 |
|
---|
# Record, in the global $doc_classif_info map, that document $doc_id
# belongs under node $classifier_id of classifier $classifier_name.
sub add_doc_metadata {
    my ($doc_id, $classifier_name, $classifier_id) = @_;

    # vivify the per-document and per-classifier containers on first use
    my $by_classifier = ($doc_classif_info->{$doc_id} ||= {});
    my $id_list = ($by_classifier->{$classifier_name} ||= []);
    push (@$id_list, $classifier_id);
}
|
---|
622 |
|
---|
# Write one hierarchy-file line for a classification node, in the form:
#   <id> <tab> <number-without-prefix> <tab> "<title>"
sub add_line {
    my ($handle, $classify_id, $title) = @_;
    $title = &unicode::ascii2utf8(\$title);
    # strip the leading letter and dot (e.g. "C.") to get the bare number
    my ($num) = $classify_id =~ /^[A-Z]\.(.*)$/;
    print $handle join("\t", $classify_id, $num, "\"$title\"") . "\n";
}
|
---|
629 |
|
---|
# True (1) when $oid names a document node (e.g. "B.12") rather than a
# classification node; false (0) otherwise.
sub is_document {
    my ($oid) = @_;
    return ($oid =~ /^B\.\d/) ? 1 : 0;
}
|
---|
635 |
|
---|
636 | ########################################################################
|
---|
637 |
|
---|
638 | ########## stuff for producing collect.cfg file ###########################
|
---|
639 |
|
---|
# Fill the global $collect_cfg with defaults: creator/maintainer derived
# from the current user, the standard plugin pipeline, display formats
# and collection-level metadata.
sub add_default_cfg {

    # backticks always return a *defined* string (possibly empty), so the
    # old "unless defined" fallbacks could never fire; test for actual
    # content instead
    my $username = `whoami`;
    $username = `logname` unless defined $username && $username =~ /\S/;
    $username = "a_user" unless defined $username && $username =~ /\S/;
    $username =~ s/\n//;
    $collect_cfg->{'creator'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'maintainer'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'public'}="true";

    # plugin pipeline for reading the rebuilt archives
    $collect_cfg->{'plugin'}=[];
    push (@{$collect_cfg->{'plugin'}}, ["GreenstoneXMLPlugin"]);
    push (@{$collect_cfg->{'plugin'}}, ["ArchivesInfPlugin"]);
    push (@{$collect_cfg->{'plugin'}}, ["DirectoryPlugin"]);

    $collect_cfg->{'format'}={};
    $collect_cfg->{'format'}->{'DocumentImages'}="true";
    $collect_cfg->{'format'}->{'DocumentText'} =
	"\"<h3>[Title]</h3>\\\\n\\\\n<p>[Text]\"";
    $collect_cfg->{'format'}->{'SearchVList'} =
	"\"<td valign=top>[link][icon][/link]</td><td>{If}{[parent(All': '):Title],[parent(All': '):Title]:}[link][Title][/link]</td>\"";

    # collection-level display metadata (uses the $collection global)
    $collect_cfg->{'collectionmeta'}={};
    $collect_cfg->{'collectionmeta'}->{'collectionname'}="\"$collection\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollection'}="\"_httpprefix_/collect/$collection/images/$collection.gif\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollectionsmall'}="\"_httpprefix_/collect/$collection/images/${collection}sm.gif\"";
    $collect_cfg->{'collectionmeta'}->{'collectionextra'} = "\"This is a collection rebuilt from CDROM.\"";

}
|
---|
669 |
|
---|
# Read the old build.cfg (if present) and fill in the index-related
# entries of the collect.cfg data: the index list, the default index
# (first one listed) and a collectionmeta display name for each index.
sub add_index_cfg {
    my ($buildfile) = @_;

    my $data={};
    $collect_cfg->{'indexes'}=[];
    if (-e $buildfile) {
	$data=&cfgread::read_cfg_file($buildfile, '^(this)$', '^(indexmap)$');
	foreach my $i (@{$data->{'indexmap'}}) {
	    # indexmap entries look like "<index>-><abbrev>"
	    my ($thisindex, $abbrev)= split (/\-\>/, $i);
	    push (@{$collect_cfg->{'indexes'}}, $thisindex);
	    $collect_cfg->{'defaultindex'} = $thisindex unless defined
		$collect_cfg->{'defaultindex'};
	    $name=&get_index_name($thisindex);
	    $thisindex=".$thisindex";
	    $collect_cfg->{'collectionmeta'}->{$thisindex} = "\"$name\"";
	}
    }
    else {
	print STDERR "Couldn't read $buildfile, could not add index data to configuration file\n";
    }

}
|
---|
692 |
|
---|
# Map an index specification from build.cfg to a human-readable name
# for the collect.cfg collectionmeta entry.
sub get_index_name {
    my ($thisindex) = @_;

    for ($thisindex) {
	return "paragraphs" if /paragraph/;
	return "chapters"   if /section.*text/;
	return "titles"     if /Title/;
    }
    return "other";
}
|
---|
700 |
|
---|
# Append a Hierarchy classifier entry for classification $classify
# (using hierarchy file $file and metadata name $metadata) to the
# global collect.cfg data.
sub add_classify_cfg {
    my ($classify, $metadata, $file) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    # map the one-letter classification code to a button name
    my %button_of = (
	'H' => "Howto",
	'C' => "Subject",
	'O' => "Organization",
	'T' => "Title",
    );
    my $title = $button_of{$classify};
    $title = "Other" unless defined $title;

    # basename of the hierarchy file
    my ($filename) = $file =~ /\/([^\/]*)$/;
    my $entry = "Hierarchy -hfile $filename -metadata $metadata -buttonname $title -sort Title";
    $entry .= " -hlist_at_top" if $title eq "Title";
    push (@{$collect_cfg->{'classify'}}, [$entry]);
}
|
---|
720 |
|
---|
# Write the accumulated $collect_cfg data out as a collect.cfg file.
# The regexps tell cfgread which keys are scalars, lists, hashes and
# lists-of-lists respectively.
sub output_cfg_file {
    my ($collfile) = @_;
    &cfgread::write_cfg_file($collfile, $collect_cfg,
			     '^(creator|maintainer|public|beta|defaultindex)$',
			     '^(indexes)$', '^(format|collectionmeta)$',
			     '^(plugin|classify)$');
}
|
---|
729 |
|
---|
# Slurp the entire GDBM database (via db2txt) into the global %infodb
# hash keyed by OID.  Values keep their embedded newlines; records are
# delimited by db2txt's line of 70 dashes.
sub read_gdbm {
    my ($filename) = @_;

    # list-form open: no shell involved, so no quoting problems with the path
    open (PIPEIN, '-|', 'db2txt', $filename) || die "couldn't open pipe from db2txt\n";
    my $line = ""; my $key = ""; my $value = "";
    while (defined ($line = <PIPEIN>)) {
	if ($line =~ /^\[([^\]]+)\]$/) {
	    $key = $1;
	} elsif ($line =~ /^-{70}$/) {
	    # record separator: store the accumulated entry
	    $infodb{$key} = $value;
	    $value = "";
	    $key = "";
	} else {
	    $value .= $line;
	}
    }
    close PIPEIN; # the old code leaked this pipe handle
}
|
---|