source: trunk/gsdl/bin/script/unbuildv1.pl@ 3324

Last change on this file since 3324 was 3324, checked in by sjboddie, 22 years ago

Removed the really old unindex.pl script and added the not quite so old
unbuildv1.pl and unbuildv2.pl. These scripts can be used to convert built
indexes back to Greenstone Archive format documents. unbuildv1.pl handles
very old Greenstone collections like those on the various HumanInfo CDROMs
while unbuildv2.pl handles more recently created collections. unbuildv1.pl
was altered and now works but unbuildv2.pl probably still needs some work.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# unbuildv1.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# this program will decompress all the text from a built index
29# and return it to gml format - this version of the program is designed to
30# work on collections built with the first version of the greenstone software
31# it extracts the browse classifications from the gdbm file, and recreates
32# them. It also creates a default collect.cfg file.
33
34# To run, this program needs the old archive directory (called archives),
35# the index directory with the text and one other index, not paragraphs!! -
36# use one of the section indexes. Also needs the build.cfg file (in the
37# index dir) - used in creating a collect.cfg file. If build.cfg is not
38# present, the parts of the collect.cfg file relating to indexes will not
39# be entered.
40
41# Stefan altered this script slightly August 2002 to use
42# $GSDLHOME/bin/$GSDLOS/mgquery_old instead of the old mgquery in
43# nzdl-1.2. mgquery_old is a statically linked binary compiled from
44# mg-1.3d. It was compiled with SHORT_SUFFIX set so doesn't need the index
45# file extensions to be altered.
46# Also changed the command line options a little.
47
BEGIN {
    # Both environment variables must point at the Greenstone install
    # before the perllib path (and mgquery_old) can be located.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    # seed for generating unique bareword filehandle names (see openpipe)
    $FileHandle = 'FH000';
}
54
55use doc;
56use docsave;
57use util;
58use parsargv;
59use FileHandle;
60use English;
61use cfgread;
62use unicode;
63select STDERR; $| = 1;
64select STDOUT; $| = 1;
65
66
# globals
$collection = "";        # the collection name
$index = "";             # the selected index (like stt/unu)
$textdir = "";           # the textdir (like text/unu)
$toplevelinfo = [];      # list of top level document OIDs
%infodb = ();            # in-memory copy of the GDBM database (key -> record)
$classifyinfo = [];      # list of classifications (C, T, ...)
$doc_classif_info = {};  # hash of OIDs -> classifications they belong to
$collect_cfg = {};       # data for the new collect.cfg configuration file
$mgread = ++$FileHandle; # filehandle names for the mgquery pipe
$mgwrite = ++$FileHandle;
78
sub print_usage {
    # Emit a short usage summary for the command-line options to STDERR.
    print STDERR <<"EOT";

 usage: $0 [options]

 options:
 -verbosity number 0=none, 3=lots
 -old directory The directory in which the old collection lives
 -new directory The directory in which to put the new collection
EOT
}
86
87&main ();
88
sub main {
    # Drive the whole conversion: parse options, create the new collection
    # directories, mine the GDBM database for classifications and top level
    # document OIDs, write the collect.cfg, then pull each document's text
    # back out of mg and save it as a Greenstone Archive document.
    if (!parsargv::parse(\@ARGV,
                         'verbosity/\d+/2', \$verbosity,
                         'old/.*/', \$old,
                         'new/.*/', \$new)) {
        &print_usage();
        die "\n";
    }

    die "$old does not exist\n\n" unless (-d $old);
    my $old_indexdir = &util::filename_cat($old, "index");
    my $old_archivedir = &util::filename_cat($old, "archives");

    my $new_archivedir = &util::filename_cat($new, "archives");
    my $new_etcdir = &util::filename_cat($new, "etc");
    if (!-e $new_archivedir) {
        &util::mk_all_dir($new_archivedir);
    }
    if (!(-d $new_etcdir)) {
        &util::mk_all_dir($new_etcdir);
    }

    my $gdbmfile = &get_gdbmfile($old_indexdir); # sets $collection and $textdir
    &set_index($old_indexdir); # sets $index (just chooses one index)

    my $buildcfgfile = &util::filename_cat($old_indexdir, "build.cfg");
    my $colcfgfile = &util::filename_cat($new_etcdir, "collect.cfg");

    # don't need this but giving it a value stops an annoying undefined
    # value warning
    $ENV{'GSDLCOLLECTDIR'} = 'tmp';

    &add_default_cfg();
    &add_index_cfg($buildcfgfile);

    # work out all the classifications from the gdbm file; info for each doc
    # (ie which classifications it belongs to) is kept in $doc_classif_info
    &get_classifyinfo($gdbmfile); # puts a list of classifications into classifyinfo
    &get_toplevel_OID($gdbmfile); # puts a list of the top level document OIDs into $toplevelinfo

    # read ldb file into %infodb
    &read_gdbm($gdbmfile);

    # this makes the files specifying the hierarchy of subjects, titles etc
    foreach my $classify (@$classifyinfo) {
        &make_info_file($new_etcdir, $classify);
    }

    # write out the collect.cfg
    &output_cfg_file($colcfgfile);

    &openmg($old_indexdir);

    # the archive information file records the documents written
    my $archive_info_filename = &util::filename_cat ($new_archivedir, "archives.inf");
    my $archive_info = new arcinfo ();

    # create a docsave object to process the documents
    my $processor = new docsave ("", $archive_info, $verbosity);
    $processor->setarchivedir ($new_archivedir);

    my ($doc_obj, $hashref, $children);
    print STDERR "processing documents now\n" if $verbosity >=2;
    foreach $oid (@$toplevelinfo) {
        $value = $infodb{$oid};
        $hashref={};
        $children = [];
        &get_metadata($value, $hashref);
        $doc_obj = new doc ();
        $doc_obj->set_OID($oid);
        my ($olddir) = $hashref->{'archivedir'}; # old dir for this doc, where images are stored
        $top = $doc_obj->get_top_section();
        &add_section_content ($doc_obj, $top, $hashref, $olddir);
        &add_classification_metadata($oid, $doc_obj, $top);
        &add_cover_image($doc_obj, $old_archivedir, $olddir);
        &get_children($hashref, $children);
        &recurse_sections($doc_obj, $children, $oid, $top, $olddir) if (defined ($children));

        # change OID to new (hashed) format
        $doc_obj->delete_OID();
        $doc_obj->set_OID();
        $processor->process($doc_obj);
#       last;
    }
    print STDERR "\n";

    &closemg();

    # write out the archive information file
    $archive_info->save_info($archive_info_filename);
}
182
183# returns the path to the gdbm info database - also
184# sets the $collection and $textdir global variable
# Returns the path to the gdbm info database.  As a side effect sets the
# globals $collection (from the database's basename) and $textdir (the
# text directory, e.g. "text/unu").  Dies if either cannot be found.
sub get_gdbmfile {
    my ($indexdir) = @_;
    my ($gdbmfile);

    opendir (DIR, $indexdir) || die "Couldn't open directory $indexdir\n\n";
    my @conts = readdir DIR;
    # was close(), which silently fails on directory handles
    closedir DIR;

    foreach my $file (@conts) {
        if ($file =~ /text$/) {
            $textdir = $file;
            last;
        }
    }
    die "No text directory found in $indexdir\n\n"
        unless defined $textdir && $textdir =~ /text$/;

    $gdbmfile = &util::filename_cat ($indexdir, $textdir);

    opendir (DIR, $gdbmfile) || die "Couldn't open directory $gdbmfile\n\n";
    @conts = readdir DIR;
    closedir DIR;

    # the database file is named <collection>.ldb (or .bdb)
    foreach my $file (@conts) {
        if ($file =~ /^(.*?)\.(?:ldb|bdb)$/) {
            $collection = $1;
            $gdbmfile = &util::filename_cat ($gdbmfile, $file);
            last;
        }
    }

    if (defined $collection && $collection =~ /\w/) {
        $textdir = &util::filename_cat ($textdir, $collection);
    } else {
        die "collection global wasn't set\n";
    }
    return $gdbmfile if (-e $gdbmfile);
    die "Couldn't find gdbm info database in $indexdir\n\n";
}
224
225
226#this function will probably need to be changed for any different file formats
# Scan the GDBM dump for top level document OIDs (keys like "B.12") and
# push them onto the global @$toplevelinfo list.
# This function will probably need to be changed for different file formats.
sub get_toplevel_OID {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all top level sections from $gdbmfile\n" if $verbosity >= 2;

    # db2txt separates records with a line of 70 dashes; read one record
    # at a time.  local() restores $/ automatically on exit.
    local $/ = '-' x 70;

    while (defined (my $entry = <DB2TXT>)) {
        next unless $entry =~ /\w/;  # ignore blank entries
        $entry =~ s/\n+/\\n/g;       # fold real newlines into literal \n
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        # assumes a top level OID looks like B.12 (exactly one dot)
        next unless ($key =~ /B\.\d*$/);

        push (@$toplevelinfo, $key);
    }

    # was never closed, leaking the pipe (and a zombie db2txt)
    close DB2TXT;
}
249
250# gets all the metadata from a gdbm file entry, and puts it into a hashref
251#this has changed for old style formats
# Unpack one db2txt record (a string of <key>value lines) into $hashref,
# renaming the terse v1 keys to their archive-format names.  Repeated keys
# are concatenated.  Also sets $$hashref{'hastxt'}: 1 for a leaf section
# with text, 0 for an internal node (one with a "contains" entry).
sub get_metadata {
    my ($gdb_str_ref, $hashref) = @_;

    my $has_text = 1;

    for my $pair (split /\n/, $gdb_str_ref) {
        my ($key, $value) = $pair =~ /^<([^>]*)>(.*?)$/;

        next if $key eq "p";   # not wanted
        next if $key eq "j";   # not wanted

        if ($key eq "t") {
            $key = "Title";
        } elsif ($key eq "d") {
            $key = "docnum";
        } elsif ($key eq "o") {
            $key = "archivedir";
            $value =~ s/HASH/hash/;  # dir names are lower case on disk
        } elsif ($key eq "c") {
            $key = "contains";
            $has_text = 0;           # internal sections carry no text
        }

        $$hashref{$key} .= $value;
    }

    $$hashref{'hastxt'} = $has_text;
}
283
284# takes a hashref containing the metadata for a gdbmfile entry, and
285# extracts the childrens numbers (from the 'contains' entry). assumes
286# format is like B.14.1:B.24.2:B.24.3 returns a list like 1 2 3 4
# Pull the child section numbers out of a node's "contains" metadata and
# push them onto @$children.  "contains" looks like B.14.1:B.14.2:B.14.3;
# the pushed values are the bare trailing numbers (1, 2, 3).  Entries may
# end in a literal "\n" left over from record munging.  Does nothing when
# there is no "contains" entry.
sub get_children {
    my ($hashref, $children) = @_;

    my $contains = $hashref->{'contains'};
    return unless defined $contains;

    for my $id (split /:/, $contains) {
        # keep the component after the last dot
        my ($num) = $id =~ /\.([^\.]*)(?:\\n|$)/;
        push (@$children, $num);
    }
}
303
304#takes a hashref containing the metadata for a gdbmfile entry, and extracts
305#the childrens numbers (from the 'contains' entry).
306#assumes format is like B.14.1:B.24.2:B.24.3
307#returns a list with the full child name ie B.14.1 B.14.2 etc
308#used for classification stuff
# Like get_children(), but fills @$children with the full child OIDs
# (e.g. B.14.1 B.14.2) instead of just the trailing numbers; assumes the
# "contains" entry looks like B.14.1:B.24.2:B.24.3.  Used for the
# classification stuff.  Does nothing when there is no "contains" entry.
sub get_whole_children {
    my ($hashref, $children) = @_;

    my $contains = $hashref->{'contains'};
    @$children = split /:/, $contains if defined $contains;
}
322
# Recursively rebuild the section tree below $parentsection, pulling each
# child's record from the global %infodb and adding its metadata and text
# to $doc_obj via add_section_content().
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $olddir) = @_;

    for my $child (sort { $a <=> $b } @$children) {
        my $section = "$parentsection.$child";
        my $oid = "$parentoid.$child";

        $doc_obj->create_named_section($section);

        my $meta = {};
        &get_metadata($infodb{$oid}, $meta);      # child's metadata

        my $grandchildren = [];
        &get_children($meta, $grandchildren);     # child's children

        # add content for the current section
        &add_section_content($doc_obj, $section, $meta, $olddir);

        # then descend into any children of this section
        &recurse_sections($doc_obj, $grandchildren, $oid, $section, $olddir)
            if defined $grandchildren;
    }
}
340
sub add_section_content {
    # Add one section's metadata and text to $doc_obj.  $hashref holds the
    # section's metadata (from get_metadata); $olddir is the old archive
    # directory for this document, used to resolve image paths embedded in
    # the text.
    my ($doc_obj, $cursection, $hashref, $olddir) = @_;

    foreach $key (keys %$hashref) {
        # these keys are structural bookkeeping, not real metadata
        next if $key =~ /(contains|docnum|hastxt|doctype|archivedir|classifytype)/i;
        # multi-valued metadata is @-separated in the GDBM record
        my @items = split /@/, $hashref->{$key};
        map {$doc_obj->add_metadata ($cursection, $key, $_); } @items;

    }

    my ($hastext) = $hashref->{'hastxt'} =~ /(0|1)/;
    if ($hastext) {

        # pull the compressed text for this section out of mg
        my ($docnum)= $hashref->{'docnum'} =~ /(\d*)/;
        my $text = &get_text($docnum);
        $doc_obj->add_text ($cursection, $text);

        # re-associate any images referenced from <img> tags in the text
        my (@images) = $text =~ /<img.*?src=\"([^\"]*)\"[^>]*>/g;

        if (scalar(@images)>0) {

            foreach $img (@images) {
                my ($assoc_file) = $img =~ /([^\/\\]*\..*)$/; # the name of the image
                $img =~ s/_httpcollection_/$old/; # replace _httpcollection_ with path to old collection
                $olddir =~ s/HASH/hash/;
                $img =~ s/_thisOID_/$olddir/; # replace _thisOID_ with old archivedir name (the path to the image)
                $img = lc $img; # doc names and path upper case in file, lower
                                # case in directory
                if (-e $img) {
                    $doc_obj->associate_file($img, $assoc_file);
                } else {
                    print STDERR "WARNING: $img does not exist\n";
                }
            }

        }
    }
}
380
381
# Copy any classification entries recorded for $oid (by add_doc_metadata,
# via the global $doc_classif_info) onto the document as ordinary metadata.
sub add_classification_metadata {
    my ($oid, $doc_obj, $cursection) = @_;

    my $classifications = $doc_classif_info->{$oid};
    return unless defined $classifications;

    while (my ($meta_name, $values) = each %$classifications) {
        $doc_obj->add_metadata($cursection, $meta_name, $_) for @$values;
    }
}
396# picks up the cover image "cover.jpg" from the old archives directory.
# Picks up the cover image "cover.jpg" from the old archives directory and
# attaches it to the document as an associated file; warns if missing.
sub add_cover_image {
    my ($doc_obj, $archivedir, $olddir) = @_;

    my $cover_name = "cover.jpg";
    my $cover_path = &util::filename_cat($archivedir, $olddir, $cover_name);

    if (-e $cover_path) {
        $doc_obj->associate_file($cover_path, $cover_name);
    } else {
        print STDERR "WARNING: couldn't find $cover_path\n";
    }
}
408
# Choose an index for mgquery to use: the first directory in $indexdir
# that is not the text directory, not a dot entry, and not a paragraph
# index.  Sets the global $index (e.g. "stt/unu").  Requires the
# $collection global to have been set by get_gdbmfile().
sub set_index {
    my ($indexdir) = @_;

    # check that $collection has been set
    die "collection global was not set\n"
        unless defined $collection && $collection =~ /\w/;

    # find an index (just use first non-text and non-paragraph directory
    # we come across in $indexdir)
    opendir (INDEXDIR, $indexdir) || die "couldn't open directory $indexdir\n";
    my @indexes = readdir INDEXDIR;
    # was close(), which silently fails on directory handles
    closedir INDEXDIR;
    foreach my $i (@indexes) {
        next if $i =~ /text$/i || $i =~ /\./ || $i =~ /^p/i;
        $index = &util::filename_cat ($i, $collection);
        last;
    }
}
427
428
429#########################################################################
430
431################ functions involving mg ################################
sub get_text {
    # Fetch the decompressed text of mg document $docnum through the
    # mgquery pipe, reading until the "</mg>" terminator configured in
    # openmg().  Returns the text as a single string.
    my ($docnum) = @_;

    print STDERR "." if $verbosity >= 2;
    &mgcommand ($docnum);

    <$mgread>; # eat the document separator

    my $text = "";
    my $line = "";

    while (defined ($line = <$mgread>))
    {
        last if $line =~ /^<\/mg>/;
        $text .= $line;
    }

    # Read in the last statement, which should be:
    # "dd documents retrieved."
    <$mgread>;

    return $text;
}
455
456
457sub numerically {$a <=> $b;}
458
459
460
461# mg stuff
462
sub openmg {
    # Start mgquery_old talking to the chosen index ($index) and text
    # directory ($textdir), and configure it so that docnum queries return
    # raw document text terminated by "</mg>".
    my ($indexdir) = @_;

    die "Unable to start mgquery." unless
        &openpipe($mgread, $mgwrite, "mgquery_old -d $indexdir -f $index -t $textdir");

    $mgwrite->autoflush();
    &mgcommand('.set expert true');
    &mgcommand('.set terminator "</mg>\n"');
    &mgcommand('.set mode text');
    &mgcommand('.set query docnums');
    &mgcommand('.set term_freq off');
}
476
sub closemg {
    # Tell mgquery to quit and close both ends of the pipe.
    &mgcommand (".quit");
    close($mgread);
    close($mgwrite);
}
482
sub mgcommand {
    # Send one command line to mgquery and swallow the echoed command.
    my ($command) = @_;

    return if $command =~ /^\s*$/; # ignore whitespace-only commands
    #print STDERR "command: $command\n";
    print $mgwrite "$command\n";

    # eat up the command executed which is echoed
    <$mgread>;
}
493
494# openpipe(READ, WRITE, CMD)
495#
496# Like open2, except CMD's stderr is also redirected.
497#
# openpipe(READ, WRITE, CMD)
#
# Like open2, except CMD's stderr is also redirected to the write end.
# Dies on pipe, fork or exec failure; returns 1 on success.
sub openpipe
{
    my ($read, $write, $cmd) = @_;
    my ($child_read, $child_write);

    $child_read = ++$FileHandle;
    $child_write = ++$FileHandle;

    pipe($read, $child_write) || die "Failed pipe($read, $child_write): $!";
    pipe($child_read, $write) || die "Failed pipe($child_read, $write): $!";

    # Perl's fork returns undef on failure, NOT a negative value as in C;
    # the old "< 0" test never fired and the child branch could run in the
    # parent when fork failed.
    my $pid = fork;
    die "Failed fork: $!" unless defined $pid;

    if ($pid == 0) {
        # child: wire the pipe ends to stdin/stdout/stderr and exec CMD
        close($read);
        close($write);
        open(STDIN, "<&$child_read");
        open(STDOUT, ">&$child_write");
        open(STDERR, ">&$child_write");
        exec($cmd);
        die "Failed exec $cmd: $!";
    }

    # parent: close the child's ends and unbuffer ours
    close($child_read);
    close($child_write);

    $write->autoflush();
    $read->autoflush();

    return 1;
}
530
531
532######################################################################
533
############# functions to do with the classification stuff ##########
# Scan the GDBM dump for classification OIDs (keys that are a single
# capital letter, like C or T) and push them onto the global
# @$classifyinfo list.
sub get_classifyinfo {
    my ($gdbmfile) = @_;

    open (DB2TXT, "db2txt $gdbmfile |") || die "couldn't open pipe to db2txt\n";
    print STDERR "Finding all classification sections from $gdbmfile\n" ;

    # db2txt separates records with a line of 70 dashes; read one record
    # at a time.  local() restores $/ automatically on exit.
    local $/ = '-' x 70;

    while (defined (my $entry = <DB2TXT>)) {
        next unless $entry =~ /\w/;  # ignore blank entries
        $entry =~ s/\n+/\\n/g;       # fold real newlines into literal \n
        my ($key, $value) = $entry =~ /\[([^\]]*)\](.*)/;

        # assumes a classification OID is a single letter, like C or T
        next unless ($key =~/^[A-Z]$/);

        push (@$classifyinfo, $key);
    }

    # was never closed, leaking the pipe (and a zombie db2txt)
    close DB2TXT;
}
557
558#this creates the classification files needed for the hierarchy classifier
559#used for subjects, titles, orgs etc
560#also adds in entries to the collect_cfg hash
sub make_info_file {
    # Write the Hierarchy classifier hfile (etc/CL.<classifier>.txt) for
    # one classification tree, walking the tree via process_entry(), and
    # register the classifier in the collect.cfg data via
    # add_classify_cfg().  Used for subjects, titles, orgs etc.
    my ($etcdir, $classifier) = @_;
    my $info_file = "";

    $info_file = &util::filename_cat($etcdir, "CL.${classifier}.txt");
    $classifier_name = "CL.$classifier";
    print STDERR "classification $classifier will be called CL.$classifier\n";

    open (OUTDOC, ">$info_file" ) || die "couldn't open file $info_file\n";

    my $entry = $infodb{$classifier};
    #print STDERR "entry = $entry\n";
    my $hashref = {};
    &get_metadata($entry, $hashref);
    my $children=[];
    &get_whole_children($hashref, $children);
    foreach $child (@$children) {
        &process_entry(OUTDOC, $classifier_name, $child);
    }

    close OUTDOC;

    &add_classify_cfg($classifier, $classifier_name, $info_file);
}
586
587
sub process_entry {
    # Recursively write one classification node to the hfile ($handle),
    # descending into sub-classifications; document-level leaves are
    # recorded against their OID with add_doc_metadata() instead.
    my ($handle, $classifier_name, $classify_id) = @_;

    my $value = $infodb{$classify_id};
    my $hashref={};
    &get_metadata($value, $hashref);
    my $title = $hashref->{'Title'};

    &add_line($handle, $classify_id, $title);

    my $children = [];
    &get_whole_children($hashref, $children);
    foreach $child (@$children) {
        if (&is_document($child)) {
            # leaf: a document hangs off this classification node
            &add_doc_metadata($child, $classifier_name, $classify_id);
        }
        else {
            &process_entry($handle, $classifier_name, $child);
        }
    }
}
609
# Record, against a document OID, which node of which classifier it hangs
# off (in the global $doc_classif_info).  Consulted later by
# add_classification_metadata().
sub add_doc_metadata {
    my ($doc_id, $classifier_name, $classifier_id) = @_;

    # push autovivifies the per-document, per-classifier list as needed
    push (@{ $doc_classif_info->{$doc_id}->{$classifier_name} }, $classifier_id);
}
620
# Write one row of a Hierarchy hfile: "<id>\t<num>\t"<title>"".
sub add_line {
    my ($handle, $classify_id, $title) = @_;

    $title = &unicode::ascii2utf8(\$title);

    # strip the leading classifier letter (e.g. the "C." from "C.1.2")
    my ($node_num) = $classify_id =~ /^[A-Z]\.(.*)$/;
    print $handle "$classify_id\t$node_num\t\"$title\"\n";
}
627
# True (1) for document OIDs ("B.<digit>..."), false (0) for
# classification OIDs.
sub is_document {
    my ($oid) = @_;
    return ($oid =~ /^B\.\d/) ? 1 : 0;
}
633
634########################################################################
635
636########## stuff for producing collect.cfg file ###########################
637
# Fill the global $collect_cfg with default settings for a rebuilt
# collection: creator/maintainer, standard plugins, format strings, and
# collection metadata derived from the global $collection name.
sub add_default_cfg {

    # Backticks return "" (still defined) when the command merely produces
    # no output or fails inside the shell, so the old "unless defined"
    # fallbacks could never fire; test for a non-blank result instead.
    $username = `whoami`;
    $username = `logname` unless defined $username && $username =~ /\S/;
    $username = "a_user"  unless defined $username && $username =~ /\S/;
    $username =~ s/\n//;

    $collect_cfg->{'creator'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'maintainer'}="$username\@cs.waikato.ac.nz";
    $collect_cfg->{'public'}="true";

    $collect_cfg->{'plugin'}=[];
    push (@{$collect_cfg->{'plugin'}}, ["GAPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["ArcPlug"]);
    push (@{$collect_cfg->{'plugin'}}, ["RecPlug"]);

    $collect_cfg->{'format'}={};
    $collect_cfg->{'format'}->{'DocumentImages'}="true";
    $collect_cfg->{'format'}->{'DocumentText'} =
        "\"<h3>[Title]</h3>\\\\n\\\\n<p>[Text]\"";
    $collect_cfg->{'format'}->{'SearchVList'} =
        "\"<td valign=top>[link][icon][/link]</td><td>{If}{[parent(All': '):Title],[parent(All': '):Title]:}[link][Title][/link]</td>\"";

    $collect_cfg->{'collectionmeta'}={};
    $collect_cfg->{'collectionmeta'}->{'collectionname'}="\"$collection\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollection'}="\"_httpprefix_/collect/$collection/images/$collection.gif\"";
    $collect_cfg->{'collectionmeta'}->{'iconcollectionsmall'}="\"_httpprefix_/collect/$collection/images/${collection}sm.gif\"";
    $collect_cfg->{'collectionmeta'}->{'collectionextra'} = "\"This is a collection rebuilt from CDROM.\"";
}
667
# Read the old build.cfg (if present) and copy its indexmap entries into
# the new collect.cfg data: the index list, a default index (the first
# one seen), and a human-readable collectionmeta label for each index.
# Warns and leaves the index list empty when build.cfg cannot be found.
sub add_index_cfg {
    my ($buildfile) = @_;

    $collect_cfg->{'indexes'} = [];

    unless (-e $buildfile) {
        print STDERR "Couldn't read $buildfile, could not add index data to configuration file\n";
        return;
    }

    my $data = &cfgread::read_cfg_file($buildfile, '^(this)$', '^(indexmap)$');
    foreach my $map_entry (@{ $data->{'indexmap'} }) {
        my ($thisindex, $abbrev) = split (/\-\>/, $map_entry);

        push (@{ $collect_cfg->{'indexes'} }, $thisindex);

        # the first index listed becomes the default
        $collect_cfg->{'defaultindex'} = $thisindex
            unless defined $collect_cfg->{'defaultindex'};

        my $label = &get_index_name($thisindex);
        $collect_cfg->{'collectionmeta'}->{".$thisindex"} = "\"$label\"";
    }
}
690
# Map an mg index name (e.g. "section:text") to a human-readable
# collectionmeta label for the collect.cfg file.
sub get_index_name {
    my ($thisindex) = @_;

    my @label_map = (
        [ qr/paragraph/,     "paragraphs" ],
        [ qr/section.*text/, "chapters"   ],
        [ qr/Title/,         "titles"     ],
    );

    for my $entry (@label_map) {
        my ($pattern, $label) = @$entry;
        return $label if $thisindex =~ $pattern;
    }
    return "other";
}
698
# Append a Hierarchy classifier line for classification $classify (e.g.
# 'T') to the collect.cfg data, pointing at the hfile just written by
# make_info_file().
sub add_classify_cfg {
    my ($classify, $metadata, $file) = @_;

    $collect_cfg->{'classify'} = [] unless defined $collect_cfg->{'classify'};

    my %button_for = (
        'H' => "Howto",
        'C' => "Subject",
        'O' => "Organization",
        'T' => "Title",
    );
    my $title = $button_for{$classify};
    $title = "Other" unless defined $title;

    # the hfile is referenced by filename only (it lives in the etc dir)
    my ($filename) = $file =~ /\/([^\/]*)$/;

    push (@{$collect_cfg->{'classify'}},
          ["Hierarchy hfile=$filename metadata=$metadata buttonname=$title sort=Title"]);
}
717
sub output_cfg_file {
    # Write the accumulated $collect_cfg data out as a collect.cfg file.
    # The regexps tell cfgread which keys are scalars, lists, hashes and
    # lists-of-lists respectively.
    my ($collfile) = @_;
    &cfgread::write_cfg_file($collfile, $collect_cfg,
                             '^(creator|maintainer|public|beta|defaultindex)$',
                             '^(indexes)$', '^(format|collectionmeta)$',
                             '^(plugin|classify)$');
}
726
# Slurp the whole GDBM database (via db2txt) into the global %infodb,
# mapping each [key] header to the record text that follows it.  Records
# are terminated by a line of 70 dashes.
sub read_gdbm {
    my ($filename) = @_;

    # list-form pipe open passes $filename as a real argv element instead
    # of interpolating it into a shell command line (the old form broke on
    # metacharacters even though it quoted the name)
    open (PIPEIN, "-|", "db2txt", $filename) || die "couldn't open pipe from db2txt\n";

    my $line = ""; my $key = ""; my $value = "";
    while (defined ($line = <PIPEIN>)) {
        if ($line =~ /^\[([^\]]+)\]$/) {
            $key = $1;                  # start of a new record
        } elsif ($line =~ /^-{70}$/) {
            $infodb{$key} = $value;     # record separator: store and reset
            $value = "";
            $key = "";
        } else {
            $value .= $line;
        }
    }
    close PIPEIN;
}
Note: See TracBrowser for help on using the repository browser.