Changeset 215
- Timestamp:
- 1999-03-30T17:08:18+12:00 (25 years ago)
- Location:
- trunk/gsdl
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/bin/script/buildcol.pl
r130 r215 29 29 { 30 30 if (!parsargv::parse(\@ARGV, 31 'newgdbm', \$newgdbm, 31 32 'verbosity/\d+/2', \$verbosity, 32 33 'archivedir/.*/', \$archivedir, … … 36 37 die "\n"; 37 38 } 39 40 $newgdbm = 0 unless $newgdbm; 38 41 39 42 # get and check the collection … … 109 112 110 113 eval("\$builder = new $buildertype(\$collection, " . 111 "\$realarchivedir, \$realbuilddir, \$verbosity )");114 "\$realarchivedir, \$realbuilddir, \$verbosity, \$newgdbm)"); 112 115 die "$@" if $@; 113 116 -
trunk/gsdl/perllib/doc.pm
r97 r215 116 116 my $sectionref = $self; 117 117 118 while ( $section ne "") {118 while (defined $section && $section ne "") { 119 119 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/; 120 120 $num =~ s/^0+(\d)/$1/; # remove leading 0s … … 260 260 261 261 # returns a reference to a list of children 262 263 262 sub get_children { 264 263 my $self = shift (@_); … … 516 515 517 516 foreach $data (@{$section_ptr->{'metadata'}}) { 518 return $data->[1] if ( $data->[0] eq $field);517 return $data->[1] if (scalar(@$data) >= 2 && $data->[0] eq $field); 519 518 } 520 519 -
trunk/gsdl/perllib/mgbuilder.pm
r139 r215 4 4 package mgbuilder; 5 5 6 use classify; 6 7 use cfgread; 7 8 use colcfg; … … 24 25 25 26 sub new { 26 my ($class, $collection, $source_dir, $build_dir, $verbosity ) = @_;27 my ($class, $collection, $source_dir, $build_dir, $verbosity, $newgdbm) = @_; 27 28 28 29 # create an mgbuilder object … … 30 31 'source_dir'=>$source_dir, 31 32 'build_dir'=>$build_dir, 32 'verbosity'=>$verbosity}, $class; 33 'verbosity'=>$verbosity, 34 'newgdbm'=>$newgdbm}, $class; 33 35 34 36 35 37 # read in the collection configuration file 36 if (!-e "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg") { 38 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg"; 39 if (!-e $colcfgname) { 37 40 die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"; 38 41 } 39 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ("$ENV{'GSDLHOME'}/collect/" . 40 "$collection/etc/collect.cfg"); 41 42 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 42 43 43 44 # sort out subcollection indexes … … 76 77 die "\n"; 77 78 } 78 79 80 # load all the classifiers 81 if ($self->{'newgdbm'}) { 82 $self->{'classifiers'} = []; 83 if (open (COLCFG, $colcfgname)) { 84 while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) { 85 if (scalar(@$line) >= 2) { 86 my $key = shift (@$line); 87 if ($key eq "classify") { 88 my $classinfo = &classify::load_classifier($line); 89 push (@{$self->{'classifiers'}}, $classinfo) 90 if defined $classinfo; 91 } 92 } 93 } 94 close (COLCFG); 95 } else { 96 print STDERR "mgbuilder::new couldn't read the cfg file $colcfgname\n"; 97 print STDERR " no classifiers were loaded\n"; 98 } 99 } 79 100 80 101 # load up the document processor for building … … 92 113 93 114 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " . 
94 "\$source_dir, \$build_dir, \$verbosity )");115 "\$source_dir, \$build_dir, \$verbosity, \$newgdbm)"); 95 116 die "$@" if $@; 96 117 … … 436 457 print STDERR "\n*** creating the info database\n" if ($self->{'verbosity'} >= 1); 437 458 459 # init all the classifiers 460 if ($self->{'newgdbm'}) { 461 &classify::init_classifiers ($self->{'classifiers'}); 462 } 463 438 464 # set up the document processor 439 465 $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT'); 440 $self->{'buildproc'}->set_mode ('infodb'); 441 442 # collect the statistics for the text 443 # -b $maxdocsize sets the maximum document size to be 12 meg 466 if ($self->{'newgdbm'}) { 467 $self->{'buildproc'}->set_mode ('newinfodb'); 468 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 469 } else { 470 $self->{'buildproc'}->set_mode ('infodb'); 471 } 472 473 # create the infodatabase 444 474 if (!-e "$exedir/txt2db$exe" || !open (PIPEOUT, 445 475 "| $exedir/txt2db$exe $fulldbname")) { … … 449 479 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 450 480 "", {}, $self->{'buildproc'}); 481 482 # output classification information 483 if ($self->{'newgdbm'}) { 484 &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT'); 485 } 486 451 487 close (PIPEOUT); 452 488 } -
trunk/gsdl/perllib/mgbuildproc.pm
r139 r215 5 5 package mgbuildproc; 6 6 7 use classify; 8 use doc; 7 9 use docproc; 8 10 use util; … … 22 24 $self->{'build_dir'} = $build_dir; 23 25 $self->{'verbosity'} = $verbosity; 26 $self->{'classifiers'} = []; 24 27 $self->{'mode'} = "text"; 25 28 $self->{'index'} = "section:text"; … … 81 84 } 82 85 86 sub set_classifiers { 87 my $self = shift (@_); 88 my ($classifiers) = @_; 89 90 $self->{'classifiers'} = $classifiers; 91 } 92 83 93 sub process { 84 94 my $self = shift (@_); … … 87 97 $self->$method(@_); 88 98 } 99 100 sub newinfodb { 101 my $self = shift (@_); 102 my ($doc_obj, $filename) = @_; 103 my $handle = $self->{'output_handle'}; 104 # $handle = "main::STDOUT"; 105 106 # this was used in the old version 107 return if ($doc_obj eq 'classifications'); 108 109 my $doctype = $doc_obj->get_doc_type(); 110 111 # only output this document if it is one to be indexed 112 return if ($doctype ne "indexed_doc"); 113 114 # this is another document 115 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 116 117 my $section = $doc_obj->get_top_section (); 118 my $doc_OID = $doc_obj->get_OID(); 119 while (defined $section) { 120 # update a few statistics 121 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 122 $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 123 124 # output the section name 125 if ($section eq "") { print $handle "[$doc_OID]\n"; } 126 else { print $handle "[$doc_OID.$section]\n"; } 127 128 # output the fact that this document is a document 129 print $handle "<doctype>doc\n"; 130 131 # output whether this node contains text 132 if ($doc_obj->get_text_length($section) > 0) { 133 print $handle "<hastxt>1\n"; 134 } else { 135 print $handle "<hastxt>0\n"; 136 } 137 138 # output all the section metadata 139 my $metadata = $doc_obj->get_all_metadata ($section); 140 foreach $pair (@$metadata) { 141 my ($field, $value) = (@$pair); 142 143 if ($field ne "Identifier" && $field !~ /^gsdl/ && 144 defined $value && 
$value ne "") { 145 # escape problematic stuff 146 $value =~ s/\\/\\\\/g; 147 $value =~ s/\n/\\n/g; 148 $value =~ s/\r/\\r/g; 149 150 print $handle "<$field>$value\n"; 151 } 152 } 153 154 # output a list of children 155 my $children = $doc_obj->get_children (); 156 if (scalar(@$children) > 0) { 157 print $handle "<contains>"; 158 my $firstchild = 1; 159 foreach $child (@$children) { 160 print $handle ";" unless $firstchild; 161 $firstchild = 0; 162 if ($child eq "") { print $handle "$doc_OID"; } 163 else { print $handle "$doc_OID.$child"; } 164 } 165 print $handle "\n"; 166 } 167 168 # output the matching document number 169 print $handle "<docnum>$self->{'num_sections'}\n"; 170 171 print $handle '-' x 70, "\n"; 172 173 174 # output a database entry for the document number 175 print $handle "[$self->{'num_sections'}]\n"; 176 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 177 else { print $handle "<section>$doc_OID.$section\n"; } 178 print $handle '-' x 70, "\n"; 179 180 181 $section = $doc_obj->get_next_section($section); 182 } 183 184 # classify this document 185 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 186 187 } 188 189 89 190 90 191 sub infodb {
Note:
See TracChangeset
for help on using the changeset viewer.