Changeset 1694 for trunk/gsdl/perllib/mgppbuildproc.pm
- Timestamp:
- 2000-11-23T11:55:50+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgppbuildproc.pm
r932 r1694 42 42 43 43 sub new { 44 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_; 44 my ($class, $collection, $source_dir, $build_dir, 45 $verbosity, $outhandle) = @_; 45 46 my $self = new docproc (); 47 48 # outhandle is where all the debugging info goes 49 # output_handle is where the output of the plugins is piped 50 # to (i.e. mg, gdbm etc.) 51 $outhandle = STDERR unless defined $outhandle; 46 52 47 53 $self->{'collection'} = $collection; … … 59 65 $self->{'num_sections'} = 0; 60 66 $self->{'num_bytes'} = 0; 67 $self->{'num_processed_bytes'} = 0; 68 $self->{'outhandle'} = $outhandle; 61 69 62 70 $self->{'indexing_text'} = 0; … … 70 78 $self->{'num_docs'} = 0; 71 79 $self->{'num_sections'} = 0; 80 $self->{'num_processed_bytes'} = 0; 72 81 $self->{'num_bytes'} = 0; 73 82 } … … 85 94 } 86 95 96 # num_bytes is the actual number of bytes in the collection 97 # this is normally the same as what's processed during text compression 87 98 sub get_num_bytes { 88 99 my $self = shift (@_); 89 100 90 101 return $self->{'num_bytes'}; 102 } 103 104 # num_processed_bytes is the number of bytes actually passed 105 # to mgpp for the current index 106 sub get_num_processed_bytes { 107 my $self = shift (@_); 108 109 return $self->{'num_processed_bytes'}; 91 110 } 92 111 … … 127 146 } 128 147 148 sub get_index { 149 my $self = shift (@_); 150 151 return $self->{'index'}; 152 } 153 129 154 sub set_classifiers { 130 155 my $self = shift (@_); … … 139 164 140 165 $self->{'indexing_text'} = $indexing_text; 166 } 167 168 sub get_indexing_text { 169 my $self = shift (@_); 170 171 return $self->{'indexing_text'}; 141 172 } 142 173 … … 240 271 else { print $handle "[$doc_OID.$section]\n"; } 241 272 242 # output the fact that this document is a document 243 #print $handle "<doctype>doc\n"; 273 # output the fact that this document is a document (unless doctype 274 # has been set to something else from within a plugin 275 my $dtype = $doc_obj->get_metadata_element ($section, "doctype"); 276 if (!defined $dtype || $dtype !~ /\w/) { 277 print $handle "<doctype>doc\n"; 278 } 244 279 245 280 # output whether this node contains text … … 251 286 252 287 # output all the section metadata 253 my $found_doctype = 0;288 #my $found_doctype = 0; 254 289 my $metadata = $doc_obj->get_all_metadata ($section); 255 290 foreach $pair (@$metadata) { 256 291 my ($field, $value) = (@$pair); 257 292 258 $found_doctype = 1 if $field eq "doctype";293 #$found_doctype = 1 if $field eq "doctype"; 259 294 if ($field ne "Identifier" && $field !~ /^gsdl/ && 260 295 defined $value && $value ne "") { … … 282 317 # (unless doctype was already output as part of 283 318 # metadata) 284 if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {285 print $handle "<doctype>doc\n";286 }319 #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) { 320 # print $handle "<doctype>doc\n"; 321 #} 287 322 288 323
Note:
See TracChangeset
for help on using the changeset viewer.