Changeset 11994
- Timestamp:
- 2006-07-04T15:06:34+12:00 (18 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/basebuilder.pm
r11965 r11994 274 274 } 275 275 276 277 276 278 sub make_infodatabase { 277 279 my $self = shift (@_); … … 299 301 # init all the classifiers 300 302 &classify::init_classifiers ($self->{'classifiers'}); 303 304 305 my $reconstructed_docs = undef; 306 if ($self->{'keepold'}) { 307 # reconstruct doc_obj metadata from gdbm for all docs 308 $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($fulldbname); 309 } 301 310 302 311 # set up the document processor … … 319 328 $self->{'buildproc'}->set_indexing_text (0); 320 329 $self->{'buildproc'}->set_store_text(1); 321 # make_infodatabase does not support incremental build 322 # => full reset needed 323 $self->{'buildproc'}->zero_reset(); 330 331 # make_infodatabase needs full reset even for incremental build 332 # as incremental works by reconstructing all docs from GDBM and 333 # then adding in the new ones 334 $self->{'buildproc'}->zero_reset(); 335 336 if ($self->{'keepold'}) { 337 # create flat classify structure, ready for new docs to be added 338 foreach my $doc_obj ( @$reconstructed_docs ) { 339 print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n"; 340 $self->{'buildproc'}->process($doc_obj,undef); 341 } 342 } 343 324 344 325 345 # this has changed to only output collection meta if its -
trunk/gsdl/perllib/basebuildproc.pm
r11793 r11994 296 296 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc"); 297 297 298 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; 299 $archivedir = "" unless defined $archivedir; 300 $archivedir =~ s/\\/\//g; 301 $archivedir =~ s/^\/+//; 302 $archivedir =~ s/\/+$//; 303 304 # resolve the final filenames of the files associated with this document 305 $self->assoc_files ($doc_obj, $archivedir); 298 my $archivedir = ""; 299 300 if (defined $filename) 301 { 302 # doc_obj derived directly from file 303 304 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; 305 $dir = "" unless defined $dir; 306 $dir =~ s/\\/\//g; 307 $dir =~ s/^\/+//; 308 $dir =~ s/\/+$//; 309 310 $archivedir = $dir; 311 312 # resolve the final filenames of the files associated with this document 313 $self->assoc_files ($doc_obj, $archivedir); 314 } 315 else 316 { 317 # doc_obj reconstructed from GDBM (has metadata, doc structure but no text) 318 my $top_section = $doc_obj->get_top_section(); 319 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir"); 320 } 321 306 322 307 323 #GRB: moved 1/06/2004 from GRB01062004 … … 340 356 } 341 357 342 # output whether this node contains text 343 if ($doc_obj->get_text_length($section) > 0) { 344 print $handle "<hastxt>1\n"; 345 } else { 346 print $handle "<hastxt>0\n"; 358 # Output whether this node contains text 359 # 360 # If doc_obj reconstructed from GDBM file then no need to 361 # explicitly add <hastxt> as this is preserved as metadata when 362 # the GDBM file is loaded in 363 364 if (defined $filename) 365 { 366 # doc_obj derived directly from file 367 if ($doc_obj->get_text_length($section) > 0) { 368 print $handle "<hastxt>1\n"; 369 } else { 370 print $handle "<hastxt>0\n"; 371 } 347 372 } 348 373 … … 380 405 } 381 406 382 # output archivedir if at top level 383 if ($section eq $doc_obj->get_top_section()) { 384 print $handle "<archivedir>$archivedir\n"; 407 408 # If doc_obj reconstructed from GDBM file then 
no need to 409 # explicitly add <archivedir> as this is preserved as metadata when 410 # the GDBM file is loaded in 411 412 if (defined $filename) 413 { 414 # output archivedir if at top level 415 if ($section eq $doc_obj->get_top_section()) { 416 print $handle "<archivedir>$archivedir\n"; 417 } 385 418 } 386 419 … … 389 422 print $handle "<thistype>$thistype\n"; 390 423 } 424 391 425 392 426 if ($self->{'gdbm_level'} eq "document") { -
trunk/gsdl/perllib/classify.pm
# classify.pm, changeset r11644 -> r11994.
#
# Helpers that rebuild document objects (metadata and section structure)
# from the records of an existing GDBM info database.
#
# Hunk context (unchanged lines of the file, not reproduced as code here):
# `require AllList;` and `use gsprintf;` precede the import below, and the
# subs are inserted just before the comment that introduces classify_doc.

use GDBM_File;


# Takes a hashref containing the metadata for a gdbm file entry and extracts
# the children's numbers from its 'contains' entry.
# Assumes the format is ".1;".2;".3
#
# Returns an arrayref of child numbers, or undef when the entry has no
# 'contains' value.
sub get_children {
    my ($doc_db_hash) = @_;

    my $children = undef;

    # Work on a copy so the caller's hash is left untouched.
    my $contains = $doc_db_hash->{'contains'};
    if (defined($contains)) {
        $contains =~ s/\@$//;      # remove trailing @
        $contains =~ s/^\"\.//;    # remove initial ".
        $children = [ split /\;\"\./, $contains ];
    }

    return $children;
}


# Recreates, depth first, the sections below $parentsection in $doc_obj.
#
#   $doc_obj       - document object being rebuilt
#   $children      - arrayref of child numbers (as returned by get_children);
#                    nothing is done when it is undef
#   $parentoid     - key prefix of the parent record in $gdbm_recs
#   $parentsection - section name of the parent inside $doc_obj
#   $gdbm_recs     - hashref (or tied hash ref) mapping keys to raw records
#
# Children are visited in ascending numeric order.
sub recurse_sections {
    my ($doc_obj, $children, $parentoid, $parentsection, $gdbm_recs) = @_;

    return if (!defined $children);

    foreach my $child (sort { $a <=> $b } @$children) {
        my $child_oid     = "$parentoid.$child";
        my $child_section = "$parentsection.$child";

        $doc_obj->create_named_section($child_section);

        # A missing record yields an empty hash, so the section is still
        # created but receives no metadata.
        my $doc_db_hash = db_rec_to_hash($gdbm_recs->{$child_oid});

        # get child's children
        my $newchildren = get_children($doc_db_hash);

        # add content for current section
        add_section_content($doc_obj, $child_section, $doc_db_hash);

        # process all the children if there are any
        if (defined($newchildren)) {
            recurse_sections($doc_obj, $newchildren, $child_oid,
                             $child_section, $gdbm_recs);
        }
    }
}


# Copies the metadata held in $doc_db_hash (as built by db_rec_to_hash) onto
# section $cursection of $doc_obj, one add_metadata call per value.
#
# NOTE(review): multiple values are stored '@'-joined by db_rec_to_hash, so a
# value that itself contains '@' is split into several values here.
sub add_section_content {
    my ($doc_obj, $cursection, $doc_db_hash) = @_;

    # Keys are visited in sorted order so the result is reproducible.
    foreach my $key (sort keys %$doc_db_hash) {
        # don't need to store these metadata; match the whole key so that
        # other keys which merely contain one of these words are kept
        next if $key =~ /^(?:thistype|childtype|contains|docnum|doctype|classifytype)$/i;

        # but do want things like hastxt and archivedir
        foreach my $item (split /@/, $doc_db_hash->{$key}) {
            $doc_obj->add_metadata($cursection, $key, $item);
        }
    }
}


# Gets all the metadata from a gdbm file entry and puts it into a hashref.
#
# The record is a string of newline-separated "<key>value" lines. Repeated
# keys have their values joined with '@'. Lines that do not have the
# "<key>value" shape are ignored, and an undefined record gives an empty hash.
sub db_rec_to_hash {
    my ($gdb_str_ref) = @_;

    my $hashref = {};
    return $hashref unless defined $gdb_str_ref;

    foreach my $entry (split(/\n/, $gdb_str_ref)) {
        my ($key, $value) = ($entry =~ /^<([^>]*)>(.*?)$/);
        next unless defined $key;

        $hashref->{$key} .= '@' if defined $hashref->{$key};
        $hashref->{$key} .= $value;
    }

    return $hashref;
}


# Opens the GDBM file $fulldbname and rebuilds one doc object per top-level
# document record (key without a '.', 'doctype' equal to "doc"), including
# the metadata of all its sections.
#
# Returns an arrayref of doc objects, ordered by OID. If the database cannot
# be opened a message is printed to STDERR and an empty arrayref is returned.
sub reconstruct_doc_objs_metadata
{
    my ($fulldbname) = @_;

    my @all_docs = ();

    # NOTE(review): the database is only read here, but the original open
    # mode (GDBM_WRCREAT) is kept because it also creates a missing file.
    my %gdbm_recs;
    if (!tie(%gdbm_recs, 'GDBM_File', $fulldbname, GDBM_WRCREAT(), 0640)) {
        print STDERR "reconstruct_doc_objs_metadata: unable to open GDBM file $fulldbname: $!\n";
        return \@all_docs;
    }

    # dig out top level doc sections
    my %top_sections = ();
    foreach my $key (keys %gdbm_recs) {
        # keys containing a '.' are never top-level documents, so skip them
        # before paying for the parse
        next if ($key =~ m/\./);

        my $md_hash = db_rec_to_hash($gdbm_recs{$key});
        my $doctype = $md_hash->{'doctype'};
        next unless (defined $doctype) && ($doctype eq "doc");

        $top_sections{$key} = $md_hash;
    }

    # form greenstone document objects based on metadata in gdbm file
    foreach my $oid (sort keys %top_sections) {
        my $doc_db_hash = $top_sections{$oid};

        my $doc_obj = doc->new();
        $doc_obj->set_OID($oid);

        my $top = $doc_obj->get_top_section();
        add_section_content($doc_obj, $top, $doc_db_hash);

        my $children = get_children($doc_db_hash);
        recurse_sections($doc_obj, $children, $oid, $top, \%gdbm_recs);

        push(@all_docs, $doc_obj);
    }

    untie %gdbm_recs;

    return \@all_docs;
}


# classify_doc lets each of the classifiers classify a document
Note:
See TracChangeset
for help on using the changeset viewer.