Changeset 17319
- Timestamp: 2008-09-18T10:03:44+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
gsdl/trunk/perllib/plugins/OAIPlugin.pm
r17300 r17319 48 48 'reqd' => "no", 49 49 'deft' => &get_default_process_exp() }, 50 { 'name' => " xxx",51 'desc' => "{OAIPlugin. xxx}",50 { 'name' => "document_field", 51 'desc' => "{OAIPlugin.document_field}", 52 52 'type' => "metadata", 53 53 'reqd' => "no", … … 176 176 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 177 177 return undef unless $self->can_process_this_file($filename_full_path); 178 # print STDERR "initial\n"; 179 # foreach my $k (keys %$metadata) { 180 # print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 181 # } 182 # print STDERR "\n"; 183 184 my $total_count = 0; # is total count used? 178 185 179 if (!$self->parse_file($filename_full_path, $file, $gli)) { 186 180 $self->{'saved_metadata'} = undef; … … 190 184 my $new_metadata = $self->{'saved_metadata'}; 191 185 $self->{'saved_metadata'} = undef; 186 192 187 # add the pretty metadata table as metadata 193 188 my $ppmd_table = $self->{'ppmd_table'}; 194 189 $new_metadata->{'prettymd'} = $ppmd_table; 195 190 $self->{'ppmd_table'} = undef; 196 197 print STDERR "after parse\n"; 198 foreach my $k (keys %$new_metadata) { 199 print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; "; 200 } 201 print STDERR "\n"; 202 203 204 # if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) { 205 # calling "SUPER::read" at this point sets up $metadata 206 # data-structure. 
We can then, later, in OAIPlug::read decide 207 # whether this $metadata will stick to an accompanying file, 208 # or else needs a new doc object to be formed that contains 209 # purely metadata 210 211 # $self->{'metadata'} = undef; 212 # print STDERR "after erad\n"; 213 # foreach my $k (keys %$metadata) { 214 # print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 215 # } 216 # print STDERR "\n"; 217 my $url_array = $new_metadata->{'dc.Identifier'}; 191 192 my $document_metadata_field = $self->{'document_field'}; 193 my $url_array = $new_metadata->{$document_metadata_field}; 218 194 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 219 print STDERR "$num_urls urls for $file\n";195 ##print STDERR "$num_urls urls for $file\n"; 220 196 my $srcdoc_exists = 0; 221 197 my $srcdoc_pos = 0; 222 198 my $filename_dir = &util::filename_head($filename_full_path); 223 my $filename_for_metadata = $file; 199 my $filename_for_metadata = $file; # this assumes there will only be one record per oai file - is this always the case?? 224 200 for (my $i=0; $i<$num_urls; $i++) { 225 201 … … 237 213 } 238 214 239 240 if ($srcdoc_exists) 241 { 215 if ($srcdoc_exists) { 242 216 $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 243 217 } … … 247 221 $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'}; 248 222 $self->{'rawxml'} = ""; 249 print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 250 } 251 252 ### print STDERR "**** storing OAI file: $file\n"; 223 } 253 224 254 225 # return all the metadata we have extracted to the caller. 
… … 267 238 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 268 239 269 270 ### print STDERR "**** checking OAI read: $file\n";271 272 240 if (!defined $self->{'oai-files'}->{$file}) { 273 241 return undef; 274 242 } 275 276 243 277 244 my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 278 245 if ($srcdoc_exists) { 279 # do nothing more 246 # do nothing more - all the metadata has been extracted and associated with the srcdoc 280 247 # no more need to access details of this $file => tidy up as you go 281 248 delete $self->{'oai-files'}->{$file}; … … 283 250 } 284 251 285 ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";286 287 252 my $filename = $file; 288 253 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; … … 304 269 # include any metadata passed in from previous plugins 305 270 # note that this metadata is associated with the top level section 271 # this will include all the metadata from the oai file that we extracted 272 # during metadata_read 306 273 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 307 274 308 275 # do plugin specific processing of doc_obj 309 print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n";310 276 my $text = $self->{'oai-files'}->{$file}->{'rawxml'}; 311 277 delete $self->{'oai-files'}->{$file}; … … 331 297 332 298 333 sub read_old {334 my $self = shift (@_);335 336 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;337 338 my $outhandle = $self->{'outhandle'};339 340 my $filename = $file;341 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;342 343 # block the srcdocs dir - we will process files in them when we find an OAI record for them344 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));345 if ($self->SUPER::read(@_)) {346 # Do encoding stuff347 my ($language, $encoding) = 
$self->textcat_get_language_encoding ($filename);348 349 my $url_array = $metadata->{'dc.Identifier'};350 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;351 352 my $srcdoc_exists = 0;353 my $srcdoc_pos = 0;354 my $filename_dir = &util::filename_head($filename);355 356 for (my $i=0; $i<$num_urls; $i++) {357 if ($url_array->[$i] !~ m/^(http|ftp):/) {358 359 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);360 if (-e $src_filename) {361 $srcdoc_pos = $i;362 $srcdoc_exists = 1;363 }364 }365 }366 367 if ($srcdoc_exists)368 {369 print $outhandle "OAIPlugin: passing metadata on to $url_array->[0]\n"370 if ($self->{'verbosity'}>1);371 372 373 # Make pretty print metadata table stick with src filename374 my $ppmd_table = $self->{'ppmd_table'};375 $metadata->{'prettymd'} = [ $ppmd_table ];376 $self->{'ppmd_table'} = undef;377 378 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],379 $block_hash, $metadata, $processor, $maxdocs,380 $total_count, $gli);381 }382 else383 {384 # create a new document385 my $doc_obj = new doc ($filename, "indexed_doc");386 my $top_section = $doc_obj->get_top_section;387 my $plugin_type = $self->{'plugin_type'};388 389 $doc_obj->add_utf8_metadata($top_section, "Language", $language);390 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);391 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);392 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");393 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));394 395 # include any metadata passed in from previous plugins396 # note that this metadata is associated with the top level section397 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);398 399 # do plugin specific processing of doc_obj400 my $textref = \$self->{'rawxml'};401 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {402 print STDERR "<ProcessingError n='$file'>\n" if ($gli);403 
return -1;404 }405 406 # do any automatic metadata extraction407 $self->auto_extract_metadata ($doc_obj);408 409 # add an OID410 $self->add_OID($doc_obj);411 412 my $ppmd_table = $self->{'ppmd_table'};413 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);414 $self->{'ppmd_table'} = undef;415 416 # process the document417 $processor->process($doc_obj);418 419 $self->{'num_processed'} ++;420 421 return 1; # processed the file422 }423 }424 else {425 return undef;426 }427 }428 429 430 299 # do plugin specific processing of doc_obj 431 300 sub process { … … 450 319 $$textref =~ s/\]/]/g; 451 320 452 ## print STDERR "*** adding text: $$textref\n";453 454 321 $doc_obj->add_utf8_text($cursection, $$textref); 455 322 … … 475 342 my ($metaname, $metavalue_utf8) = @_; 476 343 477 ### $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;478 344 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8); 479 345 … … 596 462 while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) 597 463 { 598 # if URL given for document as identifier metadata, store it ...599 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);600 464 601 465 my $metaname = $1; 602 466 my $metavalue = $2; 603 467 $inner_metadata_text = $3; 604 605 # print STDERR "*** metaname = $metaname\n";606 # print STDERR "*** metavalue = $metavalue\n";607 468 608 469 # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter … … 611 472 { 612 473 $metaname = "$top_level_prefix.$metaname"; 613 # print STDERR "*** metaname = $metaname\tmetavalue = $metavalue\n";614 474 } 615 475 $metaname =~ s/\.(.)/\.\u$1/; … … 619 479 $metavalue =~ s/\[/[/g; 620 480 $metavalue =~ s/\]/]/g; 621 622 623 # if ($metaname eq "Identifier")624 # {625 # # name clashes with GSDL reserved metadata name for hash id626 # $metaname = "URL";627 # }628 481 629 482 if (defined $metadata->{$metaname})
Note: See TracChangeset for help on using the changeset viewer.