Changeset 17290
- Timestamp:
- 2008-09-15T15:28:48+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/OAIPlugin.pm
r17216 r17290 47 47 'type' => "regexp", 48 48 'reqd' => "no", 49 'deft' => &get_default_process_exp() } 49 'deft' => &get_default_process_exp() }, 50 { 'name' => "xxx", 51 'desc' => "{OAIPlugin.xxx}", 52 'type' => "metadata", 53 'reqd' => "no", 54 'deft' => "gi.Sourcedoc" } 50 55 ]; 51 56 … … 88 93 $self->{'in_metadata_node'} = 0; 89 94 $self->{'rawxml'} = ""; 95 $self->{'saved_metadata'} = {}; 90 96 } 91 97 … … 140 146 if ($element eq "metadata") { 141 147 my $textref = \$self->{'metadata_xml'}; 142 my $metadata = $self->{'metadata'}; 148 #my $metadata = $self->{'metadata'}; 149 my $metadata = $self->{'saved_metadata'}; 143 150 $self->extract_oai_metadata($textref,$metadata); 144 151 … … 169 176 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 170 177 return undef unless $self->can_process_this_file($filename_full_path); 178 # print STDERR "initial\n"; 179 # foreach my $k (keys %$metadata) { 180 # print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 181 # } 182 # print STDERR "\n"; 171 183 172 184 my $total_count = 0; # is total count used? 173 if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) { 174 # calling "SUPER::read" at this point sets up $metadata 175 # data-structure. We can then, later, in OAIPlug::read decide 176 # whether this $metadata will stick to an accompanying file, 177 # or else needs a new doc object to be formed that contains 178 # purely metadata 179 180 $self->{'metadata'} = undef; 185 if (!$self->parse_file($filename_full_path, $file, $gli)) { 186 $self->{'saved_metadata'} = undef; 187 return undef; 188 } 189 190 my $new_metadata = $self->{'saved_metadata'}; 191 $self->{'saved_metadata'} = undef; 192 # add the pretty metadata table as metadata 193 my $ppmd_table = $self->{'ppmd_table'}; 194 $new_metadata->{'prettymd'} = $ppmd_table; 195 $self->{'ppmd_table'} = undef; 196 197 print STDERR "after parse\n"; 198 foreach my $k (keys %$new_metadata) { 199 print STDERR "$k=".join (", ", @{$new_metadata->{$k}})."; "; 200 } 201 print STDERR "\n"; 202 203 204 # if ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$new_metadata,$processor,$maxdocs,$total_count, $gli)) { 205 # calling "SUPER::read" at this point sets up $metadata 206 # data-structure. We can then, later, in OAIPlug::read decide 207 # whether this $metadata will stick to an accompanying file, 208 # or else needs a new doc object to be formed that contains 209 # purely metadata 210 211 # $self->{'metadata'} = undef; 212 # print STDERR "after erad\n"; 213 # foreach my $k (keys %$metadata) { 214 # print STDERR "$k=".join (", ", @{$metadata->{$k}})."; "; 215 # } 216 # print STDERR "\n"; 217 my $url_array = $new_metadata->{'dc.Identifier'}; 218 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 219 print STDERR "$num_urls urls for $file\n"; 220 my $srcdoc_exists = 0; 221 my $srcdoc_pos = 0; 222 my $filename_dir = &util::filename_head($filename_full_path); 223 my $filename_for_metadata = $file; 224 for (my $i=0; $i<$num_urls; $i++) { 181 225 182 #my $url_array = $metadata->{'gi.Sourcedoc'}; 183 my $url_array = $metadata->{'dc.Identifier'}; 184 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0; 185 186 my $srcdoc_exists = 0; 187 my $srcdoc_pos = 0; 188 my $filename_dir = &util::filename_head($filename_full_path); 189 190 for (my $i=0; $i<$num_urls; $i++) { 191 192 if ($url_array->[$i] !~ m/^(https?|ftp):/) { 193 194 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 195 196 if (-e $src_filename) { 197 $srcdoc_pos = $i; 198 $srcdoc_exists = 1; 199 last; 200 } 226 if ($url_array->[$i] !~ m/^(https?|ftp):/) { 227 228 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 229 230 if (-e $src_filename) { 231 $srcdoc_pos = $i; 232 $srcdoc_exists = 1; 233 $filename_for_metadata = $url_array->[$i]; 234 last; 201 235 } 202 236 } 203 204 205 if ($srcdoc_exists) 206 { 207 $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 208 237 } 238 239 240 if ($srcdoc_exists) 241 { 242 $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1; 243 } 244 else { 245 # save the rawxml for the source document 246 $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0; 247 $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'}; 248 $self->{'rawxml'} = ""; 249 print STDERR "raw xml = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 250 } 251 209 252 ### print STDERR "**** storing OAI file: $file\n"; 210 211 # Make pretty print metadata table stick with src filename 212 my $ppmd_table = $self->{'ppmd_table'}; 213 214 $metadata->{'prettymd'} = [ $ppmd_table ]; 215 $self->{'ppmd_table'} = undef; 216 217 } 218 else { 219 $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0; 220 } 221 222 } 223 else { 224 return undef; 225 } 253 254 # return all the metadata we have extracted to the caller. 255 # Directory plug will pass it back in at read time, so we don't need to extract it again. 256 $extrametadata->{$filename_for_metadata} = $new_metadata; 257 push(@$extrametakeys, $filename_for_metadata); 258 259 return 1; 260 226 261 } 227 262 … … 229 264 sub read { 230 265 my $self = shift (@_); 231 266 232 267 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 233 268 … … 235 270 ### print STDERR "**** checking OAI read: $file\n"; 236 271 237 if (defined $self->{'oai-files'}->{$file}) { 238 239 my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 240 272 if (!defined $self->{'oai-files'}->{$file}) { 273 return undef; 274 } 275 276 277 my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; 278 if ($srcdoc_exists) { 279 # do nothing more 241 280 # no more need to access details of this $file => tidy up as you go 242 281 delete $self->{'oai-files'}->{$file}; 282 return 0; # not processed here, but don't pass on to rest of plugins 283 } 243 284 244 285 ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n"; 245 if (!$srcdoc_exists) 246 { 247 248 my $filename = $file; 249 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 250 251 # Do encoding stuff on metadata 252 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 253 254 # create a new document 255 my $doc_obj = new doc ($filename, "indexed_doc"); 256 my $top_section = $doc_obj->get_top_section; 257 my $plugin_type = $self->{'plugin_type'}; 258 259 $doc_obj->add_utf8_metadata($top_section, "Language", $language); 260 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 261 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 262 $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 263 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 264 265 # include any metadata passed in from previous plugins 266 # note that this metadata is associated with the top level section 267 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 268 269 # do plugin specific processing of doc_obj 270 my $textref = \$self->{'rawxml'}; 271 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 272 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 273 return -1; 274 } 275 276 # do any automatic metadata extraction 277 $self->auto_extract_metadata ($doc_obj); 278 279 # add an OID 280 $self->add_OID($doc_obj); 281 282 my $prettymds = $self->{'prettymd'}; 283 foreach my $prettymd (@$prettymds) { 284 $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd); 285 } 286 $self->{'prettymd'} = undef; 287 288 # process the document 289 $processor->process($doc_obj); 290 291 $self->{'num_processed'} ++; 292 293 return 1; # processed the file 294 } 295 } 296 else { 297 return undef; 298 } 286 287 my $filename = $file; 288 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 289 290 # Do encoding stuff on metadata 291 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 292 293 # create a new document 294 my $doc_obj = new doc ($filename, "indexed_doc"); 295 my $top_section = $doc_obj->get_top_section; 296 my $plugin_type = $self->{'plugin_type'}; 297 298 $doc_obj->add_utf8_metadata($top_section, "Language", $language); 299 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 300 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 301 $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 302 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 303 304 # include any metadata passed in from previous plugins 305 # note that this metadata is associated with the top level section 306 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 307 308 # do plugin specific processing of doc_obj 309 print STDERR "raw xml 2 = $self->{'oai-files'}->{$file}->{'rawxml'}\n"; 310 my $text = $self->{'oai-files'}->{$file}->{'rawxml'}; 311 delete $self->{'oai-files'}->{$file}; 312 313 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 314 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 315 return -1; 316 } 317 318 # do any automatic metadata extraction 319 $self->auto_extract_metadata ($doc_obj); 320 321 # add an OID 322 $self->add_OID($doc_obj); 323 324 # process the document 325 $processor->process($doc_obj); 326 327 $self->{'num_processed'} ++; 328 329 return 1; # processed the file 299 330 } 300 331 … … 302 333 sub read_old { 303 334 my $self = shift (@_); 304 335 305 336 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 306 337 … … 436 467 my $style = "style=\'border-bottom: 4px solid #000080\'"; 437 468 438 469 $self->{'ppmd_table'} = "\n<table $att $style>"; 439 470 } 440 471
Note:
See TracChangeset
for help on using the changeset viewer.