Changeset 17216 for gsdl/trunk/perllib/plugins/OAIPlugin.pm
- Timestamp:
- 2008-09-08T14:59:50+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/OAIPlugin.pm
# r17197 -> r17216 (changeset excerpt of OAIPlugin.pm).
# The viewer's gutter line numbers have been removed; lines of the form
# "# ... ( ... ) ..." mark spans that the changeset viewer did not show.

use ReadXMLFile;
use ReadTextFile; # needed for subroutine textcat_get_language_encoding
use metadatautil;

sub BEGIN {
# ... (lines elided by the changeset viewer) ...


# metadata_read: parse an OAI file during the metadata pass and record
# whether one of its dc.Identifier values names a file that exists next to it.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#   $maxdocs, $gli  - passed straight through to SUPER::read
#   $extrametakeys, $extrametadata - accepted but not used in this sub
#
# Side effects:
#   - $self->{'oai-files'}->{$file}->{'srcdoc_exists'} is set to 1 or 0
#   - when a source document exists, $metadata->{'prettymd'} receives the
#     pretty-printed metadata table and $self->{'ppmd_table'} is cleared
#   - $self->{'metadata'} is reset to undef
#
# Returns undef when the file cannot be processed or SUPER::read fails,
# otherwise 1.  (Previously the success value was implicit: undef when a
# source document existed, 0 otherwise.)
# NOTE(review): assumes callers treat any defined, non-zero value as
# "handled" - confirm against plugin::metadata_read.
sub metadata_read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $total_count = 0; # is total count used?

    # calling "SUPER::read" at this point sets up $metadata
    # data-structure.  We can then, later, in OAIPlug::read decide
    # whether this $metadata will stick to an accompanying file,
    # or else needs a new doc object to be formed that contains
    # purely metadata
    unless ($self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli)) {
        return undef;
    }

    $self->{'metadata'} = undef;

    #my $url_array = $metadata->{'gi.Sourcedoc'};
    my $url_array = $metadata->{'dc.Identifier'};

    my $filename_dir = &util::filename_head($filename_full_path);

    # Look for the first identifier that is not a remote URL and that
    # resolves to an existing file alongside the OAI file.
    my $srcdoc_exists = 0;
    if (defined $url_array) {
        foreach my $identifier (@$url_array) {
            next if $identifier =~ m/^(https?|ftp):/;

            my $src_filename = &util::filename_cat($filename_dir, $identifier);
            if (-e $src_filename) {
                $srcdoc_exists = 1;
                last;
            }
        }
    }

    if ($srcdoc_exists) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;

        ### print STDERR "**** storing OAI file: $file\n";

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};

        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;
    }
    else {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
    }

    # Explicit success value, so the result no longer depends on which
    # branch above happened to run last.
    return 1;
}


# read: second pass over an OAI file previously seen by metadata_read.
#
# If metadata_read recorded no accompanying source document for $file, a
# new metadata-only document is built from $self->{'rawxml'} and handed to
# $processor.  If a source document was recorded, nothing is built here.
#
# Returns:
#   undef - $file has no entry in $self->{'oai-files'}
#   0     - a source document exists, so no document is created here
#           (previously this path fell off the end of the sub and returned
#           an empty string)
#   -1    - $self->process() returned undef
#   1     - a metadata-only document was created and processed
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    ### print STDERR "**** checking OAI read: $file\n";

    return undef unless defined $self->{'oai-files'}->{$file};

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};

    # no more need to access details of this $file => tidy up as you go
    delete $self->{'oai-files'}->{$file};

    ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n";

    # The source-document case creates nothing here; say so explicitly.
    return 0 if $srcdoc_exists;

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    my $doc_obj = new doc ($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # attach any pretty-printed metadata tables, then clear them
    my $prettymds = $self->{'prettymd'};
    foreach my $prettymd (@$prettymds) {
        $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd);
    }
    $self->{'prettymd'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}


sub read_old {
    my $self = shift (@_);
# ... (remainder of read_old lies outside this changeset excerpt) ...
Note:
See TracChangeset
for help on using the changeset viewer.