# Metadata pass over an OAI record file.
#
# Hands the file to the parent class's read(), which sets up the $metadata
# data-structure, and then checks whether any 'dc.Identifier' value names a
# file that exists alongside the record.  The outcome is stored per $file in
# $self->{'oai-files'} so that the later read pass can decide whether this
# metadata sticks to an accompanying file or needs a metadata-only document.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#   $maxdocs, $gli  - forwarded unchanged to SUPER::read
#   $extrametakeys, $extrametadata - accepted but not used in this method
#
# Returns undef when the file cannot be processed by this plugin or when
# SUPER::read reports failure.
# NOTE(review): the success path has no explicit return, so the result is
# whatever the final assignment below evaluates to -- confirm that this is
# what callers expect.
sub metadata_read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # Nothing to do for files this plugin does not handle.
    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # SUPER::read takes a total-count argument; a real variable is passed
    # (not a literal) because arguments are aliased through @_.
    my $total_count = 0;
    my $parsed_ok = $self->SUPER::read($pluginfo,$base_dir,$file,$block_hash,$metadata,$processor,$maxdocs,$total_count, $gli);
    return undef unless $parsed_ok;

    $self->{'metadata'} = undef;

    #my $identifiers = $metadata->{'gi.Sourcedoc'};
    my $identifiers = $metadata->{'dc.Identifier'};
    my $record_dir = &util::filename_head($filename_full_path);

    # Look for the first identifier that is not a remote URL and that
    # resolves to an existing file relative to the record's directory.
    my $found_srcdoc = 0;
    if (defined $identifiers) {
        foreach my $identifier (@$identifiers) {
            next if $identifier =~ m/^(https?|ftp):/;

            my $candidate = &util::filename_cat($record_dir, $identifier);
            if (-e $candidate) {
                $found_srcdoc = 1;
                last;
            }
        }
    }

    # Keep this if/else as the final statement: its last evaluated
    # assignment is the sub's implicit return value.
    if ($found_srcdoc) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;

        ### print STDERR "**** storing OAI file: $file\n";

        # Make pretty print metadata table stick with src filename
        $metadata->{'prettymd'} = [ $self->{'ppmd_table'} ];
        $self->{'ppmd_table'} = undef;
    }
    else {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
    }
}
|---|
| | 230 | my $self = shift (@_); |
|---|
| | 231 | |
|---|
| | 232 | my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; |
|---|
| | 233 | |
|---|
| | 234 | |
|---|
| | 235 | ### print STDERR "**** checking OAI read: $file\n"; |
|---|
| | 236 | |
|---|
| | 237 | if (defined $self->{'oai-files'}->{$file}) { |
|---|
| | 238 | |
|---|
| | 239 | my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'}; |
|---|
| | 240 | |
|---|
| | 241 | # no more need to access details of this $file => tidy up as you go |
|---|
| | 242 | delete $self->{'oai-files'}->{$file}; |
|---|
| | 243 | |
|---|
| | 244 | ### print STDERR "**** !!!!! srcdoc_exists = $srcdoc_exists\n"; |
|---|
| | 245 | if (!$srcdoc_exists) |
|---|
| | 246 | { |
|---|
| | 247 | |
|---|
| | 248 | my $filename = $file; |
|---|
| | 249 | $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; |
|---|
| | 250 | |
|---|
| | 251 | # Do encoding stuff on metadata |
|---|
| | 252 | my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); |
|---|
| | 253 | |
|---|
| | 254 | # create a new document |
|---|
| | 255 | my $doc_obj = new doc ($filename, "indexed_doc"); |
|---|
| | 256 | my $top_section = $doc_obj->get_top_section; |
|---|
| | 257 | my $plugin_type = $self->{'plugin_type'}; |
|---|
| | 258 | |
|---|
| | 259 | $doc_obj->add_utf8_metadata($top_section, "Language", $language); |
|---|
| | 260 | $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); |
|---|
| | 261 | $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); |
|---|
| | 262 | $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); |
|---|
| | 263 | $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); |
|---|
| | 264 | |
|---|
| | 265 | # include any metadata passed in from previous plugins |
|---|
| | 266 | # note that this metadata is associated with the top level section |
|---|
| | 267 | $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); |
|---|
| | 268 | |
|---|
| | 269 | # do plugin specific processing of doc_obj |
|---|
| | 270 | my $textref = \$self->{'rawxml'}; |
|---|
| | 271 | unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { |
|---|
| | 272 | print STDERR "<ProcessingError n='$file'>\n" if ($gli); |
|---|
| | 273 | return -1; |
|---|
| | 274 | } |
|---|
| | 275 | |
|---|
| | 276 | # do any automatic metadata extraction |
|---|
| | 277 | $self->auto_extract_metadata ($doc_obj); |
|---|
| | 278 | |
|---|
| | 279 | # add an OID |
|---|
| | 280 | $self->add_OID($doc_obj); |
|---|
| | 281 | |
|---|
| | 282 | my $prettymds = $self->{'prettymd'}; |
|---|
| | 283 | foreach my $prettymd (@$prettymds) { |
|---|
| | 284 | $doc_obj->add_utf8_metadata($top_section,"prettymd",$prettymd); |
|---|
| | 285 | } |
|---|
| | 286 | $self->{'prettymd'} = undef; |
|---|
| | 287 | |
|---|
| | 288 | # process the document |
|---|
| | 289 | $processor->process($doc_obj); |
|---|
| | 290 | |
|---|
| | 291 | $self->{'num_processed'} ++; |
|---|
| | 292 | |
|---|
| | 293 | return 1; # processed the file |
|---|
| | 294 | } |
|---|
| | 295 | } |
|---|
| | 296 | else { |
|---|
| | 297 | return undef; |
|---|
| | 298 | } |
|---|
| | 299 | } |
|---|
| | 300 | |
|---|
| | 301 | |
|---|
| | 302 | sub read_old { |
|---|