Changeset 8563
- Timestamp:
- 2004-11-16T13:17:06+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/ISISPlug.pm
r8121 r8563 65 65 'type' => "string", 66 66 'reqd' => "no", 67 'deft' => ", " }, 68 { 'name' => "document_field", 69 'desc' => "{ISISPlug.document_field}", 70 'type' => "string", 71 'reqd' => "no", 72 'deft' => "" }, 73 { 'name' => "document_prefix", 74 'desc' => "{ISISPlug.document_prefix}", 75 'type' => "string", 76 'reqd' => "no", 77 'deft' => "" }, 78 { 'name' => "document_suffix", 79 'desc' => "{ISISPlug.document_suffix}", 80 'type' => "string", 81 'reqd' => "no", 82 'deft' => "" } 67 'deft' => ", " } 83 68 ]; 84 69 … … 108 93 109 94 110 sub new { 95 sub new 96 { 111 97 my $class = shift(@_); 112 98 … … 115 101 q^subfield_separator/.*/, ^, \$self->{'subfield_separator'}, 116 102 q^entry_separator/.*/<br>^, \$self->{'entry_separator'}, 117 q^document_field/.*/^, \$self->{'document_field'},118 q^document_prefix/.*/^, \$self->{'document_prefix'},119 q^document_suffix/.*/^, \$self->{'document_suffix'},120 103 "allow_extra_options")) { 121 104 print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n"; … … 132 115 133 116 134 sub read 135 { 136 my $self = shift(@_); 137 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 138 139 my $result = &SplitPlug::read($self, @_); 140 if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) { 141 &end_metadata_xml_file($self->{'documents_metadata_xml_file'}); 142 } 143 144 return $result; 145 } 146 147 148 sub read_file { 117 sub read_file 118 { 149 119 my $self = shift (@_); 150 120 my ($filename, $encoding, $language, $textref) = @_; 151 121 152 my ($databasename) = ($filename =~ /( [^\.]+)\.mst$/i);122 my ($databasename) = ($filename =~ /(.*)\.mst$/i); 153 123 154 124 # Check the associated .fdt and .xrf files exist … … 168 138 169 139 my $reader = new multiread(); 170 $reader->set_handle 171 $reader->set_encoding 172 $reader->read_file 140 $reader->set_handle('ISISPlug::FILE'); 141 $reader->set_encoding($encoding); 142 $reader->read_file($textref); 173 
143 174 144 close(FILE); … … 182 152 # Remove the line at the start so it is split and processed properly 183 153 $$textref =~ s/^----------\n//; 184 185 # Obtain the documents specified in the CDS/ISIS database, if requested186 if ($self->{'document_field'}) {187 # Create a directory to store the document files188 $self->{'documents_directory'} = $databasename . ".all";189 if (-e $self->{'documents_directory'}) {190 &util::rm_r($self->{'documents_directory'});191 }192 &util::mk_dir($self->{'documents_directory'});193 194 # ...and a metadata.xml file for the document metadata (extracted from the database)195 $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");196 if (-e $self->{'documents_metadata_xml_file'}) {197 &util::rm($self->{'documents_metadata_xml_file'});198 }199 &begin_metadata_xml_file($self->{'documents_metadata_xml_file'});200 }201 154 } 202 155 … … 244 197 my $subfieldname = ""; 245 198 if ($rawtagvalue =~ s/^\^([a-z])//) { 246 $subfieldname = " ." . $1;199 $subfieldname = "^$1"; 247 200 } 248 201 … … 251 204 my $metadatafieldname = $tagname . $subfieldname; 252 205 my $metadatafieldvalue = $1; 253 # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";254 206 255 207 # Handle Keywords specially … … 278 230 } 279 231 280 # print "Metadata: $tagname.all -> $completetagvalue\n"; 281 $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue); 282 } 283 # print "\n"; 284 # Add fileFormat as the metadata 285 $doc_obj->add_metadata($cursection, "FileFormat", "CDS/ISIS"); 232 $doc_obj->add_utf8_metadata($cursection, $tagname . 
"^all", $completetagvalue); 233 } 234 286 235 # Add the full record as the document text 287 236 $$textref =~ s/\</&lt;/g; 288 237 $$textref =~ s/\>/&gt;/g; 289 $doc_obj->add_utf8_text ($cursection, $$textref); 290 291 # Obtain the documents specified in the CDS/ISIS database, if requested 292 if ($self->{'document_field'}) { 293 my $document_field = $self->{'document_field'}; 294 my $document_prefix = $self->{'document_prefix'} || ""; 295 my $document_suffix = $self->{'document_suffix'} || ""; 296 297 my $documents_directory = $self->{'documents_directory'}; 298 my $document_obtained = 0; 299 300 # Look at all the metadata assigned to this record 301 my $record_metadata = $doc_obj->get_all_metadata($cursection); 302 foreach my $pair (@$record_metadata) { 303 my ($field, $value) = (@$pair); 304 305 # Does this metadata element specify a document to obtain? 306 if ($field eq $document_field) { 307 my $document_file_full = $document_prefix . $value . $document_suffix; 308 309 my $document_file = &obtain_document($self, $document_file_full, $documents_directory); 310 if ($document_file) { 311 $document_obtained = 1; 312 &write_metadata_xml_file($self->{'documents_metadata_xml_file'}, 313 $document_file, $record_metadata); 314 } 315 } 316 } 317 318 # If there was a document obtained for this record we don't want the record as well 319 if ($document_obtained) { 320 return 0; 321 } 322 } 238 $doc_obj->add_utf8_text($cursection, $$textref); 239 240 # Add FileFormat metadata 241 $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS"); 323 242 324 243 # Record was processed successfully (and there was no document obtained) … … 364 283 365 284 366 sub obtain_document367 {368 my $self = shift(@_);369 my $document_file_full = shift(@_);370 my $documents_directory = shift(@_);371 372 my $outhandle = $self->{'outhandle'};373 print $outhandle "Obtaining document file $document_file_full...\n"374 if ($self->{'verbosity'} > 1);375 376 my $document_file_name;377 my 
$local_document_file;378 379 # Document specified is on the web380 if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) {381 $document_file_full =~ /([^\/]+)$/;382 $document_file_name = $1;383 $local_document_file = &util::filename_cat($documents_directory, $document_file_name);384 385 my $wget_options = "--quiet";386 $wget_options = "--verbose" if ($self->{'verbosity'} > 2);387 $wget_options .= " --timestamping"; # Only re-download files if they're newer388 `wget $wget_options $document_file_full --output-document $local_document_file`;389 }390 # Document specified is on the disk391 else {392 my $dir_sep = &util::get_os_dirsep();393 $document_file_full =~ /(.+$dir_sep)?(.*)$/;394 $document_file_name = $2;395 $local_document_file = &util::filename_cat($documents_directory, $document_file_name);396 397 &util::cp($document_file_full, $documents_directory);398 }399 400 # Check the document was obtained successfully401 if (!-e $local_document_file) {402 print STDERR "WARNING: Could not obtain document file $document_file_full\n";403 return undef;404 }405 406 return $document_file_name;407 }408 409 410 sub begin_metadata_xml_file411 {412 my $metadata_xml_file = shift(@_);413 414 open(METADATA_XML_FILE, ">$metadata_xml_file");415 print METADATA_XML_FILE416 "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .417 "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .418 "<DirectoryMetadata>\n";419 close(METADATA_XML_FILE);420 }421 422 423 sub write_metadata_xml_file424 {425 my $metadata_xml_file = shift(@_);426 my $file_name = shift(@_);427 my $record_metadata = shift(@_);428 429 # Make $file_name XML-safe430 $file_name =~ s/</&lt;/g;431 $file_name =~ s/>/&gt;/g;432 433 open(METADATA_XML_FILE, ">>$metadata_xml_file");434 435 print METADATA_XML_FILE436 "\n" .437 " <FileSet>\n" .438 " <FileName>$file_name</FileName>\n" .439 " <Description>\n";440 441 foreach my $pair (@$record_metadata) 
{442 my ($field, $value) = (@$pair);443 444 # We're only interested in metadata from the database445 next if ($field eq "gsdlsourcefilename");446 next if ($field eq "gsdldoctype");447 next if ($field eq "Language");448 next if ($field eq "Encoding");449 next if ($field eq "Identifier");450 next if ($field eq "Source");451 next if ($field eq "SourceSegment");452 next if ($field eq "Plugin");453 454 # Make $value XML-safe455 $value =~ s/</&lt;/g;456 $value =~ s/>/&gt;/g;457 458 print METADATA_XML_FILE " <Metadata name=\"$field\">$value</Metadata>\n";459 }460 461 print METADATA_XML_FILE462 " </Description>\n" .463 " </FileSet>\n";464 465 close(METADATA_XML_FILE);466 }467 468 469 sub end_metadata_xml_file470 {471 my $metadata_xml_file = shift(@_);472 473 open(METADATA_XML_FILE, ">>$metadata_xml_file");474 print METADATA_XML_FILE "\n</DirectoryMetadata>\n";475 close(METADATA_XML_FILE);476 }477 478 479 285 1;
Note:
See TracChangeset
for help on using the changeset viewer.