Changeset 7686
- Timestamp:
- 2004-07-01T14:48:55+12:00 (20 years ago)
- Location:
- trunk/gsdl/perllib/plugins
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/ISISPlug.pm
r7049 r7686 7 7 # University of Waikato, New Zealand. 8 8 # 9 # Copyright 1999-200 3New Zealand Digital Library Project9 # Copyright 1999-2004 New Zealand Digital Library Project 10 10 # 11 11 # This program is free software; you can redistribute it and/or modify … … 39 39 40 40 my $arguments = 41 [ { 'name' => "entry_separator", 42 'desc' => "{ISISPlug.entry_separator}", 43 'type' => "string", 44 'reqd' => "no", 45 'deft' => "<br>" }, 46 { 'name' => "process_exp", 41 [ { 'name' => "process_exp", 47 42 'desc' => "{BasPlug.process_exp}", 48 43 'type' => "regexp", … … 52 47 'desc' => "{BasPlug.block_exp}", 53 48 'type' => "regexp", 49 'reqd' => "no", 54 50 'deft' => &get_default_block_exp() }, 51 { 'name' => "split_exp", 52 'desc' => "{SplitPlug.split_exp}", 53 'type' => "regexp", 54 'reqd' => "no", 55 'deft' => &get_default_split_exp() }, 56 57 # The interesting options 58 { 'name' => "entry_separator", 59 'desc' => "{ISISPlug.entry_separator}", 60 'type' => "string", 61 'reqd' => "no", 62 'deft' => "<br>" }, 55 63 { 'name' => "subfield_separator", 56 64 'desc' => "{ISISPlug.subfield_separator}", … … 58 66 'reqd' => "no", 59 67 'deft' => ", " }, 60 { 'name' => "split_exp", 61 'desc' => "{SplitPlug.split_exp}", 62 'type' => "regexp", 63 'deft' => &get_default_split_exp(), 64 'reqd' => "no" } 68 { 'name' => "document_field", 69 'desc' => "{ISISPlug.document_field}", 70 'type' => "string", 71 'reqd' => "no", 72 'deft' => "" }, 73 { 'name' => "document_prefix", 74 'desc' => "{ISISPlug.document_prefix}", 75 'type' => "string", 76 'reqd' => "no", 77 'deft' => "" }, 78 { 'name' => "document_suffix", 79 'desc' => "{ISISPlug.document_suffix}", 80 'type' => "string", 81 'reqd' => "no", 82 'deft' => "" } 65 83 ]; 66 84 … … 97 115 q^subfield_separator/.*/, ^, \$self->{'subfield_separator'}, 98 116 q^entry_separator/.*/<br>^, \$self->{'entry_separator'}, 117 q^document_field/.*/^, \$self->{'document_field'}, 118 q^document_prefix/.*/^, \$self->{'document_prefix'}, 119 
q^document_suffix/.*/^, \$self->{'document_suffix'}, 99 120 "allow_extra_options")) { 100 121 print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n"; … … 108 129 109 130 return bless $self, $class; 131 } 132 133 134 sub read 135 { 136 my $self = shift(@_); 137 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 138 139 my $result = &SplitPlug::read($self, @_); 140 if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) { 141 &end_metadata_xml_file($self->{'documents_metadata_xml_file'}); 142 } 143 144 return $result; 110 145 } 111 146 … … 145 180 $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g; 146 181 147 # Add a newline at the start so it is split properly 148 $$textref = "\n" . $$textref; 182 # Remove the line at the start so it is split and processed properly 183 $$textref =~ s/^----------\n//; 184 185 # Obtain the documents specified in the CDS/ISIS database, if requested 186 if ($self->{'document_field'}) { 187 # Create a directory to store the document files 188 $self->{'documents_directory'} = $databasename . 
".all"; 189 if (-e $self->{'documents_directory'}) { 190 &util::rm_r($self->{'documents_directory'}); 191 } 192 &util::mk_dir($self->{'documents_directory'}); 193 194 # ...and a metadata.xml file for the document metadata (extracted from the database) 195 $self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml"); 196 if (-e $self->{'documents_metadata_xml_file'}) { 197 &util::rm($self->{'documents_metadata_xml_file'}); 198 } 199 &begin_metadata_xml_file($self->{'documents_metadata_xml_file'}); 200 } 149 201 } 150 202 … … 167 219 foreach $line (split(/\n/, $$textref)) { 168 220 $line =~ /^tag=(.+) data=(.+)$/; 169 local$rawtagname = $1;170 local$rawtagdata = $2;221 my $rawtagname = $1; 222 my $rawtagdata = $2; 171 223 # print "Raw tag: $rawtagname, Raw data: $rawtagdata\n"; 172 224 173 225 # Metadata field names: title case, then remove spaces 174 local$tagname = "";226 my $tagname = ""; 175 227 foreach $word (split(/\s+/, $rawtagname)) { 176 228 substr($word, 0, 1) =~ tr/a-z/A-Z/; … … 182 234 183 235 # Handle each piece of metadata ('%' separated) 184 local$completetagvalue = "";236 my $completetagvalue = ""; 185 237 foreach $rawtagvalue (split(/%/, $rawtagdata)) { 186 238 $completetagvalue .= $entry_separator unless ($completetagvalue eq ""); 187 239 188 240 # Metadata field values: take care with subfields 189 local$completeentryvalue = "";241 my $completeentryvalue = ""; 190 242 while ($rawtagvalue ne "") { 191 243 # If there is a subfield specifier, parse it off 192 local$subfieldname = "";244 my $subfieldname = ""; 193 245 if ($rawtagvalue =~ s/^\^([a-z])//) { 194 246 $subfieldname = "." . $1; … … 197 249 # Parse the metadata value off 198 250 $rawtagvalue =~ s/^([^\^]*)//; 199 local$metadatafieldname = $tagname . $subfieldname;200 local$metadatafieldvalue = $1;251 my $metadatafieldname = $tagname . 
$subfieldname; 252 my $metadatafieldvalue = $1; 201 253 # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n"; 202 254 203 255 # Handle Keywords specially 204 256 if ($metadatafieldname eq "Keywords") { 205 local$keywordmetadatavalue = $metadatafieldvalue;206 local$keywordlist = "";257 my $keywordmetadatavalue = $metadatafieldvalue; 258 my $keywordlist = ""; 207 259 while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) { 208 local$keyword = $1;260 my $keyword = $1; 209 261 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword); 210 262 $keywordlist .= ", " unless ($keywordlist eq ""); … … 236 288 $doc_obj->add_utf8_text ($cursection, $$textref); 237 289 238 # Document was processed successfully 290 # Obtain the documents specified in the CDS/ISIS database, if requested 291 if ($self->{'document_field'}) { 292 my $document_field = $self->{'document_field'}; 293 my $document_prefix = $self->{'document_prefix'} || ""; 294 my $document_suffix = $self->{'document_suffix'} || ""; 295 296 my $documents_directory = $self->{'documents_directory'}; 297 my $document_obtained = 0; 298 299 # Look at all the metadata assigned to this record 300 my $record_metadata = $doc_obj->get_all_metadata($cursection); 301 foreach my $pair (@$record_metadata) { 302 my ($field, $value) = (@$pair); 303 304 # Does this metadata element specify a document to obtain? 305 if ($field eq $document_field) { 306 my $document_file_full = $document_prefix . $value . 
$document_suffix; 307 my ($document_file) = ($document_file_full =~ /([^\/]+)$/); 308 if (&obtain_document($self, $document_file_full, $document_file, 309 $documents_directory)) { 310 $document_obtained = 1; 311 &write_metadata_xml_file($self->{'documents_metadata_xml_file'}, 312 $document_file, $record_metadata); 313 } 314 } 315 } 316 317 # If there was a document obtained for this record we don't want the record as well 318 if ($document_obtained) { 319 return 0; 320 } 321 } 322 323 # Record was processed successfully (and there was no document obtained) 239 324 return 1; 240 325 } … … 243 328 sub parse_field_definition_table 244 329 { 245 local$fdtfilename = shift(@_);246 247 local%fdtmapping = ();330 my $fdtfilename = shift(@_); 331 332 my %fdtmapping = (); 248 333 249 334 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n"; 250 335 251 local$amongstdefinitions = 0;336 my $amongstdefinitions = 0; 252 337 foreach $fdtfileline (<FDT_FILE>) { 253 338 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines 254 339 255 340 if ($amongstdefinitions) { 256 local$fieldtitle = substr($fdtfileline, 0, 30);257 local$fieldsubfields = substr($fdtfileline, 30, 20);258 local$fieldspecs = substr($fdtfileline, 50);341 my $fieldtitle = substr($fdtfileline, 0, 30); 342 my $fieldsubfields = substr($fdtfileline, 30, 20); 343 my $fieldspecs = substr($fdtfileline, 50); 259 344 260 345 # Remove extra spaces … … 263 348 264 349 # Map from tag number to metadata field title and subfields 265 local$fieldtag = (split(/ /, $fieldspecs))[0];350 my $fieldtag = (split(/ /, $fieldspecs))[0]; 266 351 $fdtmapping{$fieldtag} = { 'title' => $fieldtitle, 267 352 'subfields' => $fieldsubfields }; … … 278 363 279 364 365 sub obtain_document 366 { 367 my $self = shift(@_); 368 my $document_file_full = shift(@_); 369 my $document_file_name = shift(@_); 370 my $documents_directory = shift(@_); 371 my $local_document_file = 
&util::filename_cat($documents_directory, $document_file_name); 372 373 my $outhandle = $self->{'outhandle'}; 374 print $outhandle "Obtaining document file $document_file_full...\n" 375 if ($self->{'verbosity'} > 1); 376 377 # Document specified is on the web 378 if ($document_file_full =~ /^http:/ || $document_file_full =~ /^ftp:/) { 379 my $wget_options = "--quiet"; 380 $wget_options = "--verbose" if ($self->{'verbosity'} > 2); 381 $wget_options .= " --timestamping"; # Only re-download files if they're newer 382 `wget $wget_options $document_file_full --output-document $local_document_file`; 383 } 384 # Document specified is on the disk 385 else { 386 &util::cp($document_file_full, $documents_directory); 387 } 388 389 # Check the document was obtained successfully 390 if (!-e $local_document_file) { 391 print STDERR "WARNING: Could not obtain document file $document_file_full\n"; 392 return 0; 393 } 394 395 return 1; 396 } 397 398 399 sub begin_metadata_xml_file 400 { 401 my $metadata_xml_file = shift(@_); 402 403 open(METADATA_XML_FILE, ">$metadata_xml_file"); 404 print METADATA_XML_FILE 405 "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" . 406 "<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" . 407 "<DirectoryMetadata>\n"; 408 close(METADATA_XML_FILE); 409 } 410 411 412 sub write_metadata_xml_file 413 { 414 my $metadata_xml_file = shift(@_); 415 my $file_name = shift(@_); 416 my $record_metadata = shift(@_); 417 418 # Make $file_name XML-safe 419 $file_name =~ s/</&lt;/g; 420 $file_name =~ s/>/&gt;/g; 421 422 open(METADATA_XML_FILE, ">>$metadata_xml_file"); 423 424 print METADATA_XML_FILE 425 "\n" . 426 " <FileSet>\n" . 427 " <FileName>$file_name</FileName>\n" . 
428 " <Description>\n"; 429 430 foreach my $pair (@$record_metadata) { 431 my ($field, $value) = (@$pair); 432 433 # We're only interested in metadata from the database 434 next if ($field eq "gsdlsourcefilename"); 435 next if ($field eq "gsdldoctype"); 436 next if ($field eq "Language"); 437 next if ($field eq "Encoding"); 438 next if ($field eq "Identifier"); 439 next if ($field eq "Source"); 440 next if ($field eq "SourceSegment"); 441 next if ($field eq "Plugin"); 442 443 # Make $value XML-safe 444 $value =~ s/</&lt;/g; 445 $value =~ s/>/&gt;/g; 446 447 print METADATA_XML_FILE " <Metadata name=\"$field\">$value</Metadata>\n"; 448 } 449 450 print METADATA_XML_FILE 451 " </Description>\n" . 452 " </FileSet>\n"; 453 454 close(METADATA_XML_FILE); 455 } 456 457 458 sub end_metadata_xml_file 459 { 460 my $metadata_xml_file = shift(@_); 461 462 open(METADATA_XML_FILE, ">>$metadata_xml_file"); 463 print METADATA_XML_FILE "\n</DirectoryMetadata>\n"; 464 close(METADATA_XML_FILE); 465 } 466 467 280 468 1; -
trunk/gsdl/perllib/plugins/RecPlug.pm
r7362 r7686 115 115 'desc' => "{RecPlug.use_metadata_files}", 116 116 'type' => "flag", 117 'reqd' => "no" }, 118 { 'name' => "recheck_directories", 119 'desc' => "{RecPlug.recheck_directories}", 120 'type' => "flag", 117 121 'reqd' => "no" } ]; 118 122 … … 137 141 if (!parsargv::parse(\@_, 138 142 q^use_metadata_files^, \$self->{'use_metadata_files'}, 143 q^recheck_directories^, \$self->{'recheck_directories'}, 139 144 "allow_extra_options")) { 140 145 print STDERR "\nRecPlug uses an incorrect option.\n"; … … 232 237 @dir = readdir (DIR); 233 238 closedir (DIR); 234 239 240 # Re-order the files in the list so any directories ending with .all are moved to the end 241 for ($i = scalar(@dir) - 1; $i >= 0; $i--) { 242 if (-d $dir[$i] && $dir[$i] =~ /\.all$/) { 243 push(@dir, splice(@dir, $i, 1)); 244 } 245 } 246 235 247 # read XML metadata files (if supplied) 236 248 my $additionalmetadata = 0; # is there extra metadata available? … … 252 264 # import each of the files in the directory 253 265 my $out_metadata; 254 foreach $subfile (@dir) { 255 266 my $num_files = scalar(@dir); 267 for (my $i = 0; $i <= scalar(@dir); $i++) { 268 # When every file in the directory has been done, pause for a moment (figuratively!) 
269 # If the -recheck_directories argument hasn't been provided, stop now (default) 270 # Otherwise, re-read the contents of the directory to check for new files 271 # Any new files are added to the @dir list and are processed as normal 272 # This is necessary when documents to be indexed are specified in bibliographic DBs 273 # These files are copied/downloaded and stored in a new folder at import time 274 if ($i == $num_files) { 275 last unless $self->{'recheck_directories'}; 276 277 # Re-read the files in the directory to see if there are any new files 278 last if (!opendir (DIR, $dirname)); 279 my @dirnow = readdir (DIR); 280 closedir (DIR); 281 282 # We're only interested if there are more files than there were before 283 last if (scalar(@dirnow) <= scalar(@dir)); 284 285 # Any new files are added to the end of @dir to get processed by the loop 286 foreach my $subfilenow (@dirnow) { 287 for ($j = 0; $j < $num_files; $j++) { 288 last if ($subfilenow eq $dir[$j]); 289 } 290 if ($j == $num_files) { 291 # New file 292 push(@dir, $subfilenow); 293 } 294 } 295 296 # When the new files have been processed, check again 297 $num_files = scalar(@dir); 298 } 299 300 my $subfile = $dir[$i]; 256 301 last if ($maxdocs != -1 && $count >= $maxdocs); 257 302 next if ($subfile =~ /^\.\.?$/); … … 304 349 $out_metadata, $processor, $maxdocs, $gli); 305 350 } 306 return $count; 307 351 352 return $count; 308 353 } 309 354
Note:
See TracChangeset
for help on using the changeset viewer.