Changeset 26536 for main/trunk/greenstone2
- Timestamp:
- 2012-11-28T11:59:17+13:00 (11 years ago)
- Location:
- main/trunk/greenstone2
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/import.pl
r23372 r26536 73 73 [ { 'name' => "hash", 74 74 'desc' => "{import.OIDtype.hash}" }, 75 { 'name' => "hash_on_full_filename", 76 'desc' => "{import.OIDtype.hash_on_full_filename}" }, 75 77 { 'name' => "assigned", 76 78 'desc' => "{import.OIDtype.assigned}" }, … … 78 80 'desc' => "{import.OIDtype.incremental}" }, 79 81 { 'name' => "dirname", 80 'desc' => "{import.OIDtype.dirname}" } ]; 82 'desc' => "{import.OIDtype.dirname}" }, 83 { 'name' => "full_filename", 84 'desc' => "{import.OIDtype.full_filename}" } ]; 81 85 82 86 -
main/trunk/greenstone2/perllib/doc.pm
r26221 r26536 204 204 my ($type, $metadata) = @_; 205 205 206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml| incremental|filename|dirname|assigned)$/) {206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) { 207 207 $self->{'OIDtype'} = $type; 208 208 } else { … … 404 404 my $filename = $self->get_source_filename(); 405 405 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/); 406 $OID = &util::tidy_up_oid($OID); 407 } elsif ($self->{'OIDtype'} eq "full_filename") { 408 my $source_filename = $self->get_source_filename(); 409 my $dirsep = &util::get_os_dirsep(); 410 411 $source_filename =~ s/^import$dirsep//; 412 $source_filename =~ s/$dirsep/-/g; 413 $source_filename =~ s/\./_/g; 414 415 $OID = $source_filename; 406 416 $OID = &util::tidy_up_oid($OID); 407 417 } elsif ($self->{'OIDtype'} eq "dirname") { … … 439 449 if ($use_hash_oid) { 440 450 my $hash_on_file = 1; 451 my $hash_on_ga_xml = 0; 452 441 453 if ($self->{'OIDtype'} eq "hash_on_ga_xml") { 442 454 $hash_on_file = 0; 455 $hash_on_ga_xml = 1; 443 456 } 457 458 if ($self->{'OIDtype'} eq "hash_on_full_filename") { 459 $hash_on_file = 0; 460 $hash_on_ga_xml = 0; 461 462 my $source_filename = $self->get_source_filename(); 463 my $dirsep = &util::get_os_dirsep(); 464 465 $source_filename =~ s/^import$dirsep//; 466 $source_filename =~ s/$dirsep/-/g; 467 $source_filename =~ s/\./_/g; 468 469 # If the filename is very short then (handled naively) 470 # this can cause congestion in the hash-values 471 # computed, leading to documents sharing the same leading 472 # Hex values in the computed hash. 
473 # 474 # The solution taken here is to replicate the name of 475 # the file a sufficient number of times (up to 476 # the character limit defined in 'rep_limit') and 477 # make that the content that is hashed on 478 479 # *** Think twice before changing the following value 480 # as it will break backward compatibility of computed 481 # document HASH values 482 483 my $rep_limit = 256; 484 my $hash_content = undef; 485 486 if (length($source_filename)<$rep_limit) { 487 my $rep_string = "$source_filename|"; 488 my $rs_len = length($rep_string); 489 490 my $clone_times = int(($rep_limit-1)/$rs_len) +1; 491 492 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit); 493 } 494 else { 495 $hash_content = $source_filename; 496 } 497 498 my $filename = &util::get_tmp_filename(); 499 if (!open (OUTFILE, ">:utf8", $filename)) { 500 print STDERR "doc::set_OID could not write to $filename\n"; 501 } else { 502 print OUTFILE $hash_content; 503 close (OUTFILE); 504 } 505 $OID = $self->_calc_OID ($filename); 506 507 print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n"; 508 509 &util::rm ($filename); 510 } 511 444 512 if ($hash_on_file) { 445 513 # "hash" OID - feed file to hashfile.exe … … 450 518 $OID = $self->_calc_OID ($filename); 451 519 } else { 452 $hash_on_ file = 0;520 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead 453 521 } 454 522 } 455 if (!$hash_on_file) { 523 524 if ($hash_on_ga_xml) { 525 # In addition to being asked to explicitly calculate the hash based on the GA file, 526 # can also end up coming into this block if doing 'hash_on_file' but the file 527 # itself is of zero bytes (as could be the case with 'doc.nul' file) 528 456 529 my $filename = &util::get_tmp_filename(); 457 530 if (!open (OUTFILE, ">:utf8", $filename)) { -
main/trunk/greenstone2/perllib/docproc.pm
r17747 r26536 54 54 my ($type, $metadata) = @_; 55 55 56 if ($type =~ /^(hash| incremental|dirname|assigned)$/) {56 if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) { 57 57 $self->{'OIDtype'} = $type; 58 58 } else { -
main/trunk/greenstone2/perllib/inexport.pm
r26451 r26536 297 297 298 298 if (!defined $self->{'OIDtype'} 299 || ($self->{'OIDtype'} !~ /^(hash|incremental|assigned|dirname)$/ )) { 299 || ($self->{'OIDtype'} !~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/ )) { 300 # OIDtype was either not defined on the command-line, or if it was not one of the recognized values 300 301 if (defined $collectcfg->{'OIDtype'} 301 && $collectcfg->{'OIDtype'} =~ /^(hash| incremental|assigned|dirname)$/) {302 && $collectcfg->{'OIDtype'} =~ /^(hash|hash_on_full_filename|incremental|assigned|dirname|full_filename)$/) { 302 303 $self->{'OIDtype'} = $collectcfg->{'OIDtype'}; 303 304 } else { … … 548 549 $processor->setoutputdir ($archivedir); 549 550 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; 551 550 552 $processor->set_OIDtype ($OIDtype, $OIDmetadata); 551 553 -
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r26221 r26536 97 97 { 'name' => "hash_on_ga_xml", 98 98 'desc' => "{import.OIDtype.hash_on_ga_xml}" }, 99 { 'name' => "hash_on_full_filename", 100 'desc' => "{import.OIDtype.hash_on_full_filename}" }, 99 101 { 'name' => "assigned", 100 102 'desc' => "{import.OIDtype.assigned}" }, … … 104 106 'desc' => "{import.OIDtype.filename}" }, 105 107 { 'name' => "dirname", 106 'desc' => "{import.OIDtype.dirname}" } ]; 108 'desc' => "{import.OIDtype.dirname}" }, 109 { 'name' => "full_filename", 110 'desc' => "{import.OIDtype.full_filename}" } ]; 107 111 108 112 my $arguments = -
main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm
r24958 r26536 292 292 my ($type, $metadata) = @_; 293 293 294 if ($type =~ /^(hash| incremental|dirname|assigned)$/) {294 if ($type =~ /^(hash|hash_on_full_filename|incremental|dirname|full_filename|assigned)$/) { 295 295 $self->{'OIDtype'} = $type; 296 296 } else { -
main/trunk/greenstone2/perllib/strings.properties
r26268 r26536 279 279 import.OIDtype.hash:Hash the contents of the file. Document identifiers will be the same every time the collection is imported. 280 280 import.OIDtype.hash_on_ga_xml:Hash the contents of the Greenstone Archive XML file. Document identifiers will be the same every time the collection is imported as long as the metadata does not change. 281 import.OIDtype.hash_on_full_filename:Hash on the full filename to the document within the 'import' folder (and not its contents). Helps make document identifiers more stable across upgrades of the software, although it means that duplicate documents contained in the collection are no longer detected automatically. 281 282 282 283 import.OIDtype.incremental:Use a simple document count. Significantly faster than "hash", but does not necessarily assign the same identifier to the same document content if the collection is reimported. … … 285 286 286 287 import.OIDtype.dirname:Use the parent directory name (preceded by 'J'). There should only be one document per directory, and directory names should be unique. E.g. import/b13as/h15ef/page.html will get an identifier of Jh15ef. 288 289 import.OIDtype.filename:Use the tail file name. Requires every filename across all the folders within 'import' to be unique. 290 291 import.OIDtype.full_filename:Use the full file name within the 'import' folder as the identifier for the document (with _ and - substitutions made for symbols such as directory separators and the full stop in a filename extension). 287 292 288 293 import.OIDmetadata:Specifies the metadata element that holds the document's unique identifier, for use with -OIDtype=assigned.
Note:
See TracChangeset
for help on using the changeset viewer.