Changeset 28642
- Timestamp:
- 2013-11-19T12:06:05+13:00 (10 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugouts
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm
r28550 r28642 44 44 45 45 my $arguments = [ 46 { 'name' => "group_size",47 'desc' => "{BasPlugout.group_size}",48 'type' => "int",49 'deft' => "1",50 'reqd' => "no",51 'hiddengli' => "no"},52 { 'name' => "output_info",53 'desc' => "{BasPlugout.output_info}",54 'type' => "string",55 'reqd' => "yes",56 'hiddengli' => "yes"},57 46 { 'name' => "xslt_file", 58 47 'desc' => "{BasPlugout.xslt_file}", … … 61 50 'deft' => "", 62 51 'hiddengli' => "no"}, 63 { 'name' => "output_handle",64 'desc' => "{BasPlugout.output_handle}",65 'type' => "string",66 'deft' => 'STDERR',67 'reqd' => "no",68 'hiddengli' => "yes"},69 { 'name' => "verbosity",70 'desc' => "{BasPlugout.verbosity}",71 'type' => "int",72 'deft' => "0",73 'reqd' => "no",74 'hiddengli' => "no"},75 { 'name' => "gzip_output",76 'desc' => "{BasPlugout.gzip_output}",77 'type' => "flag",78 'reqd' => "no",79 'hiddengli' => "no"},80 { 'name' => "debug",81 'desc' => "{BasPlugout.debug}",82 'type' => "flag",83 'reqd' => "no",84 'hiddengli' => "yes"},85 52 { 'name' => "subdir_split_length", 86 53 'desc' => "{BasPlugout.subdir_split_length}", … … 95 62 'deft' => "0", 96 63 'hiddengli' => "no"}, 64 { 'name' => "gzip_output", 65 'desc' => "{BasPlugout.gzip_output}", 66 'type' => "flag", 67 'reqd' => "no", 68 'hiddengli' => "no"}, 69 { 'name' => "verbosity", 70 'desc' => "{BasPlugout.verbosity}", 71 'type' => "int", 72 'deft' => "0", 73 'reqd' => "no", 74 'hiddengli' => "no"}, 75 { 'name' => "output_info", 76 'desc' => "{BasPlugout.output_info}", 77 'type' => "string", 78 'reqd' => "yes", 79 'hiddengli' => "yes"}, 80 { 'name' => "output_handle", 81 'desc' => "{BasPlugout.output_handle}", 82 'type' => "string", 83 'deft' => 'STDERR', 84 'reqd' => "no", 85 'hiddengli' => "yes"}, 86 { 'name' => "debug", 87 'desc' => "{BasPlugout.debug}", 88 'type' => "flag", 89 'reqd' => "no", 90 'hiddengli' => "yes"}, 97 91 { 'name' => 'no_rss', 98 92 'desc' => '{BasPlugout.no_rss}', 99 93 'type' => 'flag', 100 94 'reqd' => 'no', 101 'hiddengli' => 
'yes'} 95 'hiddengli' => 'yes'}, 102 96 ]; 103 97 … … 158 152 } 159 153 154 # for group processing 160 155 $self->{'gs_count'} = 0; 156 $self->{'group_position'} = 1; 161 157 162 158 $self->{'keep_import_structure'} = 0; … … 398 394 # Used to be '<!DOCTYPE Archive SYSTEM ...' 399 395 400 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n"; 396 print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n"; 401 397 } 402 398 … … 438 434 } 439 435 440 436 # This is called by the plugins after read_into_doc_obj generates the doc_obj. 441 437 sub process { 442 438 my $self = shift (@_); 443 439 my ($doc_obj) = @_; 444 440 441 my $output_info = $self->{'output_info'}; 442 return if (!defined $output_info); 443 445 444 # for OAI purposes 446 445 $doc_obj->set_lastmodified(); 447 446 $doc_obj->set_oailastmodified(); 448 447 449 if ($self->{'group_size'} > 1) { 450 $self->group_process ($doc_obj); 451 return; 452 } 453 454 my $OID = $doc_obj->get_OID(); 455 $OID = "NULL" unless defined $OID; 456 457 my $top_section = $doc_obj->get_top_section(); 458 459 #get document's directory 460 my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename()); 461 462 my $output_info = $self->{'output_info'}; 463 return if (!defined $output_info); 464 448 # find out which directory to save to 449 my $doc_dir = ""; 450 if ($self->is_group()) { 451 $doc_dir = $self->get_group_doc_dir($doc_obj); 452 } else { 453 $doc_dir = $self->get_doc_dir($doc_obj); 454 } 455 465 456 ############################## 466 457 # call subclass' saveas method 467 458 ############################## 468 459 $self->saveas($doc_obj,$doc_dir); 469 $self->archiveinf_db($doc_obj,$doc_dir); 470 460 461 # write out data to archiveinf-doc.db 462 $self->archiveinf_db($doc_obj); 463 464 if ($self->is_group()) { 465 $self->{'gs_count'}++; # do we want this for all cases? 
466 $self->{'group_position'}++; 467 } 471 468 } 472 469 … … 477 474 my $output_info = $self->{'output_info'}; 478 475 my $metaname = $self->{'sortmeta'}; 479 476 477 my $group_position; 478 if ($self->is_group()) { 479 $group_position = $self->{'group_position'}; 480 } 480 481 if (!defined $metaname || $metaname !~ /\S/) { 481 482 my $OID = $doc_obj->get_OID(); 482 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "" );483 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position); 483 484 return; 484 485 } 485 486 486 487 if ($metaname eq "OID") { # sort by OID 487 488 my $OID = $doc_obj->get_OID(); 488 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID );489 $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef); 489 490 return; 490 491 } 491 492 492 493 my $metadata = ""; … … 506 507 507 508 # store reference in the output_info 508 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata); 509 510 } 511 512 sub group_process { 513 509 $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef); 510 511 } 512 513 514 515 sub saveas { 516 my $self = shift (@_); 517 my ($doc_obj, $doc_dir) = @_; 518 519 die "Basplug::saveas function must be implemented in sub classes\n"; 520 } 521 522 sub get_group_doc_dir { 514 523 my $self = shift (@_); 515 524 my ($doc_obj) = @_; 516 525 526 my $outhandle = $self->{'output_handle'}; 517 527 my $OID = $doc_obj->get_OID(); 518 528 $OID = "NULL" unless defined $OID; … … 521 531 my $gs_count = $self->{'gs_count'}; 522 532 my $open_new_file = (($gs_count % $groupsize)==0); 523 my $outhandle = $self->{'output_handle'}; 524 525 # opening a new file, or document has assoicated files => directory needed 526 if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) { 527 528 # The directory the archive file (doc.xml) and all associated files 529 # should end up in 530 my $doc_dir; 531 # If 
we've determined its time for a new file, open it now 532 if ($open_new_file || !defined($self->{'gs_doc_dir'})) 533 { 534 $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename()); 535 # only if opening new file 536 my $output_dir = $self->get_output_dir(); 537 &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir); 538 my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml"); 539 my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 540 541 if ($gs_count>0) 542 { 543 return if (!$self->close_file_output()); 544 } 545 546 open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return); 547 548 binmode(GROUPPROCESS, ":utf8"); 549 $self->{'gs_filename'} = $doc_file; 550 $self->{'short_doc_file'} = $short_doc_file; 551 $self->{'gs_OID'} = $OID; 552 $self->{'gs_doc_dir'} = $doc_dir; 553 554 $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive'); 555 } 556 # Otherwise load the same archive document directory used last time 557 else 558 { 559 $doc_dir = $self->{'gs_doc_dir'}; 560 } 561 562 # copy all the associated files, add this information as metadata 563 # to the document 564 print $outhandle "Writing associated files to $doc_dir\n"; 565 $self->process_assoc_files ($doc_obj, $doc_dir); 566 567 # look up 'gsdlmetafile' metadata and store that information 568 # explicitly in $doc_obj 569 $self->process_metafiles_metadata ($doc_obj); 570 } 571 572 # save this document 573 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section()); 574 print GROUPPROCESS $section_text; 575 576 $self->{'gs_count'}++; 577 } 578 579 580 sub saveas { 581 my $self = shift (@_); 582 583 die "Basplug::saveas function must be implemented in sub classes\n"; 584 } 585 533 534 my $doc_dir; 535 536 if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) { 537 # if we have some assoc files, then we 
will need to start a new file 538 if ($self->{'verbosity'} > 2) { 539 print $outhandle " Starting a archives folder for $OID as it has associated files\n"; 540 } 541 $open_new_file = 1; 542 } 543 544 # opening a new file 545 if (($open_new_file) || !defined($self->{'gs_doc_dir'})) { 546 # first we close off the old output 547 if ($gs_count>0) 548 { 549 return if (!$self->close_group_output()); 550 } 551 552 # this will create the directory 553 $doc_dir = $self->get_doc_dir ($doc_obj); 554 $self->{'new_doc_dir'} = 1; 555 $self->{'gs_doc_dir'} = $doc_dir; 556 $self->{'group_position'} = 1; 557 } 558 else { 559 $doc_dir = $self->{'gs_doc_dir'}; 560 $self->{'new_doc_dir'} = 0; 561 } 562 return $doc_dir; 563 564 } 586 565 sub get_doc_dir { 587 my $self = shift (@_); 588 my ($OID, $source_filename) = @_; 566 567 my $self = shift (@_); 568 my ($doc_obj) = @_; 569 570 my $OID = $doc_obj->get_OID(); 571 $OID = "NULL" unless defined $OID; 589 572 590 573 my $working_dir = $self->get_output_dir(); … … 603 586 elsif ($self->{'keep_import_structure'}) 604 587 { 588 my $source_filename = $doc_obj->get_source_filename(); 605 589 $source_filename = &File::Basename::dirname($source_filename); 606 590 $source_filename =~ s/[\\\/]+/\//g; … … 616 600 } 617 601 618 if (!defined $self->{'group'} || !$self->{'group'}){ 619 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir)); 620 } 602 &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir)); 621 603 622 604 return $doc_dir; 623 605 } 624 606 625 # Before parallel building, this was the method that created the new doc dirs in archives626 sub get_new_doc_dir_OLD {627 my $self = shift (@_);628 my($working_info,$working_dir,$OID) = @_;629 630 631 my $doc_dir = "";632 my $doc_dir_rest = $OID;633 634 # remove any \ and / from the OID635 $doc_dir_rest =~ s/[\\\/]//g;636 637 # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters638 
$doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);639 640 my $doc_dir_num = 0;641 642 do {643 $doc_dir .= "/" if $doc_dir_num > 0;644 my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';645 if ($self->{'subdir_hash_prefix'})646 {647 $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';648 }649 #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {650 if ($doc_dir_rest =~ s/$pattern//i)651 {652 $doc_dir .= $1;653 $doc_dir_num++;654 }655 } while ($doc_dir_rest ne "" &&656 ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||657 ($working_info->size() >= 1024 && $doc_dir_num < 2)));658 my $i = 1;659 my $doc_dir_base = $doc_dir;660 while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {661 $doc_dir = "$doc_dir_base-$i";662 $i++;663 }664 665 return "$doc_dir.dir";666 }667 607 668 608 ## @function get_new_doc_dir() … … 914 854 my $doc_info = $working_info->get_info($oid); 915 855 916 my ($doc_file,$index_status,$sortmeta ) = @$doc_info;856 my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info; 917 857 # doc_file is the path to the archive doc.xml. 
Make sure it has unix 918 858 # slashes, then if the collection is copied to linux, it can be built without reimport … … 924 864 'assoc-file' => [], 925 865 'meta-file' => [] }; 926 866 if (defined $group_position) { 867 $oid_files->{'group-position'} = $group_position; 868 } 927 869 my $reverse_lookups = { $source_filename => "1" }; 928 870 … … 989 931 $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ]; 990 932 $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ]; 933 if (defined $oid_files->{'group-position'}) { 934 $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ]; 935 } 991 936 992 937 my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append"); … … 1083 1028 } 1084 1029 1085 sub close_file_output1086 {1087 my ($self) = @_;1088 1089 # make sure that the handle has been opened - it won't be if we failed1090 # to import any documents...1091 if (defined(fileno(GROUPPROCESS))) {1092 $self->output_xml_footer('GROUPPROCESS','Archive');1093 close GROUPPROCESS;1094 }1095 1096 my $OID = $self->{'gs_OID'};1097 my $short_doc_file = $self->{'short_doc_file'};1098 1099 if ($self->{'gzip'}) {1100 my $doc_file = $self->{'gs_filename'};1101 `gzip $doc_file`;1102 $doc_file .= ".gz";1103 $short_doc_file .= ".gz";1104 if (!&FileUtils::fileExists($doc_file)) {1105 my $outhandle = $self->{'output_handle'};1106 print $outhandle "error while gzipping: $doc_file doesn't exist\n";1107 return 0;1108 }1109 }1110 1111 # store reference in output_info1112 my $output_info = $self->{'output_info'};1113 return 0 if (!defined $output_info);1114 $output_info->add_info($OID, $short_doc_file, undef, undef);1115 return 1;1116 }1117 1030 1118 1031 -
main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm
r27522 r28642 40 40 } 41 41 42 my $arguments = []; 43 42 my $arguments = [ 43 { 'name' => "group_size", 44 'desc' => "{BasePlugout.group_size}", 45 'type' => "int", 46 'deft' => "1", 47 'reqd' => "no", 48 'hiddengli' => "no"} 49 ]; 44 50 my $options = { 'name' => "GreenstoneXMLPlugout", 45 51 'desc' => "{GreenstoneXMLPlugout.desc}", 46 52 'abstract' => "no", 47 'inherits' => "yes" }; 53 'inherits' => "yes", 54 'args' => $arguments }; 48 55 49 56 sub new { … … 60 67 } 61 68 69 sub is_group { 70 my $self = shift (@_); 71 return ($self->{'group_size'} > 1); 72 } 73 62 74 sub saveas { 63 75 my $self = shift (@_); 64 my ($doc_obj,$doc_dir) = @_; 65 76 my ($doc_obj, $doc_dir) = @_; 66 77 my $outhandler; 67 78 my $output_file; 68 79 if ($self->{'debug'}) { 69 80 $outhandler = STDOUT; 70 # can we do the xslt and still do debug mode?71 81 } 72 82 else { 73 my $output_dir = $self->get_output_dir(); 74 if (!&FileUtils::directoryExists($output_dir)) 75 { 76 &FileUtils::makeAllDirectories($output_dir); 77 } 83 84 $self->process_assoc_files($doc_obj, $doc_dir, ''); 85 $self->process_metafiles_metadata ($doc_obj); 86 87 # open up the outhandler 88 if ($self->is_group() && !$self->{'new_doc_dir'}) { 89 # we already have a handle open ?? 
90 $outhandler = $self->{'group_outhandler'}; 91 } else { 92 $output_file = &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml"); 93 # open the new handle 94 $self->open_xslt_pipe($output_file, $self->{'xslt_file'}); 78 95 79 my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir); 80 if (!&FileUtils::directoryExists($working_dir)) 81 { 82 &FileUtils::makeAllDirectories($working_dir); 83 } 96 if (defined $self->{'xslt_writer'}){ 97 $outhandler = $self->{'xslt_writer'}; 98 } 99 else{ 100 $outhandler = $self->get_output_handler($output_file); 101 } 102 103 if ($self->is_group()) { 104 $self->{'group_outhandler'} = $outhandler; 105 } 106 } 107 } # else not debug 108 binmode($outhandler,":utf8"); 84 109 85 $self->process_assoc_files ($doc_obj, $doc_dir, ''); 86 87 $self->process_metafiles_metadata ($doc_obj); 88 89 $output_file = &FileUtils::filenameConcatenate($working_dir, "doc.xml"); 90 91 $self->open_xslt_pipe($output_file, $self->{'xslt_file'}); 92 93 if (defined $self->{'xslt_writer'}){ 94 $outhandler = $self->{'xslt_writer'}; 95 } 96 else{ 97 $outhandler = $self->get_output_handler($output_file); 98 } 110 # only output the header if we have started a new doc 111 if (!$self->is_group() || $self->{'new_doc_dir'}) { 112 $self->output_xml_header($outhandler); 99 113 } 100 114 101 binmode($outhandler,":utf8"); 115 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section()); 116 print $outhandler $section_text; 117 118 # only output the footer if we are not doing group stuff. 
The group file will be finished in close_group_output 119 if (!$self->is_group()) { 120 $self->output_xml_footer($outhandler); 121 } 102 122 103 $self->output_xml_header($outhandler,"Archive"); 104 my $section_output = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section()); 105 print $outhandler $section_output; 106 $self->output_xml_footer($outhandler,"Archive"); 107 108 if (!$self->{'debug'}) { 123 # close off the output - in a group process situation, this will be done by close_group_output 124 if (!$self->is_group() && !$self->{'debug'}) { 109 125 if (defined $self->{'xslt_writer'}){ 110 126 $self->close_xslt_pipe(); … … 113 129 &FileUtils::closeFileHandle($output_file, \$outhandler) if defined $output_file; 114 130 } 115 116 $self->{'short_doc_file'} = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");117 118 $self->store_output_info_reference($doc_obj);119 131 } 132 $self->{'short_doc_file'} = &FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 133 134 $self->store_output_info_reference($doc_obj); 135 120 136 } 121 137 138 sub output_xml_header { 139 my $self = shift (@_); 140 my ($outhandle) = @_; 141 142 print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n"; 143 print $outhandle "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n"; 144 print $outhandle "<Archive>\n"; 145 } 146 147 sub output_xml_footer { 148 my $self = shift (@_); 149 my ($outhandle) = @_; 150 151 print $outhandle "</Archive>\n"; 152 } 153 154 sub close_group_output 155 { 156 my $self = shift(@_); 157 158 # make sure that the handle has been opened - it won't be if we failed 159 # to import any documents... 
160 my $outhandle = $self->{'group_outhandler'}; 161 if (defined(fileno($outhandle))) { 162 $self->output_xml_footer($outhandle); 163 &FileUtils::closeFileHandle("", \$outhandle); 164 undef $self->{'group_outhandler'} 165 } 166 167 my $OID = $self->{'gs_OID'}; 168 my $short_doc_file = $self->{'short_doc_file'}; 169 170 ### TODO - from here is old code. check that it is still valid. 171 if ($self->{'gzip'}) { 172 my $doc_file = $self->{'gs_filename'}; 173 `gzip $doc_file`; 174 $doc_file .= ".gz"; 175 $short_doc_file .= ".gz"; 176 if (!&FileUtils::fileExists($doc_file)) { 177 my $outhandle = $self->{'output_handle'}; 178 print $outhandle "error while gzipping: $doc_file doesn't exist\n"; 179 return 0; 180 } 181 } 182 183 return 1; 184 } 122 185 123 186
Note: See TracChangeset for help on using the changeset viewer.