Changeset 28642


Ignore:
Timestamp:
2013-11-19T12:06:05+13:00 (10 years ago)
Author:
kjdon
Message:

The group processing code was specific to the GreenstoneXML format, so it has been moved into GreenstoneXMLPlugout. Tidied up the code and reordered the options. Group processing now writes out the correct archiveinf databases.

Location:
main/trunk/greenstone2/perllib/plugouts
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r28550 r28642  
    4444
    4545my $arguments = [
    46        { 'name' => "group_size",
    47     'desc' => "{BasPlugout.group_size}",
    48     'type' => "int",
    49         'deft' =>  "1",
    50     'reqd' => "no",
    51     'hiddengli' => "no"},
    52        { 'name' => "output_info",
    53     'desc' => "{BasPlugout.output_info}",
    54     'type' => "string",   
    55     'reqd' => "yes",
    56     'hiddengli' => "yes"},       
    5746       { 'name' => "xslt_file",
    5847    'desc' => "{BasPlugout.xslt_file}",
     
    6150     'deft' => "",
    6251    'hiddengli' => "no"},
    63        { 'name' => "output_handle",
    64     'desc' => "{BasPlugout.output_handle}",
    65     'type' => "string",
    66         'deft' =>  'STDERR',
    67     'reqd' => "no",
    68     'hiddengli' => "yes"},
    69        { 'name' => "verbosity",
    70     'desc' => "{BasPlugout.verbosity}",
    71     'type' => "int",
    72         'deft' =>  "0",
    73     'reqd' => "no", 
    74         'hiddengli' => "no"},
    75        { 'name' => "gzip_output",
    76     'desc' => "{BasPlugout.gzip_output}",
    77     'type' => "flag",
    78     'reqd' => "no", 
    79         'hiddengli' => "no"},
    80        { 'name' => "debug",
    81      'desc' => "{BasPlugout.debug}",
    82      'type' => "flag",
    83      'reqd' => "no",
    84      'hiddengli' => "yes"},
    8552       { 'name' => "subdir_split_length",
    8653     'desc' => "{BasPlugout.subdir_split_length}",
     
    9562         'deft' => "0",
    9663     'hiddengli' => "no"},
     64       { 'name' => "gzip_output",
     65    'desc' => "{BasPlugout.gzip_output}",
     66    'type' => "flag",
     67    'reqd' => "no", 
     68        'hiddengli' => "no"},
     69        { 'name' => "verbosity",
     70    'desc' => "{BasPlugout.verbosity}",
     71    'type' => "int",
     72        'deft' =>  "0",
     73    'reqd' => "no", 
     74        'hiddengli' => "no"},
     75      { 'name' => "output_info",
     76    'desc' => "{BasPlugout.output_info}",
     77    'type' => "string",   
     78    'reqd' => "yes",
     79    'hiddengli' => "yes"},       
     80       { 'name' => "output_handle",
     81    'desc' => "{BasPlugout.output_handle}",
     82    'type' => "string",
     83        'deft' =>  'STDERR',
     84    'reqd' => "no",
     85    'hiddengli' => "yes"},
     86       { 'name' => "debug",
     87     'desc' => "{BasPlugout.debug}",
     88     'type' => "flag",
     89     'reqd' => "no",
     90     'hiddengli' => "yes"},
    9791       { 'name' => 'no_rss',
    9892         'desc' => '{BasPlugout.no_rss}',
    9993         'type' => 'flag',
    10094         'reqd' => 'no',
    101          'hiddengli' => 'yes'}
     95         'hiddengli' => 'yes'},
    10296];
    10397
     
    158152    }
    159153
     154    # for group processing
    160155    $self->{'gs_count'} = 0;
     156    $self->{'group_position'} = 1;
    161157
    162158    $self->{'keep_import_structure'} = 0;
     
    398394    # Used to be '<!DOCTYPE Archive SYSTEM ...'
    399395   
    400     print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
     396    print $handle "<!DOCTYPE $doctype SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n"; 
    401397    }
    402398
     
    438434}
    439435
    440 
     436# This is called by the plugins after read_into_doc_obj generates the doc_obj.
    441437sub process {
    442438    my $self = shift (@_);
    443439    my ($doc_obj) = @_;
    444    
     440
     441    my $output_info = $self->{'output_info'};
     442    return if (!defined $output_info);
     443
    445444    # for OAI purposes
    446445    $doc_obj->set_lastmodified();
    447446    $doc_obj->set_oailastmodified();
    448447
    449      if ($self->{'group_size'} > 1) {
    450      $self->group_process ($doc_obj);
    451     return;
    452     }
    453 
    454     my $OID = $doc_obj->get_OID();
    455     $OID = "NULL" unless defined $OID;     
    456 
    457     my $top_section = $doc_obj->get_top_section();
    458 
    459     #get document's directory
    460     my $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
    461    
    462     my $output_info = $self->{'output_info'};
    463     return if (!defined $output_info);
    464      
     448    # find out which directory to save to
     449    my $doc_dir = "";
     450    if ($self->is_group()) {
     451    $doc_dir = $self->get_group_doc_dir($doc_obj);     
     452    } else {
     453    $doc_dir = $self->get_doc_dir($doc_obj);
     454    }
     455     
    465456    ##############################
    466457    # call subclass' saveas method
    467458    ##############################
    468459    $self->saveas($doc_obj,$doc_dir);
    469     $self->archiveinf_db($doc_obj,$doc_dir);
    470 
     460
     461    # write out data to archiveinf-doc.db
     462    $self->archiveinf_db($doc_obj);
     463
     464    if ($self->is_group()) {
     465    $self->{'gs_count'}++; # do we want this for all cases?
     466    $self->{'group_position'}++;
     467    }
    471468}
    472469
     
    477474    my $output_info = $self->{'output_info'};
    478475    my $metaname = $self->{'sortmeta'};
    479    
     476
     477    my $group_position;
     478    if ($self->is_group()) {
     479    $group_position = $self->{'group_position'};
     480    }
    480481    if (!defined $metaname || $metaname !~ /\S/) {
    481482    my $OID = $doc_obj->get_OID();
    482     $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "");
     483    $output_info->add_info($OID,$self->{'short_doc_file'}, undef, "", $group_position);
    483484    return;
    484485    }
    485486   
    486     if ($metaname eq "OID") { # sort by OID
     487    if ($metaname eq "OID") { # sort by OID
    487488    my $OID = $doc_obj->get_OID();
    488     $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID);
     489    $output_info->add_info($OID,$self->{'short_doc_file'}, undef, $OID, undef);
    489490    return;
    490     }
     491    }
    491492   
    492493    my $metadata = "";
     
    506507
    507508    # store reference in the output_info     
    508     $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata);
    509    
    510 }
    511 
    512 sub group_process {
    513 
     509    $output_info->add_info($doc_obj->get_OID(),$self->{'short_doc_file'}, undef, $metadata,undef);
     510   
     511}
     512
     513
     514
     515sub saveas {
     516    my $self = shift (@_);
     517    my ($doc_obj, $doc_dir) = @_;
     518   
     519    die "Basplug::saveas function must be implemented in sub classes\n";
     520}
     521
     522sub get_group_doc_dir {
    514523    my $self = shift (@_);
    515524    my ($doc_obj) = @_;
    516    
     525
     526    my $outhandle = $self->{'output_handle'};
    517527    my $OID = $doc_obj->get_OID();
    518528    $OID = "NULL" unless defined $OID;
     
    521531    my $gs_count = $self->{'gs_count'};
    522532    my $open_new_file = (($gs_count % $groupsize)==0);
    523     my $outhandle = $self->{'output_handle'};
    524 
    525     # opening a new file, or document has assoicated files => directory needed
    526     if (($open_new_file) || (scalar(@{$doc_obj->get_assoc_files()})>0)) {
    527          
    528         # The directory the archive file (doc.xml) and all associated files
    529         # should end up in
    530         my $doc_dir;
    531         # If we've determined its time for a new file, open it now
    532         if ($open_new_file || !defined($self->{'gs_doc_dir'}))
    533           {
    534             $doc_dir = $self->get_doc_dir ($OID, $doc_obj->get_source_filename());
    535             # only if opening new file
    536         my $output_dir = $self->get_output_dir();
    537         &FileUtils::makeAllDirectories($output_dir) unless &FileUtils::directoryExists($output_dir);
    538         my $doc_file = &FileUtils::filenameConcatenate($output_dir, $doc_dir, "doc.xml");
    539         my $short_doc_file = &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
    540        
    541         if ($gs_count>0)
    542         {
    543         return if (!$self->close_file_output());
    544         }
    545 
    546         open (GROUPPROCESS, ">$doc_file") or (print $outhandle "BasePlugout::group_process could not write to file $doc_file\n" and return);
    547            
    548         binmode(GROUPPROCESS, ":utf8");
    549         $self->{'gs_filename'} = $doc_file;
    550         $self->{'short_doc_file'} = $short_doc_file;
    551         $self->{'gs_OID'} = $OID;
    552             $self->{'gs_doc_dir'} = $doc_dir;
    553 
    554         $self->output_xml_header('BasePlugout::GROUPPROCESS','Archive');
    555     }
    556         # Otherwise load the same archive document directory used last time
    557         else
    558           {
    559             $doc_dir = $self->{'gs_doc_dir'};
    560           }
    561 
    562     # copy all the associated files, add this information as metadata
    563     # to the document
    564         print $outhandle "Writing associated files to $doc_dir\n";
    565     $self->process_assoc_files ($doc_obj, $doc_dir);
    566 
    567     # look up 'gsdlmetafile' metadata and store that information
    568     # explicitly in $doc_obj
    569     $self->process_metafiles_metadata ($doc_obj);
    570     }
    571 
    572     # save this document
    573     my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    574     print GROUPPROCESS $section_text;
    575  
    576     $self->{'gs_count'}++;
    577 }
    578 
    579 
    580 sub saveas {
    581     my $self = shift (@_);
    582    
    583     die "Basplug::saveas function must be implemented in sub classes\n";
    584 }
    585 
     533
     534    my $doc_dir;
     535
     536    if (!$open_new_file && scalar(@{$doc_obj->get_assoc_files()})>0) {
     537    # if we have some assoc files, then we will need to start a new file
     538    if ($self->{'verbosity'} > 2) {
     539        print $outhandle " Starting a archives folder for $OID as it has associated files\n";
     540    }
     541    $open_new_file = 1;
     542    }
     543   
     544    # opening a new file
     545    if (($open_new_file)  || !defined($self->{'gs_doc_dir'})) {
     546    # first we close off the old output
     547    if ($gs_count>0)
     548    {
     549        return if (!$self->close_group_output());
     550    }
     551
     552    # this will create the directory
     553    $doc_dir = $self->get_doc_dir ($doc_obj);
     554    $self->{'new_doc_dir'} = 1;
     555    $self->{'gs_doc_dir'} = $doc_dir;
     556    $self->{'group_position'} = 1;
     557    }
     558    else {
     559    $doc_dir = $self->{'gs_doc_dir'};
     560    $self->{'new_doc_dir'} = 0;
     561    }
     562    return $doc_dir;
     563
     564}
    586565sub get_doc_dir {
    587     my $self = shift (@_);
    588     my ($OID, $source_filename) = @_;
     566   
     567    my $self = shift (@_);
     568    my ($doc_obj) = @_;
     569
     570    my $OID = $doc_obj->get_OID();
     571    $OID = "NULL" unless defined $OID;
    589572
    590573    my $working_dir  = $self->get_output_dir();
     
    603586    elsif ($self->{'keep_import_structure'})
    604587    {
     588    my $source_filename = $doc_obj->get_source_filename();
    605589    $source_filename = &File::Basename::dirname($source_filename);
    606590    $source_filename =~ s/[\\\/]+/\//g;
     
    616600    }
    617601
    618     if (!defined $self->{'group'} || !$self->{'group'}){
    619     &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
    620     }
     602    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($working_dir, $doc_dir));
    621603
    622604    return $doc_dir;
    623605}
    624606
    625 # Before parallel building, this was the method that created the new doc dirs in archives
    626 sub get_new_doc_dir_OLD {
    627    my $self = shift (@_); 
    628    my($working_info,$working_dir,$OID) = @_;     
    629    
    630    
    631    my $doc_dir = "";
    632    my $doc_dir_rest = $OID;
    633 
    634    # remove any \ and / from the OID
    635    $doc_dir_rest =~ s/[\\\/]//g;
    636 
    637    # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
    638    $doc_dir_rest =~ s/\://g if ($ENV{'GSDLOS'} =~ /^windows$/i);
    639 
    640    my $doc_dir_num = 0;
    641 
    642    do {
    643        $doc_dir .= "/" if $doc_dir_num > 0;
    644        my $pattern = '^(.{1,' . $self->{'subdir_split_length'} . '})';
    645        if ($self->{'subdir_hash_prefix'})
    646        {
    647          $pattern = '^((HASH)?.{1,' . $self->{'subdir_split_length'} . '})';
    648        }
    649        #if ($doc_dir_rest =~ s/^(.{1,$limit})//) {
    650        if ($doc_dir_rest =~ s/$pattern//i)
    651        {
    652        $doc_dir .= $1;
    653        $doc_dir_num++;
    654        }
    655    } while ($doc_dir_rest ne "" &&
    656         ((-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) ||
    657          ($working_info->size() >= 1024 && $doc_dir_num < 2)));
    658    my $i = 1;
    659    my $doc_dir_base = $doc_dir;
    660    while (-d &FileUtils::filenameConcatenate($working_dir, "$doc_dir.dir")) {
    661        $doc_dir = "$doc_dir_base-$i";
    662        $i++;
    663    }
    664          
    665    return "$doc_dir.dir";
    666 }
    667607
    668608## @function get_new_doc_dir()
     
    914854    my $doc_info = $working_info->get_info($oid);
    915855
    916     my ($doc_file,$index_status,$sortmeta) = @$doc_info;
     856    my ($doc_file,$index_status,$sortmeta, $group_position) = @$doc_info;
    917857    # doc_file is the path to the archive doc.xml. Make sure it has unix
    918858    # slashes, then if the collection is copied to linux, it can be built without reimport
     
    924864              'assoc-file' => [],
    925865              'meta-file'  => [] };
    926    
     866    if (defined $group_position) {
     867    $oid_files->{'group-position'} = $group_position;
     868    }
    927869    my $reverse_lookups = { $source_filename => "1" };
    928870
     
    989931    $oid_files->{'src-file'} = [ $oid_files->{'src-file'} ];
    990932    $oid_files->{'sort-meta'} = [ $oid_files->{'sort-meta'} ];
     933    if (defined $oid_files->{'group-position'}) {
     934    $oid_files->{'group-position'} = [ $oid_files->{'group-position'} ];
     935    }
    991936
    992937    my $infodb_file_handle = &dbutil::open_infodb_write_handle($infodbtype, $doc_db, "append");
     
    10831028}
    10841029
    1085 sub close_file_output
    1086 {
    1087     my ($self) = @_;
    1088  
    1089     # make sure that the handle has been opened - it won't be if we failed
    1090     # to import any documents...
    1091     if (defined(fileno(GROUPPROCESS))) {
    1092     $self->output_xml_footer('GROUPPROCESS','Archive');   
    1093     close GROUPPROCESS;
    1094     }
    1095 
    1096     my $OID = $self->{'gs_OID'};
    1097     my $short_doc_file = $self->{'short_doc_file'};
    1098    
    1099     if ($self->{'gzip'}) {
    1100     my $doc_file = $self->{'gs_filename'};
    1101     `gzip $doc_file`;
    1102     $doc_file .= ".gz";
    1103     $short_doc_file .= ".gz";
    1104     if (!&FileUtils::fileExists($doc_file)) {
    1105          my $outhandle = $self->{'output_handle'};
    1106         print $outhandle "error while gzipping: $doc_file doesn't exist\n";
    1107         return 0;
    1108     }
    1109     }
    1110 
    1111     # store reference in output_info
    1112     my $output_info = $self->{'output_info'};
    1113     return 0 if (!defined $output_info);
    1114     $output_info->add_info($OID, $short_doc_file, undef, undef);
    1115     return 1;
    1116 }
    11171030
    11181031
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm

    r27522 r28642  
    4040}
    4141
    42 my $arguments = [];
    43 
     42my $arguments = [
     43       { 'name' => "group_size",
     44    'desc' => "{BasePlugout.group_size}",
     45    'type' => "int",
     46        'deft' =>  "1",
     47    'reqd' => "no",
     48    'hiddengli' => "no"}
     49    ];
    4450my $options = { 'name'     => "GreenstoneXMLPlugout",
    4551        'desc'     => "{GreenstoneXMLPlugout.desc}",
    4652        'abstract' => "no",
    47         'inherits' => "yes" };
     53        'inherits' => "yes",
     54        'args'     => $arguments };
    4855
    4956sub new {
     
    6067}
    6168
     69sub is_group {
     70    my $self = shift (@_);
     71    return ($self->{'group_size'} > 1);
     72}
     73
    6274sub saveas {
    6375    my $self = shift (@_);
    64     my ($doc_obj,$doc_dir) = @_;
    65 
     76    my ($doc_obj, $doc_dir) = @_;
    6677    my $outhandler;
    6778    my $output_file;
    6879    if ($self->{'debug'}) {
    6980    $outhandler = STDOUT;
    70     # can we do the xslt and still do debug mode?
    7181    }
    7282    else {
    73     my $output_dir = $self->get_output_dir();
    74         if (!&FileUtils::directoryExists($output_dir))
    75         {
    76           &FileUtils::makeAllDirectories($output_dir);
    77         }
     83       
     84    $self->process_assoc_files($doc_obj, $doc_dir, '');
     85    $self->process_metafiles_metadata ($doc_obj);
     86   
     87    # open up the outhandler   
     88    if ($self->is_group() && !$self->{'new_doc_dir'}) {
     89        # we already have a handle open ??
     90        $outhandler = $self->{'group_outhandler'};
     91    } else {
     92        $output_file = &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml");
     93        # open the new handle
     94        $self->open_xslt_pipe($output_file, $self->{'xslt_file'});
    7895
    79     my $working_dir = &FileUtils::filenameConcatenate($output_dir, $doc_dir);
    80         if (!&FileUtils::directoryExists($working_dir))
    81         {
    82           &FileUtils::makeAllDirectories($working_dir);
    83         }
     96        if (defined $self->{'xslt_writer'}){
     97        $outhandler = $self->{'xslt_writer'};
     98        }
     99        else{
     100        $outhandler = $self->get_output_handler($output_file);
     101        }
     102       
     103        if ($self->is_group()) {
     104        $self->{'group_outhandler'} = $outhandler;
     105        }
     106    }
     107    } # else not debug
     108    binmode($outhandler,":utf8");
    84109
    85     $self->process_assoc_files ($doc_obj, $doc_dir, '');
    86 
    87     $self->process_metafiles_metadata ($doc_obj);
    88 
    89     $output_file = &FileUtils::filenameConcatenate($working_dir, "doc.xml");
    90 
    91     $self->open_xslt_pipe($output_file, $self->{'xslt_file'});
    92 
    93     if (defined $self->{'xslt_writer'}){
    94         $outhandler = $self->{'xslt_writer'};
    95     }
    96     else{
    97         $outhandler = $self->get_output_handler($output_file);
    98     }
     110    # only output the header if we have started a new doc
     111    if (!$self->is_group() || $self->{'new_doc_dir'}) {
     112    $self->output_xml_header($outhandler);
    99113    }
    100114
    101     binmode($outhandler,":utf8");
     115    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
     116    print $outhandler $section_text;
     117 
     118    # only output the footer if we are not doing group stuff. The group file will be finished in close_group_output
     119    if (!$self->is_group()) {
     120    $self->output_xml_footer($outhandler);
     121    }
    102122
    103     $self->output_xml_header($outhandler,"Archive");
    104     my $section_output = &docprint::get_section_xml($doc_obj, $doc_obj->get_top_section());
    105     print $outhandler $section_output;
    106     $self->output_xml_footer($outhandler,"Archive");
    107 
    108     if (!$self->{'debug'}) {
     123    # close off the output - in a group process situation, this will be done by close_group_output
     124    if (!$self->is_group() && !$self->{'debug'}) {
    109125    if (defined $self->{'xslt_writer'}){     
    110126        $self->close_xslt_pipe();
     
    113129        &FileUtils::closeFileHandle($output_file, \$outhandler) if defined $output_file;
    114130    }
    115    
    116     $self->{'short_doc_file'} = &FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 
    117    
    118     $self->store_output_info_reference($doc_obj);
    119131    }
     132    $self->{'short_doc_file'} = &FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 
     133   
     134    $self->store_output_info_reference($doc_obj);
     135   
    120136}
    121137
     138sub output_xml_header {
     139    my $self = shift (@_);
     140    my ($outhandle) = @_;
     141
     142    print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
     143    print $outhandle "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
     144    print $outhandle "<Archive>\n";
     145}
     146
     147sub output_xml_footer {
     148    my $self = shift (@_);
     149    my ($outhandle) = @_;
     150
     151    print $outhandle "</Archive>\n";
     152}
     153
     154sub close_group_output
     155{
     156    my $self = shift(@_);
     157 
     158    # make sure that the handle has been opened - it won't be if we failed
     159    # to import any documents...
     160    my $outhandle = $self->{'group_outhandler'};
     161    if (defined(fileno($outhandle))) {
     162    $self->output_xml_footer($outhandle);   
     163    &FileUtils::closeFileHandle("", \$outhandle);
     164    undef $self->{'group_outhandler'}
     165    }
     166
     167    my $OID = $self->{'gs_OID'};
     168    my $short_doc_file = $self->{'short_doc_file'};
     169   
     170    ### TODO - from here is old code. check that it is still valid.
     171    if ($self->{'gzip'}) {
     172    my $doc_file = $self->{'gs_filename'};
     173    `gzip $doc_file`;
     174    $doc_file .= ".gz";
     175    $short_doc_file .= ".gz";
     176    if (!&FileUtils::fileExists($doc_file)) {
     177         my $outhandle = $self->{'output_handle'};
     178        print $outhandle "error while gzipping: $doc_file doesn't exist\n";
     179        return 0;
     180    }
     181    }
     182
     183    return 1;
     184}
    122185
    123186
Note: See TracChangeset for help on using the changeset viewer.