Changeset 28249


Ignore:
Timestamp:
2013-09-10T15:05:36+12:00 (11 years ago)
Author:
davidb
Message:

A very rough cut at dividig a document in to two parts

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm

    r28245 r28249  
    4646    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
    4747        'compulsory-args' => [ "d", "json-sections" ],
    48         'optional-args'   => [ "json-metadata" ],
     48        'optional-args'   => [ "json-metadata", "newd" ],
    4949#       'optional-args'   => [ "where" ],
    5050        'help-string' => [
     
    7878
    7979
     80
     81sub dxml_start_section
     82{
     83    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
     84
     85    my $new_depth = scalar(@$contextArray);
     86
     87    if ($new_depth == 1) {
     88    $parser->{'parameters'}->{'curr_section_depth'} = 1;
     89    $parser->{'parameters'}->{'curr_section_num'}   = "";
     90    }
     91
     92    my $old_depth  = $parser->{'parameters'}->{'curr_section_depth'};
     93    my $old_secnum = $parser->{'parameters'}->{'curr_section_num'};
     94
     95    my $new_secnum;
     96
     97    if ($new_depth > $old_depth) {
     98    # child subsection
     99    $new_secnum = "$old_secnum.1";
     100    }
     101    elsif ($new_depth == $old_depth) {
     102    # sibling section => increase it's value by 1
     103    my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/);
     104    $tail_num++;
     105    $new_secnum = $old_secnum;
     106    $new_secnum =~ s/\.(\d+)$/\.$tail_num/;
     107    }
     108    else {
     109    # back up to parent section => lopp off tail
     110### print STDERR "**### back up to parent section, $old_secnum = $old_secnum\n";
     111
     112    $new_secnum = $old_secnum;
     113    $new_secnum =~ s/\.\d+$//;
     114    }
     115
     116    $parser->{'parameters'}->{'curr_section_depth'} = $new_depth;
     117    $parser->{'parameters'}->{'curr_section_num'}   = $new_secnum;
     118   
     119    1;
     120}
     121
     122
     123sub dxml_section
     124{
     125    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
     126
     127    my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
     128    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;
     129
     130    my $mode = $parser->{'parameters'}->{'mode'};
     131
     132    my $depth = $parser->{'parameters'}->{'curr_section_depth'};
     133
     134    my $live_depth = scalar(@$contextArray);
     135
     136#   print STDERR "closing secdtion tag, mode: $mode, curr sec num = $curr_sec_num, live depth = $live_depth\n";
     137
     138    if ($live_depth == 1) {
     139        # root sectin tag, which must always exist
     140#       print STDERR "*** root, tagname = $tagname, attrHash = $attrHash ",
     141#         " '_content' = ", join(",",@{$attrHash->{'_content'}}), "\n";
     142        return [$tagname => $attrHash];
     143    }
     144    elsif (defined $sec_num_hash->{$curr_sec_num}) {
     145###     print STDERR "*** got a match on $curr_sec_num, mode = $mode\n";
     146
     147        if ($mode eq "extract") {
     148        # keep it
     149        return [$tagname => $attrHash];
     150        }
     151        else {
     152        # remove
     153        return undef;
     154        }
     155    }
     156    else {
     157        # not in our list
     158        if ($mode eq "extract") {
     159        # remove
     160        return undef;
     161        }
     162        else {
     163        # keep it
     164        return [$tagname => $attrHash];
     165        }
     166    }
     167}
     168
     169
     170sub remove_from_doc_xml
     171{
     172    my $self = shift @_;
     173    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_;
     174   
     175    my @start_rules = ('Section' => \&dxml_start_section);
     176   
     177    # Set the call-back functions for the metadata tags
     178    my @rules =
     179    (
     180        _default => 'raw',
     181        'Section' => \&dxml_section
     182    );
     183       
     184    my $parser = XML::Rules->new
     185    (
     186        start_rules => \@start_rules,
     187        rules => \@rules,
     188        style => 'filter',
     189        output_encoding => 'utf8',
     190##   normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
     191#       stripspaces => 2|0|0 # ineffectual
     192    );
     193   
     194    my $status = 0;
     195    my $xml_in = "";
     196    if (!open(MIN,"<$doc_xml_filename"))
     197    {
     198        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
     199        $status = 1;
     200    }
     201    else
     202    {
     203        # Read them in
     204        my $line;
     205        while (defined ($line=<MIN>)) {
     206            $xml_in .= $line;
     207        }
     208        close(MIN);
     209
     210        # Filter with the call-back functions
     211        my $xml_out = "";
     212
     213        my $MOUT;
     214        if (!open($MOUT,">$newdoc_xml_filename")) {
     215            $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
     216            $status = 1;
     217        }
     218        else {
     219            binmode($MOUT,":utf8");
     220
     221            my $options = {sec_num_hash => $sec_num_hash, mode => $mode };
     222
     223            $parser->filter($xml_in, $MOUT, $options);
     224            close($MOUT);       
     225        }
     226    }
     227    return $status;
     228}
     229
     230sub sections_as_hash
     231{
     232    my $self = shift @_;
     233
     234    my ($json_sections_array,$mode) = @_;
     235
     236    my $sec_num_hash = {};
     237
     238    foreach my $sn ( @$json_sections_array ) {
     239
     240    # our XML parser curr_sec_num puts '.' at the root
     241    # Need to do the same here, so things can be matched up
     242    $sec_num_hash->{".$sn"} = 1;
     243
     244### print STDERR "** storeing .$sn\n";
     245
     246    if ($mode eq "with-parents") {
     247        my $sn_copy = $sn; # needs to be a copy, otherwise chaning version stored in json_sections
     248        while ($sn_copy =~ s/\.\d+$//) {
     249        $sec_num_hash->{".$sn_copy"} = 1; # See '.' comment above
     250
     251####        print STDERR "** ***** parent storeing .$sn_copy\n";
     252        }
     253    }
     254    }
     255
     256    return $sec_num_hash;
     257}
     258
     259
     260
     261sub _extract_archives_doc
     262{
     263    my $self = shift @_;
     264
     265    my $collect   = $self->{'collect'};
     266    my $gsdl_cgi  = $self->{'gsdl_cgi'};
     267    my $infodbtype = $self->{'infodbtype'};
     268   
     269    my $site = $self->{'site'};
     270       
     271    # Obtain the collect and archive dir   
     272    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
     273   
     274    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");
     275
     276    # look up additional args
     277    my $docid = $self->{'d'};
     278    my $new_docid = $self->{'newd'} || "HASH" . localtime(time);
     279
     280    my $json_sections_str = $self->{'json-sections'};
     281    my $json_sections_array = decode_json($json_sections_str);
     282
     283
     284    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
     285    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);
     286
     287    my $doc_file = $doc_rec->{'doc-file'}->[0];
     288    my $doc_filename = &util::filename_cat($archive_dir, $doc_file);
     289
     290    my $newdoc_filename = &util::filename_cat($archive_dir, "test.xml");
     291
     292#   # This now stores the full pathname
     293#   my $doc_filename = $doc_rec->{'doc-file'}->[0];
     294
     295    my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents");
     296
     297    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $extract_sec_num_hash, "extract");
     298   
     299    if ($extract_status == 0)
     300    {
     301        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
     302
     303        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $delete_sec_num_hash, "delete");
     304
     305        if ($delete_status == 0) {
     306
     307        my $mess = "document-extract successful: Key[$docid]\n";
     308
     309        $gsdl_cgi->generate_ok_message($mess); 
     310        }
     311        else {
     312        my $mess .= "Failed to extract identified section numbers for key: $docid\n";
     313        $mess .= "Exit status: $delete_status\n";
     314        $mess .= "System Error Message: $!\n";
     315        $mess .= "-" x 20 . "\n";
     316       
     317        $gsdl_cgi->generate_error($mess);
     318        }
     319    }
     320    else
     321    {
     322        my $mess .= "Failed to remove identified section numbers for key: $docid\n";
     323        $mess .= "Exit status: $extract_status\n";
     324        $mess .= "System Error Message: $!\n";
     325        $mess .= "-" x 20 . "\n";
     326       
     327        $gsdl_cgi->generate_error($mess);
     328    }
     329   
     330    #return $status; # in case calling functions have a use for this
     331}
     332
     333
     334
    80335# JSON version that will get the requested metadata values
    81336# from the requested source (index, import, archives or live)
     
    84339sub extract_archives_doc
    85340{
    86     my $self = shift @_;
    87 
    88 #    my $where = $self->{'where'};
    89 #    if (!$where) {
    90 #   $where = "index"; # default behaviour is to get the values from index
    91 #    }
    92 
    93     # Only when setting metadata do we perform authentication and do we lock the collection,
    94     # not when getting metadata
    95 
    96     # for get_meta_array, the where param can only be ONE of import, archives, index, live
    97 #    if($where =~ m/index/) {
    98 #   $self->_get_index_metadata_array(@_);
    99 #    }
    100 #    elsif($where =~ m/archives/) {
    101 #   $self->_get_archives_metadata_array(@_);
    102 #    }
    103 #    elsif($where =~ m/import/) {
    104 #   $self->_get_import_metadata_array(@_);
    105 #    }
    106 #    elsif($where =~ m/live/) {
    107 #       $self->_get_live_metadata_array(@_);
    108 #    }
     341    my $self = shift @_;
     342
     343    my $username  = $self->{'username'};
     344    my $collect   = $self->{'collect'};
     345    my $gsdl_cgi  = $self->{'gsdl_cgi'};
     346   
     347    if ($baseaction::authentication_enabled)
     348    {
     349        # Ensure the user is allowed to edit this collection       
     350        $self->authenticate_user($username, $collect);
     351    }
     352
     353    # Make sure the collection isn't locked by someone else
     354    $self->lock_collection($username, $collect);
     355
     356    $self->_extract_archives_doc(@_);
     357
     358    # Release the lock once it is done
     359    $self->unlock_collection($username, $collect);
     360
     361
    109362}
    110363
Note: See TracChangeset for help on using the changeset viewer.