Changeset 28251


Ignore:
Timestamp:
09/11/13 14:11:32 (8 years ago)
Author:
davidb
Message:

Change to calculation of curr_sec_num so it correctly pops off values as a series of </Section> tags are encountered

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm

    r28249 r28251  
    4646    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
    4747        'compulsory-args' => [ "d", "json-sections" ],
    48         'optional-args'   => [ "json-metadata", "newd" ],
     48        'optional-args'   => [ "json-metadata", "newd",
     49                   "keep-parent-metadata", "keep-parent-content" ],
    4950#       'optional-args'   => [ "where" ],
    5051        'help-string' => [
     
    9697
    9798    if ($new_depth > $old_depth) {
    98     # child subsection
     99    # first child subsection
    99100    $new_secnum = "$old_secnum.1";
    100101    }
     
    108109    else {
    109110    # back up to parent section => lop off tail
    110 ### print STDERR "**### back up to parent section, $old_secnum = $old_secnum\n";
    111 
    112     $new_secnum = $old_secnum;
    113     $new_secnum =~ s/\.\d+$//;
     111#   $new_secnum = $old_secnum;
     112#   $new_secnum =~ s/\.\d+$//;
    114113    }
    115114
     
    125124    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;
    126125
     126    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;
     127
    127128    my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'};
    128     my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef;
     129    my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'};
     130   
     131    my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'};
     132    my $keep_parent_content  = $parser->{'parameters'}->{'keep_parent_content'};
    129133
    130134    my $mode = $parser->{'parameters'}->{'mode'};
    131135
    132     my $depth = $parser->{'parameters'}->{'curr_section_depth'};
    133 
     136    my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'};
    134137    my $live_depth = scalar(@$contextArray);
    135138
    136 #   print STDERR "closing secdtion tag, mode: $mode, curr sec num = $curr_sec_num, live depth = $live_depth\n";
     139    if ($live_depth < $prev_depth) {
     140        # In a closing-sections popping-off situation:
     141        #   </Section>
     142        # </Section>
     143
     144        # => Back up to parent section => lop off tail
     145
     146        $curr_sec_num =~ s/\.\d+$//;
     147        $parser->{'parameters'}->{'curr_section_depth'} = $live_depth;
     148        $parser->{'parameters'}->{'curr_section_num'}   = $curr_sec_num;
     149    }
     150
    137151
    138152    if ($live_depth == 1) {
    139153        # root section tag, which must always exist
    140 #       print STDERR "*** root, tagname = $tagname, attrHash = $attrHash ",
    141 #         " '_content' = ", join(",",@{$attrHash->{'_content'}}), "\n";
    142154        return [$tagname => $attrHash];
    143155    }
    144     elsif (defined $sec_num_hash->{$curr_sec_num}) {
    145 ###     print STDERR "*** got a match on $curr_sec_num, mode = $mode\n";
    146 
    147         if ($mode eq "extract") {
    148         # keep it
    149         return [$tagname => $attrHash];
    150         }
    151         else {
    152         # remove
    153         return undef;
    154         }
    155     }
    156     else {
    157         # not in our list
    158         if ($mode eq "extract") {
    159         # remove
    160         return undef;
     156    elsif ($mode eq "delete") {
     157        if (defined $sec_num_hash->{$curr_sec_num}) {
     158        # remove it
     159        return undef
    161160        }
    162161        else {
     
    165164        }
    166165    }
     166    else {
     167        # mode is extract
     168
     169        if (defined $sec_num_hash->{$curr_sec_num}) {
     170        # keep it
     171##      print STDERR "**** Asked to keep: sec num = $curr_sec_num\n";
     172
     173        return [$tagname => $attrHash];
     174        }
     175        elsif (defined $parent_sec_num_hash->{$curr_sec_num}) {
     176        # want this element, but cut down to just the child <Section>
     177       
     178        my $section_child = undef;
     179
     180##      print STDERR "**** Parent match: sec num = $curr_sec_num\n";
     181
     182        my $filtered_elems = [];
     183       
     184        foreach my $elem ( @{$attrHash->{'_content'}}) {
     185            if (ref $elem eq "ARRAY") {
     186            my $child_tagname = $elem->[0];
     187##          print STDERR "***## elem name $child_tagname\n";
     188
     189           
     190            if ($child_tagname eq "Description") {
     191                if ($keep_parent_metadata) {
     192                push(@$filtered_elems,$elem);
     193                }
     194            }
     195            elsif ($child_tagname eq "Content") {
     196                if ($keep_parent_content) {
     197                push(@$filtered_elems,$elem);
     198                }
     199            }
     200            else {
     201                push(@$filtered_elems,$elem);
     202            }
     203            }
     204            else {
     205            push(@$filtered_elems,$elem);
     206            }
     207        }
     208
     209        $attrHash->{'_content'} = $filtered_elems;
     210
     211        return [$tagname => $attrHash];
     212
     213        }
     214        else {
     215        # not in our list => remove it
     216        return undef;
     217        }
     218    }
    167219}
    168220
     
    171223{
    172224    my $self = shift @_;
    173     my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_;
     225    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,
     226        $sec_num_hash, $parent_sec_num_hash, $mode) = @_;
    174227   
    175228    my @start_rules = ('Section' => \&dxml_start_section);
     
    219272            binmode($MOUT,":utf8");
    220273
    221             my $options = {sec_num_hash => $sec_num_hash, mode => $mode };
     274            my $options = { sec_num_hash         => $sec_num_hash,
     275                    parent_sec_num_hash  => $parent_sec_num_hash,
     276                    keep_parent_metadata => $self->{'keep-parent-metadata'},
     277                    keep_parent_content  => $self->{'keep-parent-content'},
     278                        mode => $mode };
    222279
    223280            $parser->filter($xml_in, $MOUT, $options);
     
    231288{
    232289    my $self = shift @_;
    233 
    234     my ($json_sections_array,$mode) = @_;
     290   
     291    my ($json_sections_array) = @_;
    235292
    236293    my $sec_num_hash = {};
     
    241298    # Need to do the same here, so things can be matched up
    242299    $sec_num_hash->{".$sn"} = 1;
    243 
    244 ### print STDERR "** storeing .$sn\n";
    245 
    246     if ($mode eq "with-parents") {
    247         my $sn_copy = $sn; # needs to be a copy, otherwise chaning version stored in json_sections
    248         while ($sn_copy =~ s/\.\d+$//) {
    249         $sec_num_hash->{".$sn_copy"} = 1; # See '.' comment above
    250 
    251 ####        print STDERR "** ***** parent storeing .$sn_copy\n";
    252         }
    253     }
    254300    }
    255301
     
    258304
    259305
     306sub parent_sections_as_hash
     307{
     308    my $self = shift @_;
     309
     310    my ($json_sections_array) = @_;
     311
     312    my $sec_num_hash = {};
     313
     314    foreach my $sn ( @$json_sections_array ) {
     315
     316    # needs to make a copy, otherwise version stored in json_sections gets changed
     317    my $sn_copy = $sn;
     318    while ($sn_copy =~ s/\.\d+$//) {
     319        # our XML parser curr_sec_num puts '.' at the root
     320        # Need to do the same here, so things can be matched up
     321
     322        $sec_num_hash->{".$sn_copy"} = 1;
     323    }
     324    }
     325
     326    return $sec_num_hash;
     327}
     328
     329sub parse_flag
     330{
     331        my $self = shift @_;
     332
     333    my ($arg_name) = @_;
     334
     335    my $flag = $self->{$arg_name} || 0;
     336
     337    $flag =~ s/^true/1/i;
     338    $flag =~ s/^false/0/i;
     339
     340    return $flag;
     341}
    260342
    261343sub _extract_archives_doc
     
    278360    my $new_docid = $self->{'newd'} || "HASH" . localtime(time);
    279361
     362    $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata");
     363    $self->{'keep-parent-content'}  = $self->parse_flag("keep-parent-content");
     364
    280365    my $json_sections_str = $self->{'json-sections'};
    281366    my $json_sections_array = decode_json($json_sections_str);
     
    293378#   my $doc_filename = $doc_rec->{'doc-file'}->[0];
    294379
    295     my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents");
    296 
    297     my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $extract_sec_num_hash, "extract");
     380    my $sec_num_hash = $self->sections_as_hash($json_sections_array);
     381    my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array);
     382
     383    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract");
    298384   
    299385    if ($extract_status == 0)
     
    301387        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
    302388
    303         my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $delete_sec_num_hash, "delete");
     389        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete");
    304390
    305391        if ($delete_status == 0) {
Note: See TracChangeset for help on using the changeset viewer.