Changeset 28251

Show
Ignore:
Timestamp:
11.09.2013 14:11:32 (6 years ago)
Author:
davidb
Message:

Change to calculation of curr_sec_num so it correctly pops off values as a series of </Section> tags are encountered

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm

    r28249 r28251  
    4646    "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live 
    4747        'compulsory-args' => [ "d", "json-sections" ], 
    48         'optional-args'   => [ "json-metadata", "newd" ], 
     48        'optional-args'   => [ "json-metadata", "newd",  
     49                   "keep-parent-metadata", "keep-parent-content" ], 
    4950#       'optional-args'   => [ "where" ], 
    5051        'help-string' => [ 
     
    9697 
    9798    if ($new_depth > $old_depth) { 
    98     # child subsection 
     99    # first child subsection 
    99100    $new_secnum = "$old_secnum.1"; 
    100101    } 
     
    108109    else { 
    109110    # back up to parent section => lop off tail 
    110 ### print STDERR "**### back up to parent section, $old_secnum = $old_secnum\n"; 
    111  
    112     $new_secnum = $old_secnum; 
    113     $new_secnum =~ s/\.\d+$//; 
     111#   $new_secnum = $old_secnum; 
     112#   $new_secnum =~ s/\.\d+$//; 
    114113    } 
    115114 
     
    125124    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 
    126125 
     126    my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef; 
     127 
    127128    my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'}; 
    128     my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef; 
     129    my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'}; 
     130     
     131    my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'}; 
     132    my $keep_parent_content  = $parser->{'parameters'}->{'keep_parent_content'}; 
    129133 
    130134    my $mode = $parser->{'parameters'}->{'mode'}; 
    131135 
    132     my $depth = $parser->{'parameters'}->{'curr_section_depth'}; 
    133  
     136    my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'}; 
    134137    my $live_depth = scalar(@$contextArray); 
    135138 
    136 #   print STDERR "closing secdtion tag, mode: $mode, curr sec num = $curr_sec_num, live depth = $live_depth\n"; 
     139    if ($live_depth < $prev_depth) { 
     140        # In a closing-sections popping-off situation: 
     141        #   </Section> 
     142        # </Section> 
     143 
     144        # => Back up to parent section => lop off tail 
     145 
     146        $curr_sec_num =~ s/\.\d+$//; 
     147        $parser->{'parameters'}->{'curr_section_depth'} = $live_depth; 
     148        $parser->{'parameters'}->{'curr_section_num'}   = $curr_sec_num; 
     149    } 
     150 
    137151 
    138152    if ($live_depth == 1) { 
    139153        # root section tag, which must always exist 
    140 #       print STDERR "*** root, tagname = $tagname, attrHash = $attrHash ",  
    141 #         " '_content' = ", join(",",@{$attrHash->{'_content'}}), "\n"; 
    142154        return [$tagname => $attrHash]; 
    143155    } 
    144     elsif (defined $sec_num_hash->{$curr_sec_num}) { 
    145 ###     print STDERR "*** got a match on $curr_sec_num, mode = $mode\n"; 
    146  
    147         if ($mode eq "extract") { 
    148         # keep it 
    149         return [$tagname => $attrHash]; 
    150         } 
    151         else { 
    152         # remove 
    153         return undef; 
    154         } 
    155     } 
    156     else { 
    157         # not in our list 
    158         if ($mode eq "extract") { 
    159         # remove 
    160         return undef; 
     156    elsif ($mode eq "delete") { 
     157        if (defined $sec_num_hash->{$curr_sec_num}) { 
     158        # remove it 
     159        return undef 
    161160        } 
    162161        else { 
     
    165164        } 
    166165    } 
     166    else { 
     167        # mode is extract 
     168 
     169        if (defined $sec_num_hash->{$curr_sec_num}) { 
     170        # keep it 
     171##      print STDERR "**** Asked to keep: sec num = $curr_sec_num\n"; 
     172 
     173        return [$tagname => $attrHash]; 
     174        } 
     175        elsif (defined $parent_sec_num_hash->{$curr_sec_num}) { 
     176        # want this element, but cut down to just the child <Section> 
     177         
     178        my $section_child = undef; 
     179 
     180##      print STDERR "**** Parent match: sec num = $curr_sec_num\n"; 
     181 
     182        my $filtered_elems = []; 
     183         
     184        foreach my $elem ( @{$attrHash->{'_content'}}) { 
     185            if (ref $elem eq "ARRAY") { 
     186            my $child_tagname = $elem->[0]; 
     187##          print STDERR "***## elem name $child_tagname\n"; 
     188 
     189             
     190            if ($child_tagname eq "Description") { 
     191                if ($keep_parent_metadata) { 
     192                push(@$filtered_elems,$elem); 
     193                } 
     194            } 
     195            elsif ($child_tagname eq "Content") { 
     196                if ($keep_parent_content) { 
     197                push(@$filtered_elems,$elem); 
     198                } 
     199            } 
     200            else { 
     201                push(@$filtered_elems,$elem); 
     202            } 
     203            } 
     204            else { 
     205            push(@$filtered_elems,$elem); 
     206            } 
     207        } 
     208 
     209        $attrHash->{'_content'} = $filtered_elems; 
     210 
     211        return [$tagname => $attrHash]; 
     212 
     213        } 
     214        else { 
     215        # not in our list => remove it 
     216        return undef; 
     217        } 
     218    } 
    167219} 
    168220 
     
    171223{ 
    172224    my $self = shift @_; 
    173     my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_; 
     225    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename,  
     226        $sec_num_hash, $parent_sec_num_hash, $mode) = @_; 
    174227     
    175228    my @start_rules = ('Section' => \&dxml_start_section); 
     
    219272            binmode($MOUT,":utf8"); 
    220273 
    221             my $options = {sec_num_hash => $sec_num_hash, mode => $mode }; 
     274            my $options = { sec_num_hash         => $sec_num_hash,  
     275                    parent_sec_num_hash  => $parent_sec_num_hash, 
     276                    keep_parent_metadata => $self->{'keep-parent-metadata'}, 
     277                    keep_parent_content  => $self->{'keep-parent-content'}, 
     278                        mode => $mode }; 
    222279 
    223280            $parser->filter($xml_in, $MOUT, $options); 
     
    231288{ 
    232289    my $self = shift @_; 
    233  
    234     my ($json_sections_array,$mode) = @_; 
     290     
     291    my ($json_sections_array) = @_; 
    235292 
    236293    my $sec_num_hash = {}; 
     
    241298    # Need to do the same here, so things can be matched up 
    242299    $sec_num_hash->{".$sn"} = 1;  
    243  
    244 ### print STDERR "** storeing .$sn\n"; 
    245  
    246     if ($mode eq "with-parents") { 
    247         my $sn_copy = $sn; # needs to be a copy, otherwise chaning version stored in json_sections 
    248         while ($sn_copy =~ s/\.\d+$//) { 
    249         $sec_num_hash->{".$sn_copy"} = 1; # See '.' comment above 
    250  
    251 ####        print STDERR "** ***** parent storeing .$sn_copy\n"; 
    252         } 
    253     } 
    254300    } 
    255301 
     
    258304 
    259305 
     306sub parent_sections_as_hash 
     307{ 
     308    my $self = shift @_; 
     309 
     310    my ($json_sections_array) = @_; 
     311 
     312    my $sec_num_hash = {}; 
     313 
     314    foreach my $sn ( @$json_sections_array ) { 
     315 
     316    # needs to make a copy, otherwise version stored in json_sections gets changed 
     317    my $sn_copy = $sn;  
     318    while ($sn_copy =~ s/\.\d+$//) { 
     319        # our XML parser curr_sec_num puts '.' at the root 
     320        # Need to do the same here, so things can be matched up 
     321 
     322        $sec_num_hash->{".$sn_copy"} = 1;  
     323    } 
     324    } 
     325 
     326    return $sec_num_hash; 
     327} 
     328 
     329sub parse_flag 
     330{ 
     331        my $self = shift @_; 
     332 
     333    my ($arg_name) = @_; 
     334 
     335    my $flag = $self->{$arg_name} || 0; 
     336 
     337    $flag =~ s/^true/1/i; 
     338    $flag =~ s/^false/0/i; 
     339 
     340    return $flag; 
     341} 
    260342 
    261343sub _extract_archives_doc 
     
    278360    my $new_docid = $self->{'newd'} || "HASH" . localtime(time); 
    279361 
     362    $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata"); 
     363    $self->{'keep-parent-content'}  = $self->parse_flag("keep-parent-content"); 
     364 
    280365    my $json_sections_str = $self->{'json-sections'}; 
    281366    my $json_sections_array = decode_json($json_sections_str); 
     
    293378#   my $doc_filename = $doc_rec->{'doc-file'}->[0];  
    294379 
    295     my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents"); 
    296  
    297     my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $extract_sec_num_hash, "extract"); 
     380    my $sec_num_hash = $self->sections_as_hash($json_sections_array); 
     381    my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array); 
     382 
     383    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract"); 
    298384     
    299385    if ($extract_status == 0)  
     
    301387        my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents"); 
    302388 
    303         my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $delete_sec_num_hash, "delete"); 
     389        my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete"); 
    304390 
    305391        if ($delete_status == 0) {