- Timestamp:
- 2013-09-11T14:11:32+12:00 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm
r28249 r28251 46 46 "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live 47 47 'compulsory-args' => [ "d", "json-sections" ], 48 'optional-args' => [ "json-metadata", "newd" ], 48 'optional-args' => [ "json-metadata", "newd", 49 "keep-parent-metadata", "keep-parent-content" ], 49 50 # 'optional-args' => [ "where" ], 50 51 'help-string' => [ … … 96 97 97 98 if ($new_depth > $old_depth) { 98 # child subsection99 # first child subsection 99 100 $new_secnum = "$old_secnum.1"; 100 101 } … … 108 109 else { 109 110 # back up to parent section => lopp off tail 110 ### print STDERR "**### back up to parent section, $old_secnum = $old_secnum\n"; 111 112 $new_secnum = $old_secnum; 113 $new_secnum =~ s/\.\d+$//; 111 # $new_secnum = $old_secnum; 112 # $new_secnum =~ s/\.\d+$//; 114 113 } 115 114 … … 125 124 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 126 125 126 my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef; 127 127 128 my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'}; 128 my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef; 129 my $parent_sec_num_hash = $parser->{'parameters'}->{'parent_sec_num_hash'}; 130 131 my $keep_parent_metadata = $parser->{'parameters'}->{'keep_parent_metadata'}; 132 my $keep_parent_content = $parser->{'parameters'}->{'keep_parent_content'}; 129 133 130 134 my $mode = $parser->{'parameters'}->{'mode'}; 131 135 132 my $depth = $parser->{'parameters'}->{'curr_section_depth'}; 133 136 my $prev_depth = $parser->{'parameters'}->{'curr_section_depth'}; 134 137 my $live_depth = scalar(@$contextArray); 135 138 136 # print STDERR "closing secdtion tag, mode: $mode, curr sec num = $curr_sec_num, live depth = $live_depth\n"; 139 if ($live_depth < $prev_depth) { 140 # In a closing-sections poping off situation: 141 # </Section> 142 # </Section> 143 144 # => Back up to parent section => lopp off tail 145 146 $curr_sec_num =~ s/\.\d+$//; 147 $parser->{'parameters'}->{'curr_section_depth'} = $live_depth; 148 $parser->{'parameters'}->{'curr_section_num'} = $curr_sec_num; 149 } 150 137 151 138 152 if ($live_depth == 1) { 139 153 # root sectin tag, which must always exist 140 # print STDERR "*** root, tagname = $tagname, attrHash = $attrHash ",141 # " '_content' = ", join(",",@{$attrHash->{'_content'}}), "\n";142 154 return [$tagname => $attrHash]; 143 155 } 144 elsif (defined $sec_num_hash->{$curr_sec_num}) { 145 ### print STDERR "*** got a match on $curr_sec_num, mode = $mode\n"; 146 147 if ($mode eq "extract") { 148 # keep it 149 return [$tagname => $attrHash]; 150 } 151 else { 152 # remove 153 return undef; 154 } 155 } 156 else { 157 # not in our list 158 if ($mode eq "extract") { 159 # remove 160 return undef; 156 elsif ($mode eq "delete") { 157 if (defined $sec_num_hash->{$curr_sec_num}) { 158 # remove it 159 return undef 161 160 } 162 161 else { … … 165 164 } 166 165 } 166 else { 167 # mode is extract 168 169 if (defined $sec_num_hash->{$curr_sec_num}) { 170 # keep it 171 ## print STDERR "**** Asked to keep: sec num = $curr_sec_num\n"; 172 173 return [$tagname => $attrHash]; 174 } 175 elsif (defined $parent_sec_num_hash->{$curr_sec_num}) { 176 # want this element, but cut down to just the child <Section> 177 178 my $section_child = undef; 179 180 ## print STDERR "**** Parent match: sec num = $curr_sec_num\n"; 181 182 my $filtered_elems = []; 183 184 foreach my $elem ( @{$attrHash->{'_content'}}) { 185 if (ref $elem eq "ARRAY") { 186 my $child_tagname = $elem->[0]; 187 ## print STDERR "***## elem name $child_tagname\n"; 188 189 190 if ($child_tagname eq "Description") { 191 if ($keep_parent_metadata) { 192 push(@$filtered_elems,$elem); 193 } 194 } 195 elsif ($child_tagname eq "Content") { 196 if ($keep_parent_content) { 197 push(@$filtered_elems,$elem); 198 } 199 } 200 else { 201 push(@$filtered_elems,$elem); 202 } 203 } 204 else { 205 push(@$filtered_elems,$elem); 206 } 207 } 208 209 $attrHash->{'_content'} = $filtered_elems; 210 211 return [$tagname => $attrHash]; 212 213 } 214 else { 215 # not in our list => remove it 216 return undef; 217 } 218 } 167 219 } 168 220 … … 171 223 { 172 224 my $self = shift @_; 173 my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_; 225 my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, 226 $sec_num_hash, $parent_sec_num_hash, $mode) = @_; 174 227 175 228 my @start_rules = ('Section' => \&dxml_start_section); … … 219 272 binmode($MOUT,":utf8"); 220 273 221 my $options = {sec_num_hash => $sec_num_hash, mode => $mode }; 274 my $options = { sec_num_hash => $sec_num_hash, 275 parent_sec_num_hash => $parent_sec_num_hash, 276 keep_parent_metadata => $self->{'keep-parent-metadata'}, 277 keep_parent_content => $self->{'keep-parent-content'}, 278 mode => $mode }; 222 279 223 280 $parser->filter($xml_in, $MOUT, $options); … … 231 288 { 232 289 my $self = shift @_; 233 234 my ($json_sections_array ,$mode) = @_;290 291 my ($json_sections_array) = @_; 235 292 236 293 my $sec_num_hash = {}; … … 241 298 # Need to do the same here, so things can be matched up 242 299 $sec_num_hash->{".$sn"} = 1; 243 244 ### print STDERR "** storeing .$sn\n";245 246 if ($mode eq "with-parents") {247 my $sn_copy = $sn; # needs to be a copy, otherwise chaning version stored in json_sections248 while ($sn_copy =~ s/\.\d+$//) {249 $sec_num_hash->{".$sn_copy"} = 1; # See '.' comment above250 251 #### print STDERR "** ***** parent storeing .$sn_copy\n";252 }253 }254 300 } 255 301 … … 258 304 259 305 306 sub parent_sections_as_hash 307 { 308 my $self = shift @_; 309 310 my ($json_sections_array) = @_; 311 312 my $sec_num_hash = {}; 313 314 foreach my $sn ( @$json_sections_array ) { 315 316 # needs to make a copy, otherwise version stored in json_sections gets changed 317 my $sn_copy = $sn; 318 while ($sn_copy =~ s/\.\d+$//) { 319 # our XML parser curr_sec_num puts '.' at the root 320 # Need to do the same here, so things can be matched up 321 322 $sec_num_hash->{".$sn_copy"} = 1; 323 } 324 } 325 326 return $sec_num_hash; 327 } 328 329 sub parse_flag 330 { 331 my $self = shift @_; 332 333 my ($arg_name) = @_; 334 335 my $flag = $self->{$arg_name} || 0; 336 337 $flag =~ s/^true/1/i; 338 $flag =~ s/^false/0/i; 339 340 return $flag; 341 } 260 342 261 343 sub _extract_archives_doc … … 278 360 my $new_docid = $self->{'newd'} || "HASH" . localtime(time); 279 361 362 $self->{'keep-parent-metadata'} = $self->parse_flag("keep-parent-metadata"); 363 $self->{'keep-parent-content'} = $self->parse_flag("keep-parent-content"); 364 280 365 my $json_sections_str = $self->{'json-sections'}; 281 366 my $json_sections_array = decode_json($json_sections_str); … … 293 378 # my $doc_filename = $doc_rec->{'doc-file'}->[0]; 294 379 295 my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents"); 296 297 my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $extract_sec_num_hash, "extract"); 380 my $sec_num_hash = $self->sections_as_hash($json_sections_array); 381 my $parent_sec_num_hash = $self->parent_sections_as_hash($json_sections_array); 382 383 my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $sec_num_hash, $parent_sec_num_hash, "extract"); 298 384 299 385 if ($extract_status == 0) … … 301 387 my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents"); 302 388 303 my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $ delete_sec_num_hash, "delete");389 my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $sec_num_hash, undef, "delete"); 304 390 305 391 if ($delete_status == 0) {
Note:
See TracChangeset
for help on using the changeset viewer.