- Timestamp:
- 2013-09-10T15:05:36+12:00 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm
r28245 r28249 46 46 "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live 47 47 'compulsory-args' => [ "d", "json-sections" ], 48 'optional-args' => [ "json-metadata" ],48 'optional-args' => [ "json-metadata", "newd" ], 49 49 # 'optional-args' => [ "where" ], 50 50 'help-string' => [ … … 78 78 79 79 80 81 sub dxml_start_section 82 { 83 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 84 85 my $new_depth = scalar(@$contextArray); 86 87 if ($new_depth == 1) { 88 $parser->{'parameters'}->{'curr_section_depth'} = 1; 89 $parser->{'parameters'}->{'curr_section_num'} = ""; 90 } 91 92 my $old_depth = $parser->{'parameters'}->{'curr_section_depth'}; 93 my $old_secnum = $parser->{'parameters'}->{'curr_section_num'}; 94 95 my $new_secnum; 96 97 if ($new_depth > $old_depth) { 98 # child subsection 99 $new_secnum = "$old_secnum.1"; 100 } 101 elsif ($new_depth == $old_depth) { 102 # sibling section => increase it's value by 1 103 my ($tail_num) = ($old_secnum =~ m/\.(\d+)$/); 104 $tail_num++; 105 $new_secnum = $old_secnum; 106 $new_secnum =~ s/\.(\d+)$/\.$tail_num/; 107 } 108 else { 109 # back up to parent section => lopp off tail 110 ### print STDERR "**### back up to parent section, $old_secnum = $old_secnum\n"; 111 112 $new_secnum = $old_secnum; 113 $new_secnum =~ s/\.\d+$//; 114 } 115 116 $parser->{'parameters'}->{'curr_section_depth'} = $new_depth; 117 $parser->{'parameters'}->{'curr_section_num'} = $new_secnum; 118 119 1; 120 } 121 122 123 sub dxml_section 124 { 125 my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_; 126 127 my $sec_num_hash = $parser->{'parameters'}->{'sec_num_hash'}; 128 my $curr_sec_num = $parser->{'parameters'}->{'curr_section_num'} || undef; 129 130 my $mode = $parser->{'parameters'}->{'mode'}; 131 132 my $depth = $parser->{'parameters'}->{'curr_section_depth'}; 133 134 my $live_depth = scalar(@$contextArray); 135 136 # print STDERR "closing secdtion tag, mode: $mode, curr sec num = $curr_sec_num, live depth = $live_depth\n"; 137 138 if ($live_depth == 1) { 139 # root sectin tag, which must always exist 140 # print STDERR "*** root, tagname = $tagname, attrHash = $attrHash ", 141 # " '_content' = ", join(",",@{$attrHash->{'_content'}}), "\n"; 142 return [$tagname => $attrHash]; 143 } 144 elsif (defined $sec_num_hash->{$curr_sec_num}) { 145 ### print STDERR "*** got a match on $curr_sec_num, mode = $mode\n"; 146 147 if ($mode eq "extract") { 148 # keep it 149 return [$tagname => $attrHash]; 150 } 151 else { 152 # remove 153 return undef; 154 } 155 } 156 else { 157 # not in our list 158 if ($mode eq "extract") { 159 # remove 160 return undef; 161 } 162 else { 163 # keep it 164 return [$tagname => $attrHash]; 165 } 166 } 167 } 168 169 170 sub remove_from_doc_xml 171 { 172 my $self = shift @_; 173 my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_; 174 175 my @start_rules = ('Section' => \&dxml_start_section); 176 177 # Set the call-back functions for the metadata tags 178 my @rules = 179 ( 180 _default => 'raw', 181 'Section' => \&dxml_section 182 ); 183 184 my $parser = XML::Rules->new 185 ( 186 start_rules => \@start_rules, 187 rules => \@rules, 188 style => 'filter', 189 output_encoding => 'utf8', 190 ## normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm 191 # stripspaces => 2|0|0 # ineffectual 192 ); 193 194 my $status = 0; 195 my $xml_in = ""; 196 if (!open(MIN,"<$doc_xml_filename")) 197 { 198 $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!"); 199 $status = 1; 200 } 201 else 202 { 203 # Read them in 204 my $line; 205 while (defined ($line=<MIN>)) { 206 $xml_in .= $line; 207 } 208 close(MIN); 209 210 # Filter with the call-back functions 211 my $xml_out = ""; 212 213 my $MOUT; 214 if (!open($MOUT,">$newdoc_xml_filename")) { 215 $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!"); 216 $status = 1; 217 } 218 else { 219 binmode($MOUT,":utf8"); 220 221 my $options = {sec_num_hash => $sec_num_hash, mode => $mode }; 222 223 $parser->filter($xml_in, $MOUT, $options); 224 close($MOUT); 225 } 226 } 227 return $status; 228 } 229 230 sub sections_as_hash 231 { 232 my $self = shift @_; 233 234 my ($json_sections_array,$mode) = @_; 235 236 my $sec_num_hash = {}; 237 238 foreach my $sn ( @$json_sections_array ) { 239 240 # our XML parser curr_sec_num puts '.' at the root 241 # Need to do the same here, so things can be matched up 242 $sec_num_hash->{".$sn"} = 1; 243 244 ### print STDERR "** storeing .$sn\n"; 245 246 if ($mode eq "with-parents") { 247 my $sn_copy = $sn; # needs to be a copy, otherwise chaning version stored in json_sections 248 while ($sn_copy =~ s/\.\d+$//) { 249 $sec_num_hash->{".$sn_copy"} = 1; # See '.' comment above 250 251 #### print STDERR "** ***** parent storeing .$sn_copy\n"; 252 } 253 } 254 } 255 256 return $sec_num_hash; 257 } 258 259 260 261 sub _extract_archives_doc 262 { 263 my $self = shift @_; 264 265 my $collect = $self->{'collect'}; 266 my $gsdl_cgi = $self->{'gsdl_cgi'}; 267 my $infodbtype = $self->{'infodbtype'}; 268 269 my $site = $self->{'site'}; 270 271 # Obtain the collect and archive dir 272 my $collect_dir = $gsdl_cgi->get_collection_dir($site); 273 274 my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives"); 275 276 # look up additional args 277 my $docid = $self->{'d'}; 278 my $new_docid = $self->{'newd'} || "HASH" . localtime(time); 279 280 my $json_sections_str = $self->{'json-sections'}; 281 my $json_sections_array = decode_json($json_sections_str); 282 283 284 my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir); 285 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid); 286 287 my $doc_file = $doc_rec->{'doc-file'}->[0]; 288 my $doc_filename = &util::filename_cat($archive_dir, $doc_file); 289 290 my $newdoc_filename = &util::filename_cat($archive_dir, "test.xml"); 291 292 # # This now stores the full pathname 293 # my $doc_filename = $doc_rec->{'doc-file'}->[0]; 294 295 my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents"); 296 297 my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$newdoc_filename, $extract_sec_num_hash, "extract"); 298 299 if ($extract_status == 0) 300 { 301 my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents"); 302 303 my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename ,$doc_filename, $delete_sec_num_hash, "delete"); 304 305 if ($delete_status == 0) { 306 307 my $mess = "document-extract successful: Key[$docid]\n"; 308 309 $gsdl_cgi->generate_ok_message($mess); 310 } 311 else { 312 my $mess .= "Failed to extract identified section numbers for key: $docid\n"; 313 $mess .= "Exit status: $delete_status\n"; 314 $mess .= "System Error Message: $!\n"; 315 $mess .= "-" x 20 . "\n"; 316 317 $gsdl_cgi->generate_error($mess); 318 } 319 } 320 else 321 { 322 my $mess .= "Failed to remove identified section numbers for key: $docid\n"; 323 $mess .= "Exit status: $extract_status\n"; 324 $mess .= "System Error Message: $!\n"; 325 $mess .= "-" x 20 . "\n"; 326 327 $gsdl_cgi->generate_error($mess); 328 } 329 330 #return $status; # in case calling functions have a use for this 331 } 332 333 334 80 335 # JSON version that will get the requested metadata values 81 336 # from the requested source (index, import, archives or live) … … 84 339 sub extract_archives_doc 85 340 { 86 my $self = shift @_; 87 88 # my $where = $self->{'where'}; 89 # if (!$where) { 90 # $where = "index"; # default behaviour is to get the values from index 91 # } 92 93 # Only when setting metadata do we perform authentication and do we lock the collection, 94 # not when getting metadata 95 96 # for get_meta_array, the where param can only be ONE of import, archives, index, live 97 # if($where =~ m/index/) { 98 # $self->_get_index_metadata_array(@_); 99 # } 100 # elsif($where =~ m/archives/) { 101 # $self->_get_archives_metadata_array(@_); 102 # } 103 # elsif($where =~ m/import/) { 104 # $self->_get_import_metadata_array(@_); 105 # } 106 # elsif($where =~ m/live/) { 107 # $self->_get_live_metadata_array(@_); 108 # } 341 my $self = shift @_; 342 343 my $username = $self->{'username'}; 344 my $collect = $self->{'collect'}; 345 my $gsdl_cgi = $self->{'gsdl_cgi'}; 346 347 if ($baseaction::authentication_enabled) 348 { 349 # Ensure the user is allowed to edit this collection 350 $self->authenticate_user($username, $collect); 351 } 352 353 # Make sure the collection isn't locked by someone else 354 $self->lock_collection($username, $collect); 355 356 $self->_extract_archives_doc(@_); 357 358 # Release the lock once it is done 359 $self->unlock_collection($username, $collect); 360 361 109 362 } 110 363
Note:
See TracChangeset
for help on using the changeset viewer.