Context Navigation

← Previous Changeset
Next Changeset →

Changeset 28249

Timestamp:

2013-09-10T15:05:36+12:00 (11 years ago)

Author:

davidb

Message:

A very rough cut at dividing a document into two parts

File:

: 1 edited

main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm

-              r28245
+              r28249
     "extract-archives-doc" => { # where param can be ONE of: index (default), import, archives, live
         'compulsory-args' => [ "d", "json-sections" ],
         'optional-args'   => [ "json-metadata" ],
+        'optional-args'   => [ "json-metadata", "newd" ],
 #       'optional-args'   => [ "where" ],
         'help-string' => [
 …
# XML::Rules start-tag handler for <Section> elements.
#
# Keeps a running dotted section number in $parser->{'parameters'}:
#   'curr_section_num'   - number of the section just opened ("" for the root
#                          section, then ".1", ".1.1", ".2", ...)
#   'curr_section_depth' - length of the context array at which it was opened
#
# Arguments are the standard XML::Rules handler arguments; only
# $contextArray (the enclosing tags) and $parser are used.
#
# Always returns 1: XML::Rules skips the whole branch when a start rule
# returns a false value, and the root section number "" is false.
sub dxml_start_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $params    = $parser->{'parameters'};
    my $new_depth = scalar(@$contextArray);

    if ($new_depth == 1) {
        # Root section: (re)start the numbering
        $params->{'curr_section_depth'} = 1;
        $params->{'curr_section_num'}   = "";
    }

    my $old_depth  = $params->{'curr_section_depth'};
    my $old_secnum = $params->{'curr_section_num'};
    $old_depth  = 1  unless defined $old_depth;
    $old_secnum = "" unless defined $old_secnum;

    my $new_secnum;
    if ($new_depth > $old_depth) {
        # First child subsection of the previously opened section
        $new_secnum = "$old_secnum.1";
    }
    else {
        $new_secnum = $old_secnum;

        # Opening at a shallower depth: climb back up by dropping one
        # trailing component per level (no-op for a plain sibling)
        for (my $d = $old_depth; $d > $new_depth; $d--) {
            $new_secnum =~ s/\.\d+$//;
        }

        # The new section is the next sibling at this level, so bump the
        # last component.  The root number "" has no component and stays "".
        $new_secnum =~ s/\.(\d+)$/"." . ($1 + 1)/e;
    }

    $params->{'curr_section_depth'} = $new_depth;
    $params->{'curr_section_num'}   = $new_secnum;

    return 1;
}
# XML::Rules end-tag handler for <Section> elements: decides whether the
# section that has just closed is kept in the filtered output.
#
# Reads from $parser->{'parameters'}:
#   'sec_num_hash'       - hash ref keyed by dotted section number (".1.2")
#   'mode'               - "extract" keeps only listed sections; any other
#                          value keeps only unlisted ones
#   'curr_section_num' / 'curr_section_depth'
#                        - number and depth of the most recently *opened*
#                          section, as recorded by dxml_start_section
#
# Returns [$tagname => $attrHash] to keep the section, undef to drop it.
# The root section (context depth 1) is always kept.
sub dxml_section
{
    my ($tagname, $attrHash, $contextArray, $parentDataArray, $parser) = @_;

    my $params       = $parser->{'parameters'};
    my $sec_num_hash = $params->{'sec_num_hash'};
    my $curr_sec_num = $params->{'curr_section_num'};
    my $mode         = $params->{'mode'};
    my $depth        = $params->{'curr_section_depth'};

    my $live_depth = scalar(@$contextArray);

    if ($live_depth == 1) {
        # root section tag, which must always exist
        return [$tagname => $attrHash];
    }

    # The recorded number belongs to the last section that was opened.  When
    # the section now closing has subsections, that is one of its
    # descendants, so strip one trailing component per level of difference
    # to recover the number of the section actually being closed.
    if (defined $curr_sec_num && defined $depth) {
        for (my $d = $depth; $d > $live_depth; $d--) {
            $curr_sec_num =~ s/\.\d+$//;
        }
    }

    my $listed     = (defined $curr_sec_num && defined $sec_num_hash->{$curr_sec_num}) ? 1 : 0;
    my $extracting = (defined $mode && $mode eq "extract") ? 1 : 0;

    # extract: keep listed, drop the rest; otherwise: drop listed, keep the rest
    if ($listed == $extracting) {
        return [$tagname => $attrHash];
    }

    # Deliberately 'return undef' rather than a bare return, so the rule
    # yields the same value as before regardless of calling context.
    return undef;
}
# Filter an archives doc.xml through the Section rules and write the result.
#
#   $gsdl_cgi            - object used for error reporting (generate_error)
#   $doc_xml_filename    - XML file to read
#   $newdoc_xml_filename - file to write; may be the same file as the input,
#                          because the input is read completely before the
#                          output is opened
#   $sec_num_hash        - hash ref of dotted section numbers (see dxml_section)
#   $mode                - "extract" to keep the listed sections, anything
#                          else to remove them
#
# Returns 0 on success, 1 if the input could not be read or the output could
# not be written (an error is also reported through $gsdl_cgi).
#
# NOTE(review): when input and output are the same file and the parser dies
# part-way through, the file is left truncated - consider writing to a
# temporary file and renaming.
sub remove_from_doc_xml
{
    my $self = shift @_;
    my ($gsdl_cgi, $doc_xml_filename, $newdoc_xml_filename, $sec_num_hash, $mode) = @_;

    # Read the whole document in first.  Three-argument open with a lexical
    # handle, so the filename can never be interpreted as an open mode.
    my $min;
    if (!open($min, '<', $doc_xml_filename))
    {
        $gsdl_cgi->generate_error("Unable to read in $doc_xml_filename: $!");
        return 1;
    }
    my $xml_in = do { local $/; <$min> };
    $xml_in = "" unless defined $xml_in;
    close($min);

    my @start_rules = ('Section' => \&dxml_start_section);

    # Set the call-back functions for the metadata tags
    my @rules =
    (
        _default => 'raw',
        'Section' => \&dxml_section
    );

    my $parser = XML::Rules->new
    (
        start_rules => \@start_rules,
        rules => \@rules,
        style => 'filter',
        output_encoding => 'utf8',
##      normalisespaces => 1, # http://search.cpan.org/~jenda/XML-Rules-1.16/lib/XML/Rules.pm
#       stripspaces => 2|0|0 # ineffectual
    );

    my $mout;
    if (!open($mout, '>', $newdoc_xml_filename))
    {
        $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
        return 1;
    }
    binmode($mout, ":utf8");

    # Filter with the call-back functions
    my $options = { sec_num_hash => $sec_num_hash, mode => $mode };
    $parser->filter($xml_in, $mout, $options);

    # Buffered write errors only surface when the handle is closed
    if (!close($mout))
    {
        $gsdl_cgi->generate_error("Unable to write out to $newdoc_xml_filename: $!");
        return 1;
    }

    return 0;
}
# Turn a list of dotted section numbers into a lookup hash.
#
#   $json_sections_array - array ref of section numbers such as "1.2.3"
#   $mode                - "with-parents" also adds every ancestor section
#                          ("1.2" and "1" for "1.2.3"); any other value
#                          stores only the listed numbers
#
# Returns a hash ref whose keys are the section numbers with a leading '.',
# each mapped to 1.  The input array is never modified.  Anything that is not
# an array ref yields an empty hash; undefined entries are ignored.
sub sections_as_hash
{
    my $self = shift @_;
    my ($json_sections_array,$mode) = @_;

    my $sec_num_hash = {};
    return $sec_num_hash unless (ref($json_sections_array) eq 'ARRAY');

    my $with_parents = (defined $mode && $mode eq "with-parents");

    foreach my $sn ( @$json_sections_array ) {
        next unless defined $sn;

        # our XML parser curr_sec_num puts '.' at the root
        # Need to do the same here, so things can be matched up
        $sec_num_hash->{".$sn"} = 1;

        next unless $with_parents;

        # Work on a copy: $sn aliases the caller's array element
        my $parent = $sn;
        while ($parent =~ s/\.\d+$//) {
            $sec_num_hash->{".$parent"} = 1; # See '.' comment above
        }
    }

    return $sec_num_hash;
}
# Split an archives document in two, driven by the CGI arguments held in $self:
#   'd'             - id of the document to split
#   'json-sections' - JSON array of dotted section numbers to move out
#   'newd'          - id for the new document (optional)
#
# Pass 1 ("extract") writes a copy of the document holding only the listed
# sections plus their parent sections.  Pass 2 ("delete") rewrites the
# original document in place without the listed sections, and only runs if
# pass 1 succeeded.  The outcome is reported through $gsdl_cgi.
sub _extract_archives_doc
{
    my $self = shift @_;

    my $collect    = $self->{'collect'};
    my $gsdl_cgi   = $self->{'gsdl_cgi'};
    my $infodbtype = $self->{'infodbtype'};
    my $site       = $self->{'site'};

    # Obtain the collect and archive dir
    my $collect_dir = $gsdl_cgi->get_collection_dir($site);
    my $archive_dir = &util::filename_cat($collect_dir,$collect,"archives");

    # look up additional args
    my $docid = $self->{'d'};

    # TODO: $new_docid is computed but not used yet.  Note that the fallback
    # concatenates localtime() in scalar context, i.e. a human-readable date
    # string containing spaces.
    my $new_docid = $self->{'newd'} || "HASH" . localtime(time);

    # The section list comes from the request, so a parse failure is reported
    # as an error instead of being left to kill the action.
    my $json_sections_str = $self->{'json-sections'};
    my $json_sections_array;
    my $parsed_ok = eval { $json_sections_array = decode_json($json_sections_str); 1 };
    if (!$parsed_ok || ref($json_sections_array) ne 'ARRAY') {
        $gsdl_cgi->generate_error("Unable to parse json-sections as a JSON array for key: $docid\n");
        return;
    }

    my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archive_dir);
    my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $docid);

    my $doc_file;
    if (ref($doc_rec) eq 'HASH' && ref($doc_rec->{'doc-file'}) eq 'ARRAY') {
        $doc_file = $doc_rec->{'doc-file'}->[0];
    }
    if (!defined $doc_file) {
        $gsdl_cgi->generate_error("No doc-file entry found in archives for key: $docid\n");
        return;
    }

    my $doc_filename = &util::filename_cat($archive_dir, $doc_file);

    # TODO: the extracted document always goes to a fixed scratch file
    my $newdoc_filename = &util::filename_cat($archive_dir, "test.xml");

#   # This now stores the full pathname
#   my $doc_filename = $doc_rec->{'doc-file'}->[0];

    # Pass 1: copy out the requested sections (parents are needed so the
    # extracted sections keep their place in the section hierarchy)
    my $extract_sec_num_hash = $self->sections_as_hash($json_sections_array,"with-parents");
    my $extract_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $newdoc_filename, $extract_sec_num_hash, "extract");

    if ($extract_status != 0)
    {
        my $mess = "Failed to extract identified section numbers for key: $docid\n";
        $mess .= "Exit status: $extract_status\n";
        $mess .= "System Error Message: $!\n";
        $mess .= "-" x 20 . "\n";

        $gsdl_cgi->generate_error($mess);
        return;
    }

    # Pass 2: remove those same sections from the original, in place
    my $delete_sec_num_hash = $self->sections_as_hash($json_sections_array,"no-parents");
    my $delete_status = $self->remove_from_doc_xml($gsdl_cgi, $doc_filename, $doc_filename, $delete_sec_num_hash, "delete");

    if ($delete_status == 0) {
        my $mess = "document-extract successful: Key[$docid]\n";
        $gsdl_cgi->generate_ok_message($mess);
    }
    else {
        my $mess = "Failed to remove identified section numbers for key: $docid\n";
        $mess .= "Exit status: $delete_status\n";
        $mess .= "System Error Message: $!\n";
        $mess .= "-" x 20 . "\n";

        $gsdl_cgi->generate_error($mess);
    }

    #return $status; # in case calling functions have a use for this
}
 # JSON version that will get the requested metadata values
 # from the requested source (index, import, archives or live)
 …
 # CGI entry point for the "extract-archives-doc" action.
 #
 # Authenticates the user when $baseaction::authentication_enabled is set,
 # locks the collection, runs _extract_archives_doc with any remaining
 # arguments, and unlocks the collection again.  The collection is unlocked
 # even if the extraction dies; the error is then re-thrown.
 sub extract_archives_doc
 {
     my $self = shift @_;

     my $username = $self->{'username'};
     my $collect  = $self->{'collect'};

     if ($baseaction::authentication_enabled)
     {
         # Ensure the user is allowed to edit this collection
         $self->authenticate_user($username, $collect);
     }

     # Make sure the collection isn't locked by someone else
     $self->lock_collection($username, $collect);

     my $finished = eval { $self->_extract_archives_doc(@_); 1 };
     my $error = $@;

     # Release the lock once it is done, whether or not it succeeded
     my $unlock_result = $self->unlock_collection($username, $collect);

     die $error unless $finished;

     return $unlock_result;
 }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 28249

Legend:

main/trunk/greenstone2/perllib/cgiactions/docextractaction.pm

Download in other formats: