Ignore:
Timestamp:
2021-03-22T22:07:10+13:00 (3 years ago)
Author:
davidb
Message:

Rough working version: adds an infodatabase pass that inserts triples corresponding to the nodes created by the classifiers.

Location:
gs2-extensions/apache-jena/trunk/src/perllib
Files:
2 added
2 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuilder.pm

    r28802 r34969  
    6666    my $verbosity = $self->{'verbosity'};
    6767
    68     print $outhandle "\n*** Apache Jena TDB-configured Tripple store\n"  if ($verbosity >= 1);
     68    print $outhandle "\n*** Apache Jena TDB-configured Triple store\n"  if ($verbosity >= 1);
    6969
    7070    my $jenaTDB_dir = &util::filename_cat($build_dir, "jenaTDB");
     
    109109    if (!$self->{'keepold'}) {
    110110    my $collection = $self->{'collection'};
    111     my $cmd = "gs-triplestore-reset $collection";
     111#   my $cmd = "gs-triplestore-reset $collection";
     112    my $cmd = "gs-triplestore-reset3 $collection";
    112113       
    113114    my $status = system($cmd);
     
    130131}
    131132
     # Capability flag: unconditionally returns 1.
     # NOTE(review): presumably queried by the build driver to decide whether
     # make_infodatabase() should be run for this builder -- confirm against caller.
     133sub supports_make_infodatabase {
     134    return 1;
     135}
     136
     137
     138
     # Stub per-classifier-node callback: currently only logs the node's OID to
     # STDERR and writes nothing anywhere else.
     #
     # Parameters (after $self):
     #   $infodb_type, $infodb_handle, $classify_infodb - accepted but not used here
     #   $OID                                            - interpolated into the log line
     # Returns: the result of the last print statement (no explicit return).
     139sub output_classifier {
     140    my $self = shift (@_);
     141   
     142    my ($infodb_type, $infodb_handle, $OID, $classify_infodb) = @_;
     143
            # NOTE(review): $collect is assigned but never read in this sub.
     144    my $collect = $self->{'collection'};
     145   
     146    print STDERR "***** jenaTDBBuilder::output_classifier() info call for $OID\n";
     147    print STDERR "***** \n";
     148
     149    # print STDERR join("\n",keys %$classify_infodb);
     150}
     151
     # Builds and returns an anonymous sub that captures $self and forwards its
     # four arguments ($infodb_type, $infodb_handle, $OID, $classify_infodb) to
     # $self->output_classifier(). This lets a method be handed over where a
     # plain code reference taking those four arguments is expected.
     #
     # Parameters: $self - the builder object to bind into the closure.
     # Returns:    a code reference.
     152sub callback_with_self_closure {
     153    my ($self) = @_;
     154
     155    my $callback = sub {
     156    my ($infodb_type, $infodb_handle, $OID, $classify_infodb) = @_;
     157    $self->output_classifier($infodb_type, $infodb_handle, $OID, $classify_infodb);
     158    };
     159
     160    return $callback;
     161}
     162
     163   
     # Runs the "infodatabase" pass for the jenaTDB builder: resets and initialises
     # the classifiers, puts the buildproc into 'infodb' mode, reads the source
     # documents through the plugin pipeline, then outputs collection metadata,
     # classifier information and a "browselist" entry.
     #
     # Side effects visible in this sub:
     #   - overwrites $self->{'infodbtype'} with "fuseki"
     #   - resets $self->{'buildproc'}->{'mdprefix_fields'} to an empty hash
     #   - prints progress/diagnostics to STDERR and to $self->{'outhandle'}
     #   - emits <Stage> ... </Stage> markers on STDERR when $self->{'gli'} is set
     #
     # Much of the body is commented-out code that opens and closes an infodb
     # file; none of that file handling is active here.
     164sub make_infodatabase {
     165    my $self = shift (@_);
     166    my $outhandle = $self->{'outhandle'};
     167
     168    &classify::reset_next_classify_num();
     169       
     170    print STDERR "BuildDir: $self->{'build_dir'}\n";
     171
     172#    my $textdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
     173#    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
     174#    &FileUtils::makeAllDirectories ($textdir);
     175#    &FileUtils::makeAllDirectories ($assocdir);
     176
     177    ## Get info database file path
            # NOTE(review): no file path is actually computed any more (the lookup
            # below is commented out); instead the infodb type is forced to "fuseki",
            # replacing whatever value $self->{'infodbtype'} held before.
     178    print STDERR "*** jenaTDBBuilder::make_infodatabase() forcing infodbtype to be: fuseki\n";
     179    $self->{'infodbtype'} = "fuseki";
     180    my $infodb_type = $self->{'infodbtype'};
     181   
     182#   my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);
     183
     184    print $outhandle "\n*** creating the jenaTDB classifier triples\n"
     185    if ($self->{'verbosity'} >= 1);
     186    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
     187
     188    # init all the classifiers
     189    &classify::init_classifiers ($self->{'classifiers'});
     190
            # NOTE(review): $reconstructed_docs is only referenced by commented-out
            # code below, and $database_recs is set but never read afterwards.
     191    my $reconstructed_docs = undef;
     192    my $database_recs = undef;
     193
            # Incremental builds are not supported: only a warning is printed and
            # processing continues as for a full build.
     194    if ($self->{'incremental'}) {
     195    print STDERR "!!!!!\n";
     196    print STDERR "! Incremental support for jenaTDBBuilder::make_infodatabase() has not been implemented\n";
     197    print STDERR "!!!!!\n";
     198    $database_recs = {};
     199
     200    ## &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
     201    }
     202
     203   
            # $infodb_handle is declared but never assigned in this sub, so every
            # call below that takes it receives undef.
     204    my ($infodb_handle); # passed in to output_classify_info(), but OK to be null in this case
     205#    if ($self->{'debug'}) {
     206#       $infodb_handle = *STDOUT;
     207#    }
     208#    else {
     209#       $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
     210#       if (!defined($infodb_handle))
     211#       {
     212#           print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
     213#           die "builder::make_infodatabase - couldn't open infodb write handle\n";
     214#       }
     215#    }
     216
     217    # set up the document processor
     218
     219    #$self->{'buildproc'}->set_output_handle ($infodb_handle);
     220    $self->{'buildproc'}->set_mode ('infodb');
     221    #$self->{'buildproc'}->set_assocdir ($assocdir);
     222    #$self->{'buildproc'}->set_dontdb ($self->{'dontdb'});
     223    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
     224    $self->{'buildproc'}->set_indexing_text (0);
     225    #$self->{'buildproc'}->set_store_text(1);
     226
     227    #if ($self->{'incremental'}) {
     228#   # reconstruct doc_obj metadata from database for all docs
     229#   $reconstructed_docs
     230#       = &classify::reconstruct_doc_objs_metadata($infodb_type,
     231#                              $infodb_file_path,
     232#                              $database_recs);
     233#   }
     234   
     235    # make_infodatabase needs full reset even for incremental build
     236    # as incremental works by reconstructing all docs from the database and
     237    # then adding in the new ones
     238    $self->{'buildproc'}->zero_reset();
     239
     240    $self->{'buildproc'}->{'mdprefix_fields'} = {};
     241   
            # Feed the documents under source_dir through the plugins into the
            # buildproc configured above.
     242    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     243           "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
     244
     245#    if ($self->{'incremental'}) {
     246#   # create flat classify structure, ready for new docs to be added
     247#   foreach my $doc_obj ( @$reconstructed_docs ) {
     248#       if (! defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()}) {
     249#       print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
     250#       $self->{'buildproc'}->process($doc_obj,undef);
     251#       }
     252#   }
     253#    }
     254    # this has changed to only output collection meta if its
     255    # not in the config file
     256    # print STDERR "****!!! Consider adding in a jenaTDB triplestore version of output_collection_meta()\n";
     257   
     258    $self->output_collection_meta($infodb_handle);
     259   
            # The active callback is dbutil's writer; the closure around
            # output_classifier() is left as a commented-out alternative.
     260    my $output_callback = \&dbutil::write_infodb_entry;
     261    # my $output_callback = callback_with_self_closure($self);
     262   
     263    # output classification information
     264    &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle,
     265                     $self->{'remove_empty_classifications'},
     266                     $output_callback,
     267                     $self->{'gli'});
     268
     269    # Output classifier reverse lookup, used in incremental deletion
     270    ####&classify::print_reverse_lookup($infodb_handle);
     271
     272    # output doclist
            # The "browselist" record lists every document OID returned by the
            # buildproc, joined with ';', together with the count of those documents.
     273    my @doc_list = $self->{'buildproc'}->get_doc_list();
     274    my $browselist_infodb = { 'hastxt' => [ "0" ],
     275                  'childtype' => [ "VList" ],
     276                  'numleafdocs' => [ scalar(@doc_list) ],
     277                  'thistype' => [ "Invisible" ],
     278                  'contains' => [ join(";", @doc_list) ] };
     279    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);
     280
     281#    &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'};
     282   
     283#    if ($infodb_type eq "gdbm-txtgz") {
     284#   my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
     285#   if (-e $gdb_infodb_file_path) {
     286#       &FileUtils::removeFiles($gdb_infodb_file_path);
     287#   }
     288#    }
     289    print STDERR "</Stage>\n" if $self->{'gli'};
     290}
     291
     292
     293
    1322941;
    133295
  • gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm

    r34691 r34969  
    191191}
    192192
    193 sub textedit {
     193
     194sub xml_to_ttl {
    194195    my $self = shift (@_);
    195     my ($doc_obj) = @_;
     196    my ($section_text,$output_root) = @_;
    196197    my $handle = $self->{'output_handle'};
    197198   
    198     my $doc_oid = $doc_obj->get_OID();
    199 
    200199    my $tmp_dir = $self->{'tmp_dir'};
    201     my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl");
     200    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"$output_root.ttl");
    202201    my $tmp_doc_filename_cc    = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);
    203202
     
    210209    binmode($xml_outhandler,":utf8");
    211210
    212     my $section_text = &docprint::get_section_xml($doc_obj);
    213 
    214 #    $section_text =~ s/[\r\n]+$//s; # remove very last newline char
    215 
    216 ##    $section_text =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
    217 ##    $section_text =~ s/\&#([0-9]+);/chr($1)/eig;
    218 
    219 #    $section_text =~ s/\\/\\\\/g;
    220 
    221 #    $section_text =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
    222 #    $section_text =~ s/\&amp;#([0-9]+);/chr($1)/eig;
    223 
    224 #    $section_text =~ s/(\r|\n)+/&lt;br \/&gt;/gs;
    225 #    $section_text =~ s/[\r\n]+/ /gs;
    226 
    227 ##    $section_text =~ s/\n/ AND /gs;
    228 
    229 #    open(DOUT,">/tmp/debug.xml") || die "Failed to open";
    230 #    print DOUT $section_text;
    231 #    print DOUT "\n";
    232 #    close DOUT;
    233 #    exit -1;
     211    ### my $section_text = &docprint::get_section_xml($doc_obj);
    234212
    235213    $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse;
    236 
    237 ## $1&make_ttl_safe($2)$3
    238 
    239 ##    print STDERR "*** st = $section_text\n\n";
    240 
    241 
     214    ## $1&make_ttl_safe($2)$3
     215
     216##      print STDERR "*** st = $section_text\n\n";
     217##  $self->debug_section_text($section_text);
     218   
    242219    print $xml_outhandler $section_text;
    243220    }
     
    245222    $self->close_xslt_pipe();
    246223
    247     # now feed the generated file to jena's (TDB) tripple store
     224    # now feed the generated file to jena's (TDB) triple store
    248225
    249226    my $outhandle = $self->{'outhandle'};
    250     print $outhandle "  Inserting tripples for $doc_oid\n";
     227    print $outhandle "  Inserting triples for $output_root\n";
    251228
    252229    my $collection = $self->{'collection'};
     
    254231    if (-f $tmp_doc_filename) {
    255232
    256     my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
     233#   my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
     234    my $cmd = "gs-triplestore-add3 $collection \"$tmp_doc_filename\"";
    257235       
    258236    my $status = system($cmd);
     
    261239    }
    262240   
    263     unlink $tmp_doc_filename;
     241    print STDERR "**** temporarily supressing deletion of: $tmp_doc_filename\n";
     242#   unlink $tmp_doc_filename;
    264243    }
    265244    else {
    266245    print STDERR "*** Failed to generate: $tmp_doc_filename\n";
    267246    }
     247
     248}
     249   
     # Per-document entry point for the text pass: serialises the document to
     # section XML via docprint::get_section_xml() and hands that XML, together
     # with the output root name "doc-<OID>", to $self->xml_to_ttl().
     #
     # Parameters: $doc_obj - document object; must provide get_OID().
     #
     # The long commented-out region below is the previous inline implementation
     # that this changeset moved into xml_to_ttl().
     250sub textedit {
     251    my $self = shift (@_);
     252    my ($doc_obj) = @_;
            # NOTE(review): $handle is assigned but not used in the active code of this sub.
     253    my $handle = $self->{'output_handle'};
     254
     255    # print STDERR "**** jenaTDBBuildproc::textedit()\n";
     256   
     257    my $doc_oid = $doc_obj->get_OID();
     258    my $ttl_output_root_file = "doc-$doc_oid";
     259
     260    my $section_text = &docprint::get_section_xml($doc_obj);
     261    $self->xml_to_ttl($section_text,$ttl_output_root_file);
     262   
     263    # my $tmp_dir = $self->{'tmp_dir'};
     264    # my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl");
     265    # my $tmp_doc_filename_cc    = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);
     266
     267    # my $xslt_filename = $self->{'xslt_filename'};
     268    # $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename); # stops with error if not able to open pipe
     269
     270    # my $xml_outhandler = $self->{'xslt_writer'};
     271
     272    # if (defined $xml_outhandler) {
     273    #   binmode($xml_outhandler,":utf8");
     274
     275    #   my $section_text = &docprint::get_section_xml($doc_obj);
     276
     277    #   $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse;
     278    #   ## $1&make_ttl_safe($2)$3
     279
     280    #   ##    print STDERR "*** st = $section_text\n\n";
     281
     282    #   print $xml_outhandler $section_text;
     283    # }
     284
     285    # $self->close_xslt_pipe();
     286
     287    # # now feed the generated file to jena's (TDB) tripple store
     288
     289    # my $outhandle = $self->{'outhandle'};
     290    # print $outhandle "  Inserting tripples for $doc_oid\n";
     291
     292    # my $collection = $self->{'collection'};
     293
     294    # if (-f $tmp_doc_filename) {
     295
     296    #   # my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
     297    #   my $cmd = "gs-triplestore-add3 $collection \"$tmp_doc_filename\"";
     298       
     299    #   my $status = system($cmd);
     300    #   if ($status != 0) {
     301    #       print STDERR "Error: failed to run:\n  $cmd\n$!\n";
     302    #   }
     303   
     304    #   # print STDERR "**** temporarily supressing deletion of: $tmp_doc_filename\n";
     305    #   unlink $tmp_doc_filename;
     306    # }
     307    # else {
     308    #   print STDERR "*** Failed to generate: $tmp_doc_filename\n";
     309    # }
    268310
    269311}
     
    297339
    298340
    299 
     # Per-document handler for the infodb pass: passes the document to
     # classify::classify_doc() with this object's classifiers, but only when
     # its doc type is "indexed_doc" or "info_doc"; any other type returns early.
     #
     # Parameters:
     #   $doc_obj   - document object; must provide get_doc_type()
     #   $filename  - accepted but not used in the active code
     #   $edit_mode - accepted but not used in the active code
     #
     # The doclist / num_docs / num_bytes bookkeeping below is commented out, so
     # this sub does not update those fields itself.
     341sub infodbedit
     342{
     343    my $self = shift (@_);
     344    my ($doc_obj, $filename, $edit_mode) = @_;
     345
     346    # print STDERR "**** jenaTDBBuidproc::infodbedit(): $filename, $edit_mode\n";
     347   
     348    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
     349    my $doctype = $doc_obj->get_doc_type();
     350    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
     351
     352
     353    #
     354    # The following is done in basebuildproc, consider if it makes sense to do here
     355    #
     356   
     357#    #add this document to the browse structure
     358#    push(@{$self->{'doclist'}},$doc_obj->get_OID())
     359#   unless ($doctype eq "classification");
     360#    $self->{'num_docs'} += 1 unless ($doctype eq "classification");
     361   
     362#    if (!defined $filename) {
     363#   # a reconstructed doc
     364#   my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
     365#   if (defined $num_reconstructed_bytes) {
     366#       $self->{'num_bytes'} += $num_reconstructed_bytes;
     367#   }
     368#    }
     369
     370    # classify the document
     371    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
     372
     373   
     374}   
    300375
    301376
Note: See TracChangeset for help on using the changeset viewer.