Changeset 34969 for gs2-extensions/apache-jena
- Timestamp:
- 2021-03-22T22:07:10+13:00 (3 years ago)
- Location:
- gs2-extensions/apache-jena/trunk/src/perllib
- Files:
-
- 2 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuilder.pm
r28802 r34969 66 66 my $verbosity = $self->{'verbosity'}; 67 67 68 print $outhandle "\n*** Apache Jena TDB-configured Tripple store\n" if ($verbosity >= 1); 68 print $outhandle "\n*** Apache Jena TDB-configured Triple store\n" if ($verbosity >= 1); 69 69 70 70 my $jenaTDB_dir = &util::filename_cat($build_dir, "jenaTDB"); … … 109 109 if (!$self->{'keepold'}) { 110 110 my $collection = $self->{'collection'}; 111 my $cmd = "gs-triplestore-reset $collection"; 111 # my $cmd = "gs-triplestore-reset $collection"; 112 my $cmd = "gs-triplestore-reset3 $collection"; 112 113 113 114 my $status = system($cmd); … … 130 131 } 131 132 133 sub supports_make_infodatabase { 134 return 1; 135 } 136 137 138 139 sub output_classifier { 140 my $self = shift (@_); 141 142 my ($infodb_type, $infodb_handle, $OID, $classify_infodb) = @_; 143 144 my $collect = $self->{'collection'}; 145 146 print STDERR "***** jenaTDBBuilder::output_classifier() info call for $OID\n"; 147 print STDERR "***** \n"; 148 149 # print STDERR join("\n",keys %$classify_infodb); 150 } 151 152 sub callback_with_self_closure { 153 my ($self) = @_; 154 155 my $callback = sub { 156 my ($infodb_type, $infodb_handle, $OID, $classify_infodb) = @_; 157 $self->output_classifier($infodb_type, $infodb_handle, $OID, $classify_infodb); 158 }; 159 160 return $callback; 161 } 162 163 164 sub make_infodatabase { 165 my $self = shift (@_); 166 my $outhandle = $self->{'outhandle'}; 167 168 &classify::reset_next_classify_num(); 169 170 print STDERR "BuildDir: $self->{'build_dir'}\n"; 171 172 # my $textdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text"); 173 # my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc"); 174 # &FileUtils::makeAllDirectories ($textdir); 175 # &FileUtils::makeAllDirectories ($assocdir); 176 177 ## Get info database file path 178 print STDERR "*** jenaTDBBuilder::make_infodatabase() forcing infodbtype to be: fuseki\n"; 179 $self->{'infodbtype'} = "fuseki"; 180 my 
$infodb_type = $self->{'infodbtype'}; 181 182 # my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir); 183 184 print $outhandle "\n*** creating the jenaTDB classifier triples\n" 185 if ($self->{'verbosity'} >= 1); 186 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 187 188 # init all the classifiers 189 &classify::init_classifiers ($self->{'classifiers'}); 190 191 my $reconstructed_docs = undef; 192 my $database_recs = undef; 193 194 if ($self->{'incremental'}) { 195 print STDERR "!!!!!\n"; 196 print STDERR "! Incremental support for jenaTDBBuilder::make_infodatabase() has not been implemented\n"; 197 print STDERR "!!!!!\n"; 198 $database_recs = {}; 199 200 ## &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs); 201 } 202 203 204 my ($infodb_handle); # passed in to output_classify_info(), but OK to be null in this case 205 # if ($self->{'debug'}) { 206 # $infodb_handle = *STDOUT; 207 # } 208 # else { 209 # $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path); 210 # if (!defined($infodb_handle)) 211 # { 212 # print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'}; 213 # die "builder::make_infodatabase - couldn't open infodb write handle\n"; 214 # } 215 # } 216 217 # set up the document processor 218 219 #$self->{'buildproc'}->set_output_handle ($infodb_handle); 220 $self->{'buildproc'}->set_mode ('infodb'); 221 #$self->{'buildproc'}->set_assocdir ($assocdir); 222 #$self->{'buildproc'}->set_dontdb ($self->{'dontdb'}); 223 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 224 $self->{'buildproc'}->set_indexing_text (0); 225 #$self->{'buildproc'}->set_store_text(1); 226 227 #if ($self->{'incremental'}) { 228 # # reconstruct doc_obj metadata from database for all docs 229 # $reconstructed_docs 230 # = &classify::reconstruct_doc_objs_metadata($infodb_type, 231 # $infodb_file_path, 232 # $database_recs); 233 # } 234 
235 # make_infodatabase needs full reset even for incremental build 236 # as incremental works by reconstructing all docs from the database and 237 # then adding in the new ones 238 $self->{'buildproc'}->zero_reset(); 239 240 $self->{'buildproc'}->{'mdprefix_fields'} = {}; 241 242 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 243 "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 244 245 # if ($self->{'incremental'}) { 246 # # create flat classify structure, ready for new docs to be added 247 # foreach my $doc_obj ( @$reconstructed_docs ) { 248 # if (! defined $self->{'buildproc'}->{'dont_process_reconstructed'}->{$doc_obj->get_OID()}) { 249 # print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n"; 250 # $self->{'buildproc'}->process($doc_obj,undef); 251 # } 252 # } 253 # } 254 # this has changed to only output collection meta if its 255 # not in the config file 256 # print STDERR "****!!! Consider adding in a jenaTDB triplestore version of output_collection_meta()\n"; 257 258 $self->output_collection_meta($infodb_handle); 259 260 my $output_callback = \&dbutil::write_infodb_entry; 261 # my $output_callback = callback_with_self_closure($self); 262 263 # output classification information 264 &classify::output_classify_info ($self->{'classifiers'}, $infodb_type, $infodb_handle, 265 $self->{'remove_empty_classifications'}, 266 $output_callback, 267 $self->{'gli'}); 268 269 # Output classifier reverse lookup, used in incremental deletion 270 ####&classify::print_reverse_lookup($infodb_handle); 271 272 # output doclist 273 my @doc_list = $self->{'buildproc'}->get_doc_list(); 274 my $browselist_infodb = { 'hastxt' => [ "0" ], 275 'childtype' => [ "VList" ], 276 'numleafdocs' => [ scalar(@doc_list) ], 277 'thistype' => [ "Invisible" ], 278 'contains' => [ join(";", @doc_list) ] }; 279 &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb); 280 281 # 
&dbutil::close_infodb_write_handle($infodb_type, $infodb_handle) if !$self->{'debug'}; 282 283 # if ($infodb_type eq "gdbm-txtgz") { 284 # my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir); 285 # if (-e $gdb_infodb_file_path) { 286 # &FileUtils::removeFiles($gdb_infodb_file_path); 287 # } 288 # } 289 print STDERR "</Stage>\n" if $self->{'gli'}; 290 } 291 292 293 132 294 1; 133 295 -
gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm
r34691 r34969 191 191 } 192 192 193 sub textedit { 193 194 sub xml_to_ttl { 194 195 my $self = shift (@_); 195 my ($doc_obj) = @_; 196 my ($section_text,$output_root) = @_; 196 197 my $handle = $self->{'output_handle'}; 197 198 198 my $doc_oid = $doc_obj->get_OID(); 199 200 199 my $tmp_dir = $self->{'tmp_dir'}; 201 my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl"); 200 my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"$output_root.ttl"); 202 201 my $tmp_doc_filename_cc = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename); 203 202 … … 210 209 binmode($xml_outhandler,":utf8"); 211 210 212 my $section_text = &docprint::get_section_xml($doc_obj); 213 214 # $section_text =~ s/[\r\n]+$//s; # remove very last newline char 215 216 ## $section_text =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig; 217 ## $section_text =~ s/\&#([0-9]+);/chr($1)/eig; 218 219 # $section_text =~ s/\\/\\\\/g; 220 221 # $section_text =~ s/\&#x([0-9A-F]+);/chr(hex($1))/eig; 222 # $section_text =~ s/\&#([0-9]+);/chr($1)/eig; 223 224 # $section_text =~ s/(\r|\n)+/<br \/>/gs; 225 # $section_text =~ s/[\r\n]+/ /gs; 226 227 ## $section_text =~ s/\n/ AND /gs; 228 229 # open(DOUT,">/tmp/debug.xml") || die "Failed to open"; 230 # print DOUT $section_text; 231 # print DOUT "\n"; 232 # close DOUT; 233 # exit -1; 211 ### my $section_text = &docprint::get_section_xml($doc_obj); 234 212 235 213 $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse; 236 237 ## $1&make_ttl_safe($2)$3 238 239 ## print STDERR "*** st = $section_text\n\n"; 240 241 214 ## $1&make_ttl_safe($2)$3 215 216 ## print STDERR "*** st = $section_text\n\n"; 217 ## $self->debug_section_text($section_text); 218 242 219 print $xml_outhandler $section_text; 243 220 } … … 245 222 $self->close_xslt_pipe(); 246 223 247 # now feed the generated file to jena's (TDB) tripple store 224 # now feed the generated file to jena's (TDB) triple store 248 225 249 226 my $outhandle = 
$self->{'outhandle'}; 250 print $outhandle " Inserting tripples for $doc_oid\n"; 227 print $outhandle " Inserting triples for $output_root\n"; 228 252 229 my $collection = $self->{'collection'}; … … 254 231 if (-f $tmp_doc_filename) { 255 232 256 my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\""; 233 # my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\""; 234 my $cmd = "gs-triplestore-add3 $collection \"$tmp_doc_filename\""; 257 235 258 236 my $status = system($cmd); … … 261 239 } 262 240 263 unlink $tmp_doc_filename; 241 print STDERR "**** temporarily supressing deletion of: $tmp_doc_filename\n"; 242 # unlink $tmp_doc_filename; 264 243 } 265 244 else { 266 245 print STDERR "*** Failed to generate: $tmp_doc_filename\n"; 267 246 } 247 248 } 249 250 sub textedit { 251 my $self = shift (@_); 252 my ($doc_obj) = @_; 253 my $handle = $self->{'output_handle'}; 254 255 # print STDERR "**** jenaTDBBuildproc::textedit()\n"; 256 257 my $doc_oid = $doc_obj->get_OID(); 258 my $ttl_output_root_file = "doc-$doc_oid"; 259 260 my $section_text = &docprint::get_section_xml($doc_obj); 261 $self->xml_to_ttl($section_text,$ttl_output_root_file); 262 263 # my $tmp_dir = $self->{'tmp_dir'}; 264 # my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl"); 265 # my $tmp_doc_filename_cc = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename); 266 267 # my $xslt_filename = $self->{'xslt_filename'}; 268 # $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename); # stops with error if not able to open pipe 269 270 # my $xml_outhandler = $self->{'xslt_writer'}; 271 272 # if (defined $xml_outhandler) { 273 # binmode($xml_outhandler,":utf8"); 274 275 # my $section_text = &docprint::get_section_xml($doc_obj); 276 277 # $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse; 278 # ## $1&make_ttl_safe($2)$3 279 280 # ## print STDERR "*** st = $section_text\n\n"; 281 282 # print $xml_outhandler 
$section_text; 283 # } 284 285 # $self->close_xslt_pipe(); 286 287 # # now feed the generated file to jena's (TDB) tripple store 288 289 # my $outhandle = $self->{'outhandle'}; 290 # print $outhandle " Inserting tripples for $doc_oid\n"; 291 292 # my $collection = $self->{'collection'}; 293 294 # if (-f $tmp_doc_filename) { 295 296 # # my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\""; 297 # my $cmd = "gs-triplestore-add3 $collection \"$tmp_doc_filename\""; 298 299 # my $status = system($cmd); 300 # if ($status != 0) { 301 # print STDERR "Error: failed to run:\n $cmd\n$!\n"; 302 # } 303 304 # # print STDERR "**** temporarily supressing deletion of: $tmp_doc_filename\n"; 305 # unlink $tmp_doc_filename; 306 # } 307 # else { 308 # print STDERR "*** Failed to generate: $tmp_doc_filename\n"; 309 # } 268 310 269 311 } … … 297 339 298 340 299 341 sub infodbedit 342 { 343 my $self = shift (@_); 344 my ($doc_obj, $filename, $edit_mode) = @_; 345 346 # print STDERR "**** jenaTDBBuidproc::infodbedit(): $filename, $edit_mode\n"; 347 348 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document 349 my $doctype = $doc_obj->get_doc_type(); 350 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc"); 351 352 353 # 354 # The following is done in basebuildproc, consider if it makes sense to do here 355 # 356 357 # #add this document to the browse structure 358 # push(@{$self->{'doclist'}},$doc_obj->get_OID()) 359 # unless ($doctype eq "classification"); 360 # $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 361 362 # if (!defined $filename) { 363 # # a reconstructed doc 364 # my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes"); 365 # if (defined $num_reconstructed_bytes) { 366 # $self->{'num_bytes'} += $num_reconstructed_bytes; 367 # } 368 # } 369 370 # classify the document 371 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 372 373 374 } 300 
375 301 376
Note:
See TracChangeset
for help on using the changeset viewer.