##########################################################################
#
# jenaTDBBuildproc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This document processor outputs a document for indexing (should be
# implemented by subclass) and storing in the database

package jenaTDBBuildproc;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use docprint;
use util;
use FileUtils;

use extrabuildproc;

BEGIN {
    @jenaTDBBuildproc::ISA = ('extrabuildproc');
}

# Constructor.
#
# Builds the object through extrabuildproc's constructor (all arguments
# are passed through unchanged), locates the "gsdom2rdf.xsl" stylesheet
# via util::locate_config_file and records both its name and its located
# path on the object, together with the collect-level tmp directory.
#
# Dies (after printing a message to STDERR) if the stylesheet cannot be
# located.
sub new
{
    my $class = shift @_;

    my $self = extrabuildproc->new(@_);

    my $xslt_file     = "gsdom2rdf.xsl";
    my $xslt_filename = &util::locate_config_file($xslt_file);

    if (!defined $xslt_filename) {
        print STDERR "Can not find $xslt_file, please make sure you have supplied the correct file path\n";
        die "\n";
    }

    $self->{'xslt_file'}     = $xslt_file;
    $self->{'xslt_filename'} = $xslt_filename;

    # Do the following here so it doesn't keep checking (within the util.pm method)
    # whether it needs to create the directory or not
    my $tmp_dir = &util::get_collectlevel_tmp_dir();
    $self->{'tmp_dir'} = $tmp_dir;

    return bless $self, $class;
}

# Opens a write pipe to the Java ApplyXSLT tool and stores the handle in
# $self->{'xslt_writer'}.
#
#   $output_file_name - written to the pipe (after one empty line) as the
#                       first thing the Java process reads
#   $xslt_file        - stylesheet passed to ApplyXSLT with -t
#
# If $self->{'mapping_file'} is set and non-empty it is passed with -m.
#
# Returns 1 once the pipe is open; returns nothing (and opens no pipe) if
# $xslt_file is undefined, empty or does not exist.  Dies if the pipe
# itself cannot be opened.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file
        and $xslt_file ne ""
        and &FileUtils::fileExists($xslt_file);

    my $apply_xslt_jar  = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
    my $xalan_jar       = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
    my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);

    $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);

    # The command is kept as a single shell string (rather than a list)
    # so that it behaves the same on every platform the build runs on.
    my $java_cmd = "java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
        $java_cmd .= "-m $mapping_file_path";
    }

    print STDERR "*** cmd = | $java_cmd\n";

    # Three-argument open with a lexical handle: the pipe mode is explicit
    # and no package-global glob is shared between calls.
    open(my $xslt_writer, '|-', $java_cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = $xslt_writer;

    # An empty line followed by the output file name precede the document
    # (presumably the header ApplyXSLT expects -- confirm against the tool).
    print $xslt_writer "\n";
    print $xslt_writer "$output_file_name\n";

    return 1;
}

# Finishes the stream sent to ApplyXSLT and closes the pipe stored in
# $self->{'xslt_writer'}.  Does nothing if no pipe is open.
#
# Closing a pipe waits for the child process and reports failure if it
# exited with a non-zero status; this is reported as a warning on STDERR.
sub close_xslt_pipe
{
    my $self = shift @_;

    return unless defined $self->{'xslt_writer'};

    my $xsltwriter = $self->{'xslt_writer'};

    print $xsltwriter "\n";

    if (!close($xsltwriter)) {
        print STDERR "Warning: closing pipe to ApplyXSLT failed (exit status $?): $!\n";
    }

    $self->{'xslt_writer'} = undef;

    return;
}

# Writes the XML of the document's top section through the XSLT pipe,
# asking for the result to be stored as "doc-<OID>.xml" in the tmp
# directory recorded by the constructor.
#
#   $doc_obj - document object (must provide get_OID and get_top_section)
#
# Any further arguments are ignored.  Dies if the pipe could not be
# opened.
sub textedit
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();

    my $tmp_dir          = $self->{'tmp_dir'};
    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.xml");
    $tmp_doc_filename    = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);

    my $xslt_filename = $self->{'xslt_filename'};

    # open_xslt_pipe dies if the open itself fails, but merely returns when
    # the stylesheet is missing; make that case an explicit error rather
    # than carrying on with an undefined filehandle.
    $self->open_xslt_pipe($tmp_doc_filename, $xslt_filename);

    my $outhandler = $self->{'xslt_writer'};
    if (!defined $outhandler) {
        die "jenaTDBBuildproc::textedit: unable to open XSLT pipe for $doc_oid (stylesheet '$xslt_filename' missing?)\n";
    }

    binmode($outhandler,":utf8");

    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    print $outhandler $section_text;

    $self->close_xslt_pipe();

    # TODO: now feed the generated file to Jena's (TDB) triple store
}

# Walks every section of the document and prints (to STDOUT) any
# non-blank Title, dc.Title and ex.ID3.Title metadata it finds, then
# writes an empty string to $self->{'output_handle'}.
#
#   $doc_obj - document object
sub texteditMG
{
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $handle  = $self->{'output_handle'};
    my $doc_oid = $doc_obj->get_OID();

    # metadata name => label used in the printed line
    my @title_fields = (
        [ "Title",        "Title"     ],
        [ "dc.Title",     "dc.Title"  ],
        [ "ex.ID3.Title", "id3.Title" ],
    );

    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        foreach my $field (@title_fields) {
            my ($metaname, $label) = @$field;

            my $value = $doc_obj->get_metadata_element($section, $metaname);
            if (defined $value && ($value =~ m/\S/)) {
                print "$doc_oid: $label = $value\n";
            }
        }

        $section = $doc_obj->get_next_section($section);
    }

    # No text is accumulated for the document at present, so this emits
    # an empty string.
    print $handle "";
}

# Prints a status line for the document to $self->{'outhandle'}.  The
# command that would perform the actual insertion is currently disabled.
#
#   $doc_obj - document object
#   $file    - unused
#   $mode    - unused (see below)
sub texteditADB
{
    my $self = shift (@_);
    my ($doc_obj,$file,$mode) = @_;

    # Code written on the assumption that jenaTDB does a replace
    # operation when presented with a docid that already exists.
    # => don't need to do anything special to distinguish between
    # a mode of "add" and "update"

    my $outhandle  = $self->{'outhandle'};
    my $source_dir = $self->{'source_dir'}; # typically the archives dir
    my $build_dir  = $self->{'build_dir'};

    # full path to adb database
    my $adb_filename
        = &util::filename_cat($build_dir, "jenaTDB", "lsh-features.adb");

    # get doc id
    my $doc_oid = $doc_obj->get_OID();

    # map to assoc dir
    my $top_section = $doc_obj->get_top_section();
    my $assoc_file
        = $doc_obj->get_metadata_element ($top_section,"assocfilepath");
    my $assoc_filename = &util::filename_cat($source_dir,$assoc_file);

    # The paths below are only needed by the disabled command further down.
    my $chr12_filename    = &util::filename_cat($assoc_filename,"doc.chr12");
    my $powerlog_filename = &util::filename_cat($assoc_filename,"doc.power");

    print $outhandle " Inserting tripples for $doc_oid\n";

#    my $cmd = "jenaTDB -d \"$adb_filename\" -I -k \"$doc_oid\" -f \"$chr12_filename\" -w \"$powerlog_filename\"";

#    my $status = system($cmd);
#    if ($status != 0) {
#        print STDERR "Error: failed to run:\n $cmd\n$!\n";
#    }
}

# Processes a newly added document; delegates to textedit.
sub text
{
    my $self = shift (@_);
    my ($doc_obj,$file) = @_;

    $self->textedit($doc_obj,$file,"add");
}

# Processes a document being reindexed; delegates to textedit.
sub textreindex
{
    my $self = shift @_;
    my ($doc_obj,$file) = @_;

    $self->textedit($doc_obj,$file,"update");
}

# Deletion is not supported: only prints a warning to STDERR.
sub textdelete
{
    my $self = shift @_;
    my ($doc_obj,$file) = @_;

    print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";

    # $self->textedit($doc_obj,$file,"delete");
}

1;