source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28410

Last change on this file since 28410 was 28410, checked in by davidb, 7 years ago

Code now runs an XSLT over the doc.xml file

File size: 7.0 KB
Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use docprint;
35use util;
36use FileUtils;
37
38use extrabuildproc;
39
40
# Set up inheritance at compile time: this build processor specialises
# extrabuildproc.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
44
# Constructor.
#
# All arguments after the class name are handed unchanged to the
# extrabuildproc constructor.  The resulting object additionally records:
#   xslt_file     - name of the stylesheet ("gsdom2rdf.xsl")
#   xslt_filename - value returned by util::locate_config_file() for it
#   tmp_dir       - value returned by util::get_collectlevel_tmp_dir()
#
# Prints a message to STDERR and dies if the stylesheet cannot be located.
# Returns the object blessed into $class.
sub new
{
    my $class = shift @_;

    # Plain method call instead of the indirect-object form "new Foo (...)",
    # whose parse depends on which subs happen to be declared.
    my $self = extrabuildproc->new(@_);

    my $xslt_file = "gsdom2rdf.xsl";

    my $xslt_filename = &util::locate_config_file($xslt_file);
    if (!defined $xslt_filename) {
        print STDERR "Can not find $xslt_file, please make sure you have supplied the correct file path\n";
        die "\n";
    }

    $self->{'xslt_file'}     = $xslt_file;
    $self->{'xslt_filename'} = $xslt_filename;

    # Do the following here so it doesn't keep checking (within the util.pm
    # method) whether it needs to create the directory or not
    my $tmp_dir = &util::get_collectlevel_tmp_dir();
    $self->{'tmp_dir'} = $tmp_dir;

    return bless $self, $class;
}
69
70
71
72
# Start an ApplyXSLT java process and keep a write handle to its STDIN.
#
# Parameters:
#   $output_file_name - written to the process on the line that follows the
#                       "<?DocStart?>" marker
#   $xslt_file        - stylesheet passed to ApplyXSLT via -t
#
# If $self->{'mapping_file'} is defined and non-empty it is passed via -m.
#
# Returns nothing, and leaves $self->{'xslt_writer'} untouched, when
# $xslt_file is undefined, empty, or FileUtils::fileExists() is false for it.
# Otherwise stores the handle in $self->{'xslt_writer'}, writes the
# "<?DocStart?>" header and returns 1.  Dies if the pipe cannot be opened.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);

    my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
    my $xalan_jar      = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);

    $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);

    # NOTE(review): this is a single command string, so it is still run via
    # the shell; the double quotes only guard against spaces in the paths.
    # List-form pipe open would bypass the shell -- confirm it is supported
    # on every target platform/perl before switching to it.
    my $cmd = "java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
        $cmd .= "-m $mapping_file_path";
    }

    print STDERR "*** cmd = | $cmd\n";

    # Three-argument open with an explicit pipe mode and a lexical handle:
    # the mode can no longer be influenced by the command text, and each
    # call gets its own handle rather than sharing one package-global glob.
    open(my $xslt_writer, '|-', $cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = $xslt_writer;

    print $xslt_writer "<?DocStart?>\n";
    print $xslt_writer "$output_file_name\n";

    return 1;
}
108
109
# Finish the document currently being written to the XSLT pipe.
#
# Does nothing if $self->{'xslt_writer'} is not defined.  Otherwise writes
# the "<?DocEnd?>" marker, closes the handle and resets
# $self->{'xslt_writer'} to undef.  A failed close is reported on STDERR
# rather than ignored; it is not fatal.
sub close_xslt_pipe
{
    my $self = shift @_;

    my $xsltwriter = $self->{'xslt_writer'};
    return unless defined $xsltwriter;

    print $xsltwriter "<?DocEnd?>\n";

    # Closing a piped handle waits for the child process; close() is false
    # if that process exited non-zero (in which case $! is 0 and the exit
    # status is in $?) or if a system call failed ($! is set).
    unless (close($xsltwriter)) {
        my $reason = $! ? "error closing pipe: $!"
                        : "process exited with status $?";
        print STDERR "Warning: xslt pipe did not finish cleanly ($reason)\n";
    }

    undef $self->{'xslt_writer'};

    return;
}
125
126
# Push one document through the XSLT pipe.
#
# Parameters:
#   $doc_obj - document object; get_OID() and get_top_section() are called
#              on it, and it is handed to docprint::get_section_xml()
#
# The output file name given to the pipe is "doc-<OID>.xml" inside
# $self->{'tmp_dir'}.  The XML of the top section is written to the pipe as
# UTF-8, after which the pipe is closed.
#
# Dies if no pipe is available after open_xslt_pipe() returns.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();

    my $tmp_dir = $self->{'tmp_dir'};
    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.xml");
    $tmp_doc_filename = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);

    my $xslt_filename = $self->{'xslt_filename'};
    $self->open_xslt_pipe($tmp_doc_filename, $xslt_filename); # dies if the pipe cannot be opened

    # open_xslt_pipe() returns quietly, without opening anything, when the
    # stylesheet is missing.  Fail here with a clear message instead of
    # going on to use an undefined filehandle.
    my $outhandler = $self->{'xslt_writer'};
    if (!defined $outhandler) {
        my $xslt_desc = defined $xslt_filename ? $xslt_filename : "(undefined)";
        die "jenaTDBBuildproc::textedit: no XSLT pipe available for document $doc_oid (stylesheet: $xslt_desc)";
    }
    binmode($outhandler,":utf8");

    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    print $outhandler $section_text;

    $self->close_xslt_pipe();

    # TODO: now feed the generated file to jena's (TDB) triple store

    return;
}
154
155
# Walk every section of a document and report its title metadata.
#
# Parameters:
#   $doc_obj - document object providing get_OID(), get_top_section(),
#              get_next_section() and get_metadata_element()
#
# For each section, the "Title", "dc.Title" and "ex.ID3.Title" elements are
# looked up in that order; each one that is defined and contains a
# non-whitespace character is printed to the currently selected output
# handle.  Finally an (empty) text string is printed to
# $self->{'output_handle'}.
sub texteditMG {
    my ($self, $doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    my $doc_oid = $doc_obj->get_OID();

    # Each entry: [ label used in the printed line, metadata element name ].
    # Note the ID3 label differs from the element it is read from.
    my @title_fields = (
        [ "Title",     "Title"        ],
        [ "dc.Title",  "dc.Title"     ],
        [ "id3.Title", "ex.ID3.Title" ],
    );

    # Nothing is accumulated for the document text at present
    my ($text, $text_extra) = ("", "");

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        foreach my $field (@title_fields) {
            my ($label, $element) = @{$field};

            my $value = $doc_obj->get_metadata_element($section, $element);
            next unless defined $value && ($value =~ m/\S/);

            print "$doc_oid: $label = $value\n";
        }
    }

    print $handle "$text$text_extra";
}
200
201
202
# Announce the insertion of a document's triples.
#
# Parameters:
#   $doc_obj - document object providing get_OID(), get_top_section() and
#              get_metadata_element()
#   $file    - not used here
#   $mode    - not used here
#
# Written on the assumption that jenaTDB performs a replace when given a
# docid that already exists, so "add" and "update" need no separate
# handling.
#
# The file names below are only needed by the disabled jenaTDB command at
# the end; the one visible effect is the progress line written to
# $self->{'outhandle'}.
sub texteditADB {
    my ($self, $doc_obj, $file, $mode) = @_;

    # outhandle: progress output; source_dir: typically the archives dir
    my ($outhandle, $source_dir, $build_dir)
        = @{$self}{qw(outhandle source_dir build_dir)};

    # full path to adb database
    my $adb_filename = &util::filename_cat($build_dir, "jenaTDB", "lsh-features.adb");

    my $doc_oid = $doc_obj->get_OID();

    # the document's associated-files directory, relative to $source_dir
    my $assoc_file = $doc_obj->get_metadata_element(
        $doc_obj->get_top_section(), "assocfilepath");
    my $assoc_filename = &util::filename_cat($source_dir, $assoc_file);

    my $chr12_filename    = &util::filename_cat($assoc_filename, "doc.chr12");
    my $powerlog_filename = &util::filename_cat($assoc_filename, "doc.power");

    print $outhandle " Inserting tripples for $doc_oid\n";

#    my $cmd = "jenaTDB -d \"$adb_filename\" -I -k \"$doc_oid\" -f \"$chr12_filename\" -w \"$powerlog_filename\"";

#    my $status = system($cmd);
#    if ($status != 0) {
#        print STDERR "Error: failed to run:\n  $cmd\n$!\n";
#    }

}
244
# Process a new document: forwards to textedit() with the mode "add".
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
251
# Process a changed document: forwards to textedit() with the mode "update".
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
259
# Handle a deleted document.  Deletion is not carried out: a warning is
# written to STDERR and nothing else happens.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    # The call that would perform the delete is left disabled:
    #   $self->textedit($doc_obj,$file,"delete");
    print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";
}
270
271
272
273
274
2751;
Note: See TracBrowser for help on using the repository browser.