source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 35139

Last change on this file since 35139 was 35139, checked in by davidb, 3 years ago

Code returned to removing tmp files

File size: 10.1 KB
Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use docprint;
35use util;
36use FileUtils;
37
38use extrabuildproc;
39
40
# Set up inheritance at compile time: this build processor is a
# specialisation of extrabuildproc.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
44
# Constructor.
#
# All arguments are handed straight to the extrabuildproc constructor; the
# resulting object is then extended with what is needed to turn Greenstone
# section XML into RDF:
#
#   'tmp_dir'       - collection-level tmp directory (looked up once here so
#                     that util.pm does not re-check it for every document)
#   'xslt_file'     - base name of the stylesheet template (gsdom2rdf.xsl)
#   'xslt_filename' - full path of the customised copy of that stylesheet,
#                     written into 'tmp_dir' with its @libraryurl@ and
#                     @collect@ placeholders filled in
#
# Dies if the stylesheet template cannot be located.
#
# NOTE(review): 'collection' is read from the object returned by the parent
# constructor; presumably the parent sets it -- confirm.
sub new
{
    my $class = shift @_;

    # Explicit arrow call instead of indirect-object syntax (new Class ...),
    # which is ambiguous to the parser.  No prototype on this sub either:
    # prototypes are ignored for method calls.
    my $self = extrabuildproc->new(@_);

    # Do the following here so it doesn't keep checking (within the util.pm
    # method) whether it needs to create the directory or not
    my $tmp_dir = &util::get_collectlevel_tmp_dir();
    $self->{'tmp_dir'} = $tmp_dir;

    my $xslt_file_in = "gsdom2rdf.xsl";

    my $xslt_filename_in = &util::locate_config_file($xslt_file_in);
    if (!defined $xslt_filename_in) {
        print STDERR "Can not find $xslt_file_in, please make sure you have supplied the correct file path\n";
        die "\n";
    }

    my $xslt_filename_out = &FileUtils::filenameConcatenate($tmp_dir,$xslt_file_in);

    my $collection = $self->{'collection'};

    my $url_prefix = &util::get_full_greenstone_url_prefix();

    # Values substituted for the @name@ placeholders in the stylesheet
    my $property_hashmap = { 'libraryurl' => $url_prefix,
                             'collect'    => $collection };

    file_copy_with_property_sub($xslt_filename_in,$xslt_filename_out,$property_hashmap);

    $self->{'xslt_file'}     = $xslt_file_in;
    $self->{'xslt_filename'} = $xslt_filename_out;

    return bless $self, $class;
}
81
82
# Resolves one placeholder name against a property table.
#
#   1st argument - hash reference mapping placeholder names to their
#                  replacement text
#   2nd argument - the placeholder name (the text found between two @ signs)
#
# Returns the replacement text when the table holds a defined value for the
# name.  Otherwise the placeholder is handed back in its @name@ form, so an
# unknown placeholder survives the substitution unchanged.
sub property_lookup
{
    my ($table, $name) = @_;

    my $replacement = $table->{$name};
    return $replacement if defined $replacement;

    return '@' . $name . '@';
}
91
92
# Performs a text file copy, substituting substrings of the form
# @xxx@ in the input file with the values set in the hashmap
# passed in.
#
#   $filename_in      - file to read (read with the :utf8 layer)
#   $filename_out     - file to create/overwrite (written with the :utf8 layer)
#   $property_hashmap - hash reference of placeholder name => replacement;
#                       names without a defined entry are left as @xxx@
#                       (see property_lookup)
#
# Returns 1 on success.  On failure a message is printed to STDERR and
# nothing is returned.
sub file_copy_with_property_sub
{
    my ($filename_in,$filename_out,$property_hashmap) = @_;

    my $sub_name = "jenaTDBBuildproc::file_copy_with_property_sub";

    # Three-argument open with lexical handles: the file name can never be
    # mistaken for an open mode, and the handles are private to this call.
    my $fin;
    if (!open($fin, "<:utf8", $filename_in)) {
        print STDERR "$sub_name failed to open $filename_in\n $!\n";
        return;
    }

    my $fout;
    if (!open($fout, ">:utf8", $filename_out)) {
        # report first, so that close() cannot disturb $!
        print STDERR "$sub_name failed to open $filename_out\n $!\n";
        close($fin);
        return;
    }

    while (defined(my $line = <$fin>)) {

        $line =~ s/\@([^@ ]+)\@/property_lookup($property_hashmap,$1)/ge;

        print $fout $line;
    }

    close($fin);

    # Buffered write errors (e.g. a full disk) only surface at close time
    if (!close($fout)) {
        print STDERR "$sub_name failed to write $filename_out\n $!\n";
        return;
    }

    return 1;
}
124
125
# Starts the org.nzdl.gsdl.ApplyXSLT Java program with a pipe connected to
# its standard input and remembers the write end in $self->{'xslt_writer'}.
#
#   $output_file_name - written to the pipe on the line following the
#                       <?DocStart?> marker (presumably the file ApplyXSLT
#                       writes its result to -- confirm against ApplyXSLT)
#   $xslt_file        - stylesheet passed to ApplyXSLT with -t
#
# If $self->{'mapping_file'} is set and non-empty it is passed on with -m.
#
# Does nothing (and leaves 'xslt_writer' untouched) when $xslt_file is
# undefined, empty or does not exist.  If the pipe cannot be opened the
# problem is reported on STDERR and 'xslt_writer' is set to undef; callers
# must therefore check 'xslt_writer' before writing.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);

    my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
    my $xalan_jar      = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);

    $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);

    my $cmd = "java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        $cmd .= "-m \"" . $self->{'mapping_file'} . "\"";
    }

    # Lexical handle + explicit "|-" mode: every object gets its own pipe
    # (a shared bareword glob would be clobbered by a second open) and the
    # open mode is stated rather than embedded in the command string.
    my $xslt_writer;
    if (!open($xslt_writer, "|-", $cmd)) {
        print STDERR "Can't open pipe to xslt: $!\n";
        print STDERR "Command was:\n $cmd\n";
        $self->{'xslt_writer'} = undef;
    }
    else {
        $self->{'xslt_writer'} = $xslt_writer;

        print $xslt_writer "<?DocStart?>\n";
        print $xslt_writer "$output_file_name\n";
    }
}
162
163
# Finishes the document currently being sent down the XSLT pipe: writes the
# <?DocEnd?> marker, closes the pipe and resets $self->{'xslt_writer'} to
# undef.  Does nothing when no pipe is open.
#
# A failing close is reported on STDERR.  For a piped handle close() also
# waits for the child process, so this is where a non-zero exit of the XSLT
# program becomes visible.
sub close_xslt_pipe
{
    my $self = shift @_;

    my $xsltwriter = $self->{'xslt_writer'};
    return unless defined $xsltwriter;

    print $xsltwriter "<?DocEnd?>\n";

    if (!close($xsltwriter)) {
        if ($! != 0) {
            # a system call involved in closing the pipe failed
            print STDERR "Error: failed to close pipe to xslt: $!\n";
        }
        else {
            # $! is 0 when the only problem was the child's exit status
            print STDERR "Error: xslt process exited with status " . ($? >> 8) . "\n";
        }
    }

    undef $self->{'xslt_writer'};
}
178
# Makes the text content of a metadata element safe for use inside a
# Turtle (TTL) string literal and returns it re-wrapped in its tags.
#
#   $front - text to put in front (the opening tag, as used by xml_to_ttl)
#   $str   - the text to make safe
#   $back  - text to put behind (the closing tag)
#
# Steps, in this order:
#   1. numeric character references written as &amp;#xHH; / &amp;#NN; are
#      replaced by the character they stand for
#   2. every backslash is doubled
#   3. each run of CR/LF characters becomes the two characters \n
#
# Decoding has to come before the escaping steps; otherwise a backslash
# that only appears after decoding (&amp;#92; or &amp;#x5C;) would reach
# the output unescaped.
sub make_ttl_safe
{
    my ($front,$str,$back) = @_;

    $str =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
    $str =~ s/\&amp;#([0-9]+);/chr($1)/eig;

    $str =~ s/\\/\\\\/g;

    $str =~ s/[\r\n]+/\\n/g;

    return "$front$str$back";
}
192
193
# Converts one chunk of Greenstone section XML to Turtle and loads the
# result into the triple store.
#
#   $section_text - the XML to convert
#   $output_root  - base name (no extension) for the temporary .ttl file
#                   created in $self->{'tmp_dir'}; also used in progress
#                   messages
#
# Steps:
#   1. pipe the XML (with the content of its <Metadata> elements passed
#      through make_ttl_safe) to the XSLT process, which is told to write
#      to <tmp_dir>/<output_root>.ttl
#   2. run "gs-triplestore-add3 <collection> <ttl file>"
#   3. delete the temporary .ttl file, whether or not step 2 succeeded
#
# Problems are reported on STDERR; the method does not die.
sub xml_to_ttl {
    my $self = shift (@_);
    my ($section_text,$output_root) = @_;

    my $tmp_dir = $self->{'tmp_dir'};
    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"$output_root.ttl");
    my $tmp_doc_filename_cc = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);

    # open_xslt_pipe() does not die on failure: it reports the problem and
    # leaves 'xslt_writer' undefined, hence the defined() check below
    my $xslt_filename = $self->{'xslt_filename'};
    $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename);

    my $xml_outhandler = $self->{'xslt_writer'};

    if (defined $xml_outhandler) {
        binmode($xml_outhandler,":utf8");

        $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/make_ttl_safe($1,$2,$3)/gse;

        print $xml_outhandler $section_text;
    }

    $self->close_xslt_pipe();

    # now feed the generated file to jena's (TDB) triple store

    my $outhandle = $self->{'outhandle'};
    print $outhandle " Inserting triples for $output_root\n";

    my $collection = $self->{'collection'};

    if (-f $tmp_doc_filename) {

        my $cmd = "gs-triplestore-add3 $collection \"$tmp_doc_filename\"";

        my $status = system($cmd);
        if ($status != 0) {
            # $! is only meaningful when the command could not be started
            # at all (system() returned -1); otherwise the wait status
            # carries the signal number or exit value
            my $reason;
            if ($status == -1) {
                $reason = "could not execute command: $!";
            }
            elsif ($status & 127) {
                $reason = "command died with signal " . ($status & 127);
            }
            else {
                $reason = "command exited with status " . ($status >> 8);
            }
            print STDERR "Error: failed to run:\n $cmd\n$reason\n";
        }

        unlink $tmp_doc_filename;
    }
    else {
        print STDERR "*** Failed to generate: $tmp_doc_filename\n";
    }

}
249
# Sends one document to the triple store.
#
#   $doc_obj - the document; its OID and its section XML (obtained through
#              docprint::get_section_xml) are used
#
# The temporary Turtle file is named after the document: "doc-<OID>".
# Conversion and loading are done by xml_to_ttl().
#
# Callers in this file pass two further arguments (a file name and an edit
# mode such as "add" or "update"); they are accepted but not used.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();
    my $ttl_output_root_file = "doc-$doc_oid";

    my $section_text = &docprint::get_section_xml($doc_obj);
    $self->xml_to_ttl($section_text,$ttl_output_root_file);
}
312
313
# Handles a document that is being added: forwards the document and its
# file to textedit() with the edit mode "add" and returns whatever
# textedit() returns.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
320
# Handles a document that is being re-indexed: forwards the document and
# its file to textedit() with the edit mode "update" and returns whatever
# textedit() returns.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
328
# Handles a document that is being deleted.  Deletion is not implemented:
# the only effect is a warning on STDERR.  The document and file arguments
# are accepted for interface compatibility but are not used.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    my $notice = "Warning: jenaTDB command-line does not currently support delete operation\n";

    # once supported, this would become: $self->textedit($doc_obj,$file,"delete");
    print STDERR $notice;
}
339
340
# Database-side hook for one document.  The only action taken is to run the
# document through the classifiers held in $self->{'classifiers'}.
#
#   $doc_obj   - the document
#   $filename  - accepted but not used
#   $edit_mode - accepted but not used
#
# Documents whose type is neither "indexed_doc" nor "info_doc" (database
# only) are skipped.
#
# TODO: basebuildproc does additional bookkeeping at this point (adding the
# OID to 'doclist', counting 'num_docs', and adding "total_numbytes" to
# 'num_bytes' for reconstructed documents); consider whether it makes sense
# to do the same here.
sub infodbedit
{
    my ($self, $doc_obj, $filename, $edit_mode) = @_;

    my $doctype = $doc_obj->get_doc_type();

    my $wanted = ($doctype eq "indexed_doc" || $doctype eq "info_doc");
    return unless $wanted;

    # classify the document
    &classify::classify_doc($self->{'classifiers'}, $doc_obj);
}
375
376
3771;
Note: See TracBrowser for help on using the repository browser.