source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28802

Last change on this file since 28802 was 28802, checked in by ak19, 10 years ago

Removed 'die' statements in favour of rint to STDERR to allow the perl process to continue, rather than stopping abruptly.

File size: 7.5 KB
Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use docprint;
35use util;
36use FileUtils;
37
38use extrabuildproc;
39
40
41BEGIN {
42 @jenaTDBBuildproc::ISA = ('extrabuildproc');
43}
44
45sub new()
46 {
47 my $class = shift @_;
48
49 my $self = new extrabuildproc (@_);
50
51 # Do the following here so it doesn't keep checking (within the util.pm method)
52 # whether it needs to create the directory or not
53 my $tmp_dir = &util::get_collectlevel_tmp_dir();
54 $self->{'tmp_dir'} = $tmp_dir;
55
56
57 my $xslt_file_in = "gsdom2rdf.xsl";
58
59 my $xslt_filename_in = &util::locate_config_file($xslt_file_in);
60 if (!defined $xslt_filename_in) {
61 print STDERR "Can not find $xslt_file_in, please make sure you have supplied the correct file path\n";
62 die "\n";
63 }
64
65 my $xslt_filename_out = &FileUtils::filenameConcatenate($tmp_dir,$xslt_file_in);
66
67 my $collection = $self->{'collection'};
68
69 my $url_prefix = &util::get_full_greenstone_url_prefix();
70
71 my $property_hashmap = { 'libraryurl' => $url_prefix,
72 'collect' => $collection };
73
74 file_copy_with_property_sub($xslt_filename_in,$xslt_filename_out,$property_hashmap);
75
76 $self->{'xslt_file'} = $xslt_file_in;
77 $self->{'xslt_filename'} = $xslt_filename_out;
78
79 return bless $self, $class;
80}
81
82
83sub property_lookup
84{
85 my ($hashmap,$value) = @_;
86
87 my $lookup = (defined $hashmap->{$value}) ? $hashmap->{$value} : "\@$value\@";
88
89 return $lookup;
90}
91
92
93# Performs a text file copy, substituding substings of the form
94# @xxx@ in the input file with the values set in hashmap
95# passed in
96
97sub file_copy_with_property_sub
98{
99 my ($filename_in,$filename_out,$property_hashmap) = @_;
100
101 if (!open(FIN, "<$filename_in")) {
102 print STDERR "util::file_substitute_at_properteis failed to open $filename_in\n $!\n";
103 return;
104 }
105 binmode(FIN,":utf8");
106
107 if (!open(FOUT, ">$filename_out")) {
108 print STDERR "util::file_substitute_at_properteis failed to open $filename_out\n $!\n";
109 return;
110 }
111 binmode(FOUT,":utf8");
112
113 my $line;
114 while (defined($line = <FIN>)) {
115
116 $line =~ s/\@([^@ ]+)\@/&property_lookup($property_hashmap,$1)/ige;
117
118 print FOUT $line;
119 }
120
121 close(FIN);
122 close(FOUT);
123}
124
125
126sub open_xslt_pipe
127{
128 my $self = shift @_;
129 my ($output_file_name, $xslt_file)=@_;
130
131 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
132
133 my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
134 my $xalan_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
135
136 my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);
137
138 $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);
139
140 my $mapping_file_path = "";
141
142 my $cmd = "| java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
143
144
145 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
146 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
147 $cmd .= "-m $mapping_file_path";
148 }
149
150 if (!open(*XMLWRITER, $cmd)) {
151 print STDERR "Can't open pipe to xslt: $!";
152 print STDERR "Command was:\n $cmd\n";
153 $self->{'xslt_writer'} = undef;
154 }
155 else {
156 $self->{'xslt_writer'} = *XMLWRITER;
157
158 print XMLWRITER "<?DocStart?>\n";
159 print XMLWRITER "$output_file_name\n";
160 }
161 }
162
163
164sub close_xslt_pipe
165{
166 my $self = shift @_;
167
168 return unless defined $self->{'xslt_writer'} ;
169
170 my $xsltwriter = $self->{'xslt_writer'};
171
172 print $xsltwriter "<?DocEnd?>\n";
173 close($xsltwriter);
174
175 undef $self->{'xslt_writer'};
176
177}
178
179sub make_ttl_safe
180{
181 my ($front,$str,$back) = @_;
182
183 $str =~ s/\\/\\\\/g;
184
185 $str =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
186 $str =~ s/\&amp;#([0-9]+);/chr($1)/eig;
187
188 $str =~ s/[\r\n]+/\\n/g;
189
190 return "$front$str$back";
191}
192
193sub textedit {
194 my $self = shift (@_);
195 my ($doc_obj) = @_;
196 my $handle = $self->{'output_handle'};
197
198 my $doc_oid = $doc_obj->get_OID();
199
200 my $tmp_dir = $self->{'tmp_dir'};
201 my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl");
202 my $tmp_doc_filename_cc = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);
203
204 my $xslt_filename = $self->{'xslt_filename'};
205 $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename); # stops with error if not able to open pipe
206
207 my $xml_outhandler = $self->{'xslt_writer'};
208
209 if (defined $xml_outhandler) {
210 binmode($xml_outhandler,":utf8");
211
212 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
213
214# $section_text =~ s/[\r\n]+$//s; # remove very last newline char
215
216## $section_text =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
217## $section_text =~ s/\&#([0-9]+);/chr($1)/eig;
218
219# $section_text =~ s/\\/\\\\/g;
220
221# $section_text =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
222# $section_text =~ s/\&amp;#([0-9]+);/chr($1)/eig;
223
224# $section_text =~ s/(\r|\n)+/&lt;br \/&gt;/gs;
225# $section_text =~ s/[\r\n]+/ /gs;
226
227## $section_text =~ s/\n/ AND /gs;
228
229# open(DOUT,">/tmp/debug.xml") || die "Failed to open";
230# print DOUT $section_text;
231# print DOUT "\n";
232# close DOUT;
233# exit -1;
234
235 $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse;
236
237## $1&make_ttl_safe($2)$3
238
239## print STDERR "*** st = $section_text\n\n";
240
241
242 print $xml_outhandler $section_text;
243 }
244
245 $self->close_xslt_pipe();
246
247 # now feed the generated file to jena's (TDB) tripple store
248
249 my $outhandle = $self->{'outhandle'};
250 print $outhandle " Inserting tripples for $doc_oid\n";
251
252 my $collection = $self->{'collection'};
253
254 if (-f $tmp_doc_filename) {
255
256 my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
257
258 my $status = system($cmd);
259 if ($status != 0) {
260 print STDERR "Error: failed to run:\n $cmd\n$!\n";
261 }
262
263 unlink $tmp_doc_filename;
264 }
265 else {
266 print STDERR "*** Failed to generate: $tmp_doc_filename\n";
267 }
268
269}
270
271
272sub text {
273 my $self = shift (@_);
274 my ($doc_obj,$file) = @_;
275
276 $self->textedit($doc_obj,$file,"add");
277}
278
279sub textreindex
280{
281 my $self = shift @_;
282 my ($doc_obj,$file) = @_;
283
284 $self->textedit($doc_obj,$file,"update");
285}
286
287sub textdelete
288{
289 my $self = shift @_;
290
291 my ($doc_obj,$file) = @_;
292
293 print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";
294
295 # $self->textedit($doc_obj,$file,"delete");
296}
297
298
299
300
301
3021;
Note: See TracBrowser for help on using the repository browser.