source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28488

Last change on this file since 28488 was 28488, checked in by davidb, 11 years ago

Further work was necessary to better process the data passing through to be suitable escaped for TTL

File size: 7.4 KB
RevLine 
[28391]1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
[28392]32no strict 'refs'; # allow filehandles to be variables and viceversa
[28391]33
[28410]34use docprint;
[28391]35use util;
[28410]36use FileUtils;
[28391]37
38use extrabuildproc;
39
40
41BEGIN {
42 @jenaTDBBuildproc::ISA = ('extrabuildproc');
43}
44
45sub new()
46 {
47 my $class = shift @_;
48
49 my $self = new extrabuildproc (@_);
50
[28468]51 # Do the following here so it doesn't keep checking (within the util.pm method)
52 # whether it needs to create the directory or not
53 my $tmp_dir = &util::get_collectlevel_tmp_dir();
54 $self->{'tmp_dir'} = $tmp_dir;
[28410]55
[28468]56
57 my $xslt_file_in = "gsdom2rdf.xsl";
58
59 my $xslt_filename_in = &util::locate_config_file($xslt_file_in);
60 if (!defined $xslt_filename_in) {
61 print STDERR "Can not find $xslt_file_in, please make sure you have supplied the correct file path\n";
[28410]62 die "\n";
63 }
64
[28468]65 my $xslt_filename_out = &FileUtils::filenameConcatenate($tmp_dir,$xslt_file_in);
[28410]66
[28468]67 my $collection = $self->{'collection'};
[28410]68
[28468]69 my $url_prefix = &util::get_full_greenstone_url_prefix();
70
71 my $property_hashmap = { 'libraryurl' => $url_prefix,
72 'collect' => $collection };
73
74 file_copy_with_property_sub($xslt_filename_in,$xslt_filename_out,$property_hashmap);
75
76 $self->{'xslt_file'} = $xslt_file_in;
77 $self->{'xslt_filename'} = $xslt_filename_out;
78
[28391]79 return bless $self, $class;
80}
81
82
[28468]83sub property_lookup
84{
85 my ($hashmap,$value) = @_;
86
87 my $lookup = (defined $hashmap->{$value}) ? $hashmap->{$value} : "\@$value\@";
88
89 return $lookup;
90}
91
92
93# Performs a text file copy, substituding substings of the form
94# @xxx@ in the input file with the values set in hashmap
95# passed in
96
97sub file_copy_with_property_sub
98{
99 my ($filename_in,$filename_out,$property_hashmap) = @_;
100
101 if (!open(FIN, "<$filename_in")) {
102 print STDERR "util::file_substitute_at_properteis failed to open $filename_in\n $!\n";
103 return;
104 }
105 binmode(FIN,":utf8");
106
107 if (!open(FOUT, ">$filename_out")) {
108 print STDERR "util::file_substitute_at_properteis failed to open $filename_out\n $!\n";
109 return;
110 }
111 binmode(FOUT,":utf8");
112
113 my $line;
114 while (defined($line = <FIN>)) {
115
116 $line =~ s/\@([^@ ]+)\@/&property_lookup($property_hashmap,$1)/ige;
117
118 print FOUT $line;
119 }
120
121 close(FIN);
122 close(FOUT);
123}
124
125
[28410]126sub open_xslt_pipe
127{
128 my $self = shift @_;
129 my ($output_file_name, $xslt_file)=@_;
130
131 return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
132
133 my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
134 my $xalan_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
135
136 my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);
137
138 $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);
139
140 my $mapping_file_path = "";
141
142 my $cmd = "| java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
143
[28468]144
[28410]145 if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
146 my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
147 $cmd .= "-m $mapping_file_path";
148 }
149
150 open(*XMLWRITER, $cmd)
151 or die "can't open pipe to xslt: $!";
152
153 $self->{'xslt_writer'} = *XMLWRITER;
154
155 print XMLWRITER "<?DocStart?>\n";
156 print XMLWRITER "$output_file_name\n";
157
158 }
159
160
161sub close_xslt_pipe
162{
163 my $self = shift @_;
164
165
166 return unless defined $self->{'xslt_writer'} ;
167
168 my $xsltwriter = $self->{'xslt_writer'};
169
170 print $xsltwriter "<?DocEnd?>\n";
171 close($xsltwriter);
172
173 undef $self->{'xslt_writer'};
174
175}
176
[28488]177sub make_ttl_safe
178{
179 my ($front,$str,$back) = @_;
180
181 $str =~ s/\\/\\\\/g;
182
183 $str =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
184 $str =~ s/\&amp;#([0-9]+);/chr($1)/eig;
[28410]185
[28488]186 $str =~ s/[\r\n]+/\\n/g;
187
188 return "$front$str$back";
189}
190
[28391]191sub textedit {
192 my $self = shift (@_);
[28392]193 my ($doc_obj) = @_;
194 my $handle = $self->{'output_handle'};
195
196 my $doc_oid = $doc_obj->get_OID();
197
[28410]198 my $tmp_dir = $self->{'tmp_dir'};
[28468]199 my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl");
200 my $tmp_doc_filename_cc = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);
[28410]201
202 my $xslt_filename = $self->{'xslt_filename'};
[28468]203 $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename); # stops with error if not able to open pipe
[28410]204
205 my $outhandler = $self->{'xslt_writer'};
206 binmode($outhandler,":utf8");
207
208 my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
[28488]209
210# $section_text =~ s/[\r\n]+$//s; # remove very last newline char
211
212## $section_text =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
213## $section_text =~ s/\&#([0-9]+);/chr($1)/eig;
214
215# $section_text =~ s/\\/\\\\/g;
216
217# $section_text =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
218# $section_text =~ s/\&amp;#([0-9]+);/chr($1)/eig;
219
220# $section_text =~ s/(\r|\n)+/&lt;br \/&gt;/gs;
221# $section_text =~ s/[\r\n]+/ /gs;
222
223## $section_text =~ s/\n/ AND /gs;
224
225# open(DOUT,">/tmp/debug.xml") || die "Failed to open";
226# print DOUT $section_text;
227# print DOUT "\n";
228# close DOUT;
229# exit -1;
230
231 $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse;
232
233## $1&make_ttl_safe($2)$3
234
235## print STDERR "*** st = $section_text\n\n";
236
237
[28410]238 print $outhandler $section_text;
239
240 $self->close_xslt_pipe();
241
242 # now feed to generated file to jena's (TDB) tripple store
243
[28468]244 my $outhandle = $self->{'outhandle'};
245 print $outhandle " Inserting tripples for $doc_oid\n";
[28410]246
[28468]247 my $collection = $self->{'collection'};
[28410]248
[28468]249 if (-f $tmp_doc_filename) {
[28410]250
[28468]251 my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
[28488]252
[28468]253 my $status = system($cmd);
254 if ($status != 0) {
255 print STDERR "Error: failed to run:\n $cmd\n$!\n";
[28392]256 }
257
[28468]258 unlink $tmp_doc_filename;
[28392]259 }
[28468]260 else {
261 print STDERR "*** Failed to generate: $tmp_doc_filename\n";
262 }
[28392]263
264}
265
266
[28391]267sub text {
268 my $self = shift (@_);
269 my ($doc_obj,$file) = @_;
270
271 $self->textedit($doc_obj,$file,"add");
272}
273
274sub textreindex
275{
276 my $self = shift @_;
277 my ($doc_obj,$file) = @_;
278
279 $self->textedit($doc_obj,$file,"update");
280}
281
282sub textdelete
283{
284 my $self = shift @_;
285
286 my ($doc_obj,$file) = @_;
287
288 print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";
289
290 # $self->textedit($doc_obj,$file,"delete");
291}
292
293
294
295
296
2971;
Note: See TracBrowser for help on using the repository browser.