root/gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm @ 28488

Revision 28488, 7.4 KB (checked in by davidb, 7 years ago)

Further work was necessary to better process the data passing through to be suitable escaped for TTL

Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use docprint;
35use util;
36use FileUtils;
37
38use extrabuildproc;
39
40
41BEGIN {
42    @jenaTDBBuildproc::ISA = ('extrabuildproc');
43}
44
45sub new()
46  {
47    my $class = shift @_;
48
49    my $self = new extrabuildproc (@_);
50
51    # Do the following here so it doesn't keep checking (within the util.pm method)
52    # whether it needs to create the directory or not
53    my $tmp_dir = &util::get_collectlevel_tmp_dir();
54    $self->{'tmp_dir'} = $tmp_dir;
55
56
57    my $xslt_file_in = "gsdom2rdf.xsl";
58
59    my $xslt_filename_in = &util::locate_config_file($xslt_file_in);
60    if (!defined $xslt_filename_in) {
61    print STDERR "Can not find $xslt_file_in, please make sure you have supplied the correct file path\n";
62    die "\n";
63    }
64
65    my $xslt_filename_out = &FileUtils::filenameConcatenate($tmp_dir,$xslt_file_in);
66
67    my $collection = $self->{'collection'};
68
69    my $url_prefix = &util::get_full_greenstone_url_prefix();
70
71    my $property_hashmap = { 'libraryurl' => $url_prefix,
72                 'collect'    => $collection };
73
74    file_copy_with_property_sub($xslt_filename_in,$xslt_filename_out,$property_hashmap);
75
76    $self->{'xslt_file'} = $xslt_file_in;
77    $self->{'xslt_filename'} = $xslt_filename_out;
78
79    return bless $self, $class;
80}
81
82
83sub property_lookup
84{
85    my ($hashmap,$value) = @_;
86   
87    my $lookup = (defined $hashmap->{$value}) ? $hashmap->{$value} : "\@$value\@";
88
89    return $lookup;
90}
91
92
93# Performs a text file copy, substituding substings of the form
94# @xxx@ in the input file with the values set in hashmap
95# passed in
96
97sub file_copy_with_property_sub
98{
99    my ($filename_in,$filename_out,$property_hashmap) = @_;
100
101    if (!open(FIN, "<$filename_in")) {
102    print STDERR "util::file_substitute_at_properteis failed to open $filename_in\n  $!\n";
103    return;
104    }
105    binmode(FIN,":utf8");
106
107    if (!open(FOUT, ">$filename_out")) {
108    print STDERR "util::file_substitute_at_properteis failed to open $filename_out\n  $!\n";
109    return;
110    }
111    binmode(FOUT,":utf8");
112
113    my $line;
114    while (defined($line = <FIN>)) {
115   
116    $line =~ s/\@([^@ ]+)\@/&property_lookup($property_hashmap,$1)/ige;
117
118    print FOUT $line;
119    }
120
121    close(FIN);
122    close(FOUT);       
123}
124
125
126sub open_xslt_pipe
127{
128    my $self = shift @_;
129    my ($output_file_name, $xslt_file)=@_;
130
131    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);
132   
133    my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
134    my $xalan_jar      = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");
135
136    my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);
137
138    $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);
139
140    my $mapping_file_path = "";
141
142    my $cmd = "| java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";
143
144
145    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne ""){
146    my $mapping_file_path = "\"".$self->{'mapping_file'}."\"";
147    $cmd .= "-m $mapping_file_path";
148    }
149   
150    open(*XMLWRITER, $cmd)
151    or die "can't open pipe to xslt: $!";
152   
153    $self->{'xslt_writer'} = *XMLWRITER;
154
155    print XMLWRITER "<?DocStart?>\n";       
156    print XMLWRITER "$output_file_name\n";
157 
158  }
159 
160
161sub close_xslt_pipe
162{
163  my $self = shift @_;
164
165 
166  return unless defined $self->{'xslt_writer'} ;
167   
168  my $xsltwriter = $self->{'xslt_writer'};
169 
170  print $xsltwriter "<?DocEnd?>\n";
171  close($xsltwriter);
172
173  undef $self->{'xslt_writer'};
174
175}
176
177sub make_ttl_safe
178{
179    my ($front,$str,$back) = @_;
180   
181    $str =~ s/\\/\\\\/g;
182   
183    $str =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
184    $str =~ s/\&amp;#([0-9]+);/chr($1)/eig;
185
186    $str =~ s/[\r\n]+/\\n/g;
187
188    return "$front$str$back";
189}
190
191sub textedit {
192    my $self = shift (@_);
193    my ($doc_obj) = @_;
194    my $handle = $self->{'output_handle'};
195   
196    my $doc_oid = $doc_obj->get_OID();
197
198    my $tmp_dir = $self->{'tmp_dir'};
199    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.ttl");
200    my $tmp_doc_filename_cc    = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);
201
202    my $xslt_filename = $self->{'xslt_filename'};
203    $self->open_xslt_pipe($tmp_doc_filename_cc, $xslt_filename); # stops with error if not able to open pipe
204
205    my $outhandler = $self->{'xslt_writer'};
206    binmode($outhandler,":utf8");
207
208    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
209
210#    $section_text =~ s/[\r\n]+$//s; # remove very last newline char
211
212##    $section_text =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
213##    $section_text =~ s/\&#([0-9]+);/chr($1)/eig;
214
215#    $section_text =~ s/\\/\\\\/g;
216
217#    $section_text =~ s/\&amp;#x([0-9A-F]+);/chr(hex($1))/eig;
218#    $section_text =~ s/\&amp;#([0-9]+);/chr($1)/eig;
219
220#    $section_text =~ s/(\r|\n)+/&lt;br \/&gt;/gs;
221#    $section_text =~ s/[\r\n]+/ /gs;
222
223##    $section_text =~ s/\n/ AND /gs;
224
225#    open(DOUT,">/tmp/debug.xml") || die "Failed to open";
226#    print DOUT $section_text;
227#    print DOUT "\n";
228#    close DOUT;
229#    exit -1;
230
231    $section_text =~ s/(<Metadata[^>]*>)(.*?)(<\/Metadata>)/&make_ttl_safe($1,$2,$3)/gse;
232
233## $1&make_ttl_safe($2)$3
234
235##    print STDERR "*** st = $section_text\n\n";
236
237
238    print $outhandler $section_text;
239
240    $self->close_xslt_pipe();
241
242    # now feed to generated file to jena's (TDB) tripple store
243
244    my $outhandle = $self->{'outhandle'};
245    print $outhandle "  Inserting tripples for $doc_oid\n";
246
247    my $collection = $self->{'collection'};
248
249    if (-f $tmp_doc_filename) {
250
251    my $cmd = "gs-triplestore-add $collection \"$tmp_doc_filename\"";
252       
253    my $status = system($cmd);
254    if ($status != 0) {
255        print STDERR "Error: failed to run:\n  $cmd\n$!\n";
256    }
257   
258    unlink $tmp_doc_filename;
259    }
260    else {
261    print STDERR "*** Failed to generate: $tmp_doc_filename\n";
262    }
263
264}
265
266
267sub text {
268    my $self = shift (@_);
269    my ($doc_obj,$file) = @_;
270
271    $self->textedit($doc_obj,$file,"add");
272}
273
274sub textreindex
275{
276    my $self = shift @_;
277    my ($doc_obj,$file) = @_;
278
279    $self->textedit($doc_obj,$file,"update");
280}
281
282sub textdelete
283{
284    my $self = shift @_;
285
286    my ($doc_obj,$file) = @_;
287
288    print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";
289
290    # $self->textedit($doc_obj,$file,"delete");
291}
292
293
294
295
296
2971;
Note: See TracBrowser for help on using the browser.