root/gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm @ 28410

Revision 28410, 7.0 KB (checked in by davidb, 6 years ago)

Code now runs an XSLT over the doc.xml file

Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use docprint;
35use util;
36use FileUtils;
37
38use extrabuildproc;
39
40
# Set up inheritance at compile time: this package is a subclass of
# extrabuildproc, so methods not defined here are looked up there.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
44
# Constructor.
#
# Builds the object via the extrabuildproc constructor (all arguments after
# the class name are passed through unchanged), then records on it:
#   xslt_file      - the stylesheet's base name ("gsdom2rdf.xsl")
#   xslt_filename  - the value returned by util::locate_config_file for it
#   tmp_dir        - the value returned by util::get_collectlevel_tmp_dir
#
# Dies (after printing a message to STDERR) if the stylesheet cannot be
# located.  Returns the object re-blessed into $class.
sub new
{
    my $class = shift @_;

    # Explicit class-method call rather than the indirect-object form
    # "new extrabuildproc (...)", which Perl can misparse as a call to a
    # local sub named "new" when one is visible in this package.
    my $self = extrabuildproc->new(@_);

    my $xslt_file = "gsdom2rdf.xsl";

    my $xslt_filename = &util::locate_config_file($xslt_file);
    if (!defined $xslt_filename) {
        print STDERR "Can not find $xslt_file, please make sure you have supplied the correct file path\n";
        die "\n";
    }

    $self->{'xslt_file'} = $xslt_file;
    $self->{'xslt_filename'} = $xslt_filename;

    # Do the following here so it doesn't keep checking (within the util.pm
    # method) whether it needs to create the directory or not
    my $tmp_dir = &util::get_collectlevel_tmp_dir();
    $self->{'tmp_dir'} = $tmp_dir;

    return bless $self, $class;
}
69
70
71
72
# Start a Java ApplyXSLT process and open a write pipe to it.
#
# Parameters:
#   $output_file_name - written to the pipe on the line after the
#                       "<?DocStart?>" marker (presumably the file ApplyXSLT
#                       should write its result to -- confirm against
#                       ApplyXSLT)
#   $xslt_file        - path of the stylesheet passed to ApplyXSLT via -t
#
# If $self->{'mapping_file'} is a non-empty string it is passed via -m.
#
# Returns nothing (and opens nothing) when $xslt_file is undefined, empty or
# does not exist.  Otherwise stores the pipe's write handle in
# $self->{'xslt_writer'} and returns 1.  Dies if the pipe cannot be opened.
sub open_xslt_pipe
{
    my $self = shift @_;
    my ($output_file_name, $xslt_file) = @_;

    return unless defined $xslt_file and $xslt_file ne "" and &FileUtils::fileExists($xslt_file);

    my $apply_xslt_jar = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","ApplyXSLT.jar");
    my $xalan_jar      = &FileUtils::javaFilenameConcatenate($ENV{'GSDLHOME'},"bin","java","xalan.jar");

    my $java_class_path = &util::javapathname_cat($apply_xslt_jar,$xalan_jar);

    $xslt_file = &util::makeFilenameJavaCygwinCompatible($xslt_file);

    # The command is kept as a single shell string (not list form) so that
    # quoting behaves exactly as before on every platform.
    my $cmd = "java -cp \"$java_class_path\" org.nzdl.gsdl.ApplyXSLT -t \"$xslt_file\" ";

    if (defined $self->{'mapping_file'} and $self->{'mapping_file'} ne "") {
        $cmd .= "-m \"" . $self->{'mapping_file'} . "\"";
    }

    # Debug trace; the leading "| " keeps the message identical to the one
    # printed when the pipe marker was part of the command string.
    print STDERR "*** cmd = | $cmd\n";

    # Three-argument open with an explicit '|-' mode and a lexical handle:
    # the mode can no longer be influenced by the command text, and no
    # package-global filehandle is shared between calls.
    open(my $xslt_writer, '|-', $cmd)
        or die "can't open pipe to xslt: $!";

    $self->{'xslt_writer'} = $xslt_writer;

    print {$xslt_writer} "<?DocStart?>\n";
    print {$xslt_writer} "$output_file_name\n";

    return 1;
}
108 
109
# Finish the current document and shut down the XSLT pipe.
#
# Does nothing when $self->{'xslt_writer'} is undefined.  Otherwise writes
# the "<?DocEnd?>" marker, closes the handle and clears
# $self->{'xslt_writer'}.  A failed close is reported on STDERR as a warning
# (it does not die), so a failing XSLT process no longer passes unnoticed.
sub close_xslt_pipe
{
    my $self = shift @_;

    my $xsltwriter = $self->{'xslt_writer'};
    return unless defined $xsltwriter;

    print {$xsltwriter} "<?DocEnd?>\n";

    # For a piped handle, close() waits for the child process and returns
    # false if a system call failed ($! set) or the child exited with a
    # non-zero status ($! is 0 and $? holds the wait status).
    if (!close($xsltwriter)) {
        if ($!) {
            print STDERR "Warning: error closing pipe to xslt: $!\n";
        }
        else {
            print STDERR "Warning: xslt process exited with status " . ($? >> 8) . "\n";
        }
    }

    undef $self->{'xslt_writer'};

    return;
}
125
126
# Pipe one document's XML through the XSLT process.
#
# Parameters:
#   $doc_obj - document object; must provide get_OID() and
#              get_top_section().  Any further arguments are ignored.
#
# The XML for the document's top section (from docprint::get_section_xml)
# is written to the pipe opened by open_xslt_pipe, preceded by the name
# "doc-<OID>.xml" inside $self->{'tmp_dir'}.  If no pipe is available the
# document is skipped with a warning on STDERR.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $doc_oid = $doc_obj->get_OID();

    my $tmp_dir = $self->{'tmp_dir'};
    my $tmp_doc_filename = &FileUtils::filenameConcatenate($tmp_dir,"doc-$doc_oid.xml");
    $tmp_doc_filename    = &util::makeFilenameJavaCygwinCompatible($tmp_doc_filename);

    my $xslt_filename = $self->{'xslt_filename'};
    $self->open_xslt_pipe($tmp_doc_filename, $xslt_filename); # stops with error if not able to open pipe

    # open_xslt_pipe returns without opening anything when the stylesheet is
    # undefined, empty or missing; in that case there is no handle to write
    # to, so skip this document instead of failing on an undefined handle.
    my $outhandler = $self->{'xslt_writer'};
    if (!defined $outhandler) {
        print STDERR "Warning: no XSLT pipe available for $doc_oid (stylesheet missing?), skipping\n";
        return;
    }
    binmode($outhandler,":utf8");

    my $section_text = &docprint::get_section_xml($doc_obj,$doc_obj->get_top_section());
    print {$outhandler} $section_text;

    $self->close_xslt_pipe();

    # TODO: feed the generated file to Jena's (TDB) triple store
}
154
155
# Report the title metadata of every section of a document.
#
# Walks the sections from get_top_section() via get_next_section() and, for
# each of the metadata elements "Title", "dc.Title" and "ex.ID3.Title" whose
# value contains a non-whitespace character, prints a line
# "<OID>: <label> = <value>" to the currently selected output handle.
# Finally prints the (always empty) accumulated text to
# $self->{'output_handle'}.
sub texteditMG {
    my ($self, $doc_obj) = @_;

    my $out_fh = $self->{'output_handle'};
    my $oid    = $doc_obj->get_OID();

    my $section_count = 0; # just for this document

    my $text = "";
    my $text_extra = "";

    # [ metadata element name, label used in the printed line ]
    my @title_fields = (
        [ "Title",        "Title"     ],
        [ "dc.Title",     "dc.Title"  ],
        [ "ex.ID3.Title", "id3.Title" ],
    );

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        # update a few statistics
        $section_count++;

        foreach my $field (@title_fields) {
            my ($element, $label) = @$field;

            my $value = $doc_obj->get_metadata_element($section, $element);
            next unless defined $value && ($value =~ m/\S/);

            print "$oid: $label = $value\n";
        }
    }

    print $out_fh "$text$text_extra";
}
200
201
202
# Placeholder for inserting a document's data into the jenaTDB store.
#
# Parameters: $doc_obj (document object), $file and $mode (both unused).
#
# The code assumes jenaTDB replaces an entry when given a doc id that
# already exists, so the "add" and "update" modes need no special handling.
#
# At present this only works out the relevant file names and prints a
# progress line to $self->{'outhandle'}; the command that would perform the
# insertion is commented out below.
sub texteditADB {
    my ($self, $doc_obj, $file, $mode) = @_;

    my $log_fh = $self->{'outhandle'};

    my $archives_dir = $self->{'source_dir'}; # typically the archives dir
    my $building_dir = $self->{'build_dir'};

    # full path to adb database
    my $adb_path = &util::filename_cat($building_dir, "jenaTDB", "lsh-features.adb");

    # get doc id
    my $oid = $doc_obj->get_OID();

    # map to assoc dir
    my $top = $doc_obj->get_top_section();
    my $assoc_rel = $doc_obj->get_metadata_element($top, "assocfilepath");
    my $assoc_dir = &util::filename_cat($archives_dir, $assoc_rel);

    my $chr12_path    = &util::filename_cat($assoc_dir, "doc.chr12");
    my $powerlog_path = &util::filename_cat($assoc_dir, "doc.power");

    print $log_fh "  Inserting tripples for $oid\n";

#    my $cmd = "jenaTDB -d \"$adb_path\" -I -k \"$oid\" -f \"$chr12_path\" -w \"$powerlog_path\"";

#    my $status = system($cmd);
#    if ($status != 0) {
#        print STDERR "Error: failed to run:\n  $cmd\n$!\n";
#    }

}
244
# Process a newly added document: forwards to textedit() with mode "add".
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
251
# Process a changed document: forwards to textedit() with mode "update".
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
259
# Handle removal of a document.  Deletion is not implemented: the arguments
# are accepted but unused, and only a warning is printed to STDERR.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    # $self->textedit($doc_obj,$file,"delete");

    print {*STDERR} "Warning: jenaTDB command-line does not currently support delete operation\n";
}
270
271
272
273
274
2751;
Note: See TracBrowser for help on using the browser.