source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28391

Last change on this file since 28391 was 28391, checked in by davidb, 11 years ago

Structure adding to triple store as part of the 'orthogonal' extrabuilder phase

File size: 6.6 KB
Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32#no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use util;
35
36use extrabuildproc;
37
38
# Establish the inheritance chain at compile time: this package derives
# from extrabuildproc.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
42
# Constructor.
#
# Creates the object through the extrabuildproc constructor, forwarding
# every remaining argument unchanged, and then blesses the result into
# the class the constructor was invoked on.
#
# Declared without a prototype: the previous empty prototype "()" claimed
# the sub takes no arguments even though it forwards its argument list.
# The parent constructor is called with arrow syntax rather than the
# ambiguous indirect-object form "new extrabuildproc (...)".
#
# Returns the blessed object.
sub new
{
    my $class = shift @_;

    my $self = extrabuildproc->new(@_);

    return bless $self, $class;
}
51
52
# Add or update the triples for one document.
#
# Parameters:
#   $doc_obj - document object; get_OID(), get_top_section() and
#              get_metadata_element() are called on it
#   $file    - not used by this method
#   $mode    - "add" or "update"; deliberately ignored (see below)
#
# The code is written on the assumption that jenaTDB performs a replace
# operation when presented with a docid that already exists, so nothing
# special is needed to distinguish a mode of "add" from "update".
#
# The jenaTDB command itself is still disabled, so at present the only
# visible effect is a progress line written to $self->{'outhandle'}.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $mode) = @_;

    my $outhandle = $self->{'outhandle'};

    my $source_dir = $self->{'source_dir'}; # typically the archives dir
    my $build_dir  = $self->{'build_dir'};

    # The paths below are only consumed by the disabled command further
    # down; they are kept so that the command can be re-enabled as is.

    # full path to adb database
    my $adb_filename
        = util::filename_cat($build_dir, "jenaTDB", "lsh-features.adb");

    # get doc id
    my $doc_oid = $doc_obj->get_OID();

    # map to assoc dir
    my $top_section = $doc_obj->get_top_section();
    my $assoc_file
        = $doc_obj->get_metadata_element($top_section, "assocfilepath");
    my $assoc_filename = util::filename_cat($source_dir, $assoc_file);

    my $chr12_filename    = util::filename_cat($assoc_filename, "doc.chr12");
    my $powerlog_filename = util::filename_cat($assoc_filename, "doc.power");

    print $outhandle " Inserting triples for $doc_oid\n";

#    my $cmd = "jenaTDB -d \"$adb_filename\" -I -k \"$doc_oid\" -f \"$chr12_filename\" -w \"$powerlog_filename\"";

#    my $status = system($cmd);
#    if ($status != 0) {
#        print STDERR "Error: failed to run:\n  $cmd\n$!\n";
#    }

}
94
# Handle a document that is being added: delegates to textedit() with
# mode "add" and returns whatever textedit() returns.
#
# Parameters:
#   $doc_obj - document object, passed through unchanged
#   $file    - passed through unchanged
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
101
# Handle a document that is being reindexed: delegates to textedit() with
# mode "update" and returns whatever textedit() returns.
#
# Parameters:
#   $doc_obj - document object, passed through unchanged
#   $file    - passed through unchanged
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
109
# Handle a document that is being deleted.
#
# Nothing is removed: the method only writes a warning to STDERR saying
# that the delete operation is not supported. The arguments are accepted
# for interface compatibility and are otherwise unused.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    my $warning = "Warning: jenaTDB command-line does not currently support delete operation\n";

    # The delegation that would perform the delete stays disabled:
    # $self->textedit($doc_obj,$file,"delete");

    print STDERR $warning;
}
120
121
122
123
124
125
# Write the indexable text of one document to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - document object to emit
#
# Nothing is written unless the document's doc type is "indexed_doc".
# $self->{'index'} is read as "level:field,field,..."; the field names
# "all" and "topall" are expanded to Title/Creator/text and their "top"
# variants. A field prefixed with "top" is only emitted for the first
# section of the document.
#
# Output layout: every emitted field is followed by \cC and every section
# by \cB. For level "document" the \cB markers are collected and written
# after all of the text instead of in between.
#
# Counters updated on $self: num_docs, num_sections, num_bytes and
# num_processed_bytes.
sub MGtext {
    my ($self, $doc_obj) = @_;
    my $out_fh = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $in_subcollection = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $section_count = 0;  # sections seen so far in this document
    my $body          = "";
    my $trailer       = "";

    # walk the sections of this document in order
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        # update a few statistics
        $section_count++;
        $self->{'num_sections'} += 1;

        my $section_type = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        my $section_is_indexed
            = ($section_type eq "indexed_section" || $section_type eq "indexed_doc");

        if ($in_subcollection && $section_is_indexed) {
            $self->{'num_bytes'} += $doc_obj->get_text_length($section);

            FIELD:
            foreach my $field (split (/,/, $fields)) {
                # The "top" prefix is always stripped; a field that had it
                # is only dealt with in the first section.
                my $real_field  = $field;
                my $is_top_only = ($real_field =~ s/^top//);
                next FIELD if ($is_top_only && $section_count != 1);

                my $new_text = "";

                if ($level eq "dummy") {
                    # a dummy index is a special case used when no
                    # indexes are specified (since there must always be
                    # at least one index or we can't retrieve the
                    # compressed text) - we add a small amount of text
                    # to these dummy indexes which will never be seen
                    # but will overcome mg's problems with building
                    # empty indexes
                    $new_text = "this is dummy text to stop mg barfing";
                    $self->{'num_processed_bytes'} += length($new_text);
                }
                elsif ($real_field eq "text") {
                    if ($self->{'store_text'}) {
                        $new_text = $doc_obj->get_text($section);
                    }
                    $self->{'num_processed_bytes'} += length($new_text);
                    $new_text =~ s/[\cB\cC]//g;
                    # $new_text is passed as the variable itself, not a copy
                    $self->find_paragraphs($new_text);
                }
                else {
                    # remove ex. namespace iff it's the only namespace
                    # prefix (will leave ex.dc.* intact)
                    $real_field =~ s/^ex\.([^.]+)$/$1/;

                    my @values = @{$doc_obj->get_metadata($section, $real_field)};

                    # a non-top section may additionally pick up the
                    # metadata of the top section, depending on the
                    # 'sections_index_document_metadata' setting
                    my $may_use_doc_metadata
                        =  $level eq "section"
                        && $section ne $doc_obj->get_top_section()
                        && $self->{'indexing_text'}
                        && defined ($self->{'sections_index_document_metadata'});
                    if ($may_use_doc_metadata) {
                        my $policy = $self->{'sections_index_document_metadata'};
                        if ($policy eq "always"
                            || (scalar(@values) == 0 && $policy eq "unless_section_metadata_exists")) {
                            push (@values, @{$doc_obj->get_metadata($doc_obj->get_top_section(), $real_field)});
                        }
                    }

                    # strip the separator characters and count the bytes
                    foreach my $value (@values) {
                        $value =~ s/[\cB\cC]//g;
                        $self->{'num_processed_bytes'} += length($value);
                    }

                    # values are separated by \cC; the separators are
                    # emitted even when the values themselves are not stored
                    my $keep_values = $self->{'store_text'};
                    $new_text = join("\cC", map { $keep_values ? $_ : "" } @values);
                }

                # filter the text
                $new_text = $self->filter_text($field, $new_text);

                $body .= $new_text . "\cC";
            }
        }

        # mark the end of the section
        if ($level eq "document") {
            $trailer .= "\cB";
        }
        else {
            $body .= "\cB";
        }
    }

    print {$out_fh} $body . $trailer;
}
216
217
2181;
Note: See TracBrowser for help on using the repository browser.