Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28391

Last change on this file since 28391 was 28391, checked in by davidb, 11 years ago
Structure for adding to the triple store as part of the 'orthogonal' extrabuilder phase
File size: 6.6 KB

Line
1	##########################################################################
2	#
3	# jenaTDBBuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package jenaTDBBuildproc;
30
31	use strict;
32	#no strict 'refs'; # allow filehandles to be variables and viceversa
33
34	use util;
35
36	use extrabuildproc;
37
38
# Establish the inheritance chain at compile time: this build processor
# derives from extrabuildproc.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
42
# new -- constructor.
#
# Builds the object through the extrabuildproc constructor, forwarding every
# argument after the class name unchanged, and then blesses the result into
# the class the caller asked for.
#
# Returns the blessed object.
sub new
{
    my $class = shift @_;

    # Direct class-method call: avoids the indirect-object form
    # ("new extrabuildproc (...)"), whose parse depends on what names are
    # already known to the compiler.
    my $self = extrabuildproc->new(@_);

    return bless $self, $class;
}
51
52
# textedit -- common worker used by text() and textreindex().
#
# Arguments (after $self): $doc_obj, $file, $mode.
#
# $mode is accepted but never consulted.  The code is written on the
# assumption that jenaTDB performs a replace when it is handed a docid that
# already exists, so "add" and "update" need no separate treatment.
#
# The external jenaTDB invocation is currently disabled (see the commented
# template at the end); for now the routine only works out the relevant
# paths and writes a progress line to $self->{'outhandle'}.
sub textedit {
    my ($self, $doc_obj, $file, $mode) = @_;

    my $out = $self->{'outhandle'};

    my $archives_dir = $self->{'source_dir'};    # typically the archives dir
    my $building_dir = $self->{'build_dir'};

    # Full path to the adb database.
    my $adb_filename = util::filename_cat($building_dir, "jenaTDB", "lsh-features.adb");

    # Document identifier.
    my $doc_oid = $doc_obj->get_OID();

    # Locate the document's associated-files directory under the source dir.
    my $root_section = $doc_obj->get_top_section();
    my $assoc_path   = $doc_obj->get_metadata_element($root_section, "assocfilepath");
    my $assoc_dir    = util::filename_cat($archives_dir, $assoc_path);

    my $chr12_filename    = util::filename_cat($assoc_dir, "doc.chr12");
    my $powerlog_filename = util::filename_cat($assoc_dir, "doc.power");

    # Keep this print as the final evaluated statement: its result is the
    # sub's (implicit) return value.
    print {$out} " Inserting tripples for $doc_oid\n";

    # Disabled command template:
    #
    # my $cmd = "jenaTDB -d \"$adb_filename\" -I -k \"$doc_oid\" -f \"$chr12_filename\" -w \"$powerlog_filename\"";
    #
    # my $status = system($cmd);
    # if ($status != 0) {
    #     print STDERR "Error: failed to run:\n $cmd\n$!\n";
    # }
}
94
# text -- handle a newly added document by delegating to textedit()
# with mode "add".
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
101
# textreindex -- handle a changed document by delegating to textedit()
# with mode "update".
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
109
# textdelete -- handle a removed document.
#
# Deletion is not implemented: the routine only emits a warning on STDERR.
# The arguments are unpacked for symmetry with the other text* handlers but
# are not used.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    # Would be: $self->textedit($doc_obj,$file,"delete");

    my $notice = "Warning: jenaTDB command-line does not currently support delete operation\n";

    # Keep the print last: its result is the sub's (implicit) return value.
    print STDERR $notice;
}
120
121
122
123
124
125
# MGtext -- assemble and emit the indexing text for one document.
#
# Arguments (after $self):
#   $doc_obj - the document object to process.
#
# Reads from $self:
#   output_handle  - filehandle the assembled text is printed to
#   index          - "level:field,field,..." specification
#   store_text     - when false, section text / metadata values are not
#                    copied into the output (separators still are)
#   indexing_text, sections_index_document_metadata
#                  - control whether top-section metadata is also added to
#                    lower sections for a "section" level index
#
# Updates the counters num_docs, num_sections, num_bytes and
# num_processed_bytes on $self.
#
# Output layout: every field's (filtered) text is followed by \cC; a \cB is
# added once per section, either directly after that section's text or,
# when the level is "document", collected and appended after all the text.
#
# Returns nothing for documents whose type is not "indexed_doc"; otherwise
# the result of the final print.
sub MGtext {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output; the pseudo-fields "all" and
    # "topall" expand to fixed field lists
    my ($level, $fields) = split (/:/, $self->{'index'});
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # get the text for this document, one section at a time
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        # a section without a gsdldoctype is treated as an indexed section
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section (the substitution both tests
                # for and strips the "top" prefix)
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    if ($level eq "dummy") {
                        # a dummy index is a special case used when no
                        # indexes are specified (since there must always be
                        # at least one index or we can't retrieve the
                        # compressed text) - we add a small amount of text
                        # to these dummy indexes which will never be seen
                        # but will overcome mg's problems with building
                        # empty indexes
                        $new_text = "this is dummy text to stop mg barfing";
                        $self->{'num_processed_bytes'} += length ($new_text);

                    } elsif ($real_field eq "text") {
                        $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
                        $self->{'num_processed_bytes'} += length ($new_text);
                        # strip the control characters used as separators
                        $new_text =~ s/[\cB\cC]//g;
                        $self->find_paragraphs($new_text);

                    } else {
                        my $first = 1;
                        # remove ex. namespace iff it's the only namespace
                        # prefix (will leave ex.dc.* intact)
                        $real_field =~ s/^ex\.([^.]+)$/$1/;
                        # copy the list so the cleanup below works on our
                        # own array
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
                        if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
                            }
                        }
                        # join the values with \cC between them
                        foreach my $meta (@section_metadata) {
                            $meta =~ s/[\cB\cC]//g;
                            $self->{'num_processed_bytes'} += length ($meta);
                            $new_text .= "\cC" unless $first;
                            $new_text .= $meta if $self->{'store_text'};
                            $first = 0;
                        }
                    }

                    # filter the text
                    $new_text = $self->filter_text ($field, $new_text);

                    $text .= "$new_text\cC";
                }
            }
        }

        # section terminator: deferred to the end for a document-level index
        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section($section);
    }

    print {$handle} "$text$text_extra";
}
216
217
218	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: