source: gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm@ 28392

Last change on this file since 28392 was 28392, checked in by davidb, 11 years ago

Simple version that looks for a hardwired title as metadata

File size: 4.3 KB
RevLine 
[28391]1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
[28392]32no strict 'refs'; # allow filehandles to be variables and viceversa
[28391]33
34use util;
35
36use extrabuildproc;
37
38
# Set up inheritance at compile time: jenaTDBBuildproc derives from
# extrabuildproc.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
42
# Constructor.
#
# Delegates construction to the parent class (extrabuildproc), handing all
# remaining arguments through unchanged, then re-blesses the resulting
# object into the requested class.
#
# Arguments: $class, followed by whatever extrabuildproc->new() accepts.
# Returns:   the object produced by extrabuildproc->new(), blessed into $class.
sub new {
    my $class = shift @_;

    # Use explicit method-call syntax.  The indirect-object form
    # ("new extrabuildproc (...)") is resolved by parser heuristics and can
    # be misread as a plain function call.
    my $self = extrabuildproc->new(@_);

    return bless $self, $class;
}
51
52
[28392]53
54
# Processes one document: visits its sections in order and, for each
# section, reports on STDOUT every non-blank title found under a fixed set
# of metadata names.  No text is accumulated yet, so an empty string is what
# finally gets written to the 'output_handle' stream.
#
# Arguments: $doc_obj - document object; this routine calls its get_OID(),
#                       get_top_section(), get_next_section() and
#                       get_metadata_element() methods.  Any further
#                       arguments supplied by callers are ignored.
# Returns:   the result of the final print to the output handle.
sub textedit {
    my ($self, $doc_obj) = @_;

    my $out_fh = $self->{'output_handle'};
    my $oid    = $doc_obj->get_OID();

    # Metadata name to look up, paired with the label used when reporting
    # it.  Note that the last label differs from its metadata name.
    my @title_fields = (
        [ "Title",        "Title"     ],
        [ "dc.Title",     "dc.Title"  ],
        [ "ex.ID3.Title", "id3.Title" ],
    );

    # Placeholders for the text to emit; nothing fills them in at present.
    my $body_text  = "";
    my $extra_text = "";

    for (my $sec = $doc_obj->get_top_section();
         defined $sec;
         $sec = $doc_obj->get_next_section($sec)) {

        foreach my $field (@title_fields) {
            my ($meta_name, $label) = @{$field};
            my $value = $doc_obj->get_metadata_element($sec, $meta_name);

            # Skip metadata that is missing or consists only of whitespace.
            next unless defined $value;
            next unless $value =~ m/\S/;

            print "$oid: $label = $value\n";
        }
    }

    print {$out_fh} "$body_text$extra_text";
}
97
98
99
# Works out the database path and the per-document associated-file paths,
# then announces the insertion on the 'outhandle' stream.  The external
# command that would perform the insertion is currently commented out, so
# the computed paths are not used for anything yet.
#
# The code is written on the assumption that jenaTDB does a replace
# operation when presented with a docid that already exists, so nothing
# special is needed to distinguish a mode of "add" from "update".
#
# Arguments: $doc_obj - document object (get_OID(), get_top_section() and
#                       get_metadata_element() are called on it)
#            $file    - unused
#            $mode    - unused
# Returns:   the result of the print to 'outhandle'.
sub texteditADB {
    my ($self, $doc_obj, $file, $mode) = @_;

    my $log_fh = $self->{'outhandle'};

    # source_dir is typically the archives dir
    my ($archives_dir, $build_dir) = @{$self}{ 'source_dir', 'build_dir' };

    # full path to adb database
    my $db_path
        = &util::filename_cat($build_dir, "jenaTDB", "lsh-features.adb");

    my $oid = $doc_obj->get_OID();

    # map the document to its associated-files directory
    my $assoc_rel = $doc_obj->get_metadata_element(
        $doc_obj->get_top_section(), "assocfilepath");
    my $assoc_dir = &util::filename_cat($archives_dir, $assoc_rel);

    my $chr12_path = &util::filename_cat($assoc_dir, "doc.chr12");
    my $power_path = &util::filename_cat($assoc_dir, "doc.power");

    # Disabled insertion command, kept for reference:
    #
    #   my $cmd = "jenaTDB -d \"$db_path\" -I -k \"$oid\" -f \"$chr12_path\" -w \"$power_path\"";
    #
    #   my $status = system($cmd);
    #   if ($status != 0) {
    #       print STDERR "Error: failed to run:\n $cmd\n$!\n";
    #   }

    print {$log_fh} " Inserting tripples for $oid\n";
}
141
# Handles a document being added by delegating to textedit() with the mode
# string "add".
#
# Arguments: $doc_obj - the document object to process
#            $file    - passed straight through to textedit()
# Returns:   whatever textedit() returns.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
148
# Handles a document being reindexed by delegating to textedit() with the
# mode string "update".
#
# Arguments: $doc_obj - the document object to process
#            $file    - passed straight through to textedit()
# Returns:   whatever textedit() returns.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
156
# Handles a document being deleted.  Deletion is not carried out: a warning
# is written to STDERR and the document is otherwise left alone.
#
# Arguments: $doc_obj, $file - accepted for interface compatibility; unused.
# Returns:   the result of the print to STDERR.
sub textdelete {
    my ($self, $doc_obj, $file) = @_;

    # What a delete would look like once it is supported:
    #   $self->textedit($doc_obj,$file,"delete");

    my $warning
        = "Warning: jenaTDB command-line does not currently support delete operation\n";

    print STDERR $warning;
}
167
168
169
170
171
1721;
Note: See TracBrowser for help on using the repository browser.