root/gs2-extensions/apache-jena/trunk/src/perllib/jenaTDBBuildproc.pm @ 28392

Revision 28392, 4.3 KB (checked in by davidb, 6 years ago)

Simple version that looks for a hardwired title as metadata

Line 
1##########################################################################
2#
3# jenaTDBBuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package jenaTDBBuildproc;
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use util;
35
36use extrabuildproc;
37
38
# Declare the parent class at compile time, so the inheritance chain is
# already in place before any method of this package is resolved.
BEGIN {
    @jenaTDBBuildproc::ISA = qw(extrabuildproc);
}
42
# Constructor.
#
# Builds the object with the parent class constructor (extrabuildproc),
# forwarding every remaining argument untouched, and then re-blesses the
# result into $class so that the class the constructor was invoked on is
# the one used for method dispatch.
#
#   $class - class name the constructor was invoked on
#   @_     - constructor arguments, passed straight through to the parent
#
# Returns the blessed object.
#
# The sub is declared without a prototype: an empty prototype claims the
# sub takes no arguments, which is wrong here, and prototypes are ignored on
# method calls anyway.  The parent constructor is called with the explicit
# Class->method form because indirect-object syntax ("new Class (...)") is
# resolved by parser heuristics and can be misread as a plain function call.
sub new {
    my $class = shift @_;

    my $self = extrabuildproc->new(@_);

    return bless $self, $class;
}
51
52
53
54
# Reports the title metadata of every section of a document.
#
#   $self    - build processor; its 'output_handle' entry is the handle that
#              receives the (currently always empty) document text
#   $doc_obj - document object; must provide get_OID, get_top_section,
#              get_next_section and get_metadata_element
#
# For each section, starting at the top section and following
# get_next_section until it returns undef, the "Title", "dc.Title" and
# "ex.ID3.Title" metadata elements are looked up in that order.  Every value
# that is defined and contains at least one non-whitespace character is
# printed to the currently selected output handle (STDOUT unless changed) as
# "<oid>: <label> = <value>".
#
# Returns the result of the final print to the output handle.
sub textedit {
    my ($self, $doc_obj) = @_;

    my $out = $self->{'output_handle'};
    my $oid = $doc_obj->get_OID();

    # Pairs of [metadata element to fetch, label shown in the report].
    # Note that the ID3 title is fetched as "ex.ID3.Title" but reported
    # under the shorter label "id3.Title".
    my @title_fields = (
        [ "Title",        "Title"     ],
        [ "dc.Title",     "dc.Title"  ],
        [ "ex.ID3.Title", "id3.Title" ],
    );

    my $sections_seen = 0;   # per-document section count (not reported)

    # Nothing is accumulated into these yet, so the final print emits "".
    my $body       = "";
    my $body_extra = "";

    for (my $sec = $doc_obj->get_top_section();
         defined $sec;
         $sec = $doc_obj->get_next_section($sec)) {

        $sections_seen++;

        foreach my $field (@title_fields) {
            my ($element, $label) = @{$field};

            my $value = $doc_obj->get_metadata_element($sec, $element);

            # Skip metadata that is missing or made up only of whitespace.
            next unless defined $value;
            next unless $value =~ m/\S/;

            print "$oid: $label = $value\n";
        }
    }

    print {$out} $body . $body_extra;
}
97
98
99
# Prepares the file names needed to insert one document into the jenaTDB
# database and logs the insertion; the insertion command itself is disabled.
#
#   $self    - build processor; reads 'outhandle', 'source_dir' and
#              'build_dir'
#   $doc_obj - document object; must provide get_OID, get_top_section and
#              get_metadata_element
#   $file    - accepted but not used here
#   $mode    - accepted but not used here (see the note on add/update below)
#
# Returns the result of printing the log line to 'outhandle'.
#
# None of the entry points visible in this file (text, textreindex,
# textdelete) call this routine.
sub texteditADB {
    my ($self, $doc_obj, $file, $mode) = @_;

    # This was written on the assumption that jenaTDB performs a replace
    # when it is handed a document id that already exists, so nothing
    # special is needed to tell a mode of "add" apart from "update".

    my $log = $self->{'outhandle'};

    my $archives_dir = $self->{'source_dir'};   # typically the archives dir
    my $index_dir    = $self->{'build_dir'};

    # Full path to the adb database inside the build directory.
    my $db_path
        = util::filename_cat($index_dir, "jenaTDB", "lsh-features.adb");

    my $oid = $doc_obj->get_OID();

    # The top section's "assocfilepath" metadata, appended to the source
    # directory, gives the directory holding the document's associated files.
    my $root_section = $doc_obj->get_top_section();
    my $assoc_path
        = $doc_obj->get_metadata_element($root_section, "assocfilepath");
    my $assoc_dir = util::filename_cat($archives_dir, $assoc_path);

    my $chr12_path = util::filename_cat($assoc_dir, "doc.chr12");
    my $power_path = util::filename_cat($assoc_dir, "doc.power");

    # Disabled insertion command, kept for reference.  If it is revived,
    # prefer the list form of system() so the paths never reach a shell:
    #
    #   my $cmd = "jenaTDB -d \"$db_path\" -I -k \"$oid\" -f \"$chr12_path\" -w \"$power_path\"";
    #   my $status = system($cmd);
    #   if ($status != 0) {
    #       print STDERR "Error: failed to run:\n  $cmd\n$!\n";
    #   }

    print {$log} "  Inserting tripples for $oid\n";
}
141
# Entry point for a document being added: hands the document and file to
# textedit together with the mode string "add".
#
#   $doc_obj - document object to process
#   $file    - file argument, forwarded unchanged
#
# Returns whatever textedit returns.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
148
# Entry point for a document being reindexed: hands the document and file
# to textedit together with the mode string "update".
#
#   $doc_obj - document object to process
#   $file    - file argument, forwarded unchanged
#
# Returns whatever textedit returns.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
156
# Entry point for a document being deleted.  Deletion is not carried out:
# the routine only writes a warning to STDERR and leaves the document alone.
#
#   $doc_obj - document object (accepted, not used)
#   $file    - file argument (accepted, not used)
#
# Returns the result of the print to STDERR.
sub textdelete {
    my ($self, $doc_obj, $file) = @_;

    # Once deletion is supported, the call would be routed like this:
    # $self->textedit($doc_obj,$file,"delete");

    return print STDERR "Warning: jenaTDB command-line does not currently support delete operation\n";
}
167
168
169
170
171
1721;
Note: See TracBrowser for help on using the browser.