source: main/trunk/greenstone2/perllib/manifest.pm@ 24950

Last change on this file since 24950 was 23485, checked in by davidb, 13 years ago

read_infodb_entry now returns a hashmap directly. Code updated to take advantage of this, and in places where the hashmap is not needed, the alternative read_infodb_rawentry is called.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# manifest.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006-2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package manifest;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use XMLParser;
34use dbutil;
35
36our $self;
37
38sub new {
39 my ($class) = shift (@_);
40 my ($infodbtype,$archivedir) = @_;
41
42 $self = {} ;
43
44 $self->{'index'} = {};
45 $self->{'reindex'} = {};
46 $self->{'delete'} = {};
47
48 my $arcinfo_doc_filename
49 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
50
51 if (-e $arcinfo_doc_filename) {
52 # Only store the infodb-doc filename if it exists
53 # If it doesn't exist then this means the collection has not been
54 # built yet (or else the archives folder has been deleted).
55 # Either way we have no way to look up which files
56 # are associated with an OID. If we we encounter an OID
57 # tag later on, we will use the fact that this field is
58 # not defined to issue a warning
59
60 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
61 $self->{'_infodbtype'} = $infodbtype;
62 }
63
64 return bless $self, $class;
65}
66
67sub parse
68{
69 my ($self) = shift (@_);
70 my ($filename) = @_;
71
72 my $parser = new XML::Parser('Style' => 'Stream',
73 'Handlers' => {'Char' => \&Char,
74 'XMLDecl' => \&XMLDecl,
75 'Entity' => \&Entity,
76 'Doctype' => \&Doctype,
77 'Default' => \&Default
78 });
79
80 $parser->parsefile($filename);
81}
82
83sub StartDocument {$self->xml_start_document(@_);}
84sub XMLDecl {$self->xml_xmldecl(@_);}
85sub Entity {$self->xml_entity(@_);}
86sub Doctype {$self->xml_doctype(@_);}
87sub StartTag {$self->xml_start_tag(@_);}
88sub EndTag {$self->xml_end_tag(@_);}
89sub Text {$self->xml_text(@_);}
90sub PI {$self->xml_pi(@_);}
91sub EndDocument {$self->xml_end_document(@_);}
92sub Default {$self->xml_default(@_);}
93
94# This Char function overrides the one in XML::Parser::Stream to overcome a
95# problem where $expat->{Text} is treated as the return value, slowing
96# things down significantly in some cases.
97sub Char {
98 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
99 $_[0]->{'Text'} .= $_[1];
100 return undef;
101}
102
103# Called at the beginning of the XML document.
104sub xml_start_document {
105 my $self = shift(@_);
106 my ($expat) = @_;
107
108}
109
110# Called for XML declarations
111sub xml_xmldecl {
112 my $self = shift(@_);
113 my ($expat, $version, $encoding, $standalone) = @_;
114}
115
116# Called for XML entities
117sub xml_entity {
118 my $self = shift(@_);
119 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
120}
121
122# Called for DOCTYPE declarations - use die to bail out if this doctype
123# is not meant for this plugin
124sub xml_doctype {
125 my $self = shift(@_);
126 my ($expat, $name, $sysid, $pubid, $internal) = @_;
127 die "Manifest Cannot process XML document with DOCTYPE of $name";
128}
129
130# Called for every start tag. The $_ variable will contain a copy of the
131# tag and the %_ variable will contain the element's attributes.
132sub xml_start_tag
133{
134 my $self = shift(@_);
135 my ($expat, $element) = @_;
136
137 if (($element eq "Filename") || ($element eq "OID"))
138 {
139 $self->{'item-val'} = "";
140 }
141 elsif ($element eq "Manifest") {
142 }
143 else
144 {
145 if (defined($self->{'file-type'}))
146 {
147 print STDERR "Warning: Malformed XML manifest\n";
148 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
149 }
150 else {
151 my $filetype = lc($element);
152 $self->{'file-type'} = $filetype;
153 if (!defined $self->{$filetype}) {
154 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
155 }
156 }
157
158 }
159}
160
161# Called for every end tag. The $_ variable will contain a copy of the tag.
162sub xml_end_tag
163{
164 my $self = shift(@_);
165 my ($expat, $element) = @_;
166
167 if ($element eq "Filename")
168 {
169 my $filetype = $self->{'file-type'};
170 my $filename = $self->{'item-val'};
171
172 $self->{$filetype}->{$filename} = 1;
173 $self->{'item-val'} = undef;
174 }
175 elsif ($element eq "OID") {
176 # look up src and assoc filenames used by this doc oid
177
178 my $filetype = $self->{'file-type'};
179 my $oid = $self->{'item-val'};
180
181 if (defined $self->{'_infodbtype'}) {
182
183
184 my $infodbtype = $self->{'_infodbtype'};
185 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
186
187 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
188
189 my $doc_source_file = $doc_rec->{'src-file'}->[0];
190 my $assoc_files = $doc_rec->{'assoc-file'};
191 my @all_files = ($doc_source_file);
192 push(@all_files,@$assoc_files) if defined $assoc_files;
193
194 foreach my $filename (@all_files) {
195
196 if (!&util::filename_is_absolute($filename)) {
197 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
198 }
199
200 $self->{$filetype}->{$filename} = 1;
201 }
202 }
203 else {
204 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
205 print STDERR " Unable to look up source files that constitute document $oid.\n";
206 }
207
208 $self->{'item-val'} = undef;
209 }
210 else
211 {
212 $self->{'file-type'} = undef;
213 }
214}
215
216# Called just before start or end tags with accumulated non-markup text in
217# the $_ variable.
218sub xml_text {
219 my $self = shift(@_);
220 my ($expat) = @_;
221
222 if (defined $self->{'item-val'}) {
223 my $text = $_;
224 chomp($text);
225
226 $text =~ s/^\s+//;
227 $text =~ s/\s+$//;
228
229 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
230 }
231}
232
233# Called for processing instructions. The $_ variable will contain a copy
234# of the pi.
235sub xml_pi {
236 my $self = shift(@_);
237 my ($expat, $target, $data) = @_;
238}
239
240# Called at the end of the XML document.
241sub xml_end_document {
242 my $self = shift(@_);
243 my ($expat) = @_;
244
245 if (defined $self->{'import'}) {
246 print STDERR "Warning: <Import> tag is deprecated.\n";
247 print STDERR " Processing data as if it were tagged as <Index>\n";
248 $self->{'index'} = $self->{'import'};
249 }
250
251}
252
253# Called for any characters not handled by the above functions.
254sub xml_default {
255 my $self = shift(@_);
256 my ($expat, $text) = @_;
257}
258
259
2601;
Note: See TracBrowser for help on using the repository browser.