source: main/trunk/greenstone2/perllib/manifest.pm@ 28211

Last change on this file since 28211 was 28211, checked in by ak19, 11 years ago

No more absolute paths in archiveinf-doc.gdb and archiveinf-src.gdb

  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1###########################################################################
2#
3# manifest.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006-2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package manifest;
28
29use strict;
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use XMLParser;
33use dbutil;
34
35our $self;
36
37sub new {
38 my ($class) = shift (@_);
39 my ($infodbtype,$archivedir) = @_;
40
41 $self = {} ;
42 # we can now mark a manifest with a version number
43 $self->{'version'} = 1;
44 $self->{'index'} = {};
45 $self->{'reindex'} = {};
46 $self->{'delete'} = {};
47
48 my $arcinfo_doc_filename
49 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
50
51 if (-e $arcinfo_doc_filename) {
52 # Only store the infodb-doc filename if it exists
53 # If it doesn't exist then this means the collection has not been
54 # built yet (or else the archives folder has been deleted).
55 # Either way we have no way to look up which files
56 # are associated with an OID. If we we encounter an OID
57 # tag later on, we will use the fact that this field is
58 # not defined to issue a warning
59
60 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
61 $self->{'_infodbtype'} = $infodbtype;
62 }
63
64 return bless $self, $class;
65}
66
67# /** @function get_version()
68# */
69sub get_version
70{
71 my $self = shift(@_);
72 return $self->{'version'};
73}
74# /** get_version() **/
75
76sub parse
77{
78 my ($self) = shift (@_);
79 my ($filename) = @_;
80
81 my $parser = new XML::Parser('Style' => 'Stream',
82 'Handlers' => {'Char' => \&Char,
83 'XMLDecl' => \&XMLDecl,
84 'Entity' => \&Entity,
85 'Doctype' => \&Doctype,
86 'Default' => \&Default
87 });
88
89 $parser->parsefile($filename);
90}
91
92sub StartDocument {$self->xml_start_document(@_);}
93sub XMLDecl {$self->xml_xmldecl(@_);}
94sub Entity {$self->xml_entity(@_);}
95sub Doctype {$self->xml_doctype(@_);}
96sub StartTag {$self->xml_start_tag(@_);}
97sub EndTag {$self->xml_end_tag(@_);}
98sub Text {$self->xml_text(@_);}
99sub PI {$self->xml_pi(@_);}
100sub EndDocument {$self->xml_end_document(@_);}
101sub Default {$self->xml_default(@_);}
102
103# This Char function overrides the one in XML::Parser::Stream to overcome a
104# problem where $expat->{Text} is treated as the return value, slowing
105# things down significantly in some cases.
106sub Char {
107 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
108 $_[0]->{'Text'} .= $_[1];
109 return undef;
110}
111
112# Called at the beginning of the XML document.
113sub xml_start_document {
114 my $self = shift(@_);
115 my ($expat) = @_;
116
117}
118
119# Called for XML declarations
120sub xml_xmldecl {
121 my $self = shift(@_);
122 my ($expat, $version, $encoding, $standalone) = @_;
123}
124
125# Called for XML entities
126sub xml_entity {
127 my $self = shift(@_);
128 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
129}
130
131# Called for DOCTYPE declarations - use die to bail out if this doctype
132# is not meant for this plugin
133sub xml_doctype {
134 my $self = shift(@_);
135 my ($expat, $name, $sysid, $pubid, $internal) = @_;
136 die "Manifest Cannot process XML document with DOCTYPE of $name";
137}
138
139# Called for every start tag. The $_ variable will contain a copy of the
140# tag and the %_ variable will contain the element's attributes.
141sub xml_start_tag
142{
143 my $self = shift(@_);
144 my ($expat, $element) = @_;
145 my $attributes = \%_;
146
147 if (($element eq "Filename") || ($element eq "OID"))
148 {
149 $self->{'item-val'} = "";
150 }
151 elsif ($element eq "Manifest")
152 {
153 if (defined $attributes->{'version'})
154 {
155 $self->{'version'} = $attributes->{'version'};
156 }
157 }
158 else
159 {
160 if (defined($self->{'file-type'}))
161 {
162 print STDERR "Warning: Malformed XML manifest\n";
163 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
164 }
165 else
166 {
167 my $filetype = lc($element);
168 $self->{'file-type'} = $filetype;
169 if (!defined $self->{$filetype})
170 {
171 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
172 }
173 }
174
175 }
176}
177
178# Called for every end tag. The $_ variable will contain a copy of the tag.
179sub xml_end_tag
180{
181 my $self = shift(@_);
182 my ($expat, $element) = @_;
183
184 if ($element eq "Filename")
185 {
186 my $filetype = $self->{'file-type'};
187 my $filename = $self->{'item-val'};
188
189 $self->{$filetype}->{$filename} = 1;
190 $self->{'item-val'} = undef;
191 }
192 elsif ($element eq "OID") {
193 # look up src and assoc filenames used by this doc oid
194
195 my $filetype = $self->{'file-type'};
196 my $oid = $self->{'item-val'};
197
198 if (defined $self->{'_infodbtype'}) {
199
200
201 my $infodbtype = $self->{'_infodbtype'};
202 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
203
204 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
205
206 my $doc_source_file = $doc_rec->{'src-file'}->[0];
207 my $assoc_files = $doc_rec->{'assoc-file'};
208 my @all_files = ($doc_source_file);
209 push(@all_files,@$assoc_files) if defined $assoc_files;
210
211 foreach my $filename (@all_files) {
212
213 $filename = &util::placeholders_to_abspath($filename);
214
215 if (!&util::filename_is_absolute($filename)) {
216 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
217 }
218
219 $self->{$filetype}->{$filename} = 1;
220 }
221 }
222 else {
223 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
224 print STDERR " Unable to look up source files that constitute document $oid.\n";
225 }
226
227 $self->{'item-val'} = undef;
228 }
229 else
230 {
231 $self->{'file-type'} = undef;
232 }
233}
234
235# Called just before start or end tags with accumulated non-markup text in
236# the $_ variable.
237sub xml_text {
238 my $self = shift(@_);
239 my ($expat) = @_;
240
241 if (defined $self->{'item-val'}) {
242 my $text = $_;
243 chomp($text);
244
245 $text =~ s/^\s+//;
246 $text =~ s/\s+$//;
247
248 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
249 }
250}
251
252# Called for processing instructions. The $_ variable will contain a copy
253# of the pi.
254sub xml_pi {
255 my $self = shift(@_);
256 my ($expat, $target, $data) = @_;
257}
258
259# Called at the end of the XML document.
260sub xml_end_document {
261 my $self = shift(@_);
262 my ($expat) = @_;
263
264 if (defined $self->{'import'}) {
265 print STDERR "Warning: <Import> tag is deprecated.\n";
266 print STDERR " Processing data as if it were tagged as <Index>\n";
267 $self->{'index'} = $self->{'import'};
268 }
269
270}
271
272# Called for any characters not handled by the above functions.
273sub xml_default {
274 my $self = shift(@_);
275 my ($expat, $text) = @_;
276}
277
278
2791;
Note: See TracBrowser for help on using the repository browser.