source: gs2-extensions/parallel-building/trunk/src/perllib/manifest.pm@ 26959

Last change on this file since 26959 was 26959, checked in by jmt12, 11 years ago

Adding the ability to include a version attribute in manifest files to indicate whether they are acting like a complex process expression, or whether they are a verbatim list of files to process (kinda like a manifest, right?)

File size: 7.4 KB
Line 
1###########################################################################
2#
3# manifest.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006-2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28package manifest;
29
30use strict;
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use XMLParser;
34use dbutil;
35
36our $self;
37
38sub new {
39 my ($class) = shift (@_);
40 my ($infodbtype,$archivedir) = @_;
41
42 $self = {} ;
43 $self->{'version'} = 0; # Default version
44 $self->{'index'} = {};
45 $self->{'reindex'} = {};
46 $self->{'delete'} = {};
47
48 my $arcinfo_doc_filename
49 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
50
51 if (-e $arcinfo_doc_filename) {
52 # Only store the infodb-doc filename if it exists
53 # If it doesn't exist then this means the collection has not been
54 # built yet (or else the archives folder has been deleted).
55 # Either way we have no way to look up which files
56 # are associated with an OID. If we we encounter an OID
57 # tag later on, we will use the fact that this field is
58 # not defined to issue a warning
59
60 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
61 $self->{'_infodbtype'} = $infodbtype;
62 }
63
64 return bless $self, $class;
65}
66
67sub get_version
68{
69 my $self = shift(@_);
70 return $self->{'version'};
71}
72
73sub parse
74{
75 my ($self) = shift (@_);
76 my ($filename) = @_;
77
78 my $parser = new XML::Parser('Style' => 'Stream',
79 'Handlers' => {'Char' => \&Char,
80 'XMLDecl' => \&XMLDecl,
81 'Entity' => \&Entity,
82 'Doctype' => \&Doctype,
83 'Default' => \&Default
84 });
85
86 $parser->parsefile($filename);
87}
88
89sub StartDocument {$self->xml_start_document(@_);}
90sub XMLDecl {$self->xml_xmldecl(@_);}
91sub Entity {$self->xml_entity(@_);}
92sub Doctype {$self->xml_doctype(@_);}
93sub StartTag {$self->xml_start_tag(@_);}
94sub EndTag {$self->xml_end_tag(@_);}
95sub Text {$self->xml_text(@_);}
96sub PI {$self->xml_pi(@_);}
97sub EndDocument {$self->xml_end_document(@_);}
98sub Default {$self->xml_default(@_);}
99
100# This Char function overrides the one in XML::Parser::Stream to overcome a
101# problem where $expat->{Text} is treated as the return value, slowing
102# things down significantly in some cases.
103sub Char {
104 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
105 $_[0]->{'Text'} .= $_[1];
106 return undef;
107}
108
109# Called at the beginning of the XML document.
110sub xml_start_document {
111 my $self = shift(@_);
112 my ($expat) = @_;
113
114}
115
116# Called for XML declarations
117sub xml_xmldecl {
118 my $self = shift(@_);
119 my ($expat, $version, $encoding, $standalone) = @_;
120}
121
122# Called for XML entities
123sub xml_entity {
124 my $self = shift(@_);
125 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
126}
127
128# Called for DOCTYPE declarations - use die to bail out if this doctype
129# is not meant for this plugin
130sub xml_doctype {
131 my $self = shift(@_);
132 my ($expat, $name, $sysid, $pubid, $internal) = @_;
133 die "Manifest Cannot process XML document with DOCTYPE of $name";
134}
135
136# Called for every start tag. The $_ variable will contain a copy of the
137# tag and the %_ variable will contain the element's attributes.
138sub xml_start_tag
139{
140 my $self = shift(@_);
141 my ($expat, $element) = @_;
142 my $attributes = \%_;
143
144 if (($element eq "Filename") || ($element eq "OID"))
145 {
146 $self->{'item-val'} = "";
147 }
148 elsif ($element eq "Manifest")
149 {
150 if (defined $attributes->{'version'})
151 {
152 $self->{'version'} = $attributes->{'version'};
153 }
154 }
155 else
156 {
157 if (defined($self->{'file-type'}))
158 {
159 print STDERR "Warning: Malformed XML manifest\n";
160 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
161 }
162 else
163 {
164 my $filetype = lc($element);
165 $self->{'file-type'} = $filetype;
166 if (!defined $self->{$filetype})
167 {
168 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
169 }
170 }
171 }
172}
173
174# Called for every end tag. The $_ variable will contain a copy of the tag.
175sub xml_end_tag
176{
177 my $self = shift(@_);
178 my ($expat, $element) = @_;
179
180 if ($element eq "Filename")
181 {
182 my $filetype = $self->{'file-type'};
183 my $filename = $self->{'item-val'};
184
185 $self->{$filetype}->{$filename} = 1;
186 $self->{'item-val'} = undef;
187 }
188 elsif ($element eq "OID") {
189 # look up src and assoc filenames used by this doc oid
190
191 my $filetype = $self->{'file-type'};
192 my $oid = $self->{'item-val'};
193
194 if (defined $self->{'_infodbtype'}) {
195
196
197 my $infodbtype = $self->{'_infodbtype'};
198 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
199
200 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
201
202 my $doc_source_file = $doc_rec->{'src-file'}->[0];
203 my $assoc_files = $doc_rec->{'assoc-file'};
204 my @all_files = ($doc_source_file);
205 push(@all_files,@$assoc_files) if defined $assoc_files;
206
207 foreach my $filename (@all_files) {
208
209 if (!&util::filename_is_absolute($filename)) {
210 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
211 }
212
213 $self->{$filetype}->{$filename} = 1;
214 }
215 }
216 else {
217 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
218 print STDERR " Unable to look up source files that constitute document $oid.\n";
219 }
220
221 $self->{'item-val'} = undef;
222 }
223 else
224 {
225 $self->{'file-type'} = undef;
226 }
227}
228
229# Called just before start or end tags with accumulated non-markup text in
230# the $_ variable.
231sub xml_text {
232 my $self = shift(@_);
233 my ($expat) = @_;
234
235 if (defined $self->{'item-val'}) {
236 my $text = $_;
237 chomp($text);
238
239 $text =~ s/^\s+//;
240 $text =~ s/\s+$//;
241
242 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
243 }
244}
245
246# Called for processing instructions. The $_ variable will contain a copy
247# of the pi.
248sub xml_pi {
249 my $self = shift(@_);
250 my ($expat, $target, $data) = @_;
251}
252
253# Called at the end of the XML document.
254sub xml_end_document {
255 my $self = shift(@_);
256 my ($expat) = @_;
257
258 if (defined $self->{'import'}) {
259 print STDERR "Warning: <Import> tag is deprecated.\n";
260 print STDERR " Processing data as if it were tagged as <Index>\n";
261 $self->{'index'} = $self->{'import'};
262 }
263
264}
265
266# Called for any characters not handled by the above functions.
267sub xml_default {
268 my $self = shift(@_);
269 my ($expat, $text) = @_;
270}
271
272
2731;
Note: See TracBrowser for help on using the repository browser.