source: main/trunk/greenstone2/perllib/manifest.pm@ 30597

Last change on this file since 30597 was 30597, checked in by ak19, 8 years ago
  1. Reindexing files with a user-generated manifest file did not work because modifications for generating nightly binaries broke this. Absolute file paths were replaced with filepaths containing placeholders in a pass by reference value and therefore affected other parts of the code. In this case, the code used for reindexing files mentioned in the manifest file. Some files ended up containing placeholders and couldn't be located and reindexed. 2. Use FileUtils method instead of going through deprecated util method.
  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1###########################################################################
2#
3# manifest.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2006-2010 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package manifest;
28
29use strict;
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use XMLParser;
33use dbutil;
34
35our $self;
36
37sub new {
38 my ($class) = shift (@_);
39 my ($infodbtype,$archivedir) = @_;
40
41 $self = {} ;
42 # we can now mark a manifest with a version number
43 $self->{'version'} = 1;
44 $self->{'index'} = {};
45 $self->{'reindex'} = {};
46 $self->{'delete'} = {};
47
48 my $arcinfo_doc_filename
49 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
50
51 if (-e $arcinfo_doc_filename) {
52 # Only store the infodb-doc filename if it exists
53 # If it doesn't exist then this means the collection has not been
54 # built yet (or else the archives folder has been deleted).
55 # Either way we have no way to look up which files
56 # are associated with an OID. If we we encounter an OID
57 # tag later on, we will use the fact that this field is
58 # not defined to issue a warning
59
60 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
61 $self->{'_infodbtype'} = $infodbtype;
62 }
63
64 return bless $self, $class;
65}
66
67# /** @function get_version()
68# */
69sub get_version
70{
71 my $self = shift(@_);
72 return $self->{'version'};
73}
74# /** get_version() **/
75
76sub parse
77{
78 my ($self) = shift (@_);
79 my ($filename) = @_;
80
81 my $parser = new XML::Parser('Style' => 'Stream',
82 'Handlers' => {'Char' => \&Char,
83 'XMLDecl' => \&XMLDecl,
84 'Entity' => \&Entity,
85 'Doctype' => \&Doctype,
86 'Default' => \&Default
87 });
88
89 $parser->parsefile($filename);
90}
91
92sub StartDocument {$self->xml_start_document(@_);}
93sub XMLDecl {$self->xml_xmldecl(@_);}
94sub Entity {$self->xml_entity(@_);}
95sub Doctype {$self->xml_doctype(@_);}
96sub StartTag {$self->xml_start_tag(@_);}
97sub EndTag {$self->xml_end_tag(@_);}
98sub Text {$self->xml_text(@_);}
99sub PI {$self->xml_pi(@_);}
100sub EndDocument {$self->xml_end_document(@_);}
101sub Default {$self->xml_default(@_);}
102
103# This Char function overrides the one in XML::Parser::Stream to overcome a
104# problem where $expat->{Text} is treated as the return value, slowing
105# things down significantly in some cases.
106sub Char {
107 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
108 $_[0]->{'Text'} .= $_[1];
109 return undef;
110}
111
112# Called at the beginning of the XML document.
113sub xml_start_document {
114 my $self = shift(@_);
115 my ($expat) = @_;
116
117}
118
119# Called for XML declarations
120sub xml_xmldecl {
121 my $self = shift(@_);
122 my ($expat, $version, $encoding, $standalone) = @_;
123}
124
125# Called for XML entities
126sub xml_entity {
127 my $self = shift(@_);
128 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
129}
130
131# Called for DOCTYPE declarations - use die to bail out if this doctype
132# is not meant for this plugin
133sub xml_doctype {
134 my $self = shift(@_);
135 my ($expat, $name, $sysid, $pubid, $internal) = @_;
136 die "Manifest Cannot process XML document with DOCTYPE of $name";
137}
138
139# Called for every start tag. The $_ variable will contain a copy of the
140# tag and the %_ variable will contain the element's attributes.
141sub xml_start_tag
142{
143 my $self = shift(@_);
144 my ($expat, $element) = @_;
145 my $attributes = \%_;
146
147 if (($element eq "Filename") || ($element eq "OID"))
148 {
149 $self->{'item-val'} = "";
150 }
151 elsif ($element eq "Manifest")
152 {
153 if (defined $attributes->{'version'})
154 {
155 $self->{'version'} = $attributes->{'version'};
156 }
157 }
158 else
159 {
160 if (defined($self->{'file-type'}))
161 {
162 print STDERR "Warning: Malformed XML manifest\n";
163 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
164 }
165 else
166 {
167 my $filetype = lc($element);
168 $self->{'file-type'} = $filetype;
169 if (!defined $self->{$filetype})
170 {
171 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
172 }
173 }
174
175 }
176}
177
178# Called for every end tag. The $_ variable will contain a copy of the tag.
179sub xml_end_tag
180{
181 my $self = shift(@_);
182 my ($expat, $element) = @_;
183
184 if ($element eq "Filename")
185 {
186 my $filetype = $self->{'file-type'};
187 my $filename = $self->{'item-val'};
188
189 $self->{$filetype}->{$filename} = 1;
190 $self->{'item-val'} = undef;
191 }
192 elsif ($element eq "OID") {
193 # look up src and assoc filenames used by this doc oid
194
195 my $filetype = $self->{'file-type'};
196 my $oid = $self->{'item-val'};
197
198 if (defined $self->{'_infodbtype'}) {
199
200
201 my $infodbtype = $self->{'_infodbtype'};
202 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
203
204 my $doc_rec = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
205
206 my $doc_source_file = $doc_rec->{'src-file'}->[0];
207 my $assoc_files = $doc_rec->{'assoc-file'};
208 my @all_files = ($doc_source_file);
209 push(@all_files,@$assoc_files) if defined $assoc_files;
210
211 foreach my $filename (@all_files) {
212
213 $filename = &util::placeholders_to_abspath($filename);
214
215 if (!&FileUtils::isFilenameAbsolute($filename)) {
216 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
217 }
218
219 $self->{$filetype}->{$filename} = 1;
220 }
221 }
222 else {
223 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
224 print STDERR " Unable to look up source files that constitute document $oid.\n";
225 }
226
227 $self->{'item-val'} = undef;
228 }
229 else
230 {
231 $self->{'file-type'} = undef;
232 }
233}
234
235# Called just before start or end tags with accumulated non-markup text in
236# the $_ variable.
237sub xml_text {
238 my $self = shift(@_);
239 my ($expat) = @_;
240
241 if (defined $self->{'item-val'}) {
242 my $text = $_;
243 chomp($text);
244
245 $text =~ s/^\s+//;
246 $text =~ s/\s+$//;
247
248 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
249 }
250}
251
252# Called for processing instructions. The $_ variable will contain a copy
253# of the pi.
254sub xml_pi {
255 my $self = shift(@_);
256 my ($expat, $target, $data) = @_;
257}
258
259# Called at the end of the XML document.
260sub xml_end_document {
261 my $self = shift(@_);
262 my ($expat) = @_;
263
264 if (defined $self->{'import'}) {
265 print STDERR "Warning: <Import> tag is deprecated.\n";
266 print STDERR " Processing data as if it were tagged as <Index>\n";
267 $self->{'index'} = $self->{'import'};
268 }
269
270}
271
272# Called for any characters not handled by the above functions.
273sub xml_default {
274 my $self = shift(@_);
275 my ($expat, $text) = @_;
276}
277
278
2791;
Note: See TracBrowser for help on using the repository browser.