source: main/trunk/greenstone2/perllib/manifest.pm@ 22352

Last change on this file since 22352 was 22120, checked in by max, 14 years ago

Extra check added to allow for documents that do not have any associated files (davidb)

  • Property svn:keywords set to Author Date Id Revision
File size: 6.1 KB
Line 
1package manifest;
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use XMLParser;
7use dbutil;
8
9our $self;
10
11sub new {
12 my ($class) = shift (@_);
13 my ($infodbtype,$archivedir) = @_;
14
15 $self = {} ;
16
17 $self->{'index'} = {};
18 $self->{'reindex'} = {};
19 $self->{'delete'} = {};
20
21 my $arcinfo_doc_filename
22 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
23
24 if (-e $arcinfo_doc_filename) {
25 # Only store the infodb-doc filename if it exists
26 # If it doesn't exist then this means the collection has not been
27 # built yet (or else the archives folder has been deleted).
28 # Either way we have no way to look up which files
29 # are associated with an OID. If we we encounter an OID
30 # tag later on, we will use the fact that this field is
31 # not defined to issue a warning
32
33 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
34 $self->{'_infodbtype'} = $infodbtype;
35 }
36
37 return bless $self, $class;
38}
39
40sub parse
41{
42 my ($self) = shift (@_);
43 my ($filename) = @_;
44
45 my $parser = new XML::Parser('Style' => 'Stream',
46 'Handlers' => {'Char' => \&Char,
47 'XMLDecl' => \&XMLDecl,
48 'Entity' => \&Entity,
49 'Doctype' => \&Doctype,
50 'Default' => \&Default
51 });
52
53 $parser->parsefile($filename);
54}
55
56sub StartDocument {$self->xml_start_document(@_);}
57sub XMLDecl {$self->xml_xmldecl(@_);}
58sub Entity {$self->xml_entity(@_);}
59sub Doctype {$self->xml_doctype(@_);}
60sub StartTag {$self->xml_start_tag(@_);}
61sub EndTag {$self->xml_end_tag(@_);}
62sub Text {$self->xml_text(@_);}
63sub PI {$self->xml_pi(@_);}
64sub EndDocument {$self->xml_end_document(@_);}
65sub Default {$self->xml_default(@_);}
66
67# This Char function overrides the one in XML::Parser::Stream to overcome a
68# problem where $expat->{Text} is treated as the return value, slowing
69# things down significantly in some cases.
70sub Char {
71 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
72 $_[0]->{'Text'} .= $_[1];
73 return undef;
74}
75
76# Called at the beginning of the XML document.
77sub xml_start_document {
78 my $self = shift(@_);
79 my ($expat) = @_;
80
81}
82
83# Called for XML declarations
84sub xml_xmldecl {
85 my $self = shift(@_);
86 my ($expat, $version, $encoding, $standalone) = @_;
87}
88
89# Called for XML entities
90sub xml_entity {
91 my $self = shift(@_);
92 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
93}
94
95# Called for DOCTYPE declarations - use die to bail out if this doctype
96# is not meant for this plugin
97sub xml_doctype {
98 my $self = shift(@_);
99 my ($expat, $name, $sysid, $pubid, $internal) = @_;
100 die "Manifest Cannot process XML document with DOCTYPE of $name";
101}
102
103# Called for every start tag. The $_ variable will contain a copy of the
104# tag and the %_ variable will contain the element's attributes.
105sub xml_start_tag
106{
107 my $self = shift(@_);
108 my ($expat, $element) = @_;
109
110 if (($element eq "Filename") || ($element eq "OID"))
111 {
112 $self->{'item-val'} = "";
113 }
114 elsif ($element eq "Manifest") {
115 }
116 else
117 {
118 if (defined($self->{'file-type'}))
119 {
120 print STDERR "Warning: Malformed XML manifest\n";
121 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
122 }
123 else {
124 my $filetype = lc($element);
125 $self->{'file-type'} = $filetype;
126 if (!defined $self->{$filetype}) {
127 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
128 }
129 }
130
131 }
132}
133
134# Called for every end tag. The $_ variable will contain a copy of the tag.
135sub xml_end_tag
136{
137 my $self = shift(@_);
138 my ($expat, $element) = @_;
139
140 if ($element eq "Filename")
141 {
142 my $filetype = $self->{'file-type'};
143 my $filename = $self->{'item-val'};
144
145 $self->{$filetype}->{$filename} = 1;
146 $self->{'item-val'} = undef;
147 }
148 elsif ($element eq "OID") {
149 # look up src and assoc filenames used by this doc oid
150
151 my $filetype = $self->{'file-type'};
152 my $oid = $self->{'item-val'};
153
154 if (defined $self->{'_infodbtype'}) {
155
156
157 my $infodbtype = $self->{'_infodbtype'};
158 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
159
160 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
161
162 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
163
164 my $doc_source_file = $doc_rec->{'src-file'}->[0];
165 my $assoc_files = $doc_rec->{'assoc-file'};
166 my @all_files = ($doc_source_file);
167 push(@all_files,@$assoc_files) if defined $assoc_files;
168
169 foreach my $filename (@all_files) {
170
171 if (!&util::filename_is_absolute($filename)) {
172 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
173 }
174
175 $self->{$filetype}->{$filename} = 1;
176 }
177 }
178 else {
179 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
180 print STDERR " Unable to look up source files that constitute document $oid.\n";
181 }
182
183 $self->{'item-val'} = undef;
184 }
185 else
186 {
187 $self->{'file-type'} = undef;
188 }
189}
190
191# Called just before start or end tags with accumulated non-markup text in
192# the $_ variable.
193sub xml_text {
194 my $self = shift(@_);
195 my ($expat) = @_;
196
197 if (defined $self->{'item-val'}) {
198 my $text = $_;
199 chomp($text);
200
201 $text =~ s/^\s+//;
202 $text =~ s/\s+$//;
203
204 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
205 }
206}
207
208# Called for processing instructions. The $_ variable will contain a copy
209# of the pi.
210sub xml_pi {
211 my $self = shift(@_);
212 my ($expat, $target, $data) = @_;
213}
214
215# Called at the end of the XML document.
216sub xml_end_document {
217 my $self = shift(@_);
218 my ($expat) = @_;
219
220 if (defined $self->{'import'}) {
221 print STDERR "Warning: <Import> tag is deprecated.\n";
222 print STDERR " Processing data as if it were tagged as <Index>\n";
223 $self->{'index'} = $self->{'import'};
224 }
225
226}
227
228# Called for any characters not handled by the above functions.
229sub xml_default {
230 my $self = shift(@_);
231 my ($expat, $text) = @_;
232}
233
234
2351;
Note: See TracBrowser for help on using the repository browser.