source: main/trunk/greenstone2/perllib/manifest.pm@ 22037

Last change on this file since 22037 was 22037, checked in by davidb, 14 years ago

Manifest file processing upgraded to support OIDs. The code then uses the archiveinf-doc database to look up which files it uses.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.1 KB
Line 
1package manifest;
2
3use strict;
4no strict 'refs'; # allow filehandles to be variables and viceversa
5
6use XMLParser;
7use dbutil;
8
9our $self;
10
11sub new {
12 my ($class) = shift (@_);
13 my ($infodbtype,$archivedir) = @_;
14
15 $self = {} ;
16
17 $self->{'index'} = {};
18 $self->{'reindex'} = {};
19 $self->{'delete'} = {};
20
21 my $arcinfo_doc_filename
22 = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $archivedir);
23
24 if (-e $arcinfo_doc_filename) {
25 # Only store the infodb-doc filename if it exists
26 # If it doesn't exist then this means the collection has not been
27 # built yet (or else the archives folder has been deleted).
28 # Either way we have no way to look up which files
29 # are associated with an OID. If we we encounter an OID
30 # tag later on, we will use the fact that this field is
31 # not defined to issue a warning
32
33 $self->{'_arcinfo-doc-filename'} = $arcinfo_doc_filename;
34 $self->{'_infodbtype'} = $infodbtype;
35 }
36
37 return bless $self, $class;
38}
39
40sub parse
41{
42 my ($self) = shift (@_);
43 my ($filename) = @_;
44
45 my $parser = new XML::Parser('Style' => 'Stream',
46 'Handlers' => {'Char' => \&Char,
47 'XMLDecl' => \&XMLDecl,
48 'Entity' => \&Entity,
49 'Doctype' => \&Doctype,
50 'Default' => \&Default
51 });
52
53 $parser->parsefile($filename);
54}
55
56sub StartDocument {$self->xml_start_document(@_);}
57sub XMLDecl {$self->xml_xmldecl(@_);}
58sub Entity {$self->xml_entity(@_);}
59sub Doctype {$self->xml_doctype(@_);}
60sub StartTag {$self->xml_start_tag(@_);}
61sub EndTag {$self->xml_end_tag(@_);}
62sub Text {$self->xml_text(@_);}
63sub PI {$self->xml_pi(@_);}
64sub EndDocument {$self->xml_end_document(@_);}
65sub Default {$self->xml_default(@_);}
66
67# This Char function overrides the one in XML::Parser::Stream to overcome a
68# problem where $expat->{Text} is treated as the return value, slowing
69# things down significantly in some cases.
70sub Char {
71 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
72 $_[0]->{'Text'} .= $_[1];
73 return undef;
74}
75
76# Called at the beginning of the XML document.
77sub xml_start_document {
78 my $self = shift(@_);
79 my ($expat) = @_;
80
81}
82
83# Called for XML declarations
84sub xml_xmldecl {
85 my $self = shift(@_);
86 my ($expat, $version, $encoding, $standalone) = @_;
87}
88
89# Called for XML entities
90sub xml_entity {
91 my $self = shift(@_);
92 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
93}
94
95# Called for DOCTYPE declarations - use die to bail out if this doctype
96# is not meant for this plugin
97sub xml_doctype {
98 my $self = shift(@_);
99 my ($expat, $name, $sysid, $pubid, $internal) = @_;
100 die "Manifest Cannot process XML document with DOCTYPE of $name";
101}
102
103# Called for every start tag. The $_ variable will contain a copy of the
104# tag and the %_ variable will contain the element's attributes.
105sub xml_start_tag
106{
107 my $self = shift(@_);
108 my ($expat, $element) = @_;
109
110 if (($element eq "Filename") || ($element eq "OID"))
111 {
112 $self->{'item-val'} = "";
113 }
114 elsif ($element eq "Manifest") {
115 }
116 else
117 {
118 if (defined($self->{'file-type'}))
119 {
120 print STDERR "Warning: Malformed XML manifest\n";
121 print STDERR " Unrecognized element $element nested inside " . $self->{'file-type'} . ".\n";
122 }
123 else {
124 my $filetype = lc($element);
125 $self->{'file-type'} = $filetype;
126 if (!defined $self->{$filetype}) {
127 print STDERR "Warning: <$element> is not one of the registered tags for manifest format.\n";
128 }
129 }
130
131 }
132}
133
134# Called for every end tag. The $_ variable will contain a copy of the tag.
135sub xml_end_tag
136{
137 my $self = shift(@_);
138 my ($expat, $element) = @_;
139
140 if ($element eq "Filename")
141 {
142 my $filetype = $self->{'file-type'};
143 my $filename = $self->{'item-val'};
144
145 $self->{$filetype}->{$filename} = 1;
146 $self->{'item-val'} = undef;
147 }
148 elsif ($element eq "OID") {
149 # look up src and assoc filenames used by this doc oid
150
151 my $filetype = $self->{'file-type'};
152 my $oid = $self->{'item-val'};
153
154 if (defined $self->{'_infodbtype'}) {
155
156
157 my $infodbtype = $self->{'_infodbtype'};
158 my $arcinfo_doc_filename = $self->{'_arcinfo-doc-filename'};
159
160 my $doc_rec_string = &dbutil::read_infodb_entry($infodbtype, $arcinfo_doc_filename, $oid);
161
162 my $doc_rec = &dbutil::convert_infodb_string_to_hash($doc_rec_string);
163
164 my $doc_source_file = $doc_rec->{'src-file'}->[0];
165 my $assoc_files = $doc_rec->{'assoc-file'};
166 my @all_files = ($doc_source_file,@$assoc_files);
167
168 foreach my $filename (@all_files) {
169
170 if (!&util::filename_is_absolute($filename)) {
171 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
172 }
173
174 $self->{$filetype}->{$filename} = 1;
175 }
176 }
177 else {
178 print STDERR "Warning: No archiveinf-doc database in archives directory.\n";
179 print STDERR " Unable to look up source files that constitute document $oid.\n";
180 }
181
182 $self->{'item-val'} = undef;
183 }
184 else
185 {
186 $self->{'file-type'} = undef;
187 }
188}
189
190# Called just before start or end tags with accumulated non-markup text in
191# the $_ variable.
192sub xml_text {
193 my $self = shift(@_);
194 my ($expat) = @_;
195
196 if (defined $self->{'item-val'}) {
197 my $text = $_;
198 chomp($text);
199
200 $text =~ s/^\s+//;
201 $text =~ s/\s+$//;
202
203 $self->{'item-val'} .= $text if ($text !~ m/^\s*$/);
204 }
205}
206
207# Called for processing instructions. The $_ variable will contain a copy
208# of the pi.
209sub xml_pi {
210 my $self = shift(@_);
211 my ($expat, $target, $data) = @_;
212}
213
214# Called at the end of the XML document.
215sub xml_end_document {
216 my $self = shift(@_);
217 my ($expat) = @_;
218
219 if (defined $self->{'import'}) {
220 print STDERR "Warning: <Import> tag is deprecated.\n";
221 print STDERR " Processing data as if it were tagged as <Index>\n";
222 $self->{'index'} = $self->{'import'};
223 }
224
225}
226
227# Called for any characters not handled by the above functions.
228sub xml_default {
229 my $self = shift(@_);
230 my ($expat, $text) = @_;
231}
232
233
2341;
Note: See TracBrowser for help on using the repository browser.