source: gsdl/trunk/perllib/manifest.pm@ 14926

Last change on this file since 14926 was 11994, checked in by davidb, 18 years ago

Improved support for incremental addition: instead of having to run the
classifier pass of buildcol.pl from scratch (i.e. read in all documents
from the archives folder) so correct browse structures are formed -- a
simple to implement strategy, but not very efficient -- the first layer
of a classifier structure is now reconstructed from the GDBM file. Then
the new files in the archives directory are added, and then finally the
completed browser structure is formed.

  • Property svn:keywords set to Author Date Id Revision
File size: 4.0 KB
Line 
1package manifest;
2
3
4use XMLParser;
5use strict;
6no strict 'refs'; # allow filehandles to be variables and viceversa
7
8
9our $self;
10
11sub new {
12 my ($class) = shift (@_);
13
14 $self = {} ;
15
16 $self->{'index'} = {};
17 $self->{'reindex'} = {};
18 $self->{'delete'} = {};
19
20 return bless $self, $class;
21}
22
23sub parse
24{
25 my ($self) = shift (@_);
26 my ($filename) = @_;
27
28 my $parser = new XML::Parser('Style' => 'Stream',
29 'Handlers' => {'Char' => \&Char,
30 'XMLDecl' => \&XMLDecl,
31 'Entity' => \&Entity,
32 'Doctype' => \&Doctype,
33 'Default' => \&Default
34 });
35
36 $parser->parsefile($filename);
37}
38
39sub StartDocument {$self->xml_start_document(@_);}
40sub XMLDecl {$self->xml_xmldecl(@_);}
41sub Entity {$self->xml_entity(@_);}
42sub Doctype {$self->xml_doctype(@_);}
43sub StartTag {$self->xml_start_tag(@_);}
44sub EndTag {$self->xml_end_tag(@_);}
45sub Text {$self->xml_text(@_);}
46sub PI {$self->xml_pi(@_);}
47sub EndDocument {$self->xml_end_document(@_);}
48sub Default {$self->xml_default(@_);}
49
50# This Char function overrides the one in XML::Parser::Stream to overcome a
51# problem where $expat->{Text} is treated as the return value, slowing
52# things down significantly in some cases.
53sub Char {
54 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
55 $_[0]->{'Text'} .= $_[1];
56 return undef;
57}
58
59# Called at the beginning of the XML document.
60sub xml_start_document {
61 my $self = shift(@_);
62 my ($expat) = @_;
63
64}
65
66# Called for XML declarations
67sub xml_xmldecl {
68 my $self = shift(@_);
69 my ($expat, $version, $encoding, $standalone) = @_;
70}
71
72# Called for XML entities
73sub xml_entity {
74 my $self = shift(@_);
75 my ($expat, $name, $val, $sysid, $pubid, $ndata) = @_;
76}
77
78# Called for DOCTYPE declarations - use die to bail out if this doctype
79# is not meant for this plugin
80sub xml_doctype {
81 my $self = shift(@_);
82 my ($expat, $name, $sysid, $pubid, $internal) = @_;
83 die "Manifest Cannot process XML document with DOCTYPE of $name";
84}
85
86# Called for every start tag. The $_ variable will contain a copy of the
87# tag and the %_ variable will contain the element's attributes.
88sub xml_start_tag {
89 my $self = shift(@_);
90 my ($expat, $element) = @_;
91
92 if ($_ =~ m/^<(Index|Reindex|Delete)>$/) {
93 my $tag = $1;
94 if (defined $self->{'file-type'}) {
95 print STDERR "Warning: Malformed XML manifest. ";
96 print STDERR "$tag nested inside ", $self->{'file-type'}, "\n";
97 print STDERR " Changing file-type scope to $tag\n";
98 }
99
100 $self->{'file-type'} = lc($tag);
101 }
102 elsif ($_ eq "<Filename>") {
103 $self->{'filename'} = "";
104 }
105}
106
107# Called for every end tag. The $_ variable will contain a copy of the tag.
108sub xml_end_tag {
109 my $self = shift(@_);
110 my ($expat, $element) = @_;
111
112 if ($_ =~ m/^<\/(Index|Reindex|Delete)>$/) {
113 $self->{'file-type'} = undef;
114 }
115 elsif ($_ eq "</Filename>") {
116 my $file_type = $self->{'file-type'};
117 my $filename = $self->{'filename'};
118
119 $self->{$file_type}->{$filename} = 1;
120
121 $self->{'filename'} = undef;
122 }
123
124}
125
126# Called just before start or end tags with accumulated non-markup text in
127# the $_ variable.
128sub xml_text {
129 my $self = shift(@_);
130 my ($expat) = @_;
131
132 if (defined $self->{'filename'}) {
133 my $text = $_;
134 chomp($text);
135
136 $text =~ s/^\s+//;
137 $text =~ s/\s+$//;
138
139 $self->{'filename'} .= $text if ($text !~ m/^\s*$/);
140 }
141}
142
143# Called for processing instructions. The $_ variable will contain a copy
144# of the pi.
145sub xml_pi {
146 my $self = shift(@_);
147 my ($expat, $target, $data) = @_;
148}
149
150# Called at the end of the XML document.
151sub xml_end_document {
152 my $self = shift(@_);
153 my ($expat) = @_;
154
155}
156
157# Called for any characters not handled by the above functions.
158sub xml_default {
159 my $self = shift(@_);
160 my ($expat, $text) = @_;
161}
162
163
1641;
Note: See TracBrowser for help on using the repository browser.