source: trunk/protemix/perllib/plugins/ProtemixPlug.pm@ 3207

Last change on this file since 3207 was 3207, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# ProtemixPlug.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProtemixPlug;
28
29use HTMLPlug;
30use util;
31
# Compile-time setup: make HTMLPlug the parent class and put the cpan
# directory under $GSDLHOME at the front of the module search path, so
# that it is searched first by the "use" statements that follow.
BEGIN {
    my $cpan_dir = "$ENV{'GSDLHOME'}/perllib/cpan";

    @ISA = qw(HTMLPlug);
    unshift @INC, $cpan_dir;
}
36
37use XML::Parser;
38
# Constructor.  Creates the underlying HTMLPlug object, sets the options
# and parse-state fields this plugin uses, and reblesses the object into
# $class.  Any further arguments are passed through to HTMLPlug's
# constructor unchanged.
sub new {
    my ($class, @args) = @_;

    # $self is deliberately a package variable rather than a lexical: the
    # XML parser callbacks (StartTag, EndTag, Text) are plain functions
    # and reach the plugin object through it.
    $self = HTMLPlug->new($class, @args);

    # options
    $self->{'no_metadata'} = 1;
    $self->{'nolinks'}     = 1;

    # state filled in while meta.xml is being parsed
    $self->{'section_metadata'} = {};
    $self->{'Page'}             = "TopLevel";
    $self->{'metadata_name'}    = "";

    return bless $self, $class;
}
53
# Build one document from an article directory described by a meta.xml
# file: top-level metadata comes from meta.xml, the article-level pdf is
# attached to the top section, and every *.htm/*.html file in the same
# directory becomes a child section (with its page-level pdf, if present).
#
# Parameters (after $self):
#   $pluginfo  - not used here
#   $base_dir  - prefixed to $file when it contains a word character
#   $file      - the candidate file
#   $metadata  - not used here
#   $processor - supplies the 'OIDtype' setting and receives the finished
#                document through its process() method
#   $maxdocs   - not used here
#
# Returns 0 for the numbered page/article html and pdf files (they are
# picked up while the meta.xml in their directory is handled), undef for
# any other file that is not a meta.xml, and 1 once a document has been
# built and passed to $processor.
#
# Dies if the directory containing meta.xml cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    my $basename = File::Basename::basename($filename);
    return 0 if $basename =~ /^\d+(-(all|\d+))?\.(html?|pdf)/;
    return undef unless $filename =~ /meta\.xml$/;

    # create a new document
    my $doc_obj = doc->new ($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    # process the meta.xml file and set top level metadata
    my $parser = XML::Parser->new ('Style' => 'Stream',
                                   'Handlers' => {'Char' => \&Char}
                                   );
    $self->{'section_metadata'} = {};
    $self->{'Page'} = "TopLevel";
    {
        # The Stream handlers (StartTag, EndTag, Text) work on the
        # package-level $self, not on this sub's lexical one.  Point it at
        # this instance while parsing so the metadata is collected in the
        # same object that is read back below, even when another instance
        # of the plugin was constructed more recently.
        local $ProtemixPlug::self = $self;
        $parser->parsefile($filename);
    }
    my $top_metadata = $self->{'section_metadata'}->{'TopLevel'};
    foreach my $key (keys %{$top_metadata}) {
        $doc_obj->add_utf8_metadata ($topsection, $key, $top_metadata->{$key});
    }

    my $dir = File::Basename::dirname($filename);

    my $outhandle = $self->{'outhandle'};
    print $outhandle "ProtemixPlug: processing $dir\n";

    # associate article level pdf file, which is expected to be called
    # <directory name>-all.pdf
    my ($pdffile) = $dir =~ /([^\/\\]+)$/;
    $pdffile = &util::filename_cat($dir, $pdffile);
    $pdffile .= "-all.pdf";
    if (-e $pdffile) {
        $doc_obj->associate_file($pdffile, "article.pdf", undef, $topsection);
        # The article pdf belongs to the top section.  (No page section
        # exists yet at this point.)
        $doc_obj->add_utf8_metadata ($topsection, "pdf", "article.pdf");
    } else {
        print STDERR "ProtemixPlug: Error: $pdffile does not exist\n";
    }

    # read in directory and process individual files
    opendir(my $dirhandle, $dir)
        or die "ProtemixPlug: cannot open directory $dir: $!\n";
    my @files = readdir $dirhandle;
    closedir $dirhandle;

    # we rely on the files being named in such a way that they'll be read
    # in the correct order
    # NOTE(review): readdir returns entries in directory order, which is
    # not guaranteed to follow the file names -- confirm whether an
    # explicit sort is wanted here.
    my $count = 1;
    foreach my $thisfile (@files) {
        next unless $thisfile =~ /^(.*?)\.html?$/i;

        # file name without its extension; the key used for this page's
        # metadata collected from meta.xml
        my $filesuf = $1;

        $thisfile = &util::filename_cat($dir, $thisfile);
        my ($language, $encoding) = $self->textcat_get_language_encoding ($thisfile);

        # read in file ($text will be in utf8)
        my $text = "";
        $self->read_file ($thisfile, $encoding, $language, \$text);
        if (!length ($text)) {
            print STDERR "ProtemixPlug: Warning: $thisfile has no text\n";
        }

        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

        # process HTML file with HTMLPlug
        $self->process_section (\$text, '', $thisfile, $doc_obj, $cursection);

        # associate page level pdf file
        my $page_pdf = $thisfile;
        $page_pdf =~ s/\.html?$/\.pdf/;
        if (-e $page_pdf) {
            $doc_obj->associate_file($page_pdf, "page$count.pdf", undef, $cursection);
            $doc_obj->add_utf8_metadata ($cursection, "pdf", "page$count.pdf");
        } else {
            print STDERR "ProtemixPlug: Warning: no pdf file for $thisfile\n";
        }
        $doc_obj->add_utf8_metadata ($cursection, "Title", $count);

        # add any section level metadata we have for this page (set from
        # within the meta.xml file)
        my $page_metadata = $self->{'section_metadata'}->{$filesuf};
        if (defined ($page_metadata)) {
            foreach my $key (keys %{$page_metadata}) {
                $doc_obj->add_utf8_metadata ($cursection, $key, $page_metadata->{$key});
            }
        }

        $count ++;
    }

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
156
157
# XML::Parser Stream-style callback, invoked for every opening tag.  The
# tag's attributes are available in %_.
#
#   <Page filename="...">  makes that filename the current page
#   <Metadata name="...">  starts collecting a (initially empty) value for
#                          that name under the current page
#
# Works on the package-level $self.
sub StartTag {
    my ($expat, $tag) = @_;

    if ($tag eq "Metadata") {
        my $page  = $self->{'Page'};
        my $field = $_{'name'};

        # first metadata element seen for this page
        unless (defined $self->{'section_metadata'}->{$page}) {
            $self->{'section_metadata'}->{$page} = {};
        }

        $self->{'metadata_name'} = $field;
        $self->{'section_metadata'}->{$page}->{$field} = "";
    }
    elsif ($tag eq "Page") {
        $self->{'Page'} = $_{'filename'};
    }
}
172
# XML::Parser Stream-style callback, invoked for every closing tag.
#
#   </Page>      metadata that follows belongs to the top level again
#   </Metadata>  stop collecting character data
#
# Works on the package-level $self.
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
        # This must be an assignment.  It used to be an "eq" comparison
        # whose result was thrown away, which left the closed page as the
        # current one.
        $self->{'Page'} = "TopLevel";

    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = "";
    }
}
183
# XML::Parser Stream-style callback for character data, which arrives in
# $_.  While a Metadata element is open the text is appended to that
# element's value under the current page; any other text is ignored.
#
# Works on the package-level $self.
sub Text {
    my $field = $self->{'metadata_name'};

    if ($field ne "") {
        my $page = $self->{'Page'};
        $self->{'section_metadata'}->{$page}->{$field} .= $_;
    }
}
189
# Read $filename into the scalar referenced by $textref by handing the
# call straight to BasPlug::read_file and returning whatever it returns.
#
# Character entities are left alone for now, as mgpp appears to be broken;
# the conversion code is kept below, commented out.
sub read_file {
    my $self = shift;
    my ($filename, $encoding, $language, $textref) = @_;

    # Convert entities to their UTF8 equivalents (disabled, see above;
    # would have to run after the text has been read):
#   $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go;
#   $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
#   $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go;

    return BasPlug::read_file($self, $filename, $encoding, $language, $textref);
}
201
# Replacement for the Char handler in XML::Parser::Stream.  It appends the
# incoming characters to the parser's pending 'Text' buffer, and returns
# undef explicitly so that the (possibly large) buffer is not treated as
# the return value, which slowed things down significantly in some cases.
sub Char {
    my $expat = shift;

    # use the argument in place rather than copying it first
    $expat->{'Text'} .= $_[0];

    return undef;
}
209
210
2111;
Note: See TracBrowser for help on using the repository browser.