source: trunk/protemix/perllib/plugins/ProtemixPlug.pm@ 3177

Last change on this file since 3177 was 3177, checked in by sjboddie, 22 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1###########################################################################
2#
3# ProtemixPlug.pm --
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ProtemixPlug;
28
29use HTMLPlug;
30use util;
31
# Declare the parent class at compile time: this plugin inherits from HTMLPlug.
BEGIN {
    @ISA = qw(HTMLPlug);
}
35
36use XML::Parser;
37
# new --
#
# Constructor. Builds the underlying HTMLPlug object (passing on this class
# name and all remaining arguments), switches on the 'no_metadata' and
# 'nolinks' flags, and initialises the state used while parsing meta.xml.
# Returns the object blessed into $class.
sub new {
    my ($class, @args) = @_;

    # $self is intentionally NOT declared with "my": it has to be a package
    # global so that the XML parser routines in this package can reach it
    $self = HTMLPlug->new ($class, @args);

    # flags stored on the underlying HTMLPlug object
    @{$self}{'no_metadata', 'nolinks'} = (1, 1);

    # state filled in by the XML parser routines
    $self->{'section_metadata'} = {};          # metadata collected per page name
    $self->{'Page'}             = "TopLevel";  # page currently being parsed
    $self->{'metadata_name'}    = "";          # metadata element currently open

    bless $self, $class;
    return $self;
}
52
# read --
#
# Builds one document from a directory that contains a meta.xml file, an
# article-level "<dirname>-all.pdf" file, and one .htm/.html file plus a
# matching .pdf file per page.
#
# Arguments (after $self):
#   $pluginfo, $metadata, $maxdocs - accepted but not used by this routine
#   $base_dir   - prepended to $file when it contains a word character
#   $file       - file being offered to the plugin
#   $processor  - supplies {'OIDtype'} and receives the finished document
#                 through its process() method
#
# Returns:
#   0     for .pdf, .htm/.html and .jpg/.jpeg files (they are picked up when
#         the directory's meta.xml is processed, so nothing is done here)
#   undef for any other file that is not a meta.xml file
#         (NOTE(review): presumably the caller treats undef as "not handled
#         by this plugin" - confirm against the plugin framework)
#   1     once the document has been built and passed to $processor
#
# Dies if the article-level PDF is missing, if the directory cannot be
# read, if an HTML page yields no text, or if a page has no matching PDF.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $filename = $file;
    $filename = util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if $filename =~ /\.(pdf|html?|jpe?g)$/i;
    return undef unless $filename =~ /meta\.xml$/;

    # dirname() is used below; make sure the module is loaded even if no
    # other module happened to pull it in
    require File::Basename;

    # create a new document
    my $doc_obj = doc->new ($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    # process the meta.xml file and set top level metadata
    my $parser = XML::Parser->new ('Style' => 'Stream');
    $self->{'section_metadata'} = {};
    $self->{'Page'} = "TopLevel";
    {
        # The XML parser routines (StartTag, EndTag, Text) work on the
        # package global $self. Point it at this object for the duration
        # of the parse so the metadata ends up in the instance that is
        # actually reading the file, whichever instance was created last.
        local $ProtemixPlug::self = $self;
        $parser->parsefile ($filename);
    }
    my $top_metadata = $self->{'section_metadata'}->{'TopLevel'};
    foreach my $key (keys %{$top_metadata}) {
        $doc_obj->add_utf8_metadata ($topsection, $key, $top_metadata->{$key});
    }

    my $dir = File::Basename::dirname ($filename);

    # associate article level pdf file (named after the directory itself)
    my ($pdffile) = $dir =~ /([^\/\\]+)$/;
    $pdffile = util::filename_cat ($dir, $pdffile);
    $pdffile .= "-all.pdf";
    die "$pdffile does not exist" unless -e $pdffile;
    $doc_obj->associate_file ($pdffile, "article.pdf", undef, $topsection);
    # the article-level pdf belongs to the top section (no page section
    # exists yet at this point)
    $doc_obj->add_utf8_metadata ($topsection, "pdf", "article.pdf");

    # read in directory and process individual files
    opendir (my $dirhandle, $dir) or die "cannot read directory $dir: $!";
    # we rely on the files being named in such a way that they'll be read
    # in the correct order; readdir itself returns entries in no particular
    # order, so sort them by name
    my @files = sort readdir ($dirhandle);
    closedir ($dirhandle);

    my $count = 1;
    foreach my $entry (@files) {
        next unless $entry =~ /^(.*?)\.html?$/i;

        # page name without extension; only needed by the commented-out
        # section metadata code below
        my $filesuf = $1;

        my $thisfile = util::filename_cat ($dir, $entry);
        my ($language, $encoding) = $self->textcat_get_language_encoding ($thisfile);

        # read in file ($text will be in utf8)
        my $text = "";
        $self->read_file ($thisfile, $encoding, $language, \$text);
        if (!length ($text)) {
            die "$thisfile has no text\n";
        }

        my $cursection = $doc_obj->insert_section ($doc_obj->get_end_child ($topsection));

        # process HTML file with HTMLPlug
        $self->process_section (\$text, '', $thisfile, $doc_obj, $cursection);

        # associate PDF page level pdf file; the extension is replaced
        # case-insensitively to match the test that selected this file
        my $pagepdf = $thisfile;
        $pagepdf =~ s/\.html?$/\.pdf/i;
        die "no PDF file for $thisfile" unless -e $pagepdf;
        $doc_obj->associate_file ($pagepdf, "page$count.pdf", undef, $cursection);
        $doc_obj->add_utf8_metadata ($cursection, "pdf", "page$count.pdf");
        $doc_obj->add_utf8_metadata ($cursection, "Title", $count);

        # add any section level metadata we have for this page (set from within the meta.xml file)

#       currently commented out as we're not using Class1, Class2, and Class3 metadata yet
#       if (defined ($self->{'section_metadata'}->{$filesuf})) {
#           foreach my $key (keys %{$self->{'section_metadata'}->{$filesuf}}) {
#               $doc_obj->add_utf8_metadata ($cursection, $key, $self->{'section_metadata'}->{$filesuf}->{$key});
#           }
#       }

        $count ++;
    }

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process ($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
144
145
# StartTag --
#
# Called by the XML parser for every opening tag in meta.xml. The element's
# attributes are read from the %_ hash; the plugin object is reached through
# the package global $self.
#
#   <Page filename="...">  makes that page the current one
#   <Metadata name="...">  remembers the metadata name and starts an empty
#                          value for it under the current page
sub StartTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
        $self->{'Page'} = $_{'filename'};
    }
    elsif ($element eq "Metadata") {
        my $page = $self->{'Page'};
        my $name = $_{'name'};

        # first metadata seen for this page: create its table
        unless (defined $self->{'section_metadata'}->{$page}) {
            $self->{'section_metadata'}->{$page} = {};
        }

        $self->{'metadata_name'} = $name;
        $self->{'section_metadata'}->{$page}->{$name} = "";
    }
}
160
# EndTag --
#
# Called by the XML parser for every closing tag in meta.xml. Works on the
# package global $self.
#
#   </Page>      the current page goes back to "TopLevel", so metadata that
#                follows is recorded at the top level again
#   </Metadata>  clears the current metadata name so that text outside a
#                Metadata element is ignored by Text()
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
        # assignment, not comparison: the page really has to be reset here
        $self->{'Page'} = "TopLevel";

    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = "";
    }
}
171
# Text --
#
# Called by the XML parser for character data, which it supplies in $_.
# While a metadata name is set (i.e. inside a Metadata element) the text is
# appended to that metadata's value under the current page; any other text
# is ignored. Works on the package global $self.
sub Text {
    my $name = $self->{'metadata_name'};

    if ($name ne "") {
        my $page = $self->{'Page'};
        # take a reference to the value so it can be appended to in place
        my $value_ref = \$self->{'section_metadata'}->{$page}->{$name};
        ${$value_ref} .= $_;
    }
}
177
1781;
Note: See TracBrowser for help on using the repository browser.