Context Navigation

source: trunk/protemix/perllib/plugins/ProtemixPlug.pm@ 3177

Last change on this file since 3177 was 3177, checked in by sjboddie, 22 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 5.4 KB

Line
1	###########################################################################
2	#
3	# ProtemixPlug.pm --
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 2002 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ProtemixPlug;
28
29	use HTMLPlug;
30	use util;
31
# Make ProtemixPlug a subclass of HTMLPlug. Done in a BEGIN block so the
# inheritance is in place at compile time.
BEGIN {
    our @ISA = qw(HTMLPlug);
}
35
36	use XML::Parser;
37
# Constructor.
#
# Builds the underlying HTMLPlug object (passing on the class name and any
# remaining arguments), switches on the 'no_metadata' and 'nolinks' flags,
# and initialises the state used while a meta.xml file is being parsed:
#   section_metadata - hash of metadata collected so far, keyed by page name
#   Page             - name of the page currently being parsed ("TopLevel"
#                      when outside any <Page> element)
#   metadata_name    - name of the <Metadata> element currently open, or ""
#
# Returns the new object blessed into $class.
sub new {
    my ($class, @args) = @_;

    # Deliberately a package global, not a lexical: the XML parser callbacks
    # (StartTag, EndTag, Text) are plain subs that read $self directly.
    $self = HTMLPlug->new($class, @args);

    $self->{'no_metadata'}      = 1;
    $self->{'nolinks'}          = 1;
    $self->{'section_metadata'} = {};
    $self->{'Page'}             = "TopLevel";
    $self->{'metadata_name'}    = "";

    bless $self, $class;
    return $self;
}
52
# Build one document from an article directory described by a meta.xml file.
#
# The document gets:
#   - top-level metadata taken from the meta.xml file,
#   - the article PDF "<dir>/<dirname>-all.pdf" associated as "article.pdf",
#   - one child section per *.htm / *.html file in the same directory, each
#     with its text, its page PDF (same name, ".pdf" extension) associated
#     as "page<N>.pdf", and a Title of <N>.
#
# Arguments (after $self):
#   $pluginfo, $metadata, $maxdocs - accepted but not used here
#   $base_dir, $file               - joined to form the file name when
#                                    $base_dir contains a word character
#   $processor                     - supplies 'OIDtype' and receives the
#                                    finished document via process()
#
# Returns:
#   0     for .pdf / .htm / .html / .jpg / .jpeg files (these are pulled in
#         through the meta.xml of their directory, not handled one by one)
#   undef for any other file whose name does not end in "meta.xml"
#   1     once the document has been handed to $processor
#   NOTE(review): the meaning the caller attaches to 0 vs undef is defined
#   by the plugin framework, not by this file - confirm there.
#
# Dies if the article PDF or a page PDF is missing, if an HTML page yields
# no text, or if the directory cannot be read.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if $filename =~ /\.(pdf|html?|jpe?g)$/i;
    return undef unless $filename =~ /meta\.xml$/;

    # dirname() is called by its full name below, so make sure the module is
    # loaded instead of relying on another module having pulled it in.
    require File::Basename;

    # The Stream-style parser callbacks (StartTag, EndTag, Text) are plain
    # subs that reach the plugin through the package global $self. Point it
    # at this object for the duration of the call so they always update the
    # object being read into.
    local $ProtemixPlug::self = $self;

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    # process the meta.xml file and set top level metadata
    my $parser = XML::Parser->new('Style' => 'Stream');
    $self->{'section_metadata'} = {};
    $self->{'Page'} = "TopLevel";
    $parser->parsefile($filename);
    my $top_metadata = $self->{'section_metadata'}->{'TopLevel'};
    foreach my $key (keys %{$top_metadata}) {
        $doc_obj->add_utf8_metadata($topsection, $key, $top_metadata->{$key});
    }

    my $dir = File::Basename::dirname($filename);

    # associate article level pdf file, named after the directory itself:
    # <dir>/<last component of dir>-all.pdf
    my ($dirname) = $dir =~ /([^\/\\]+)$/;
    my $article_pdf = util::filename_cat($dir, $dirname) . "-all.pdf";
    die "$article_pdf does not exist" unless -e $article_pdf;
    $doc_obj->associate_file($article_pdf, "article.pdf", undef, $topsection);
    # The article PDF belongs to the top section; no page section exists yet.
    $doc_obj->add_utf8_metadata($topsection, "pdf", "article.pdf");

    # read in directory and process individual files
    opendir(my $dirhandle, $dir) || die "cannot read directory $dir: $!";
    my @files = readdir $dirhandle;
    closedir $dirhandle;

    # we rely on the files being named in such a way that they'll be
    # processed in the correct order; readdir itself promises no particular
    # order, so sort by name to get it.
    my $count = 1;
    foreach my $thisfile (sort @files) {
        next unless $thisfile =~ /^(.*?)\.html?$/i;

        # base name of the page; only needed by the disabled block below
        my $filesuf = $1;

        my $htmlfile = util::filename_cat($dir, $thisfile);
        my ($language, $encoding) = $self->textcat_get_language_encoding($htmlfile);

        # read in file ($text will be in utf8)
        my $text = "";
        $self->read_file($htmlfile, $encoding, $language, \$text);
        if (!length ($text)) {
            die "$htmlfile has no text\n";
        }

        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));

        # process HTML file with HTMLPlug
        $self->process_section(\$text, '', $htmlfile, $doc_obj, $cursection);

        # associate PDF page level pdf file; /i so that an upper-case
        # extension (accepted by the match above) is replaced as well
        my $page_pdf = $htmlfile;
        $page_pdf =~ s/\.html?$/.pdf/i;
        die "no PDF file for $htmlfile" unless -e $page_pdf;
        $doc_obj->associate_file($page_pdf, "page$count.pdf", undef, $cursection);
        $doc_obj->add_utf8_metadata($cursection, "pdf", "page$count.pdf");
        $doc_obj->add_utf8_metadata($cursection, "Title", $count);

        # add any section level metadata we have for this page (set from within the meta.xml file)

        # currently commented out as we're not using Class1, Class2, and Class3 metadata yet
        # if (defined ($self->{'section_metadata'}->{$filesuf})) {
        #     foreach my $key (keys %{$self->{'section_metadata'}->{$filesuf}}) {
        #         $doc_obj->add_utf8_metadata ($cursection, $key, $self->{'section_metadata'}->{$filesuf}->{$key});
        #     }
        # }

        $count++;
    }

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
144
145
# XML::Parser Stream-style callback, invoked for each opening tag. The tag's
# attributes are read from the global hash %_ and the plugin object from the
# package global $self.
#
#   <Page filename="...">  - makes that filename the current page
#   <Metadata name="...">  - records the name as the metadata item now being
#                            collected and starts it off as an empty string
#                            under the current page
# Any other element is ignored.
sub StartTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
        $self->{'Page'} = $_{'filename'};

    } elsif ($element eq "Metadata") {
        my $collected = $self->{'section_metadata'};
        my $page      = $self->{'Page'};
        my $name      = $_{'name'};

        # first metadata item seen for this page: create its hash
        $collected->{$page} = {} unless defined $collected->{$page};

        $self->{'metadata_name'} = $name;
        $collected->{$page}->{$name} = "";
    }
}
160
# XML::Parser Stream-style callback, invoked for each closing tag. Works on
# the plugin object held in the package global $self.
#
#   </Page>      - switches the current page back to "TopLevel", so that
#                  metadata following the page is not filed under it
#   </Metadata>  - clears the current metadata name, so that text outside a
#                  <Metadata> element is not collected
# Any other element is ignored.
sub EndTag {
    my ($expat, $element) = @_;

    if ($element eq "Page") {
        # must be an assignment: a string comparison here would leave the
        # finished page selected as the current one
        $self->{'Page'} = "TopLevel";

    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = "";
    }
}
171
# XML::Parser Stream-style callback, invoked with the character data in $_.
# While a metadata name is set, the text is appended to that metadata item
# under the current page; otherwise it is discarded.
sub Text {
    my $name = $self->{'metadata_name'};

    unless ($name eq "") {
        my $page = $self->{'Page'};
        $self->{'section_metadata'}->{$page}->{$name} .= $_;
    }
}
177
178	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: