Context Navigation

source: trunk/protemix/perllib/plugins/ProtemixPlug.pm@ 3190

Last change on this file since 3190 was 3190, checked in by sjboddie, 22 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 6.1 KB

Rev	Line
[3162]	1	###########################################################################
	2	#
	3	# ProtemixPlug.pm --
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 2002 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	package ProtemixPlug;
	28
	29	use HTMLPlug;
	30	use util;
	31
	32	sub BEGIN {
	33	@ISA = ('HTMLPlug');
	34	}
	35
	36	use XML::Parser;
	37
	38	sub new {
	39	my $class = shift (@_);
	40
	41	# $self must be global for XML parser routines
	42	$self = new HTMLPlug ($class, @_);
	43
	44	$self->{'no_metadata'} = 1;
	45	$self->{'nolinks'} = 1;
	46	$self->{'section_metadata'} = {};
	47	$self->{'Page'} = "TopLevel";
	48	$self->{'metadata_name'} = "";
	49
	50	return bless $self, $class;
	51	}
	52
	53	sub read {
	54	my $self = shift (@_);
	55	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
	56
	57	my $filename = $file;
	58	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
	59
[3186]	60	my $basename = File::Basename::basename($filename);
	61	return 0 if $basename =~ /^\d+(-(all\|\d+))?\.(html?\|pdf)/;
[3162]	62	return undef unless $filename =~ /meta\.xml$/;
	63
	64	# create a new document
	65	my $doc_obj = new doc ($filename, "indexed_doc");
	66	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
	67	my $topsection = $doc_obj->get_top_section();
	68
	69	# process the meta.xml file and set top level metadata
	70	my $parser = new XML::Parser('Style' => 'Stream');
	71	$self->{'section_metadata'} = {};
	72	$self->{'Page'} = "TopLevel";
	73	$parser->parsefile($filename);
	74	foreach my $key (keys %{$self->{'section_metadata'}->{'TopLevel'}}) {
	75	$doc_obj->add_utf8_metadata ($topsection, $key, $self->{'section_metadata'}->{'TopLevel'}->{$key});
	76	}
	77
	78	my $dir = File::Basename::dirname($filename);
	79
[3186]	80	my $outhandle = $self->{'outhandle'};
	81	print $outhandle "ProtemixPlug: processing $dir\n";
	82
	83
[3168]	84	# associate article level pdf file
	85	my ($pdffile) = $dir =~ /([^\/\\]+)$/;
	86	$pdffile = &util::filename_cat($dir, $pdffile);
	87	$pdffile .= "-all.pdf";
[3190]	88	if (-e $pdffile) {
	89	$doc_obj->associate_file($pdffile, "article.pdf", undef, $topsection);
	90	$doc_obj->add_utf8_metadata ($cursection, "pdf", "article.pdf");
	91	} else {
	92	print STDERR "ProtemixPlug: Error: $pdffile does not exist\n";
	93	}
[3168]	94
	95	# read in directory and process individual files
[3162]	96	opendir(DIR, $dir) \|\| die;
	97	my @files = readdir DIR;
	98	closedir DIR;
	99
	100
	101	# we rely on the files being named in such a way that they'll be read
	102	# in the correct order
	103	my $count = 1;
	104	foreach my $thisfile (@files) {
	105	if ($thisfile =~ /^(.*?)\.html?$/i) {
	106	my $filesuf = $1;
	107	$thisfile = &util::filename_cat($dir, $thisfile);
	108	my ($language, $encoding) = $self->textcat_get_language_encoding ($thisfile);
	109	# read in file ($text will be in utf8)
	110	my $text = "";
	111	$self->read_file ($thisfile, $encoding, $language, \$text);
	112	if (!length ($text)) {
[3188]	113	print STDERR "ProtemixPlug: Warning: $thisfile has no text\n";
[3162]	114	}
	115
	116	my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
	117
	118	# process HTML file with HTMLPlug
	119	$self->process_section (\$text, '', $thisfile, $doc_obj, $cursection);
	120
[3168]	121	# associate PDF page level pdf file
[3162]	122	my $pdffile = $thisfile;
	123	$pdffile =~ s/\.html?$/\.pdf/;
[3189]	124	if (-e $pdffile) {
	125	$doc_obj->associate_file($pdffile, "page$count.pdf", undef, $cursection);
	126	$doc_obj->add_utf8_metadata ($cursection, "pdf", "page$count.pdf");
	127	$doc_obj->add_utf8_metadata ($cursection, "Title", $count);
	128	} else {
	129	print STDERR "ProtemixPlug: Warning: no pdf file for $thisfile\n";
	130	}
[3162]	131
	132	# add any section level metadata we have for this page (set from within the meta.xml file)
	133
[3177]	134	# currently commented out as we're not using Class1, Class2, and Class3 metadata yet
	135	# if (defined ($self->{'section_metadata'}->{$filesuf})) {
	136	# foreach my $key (keys %{$self->{'section_metadata'}->{$filesuf}}) {
	137	# $doc_obj->add_utf8_metadata ($cursection, $key, $self->{'section_metadata'}->{$filesuf}->{$key});
	138	# }
	139	# }
	140
[3162]	141	$count ++;
	142	}
	143	}
	144
	145	# add an OID
	146	$doc_obj->set_OID();
	147
	148	# process the document
	149	$processor->process($doc_obj);
	150
	151	$self->{'num_processed'} ++;
	152
	153	return 1; # processed the file
	154	}
	155
	156
	157	sub StartTag {
	158	my ($expat, $element) = @_;
	159
	160	if ($element eq "Page") {
	161	$self->{'Page'} = $_{'filename'};
	162
	163	} elsif ($element eq "Metadata") {
	164	if (!defined $self->{'section_metadata'}->{$self->{'Page'}}) {
	165	$self->{'section_metadata'}->{$self->{'Page'}} = {};
	166	}
	167	$self->{'metadata_name'} = $_{'name'};
	168	$self->{'section_metadata'}->{$self->{'Page'}}->{$_{'name'}} = "";
	169	}
	170	}
	171
	172	sub EndTag {
	173	my ($expat, $element) = @_;
	174
	175	if ($element eq "Page") {
	176	$self->{'Page'} eq "TopLevel";
	177
	178	} elsif ($element eq "Metadata") {
	179	$self->{'metadata_name'} = "";
	180	}
	181	}
	182
	183	sub Text {
	184	if ($self->{'metadata_name'} ne "") {
	185	$self->{'section_metadata'}->{$self->{'Page'}}->{$self->{'metadata_name'}} .= $_;
	186	}
	187	}
	188
# Override of read_file that only delegates to BasPlug::read_file.
#
# Character entities are deliberately NOT converted for now, as mgpp appears
# to be broken. The conversion that is switched off is kept for reference:
#
#   # Convert entities to their UTF8 equivalents
#   $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go;
#   $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
#   $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go;
#
# Arguments: $filename, $encoding, $language and $textref (a reference to
# the scalar that receives the text) are handed on unchanged.
sub read_file {
    my $plugin = shift (@_);
    my ($path, $enc, $lang, $text_ref) = @_;

    &BasPlug::read_file ($plugin, $path, $enc, $lang, $text_ref);
}
	200
	201
[3162]	202	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: