Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BookPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:keywords set to `Author Date Id Revision`
File size: 7.3 KB

Rev	Line
[2356]	1	###########################################################################
	2	#
[15872]	3	# BookPlugin.pm (formerly called HBSPlug) -- plugin for processing simple
[2356]	4	# html (or text) books
	5	#
	6	# A component of the Greenstone digital library software
	7	# from the New Zealand Digital Library Project at the
	8	# University of Waikato, New Zealand.
	9	#
	10	# Copyright (C) 1999 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28	# creates multi-level document from document containing
	29	# <<TOC>> level tags. Metadata for each section is taken from any
	30	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
	31	# sets Title metadata.
	32
	33	# Everything else between TOC tags is treated as simple html (i.e. no
	34	# processing of html links or any other HTMLPlug type stuff is done).
	35
	36	# expects input files to have a .hb file extension by default (this can be
	37	# changed by adding a -process_exp option)
	38
	39	# a file with the same name as the hb file but a .jpg extension is
	40	# taken as the cover image (jpg files are blocked by this plugin)
	41
[15872]	42	# BookPlugin is a simplification (and extension) of the HBPlug used
	43	# by the Humanity Library collections. BookPlugin is faster as it expects
[2356]	44	# the input files to be cleaner (The input to the HDL collections
	45	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
	46	# tags to specify images, and simply takes all text between <<TOC>>
	47	# tags and start of text to be Title metadata). If you're marking up
	48	# documents to be displayed in the same way as the HDL collections,
	49	# use this plugin instead of HBPlug.
	50
[15872]	51	package BookPlugin;
[3540]	52
[15872]	53	use AutoExtractMetadata;
[2356]	54	use util;
[10254]	55	use strict;
	56	no strict 'refs'; # allow filehandles to be variables and viceversa
[2356]	57
	58	sub BEGIN {
[15872]	59	@BookPlugin::ISA = ('AutoExtractMetadata');
[2356]	60	}
	61
# Plugin-specific arguments. Both are regular expressions whose default
# values come from the get_default_*_exp subs of this package.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'block_exp',
        'desc' => '{BaseImporter.block_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_block_exp(),
    },
];

# Description of this plugin; 'args' refers to the argument list above.
my $options = {
    'name'     => 'BookPlugin',
    'desc'     => '{BookPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
[3540]	79
# Constructor.
#
# Arguments:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option description to its
#                      "OptList" entry
#
# Returns the object built by AutoExtractMetadata, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Explicit class-method call rather than the indirect object form
    # ("new AutoExtractMetadata(...)"), whose parse is ambiguous in a
    # package that defines its own "new".
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
	92
	93	sub get_default_block_exp {
	94	my $self = shift (@_);
	95
	96	return q^\.jpg$^;
	97	}
	98
	99	sub get_default_process_exp {
	100	my $self = shift (@_);
	101
	102	return q^(?i)\.hb$^;
	103	}
	104
	105	# do plugin specific processing of doc_obj
	106	sub process {
	107	my $self = shift (@_);
[6332]	108	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[2356]	109	my $outhandle = $self->{'outhandle'};
	110
[15872]	111	print STDERR "<Processing n='$file' p='BookPlugin'>\n" if ($gli);
	112	print $outhandle "BookPlugin: processing $file\n"
[2356]	113	if $self->{'verbosity'} > 1;
	114
	115	my $cursection = $doc_obj->get_top_section();
	116
[8121]	117	# Add FileFormat as the metadata
	118	$doc_obj->add_metadata($doc_obj->get_top_section(),"FileFormat", "Book");
	119
[2356]	120	my $filename = &util::filename_cat($base_dir, $file);
	121	my $absdir = $filename;
	122	$absdir =~ s/[^\/\\]*$//;
	123
	124	# add the cover image
	125	my $coverimage = $filename;
	126	$coverimage =~ s/\.[^\.]*$/\.jpg/i;
	127	$doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
	128
	129	my $title = "";
	130
	131	# remove any leading rubbish
	132	$$textref =~ s/^.*?(<<TOC)/$1/ios;
	133
	134	my $curtoclevel = 1;
	135	my $firstsection = 1;
	136	my $toccount = 0;
	137	while ($$textref =~ /\w/) {
	138	$$textref =~ s/^<<TOC(\d+)>>([^\n])\n(.?)(<<TOC\|\Z)/$4/ios;
	139	my $toclevel = $1;
	140	my $metadata = $2;
	141	my $sectiontext = $3;
	142
	143	if ($toclevel == 2) {
	144	$toccount ++;
	145	}
	146
	147	# close any sections below the current level and
	148	# create a new section (special case for the firstsection)
	149	while (($curtoclevel > $toclevel) \|\|
	150	(!$firstsection && $curtoclevel == $toclevel)) {
	151	$cursection = $doc_obj->get_parent_section ($cursection);
	152	$curtoclevel--;
	153	}
	154	if ($curtoclevel+1 < $toclevel) {
	155	print $outhandle "WARNING - jump in toc levels in $filename " .
	156	"from $curtoclevel to $toclevel\n";
	157	}
	158	while ($curtoclevel < $toclevel) {
	159	$curtoclevel++;
	160	$cursection =
	161	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
	162	}
	163
	164	# sort out metadata
	165	while ($metadata =~ s/^.?<<([^>])>>(.?)<<[^>]>>//) {
	166	my $metakey = $1;
	167	my $metavalue = $2;
	168
	169	if ($metavalue ne "" && $metakey ne "") {
	170	# make sure key fits in with gsdl naming scheme
	171	$metakey =~ tr/[A-Z]/[a-z]/;
	172	$metakey = ucfirst ($metakey);
	173	$doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
	174	}
	175	}
	176
	177	# remove header rubbish
	178	$sectiontext =~ s/^.?<body[^>]>//ios;
	179
	180	# and any other unwanted tags
	181	$sectiontext =~ s/<(\/p\|\/html\|\/body)>//isg;
	182
	183	# fix up the image links
	184	$sectiontext =~ s/(<img[^>]?src\s=\s\"?)([^\">]+)(\"?[^>]>)/
	185	&replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;
	186
	187	# add the text
	188	$doc_obj->add_utf8_text($cursection, $sectiontext);
	189
	190	$firstsection = 0;
	191
	192	$$textref =~ s/^\s+//s;
	193	}
	194
	195	return 1;
	196	}
	197
	198	sub replace_image_links {
[10254]	199	my $self = shift (@_);
[2356]	200	my ($dir, $doc_obj, $front, $link, $back) = @_;
	201	my $outhandle = $self->{'outhandle'};
	202
	203	my ($filename, $error);
	204	my $foundimage = 0;
	205
	206	$link =~ s/\/\///;
	207	my ($imagetype) = $link =~ /([^\.]*)$/;
	208	$imagetype =~ tr/[A-Z]/[a-z]/;
	209	if ($imagetype eq "jpg") {$imagetype = "jpeg";}
	210	if ($imagetype !~ /^(jpeg\|gif\|png)$/) {
[15872]	211	print $outhandle "BookPlugin: Warning - unknown image type ($imagetype)\n";
[2356]	212	}
	213	my ($imagefile) = $link =~ /([^\/]*)$/;
	214	my ($imagepath) = $link =~ /^[^\/](.)$/;
	215
	216	if (defined $imagepath && $imagepath =~ /\w/) {
	217	# relative link
	218	$filename = &util::filename_cat ($dir, $imagepath);
	219	if (-e $filename) {
	220	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
	221	$foundimage = 1;
	222	} else {
[15872]	223	$error = "BookPlugin: Warning - couldn't find image file $imagefile in either $filename or";
[2356]	224	}
	225	}
	226
	227	if (!$foundimage) {
	228	$filename = &util::filename_cat ($dir, $imagefile);
	229	if (-e $filename) {
	230	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
	231	$foundimage = 1;
	232	} elsif (defined $error) {
	233	print $outhandle "$error $filename\n";
	234	} else {
[15872]	235	print $outhandle "BookPlugin: Warning - couldn't find image file $imagefile in $filename\n";
[2356]	236	}
	237	}
	238
	239	if ($foundimage) {
	240	return "${front}_httpdocimg_/${imagefile}${back}";
	241	} else {
	242	return "";
	243	}
	244	}
	245
	246	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: