Context Navigation

source: gsdl/trunk/perllib/plugins/BookPlug.pm@ 15537

Last change on this file since 15537 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.4 KB

Line
1	###########################################################################
2	#
3	# BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
4	# html (or text) books
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# creates multi-level document from document containing
29	# <<TOC>> level tags. Metadata for each section is taken from any
30	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31	# sets Title metadata.
32
33	# Everything else between TOC tags is treated as simple html (i.e. no
34	# processing of html links or any other HTMLPlug type stuff is done).
35
36	# expects input files to have a .hb file extension by default (this can be
37	# changed by adding a -process_exp option)
38
39	# a file with the same name as the hb file but a .jpg extension is
40	# taken as the cover image (jpg files are blocked by this plugin)
41
42	# BookPlug is a simplification (and extension) of the HBPlug used
43	# by the Humanity Library collections. BookPlug is faster as it expects
44	# the input files to be cleaner (The input to the HDL collections
45	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46	# tags to specify images, and simply takes all text between <<TOC>>
47	# tags and start of text to be Title metadata). If you're marking up
48	# documents to be displayed in the same way as the HDL collections,
49	# use this plugin instead of HBPlug.
50
51	# 12/05/02 Added usage datastructure - John Thompson
52
53	package BookPlug;
54
55	use BasPlug;
56	use util;
57	use strict;
58	no strict 'refs'; # allow filehandles to be variables and viceversa
59
# Establish the inheritance chain at compile time: BookPlug is a BasPlug.
BEGIN {
    @BookPlug::ISA = qw(BasPlug);
}
63
# Descriptions of the plugin arguments this plugin declares. Each default
# value is obtained from the corresponding get_default_*_exp() sub below.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_block_exp(),
    },
];

# Description of the plugin itself; its 'args' entry is the list above.
my $options = {
    'name'     => "BookPlug",
    'desc'     => "{BookPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
81
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist       - array ref; the class name is appended to it
#   $inputargs        - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists  - hash ref; this plugin's argument descriptions are
#                       appended to its "ArgList" entry and its option
#                       description to its "OptList" entry
#
# Returns the object built by BasPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit class-method call rather than the indirect object form
    # "new BasPlug(...)": this package defines its own new(), which is exactly
    # the situation in which the indirect form is ambiguous to the parser.
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
94
# Returns the default block_exp pattern (as a string): a name ending in ".jpg".
sub get_default_block_exp {
    my ($self) = @_;

    my $jpg_suffix = '\.jpg$';
    return $jpg_suffix;
}
100
# Returns the default process_exp pattern (as a string): a name ending in
# ".hb", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    my $hb_suffix = '(?i)\.hb$';
    return $hb_suffix;
}
106
# Do the plugin-specific processing of doc_obj.
#
# Splits the text referenced by $textref at its <<TOCn>> lines and builds a
# matching hierarchy of sections in $doc_obj. For each <<TOCn>> line:
#   - n is the nesting level of the section that the line starts;
#   - any <<Key>>value<</Key>> pairs on the rest of that line become metadata
#     of the section (the key is lower-cased, then its first letter is
#     upper-cased);
#   - everything up to the next <<TOC tag (or the end of the text) becomes the
#     section text, after stripping everything up to an opening <body> tag,
#     removing closing </p>, </html> and </body> tags, and rewriting <img>
#     tags through replace_image_links().
#
# Arguments (after $self):
#   $textref   - reference to the document text; consumed (modified in place)
#   $pluginfo  - not used here
#   $base_dir  - joined with $file to form the full filename
#   $file      - see $base_dir
#   $metadata  - not used here
#   $doc_obj   - document object that receives sections, metadata and text
#   $gli       - when true, a "<Processing ...>" line is printed to STDERR
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='BookPlug'>\n" if ($gli);
    print $outhandle "BookPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # Add FileFormat as the metadata
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Book");

    # $absdir is $filename with its final path component removed
    my $filename = util::filename_cat($base_dir, $file);
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same name as the input file, but with its last
    # extension replaced by .jpg
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel the leading section off the text, leaving the start of the
        # next <<TOC tag (if any) in place for the following iteration.
        # If the text does not start with a well-formed <<TOCn>> line the
        # substitution changes nothing, so stop instead of looping forever
        # on stale capture variables.
        unless ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is) {
            print $outhandle "BookPlug: Warning - ignoring text in $filename " .
                "that does not start with a <<TOC>> tag\n";
            last;
        }
        my $toclevel = $1;
        my $tagline = $2;       # rest of the <<TOCn>> line: metadata tags
        my $sectiontext = $3;

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel + 1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: consume one <<Key>>value<</Key>> pair at a time
        while ($tagline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst($metakey);
                $doc_obj->add_utf8_metadata($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(?:\/p|\/html|\/body)>//isg;

        # fix up the image links. replace_image_links() takes $self as its
        # first argument, so it has to be invoked as a method; a plain
        # function call would shift every argument along by one.
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*"?)([^">]+)("?[^>]*>)}
                         {$self->replace_image_links($absdir, $doc_obj, $1, $2, $3)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
199
# Rewrites one <img> tag so that its source refers to a file associated with
# the document.
#
# Must be invoked as a method ($self->replace_image_links(...)): the first
# argument is taken as $self and supplies the 'outhandle' used for warnings.
#
# Arguments (after $self):
#   $dir      - directory in which the image is looked for
#   $doc_obj  - document object the image file gets associated with
#   $front    - text of the tag before the link
#   $link     - the link (image source) found in the tag
#   $back     - text of the tag after the link
#
# The image is looked for first at $dir joined with the part of the link that
# follows its leading run of non-slash characters (only when that part
# contains a word character), then at $dir joined with the bare file name.
#
# Returns "${front}_httpdocimg_/<file name>${back}" when the image file was
# found (and associated with $doc_obj), or "" when it was not, in which case a
# warning is printed to the plugin's outhandle.
sub replace_image_links {
    my $self = shift (@_);
    my ($dir, $doc_obj, $front, $link, $back) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" found in the link
    $link =~ s/\/\///;

    # the image type is whatever follows the last "." of the link
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    if ($imagetype eq "jpg") {$imagetype = "jpeg";}
    if ($imagetype !~ /^(?:jpeg|gif|png)$/) {
        print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = util::filename_cat($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # completed below with the second location that gets tried
            $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        # fall back to the bare file name directly inside $dir
        $filename = util::filename_cat($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print $outhandle "$error $filename\n";
        } else {
            print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    if ($foundimage) {
        return "${front}_httpdocimg_/${imagefile}${back}";
    } else {
        return "";
    }
}
247
248	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: