Context Navigation

source: trunk/gsdl/perllib/plugins/BookPlug.pm@ 4845

Last change on this file since 4845 was 4744, checked in by mdewsnip, 21 years ago
Tidied up the $arguments and $options structures (representing the options of the plugin) in preparation for removing the print_usage() routines.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
4	# html (or text) books
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# creates multi-level document from document containing
29	# <<TOC>> level tags. Metadata for each section is taken from any
30	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31	# sets Title metadata.
32
33	# Everything else between TOC tags is treated as simple html (i.e. no
34	# processing of html links or any other HTMLPlug type stuff is done).
35
36	# expects input files to have a .hb file extension by default (this can be
37	# changed by adding a -process_exp option)
38
39	# a file with the same name as the hb file but a .jpg extension is
40	# taken as the cover image (jpg files are blocked by this plugin)
41
42	# BookPlug is a simplification (and extension) of the HBPlug used
43	# by the Humanity Library collections. BookPlug is faster as it expects
44	# the input files to be cleaner (The input to the HDL collections
45	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46	# tags to specify images, and simply takes all text between <<TOC>>
47	# tags and start of text to be Title metadata). If you're marking up
48	# documents to be displayed in the same way as the HDL collections,
49	# use this plugin instead of HBPlug.
50
51	# 12/05/02 Added usage datastructure - John Thompson
52
53	package BookPlug;
54
55	use BasPlug;
56	use util;
57
# Register BasPlug as the parent class. BEGIN blocks run at compile time,
# so the inheritance chain is set up before any other code in the package
# executes.
BEGIN {
    @ISA = qw(BasPlug);
}
61
# Command-line arguments understood by this plugin. Each entry describes
# one option: its name, a human-readable description, its type, whether it
# is required, and its default value (taken from the get_default_*_exp
# subs of this package).
my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
	'type' => "string",
	'reqd' => "no",
	'deft' => get_default_process_exp() },
      { 'name' => "block_exp",
	'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
	'type' => "string",
	'reqd' => "no",
	'deft' => get_default_block_exp() } ];

# Description of the plugin as a whole. 'args' points at the argument list
# above; the constructor pushes this structure onto the object's
# option_list.
# The 'desc' text previously ran two sentences together with an unclosed
# parenthesis ("-process_exp option a file with ..."); that is repaired here.
my $options = { 'name'     => "BookPlug",
		'desc'     => "Creates multi-level document from document containing <<TOC>> level tags. Metadata for each section is taken from any other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>> sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option). A file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around <<TOC>> tags, uses <<I>> tags to specify images, and simply takes all text between <<TOC>> tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
		'inherits' => "Yes",
		'args'     => $arguments };
78
# Constructor.
#
# Builds the underlying BasPlug object (passing "BookPlug" followed by the
# complete, unmodified argument list, class name included), appends this
# plugin's $options description to the object's option_list, and reblesses
# the object into $class.
#
# The parent constructor is called with explicit arrow syntax rather than
# the indirect-object form "new BasPlug (...)": this package defines its
# own sub named "new", which is exactly the situation in which the
# indirect form can be mis-parsed.
sub new {
    my ($class) = @_;
    my $self = BasPlug->new("BookPlug", @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push (@{$option_list}, $options);

    return bless $self, $class;
}
89
# Default block_exp for this plugin: a regular expression (returned as a
# string) matching filenames that end in ".jpg".
sub get_default_block_exp {
    my $self = shift;

    my $jpg_pattern = '\.jpg$';
    return $jpg_pattern;
}
95
# Default process_exp for this plugin: a regular expression (returned as a
# string) matching filenames that end in ".hb", case-insensitively.
sub get_default_process_exp {
    my $self = shift;

    my $hb_pattern = '(?i)\.hb$';
    return $hb_pattern;
}
101
# Do plugin-specific processing of doc_obj.
#
# Splits the text into sections at <<TOCn>> tags (n is the section depth),
# inserting one section into $doc_obj per tag. Any <<Key>>value<</Key>>
# pairs on the same line as the <<TOCn>> tag are stored as metadata of that
# section; the text up to the next <<TOC tag becomes the section text.
#
# Arguments (after $self):
#   $textref  - reference to the document text; the text is consumed
#               (modified in place) as sections are extracted
#   $pluginfo - not used here
#   $base_dir - joined with $file (via util::filename_cat) to form the
#   $file       full name of the input file
#   $metadata - not used here
#   $doc_obj  - document object that receives sections, metadata, text and
#               associated files
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "BookPlug: processing $file\n"
	if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # directory part of the input file name (trailing separator kept);
    # image links are looked up relative to it
    my $filename = util::filename_cat($base_dir, $file);
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same name as the input file, .jpg extension
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
	# Peel the first section off the front of the text, leaving the
	# start of the next <<TOC tag (if any) in place for the next pass.
	my $matched =
	    ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is);
	if (!$matched) {
	    # Nothing was consumed, so looping again could never make
	    # progress: report the problem and stop rather than spin forever.
	    print $outhandle "BookPlug: WARNING - text in $filename does not " .
		"start with a well-formed <<TOC>> line; remaining text ignored\n";
	    last;
	}
	my ($toclevel, $tocline, $sectiontext) = ($1, $2, $3);

	# close any sections below the current level and
	# create a new section (special case for the firstsection)
	while (($curtoclevel > $toclevel) ||
	       (!$firstsection && $curtoclevel == $toclevel)) {
	    $cursection = $doc_obj->get_parent_section ($cursection);
	    $curtoclevel--;
	}
	if ($curtoclevel+1 < $toclevel) {
	    print $outhandle "WARNING - jump in toc levels in $filename " .
		"from $curtoclevel to $toclevel\n";
	}
	while ($curtoclevel < $toclevel) {
	    $curtoclevel++;
	    $cursection =
		$doc_obj->insert_section($doc_obj->get_end_child($cursection));
	}

	# sort out metadata: consume one <<Key>>value<</Key>> pair per pass
	while ($tocline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
	    my $metakey = $1;
	    my $metavalue = $2;

	    if ($metavalue ne "" && $metakey ne "") {
		# make sure key fits in with gsdl naming scheme
		# (first letter upper case, the rest lower case)
		$metakey =~ tr/A-Z/a-z/;
		$metakey = ucfirst ($metakey);
		$doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
	    }
	}

	# remove header rubbish
	$sectiontext =~ s/^.*?<body[^>]*>//is;

	# and any other unwanted tags
	$sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

	# fix up the image links
	$sectiontext =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/
	    replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;

	# add the text
	$doc_obj->add_utf8_text($cursection, $sectiontext);

	$firstsection = 0;

	$$textref =~ s/^\s+//s;
    }

    return 1;
}
190
# Rewrite one <img> tag so that it points at a file associated with the
# document.
#
# Arguments:
#   $dir     - directory in which to look for the image file
#   $doc_obj - document object; the image is attached to it with
#              associate_file when it is found on disk
#   $front   - the part of the tag before the link (e.g. '<img src="')
#   $link    - the link itself
#   $back    - the remainder of the tag after the link
#
# An optional plugin object may be passed in front of these arguments (as
# happens when the sub is invoked as a method on an object); if it is a
# hash with an 'outhandle' entry, warnings are written there. Otherwise
# warnings go to STDERR. Previously the sub read an undeclared $self, so
# the handle was undefined and printing any warning was a fatal error.
#
# Returns the rewritten tag ("${front}_httpdocimg_/<file>${back}") when the
# image file exists, or the empty string (dropping the tag) when it does not.
sub replace_image_links {
    # $dir is a plain string, so a reference in first position can only be
    # the optional plugin object.
    my $self = ref ($_[0]) ? shift (@_) : undef;
    my ($dir, $doc_obj, $front, $link, $back) = @_;

    my $outhandle = \*STDERR;
    if (defined $self && UNIVERSAL::isa ($self, 'HASH')
	&& defined $self->{'outhandle'}) {
	$outhandle = $self->{'outhandle'};
    }

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" from the link
    $link =~ s/\/\///;

    # image type is whatever follows the last "." (jpg is reported as jpeg)
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    if ($imagetype eq "jpg") {$imagetype = "jpeg";}
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
	print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }

    # $imagefile is the final path component; $imagepath is the link from
    # its first "/" onwards (empty when the link has no "/")
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
	# relative link
	$filename = util::filename_cat ($dir, $imagepath);
	if (-e $filename) {
	    $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
	    $foundimage = 1;
	} else {
	    # remember the first location tried; reported below only if the
	    # second lookup fails too
	    $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
	}
    }

    if (!$foundimage) {
	# fall back to the bare file name directly inside $dir
	$filename = util::filename_cat ($dir, $imagefile);
	if (-e $filename) {
	    $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
	    $foundimage = 1;
	} elsif (defined $error) {
	    print $outhandle "$error $filename\n";
	} else {
	    print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
	}
    }

    if ($foundimage) {
	return "${front}_httpdocimg_/${imagefile}${back}";
    } else {
	return "";
    }
}
237
238	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: