Context Navigation

BookPlugin.pm@ 32215

Last change on this file since 32215 was 31494, checked in by kjdon, 7 years ago
updated text string keys based on new plugin names
Property svn:keywords set to `Author Date Id Revision`
File size: 7.3 KB

Line
1	###########################################################################
2	#
3	# BookPlugin.pm (formerly called HBSPlug) -- plugin for processing simple
4	# html (or text) books
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# creates multi-level document from document containing
29	# <<TOC>> level tags. Metadata for each section is taken from any
30	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31	# sets Title metadata.
32
33	# Everything else between TOC tags is treated as simple html (i.e. no
34	# processing of html links or any other HTMLPlug type stuff is done).
35
36	# expects input files to have a .hb file extension by default (this can be
37	# changed by adding a -process_exp option)
38
39	# a file with the same name as the hb file but a .jpg extension is
40	# taken as the cover image (jpg files are blocked by this plugin)
41
42	# BookPlugin is a simplification (and extension) of the HBPlug used
43	# by the Humanity Library collections. BookPlugin is faster as it expects
44	# the input files to be cleaner (The input to the HDL collections
45	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46	# tags to specify images, and simply takes all text between <<TOC>>
47	# tags and start of text to be Title metadata). If you're marking up
48	# documents to be displayed in the same way as the HDL collections,
49	# use this plugin instead of HBPlug.
50
51	package BookPlugin;
52
53	use AutoExtractMetadata;
54	use util;
55	use strict;
56	no strict 'refs'; # allow filehandles to be variables and viceversa
57
# Establish the inheritance chain at compile time: BookPlugin is a subclass
# of AutoExtractMetadata.
sub BEGIN {
    @BookPlugin::ISA = qw(AutoExtractMetadata);
}
61
# Arguments this plugin declares: process_exp and block_exp, each defaulting
# to the value returned by the matching get_default_* sub in this package.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'block_exp',
        'desc' => '{CommonUtil.block_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_block_exp(),
    },
];

# Option record for the plugin itself; pushed onto the caller's "OptList"
# by new(), alongside $arguments on "ArgList".
my $options = {
    'name'     => 'BookPlugin',
    'desc'     => '{BookPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
79
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by AutoExtractMetadata's constructor, re-blessed
# into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Explicit method-call syntax instead of the indirect-object form
    # "new AutoExtractMetadata(...)", whose parse depends on which subs are
    # already known to the compiler.
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
92
# Default block_exp: a regular expression (as a string) matching file names
# that end in ".jpg".  Any arguments are ignored.
sub get_default_block_exp {
    return '\.jpg$';
}
98
# Default process_exp: a regular expression (as a string) matching file
# names that end in ".hb", case-insensitively.  Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)\.hb$';
}
104
# Do plugin specific processing of doc_obj.
#
# Splits the text referenced by $textref into nested sections, one per
# <<TOCn>> tag, where n is the section depth (level 1 is the top section of
# $doc_obj).  Every <<Key>>value<</Key>> pair on the same line as the TOC tag
# becomes metadata of that section; the key is lower-cased and then given a
# leading capital.  The text between one TOC line and the next is added as
# the section text after removing everything up to a <body ...> tag, removing
# </p>, </html> and </body> tags, and rewriting <img> links through
# replace_image_links.
#
# Parameters:
#   $textref   - reference to the document text; consumed (modified in place)
#   $pluginfo  - not used here
#   $base_dir  - joined with $file (util::filename_cat) to get the file path
#   $file      - see $base_dir; also used in progress messages
#   $metadata  - not used here
#   $doc_obj   - document object that receives sections, metadata and files
#   $gli       - when true, a <Processing ...> line is printed to STDERR
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='BookPlugin'>\n" if ($gli);
    print $outhandle "BookPlugin: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # Add FileFormat as the metadata
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Book");

    my $filename = util::filename_cat($base_dir, $file);

    # directory part of the file path (everything up to the last / or \)
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same path as the source file, extension .jpg
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel one "<<TOCn>>metadata\n text" unit off the front of the text,
        # leaving the next <<TOC (if any) in place for the next iteration.
        my $found_toc =
            ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is);
        if (!$found_toc) {
            # Nothing was consumed, so looping again could never terminate.
            print $outhandle "BookPlugin: Warning - expected a <<TOCn>> tag " .
                "in $filename; remaining text ignored\n";
            last;
        }
        my ($toclevel, $tocmeta, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel + 1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata
        while ($tocmeta =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst($metakey);
                $doc_obj->add_utf8_metadata($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links; replace_image_links takes $self as its
        # first argument, so it has to be invoked as a method
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)}
                         {$self->replace_image_links($absdir, $doc_obj, $1, $2, $3)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
197
# Rewrite one <img ...> tag so that it points at a file associated with the
# document.
#
# Parameters (after $self):
#   $dir     - directory the image is looked for in
#   $doc_obj - document object; associate_file is called on it when the
#              image file exists
#   $front   - the part of the tag before the link
#   $link    - the link itself (the src value)
#   $back    - the part of the tag after the link
#
# The image is looked for first at $dir plus the part of $link that follows
# its first path component, then at $dir plus the bare file name.  Warnings
# are printed to $self->{'outhandle'}.
#
# Returns "${front}_httpdocimg_/<file name>${back}" when the image was found,
# and the empty string (which removes the tag) when it was not.
sub replace_image_links {
    my $self = shift (@_);
    my ($dir, $doc_obj, $front, $link, $back) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" in the link
    $link =~ s/\/\///;

    # image type is the lower-cased text after the last dot; "jpg" is
    # normalised to "jpeg" so it can be used in the mime type below
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    if ($imagetype eq "jpg") {
        $imagetype = "jpeg";
    }
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlugin: Warning - unknown image type ($imagetype)\n";
    }

    # file name is whatever follows the last slash; image path is the link
    # from its first slash onwards
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = util::filename_cat($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # completed below with the second location that is tried
            $error = "BookPlugin: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        $filename = util::filename_cat($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print $outhandle "$error $filename\n";
        } else {
            print $outhandle "BookPlugin: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    if ($foundimage) {
        return "${front}_httpdocimg_/${imagefile}${back}";
    } else {
        return "";
    }
}
245
246	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/BookPlugin.pm@ 32215

Download in other formats: