source: trunk/gsdl/perllib/plugins/BookPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1###########################################################################
2#
3# BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
4# html (or text) books
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# creates multi-level document from document containing
29# <<TOC>> level tags. Metadata for each section is taken from any
30# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31# sets Title metadata.
32
33# Everything else between TOC tags is treated as simple html (i.e. no
34# processing of html links or any other HTMLPlug type stuff is done).
35
36# expects input files to have a .hb file extension by default (this can be
37# changed by adding a -process_exp option)
38
39# a file with the same name as the hb file but a .jpg extension is
40# taken as the cover image (jpg files are blocked by this plugin)
41
42# BookPlug is a simplification (and extension) of the HBPlug used
43# by the Humanity Library collections. BookPlug is faster as it expects
44# the input files to be cleaner (The input to the HDL collections
45# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46# tags to specify images, and simply takes all text between <<TOC>>
47# tags and start of text to be Title metadata). If you're marking up
48# documents to be displayed in the same way as the HDL collections,
49# use this plugin instead of HBPlug.
50
51# 12/05/02 Added usage datastructure - John Thompson
52
53package BookPlug;
54
55use BasPlug;
56use util;
57use strict;
58no strict 'refs'; # allow filehandles to be variables and viceversa
59
# Register BasPlug as the parent class at compile time, so that method
# resolution for BookPlug objects falls back to BasPlug.
BEGIN {
    @BookPlug::ISA = qw(BasPlug);
}
63
# Arguments understood by this plugin. Both entries are optional regular
# expressions whose defaults come from the get_default_*_exp subs below.
my $arguments = [
    {
        name => "process_exp",
        desc => "{BasPlug.process_exp}",
        type => "regexp",
        reqd => "no",
        deft => get_default_process_exp(),
    },
    {
        name => "block_exp",
        desc => "{BasPlug.block_exp}",
        type => "regexp",
        reqd => "no",
        deft => get_default_block_exp(),
    },
];

# Description record for this plugin; it carries the argument list above.
my $options = {
    name     => "BookPlug",
    desc     => "{BookPlug.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
81
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to its
#                      "OptList" entry
#
# Returns the object built by BasPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # $arguments and $options are file-level lexicals that are always set,
    # so both lists are always extended. If the caller passed no hash, these
    # pushes autovivify one, which is why $hashArgOptLists is always defined
    # by the time the parent constructor is called.
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method call: the indirect object form "new BasPlug(...)"
    # depends on parser heuristics and is ambiguous inside a package that
    # defines its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
94
# Default block_exp: a regular expression (as a string) matching file names
# that end in ".jpg". The invocant, if any, is not consulted.
sub get_default_block_exp {
    my ($self) = @_;

    return '\.jpg$';
}
100
# Default process_exp: a regular expression (as a string) matching file names
# that end in ".hb", case-insensitively. The invocant, if any, is not
# consulted.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.hb$';
}
106
# do plugin specific processing of doc_obj
#
# Splits the text referenced by $textref into nested sections, one per
# <<TOCn>> tag, and adds them to $doc_obj. Tag pairs of the form
# <<Key>>value<</Key>> on the same line as a <<TOCn>> tag become metadata of
# that section. The text is consumed: $$textref is modified in place.
#
# Arguments (after $self):
#   $textref   - reference to the document text
#   $pluginfo  - not used by this sub
#   $base_dir  - joined with $file to form the full file name
#   $file      - name of the file being processed
#   $metadata  - not used by this sub
#   $doc_obj   - document object that receives sections, metadata and text
#   $gli       - when true, a <Processing ...> line is written to STDERR
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='BookPlug'>\n" if ($gli);
    print $outhandle "BookPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # Add FileFormat as the metadata
    $doc_obj->add_metadata($doc_obj->get_top_section(),"FileFormat", "Book");

    my $filename = &util::filename_cat($base_dir, $file);
    # directory part of the file name (everything up to the last / or \)
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image (same name as the input file, .jpg extension)
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/ios;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Strip one "<<TOCn>>rest-of-line\nsection text" chunk off the front,
        # leaving the next "<<TOC" (if any) at the start of the text.
        my $matched =
            ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios);
        if (!$matched) {
            # Nothing was consumed, so looping again could never terminate
            # (and $1..$3 would be stale values from an earlier match).
            print $outhandle "BookPlug: WARNING - could not find a valid " .
                "<<TOCn>> line in $filename, ignoring remaining text\n";
            last;
        }
        my ($toclevel, $tocline, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: each <<Key>>value<<...>> group on the TOC line
        while ($tocline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                # (first letter upper case, remaining A-Z lower case)
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst ($metakey);
                $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//ios;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links; replace_image_links shifts $self off its
        # argument list, so it has to be invoked as a method
        $sectiontext =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/
            $self->replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
199
# Rewrites one <img> tag so that its src points at an associated file.
#
# The first argument is taken as $self (its 'outhandle' entry is used for
# warnings), so this sub expects to be invoked as a method.
#
# Arguments (after $self):
#   $dir     - directory in which the image is looked up
#   $doc_obj - document object the image file gets associated with
#   $front   - text of the tag before the link
#   $link    - the link itself
#   $back    - text of the tag after the link
#
# The image is looked for first at $dir plus the link's path (everything
# from its first "/" on), then at $dir plus the bare file name. Returns the
# rewritten tag ("<front>_httpdocimg_/<file><back>") when the image exists,
# or the empty string (dropping the tag) when it does not.
sub replace_image_links {
    my ($self, $dir, $doc_obj, $front, $link, $back) = @_;
    my $outhandle = $self->{'outhandle'};

    # drop the first "//" found in the link
    $link =~ s/\/\///;

    # image type is the text after the last "."; "jpg" is reported as "jpeg"
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    $imagetype = "jpeg" if $imagetype eq "jpg";
    unless ($imagetype =~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }

    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    my $filename;
    my $error;

    # first choice: the link taken relative to $dir
    if (defined $imagepath && $imagepath =~ /\w/) {
        $filename = &util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            return "${front}_httpdocimg_/${imagefile}${back}";
        }
        # hold the message back until the second location has been tried
        $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
    }

    # second choice: the bare file name directly inside $dir
    $filename = &util::filename_cat ($dir, $imagefile);
    if (-e $filename) {
        $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
        return "${front}_httpdocimg_/${imagefile}${back}";
    }

    if (defined $error) {
        print $outhandle "$error $filename\n";
    }
    else {
        print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
    }

    return "";
}
247
2481;
Note: See TracBrowser for help on using the repository browser.