source: gsdl/trunk/perllib/plugins/BookPlugin.pm@ 17739

Last change on this file since 17739 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1###########################################################################
2#
3# BookPlugin.pm (formerly called HBSPlug) -- plugin for processing simple
4# html (or text) books
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# creates multi-level document from document containing
29# <<TOC>> level tags. Metadata for each section is taken from any
30# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31# sets Title metadata.
32
33# Everything else between TOC tags is treated as simple html (i.e. no
34# processing of html links or any other HTMLPlug type stuff is done).
35
36# expects input files to have a .hb file extension by default (this can be
37# changed by adding a -process_exp option)
38
39# a file with the same name as the hb file but a .jpg extension is
40# taken as the cover image (jpg files are blocked by this plugin)
41
42# BookPlugin is a simplification (and extension) of the HBPlug used
43# by the Humanity Library collections. BookPlugin is faster as it expects
44# the input files to be cleaner (The input to the HDL collections
45# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46# tags to specify images, and simply takes all text between <<TOC>>
47# tags and start of text to be Title metadata). If you're marking up
48# documents to be displayed in the same way as the HDL collections,
49# use this plugin instead of HBPlug.
50
51package BookPlugin;
52
53use AutoExtractMetadata;
54use util;
55use strict;
56no strict 'refs'; # allow filehandles to be variables and viceversa
57
# Set up inheritance at compile time: BookPlugin derives from
# AutoExtractMetadata.
BEGIN {
    @BookPlugin::ISA = qw(AutoExtractMetadata);
}
61
# Description of the -process_exp argument; its default comes from
# get_default_process_exp().
my $process_exp_argument = {
    'name' => "process_exp",
    'desc' => "{BasePlugin.process_exp}",
    'type' => "regexp",
    'reqd' => "no",
    'deft' => get_default_process_exp(),
};

# Description of the -block_exp argument; its default comes from
# get_default_block_exp().
my $block_exp_argument = {
    'name' => "block_exp",
    'desc' => "{BasePlugin.block_exp}",
    'type' => "regexp",
    'reqd' => "no",
    'deft' => get_default_block_exp(),
};

# Argument descriptions contributed by this plugin; new() appends them to
# the shared "ArgList".
my $arguments = [ $process_exp_argument, $block_exp_argument ];

# Option block describing this plugin; new() appends it to the shared
# "OptList".
my $options = {
    'name'     => "BookPlugin",
    'desc'     => "{BookPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
79
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the name of this class is appended to it
#   $inputargs       - passed through unchanged to the AutoExtractMetadata
#                      constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option block
#                      to its "OptList" entry
#
# Returns the object built by AutoExtractMetadata, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use the explicit Class->method form rather than the indirect-object
    # form ("new Class(...)"), which Perl has to parse heuristically.
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
92
# Returns the default block_exp regular expression (as a string): file
# names ending in ".jpg".
sub get_default_block_exp {
    my ($self) = @_;

    return '\.jpg$';
}
98
# Returns the default process_exp regular expression (as a string): file
# names ending in ".hb", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.hb$';
}
104
# Do plugin-specific processing of doc_obj.
#
# Splits the text into sections at <<TOCn>> lines and adds each section to
# $doc_obj. The rest of a <<TOCn>> line supplies that section's metadata as
# <<Key>>value<</Key>> pairs; the text up to the next <<TOC tag (or the end
# of the text) becomes the section text.
#
# Arguments (after $self):
#   $textref   - reference to the document text; it is consumed (modified in
#                place) as sections are peeled off the front
#   $pluginfo  - not used here
#   $base_dir  - joined with $file to form the full file name
#   $file      - see $base_dir
#   $metadata  - not used here
#   $doc_obj   - document object that sections, metadata, text and
#                associated files are added to
#   $gli       - when true, a <Processing ...> marker is printed to STDERR
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='BookPlugin'>\n" if ($gli);
    print $outhandle "BookPlugin: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # Add FileFormat as the metadata
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Book");

    my $filename = &util::filename_cat($base_dir, $file);

    # directory part of the file name (everything up to the last / or \)
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same name as the input file, .jpg extension
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel the next section off the front of the text, leaving the
        # following "<<TOC" (if any) in place for the next iteration.
        my $peeled =
            ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is);
        if (!$peeled) {
            # Nothing was consumed, so looping again could never terminate
            # (and the capture variables would be stale). Give up on
            # whatever text is left.
            print $outhandle "BookPlugin: WARNING - malformed or missing " .
                "<<TOCn>> tag in $filename; remaining text ignored\n";
            last;
        }
        my ($toclevel, $metaline, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: consume <<Key>>value<</Key>> pairs one by one
        while ($metaline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                # (lower case with a leading capital)
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst ($metakey);
                $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links; replace_image_links takes $self as its
        # first argument, so it must be invoked as a method
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)}
                         {$self->replace_image_links($absdir, $doc_obj, $1, $2, $3)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
197
# Rewrite a single <img> tag so that it refers to an image associated with
# the document.
#
# Arguments (after $self):
#   $dir     - directory the image is looked up in
#   $doc_obj - document object the image file is associated with
#   $front   - the part of the tag before the link
#   $link    - the link itself
#   $back    - the part of the tag after the link
#
# The image is looked for first under $dir plus the link with everything
# before its first "/" removed, then under $dir plus the bare file name.
# Returns the tag with its link replaced by "_httpdocimg_/<file name>" when
# the image is found; otherwise prints a warning to the output handle and
# returns the empty string.
sub replace_image_links {
    my ($self, $dir, $doc_obj, $front, $link, $back) = @_;
    my $outhandle = $self->{'outhandle'};

    # drop the first "//" in the link
    $link =~ s/\/\///;

    # image type is whatever follows the last "." (lower-cased, jpg -> jpeg)
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/[A-Z]/[a-z]/;
    $imagetype = "jpeg" if $imagetype eq "jpg";
    unless ($imagetype =~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlugin: Warning - unknown image type ($imagetype)\n";
    }

    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;
    my $mimetype = "image/$imagetype";
    my $rewritten = "${front}_httpdocimg_/${imagefile}${back}";

    # first choice: the path part of the link
    my $error;
    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        my $bypath = &util::filename_cat ($dir, $imagepath);
        if (-e $bypath) {
            $doc_obj->associate_file ($bypath, $imagefile, $mimetype);
            return $rewritten;
        }
        $error = "BookPlugin: Warning - couldn't find image file $imagefile in either $bypath or";
    }

    # second choice: the bare file name directly inside $dir
    my $byname = &util::filename_cat ($dir, $imagefile);
    if (-e $byname) {
        $doc_obj->associate_file ($byname, $imagefile, $mimetype);
        return $rewritten;
    }

    # not found anywhere: report every location that was tried
    if (defined $error) {
        print $outhandle "$error $byname\n";
    }
    else {
        print $outhandle "BookPlugin: Warning - couldn't find image file $imagefile in $byname\n";
    }

    return "";
}
245
2461;
Note: See TracBrowser for help on using the repository browser.