source: trunk/gsdl/perllib/plugins/BookPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
4# html (or text) books
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# creates multi-level document from document containing
29# <<TOC>> level tags. Metadata for each section is taken from any
30# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31# sets Title metadata.
32
33# Everything else between TOC tags is treated as simple html (i.e. no
34# processing of html links or any other HTMLPlug type stuff is done).
35
36# expects input files to have a .hb file extension by default (this can be
37# changed by adding a -process_exp option)
38
39# a file with the same name as the hb file but a .jpg extension is
40# taken as the cover image (jpg files are blocked by this plugin)
41
42# BookPlug is a simplification (and extension) of the HBPlug used
43# by the Humanity Library collections. BookPlug is faster as it expects
44# the input files to be cleaner (The input to the HDL collections
45# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46# tags to specify images, and simply takes all text between <<TOC>>
47# tags and start of text to be Title metadata). If you're marking up
48# documents to be displayed in the same way as the HDL collections,
49# use this plugin instead of HBPlug.
50
51# 12/05/02 Added usage datastructure - John Thompson
52
53package BookPlug;
54
55use BasPlug;
56use util;
57
# Set up inheritance at compile time: this package derives from BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
61
# Argument descriptions for this plugin. Each entry describes one script
# argument; the default values come from the get_default_*_exp subs of
# this package.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_block_exp(),
    },
];

# Usage/options record for the plugin as a whole; new() appends it to the
# option list of each plugin object it creates.
my $options = {
    'name'     => "BookPlug",
    'desc'     => "{BookPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
79
# Constructor.
#
# Builds the object through the BasPlug constructor, tags it with the
# plugin type, appends this plugin's options record to the object's option
# list, and reblesses the object into $class.
#
# Note that @_ is not shifted, so the BasPlug constructor receives
# "BookPlug" followed by the complete original argument list (including
# the class name).
sub new {
    my ($class) = @_;

    # Direct method call rather than the indirect-object form
    # "new BasPlug (...)"; the arguments passed are exactly the same.
    my $self = BasPlug->new("BookPlug", @_);
    $self->{'plugin_type'} = "BookPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    return bless $self, $class;
}
90
# Returns the default block expression: a regular expression (as a string)
# matching file names that end in ".jpg".
sub get_default_block_exp {
    my ($self) = @_;

    return '\.jpg$';
}
96
# Returns the default process expression: a case-insensitive regular
# expression (as a string) matching file names that end in ".hb".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)\.hb$';
}
102
# do plugin specific processing of doc_obj
#
# Splits the text referenced by $textref into sections at <<TOCn>> tags
# (n is the section level) and adds each section to $doc_obj.
#
# Arguments (after $self):
#   $textref   - reference to the document text; it is consumed (modified
#                in place) as the sections are extracted
#   $pluginfo  - not used by this sub
#   $base_dir, $file - joined with util::filename_cat to form the file name
#   $metadata  - not used by this sub
#   $doc_obj   - document object that receives sections, metadata, text
#                and associated files
#   $gli       - when true, a <Processing ...> line is printed to STDERR
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='BookPlug'>\n" if ($gli);
    print $outhandle "BookPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    my $filename = &util::filename_cat($base_dir, $file);

    # directory part of the file name (everything up to the last / or \)
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same name as the input file but with a .jpg
    # extension
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel the next section off the front of the text: $1 is the toc
        # level, $2 the rest of the <<TOC>> line (the metadata tags) and $3
        # the section text up to the next <<TOC tag or the end of the text.
        #
        # If the text does not start with a well formed <<TOCn>> line the
        # substitution changes nothing; without this check the loop would
        # never terminate.
        unless ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is) {
            print $outhandle "BookPlug: WARNING - missing or malformed " .
                "<<TOC>> tag in $filename, remaining text ignored\n";
            last;
        }
        my ($toclevel, $metaline, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: each <<Key>>value<</Key>> pair on the TOC line
        # becomes one metadata entry of the current section
        while ($metaline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                # (lower case with a leading capital)
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst ($metakey);
                $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links. The output handle is passed as an extra
        # trailing argument so that the helper can report problems on it;
        # a helper that only reads the first five arguments ignores it.
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)}
                         {&replace_image_links($absdir, $doc_obj, $1, $2, $3, $outhandle)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
192
# Rewrites one <img> tag so that its source points at the image as
# associated with the document.
#
# Arguments:
#   $dir       - directory in which the image is looked for
#   $doc_obj   - document object; the image file is associated with it
#   $front     - the part of the tag before the link
#   $link      - the link itself (value of the src attribute)
#   $back      - the part of the tag after the link
#   $outhandle - optional file handle for warnings; STDERR is used when it
#                is not supplied
#
# Returns the rewritten tag ("${front}_httpdocimg_/<image file>${back}")
# when the image file was found, or the empty string (i.e. the tag is
# dropped) when it was not.
sub replace_image_links {
    my ($dir, $doc_obj, $front, $link, $back, $outhandle) = @_;

    # This sub is called as a plain function, so there is no plugin object
    # to take the output handle from; printing to an undefined handle would
    # die, so fall back to STDERR when the caller supplies none.
    $outhandle = \*STDERR unless defined $outhandle;

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" in the link
    $link =~ s/\/\///;

    # the image type is whatever follows the last dot of the link
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    if ($imagetype eq "jpg") {$imagetype = "jpeg";}
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }

    # $imagefile is the last path component of the link, $imagepath is the
    # link with its first path component removed
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = &util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # first half of the message; it is completed below if the
            # second location fails as well
            $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        # try the image file directly inside $dir
        $filename = &util::filename_cat ($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print $outhandle "$error $filename\n";
        } else {
            print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    if ($foundimage) {
        return "${front}_httpdocimg_/${imagefile}${back}";
    } else {
        return "";
    }
}
239
2401;
Note: See TracBrowser for help on using the repository browser.