source: trunk/gsdl/perllib/plugins/BookPlug.pm@ 4845

Last change on this file since 4845 was 4744, checked in by mdewsnip, 21 years ago

Tidied up and structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
4# html (or text) books
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# creates multi-level document from document containing
29# <<TOC>> level tags. Metadata for each section is taken from any
30# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
31# sets Title metadata.
32
33# Everything else between TOC tags is treated as simple html (i.e. no
34# processing of html links or any other HTMLPlug type stuff is done).
35
36# expects input files to have a .hb file extension by default (this can be
37# changed by adding a -process_exp option)
38
39# a file with the same name as the hb file but a .jpg extension is
40# taken as the cover image (jpg files are blocked by this plugin)
41
42# BookPlug is a simplification (and extension) of the HBPlug used
43# by the Humanity Library collections. BookPlug is faster as it expects
44# the input files to be cleaner (The input to the HDL collections
45# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
46# tags to specify images, and simply takes all text between <<TOC>>
47# tags and start of text to be Title metadata). If you're marking up
48# documents to be displayed in the same way as the HDL collections,
49# use this plugin instead of HBPlug.
50
51# 12/05/02 Added usage datastructure - John Thompson
52
53package BookPlug;
54
55use BasPlug;
56use util;
57
# Establish inheritance at compile time: BookPlug is a BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
61
# Descriptions of the command-line arguments this plugin understands.
# The defaults are obtained from the get_default_*_exp() routines defined
# further down in this file.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => "string",
        'reqd' => "no",
        'deft' => get_default_block_exp(),
    },
];

# Description of the plugin itself.  new() appends this structure to the
# option list of the object it constructs.
# The angle brackets in the description are written as &lt; / &gt; entities.
my $options = {
    'name'     => "BookPlug",
    'desc'     => "Creates multi-level document from document containing &lt;&lt;TOC&gt;&gt; level tags. Metadata for each section is taken from any other tags on the same line as the &lt;&lt;TOC&gt;&gt;. e.g. &lt;&lt;Title&gt;&gt;xxxx&lt;&lt;/Title&gt;&gt; sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option). A file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around &lt;&lt;TOC&gt;&gt; tags, uses &lt;&lt;I&gt;&gt; tags to specify images, and simply takes all text between &lt;&lt;TOC&gt;&gt; tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
    'inherits' => "Yes",
    'args'     => $arguments,
};
78
# Constructor.  Builds the underlying BasPlug object, appends this plugin's
# option description to the object's option list (so that arguments are
# inherited properly), and blesses the result into the requested class.
sub new {
    my ($class) = @_;

    # NOTE(review): @_ is not shifted here, so the class name is forwarded
    # to the BasPlug constructor together with the remaining arguments --
    # confirm this is what BasPlug expects.
    my $self = BasPlug->new("BookPlug", @_);

    my $inherited_options = $self->{'option_list'};
    push @{$inherited_options}, $options;

    return bless($self, $class);
}
89
# Returns the default block_exp: a regular expression (as a string) that
# matches filenames ending in ".jpg".  Any arguments are ignored.
sub get_default_block_exp {
    return '\.jpg$';
}
95
# Returns the default process_exp: a regular expression (as a string) that
# matches, case-insensitively, filenames ending in ".hb".  Any arguments
# are ignored.
sub get_default_process_exp {
    return '(?i)\.hb$';
}
101
# Do plugin specific processing of doc_obj.
#
# Splits the text referenced by $textref at its <<TOCn>> tags and mirrors
# that structure as nested sections of $doc_obj.  For every section:
#   - tag pairs on the <<TOCn>> line (e.g. <<Title>>xxx<</Title>>) become
#     metadata of the section; the key is lower-cased with its first
#     letter capitalised;
#   - the text up to the next <<TOC tag becomes the section text, after
#     everything up to and including a <body> tag and any </p>, </html>,
#     </body> tags have been removed and <img> links have been rewritten
#     by replace_image_links().
# A file named like the input file but with a .jpg extension is associated
# with the document as "cover.jpg".
#
# Parameters (after $self):
#   $textref   - reference to the document text; it is consumed in place
#   $pluginfo  - not used by this routine
#   $base_dir, $file - joined to give the name of the input file
#   $metadata  - not used by this routine
#   $doc_obj   - the document object being built
#
# Always returns 1.  If the remaining text cannot be parsed as a
# "<<TOCn>>...\n" section (no TOC tag at all, or a TOC line that is not
# terminated by a newline) a warning is written to the plugin's output
# handle and the rest of the text is ignored.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "BookPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # directory part of the input file name, trailing separator included
    my $filename = &util::filename_cat($base_dir, $file);
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel the next section off the front of the text, leaving the
        # start of the following <<TOC tag (if any) in place.
        my $matched =
            ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is);
        if (!$matched) {
            # Nothing was consumed, so looping again could never make
            # progress: report the problem and stop.
            print $outhandle "BookPlug: WARNING - could not parse a <<TOC>> " .
                "section in $filename, ignoring the remaining text\n";
            last;
        }
        my ($toclevel, $tocline, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: consume one <<Key>>value<<...>> group at a time
        while ($tocline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my ($metakey, $metavalue) = ($1, $2);
            next if $metavalue eq "" || $metakey eq "";

            # make sure key fits in with gsdl naming scheme
            $metakey =~ tr/A-Z/a-z/;
            $metakey = ucfirst ($metakey);
            $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links.  The output handle is passed as a trailing
        # argument so that warnings about images can go to the plugin's
        # log; a helper that only reads its first five arguments ignores it.
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*"?)([^">]+)("?[^>]*>)}
                         {replace_image_links($absdir, $doc_obj, $1, $2, $3, $outhandle)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
190
# Rewrites one <img> tag so that its src points at an image associated
# with the document.
#
# Parameters:
#   $dir       - directory the image is looked for in
#   $doc_obj   - document object the image file is associated with
#   $front     - the part of the tag before the link
#   $link      - the value of the src attribute
#   $back      - the part of the tag after the link
#   $outhandle - optional file handle for warnings; defaults to STDERR.
#                (This routine is called as a plain function, so there is
#                no plugin object to take an output handle from.)
#
# The image is looked for first at $dir joined with the part of the link
# from its first "/" onwards (after the first "//" has been removed), then
# at $dir joined with the bare file name.
#
# Returns "${front}_httpdocimg_/<file name>${back}" if the image was found,
# otherwise the empty string (which drops the tag) after printing a warning.
sub replace_image_links {
    my ($dir, $doc_obj, $front, $link, $back, $outhandle) = @_;
    $outhandle = \*STDERR unless defined $outhandle;

    my ($filename, $error);
    my $foundimage = 0;

    $link =~ s/\/\///;

    # derive the mime subtype from the text after the last "."
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    $imagetype = "jpeg" if $imagetype eq "jpg";
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }

    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = &util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # remember the first location tried; the message is completed
            # below if the second location fails as well
            $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        $filename = &util::filename_cat ($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print $outhandle "$error $filename\n";
        } else {
            print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    return "" unless $foundimage;
    return "${front}_httpdocimg_/${imagefile}${back}";
}
237
2381;
Note: See TracBrowser for help on using the repository browser.