source: trunk/gsdl/perllib/plugins/HBSPlug.pm@ 1244

Last change on this file since 1244 was 1244, checked in by sjboddie, 24 years ago

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# HBSPlug.pm -- plugin for processing simple html (or text) books
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates multi-level document from document containing
27# <<TOC>> level tags. Metadata for each section is taken from any
28# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
29# sets Title metadata.
30
31# Everything else between TOC tags is treated as simple html (i.e. no
32# processing of html links or any other HTMLPlug type stuff is done).
33
34# expects input files to have a .hb file extension by default (this can be
35# changed by adding a -process_exp option
36
37# a file with the same name as the hb file but a .jpg extension is
38# taken as the cover image (jpg files are blocked by this plugin)
39
40# HBSPlug is a simplification (and extension of) the HBPlug used
41# by the Humanity Library collections. HBSPlug is faster as it expects
42# the input files to be cleaner (The input to the HDL collections
43# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
44# tags to specify images, and simply takes all text between <<TOC>>
45# tags and start of text to be Title metadata). If you're marking up
46# documents to be displayed in the same way as the HDL collections,
47# use this plugin instead of HBPlug.
48
49package HBSPlug;
50
51use BasPlug;
52use util;
53
54sub BEGIN {
55 @ISA = ('BasPlug');
56}
57
58use strict;
59
60sub new {
61 my ($class) = @_;
62 my $self = new BasPlug ("HBSPlug", @_);
63
64 return bless $self, $class;
65}
66
67sub get_default_block_exp {
68 my $self = shift (@_);
69
70 return q^\.jpg$^;
71}
72
73sub get_default_process_exp {
74 my $self = shift (@_);
75
76 return q^(?i)\.hb$^;
77}
78
79# do plugin specific processing of doc_obj
80sub process {
81 my $self = shift (@_);
82 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
83
84 print STDERR "HBSPlug: processing $file\n"
85 if $self->{'verbosity'} > 1;
86
87 my $cursection = $doc_obj->get_top_section();
88
89 my $filename = &util::filename_cat($base_dir, $file);
90 my $absdir = $filename;
91 $absdir =~ s/[^\/\\]*$//;
92
93 # add the cover image
94 my $coverimage = $filename;
95 $coverimage =~ s/\.[^\.]*$/\.jpg/i;
96 $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
97
98 my $title = "";
99
100 # remove any leading rubbish
101 $$textref =~ s/^.*?(<<TOC)/$1/ios;
102
103 my $curtoclevel = 1;
104 my $firstsection = 1;
105 my $toccount = 0;
106 while ($$textref =~ /\w/) {
107 $$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
108 my $toclevel = $1;
109 my $metadata = $2;
110 my $sectiontext = $3;
111
112 if ($toclevel == 2) {
113 $toccount ++;
114 }
115
116 # close any sections below the current level and
117 # create a new section (special case for the firstsection)
118 while (($curtoclevel > $toclevel) ||
119 (!$firstsection && $curtoclevel == $toclevel)) {
120 $cursection = $doc_obj->get_parent_section ($cursection);
121 $curtoclevel--;
122 }
123 if ($curtoclevel+1 < $toclevel) {
124 print STDERR "WARNING - jump in toc levels in $filename " .
125 "from $curtoclevel to $toclevel\n";
126 }
127 while ($curtoclevel < $toclevel) {
128 $curtoclevel++;
129 $cursection =
130 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
131 }
132
133 # sort out metadata
134 while ($metadata =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
135 my $metakey = $1;
136 my $metavalue = $2;
137
138 if ($metavalue ne "" && $metakey ne "") {
139 # make sure key fits in with gsdl naming scheme
140 $metakey =~ tr/[A-Z]/[a-z]/;
141 $metakey = ucfirst ($metakey);
142 $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
143 }
144 }
145
146 # remove header rubbish
147 $sectiontext =~ s/^.*?<body[^>]*>//ios;
148
149 # and any other unwanted tags
150 $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;
151
152 # fix up the image links
153 $sectiontext =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/
154 &replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;
155
156 # add the text
157 $doc_obj->add_utf8_text($cursection, $sectiontext);
158
159 $firstsection = 0;
160
161 $$textref =~ s/^\s+//s;
162 }
163
164 return 1;
165}
166
167sub replace_image_links {
168
169 my ($dir, $doc_obj, $front, $link, $back) = @_;
170
171 my ($filename, $error);
172 my $foundimage = 0;
173
174 $link =~ s/\/\///;
175 my ($imagetype) = $link =~ /([^\.]*)$/;
176 $imagetype =~ tr/[A-Z]/[a-z]/;
177 if ($imagetype eq "jpg") {$imagetype = "jpeg";}
178 if ($imagetype !~ /^(jpeg|gif|png)$/) {
179 print STDERR "HBSPlug: Warning - unknown image type ($imagetype)\n";
180 }
181 my ($imagefile) = $link =~ /([^\/]*)$/;
182 my ($imagepath) = $link =~ /^[^\/]*(.*)$/;
183
184 if (defined $imagepath && $imagepath =~ /\w/) {
185 # relative link
186 $filename = &util::filename_cat ($dir, $imagepath);
187 if (-e $filename) {
188 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
189 $foundimage = 1;
190 } else {
191 $error = "HBSPlug: Warning - couldn't find image file $imagefile in either $filename or";
192 }
193 }
194
195 if (!$foundimage) {
196 $filename = &util::filename_cat ($dir, $imagefile);
197 if (-e $filename) {
198 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
199 $foundimage = 1;
200 } elsif (defined $error) {
201 print STDERR "$error $filename\n";
202 } else {
203 print STDERR "HBSPlug: Warning - couldn't find image file $imagefile in $filename\n";
204 }
205 }
206
207 if ($foundimage) {
208 return "${front}_httpdocimg_/${imagefile}${back}";
209 } else {
210 return "";
211 }
212}
213
2141;
Note: See TracBrowser for help on using the repository browser.