source: trunk/gsdl/perllib/plugins/HBSPlug.pm@ 1221

Last change on this file since 1221 was 1221, checked in by sjboddie, 24 years ago

Added a new HBSPlug which is kind of a generalisation of HBPlug (should
still use HBPlug for the Humanity Library collections). Also removed the
old GB specific GBBPlug, GBHPlug, and GBTPlug plugins. Can now do what
these plugins once did by giving the "-input_encoding gb" option to
HBSPlug, HTMLPlug, and TEXTPlug respectively (although TEXTPlug doesn't
actually take this option yet -- I'll fix it soon).

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1###########################################################################
2#
3# HBSPlug.pm -- plugin for processing simple html (or text) books
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates multi-level document from document containing
27# <<TOC>> level tags. Metadata for each section is taken from any
28# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
29# sets Title metadata.
30
31# Everything else between TOC tags is treated as simple html (i.e. no
32# processing of html links or any other HTMLPlug type stuff is done).
33
34# expects input files to have a .hb file extension
35
36# a file with the same name as the hb file but a .jpg extension is
37# taken as the cover image
38
39# HBSPlug is a simplification (and extension of) the HBPlug used
40# by the Humanity Library collections. HBSPlug is faster as it expects
41# the input files to be cleaner (The input to the HDL collections
42# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
43# tags to specify images, and simply takes all text between <<TOC>>
44# tags and start of text to be Title metadata). If you're marking up
45# documents to be displayed in the same way as the HDL collections,
46# use this plugin instead of HBPlug.
47
48package HBSPlug;
49
50use BasPlug;
51use util;
52
53sub BEGIN {
54 @ISA = ('BasPlug');
55}
56
57use strict;
58
59sub new {
60 my ($class) = @_;
61 my $self = new BasPlug (@_);
62
63 return bless $self, $class;
64}
65
66sub is_recursive {
67 my $self = shift (@_);
68
69 return 0; # this is not a recursive plugin
70}
71
72
73# return number of files processed, undef if can't process
74# Note that $base_dir might be "" and that $file might
75# include directories
76sub read {
77 my $self = shift (@_);
78 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
79
80 my $filename = &util::filename_cat($base_dir, $file);
81 my $absdir = $filename;
82 $absdir =~ s/[^\/\\]*$//;
83
84 return 0 if ($filename =~ /\.jpg$/i);
85 return undef unless ($filename =~ /\.hb$/i && (-e $filename));
86
87 print STDERR "GBBPlug: processing $filename\n" if $processor->{'verbosity'};
88
89 # create a new document
90 my $doc_obj = new doc ($file, "indexed_doc");
91 my $cursection = $doc_obj->get_top_section();
92
93 # add the cover image
94 my $coverimage = $filename;
95 $coverimage =~ s/\.hb/\.jpg/i;
96 $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
97
98 # add metadata for top level of document
99 $self->extra_metadata ($doc_obj, $cursection, $metadata);
100
101 # read in HTML file ($text will be in utf8)
102 my $text = "";
103 $self->read_file ($filename, \$text);
104
105 my $title = "";
106
107 # remove any leading rubbish
108 $text =~ s/^.*?(<<TOC)/$1/ios;
109
110 my $curtoclevel = 1;
111 my $firstsection = 1;
112 my $toccount = 0;
113 while ($text =~ /\w/) {
114 $text =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
115 my $toclevel = $1;
116 my $metadata = $2;
117 my $sectiontext = $3;
118
119 if ($toclevel == 2) {
120 $toccount ++;
121 }
122
123 # close any sections below the current level and
124 # create a new section (special case for the firstsection)
125 while (($curtoclevel > $toclevel) ||
126 (!$firstsection && $curtoclevel == $toclevel)) {
127 $cursection = $doc_obj->get_parent_section ($cursection);
128 $curtoclevel--;
129 }
130 if ($curtoclevel+1 < $toclevel) {
131 print STDERR "WARNING - jump in toc levels in $filename " .
132 "from $curtoclevel to $toclevel\n";
133 }
134 while ($curtoclevel < $toclevel) {
135 $curtoclevel++;
136 $cursection =
137 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
138 }
139
140 # sort out metadata
141 while ($metadata =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
142 my $metakey = $1;
143 my $metavalue = $2;
144
145 if ($metavalue ne "" && $metakey ne "") {
146 # make sure key fits in with gsdl naming scheme
147 $metakey =~ tr/[A-Z]/[a-z]/;
148 $metakey = ucfirst ($metakey);
149 $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
150 }
151 }
152
153 # remove header rubbish
154 $sectiontext =~ s/^.*?<body[^>]*>//ios;
155
156 # and any other unwanted tags
157 $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;
158
159 # fix up the image links
160 $sectiontext =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/
161 &replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;
162
163 # add the text
164 $doc_obj->add_utf8_text($cursection, $sectiontext);
165
166 $firstsection = 0;
167
168 $text =~ s/^\s+//s;
169 }
170
171 # add OID
172 $doc_obj->set_OID ();
173
174 # process the document
175 $processor->process($doc_obj);
176
177 return 1; # processed the file
178}
179
180sub replace_image_links {
181
182 my ($dir, $doc_obj, $front, $link, $back) = @_;
183
184 my ($filename, $error);
185 my $foundimage = 0;
186
187 $link =~ s/\/\///;
188 my ($imagetype) = $link =~ /([^\.]*)$/;
189 $imagetype =~ tr/[A-Z]/[a-z]/;
190 if ($imagetype eq "jpg") {$imagetype = "jpeg";}
191 if ($imagetype !~ /^(jpeg|gif|png)$/) {
192 print STDERR "GBHPlug: Warning - unknown image type ($imagetype)\n";
193 }
194 my ($imagefile) = $link =~ /([^\/]*)$/;
195 my ($imagepath) = $link =~ /^[^\/]*(.*)$/;
196
197 if (defined $imagepath && $imagepath =~ /\w/) {
198 # relative link
199 $filename = &util::filename_cat ($dir, $imagepath);
200 if (-e $filename) {
201 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
202 $foundimage = 1;
203 } else {
204 $error = "GBHPlug: Warning - couldn't find image file $imagefile in either $filename or";
205 }
206 }
207
208 if (!$foundimage) {
209 $filename = &util::filename_cat ($dir, $imagefile);
210 if (-e $filename) {
211 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
212 $foundimage = 1;
213 } elsif (defined $error) {
214 print STDERR "$error $filename\n";
215 } else {
216 print STDERR "GBHPlug: Warning - couldn't find image file $imagefile in $filename\n";
217 }
218 }
219
220 if ($foundimage) {
221 return "${front}_httpdocimg_/${imagefile}${back}";
222 } else {
223 return "";
224 }
225}
226
2271;
Note: See TracBrowser for help on using the repository browser.