1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # BookPlug.pm (formerly called HBSPlug) -- plugin for processing simple
|
---|
4 | # html (or text) books
|
---|
5 | #
|
---|
6 | # A component of the Greenstone digital library software
|
---|
7 | # from the New Zealand Digital Library Project at the
|
---|
8 | # University of Waikato, New Zealand.
|
---|
9 | #
|
---|
10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
11 | #
|
---|
12 | # This program is free software; you can redistribute it and/or modify
|
---|
13 | # it under the terms of the GNU General Public License as published by
|
---|
14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
15 | # (at your option) any later version.
|
---|
16 | #
|
---|
17 | # This program is distributed in the hope that it will be useful,
|
---|
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | # GNU General Public License for more details.
|
---|
21 | #
|
---|
22 | # You should have received a copy of the GNU General Public License
|
---|
23 | # along with this program; if not, write to the Free Software
|
---|
24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | #
|
---|
26 | ###########################################################################
|
---|
27 |
|
---|
28 | # creates multi-level document from document containing
|
---|
29 | # <<TOC>> level tags. Metadata for each section is taken from any
|
---|
30 | # other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
|
---|
31 | # sets Title metadata.
|
---|
32 |
|
---|
33 | # Everything else between TOC tags is treated as simple html (i.e. no
|
---|
34 | # processing of html links or any other HTMLPlug type stuff is done).
|
---|
35 |
|
---|
36 | # expects input files to have a .hb file extension by default (this can be
|
---|
37 | # changed by adding a -process_exp option)
|
---|
38 |
|
---|
39 | # a file with the same name as the hb file but a .jpg extension is
|
---|
40 | # taken as the cover image (jpg files are blocked by this plugin)
|
---|
41 |
|
---|
42 | # BookPlug is a simplification (and extension) of the HBPlug used
|
---|
43 | # by the Humanity Library collections. BookPlug is faster as it expects
|
---|
44 | # the input files to be cleaner (The input to the HDL collections
|
---|
45 | # contains lots of excess html tags around <<TOC>> tags, uses <<I>>
|
---|
46 | # tags to specify images, and simply takes all text between <<TOC>>
|
---|
47 | # tags and start of text to be Title metadata). If you're marking up
|
---|
48 | # documents to be displayed in the same way as the HDL collections,
|
---|
49 | # use this plugin instead of HBPlug.
|
---|
50 |
|
---|
51 | # 12/05/02 Added usage datastructure - John Thompson
|
---|
52 |
|
---|
53 | package BookPlug;
|
---|
54 |
|
---|
55 | use BasPlug;
|
---|
56 | use util;
|
---|
57 |
|
---|
# Establish inheritance at compile time: this package derives from BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
|
---|
61 |
|
---|
# Descriptions of the command-line arguments this plugin understands.
# Each entry gives the argument's name, its usage text, its type, a 'reqd'
# flag and a 'deft' value obtained from the get_default_* routines defined
# later in this file.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'reqd' => "no",
        'deft' => &get_default_process_exp() },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => "string",
        'reqd' => "no",
        'deft' => &get_default_block_exp() } ];

# Usage record for the plugin as a whole; new() appends it to the object's
# option list. The description previously ran two sentences together
# ("... a -process_exp option a file with the same name ...") and left a
# parenthesis unclosed; that has been repaired here.
my $options = { 'name'     => "BookPlug",
                'desc'     => "Creates multi-level document from document containing <<TOC>> level tags. Metadata for each section is taken from any other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>> sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option). A file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around <<TOC>> tags, uses <<I>> tags to specify images, and simply takes all text between <<TOC>> tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
                'inherits' => "Yes",
                'args'     => $arguments };
|
---|
78 |
|
---|
# Constructor. Creates the underlying BasPlug object, appends this plugin's
# usage record to its option list and blesses the result into $class.
#
#   $class - class the new object is blessed into
#   The complete argument list (including $class) is handed on to the
#   BasPlug constructor after the plugin name "BookPlug".
#
# Returns the new plugin object.
sub new {
    my ($class) = @_;

    # Use explicit method-call syntax instead of "new BasPlug (...)": the
    # indirect-object form is resolved heuristically by the parser and is
    # ambiguous inside a package that defines its own new(). The arguments
    # passed are exactly the same as before.
    my $self = BasPlug->new("BookPlug", @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    return bless $self, $class;
}
|
---|
89 |
|
---|
# Default value for the block_exp argument: a regular expression, returned
# as a string, that matches names ending in ".jpg". Any invocant or other
# argument is ignored.
sub get_default_block_exp {
    return '\.jpg$';
}
|
---|
95 |
|
---|
# Default value for the process_exp argument: a regular expression, returned
# as a string, that matches names ending in ".hb" regardless of case. Any
# invocant or other argument is ignored.
sub get_default_process_exp {
    return '(?i)\.hb$';
}
|
---|
101 |
|
---|
# Do the plugin-specific processing of a document.
#
# The text is cut into sections at each <<TOCn>> tag, where n is the nesting
# level of the section. Tags of the form <<Key>>value<</Key>> on the same
# line as the <<TOCn>> tag are stored as metadata of that section, and
# everything up to the next "<<TOC" (or the end of the input) becomes the
# section's text.
#
# Arguments (after the invocant):
#   $textref  - reference to the document text; the text is consumed as
#               sections are peeled off the front
#   $pluginfo - not used by this routine
#   $base_dir - joined with $file to give the path of the input file
#   $file     - see $base_dir
#   $metadata - not used by this routine
#   $doc_obj  - document object that receives the sections, metadata, text
#               and associated files
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "BookPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # directory part of the input file's path (trailing separator kept)
    my $filename = &util::filename_cat($base_dir, $file);
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # add the cover image: same path as the input file with its final
    # extension replaced by .jpg
    my $coverimage = $filename;
    $coverimage =~ s/\.[^\.]*$/.jpg/;
    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");

    # remove any leading rubbish
    $$textref =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($$textref =~ /\w/) {
        # Peel the next section off the front of the text, leaving the
        # start of the following TOC tag (if any) in place. If the text
        # does not begin with a well-formed TOC line nothing is consumed,
        # so we must stop here or this loop would never terminate.
        unless ($$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is) {
            print $outhandle "BookPlug: WARNING - couldn't find a valid " .
                "<<TOC>> tag at the start of the remaining text in " .
                "$filename, ignoring the rest of the file\n";
            last;
        }
        my ($toclevel, $metaline, $sectiontext) = ($1, $2, $3);

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print $outhandle "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section($doc_obj->get_end_child($cursection));
        }

        # sort out metadata: consume one <<Key>>value<<...>> group at a time
        while ($metaline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                # (lower case with an upper-case first letter)
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst ($metakey);
                $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links; our output handle is offered to the
        # helper as an extra trailing argument
        $sectiontext =~ s{(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)}
                         {&replace_image_links($absdir, $doc_obj, $1, $2, $3, $outhandle)}isge;

        # add the text
        $doc_obj->add_utf8_text($cursection, $sectiontext);

        $firstsection = 0;

        $$textref =~ s/^\s+//s;
    }

    return 1;
}
|
---|
190 |
|
---|
# Helper for the <img> substitution in process(): associates the image file
# that a tag refers to with the document and rewrites the tag so that it
# points at the associated file.
#
#   $dir       - directory the image is looked for in
#   $doc_obj   - document object the image file is associated with
#   $front     - the part of the <img> tag in front of the link
#   $link      - the link itself (value of the src attribute)
#   $back      - the part of the tag following the link
#   $outhandle - optional handle that warnings are printed to; STDERR is
#                used when it is omitted
#
# Returns "${front}_httpdocimg_/<image file name>${back}" if the image file
# was found, and the empty string (dropping the tag) otherwise.
sub replace_image_links {
    my ($dir, $doc_obj, $front, $link, $back, $outhandle) = @_;

    # This is a plain function, not a method, so there is no $self to fetch
    # an output handle from; printing to the resulting undefined handle
    # would be a fatal error. Fall back to STDERR when no handle is given.
    $outhandle = \*STDERR unless defined $outhandle;

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" from the link
    $link =~ s/\/\///;

    # image type is whatever follows the last dot, lower-cased
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    $imagetype = "jpeg" if $imagetype eq "jpg";
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
        print $outhandle "BookPlug: Warning - unknown image type ($imagetype)\n";
    }

    # file name: everything after the last slash;
    # path: everything from the first slash onwards
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = &util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # remember the failure; the message is completed below if the
            # second location does not have the file either
            $error = "BookPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        # try the bare file name directly inside $dir
        $filename = &util::filename_cat ($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print $outhandle "$error $filename\n";
        } else {
            print $outhandle "BookPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    return "${front}_httpdocimg_/${imagefile}${back}" if $foundimage;
    return "";
}
|
---|
237 |
|
---|
238 | 1;
|
---|