Context Navigation

source: trunk/gsdl/perllib/plugins/HBSPlug.pm@ 1242

Last change on this file since 1242 was 1235, checked in by nzdl, 24 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 6.6 KB

Line
1	###########################################################################
2	#
3	# HBSPlug.pm -- plugin for processing simple html (or text) books
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# creates multi-level document from document containing
27	# <<TOC>> level tags. Metadata for each section is taken from any
28	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
29	# sets Title metadata.
30
31	# Everything else between TOC tags is treated as simple html (i.e. no
32	# processing of html links or any other HTMLPlug type stuff is done).
33
34	# expects input files to have a .hb file extension
35
36	# a file with the same name as the hb file but a .jpg extension is
37	# taken as the cover image
38
39	# HBSPlug is a simplification (and extension) of the HBPlug used
40	# by the Humanity Library collections. HBSPlug is faster as it expects
41	# the input files to be cleaner (The input to the HDL collections
42	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
43	# tags to specify images, and simply takes all text between <<TOC>>
44	# tags and start of text to be Title metadata). If you're marking up
45	# documents to be displayed in the same way as the HDL collections,
46	# use this plugin instead of HBPlug.
47
48	package HBSPlug;
49
50	use BasPlug;
51	use util;
52
# Set up inheritance at compile time: this package derives from BasPlug.
BEGIN {
    our @ISA = ('BasPlug');
}
56
57	use strict;
58
# Constructor. Builds the underlying BasPlug object (forwarding the full
# argument list, class name included) and re-blesses it into $class.
sub new {
    my $class = $_[0];

    my $plugin = BasPlug->new (@_);
    bless $plugin, $class;

    return $plugin;
}
65
# HBSPlug is not a recursive plugin, so this always answers 0.
sub is_recursive {
    return 0;
}
71
72
# Read one .hb book file, build a multi-level document from its <<TOCn>>
# markup and pass the finished document to $processor->process().
#
# Arguments (after $self):
#   $pluginfo  - not used by this plugin
#   $base_dir  - base directory; might be ""
#   $file      - file name relative to $base_dir; might include directories
#   $metadata  - passed to extra_metadata() for the top-level section
#   $processor - its {'verbosity'} entry controls the progress message and
#                its process() method receives the document
#
# Returns:
#   0     for a .jpg file (nothing is built for it; presumably this stops
#         cover images being offered to other plugins -- TODO confirm)
#   undef if the file is not an existing .hb file
#   1     once the .hb file has been processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = util::filename_cat ($base_dir, $file);

    # directory part of $filename (everything up to the last / or \)
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    return 0 if ($filename =~ /\.jpg$/i);
    return undef unless ($filename =~ /\.hb$/i && (-e $filename));

    print STDERR "HBSPlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");
    my $cursection = $doc_obj->get_top_section();

    # add the cover image: same name as the .hb file but with a .jpg
    # extension. The substitution is anchored so that only the trailing
    # extension is rewritten, never a ".hb" earlier in the path.
    my $coverimage = $filename;
    $coverimage =~ s/\.hb$/.jpg/i;
    $doc_obj->associate_file ($coverimage, "cover.jpg", "image/jpeg");

    # add metadata for top level of document
    $self->extra_metadata ($doc_obj, $cursection, $metadata);

    # read in HTML file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, \$text);

    # remove any leading rubbish before the first <<TOC tag
    $text =~ s/^.*?(<<TOC)/$1/is;

    my $curtoclevel = 1;
    my $firstsection = 1;
    while ($text =~ /\w/) {
        # Peel one section off the front of $text:
        #   $1 = toc level, $2 = rest of the <<TOCn>> line (metadata tags),
        #   $3 = section text up to the next <<TOC tag or end of input.
        # If the markup does not match, $text would never shrink, so stop
        # instead of looping forever.
        unless ($text =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/is) {
            print STDERR "HBSPlug: Warning - malformed <<TOC>> markup in " .
                "$filename, remaining text ignored\n";
            last;
        }
        my $toclevel = $1;
        my $metaline = $2;
        my $sectiontext = $3;

        # close any sections below the current level and
        # create a new section (special case for the firstsection)
        while (($curtoclevel > $toclevel) ||
               (!$firstsection && $curtoclevel == $toclevel)) {
            $cursection = $doc_obj->get_parent_section ($cursection);
            $curtoclevel--;
        }
        if ($curtoclevel+1 < $toclevel) {
            print STDERR "WARNING - jump in toc levels in $filename " .
                "from $curtoclevel to $toclevel\n";
        }
        while ($curtoclevel < $toclevel) {
            $curtoclevel++;
            $cursection =
                $doc_obj->insert_section ($doc_obj->get_end_child ($cursection));
        }

        # sort out metadata: each <<Key>>value<</Key>> pair on the TOC line
        # is consumed in turn and stored on the current section
        while ($metaline =~ s/^.*?<<([^>]*)>>(.*?)<<[^>]*>>//) {
            my $metakey = $1;
            my $metavalue = $2;

            if ($metavalue ne "" && $metakey ne "") {
                # make sure key fits in with gsdl naming scheme
                # (lower case with a leading capital)
                $metakey =~ tr/A-Z/a-z/;
                $metakey = ucfirst ($metakey);
                $doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
            }
        }

        # remove header rubbish (everything up to and including <body ...>)
        $sectiontext =~ s/^.*?<body[^>]*>//is;

        # and any other unwanted tags
        $sectiontext =~ s/<(\/p|\/html|\/body)>//isg;

        # fix up the image links: each <img ... src=...> tag is replaced by
        # whatever replace_image_links returns for it
        $sectiontext =~
            s{(<img[^>]*?src\s*=\s*"?)([^">]+)("?[^>]*>)}
             {replace_image_links ($absdir, $doc_obj, $1, $2, $3)}isge;

        # add the text
        $doc_obj->add_utf8_text ($cursection, $sectiontext);

        $firstsection = 0;

        $text =~ s/^\s+//s;
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process ($doc_obj);

    return 1; # processed the file
}
179
# Resolve one <img> link to a file on disk, associate that file with the
# document and return the rewritten tag.
#
# Arguments:
#   $dir     - directory used as the base for locating the image
#   $doc_obj - document object; associate_file() is called on it
#   $front   - tag text up to the start of the link
#   $link    - the link itself (value of the src attribute)
#   $back    - tag text after the link
#
# Two locations are tried in order: $dir joined with the part of the link
# from its first "/" onwards, then $dir joined with the bare file name.
#
# Returns "${front}_httpdocimg_/<file name>${back}" when the image was
# found, or "" (which removes the tag) when it was not; a warning is
# printed to STDERR in the latter case.
sub replace_image_links {

    my ($dir, $doc_obj, $front, $link, $back) = @_;

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" in the link
    $link =~ s/\/\///;

    # image type is whatever follows the last "." in the link
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype =~ tr/A-Z/a-z/;
    if ($imagetype eq "jpg") {$imagetype = "jpeg";}
    if ($imagetype !~ /^(jpeg|gif|png)$/) {
        print STDERR "HBSPlug: Warning - unknown image type ($imagetype)\n";
    }

    # file name is whatever follows the last "/"; path is everything from
    # the first "/" onwards (so the first component of the link is dropped)
    my ($imagefile) = $link =~ /([^\/]*)$/;
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # remember the first location tried; the message is completed
            # (and printed) only if the second location fails too
            $error = "HBSPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        $filename = util::filename_cat ($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print STDERR "$error $filename\n";
        } else {
            print STDERR "HBSPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    if ($foundimage) {
        return "${front}_httpdocimg_/${imagefile}${back}";
    } else {
        return "";
    }
}
226
227	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: