Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/plugins/HBSPlug.pm@ 1221

Last change on this file since 1221 was 1221, checked in by sjboddie, 24 years ago
Added a new HBSPlug which is kind of a generalisation of HBPlug (should still use HBPlug for the Humanity Library collections). Also removed the old GB specific GBBPlug, GBHPlug, and GBTPlug plugins. Can now do what these plugins once did by giving the "-input_encoding gb" option to HBSPlug, HTMLPlug, and TEXTPlug respectively (although TEXTPlug doesn't actually take this option yet -- I'll fix it soon).
Property svn:keywords set to `Author Date Id Revision`
File size: 6.6 KB

Line
1	###########################################################################
2	#
3	# HBSPlug.pm -- plugin for processing simple html (or text) books
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# creates multi-level document from document containing
27	# <<TOC>> level tags. Metadata for each section is taken from any
28	# other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>>
29	# sets Title metadata.
30
31	# Everything else between TOC tags is treated as simple html (i.e. no
32	# processing of html links or any other HTMLPlug type stuff is done).
33
34	# expects input files to have a .hb file extension
35
36	# a file with the same name as the hb file but a .jpg extension is
37	# taken as the cover image
38
39	# HBSPlug is a simplification (and extension of) the HBPlug used
40	# by the Humanity Library collections. HBSPlug is faster as it expects
41	# the input files to be cleaner (The input to the HDL collections
42	# contains lots of excess html tags around <<TOC>> tags, uses <<I>>
43	# tags to specify images, and simply takes all text between <<TOC>>
44	# tags and start of text to be Title metadata). If you're marking up
45	# documents to be displayed in the same way as the HDL collections,
46	# use this plugin instead of HBPlug.
47
48	package HBSPlug;
49
50	use BasPlug;
51	use util;
52
53	sub BEGIN {
54	@ISA = ('BasPlug');
55	}
56
57	use strict;
58
59	sub new {
60	my ($class) = @_;
61	my $self = new BasPlug (@_);
62
63	return bless $self, $class;
64	}
65
66	sub is_recursive {
67	my $self = shift (@_);
68
69	return 0; # this is not a recursive plugin
70	}
71
72
73	# return number of files processed, undef if can't process
74	# Note that $base_dir might be "" and that $file might
75	# include directories
76	sub read {
77	my $self = shift (@_);
78	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
79
80	my $filename = &util::filename_cat($base_dir, $file);
81	my $absdir = $filename;
82	$absdir =~ s/[^\/\\]*$//;
83
84	return 0 if ($filename =~ /\.jpg$/i);
85	return undef unless ($filename =~ /\.hb$/i && (-e $filename));
86
87	print STDERR "GBBPlug: processing $filename\n" if $processor->{'verbosity'};
88
89	# create a new document
90	my $doc_obj = new doc ($file, "indexed_doc");
91	my $cursection = $doc_obj->get_top_section();
92
93	# add the cover image
94	my $coverimage = $filename;
95	$coverimage =~ s/\.hb/\.jpg/i;
96	$doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
97
98	# add metadata for top level of document
99	$self->extra_metadata ($doc_obj, $cursection, $metadata);
100
101	# read in HTML file ($text will be in utf8)
102	my $text = "";
103	$self->read_file ($filename, \$text);
104
105	my $title = "";
106
107	# remove any leading rubbish
108	$text =~ s/^.*?(<<TOC)/$1/ios;
109
110	my $curtoclevel = 1;
111	my $firstsection = 1;
112	my $toccount = 0;
113	while ($text =~ /\w/) {
114	$text =~ s/^<<TOC(\d+)>>([^\n])\n(.?)(<<TOC\|\Z)/$4/ios;
115	my $toclevel = $1;
116	my $metadata = $2;
117	my $sectiontext = $3;
118
119	if ($toclevel == 2) {
120	$toccount ++;
121	}
122
123	# close any sections below the current level and
124	# create a new section (special case for the firstsection)
125	while (($curtoclevel > $toclevel) \|\|
126	(!$firstsection && $curtoclevel == $toclevel)) {
127	$cursection = $doc_obj->get_parent_section ($cursection);
128	$curtoclevel--;
129	}
130	if ($curtoclevel+1 < $toclevel) {
131	print STDERR "WARNING - jump in toc levels in $filename " .
132	"from $curtoclevel to $toclevel\n";
133	}
134	while ($curtoclevel < $toclevel) {
135	$curtoclevel++;
136	$cursection =
137	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
138	}
139
140	# sort out metadata
141	while ($metadata =~ s/^.?<<([^>])>>(.?)<<[^>]>>//) {
142	my $metakey = $1;
143	my $metavalue = $2;
144
145	if ($metavalue ne "" && $metakey ne "") {
146	# make sure key fits in with gsdl naming scheme
147	$metakey =~ tr/[A-Z]/[a-z]/;
148	$metakey = ucfirst ($metakey);
149	$doc_obj->add_utf8_metadata ($cursection, $metakey, $metavalue);
150	}
151	}
152
153	# remove header rubbish
154	$sectiontext =~ s/^.?<body[^>]>//ios;
155
156	# and any other unwanted tags
157	$sectiontext =~ s/<(\/p\|\/html\|\/body)>//isg;
158
159	# fix up the image links
160	$sectiontext =~ s/(<img[^>]?src\s=\s\"?)([^\">]+)(\"?[^>]>)/
161	&replace_image_links($absdir, $doc_obj, $1, $2, $3)/isge;
162
163	# add the text
164	$doc_obj->add_utf8_text($cursection, $sectiontext);
165
166	$firstsection = 0;
167
168	$text =~ s/^\s+//s;
169	}
170
171	# add OID
172	$doc_obj->set_OID ();
173
174	# process the document
175	$processor->process($doc_obj);
176
177	return 1; # processed the file
178	}
179
180	sub replace_image_links {
181
182	my ($dir, $doc_obj, $front, $link, $back) = @_;
183
184	my ($filename, $error);
185	my $foundimage = 0;
186
187	$link =~ s/\/\///;
188	my ($imagetype) = $link =~ /([^\.]*)$/;
189	$imagetype =~ tr/[A-Z]/[a-z]/;
190	if ($imagetype eq "jpg") {$imagetype = "jpeg";}
191	if ($imagetype !~ /^(jpeg\|gif\|png)$/) {
192	print STDERR "GBHPlug: Warning - unknown image type ($imagetype)\n";
193	}
194	my ($imagefile) = $link =~ /([^\/]*)$/;
195	my ($imagepath) = $link =~ /^[^\/](.)$/;
196
197	if (defined $imagepath && $imagepath =~ /\w/) {
198	# relative link
199	$filename = &util::filename_cat ($dir, $imagepath);
200	if (-e $filename) {
201	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
202	$foundimage = 1;
203	} else {
204	$error = "GBHPlug: Warning - couldn't find image file $imagefile in either $filename or";
205	}
206	}
207
208	if (!$foundimage) {
209	$filename = &util::filename_cat ($dir, $imagefile);
210	if (-e $filename) {
211	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
212	$foundimage = 1;
213	} elsif (defined $error) {
214	print STDERR "$error $filename\n";
215	} else {
216	print STDERR "GBHPlug: Warning - couldn't find image file $imagefile in $filename\n";
217	}
218	}
219
220	if ($foundimage) {
221	return "${front}_httpdocimg_/${imagefile}${back}";
222	} else {
223	return "";
224	}
225	}
226
227	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: