Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 589

Last change on this file since 589 was 589, checked in by sjboddie, 25 years ago
fixed bug in regular expression
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 5.8 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# Creates a simple single-level document from .htm or .html files,
27	# optionally gzipped (.htm.gz or .html.gz); filenames are matched
28	# case-insensitively. Adds Title metadata taken from the <title> tag
29	# if one is found, otherwise the first 100 characters outside of tags.
30	# Also attempts to include images, which it searches for in
31	# the same directory as the document itself (it also searches
32	# directories relative to that directory).
33
34	# this plugin currently does nothing with href links so relative links
35	# may become broken.
36
37
38	package HTMLPlug;
39
40	use BasPlug;
41	use sorttools;
42	use util;
43
# Establish inheritance at compile time: HTMLPlug derives from BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
47
# Constructor.  Builds a BasPlug object and re-blesses it into $class.
#
#   $class - name of the class to bless the new object into
#
# Returns the blessed plugin object.
sub new {
    my ($class) = @_;

    # lexical, so that constructing a plugin does not overwrite a
    # package-global $self; direct method-call syntax avoids the parsing
    # ambiguity of "new BasPlug ()"
    my $self = BasPlug->new();

    return bless $self, $class;
}
54
# Reports that this is not a recursive plugin: always returns 0.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
60
61
# Reads one HTML file and passes it to the processor as a document with
# a single (top) section.
#
# Arguments (after $self):
#   $pluginfo  - not used by this plugin
#   $base_dir  - directory prefix; might be ""
#   $file      - file name; might include directories
#   $metadata  - hash ref of extra metadata to attach; each value is either
#                a single value or an array ref of values
#   $processor - object whose process() method receives the finished
#                document; a true 'verbosity' entry enables the progress
#                message on STDERR
#
# Returns the number of files processed (1), or undef if the file can't be
# processed here (not an existing .htm/.html file, optionally gzipped).
# Dies if the file cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = util::filename_cat ($base_dir, $file);

    # directory part of $filename (everything up to the last / or \);
    # image links are resolved against it
    my $absdir = $filename;
    $absdir =~ s/[^\/\\]*$//;

    # only .htm/.html files, optionally gzipped, are handled
    return undef unless $filename =~ /\.html?(\.gz)?$/i;
    my $gz = defined $1 ? 1 : 0;
    return undef unless -e $filename;

    print STDERR "HTMLPlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");

    my $fh;
    if ($gz) {
        # list-form pipe open: the file name reaches zcat as one argument
        # and is never interpreted by a shell
        open ($fh, '-|', 'zcat', $filename)
            || die "HTMLPlug::read - zcat can't open $filename\n";
    } else {
        # three-argument open: special characters in the file name cannot
        # be mistaken for an open mode
        open ($fh, '<', $filename)
            || die "HTMLPlug::read - can't open $filename\n";
    }
    my $cursection = $doc_obj->get_top_section();

    # slurp the whole file, then release the handle
    my $text = do { local $/; <$fh> };
    $text = "" unless defined $text;
    close ($fh);

    # remove line breaks (every run of whitespace becomes one space)
    $text =~ s/\s+/ /g;

    # see if there's a <title> tag with some word characters in it
    my $foundtitle = 0;
    if ($text =~ /<title[^>]*>([^<]*)<\/title[^>]*>/i) {
        my $title = $1;
        if ($title =~ /\w/) {
            $doc_obj->add_metadata ($cursection, "Title", $title);
            $foundtitle = 1;
        }
    }

    # if no title use first 100 characters outside of tags
    if (!$foundtitle) {
        my $tmptext = $text;
        $tmptext =~ s/<[^>]*>//g;
        my $title = substr ($tmptext, 0, 100);
        $doc_obj->add_metadata ($cursection, "Title", $title);
    }

    # remove header rubbish (everything up to and including the <body> tag)
    $text =~ s/^.*?<body[^>]*>//i;

    # fix up the image links: each <img ... src=...> tag is replaced by
    # whatever replace_image_links returns for it
    $text =~ s{(<img[^>]*?src\s*=\s*"?)([^">]+)("?[^>]*>)}
              {replace_image_links ($absdir, $doc_obj, $1, $2, $3)}ige;

    # add a newline at the beginning of each paragraph
    $text =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $text =~ s/(.{1,80})\s/$1\n/g;

    $doc_obj->add_text ($cursection, $text);

    # attach the caller-supplied metadata
    foreach my $field (keys (%$metadata)) {
        my $value = $metadata->{$field};

        # $metadata->{$field} may be an array reference
        my @values = (ref ($value) eq "ARRAY") ? @$value : ($value);
        foreach my $item (@values) {
            $doc_obj->add_metadata ($cursection, $field, $item);
        }
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process ($doc_obj);

    return 1; # processed the file
}
160
# Looks for the image named by one <img> tag, associates it with the
# document if found, and returns the replacement text for the tag.
#
#   $dir     - directory the image is searched in
#   $doc_obj - document object; associate_file() is called on it when the
#              image is found
#   $front   - text of the tag up to the start of the link
#   $link    - the link itself (value of the src attribute)
#   $back    - text of the tag after the link
#
# Returns the tag with its link rewritten to
# _httpcollection_/archives/_thisOID_/<image file>, or "" (which removes
# the tag) when the image cannot be found.  Warnings go to STDERR.
sub replace_image_links {
    my ($dir, $doc_obj, $front, $link, $back) = @_;

    my ($filename, $error);
    my $foundimage = 0;

    # drop the first "//" in the link
    $link =~ s{//}{};

    # the image type is whatever follows the last "." in the link
    my ($imagetype) = $link =~ /([^\.]*)$/;
    $imagetype = lc ($imagetype);
    $imagetype = "jpeg" if $imagetype eq "jpg";

    # "jpg" has already been normalised to "jpeg" above, so the check has
    # to accept "jpeg"; otherwise every JPEG is reported as unknown
    if ($imagetype !~ /^(?:jpeg|gif|png)$/) {
        print STDERR "HTMLPlug: Warning - unknown image type ($imagetype)\n";
    }

    # file name: everything after the last "/"
    my ($imagefile) = $link =~ /([^\/]*)$/;

    # path: the link from its first "/" onwards
    # NOTE(review): this also drops the leading directory of a plain
    # relative link such as "images/x.gif" - confirm that is intended
    my ($imagepath) = $link =~ /^[^\/]*(.*)$/;

    if (defined $imagepath && $imagepath =~ /\w/) {
        # relative link
        $filename = util::filename_cat ($dir, $imagepath);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } else {
            # remember the failed location; the message is completed and
            # printed only if the fallback below fails as well
            $error = "HTMLPlug: Warning - couldn't find image file $imagefile in either $filename or";
        }
    }

    if (!$foundimage) {
        # fall back to the bare file name directly inside $dir
        $filename = util::filename_cat ($dir, $imagefile);
        if (-e $filename) {
            $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
            $foundimage = 1;
        } elsif (defined $error) {
            print STDERR "$error $filename\n";
        } else {
            print STDERR "HTMLPlug: Warning - couldn't find image file $imagefile in $filename\n";
        }
    }

    return "" unless $foundimage;
    return "${front}_httpcollection_/archives/_thisOID_/${imagefile}${back}";
}
207
208	1;
209
210
211
212
213
214
215
216
217
218
219

Note: See TracBrowser for help on using the repository browser.

Download in other formats: