Context Navigation

source: trunk/gsdl/perllib/plugins/GBHPlug.pm@ 1020

Last change on this file since 1020 was 1020, checked in by sjboddie, 24 years ago
changed paths to collection images (again!)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.0 KB

Line
1	###########################################################################
2	#
3	# GBHPlug.pm -- plugin for processing gb encoded html
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# creates simple single-level document from .htm or .html files
27	# (case-insensitive match on filenames). Adds Title metadata
28	# taken from <title> tags if found otherwise first 100 characters
29	# outside of tags.
30	# will also attempt to include images which it will search for in
31	# the same directory as the document itself (it will also search
32	# directories relative to that directory).
33
34	# this plugin currently does nothing with href links so relative links
35	# may become broken.
36
37
38	package GBHPlug;
39
40	use BasPlug;
41	use sorttools;
42	use util;
43	use unicode;
44	use cnseg;
45	use gb;
46
47
48	sub BEGIN {
49	@ISA = ('BasPlug');
50	}
51
52	sub new {
53	my ($class) = @_;
54	$self = new BasPlug ();
55
56	return bless $self, $class;
57	}
58
59	sub is_recursive {
60	my $self = shift (@_);
61
62	return 0; # this is not a recursive plugin
63	}
64
65
66	# return number of files processed, undef if can't process
67	# Note that $base_dir might be "" and that $file might
68	# include directories
69	sub read {
70	my $self = shift (@_);
71	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
72
73	my $filename = &util::filename_cat($base_dir, $file);
74	my $absdir = $filename;
75	$absdir =~ s/[^\/\\]*$//;
76
77	return undef unless ($filename =~ /\.(html?(\.gz)?)$/i && (-e $filename));
78
79	my $gz = 0;
80	if (defined $2) {
81	$gz = $2;
82	$gz = 1 if ($gz =~ /\.gz/i);
83	}
84
85	print STDERR "GBHPlug: processing $filename\n" if $processor->{'verbosity'};
86
87	# create a new document
88	my $doc_obj = new doc ($file, "indexed_doc");
89
90	if ($gz) {
91	open (FILE, "zcat $filename \|") \|\| die "GBHPlug::read - ERROR: zcat can't open $filename\n";
92	} else {
93	open (FILE, $filename) \|\| die "GBHPlug::read - ERROR: can't open $filename\n";
94	}
95	my $cursection = $doc_obj->get_top_section();
96
97	my $text = "";
98	my $line = "";
99	my $title = "";
100	while (defined ($line = <FILE>)) {
101	$text .= $line;
102	}
103
104	# convert to unicode
105	$text = &unicode::unicode2utf8(&gb::gb2unicode($text));
106	# segment the Chinese words
107	$text = &cnseg::segment($text);
108
109	# we'll use the worthless alarm thingy to temporarily replace
110	# '\n' so we'd better check it doesn't occur naturally
111	if ($text =~ /\a/) {
112	print STDERR "GBHPlug::read - 'WARNING '\a' character occurs in text!!\n";
113	}
114
115	# remove line breaks
116	$text =~ s/\n/\a/g;
117
118	# see if there's a <title> tag
119	if ($text =~ /<title[^>]>([^<])<\/title[^>]*>/i) {
120	if (defined $1) {
121	$title = $1;
122	}
123	}
124	# if no title use first 20 characters
125	if ($title eq "") {
126	my $tmptext = $text;
127	$tmptext =~ s/<[^>]*>//g;
128	$title = substr ($tmptext, 0, 20);
129	}
130
131	if ($title ne "") {
132	$doc_obj->add_utf8_metadata ($cursection, "Title", $title);
133	}
134
135	# remove header rubbish
136	$text =~ s/^.?<body[^>]>//i;
137
138	# and any other unwanted tags
139	$text =~ s/<(\/p\|\/html\|\/body)>//g;
140
141	# fix up the image links
142	$text =~ s/(<img[^>]?src\s=\s\"?)([^\">]+)(\"?[^>]>)/
143	&replace_image_links($absdir, $doc_obj, $1, $2, $3)/ige;
144
145	# put line breaks back in
146	$text =~ s/\a/\n/g;
147
148	# add the text
149	$doc_obj->add_utf8_text($cursection, $text);
150
151	# assume that any metadata passed to this plugin is already utf8
152	foreach $field (keys(%$metadata)) {
153	# $metadata->{$field} may be an array reference
154	if (ref ($metadata->{$field}) eq "ARRAY") {
155	map {
156	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
157	} @{$metadata->{$field}};
158	} else {
159	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
160	}
161	}
162
163	# add OID
164	$doc_obj->set_OID ();
165
166	# process the document
167	$processor->process($doc_obj);
168
169	return 1; # processed the file
170	}
171
172	sub replace_image_links {
173
174	my ($dir, $doc_obj, $front, $link, $back) = @_;
175
176	my ($filename, $error);
177	my $foundimage = 0;
178
179	$link =~ s/\/\///;
180	my ($imagetype) = $link =~ /([^\.]*)$/;
181	$imagetype =~ tr/[A-Z]/[a-z]/;
182	if ($imagetype eq "jpg") {$imagetype = "jpeg";}
183	if ($imagetype !~ /^(jpg\|gif\|png)$/) {
184	print STDERR "GBHPlug: Warning - unknown image type ($imagetype)\n";
185	}
186	my ($imagefile) = $link =~ /([^\/]*)$/;
187	my ($imagepath) = $link =~ /^[^\/](.)$/;
188
189	if (defined $imagepath && $imagepath =~ /\w/) {
190	# relative link
191	$filename = &util::filename_cat ($dir, $imagepath);
192	if (-e $filename) {
193	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
194	$foundimage = 1;
195	} else {
196	$error = "GBHPlug: Warning - couldn't find image file $imagefile in either $filename or";
197	}
198	}
199
200	if (!$foundimage) {
201	$filename = &util::filename_cat ($dir, $imagefile);
202	if (-e $filename) {
203	$doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
204	$foundimage = 1;
205	} elsif (defined $error) {
206	print STDERR "$error $filename\n";
207	} else {
208	print STDERR "GBHPlug: Warning - couldn't find image file $imagefile in $filename\n";
209	}
210	}
211
212	if ($foundimage) {
213	return "${front}_httpdocimg_/${imagefile}${back}";
214	} else {
215	return "";
216	}
217	}
218
219	1;
220
221
222
223
224
225
226
227
228
229
230

Note: See TracBrowser for help on using the repository browser.

Download in other formats: