source: trunk/gsdl/perllib/plugins/GBHPlug.pm@ 1020

Last change on this file since 1020 was 1020, checked in by sjboddie, 24 years ago

changed paths to collection images (again!)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1###########################################################################
2#
3# GBHPlug.pm -- plugin for processing gb encoded html
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# creates simple single-level document from .htm or .html files
27# (case-insensitive match on filenames). Adds Title metadata
28# taken from <title> tags if found otherwise first 100 characters
29# outside of tags.
30# will also attempt to include images which it will search for in
31# the same directory as the document itself (it will also search
32# directories relative to that directory).
33
34# this plugin currently does nothing with href links so relative links
35# may become broken.
36
37
38package GBHPlug;
39
40use BasPlug;
41use sorttools;
42use util;
43use unicode;
44use cnseg;
45use gb;
46
47
48sub BEGIN {
49 @ISA = ('BasPlug');
50}
51
52sub new {
53 my ($class) = @_;
54 $self = new BasPlug ();
55
56 return bless $self, $class;
57}
58
59sub is_recursive {
60 my $self = shift (@_);
61
62 return 0; # this is not a recursive plugin
63}
64
65
66# return number of files processed, undef if can't process
67# Note that $base_dir might be "" and that $file might
68# include directories
69sub read {
70 my $self = shift (@_);
71 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
72
73 my $filename = &util::filename_cat($base_dir, $file);
74 my $absdir = $filename;
75 $absdir =~ s/[^\/\\]*$//;
76
77 return undef unless ($filename =~ /\.(html?(\.gz)?)$/i && (-e $filename));
78
79 my $gz = 0;
80 if (defined $2) {
81 $gz = $2;
82 $gz = 1 if ($gz =~ /\.gz/i);
83 }
84
85 print STDERR "GBHPlug: processing $filename\n" if $processor->{'verbosity'};
86
87 # create a new document
88 my $doc_obj = new doc ($file, "indexed_doc");
89
90 if ($gz) {
91 open (FILE, "zcat $filename |") || die "GBHPlug::read - ERROR: zcat can't open $filename\n";
92 } else {
93 open (FILE, $filename) || die "GBHPlug::read - ERROR: can't open $filename\n";
94 }
95 my $cursection = $doc_obj->get_top_section();
96
97 my $text = "";
98 my $line = "";
99 my $title = "";
100 while (defined ($line = <FILE>)) {
101 $text .= $line;
102 }
103
104 # convert to unicode
105 $text = &unicode::unicode2utf8(&gb::gb2unicode($text));
106 # segment the Chinese words
107 $text = &cnseg::segment($text);
108
109 # we'll use the worthless alarm thingy to temporarily replace
110 # '\n' so we'd better check it doesn't occur naturally
111 if ($text =~ /\a/) {
112 print STDERR "GBHPlug::read - 'WARNING '\a' character occurs in text!!\n";
113 }
114
115 # remove line breaks
116 $text =~ s/\n/\a/g;
117
118 # see if there's a <title> tag
119 if ($text =~ /<title[^>]*>([^<]*)<\/title[^>]*>/i) {
120 if (defined $1) {
121 $title = $1;
122 }
123 }
124 # if no title use first 20 characters
125 if ($title eq "") {
126 my $tmptext = $text;
127 $tmptext =~ s/<[^>]*>//g;
128 $title = substr ($tmptext, 0, 20);
129 }
130
131 if ($title ne "") {
132 $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
133 }
134
135 # remove header rubbish
136 $text =~ s/^.*?<body[^>]*>//i;
137
138 # and any other unwanted tags
139 $text =~ s/<(\/p|\/html|\/body)>//g;
140
141 # fix up the image links
142 $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/
143 &replace_image_links($absdir, $doc_obj, $1, $2, $3)/ige;
144
145 # put line breaks back in
146 $text =~ s/\a/\n/g;
147
148 # add the text
149 $doc_obj->add_utf8_text($cursection, $text);
150
151 # assume that any metadata passed to this plugin is already utf8
152 foreach $field (keys(%$metadata)) {
153 # $metadata->{$field} may be an array reference
154 if (ref ($metadata->{$field}) eq "ARRAY") {
155 map {
156 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
157 } @{$metadata->{$field}};
158 } else {
159 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
160 }
161 }
162
163 # add OID
164 $doc_obj->set_OID ();
165
166 # process the document
167 $processor->process($doc_obj);
168
169 return 1; # processed the file
170}
171
172sub replace_image_links {
173
174 my ($dir, $doc_obj, $front, $link, $back) = @_;
175
176 my ($filename, $error);
177 my $foundimage = 0;
178
179 $link =~ s/\/\///;
180 my ($imagetype) = $link =~ /([^\.]*)$/;
181 $imagetype =~ tr/[A-Z]/[a-z]/;
182 if ($imagetype eq "jpg") {$imagetype = "jpeg";}
183 if ($imagetype !~ /^(jpg|gif|png)$/) {
184 print STDERR "GBHPlug: Warning - unknown image type ($imagetype)\n";
185 }
186 my ($imagefile) = $link =~ /([^\/]*)$/;
187 my ($imagepath) = $link =~ /^[^\/]*(.*)$/;
188
189 if (defined $imagepath && $imagepath =~ /\w/) {
190 # relative link
191 $filename = &util::filename_cat ($dir, $imagepath);
192 if (-e $filename) {
193 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
194 $foundimage = 1;
195 } else {
196 $error = "GBHPlug: Warning - couldn't find image file $imagefile in either $filename or";
197 }
198 }
199
200 if (!$foundimage) {
201 $filename = &util::filename_cat ($dir, $imagefile);
202 if (-e $filename) {
203 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype");
204 $foundimage = 1;
205 } elsif (defined $error) {
206 print STDERR "$error $filename\n";
207 } else {
208 print STDERR "GBHPlug: Warning - couldn't find image file $imagefile in $filename\n";
209 }
210 }
211
212 if ($foundimage) {
213 return "${front}_httpdocimg_/${imagefile}${back}";
214 } else {
215 return "";
216 }
217}
218
2191;
220
221
222
223
224
225
226
227
228
229
230
Note: See TracBrowser for help on using the repository browser.