source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 9823

Last change on this file since 9823 was 8166, checked in by mdewsnip, 20 years ago

Added FileSize metadata in most plugins.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
RevLine 
[537]1###########################################################################
2#
3# HBPlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1244]26# plugin which processes an HTML book directory
[4]27
[1244]28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
[3540]40# 12/05/02 Added usage datastructure - John Thompson
41
[4]42package HBPlug;
43
[1010]44use ghtml;
[4]45use BasPlug;
[3542]46use unicode;
[286]47use util;
[288]48use doc;
[4]49
[288]50
[4]51sub BEGIN {
52 @ISA = ('BasPlug');
53}
54
[6408]55my $arguments =
56 [ { 'name' => "process_exp",
57 'desc' => "{BasPlug.process_exp}",
58 'type' => "regexp",
59 'reqd' => "no",
60 'deft' => &get_default_process_exp() }
61 ];
62
[3540]63my $options = { 'name' => "HBPlug",
[5680]64 'desc' => "{HBPlug.desc}",
[6408]65 'abstract' => "no",
66 'inherits' => "yes",
67 'args' => $arguments };
[3540]68
[4]69sub new {
70 my ($class) = @_;
[1244]71 my $self = new BasPlug ("HBPlug", @_);
[5924]72 $self->{'plugin_type'} = "HBPlug";
[3540]73 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
74 my $option_list = $self->{'option_list'};
75 push( @{$option_list}, $options );
[4]76
77 return bless $self, $class;
78}
79
[1244]80sub init {
[4]81 my $self = shift (@_);
[1424]82 my ($verbosity, $outhandle) = @_;
[4]83
[1424]84 $self->BasPlug::init($verbosity, $outhandle);
[4744]85 $self->{'input_encoding'} = "iso_8859_1";
[1244]86
87 # this plugin only handles ascii encodings
88 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
89 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
90 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
[3540]91 }
[4]92}
93
[1244]94# this is included only to prevent warnings being printed out
95# from BasPlug::init. The process_exp is not used by this plugin
96sub get_default_process_exp {
97 my $self = shift (@_);
98
99 return "This plugin does not use a process_exp\n";
100}
101
102
[4]103sub HB_read_html_file {
104 my $self = shift (@_);
[318]105 my ($htmlfile, $text) = @_;
[4]106
107 # load in the file
108 if (!open (FILE, $htmlfile)) {
[1424]109 my $outhandle = $self->{'outhandle'};
110 print $outhandle "ERROR - could not open $htmlfile\n";
[318]111 return;
[4]112 }
113
[318]114 my $foundbody = 0;
[1244]115 $self->HB_gettext (\$foundbody, $text, "FILE");
[318]116 close FILE;
117
118 # just in case there was no <body> tag
119 if (!$foundbody) {
120 $foundbody = 1;
121 open (FILE, $htmlfile) || return;
[1244]122 $self->HB_gettext (\$foundbody, $text, "FILE");
[318]123 close FILE;
124 }
[3542]125 # text is in utf8
[318]126}
127
[3542]128# converts the text to utf8, as ghtml does that for &eacute; etc.
[318]129sub HB_gettext {
130 my $self = shift (@_);
131 my ($foundbody, $text, $handle) = @_;
[1424]132 my $outhandle = $self->{'outhandle'};
[318]133
[4]134 my $line = "";
[318]135 while (defined ($line = <$handle>)) {
[4]136 # look for body tag
[318]137 if (!$$foundbody) {
[4]138 if ($line =~ s/^.*<body[^>]*>//i) {
[318]139 $$foundbody = 1;
[4]140 } else {
141 next;
142 }
143 }
[318]144
[4]145 # check for symbol fonts
146 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
147 my $font = $1;
[1424]148 print $outhandle "HBPlug::HB_gettext - warning removed font $font\n"
[4]149 if ($font !~ /^arial$/i);
150 }
151
152 $line =~ s/<\/p>//ig; # remove </p> tags
153 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
154
[318]155 $$text .= $line;
[4]156 }
[3542]157 #
158 if ($self->{'input_encoding'} eq "iso_8859_1") {
159 # convert to utf-8
160 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
161 }
162 # convert any alphanumeric character entities to their utf-8
163 # equivalent for indexing purposes
164 &ghtml::convertcharentities ($$text);
165
[318]166 $$text =~ s/\s+/ /g; # remove \n's
[4]167}
168
169sub HB_clean_section {
170 my $self = shift (@_);
171 my ($section) = @_;
172
173 # remove tags without a starting tag from the section
174 my ($tag, $tagstart);
175 while ($section =~ /<\/([^>]{1,10})>/) {
176 $tag = $1;
177 $tagstart = index($section, "<$tag");
178 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
179 $section =~ s/<\/$tag>//;
180 }
181
182 # remove extra paragraph tags
183 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
184
185 # remove extra stuff at the end of the section
186 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
187
188 # add a newline at the beginning of each paragraph
189 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
190
191 # add a newline every 80 characters at a word boundary
192 # Note: this regular expression puts a line feed before
193 # the last word in each section, even when it is not
194 # needed.
195 $section =~ s/(.{1,80})\s/$1\n/g;
196
[317]197 # fix up the image links
198 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
[1020]199 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[317]200 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
[1020]201 <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
[4]202
203 return $section;
204}
205
206
207sub shorten {
208 my $self = shift (@_);
209 my ($text) = @_;
210
211 return "\"$text\"" if (length($text) < 100);
212
213 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
214 substr ($text, length($text)-50) . "\"";
215}
216
[1244]217# if input_encoding is ascii we can call add_utf8_metadata
218# directly but if it's iso_8859_1 (the default) we need to call
219# add_metadata so that the ascii2utf8 conversion is done first
220# this should speed things up a little if processing an ascii only
221# document with input_encoding set to ascii
222sub HB_add_metadata {
223 my $self = shift (@_);
224 my ($doc_obj, $cursection, $field, $value) = @_;
[4]225
[3542]226# All text should now be in utf-8
227# if ($self->{'input_encoding'} eq "ascii") {
228 $doc_obj->add_utf8_metadata ($cursection, $field, $value);
229# } else {
230# $doc_obj->add_metadata ($cursection, $field, $value);
231# }
[1244]232}
233
[317]234# return number of files processed, undef if can't process
[4]235# Note that $base_dir might be "" and that $file might
236# include directories
237sub read {
238 my $self = shift (@_);
[6332]239 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
[1424]240 my $outhandle = $self->{'outhandle'};
[4]241
[288]242 # get the html filename and see if this is an HTML Book...
243 my $jobnumber = $file;
[4]244 if ($file =~ /[\\\/]/) {
245 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
246 }
[317]247 return undef unless defined $jobnumber;
[286]248 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
[317]249 return undef unless -e $htmlfile;
[4]250
[6332]251 print STDERR "<Processing n='$file' p='HBPlug'>\n" if ($gli);
[1424]252 print $outhandle "HBPlug: processing $file\n";
[4]253
254 # read in the file and do basic html cleaning (removing header etc)
[318]255 my $html = "";
256 $self->HB_read_html_file ($htmlfile, \$html);
[3542]257 # html is in utf8
[4]258
259 # create a new document
260 my $doc_obj = new doc ($file, "indexed_doc");
[2327]261 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
[4]262
263 # copy the book cover if it exists
[286]264 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
[796]265 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
[7508]266 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8166]267 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
268 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
[4]269
[317]270 my $cursection = $doc_obj->get_top_section();
[8121]271
[317]272 # add metadata for top level of document
[1244]273 foreach my $field (keys(%$metadata)) {
[317]274 # $metadata->{$field} may be an array reference
275 if (ref ($metadata->{$field}) eq "ARRAY") {
276 map {
[1244]277 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
[317]278 } @{$metadata->{$field}};
279 } else {
[1244]280 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
[317]281 }
282 }
283
[4]284 # process the file one section at a time
285 my $curtoclevel = 1;
286 my $firstsection = 1;
287 while (length ($html) > 0) {
[339]288 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
[4]289 my $toclevel = $3;
290 my $title = $4;
291 my $sectiontext = "";
[321]292 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
[4]293 $sectiontext = $1;
294 } else {
295 $sectiontext = $html;
296 $html = "";
297 }
298
299 # remove tags and extra spaces from the title
300 $title =~ s/<\/?[^>]+>//g;
301 $title =~ s/^\s+|\s+$//g;
302
303 # close any sections below the current level and
304 # create a new section (special case for the firstsection)
305 while (($curtoclevel > $toclevel) ||
306 (!$firstsection && $curtoclevel == $toclevel)) {
307 $cursection = $doc_obj->get_parent_section ($cursection);
308 $curtoclevel--;
309 }
310 if ($curtoclevel+1 < $toclevel) {
[1424]311 print $outhandle "WARNING - jump in toc levels in $htmlfile " .
[4]312 "from $curtoclevel to $toclevel\n";
313 }
314 while ($curtoclevel < $toclevel) {
315 $curtoclevel++;
316 $cursection =
317 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
318 }
319
320 # add the metadata to this section
[1244]321 $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
[286]322
[4]323 # clean up the section html
324 $sectiontext = $self->HB_clean_section($sectiontext);
325
326 # associate any files
[286]327 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
[1020]328 if /_httpdocimg_\/([^\"]+)\"/; 0; }
329 split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
[4]330
331 # add the text for this section
[3542]332# All read text should now be in utf-8
333# if ($self->{'input_encoding'} eq "ascii") {
334 $doc_obj->add_utf8_text ($cursection, $sectiontext);
335# } else {
336# $doc_obj->add_text ($cursection, $sectiontext);
337# }
[4]338 } else {
[1424]339 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
[4]340 "\nin $htmlfile\n";
341 last;
342 }
[317]343 $firstsection = 0;
[4]344 }
345
346 # add a OID
347 $doc_obj->set_OID ();
348
349 # process the document
[288]350 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
[4]351
352 return 1; # processed the file
353}
354
355
3561;
Note: See TracBrowser for help on using the repository browser.