root/gsdl/trunk/perllib/plugins/HBPlugin.pm @ 17026

Revision 17026, 10.0 KB (checked in by kjdon, 11 years ago)

OID generation modifications: OIDtype and OIDmetadata options now available for plugins as well as import. OIDtype for plugins defaults to auto - if set to auto, then use the values from import. All plugins now call self->add_OID instead of doc_obj->set_OID. This sets the doc_obj OIDtype so that doesn't need to be donein other places any more. all plugins have the get_oid_hash_type method - normally returns hash_on_file, but can be overridden to return hash_on_ga_xml for those plugins that don't want hashing on file (MP3,OggVorbis?...)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# HBPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which processes an HTML book directory
27
28# This plugin is used by the Humanity Library collections and does not handle
29# input encodings other than ascii or extended ascii
30
31# this code is kind of ugly and could no doubt be made to run faster, by leaving
32# it in this state I hope to encourage people to make their collections use
33# HBSPlug instead ;-)
34
35# Use HBSPlug if creating a new collection and marking up files like the
36# Humanity Library collections. HBSPlug accepts all input encodings but
37# expects the marked up files to be cleaner than those used by the
38# Humanity Library collections
39
40package HBPlugin;
41
42use ghtml;
43use BasePlugin;
44use unicode;
45use util;
46use doc;
47
48use strict;
49no strict 'refs'; # allow filehandles to be variables and viceversa
50
51sub BEGIN {
52    @HBPlugin::ISA = ('BasePlugin');
53}
54my $encoding_list =     
55    [ { 'name' => "ascii",
56    'desc' => "{BasePlugin.encoding.ascii}" },
57      { 'name' => "iso_8859_1",
58    'desc' => "{HBPlugin.encoding.iso_8859_1}" } ];
59 
60my $arguments =
61    [ { 'name' => "process_exp",
62    'desc' => "{BasePlugin.process_exp}",
63    'type' => "regexp",
64    'reqd' => "no",
65    'deft' => &get_default_process_exp() },
66      { 'name' => "input_encoding",
67    'desc' => "{ReadTextFile.input_encoding}",
68    'type' => "enum",
69    'deft' => "iso_8859_1",
70    'list' => $encoding_list,
71    'reqd' => "no" }
72      ];
73
74my $options = { 'name'     => "HBPlugin",
75        'desc'     => "{HBPlugin.desc}",
76        'abstract' => "no",
77        'inherits' => "yes",
78        'args'     => $arguments };
79
80sub new {
81    my ($class) = shift (@_);
82    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
83    push(@$pluginlist, $class);
84
85    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
86    push(@{$hashArgOptLists->{"OptList"}},$options);
87
88    my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
89
90    return bless $self, $class;
91}
92
93# this is included only to prevent warnings being printed out
94# from BasePlugin::init. The process_exp is not used by this plugin
95sub get_default_process_exp {
96    my $self = shift (@_);
97
98    return "This plugin does not use a process_exp\n";
99}
100
101
102sub HB_read_html_file {
103    my $self = shift (@_);
104    my ($htmlfile, $text) = @_;
105
106    # load in the file
107    if (!open (FILE, $htmlfile)) {
108    my $outhandle = $self->{'outhandle'};
109    print $outhandle "ERROR - could not open $htmlfile\n";
110    return;
111    }
112
113    my $foundbody = 0;
114    $self->HB_gettext (\$foundbody, $text, "FILE");
115    close FILE;
116
117    # just in case there was no <body> tag
118    if (!$foundbody) {
119    $foundbody = 1;
120    open (FILE, $htmlfile) || return;
121    $self->HB_gettext (\$foundbody, $text, "FILE");
122    close FILE;
123    }
124    # text is in utf8
125}
126
127# converts the text to utf8, as ghtml does that for &eacute; etc.
128sub HB_gettext {
129    my $self = shift (@_);
130    my ($foundbody, $text, $handle) = @_;
131    my $outhandle = $self->{'outhandle'};
132
133    my $line = "";
134    while (defined ($line = <$handle>)) {
135    # look for body tag
136    if (!$$foundbody) {
137        if ($line =~ s/^.*<body[^>]*>//i) {
138        $$foundbody = 1;
139        } else {
140        next;
141        }
142    }
143   
144    # check for symbol fonts
145    if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
146        my $font = $1;
147        print $outhandle "HBPlugin::HB_gettext - warning removed font $font\n"
148        if ($font !~ /^arial$/i);
149    }
150
151    $line =~ s/<\/p>//ig;   # remove </p> tags
152    $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
153
154    $$text .= $line;
155    }
156    #
157    if ($self->{'input_encoding'} eq "iso_8859_1") {
158    # convert to utf-8
159    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
160    }
161    # convert any alphanumeric character entities to their utf-8
162    # equivalent for indexing purposes
163    &ghtml::convertcharentities ($$text);
164
165    $$text =~ s/\s+/ /g; # remove \n's
166}
167
168sub HB_clean_section {
169    my $self = shift (@_);
170    my ($section) = @_;
171
172    # remove tags without a starting tag from the section
173    my ($tag, $tagstart);
174    while ($section =~ /<\/([^>]{1,10})>/) {
175    $tag = $1;
176    $tagstart = index($section, "<$tag");
177    last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
178    $section =~ s/<\/$tag>//;
179    }
180   
181    # remove extra paragraph tags
182    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
183   
184    # remove extra stuff at the end of the section
185    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
186   
187    # add a newline at the beginning of each paragraph
188    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
189   
190    # add a newline every 80 characters at a word boundary
191    # Note: this regular expression puts a line feed before
192    # the last word in each section, even when it is not
193    # needed.
194    $section =~ s/(.{1,80})\s/$1\n/g;
195   
196    # fix up the image links
197    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
198    <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
199    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
200    <center><img src=\"_httpdocimg_\/$1\"><\/center><br>/ig;
201
202    return $section;
203}
204
205
206sub shorten {
207    my $self = shift (@_);
208    my ($text) = @_;
209
210    return "\"$text\"" if (length($text) < 100);
211
212    return "\"" . substr ($text, 0, 50) . "\" ... \"" .
213    substr ($text, length($text)-50) . "\"";
214}
215
216# return number of files processed, undef if can't process
217# Note that $base_dir might be "" and that $file might
218# include directories
219sub read {
220    my $self = shift (@_);
221    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
222    my $outhandle = $self->{'outhandle'};
223
224    # get the html filename and see if this is an HTML Book...
225    my $jobnumber = $file;
226    if ($file =~ /[\\\/]/) {
227    ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
228    }
229    return undef unless defined $jobnumber;
230    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
231    return undef unless -e $htmlfile;
232
233    print STDERR "<Processing n='$file' p='HBPlugin'>\n" if ($gli);
234    print $outhandle "HBPlugin: processing $file\n";
235
236    # read in the file and do basic html cleaning (removing header etc)
237    my $html = "";
238    $self->HB_read_html_file ($htmlfile, \$html);
239    # html is in utf8
240
241    # create a new document
242    my $doc_obj = new doc ($file, "indexed_doc");
243
244    # copy the book cover if it exists
245    my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
246    $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg");
247    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
248    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileFormat", "HB");
249    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $htmlfile));
250
251    my $cursection = $doc_obj->get_top_section();
252   
253    # add metadata for top level of document
254    foreach my $field (keys(%$metadata)) {
255    # $metadata->{$field} may be an array reference
256    if (ref ($metadata->{$field}) eq "ARRAY") {
257        map {
258        $doc_obj->add_utf8_metadata($cursection, $field, $_);
259        } @{$metadata->{$field}};
260    } else {
261        $doc_obj->add_utf8_metadata($cursection, $field, $metadata->{$field});
262    }
263    }
264
265    # process the file one section at a time
266    my $curtoclevel = 1;
267    my $firstsection = 1;
268    while (length ($html) > 0) {
269    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
270        my $toclevel = $3;
271        my $title = $4;
272        my $sectiontext = "";
273        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
274        $sectiontext = $1;
275        } else {
276        $sectiontext = $html;
277        $html = "";
278        }
279
280        # remove tags and extra spaces from the title
281        $title =~ s/<\/?[^>]+>//g;
282        $title =~ s/^\s+|\s+$//g;
283
284        # close any sections below the current level and
285        # create a new section (special case for the firstsection)
286        while (($curtoclevel > $toclevel) ||
287           (!$firstsection && $curtoclevel == $toclevel)) {
288        $cursection = $doc_obj->get_parent_section ($cursection);
289        $curtoclevel--;
290        }
291        if ($curtoclevel+1 < $toclevel) {
292        print $outhandle "WARNING - jump in toc levels in $htmlfile " .
293            "from $curtoclevel to $toclevel\n";
294        }
295        while ($curtoclevel < $toclevel) {
296        $curtoclevel++;
297        $cursection =
298            $doc_obj->insert_section($doc_obj->get_end_child($cursection));
299        }
300
301        # add the metadata to this section
302        $doc_obj->add_utf8_metadata($cursection, "Title", $title);
303
304        # clean up the section html
305        $sectiontext = $self->HB_clean_section($sectiontext);
306
307        # associate any files
308        map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
309              if /_httpdocimg_\/([^\"]+)\"/; 0; }
310             split (/(_httpdocimg_\/[^\"]+\")/, $sectiontext);
311
312        # add the text for this section
313        $doc_obj->add_utf8_text ($cursection, $sectiontext);
314    } else {
315        print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
316        "\nin $htmlfile\n";
317        last;
318    }
319    $firstsection = 0;
320    }
321
322    # add a OID
323    $self->add_OID($doc_obj);
324
325    # process the document
326    $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
327       
328    return 1; # processed the file
329}
330
331
3321;
Note: See TracBrowser for help on using the browser.