- Timestamp: 2000-07-13T10:21:53+12:00 (24 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/New_Config_Format-branch/gsdl/perllib/plugins/HBPlug.pm
r1020 r1279 24 24 ########################################################################### 25 25 26 # plugin which process an HTML book directory 26 # plugin which processes an HTML book directory 27 28 # This plugin is used by the Humanity Library collections and does not handle 29 # input encodings other than ascii or extended ascii 30 31 # this code is kind of ugly and could no doubt be made to run faster, by leaving 32 # it in this state I hope to encourage people to make their collections use 33 # HBSPlug instead ;-) 34 35 # Use HBSPlug if creating a new collection and marking up files like the 36 # Humanity Library collections. HBSPlug accepts all input encodings but 37 # expects the marked up files to be cleaner than those used by the 38 # Humanity Library collections 27 39 28 40 package HBPlug; 29 41 30 use plugin;31 42 use ghtml; 32 43 use BasPlug; 33 44 use util; 34 use lang;35 45 use doc; 36 use cfgread;37 46 38 47 … … 43 52 sub new { 44 53 my ($class) = @_; 45 $self = new BasPlug ();54 my $self = new BasPlug ("HBPlug", @_); 46 55 47 56 return bless $self, $class; 48 57 } 49 58 50 sub is_recursive { 51 my $self = shift (@_); 52 53 return 0; # this is not a recursive plugin 54 } 59 sub init { 60 my $self = shift (@_); 61 my ($verbosity) = @_; 62 63 $self->BasPlug::init(); 64 65 # this plugin only handles ascii encodings 66 if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) { 67 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" . 68 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n"; 69 } 70 } 71 72 # this is included only to prevent warnings being printed out 73 # from BasPlug::init. 
The process_exp is not used by this plugin 74 sub get_default_process_exp { 75 my $self = shift (@_); 76 77 return "This plugin does not use a process_exp\n"; 78 } 79 55 80 56 81 sub HB_read_html_file { … … 65 90 66 91 my $foundbody = 0; 67 $self->HB_gettext (\$foundbody, $text, FILE);92 $self->HB_gettext (\$foundbody, $text, "FILE"); 68 93 close FILE; 69 94 … … 72 97 $foundbody = 1; 73 98 open (FILE, $htmlfile) || return; 74 $self->HB_gettext (\$foundbody, $text, FILE);99 $self->HB_gettext (\$foundbody, $text, "FILE"); 75 100 close FILE; 76 101 } … … 159 184 } 160 185 186 # if input_encoding is ascii we can call add_utf8_metadata 187 # directly but if it's iso_8859_1 (the default) we need to call 188 # add_metadata so that the ascii2utf8 conversion is done first 189 # this should speed things up a little if processing an ascii only 190 # document with input_encoding set to ascii 191 sub HB_add_metadata { 192 my $self = shift (@_); 193 my ($doc_obj, $cursection, $field, $value) = @_; 194 195 if ($self->{'input_encoding'} eq "ascii") { 196 $doc_obj->add_utf8_metadata ($cursection, $field, $value); 197 } else { 198 $doc_obj->add_metadata ($cursection, $field, $value); 199 } 200 } 161 201 162 202 # return number of files processed, undef if can't process … … 192 232 193 233 # add metadata for top level of document 194 foreach $field (keys(%$metadata)) {234 foreach my $field (keys(%$metadata)) { 195 235 # $metadata->{$field} may be an array reference 196 236 if (ref ($metadata->{$field}) eq "ARRAY") { 197 237 map { 198 $ doc_obj->add_metadata ($cursection, $field, $_);238 $self->HB_add_metadata ($doc_obj, $cursection, $field, $_); 199 239 } @{$metadata->{$field}}; 200 240 } else { 201 $ doc_obj->add_metadata ($cursection, $field, $metadata->{$field});241 $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field}); 202 242 } 203 243 } … … 240 280 241 281 # add the metadata to this section 242 $ doc_obj->add_metadata ($cursection, "Title", $title);282 
$self->HB_add_metadata ($doc_obj, $cursection, "Title", $title); 243 283 244 284 # clean up the section html … … 251 291 252 292 # add the text for this section 253 $doc_obj->add_text ($cursection, $sectiontext); 254 293 if ($self->{'input_encoding'} eq "ascii") { 294 $doc_obj->add_utf8_text ($cursection, $sectiontext); 295 } else { 296 $doc_obj->add_text ($cursection, $sectiontext); 297 } 255 298 } else { 256 299 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
Note: See TracChangeset for help on using the changeset viewer.