source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 23419

Last change on this file since 23419 was 23419, checked in by max, 13 years ago

Setting the values to store as block files is now done through an API call to BasePlugin. This way, any uniform requirement (such as putting in both C:\... and c:\... entries for Windows) can be handled in one place.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 62.8 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
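# For illustration only (not part of the original source): a typical
# collect.cfg line that enables this plugin, using options declared in the
# $arguments list further down. The option values are invented examples,
# not recommendations:
#
#   plugin HTMLPlugin -description_tags -metadata_fields Title,Author
#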
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use XMLParser;
47
48use Image::Size;
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53}
54
55use strict; # every perl program should have this!
56no strict 'refs'; # make an exception so we can use variables as filehandles
57
58my $arguments =
59 [ { 'name' => "process_exp",
60 'desc' => "{BasePlugin.process_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_process_exp() },
63 { 'name' => "block_exp",
64 'desc' => "{BasePlugin.block_exp}",
65 'type' => 'regexp',
66 'deft' => &get_default_block_exp() },
67 { 'name' => "nolinks",
68 'desc' => "{HTMLPlugin.nolinks}",
69 'type' => "flag" },
70 { 'name' => "keep_head",
71 'desc' => "{HTMLPlugin.keep_head}",
72 'type' => "flag" },
73 { 'name' => "no_metadata",
74 'desc' => "{HTMLPlugin.no_metadata}",
75 'type' => "flag" },
76 { 'name' => "metadata_fields",
77 'desc' => "{HTMLPlugin.metadata_fields}",
78 'type' => "string",
79 'deft' => "Title" },
80 { 'name' => "metadata_field_separator",
81 'desc' => "{HTMLPlugin.metadata_field_separator}",
82 'type' => "string",
83 'deft' => "" },
84 { 'name' => "hunt_creator_metadata",
85 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
86 'type' => "flag" },
87 { 'name' => "file_is_url",
88 'desc' => "{HTMLPlugin.file_is_url}",
89 'type' => "flag" },
90 { 'name' => "assoc_files",
91 'desc' => "{HTMLPlugin.assoc_files}",
92 'type' => "regexp",
93 'deft' => &get_default_block_exp() },
94 { 'name' => "rename_assoc_files",
95 'desc' => "{HTMLPlugin.rename_assoc_files}",
96 'type' => "flag" },
97 { 'name' => "title_sub",
98 'desc' => "{HTMLPlugin.title_sub}",
99 'type' => "string",
100 'deft' => "" },
101 { 'name' => "description_tags",
102 'desc' => "{HTMLPlugin.description_tags}",
103 'type' => "flag" },
104 # retain this for backward compatibility (w3mir option was replaced by
105 # file_is_url)
106 { 'name' => "w3mir",
107# 'desc' => "{HTMLPlugin.w3mir}",
108 'type' => "flag",
109 'hiddengli' => "yes"},
110 { 'name' => "no_strip_metadata_html",
111 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
112 'type' => "string",
113 'deft' => "",
114 'reqd' => "no"},
115 { 'name' => "sectionalise_using_h_tags",
116 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
117 'type' => "flag" },
118 { 'name' => "use_realistic_book",
119 'desc' => "{HTMLPlugin.tidy_html}",
120 'type' => "flag"},
121 { 'name' => "old_style_HDL",
122 'desc' => "{HTMLPlugin.old_style_HDL}",
123 'type' => "flag"},
124 {'name' => "processing_tmp_files",
125 'desc' => "{BasePlugin.processing_tmp_files}",
126 'type' => "flag",
127 'hiddengli' => "yes"}
128 ];
129
130my $options = { 'name' => "HTMLPlugin",
131 'desc' => "{HTMLPlugin.desc}",
132 'abstract' => "no",
133 'inherits' => "yes",
134 'args' => $arguments };
135
136
137sub new {
138 my ($class) = shift (@_);
139 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140 push(@$pluginlist, $class);
141
142 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143 push(@{$hashArgOptLists->{"OptList"}},$options);
144
145
146 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
147
148 if ($self->{'w3mir'}) {
149 $self->{'file_is_url'} = 1;
150 }
151 $self->{'aux_files'} = {};
152 $self->{'dir_num'} = 0;
153 $self->{'file_num'} = 0;
154
155 return bless $self, $class;
156}
157
158# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
159# if we have e.g. <script language="javascript" src="img/lib.js@123">
160# blocking is now done by reading through the file and recording all the
161# images and other files
162sub get_default_block_exp {
163 my $self = shift (@_);
164
165 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
166 return "";
167}
168
169sub get_default_process_exp {
170 my $self = shift (@_);
171
172 # the last option is an attempt to encode the concept of an html query ...
173 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
174}
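# For illustration, filenames the default process_exp above accepts or
# rejects (the sample names are invented):
#
#   "index.html", "page.shtml", "report.php3", "search.cgi",
#   "lookup.asp?id=3&view=full"   => matched, so handled by this plugin
#   "logo.gif", "styles.css"      => not matched, left to block/assoc handling
#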
175
176sub store_block_files
177{
178 my $self = shift (@_);
179 my ($filename_full_path, $block_hash) = @_;
180
181 my $html_fname = $filename_full_path;
182 my @file_blocks;
183
184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187 # read in the file without decoding ($raw_text holds the raw bytes)
188 my $raw_text = "";
189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191 my $textref = \$raw_text;
192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
208
209 if(!defined $self->{'unicode_to_original_filename'}) {
210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
211 $self->{'unicode_to_original_filename'} = {};
212 }
213
214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216 # remove quotes from link at start and end if necessary
217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
220 }
221
222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
224 $raw_link =~ s/([^&])\#.*$/$1/s;
225
226 # some links may just be anchor names
227 next unless ($raw_link =~ /\S+/);
228
229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
230 # Turn relative file path into full path
231 my $dirname = &File::Basename::dirname($filename_full_path);
232 $raw_link = &util::filename_cat($dirname, $raw_link);
233 }
234 $raw_link = $self->eval_dir_dots($raw_link);
235
236 # this is the actual filename on the filesystem (that the link refers to)
237 my $url_original_filename = $self->opt_url_decode($raw_link);
238
239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
244
245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
277 my $outhandle = $self->{'outhandle'};
278
279 print $outhandle "URL Encoding $url_original_filename\n";
280 print $outhandle " ->$unicode_url_original_filename\n";
281
282 # Allow for possibility of raw byte version and Unicode versions of file
283 $self->block_filename($block_hash,$unicode_url_original_filename);
284 }
285
286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
287
288 $self->block_filename($block_hash,$url_original_filename);
289
290 }
291}
292
293# Given a filename in any encoding, URL-decode it to get back the original filename
294# in the original encoding. Because this method is intended to work out the *original*
295# filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
296# string already exists in the local folder.
297#
298sub opt_url_decode {
299 my $self = shift (@_);
300 my ($raw_link) = @_;
301
302
303 # Replace %XX's in URL with decoded value if required.
304 # Note that the filename may include the %XX in some situations
305
306## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
307
308 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
309 if (!-e $raw_link) {
310 $raw_link = &unicode::url_decode($raw_link,1);
311 }
312 }
313
314 return $raw_link;
315}
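# Sketch of the behaviour above, with invented filenames:
#
#   $self->opt_url_decode("caf%C3%A9.html");  # -> the decoded name "café.html",
#                                             #    unless a file literally called
#                                             #    "caf%C3%A9.html" exists locally
#   $self->opt_url_decode("plain.html");      # -> "plain.html", left untouched
#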
316
317sub read_into_doc_obj
318{
319 my $self = shift (@_);
320 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
321
322 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
323
324 # Lookup content_encoding worked out in file_block pass for this file
325 # Store it under the local name 'content_encoding' so it's nice and
326 # easy to access
327 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
328
329 # get the input file
330 my $input_filename = $file;
331 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
332 $suffix = lc($suffix);
333 my $tidy_filename;
334 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
335 {
336 # because the document has to be sectionalised, set the description tags
337 $self->{'description_tags'} = 1;
338
339 # set the file to be tidied
340 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
341
342 # get the tidied file
343 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
344 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
345
346 # derive tmp filename from input filename
347 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
348
349 # set the new input file and base_dir to be from the tidied file
350 $file = "$tailname$suffix";
351 $base_dir = $dirname;
352 }
353
354 # call the parent read_into_doc_obj
355 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
356 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
357 {
358 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
359 my $collect_file = &util::filename_within_collection($filename_full_path);
360 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
361 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
362 # build. So set it manually.
363 $doc_obj->set_source_path($filename_full_path);
364 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
365 $doc_obj->set_converted_filename($collect_conv_file);
366
367 my $plugin_filename_encoding = $self->{'filename_encoding'};
368 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
369 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
370 }
371
372 delete $self->{'store_content_encoding'}->{$filename_full_path};
373 $self->{'content_encoding'} = undef;
374
375 return ($process_status,$doc_obj);
376}
377
378# do plugin specific processing of doc_obj
379sub process {
380 my $self = shift (@_);
381 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
382 my $outhandle = $self->{'outhandle'};
383
384 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
385 # this makes life so much easier... perl can cope with unix-style '/'s.
386 $base_dir =~ s@(\\)+@/@g;
387 $file =~ s@(\\)+@/@g;
388 }
389
390 my $filename = &util::filename_cat($base_dir,$file);
391 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
392 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
393
394 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
395 # And again
396 $upgraded_base_dir =~ s@(\\)+@/@g;
397 $upgraded_filename =~ s@(\\)+@/@g;
398
399 # Need to make sure there is a '/' on the end of upgraded_base_dir
400 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
401 $upgraded_base_dir .= "/";
402 }
403 }
404 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
405
406 # reset per-doc stuff...
407 $self->{'aux_files'} = {};
408 $self->{'dir_num'} = 0;
409 $self->{'file_num'} = 0;
410
411 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
412 # you can also include metadata in the format (X can be any number)
413 # <hX>Title<!--gsdl-metadata
414 # <Metadata name="name1">value1</Metadata>
415 # ...
416 # <Metadata name="nameN">valueN</Metadata>
417 #--></hX>
418 if ($self->{'sectionalise_using_h_tags'}) {
419 # description_tags should always be activated because we convert headings to description tags
420 $self->{'description_tags'} = 1;
421
422 my $arrSections = [];
423 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
424
425 if (scalar(@$arrSections)) {
426 my $strMetadata = $self->update_section_data($arrSections, -1);
427 if (length($strMetadata)) {
428 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
429 $$textref =~ s/<\/body>/$strMetadata/ig;
430 }
431 }
432 }
433
434 my $cursection = $doc_obj->get_top_section();
435
436 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
437 unless $self->{'no_metadata'} || $self->{'description_tags'};
438
439 # Store URL for page as metadata - this can be used for an
440 # altavista style search interface. The URL won't be valid
441 # unless the file structure contains the domain name (i.e.
442 # like when w3mir is used to download a website).
443
444 # URL metadata (even for invalid URLs) is used to support internal
445 # links, so even if 'file_is_url' is off, we still need to store the info
446
447 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
448
449# my $utf8_file = $self->filename_to_utf8_metadata($file);
450# $utf8_file =~ s/&\#095;/_/g;
451# variable below used to be utf8_file
452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
454
455 my $web_url = "http://";
456 my $utf8_web_url = "http://";
457 if(defined $dirname) { # local directory
458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
467 $utf8_web_url = "ftp://";
468 }
469 $dirname = $self->eval_dir_dots($dirname);
470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
474 } else {
475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
477 }
478 $web_url =~ s/\\/\//g;
479 $utf8_web_url =~ s/\\/\//g;
480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
484 }
485
486
487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
489
490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
537 my $metaname = $1;
538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
552 $metavalue =~ s/\s+/ /sg;
553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
576 if ($$textref =~ m/\S/) {
577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
631
632 # single section document
633 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
634 }
635
636 return 1;
637}
638
639
640sub process_heading
641{
642 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
643 $strHeadingText = '' if (!defined($strHeadingText));
644
645 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
646
647 my $strSecMetadata = '';
648 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
649 {
650 $strSecMetadata .= $1;
651 }
652
653 $strHeadingText =~ s/^\s+//g;
654 $strHeadingText =~ s/\s+$//g;
655 $strSecMetadata =~ s/^\s+//g;
656 $strSecMetadata =~ s/\s+$//g;
657
658 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
659
660 if (length($strSecMetadata)) {
661 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
662 }
663
664 $strMetadata .= "\t</Description>\n";
665
666 return "<!--" . $strMetadata . "-->";
667}
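# Worked example of what process_heading emits, with invented values.
# Given the heading
#
#   <h2>Chapter One<!--gsdl-metadata
#   <Metadata name="Author">A. Nonymous</Metadata>
#   --></h2>
#
# the replacement comment looks roughly like
#
#   <!-- ...</Section> tags closing any deeper levels...
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#           <Metadata name="Author">A. Nonymous</Metadata>
#       </Description>
#   -->
#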
668
669
670sub update_section_data
671{
672 my ($self, $arrSections, $nCurTocNo) = @_;
673 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
674
675 if ($nSections == 0) {
676 push @$arrSections, $nCurTocNo;
677 return $strBuffer;
678 }
679 $nLast = $arrSections->[$nSections - 1];
680 if ($nCurTocNo > $nLast) {
681 push @$arrSections, $nCurTocNo;
682 return $strBuffer;
683 }
684 for(my $i = $nSections - 1; $i >= 0; $i--) {
685 if ($nCurTocNo <= $arrSections->[$i]) {
686 $strBuffer .= "\n</Section>";
687 pop @$arrSections;
688 }
689 }
690 push @$arrSections, $nCurTocNo;
691 return $strBuffer;
692}
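# Illustrative trace of the section stack above for headings appearing in
# the order h1, h2, h2, h1:
#
#   h1 -> stack: 1        (a new <Section> is opened via process_heading)
#   h2 -> stack: 1,2      (nested <Section> opened)
#   h2 -> stack: 1,2      (previous h2 section closed, new one opened)
#   h1 -> stack: 1        (both the h2 and h1 sections closed,
#                          then a new level-1 section is opened)
#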
693
694
695# note that process_section may be called multiple times for a single
696# section (relying on the fact that add_utf8_text appends the text to any
697# that may exist already).
698sub process_section {
699 my $self = shift (@_);
700 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
701 # trap links
702 if (!$self->{'nolinks'}) {
703 # usemap="./#index" not handled correctly => change to "#index"
704## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
705
706## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
707## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
708
709 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
710 $self->replace_usemap_links($1, $2, $3)/isge;
711
712 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
713 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
714
715## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
716# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
717 }
718
719 # trap images
720
721 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
722 # i.e. <a href="image"><img src="image"></a> in order to overcome a problem that
723 # turned regular text succeeding images into links. That is, by embedding <img>s
724 # inside <a href=""></a>, the text following images was no longer misbehaving.
725 # However, there would be many occasions where images were not meant to link
726 # to their source images but where the images would link to another web page.
727 # To allow this, the no_image_links option was introduced: it would prevent
728 # the behaviour of embedding images into links that referenced the source images.
729
730 # Somewhere along the line, the problem of normal text turning into links when
731 # such text followed images which were not embedded in <a href=""></a> ceased
732 # to occur. This is why the following lines have been commented out (as well as
733 # two lines in replace_images). They appear to no longer apply.
734
735 # If at any time, there is a need for having images embedded in <a> anchor tags,
736 # then it might be better to turn that into an HTMLPlugin option rather than make
737 # it the default behaviour. Also, eventually, no_image_links needs to become
738 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
739
740 #if(!$self->{'no_image_links'}){
741 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
742 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
743 #}
744
745 # add text to document object
746 # turn \ into \\ so that the rest of greenstone doesn't think there
747 # is an escape code following. (Macro parsing loses them...)
748 $$textref =~ s/\\/\\\\/go;
749
750 $doc_obj->add_utf8_text($cursection, $$textref);
751}
752
753sub replace_images {
754 my $self = shift (@_);
755 my ($front, $link, $back, $base_dir,
756 $file, $doc_obj, $section) = @_;
757
758 # remove quotes from link at start and end if necessary
759 if ($link=~/^[\"\']/) {
760 $link=~s/^[\"\']//;
761 $link=~s/[\"\']$//;
762 $front.='"';
763 $back="\"$back";
764 }
765
766 $link =~ s/\n/ /g;
767
768 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
769 # If the Word file path has spaces in it, wv messes up and you end up with
770 # absolute paths for the images, and without the "file://" prefix
771 # So check for this special case and massage the data to be correct
772 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
773 $link =~ s/^.*\\([^\\]+)$/$1/;
774 }
775
776 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
777
778 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
779
780# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
781
782 my $anchor_name = $img_file;
783 #$anchor_name =~ s/^.*\///;
784 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
785
786 my $image_link = $front . $img_file .$back;
787 return $image_link;
788
789 # The reasons for why the following two lines are no longer necessary can be
790 # found in subroutine process_section
791 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
792 #return $anchor_link;
793
794 #return $front . $img_file . $back . $anchor_name;
795}
796
797sub replace_href_links {
798 my $self = shift (@_);
799 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
800
801 # remove quotes from link at start and end if necessary
802 if ($link=~/^[\"\']/) {
803 $link=~s/^[\"\']//;
804 $link=~s/[\"\']$//;
805 $front.='"';
806 $back="\"$back";
807 }
808
809 # attempt to sort out targets - frames are not handled
810 # well in this plugin and some cases will screw things
811 # up - e.g. the _parent target (so we'll just remove
812 # them all ;-)
813 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
814 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
815 $front =~ s/target=\"?_parent\"?//is;
816 $back =~ s/target=\"?_parent\"?//is;
817
818 return $front . $link . $back if $link =~ m/^\#/s;
819 $link =~ s/\n/ /g;
820
821 # Find file referred to by $link on file system
822 # This is more complicated than it sounds when character encodings
823 # are taken into account
824 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
825
826 # href may use '\'s where '/'s should be on Windows
827 $href =~ s/\\/\//g;
828 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
829
830
831 ##### leave all these links alone (they won't be picked up by intermediate
832 ##### pages). I think that's safest when dealing with frames, targets etc.
833 ##### (at least until I think of a better way to do it). Problems occur with
834 ##### mailto links from within small frames, the intermediate page is displayed
835 ##### within that frame and can't be seen. There is still potential for this to
836 ##### happen even with html pages - the solution seems to be to somehow tell
837 ##### the browser from the server side to display the page being sent (i.e.
838 ##### the intermediate page) in the top level window - I'm not sure if that's
839 ##### possible - the following line should probably be deleted if that can be done
840 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
841
842 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
843 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
844
845 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
846
847 # Don't do any encoding for now, as it is not clear what
848 # the right thing to do is to support filename
849 # encodings on Windows when they are not UTF-16
850 #
851 }
852 else {
853 # => Unix-based system
854
855 # If web page didn't give encoding, then default to utf8
856 my $content_encoding= $self->{'content_encoding'} || "utf8";
857
858 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
859 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
860 }
861
862 $href = encode($content_encoding,$href);
863 }
864
865 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
866 $href = &unicode::filename_to_url($href);
867
868 &ghtml::urlsafe ($href);
869
870 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
871 print STDERR "******DEBUG: href=$href\n";
872 }
873
874
875 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
876 } else {
877 # link is to some other type of file (e.g., an image) so we'll
878 # need to associate that file
879 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
880 }
881}
882
883sub add_file {
884 my $self = shift (@_);
885 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
886 my ($newname);
887
888 my $filename = $href;
889 if ($base_dir eq "") {
890 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
891 # remove http://
892 $filename =~ s/^[^:]*:\/\///;
893 }
894 else {
895 # remove http:/ thereby leaving one slash at the start as
896 # part of full pathname
897 $filename =~ s/^[^:]*:\///;
898 }
899 }
900 else {
901 # remove http://
902 $filename =~ s/^[^:]*:\/\///;
903 }
904
905 $filename = &util::filename_cat($base_dir, $filename);
906
907 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
908 # we are processing a tidytmp file - want paths to be in import
909 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
910 }
911
912 # Replace %XX's in URL with decoded value if required. Note that the
913 # filename may include the %XX in some situations. If the *original*
914 # file's name was in URL encoding, the following method will not decode
915 # it.
916 my $unicode_filename = $filename;
917 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
918
919 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
920 # with no accompanying file
921 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
922
923 my $content_encoding= $self->{'content_encoding'} || "utf8";
924
925 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
926 # The filenames that come through the HTML file have been decoded
927 # into Unicode aware Perl strings. Need to convert them back
928 # to their initial raw-byte encoding to match the file that
929 # exists on the file system
930 $filename = encode($content_encoding, $opt_decode_unicode_filename);
931 }
932 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
933 # HFS+ is UTF8 with decomposition
934 $filename = encode($content_encoding, $opt_decode_unicode_filename);
935 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
936
937 }
938 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
939 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
940
941 if (defined $long_filename) {
942 my $short_filename = Win32::GetLongPathName($long_filename);
943 $filename = $short_filename;
944 }
945# else {
946# print STDERR "***** failed to map href to real file:\n";
947# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
948# }
949 }
950 else {
951 my $outhandle = $self->{'outhandle'};
952 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
953 print $outhandle " in file system encoding of href: $href\n";
954 print $outhandle " No character encoding done.\n";
955 }
956
957
958 # some special processing if the intended filename was converted to utf8, but
959 # the actual file still needs to be renamed
960 if (!&util::fd_exists($filename)) {
961 # try the original filename stored in map
962 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
963 print STDERR "******!! orig filename did not exist: $filename\n";
964 }
965
966## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
967
968 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
969
970 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
971 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
972 }
973
974 if (defined $original_filename && -e $original_filename) {
975 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
976 print STDERR "****** Found match!\n";
977 }
978 $filename = $original_filename;
979 }
980 }
981
982 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
983
984 if ($rl == 0) {
985 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
986 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
987 }
988 else {
989 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
990 }
991 }
992
993 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
994 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
995 }
996 # add the original image file as a source file
997 if (!$self->{'processing_tmp_files'} ) {
998 $doc_obj->associate_source_file($filename);
999 }
1000 if ($self->{'rename_assoc_files'}) {
1001 if (defined $self->{'aux_files'}->{$href}) {
1002 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1003 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1004 } else {
1005 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1006 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1007 $self->inc_filecount ();
1008 }
1009 $doc_obj->associate_file($filename, $newname, undef, $section);
1010 return "_httpdocimg_/$newname";
1011 } else {
1012 if(&unicode::is_url_encoded($unicode_filename)) {
1013 # use the possibly-decoded filename instead to avoid double URL encoding
1014 ($newname) = $filename =~ m/([^\/\\]*)$/;
1015 } else {
1016 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1017 }
1018
1019 # Make sure this name uses only ASCII characters.
1020 # We use either base64 or URL encoding, as these preserve original encoding
1021 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1022
1023### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1024 $doc_obj->associate_file($filename, $newname, undef, $section);
1025
1026 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1027 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1028 my $newname_url = $newname;
1029 $newname_url = &unicode::filename_to_url($newname_url);
1030 return "_httpdocimg_/$newname_url";
1031 }
1032}
1033
1034
1035sub format_link {
1036 my $self = shift (@_);
1037 my ($link, $base_dir, $file) = @_;
1038
1039 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1040 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1041
1042 $hash_part = "" if !defined $hash_part;
1043 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1044 my $outhandle = $self->{'outhandle'};
1045 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1046 if $self->{'verbosity'};
1047 return ($link, "", 0);
1048 }
1049
1050 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1051 my $type = $1;
1052
1053 if ($link =~ m/^(http|ftp):/i) {
1054 # Turn url (using /) into file name (possibly using \ on windows)
1055 my @http_dir_split = split('/', $before_hash);
1056 $before_hash = &util::filename_cat(@http_dir_split);
1057 }
1058
1059 $before_hash = $self->eval_dir_dots($before_hash);
1060
1061 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1062
1063 my $rl = 0;
1064 $rl = 1 if (-e $linkfilename);
1065
1066 # make sure there's a slash on the end if it's a directory
1067 if ($before_hash !~ m/\/$/) {
1068 $before_hash .= "/" if (-d $linkfilename);
1069 }
1070 return ($type . $before_hash, $hash_part, $rl);
1071
1072 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1073
1074 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1075
1076 # the first directory will be the domain name if file_is_url
1077 # to generate archives, otherwise we'll assume all files are
1078 # from the same site and base_dir is the root
1079
1080 if ($self->{'file_is_url'}) {
1081 my @dirs = split /[\/\\]/, $file;
1082 my $domname = shift (@dirs);
1083 $before_hash = &util::filename_cat($domname, $before_hash);
1084 $before_hash =~ s@\\@/@g; # for windows
1085 }
1086 else
1087 {
1088 # see if link shares directory with source document
1089 # => turn into relative link if this is so!
1090
1091 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1092 # too difficult doing a pattern match with embedded '\'s...
1093 my $win_before_hash=$before_hash;
1094 $win_before_hash =~ s@(\\)+@/@g;
1095 # $base_dir is already similarly "converted" on windows.
1096 if ($win_before_hash =~ s@^$base_dir/@@o) {
1097 # if this is true, we removed a prefix
1098 $before_hash=$win_before_hash;
1099 }
1100 }
1101 else {
1102 # before_hash has lost leading slash by this point,
1103 # -> add back in prior to substitution with $base_dir
1104 $before_hash = "/$before_hash";
1105
1106 $before_hash = &util::filename_cat("",$before_hash);
1107 $before_hash =~ s@^$base_dir/@@;
1108 }
1109 }
1110 } else {
1111 # Turn relative file path into full path
1112 my $dirname = &File::Basename::dirname($file);
1113 $before_hash = &util::filename_cat($dirname, $before_hash);
1114 $before_hash = $self->eval_dir_dots($before_hash);
1115 }
1116
1117 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1118
1119
1120# print STDERR "**** linkfilename = $linkfilename\n";
1121# if (!&util::fd_exists($linkfilename)) {
1122# print STDERR "***** Warning: Could not find $linkfilename\n";
1123# }
1124
1125
1126 # make sure there's a slash on the end if it's a directory
1127 if ($before_hash !~ m/\/$/) {
1128 $before_hash .= "/" if (-d $linkfilename);
1129 }
1130
1131# print STDERR "*** returning: $before_hash\n";
1132
1133 return ("http://" . $before_hash, $hash_part, 1);
1134 } else {
1135 # mailto, news, nntp, telnet, javascript or gopher link
1136 return ($before_hash, "", 0);
1137 }
1138}
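# Rough summary of what format_link returns for different kinds of link
# (the example links are invented):
#
#   "http://example.org/a/b.html"  -> ("http://example.org/a/b.html", "", rl)
#                                     where rl is 1 only if that file exists
#                                     under $base_dir (i.e. it was mirrored)
#   "images/pic.gif"               -> ("http://<dir of $file>/images/pic.gif", "", 1)
#   "page.html#sec2"               -> the "#sec2" part is split off and returned
#                                     as the second element
#   "mailto:someone@example.org"   -> returned essentially unchanged, rl = 0
#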
1139
1140sub extract_first_NNNN_characters {
1141 my $self = shift (@_);
1142 my ($textref, $doc_obj, $thissection) = @_;
1143
1144 foreach my $size (split /,/, $self->{'first'}) {
1145 my $tmptext = $$textref;
1146 # skip to the body
1147 $tmptext =~ s/.*<body[^>]*>//i;
1148 # remove javascript
1149 $tmptext =~ s@<script.*?</script>@ @sig;
1150 $tmptext =~ s/<[^>]*>/ /g;
1151 $tmptext =~ s/&nbsp;/ /g;
1152 $tmptext =~ s/^\s+//;
1153 $tmptext =~ s/\s+$//;
1154 $tmptext =~ s/\s+/ /gs;
1155 $tmptext = &unicode::substr ($tmptext, 0, $size);
1156 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1157 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1158 }
1159}
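# For example, if the 'first' option (set elsewhere in the plugin hierarchy;
# it is not declared in this file) is "50,100", the loop above adds "First50"
# and "First100" metadata to the section, each holding the leading characters
# of the tag-stripped body text with "&#8230;" (an ellipsis) appended.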
1160
1161
1162sub extract_metadata {
1163 my $self = shift (@_);
1164 my ($textref, $metadata, $doc_obj, $section) = @_;
1165 my $outhandle = $self->{'outhandle'};
1166 # if we don't want metadata, we may as well not be here ...
1167 return if (!defined $self->{'metadata_fields'});
1168
1169 my $separator = $self->{'metadata_field_separator'};
1170 if ($separator eq "") {
1171 undef $separator;
1172 }
1173
1174 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1175 # html meta, 'value' is the metadata name for greenstone to use
1176 my %find_fields = ();
1177
1178 my %creator_fields = (); # short-cut for lookups
1179
1180
1181 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1182 $field =~ s/^\s+//; # remove leading whitespace
1183 $field =~ s/\s+$//; # remove trailing whitespace
1184
1185 # support tag<tagname>
1186 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1187 # "$2" is the user's preferred gs metadata name
1188 $find_fields{lc($1)}=$2; # lc = lowercase
1189 } else { # no <tagname> for mapping
1190 # "$field" is the user's preferred gs metadata name
1191 $find_fields{lc($field)}=$field; # lc = lowercase
1192 }
1193 }
1194
1195 if (defined $self->{'hunt_creator_metadata'} &&
1196 $self->{'hunt_creator_metadata'} == 1 ) {
1197 my @extra_fields =
1198 (
1199 'author',
1200 'author.email',
1201 'creator',
1202 'dc.creator',
1203 'dc.creator.corporatename',
1204 );
1205
1206 # add the creator_metadata fields to search for
1207 foreach my $field (@extra_fields) {
1208 $creator_fields{$field}=0; # add to lookup hash
1209 }
1210 }
1211
1212
1213 # find the header in the html file, which has the meta tags
1214 $$textref =~ m@<head>(.*?)</head>@si;
1215
1216 my $html_header=$1;
1217
1218 # go through every <meta... tag defined in the html and see if it is
1219 # one of the tags we want to match.
1220
1221 # special case for title - we want to remember if it's been found
1222 my $found_title = 0;
1223 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1224 $html_header =~ m/^/; # match the start of the string, for \G assertion
1225
1226 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1227 my $metatag=$1;
1228 my ($tag, $value);
1229
1230 # find the tag name
1231 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1232 $tag=$2;
1233 # in case they're not using " or ', but they should...
1234 if (! $tag) {
1235 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1236 $tag=$1;
1237 }
1238
1239 if (!defined $tag) {
1240 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1241 next;
1242 }
1243
1244 # don't need to assign this field if it was passed in from a previous
1245 # (recursive) plugin
1246 if (defined $metadata->{$tag}) {next}
1247
1248 # find the tag content
1249 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1250 $value=$2;
1251
1252 if (! $value) {
1253 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1254 $value=$1;
1255 }
1256 if (!defined $value) {
1257 print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1258 next;
1259 }
1260
1261 # clean up and add
1262 $value =~ s/\s+/ /gs;
1263 chomp($value); # remove trailing \n, if any
1264 if (exists $creator_fields{lc($tag)}) {
1265 # map this value onto greenstone's "Creator" metadata
1266 $tag='Creator';
1267 } elsif (!exists $find_fields{lc($tag)}) {
1268 next; # don't want this tag
1269 } else {
1270 # get the user's preferred capitalisation
1271 $tag = $find_fields{lc($tag)};
1272 }
1273 if (lc($tag) eq "title") {
1274 $found_title = 1;
1275 }
1276
1277 if ($self->{'verbosity'} > 2) {
1278 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1279 }
1280
1281 if ($tag =~ /\./) {
1282 # there is a . so has a namespace, add ex.
1283 $tag = "ex.$tag";
1284 }
1285 if (defined $separator) {
1286 my @values = split($separator, $value);
1287 foreach my $v (@values) {
1288 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1289 }
1290 }
1291 else {
1292 $doc_obj->add_utf8_metadata($section, $tag, $value);
1293 }
1294 }
1295
1296 # TITLE: extract the document title
1297 if (exists $find_fields{'title'} && !$found_title) {
1298 # we want a title, and didn't find one in the meta tags
1299 # see if there's a <title> tag
1300 my $title;
1301 my $from = ""; # for debugging output only
1302 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1303 $title = $1;
1304 $from = "<title> tags";
1305 }
1306
1307 if (!defined $title) {
1308 $from = "first 100 chars";
1309 # if no title use first 100 or so characters
1310 $title = $$textref;
1311 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1312 $title =~ s/^.*?<body>//si;
1313 # ignore javascript!
1314 $title =~ s@<script.*?</script>@ @sig;
1315 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1316 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1317 $title = substr ($title, 0, 100);
1318 $title =~ s/\s\S*$/.../;
1319 }
1320 $title =~ s/<[^>]*>/ /g; # remove html tags
1321 $title =~ s/&nbsp;/ /g;
1322 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1323 $title =~ s/\s+/ /gs; # collapse multiple spaces
1324 $title =~ s/^\s*//; # remove leading spaces
1325 $title =~ s/\s*$//; # remove trailing spaces
1326
1327 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1328 $title =~ s/^\s+//s; # in case title_sub introduced any...
1329 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1330 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1331 if ($self->{'verbosity'} > 2);
1332 }
1333
1334 # add FileFormat metadata
1335 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1336
1337 # Special, for metadata names such as tagH1 - extracts
1338 # the text between the first <H1> and </H1> tags into "H1" metadata.
1339
1340 foreach my $field (keys %find_fields) {
1341 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1342 my $tag = $1;
1343 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1344 my $content = $1;
1345 $content =~ s/&nbsp;/ /g;
1346 $content =~ s/<[^>]*>/ /g;
1347 $content =~ s/^\s+//;
1348 $content =~ s/\s+$//;
1349 $content =~ s/\s+/ /gs;
1350 if ($content) {
1351 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1352 $tag =~ s/^tag//i;
1353 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1354 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1355 if ($self->{'verbosity'} > 2);
1356 }
1357 }
1358 }
1359}
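# Illustrative example of the mapping above, with invented values. With
# -metadata_fields "Title,Author<Creator>,tagH1", a page containing
#
#   <meta name="Author" content="A. Nonymous">
#   <title>Annual Report</title>
#   <h1>Overview</h1>
#
# would end up with   Creator = "A. Nonymous"   (renamed via the <...> form),
#                     Title   = "Annual Report" (from <title>, since no meta
#                                                Title tag was found),
#                     H1      = "Overview"      (via the special tagH1 form).
#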
1360
1361
1362# evaluate any "../" as a move up to the parent directory
1363# evaluate any "./" as the current directory
1364sub eval_dir_dots {
1365 my $self = shift (@_);
1366 my ($filename) = @_;
1367 my $dirsep_os = &util::get_os_dirsep();
1368 my @dirsep = split(/$dirsep_os/,$filename);
1369
1370 my @eval_dirs = ();
1371 foreach my $d (@dirsep) {
1372 if ($d eq "..") {
1373 pop(@eval_dirs);
1374
1375 } elsif ($d eq ".") {
1376 # do nothing!
1377
1378 } else {
1379 push(@eval_dirs,$d);
1380 }
1381 }
1382
1383 # Need to fiddle with number of elements in @eval_dirs if the
1384 # first one is the empty string. This is because of a
1385# modification to util::filename_cat that suppresses the addition
1386 # of a leading '/' character (or \ if windows) (intended to help
1387 # filename cat with relative paths) if the first entry in the
1388 # array is the empty string. Making the array start with *two*
1389 # empty strings is a way to defeat this "smart" option.
1390 #
1391 if (scalar(@eval_dirs) > 0) {
1392 if ($eval_dirs[0] eq ""){
1393 unshift(@eval_dirs,"");
1394 }
1395 }
1396
1397 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1398 return $evaluated_filename;
1399}
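# Example of the evaluation above (invented, Unix-style path):
#
#   $self->eval_dir_dots("archive/../images/./logo.gif");  # -> "images/logo.gif"
#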
1400
1401sub replace_usemap_links {
1402 my $self = shift (@_);
1403 my ($front, $link, $back) = @_;
1404
1405 # remove quotes from link at start and end if necessary
1406 if ($link=~/^[\"\']/) {
1407 $link=~s/^[\"\']//;
1408 $link=~s/[\"\']$//;
1409 $front.='"';
1410 $back="\"$back";
1411 }
1412
1413 $link =~ s/^\.\///;
1414 return $front . $link . $back;
1415}
1416
1417sub inc_filecount {
1418 my $self = shift (@_);
1419
1420 if ($self->{'file_num'} == 1000) {
1421 $self->{'dir_num'} ++;
1422 $self->{'file_num'} = 0;
1423 } else {
1424 $self->{'file_num'} ++;
1425 }
1426}
1427
1428
1429# Extend read_file so that strings like &eacute; are
1430# converted to UTF8 internally.
1431#
1432# We don't convert &lt; or &gt; or &amp; or &quot; in case
1433# they interfere with the GML files
1434
1435sub read_file {
1436 my $self = shift(@_);
1437 my ($filename, $encoding, $language, $textref) = @_;
1438
1439 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1440
1441 # Convert entities to their Unicode code-point equivalents
1442 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1443 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1444 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1445
1446}
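# For example, after this read_file runs, "&eacute;" in the source has become
# the character "é", while "&lt;", "&gt;", "&amp;", "&quot;" and "&nbsp;" are
# deliberately left as entities so they cannot be mistaken for real markup
# later on.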
1447
1448sub HB_read_html_file {
1449 my $self = shift (@_);
1450 my ($htmlfile, $text) = @_;
1451
1452 # load in the file
1453 if (!open (FILE, $htmlfile)) {
1454 print STDERR "ERROR - could not open $htmlfile\n";
1455 return;
1456 }
1457
1458 my $foundbody = 0;
1459 $self->HB_gettext (\$foundbody, $text, "FILE");
1460 close FILE;
1461
1462 # just in case there was no <body> tag
1463 if (!$foundbody) {
1464 $foundbody = 1;
1465 open (FILE, $htmlfile) || return;
1466 $self->HB_gettext (\$foundbody, $text, "FILE");
1467 close FILE;
1468 }
1469 # text is in utf8
1470}
1471
1472# converts the text to utf8, as ghtml does that for &eacute; etc.
1473sub HB_gettext {
1474 my $self = shift (@_);
1475 my ($foundbody, $text, $handle) = @_;
1476
1477 my $line = "";
1478 while (defined ($line = <$handle>)) {
1479 # look for body tag
1480 if (!$$foundbody) {
1481 if ($line =~ s/^.*<body[^>]*>//i) {
1482 $$foundbody = 1;
1483 } else {
1484 next;
1485 }
1486 }
1487
1488 # check for symbol fonts
1489 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1490 my $font = $1;
1491 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1492 if ($font !~ m/^arial$/i);
1493 }
1494
1495 $$text .= $line;
1496 }
1497
1498 if ($self->{'input_encoding'} eq "iso_8859_1") {
1499 # convert to utf-8
1500 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1501 }
1502 # convert any alphanumeric character entities to their utf-8
1503 # equivalent for indexing purposes
1504 #&ghtml::convertcharentities ($$text);
1505
1506 $$text =~ s/\s+/ /g; # remove \n's
1507
1508 # At this point $$text is a binary byte string
1509 # => turn it into a Unicode aware string, so full
1510 # Unicode aware pattern matching can be used.
1511 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1512 #
1513
1514 $$text = decode("utf8",$$text);
1515}
1516
1517sub HB_clean_section {
1518 my $self = shift (@_);
1519 my ($section) = @_;
1520
1521 # remove tags without a starting tag from the section
1522 my ($tag, $tagstart);
1523 while ($section =~ m/<\/([^>]{1,10})>/) {
1524 $tag = $1;
1525 $tagstart = index($section, "<$tag");
1526 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1527 $section =~ s/<\/$tag>//;
1528 }
1529
1530 # remove extra paragraph tags
1531 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1532
1533 # remove extra stuff at the end of the section
1534 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1535
1536 # add a newline at the beginning of each paragraph
1537 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1538
1539 # add a newline every 80 characters at a word boundary
1540 # Note: this regular expression puts a line feed before
1541 # the last word in each section, even when it is not
1542 # needed.
1543 $section =~ s/(.{1,80})\s/$1\n/g;
1544
1545 # fix up the image links
1546 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1547 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1548 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1549 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1550
1551 return $section;
1552}
1553
1554# Will convert the oldHDL format to the new HDL format (using the Section tag)
1555sub convert_to_newHDLformat
1556{
1557 my $self = shift (@_);
1558 my ($file,$cnfile) = @_;
1559 my $input_filename = $file;
1560 my $tmp_filename = $cnfile;
1561
1562 # write HTML tmp file with new HDL format
1563 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1564
1565 # read in the file and do basic html cleaning (removing header etc)
1566 my $html = "";
1567 $self->HB_read_html_file ($input_filename, \$html);
1568
1569 # process the file one section at a time
1570 my $curtoclevel = 1;
1571 my $firstsection = 1;
1572 my $toclevel = 0;
1573 while (length ($html) > 0) {
1574 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1575 $toclevel = $3;
1576 my $title = $4;
1577 my $sectiontext = "";
1578 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1579 $sectiontext = $1;
1580 } else {
1581 $sectiontext = $html;
1582 $html = "";
1583 }
1584
1585 # remove tags and extra spaces from the title
1586 $title =~ s/<\/?[^>]+>//g;
1587 $title =~ s/^\s+|\s+$//g;
1588
1589 # close any sections below the current level and
1590 # create a new section (special case for the firstsection)
1591 print PROD "<!--\n";
1592 while (($curtoclevel > $toclevel) ||
1593 (!$firstsection && $curtoclevel == $toclevel)) {
1594 $curtoclevel--;
1595 print PROD "</Section>\n";
1596 }
1597 if ($curtoclevel+1 < $toclevel) {
1598 print STDERR "WARNING - jump in toc levels in $input_filename " .
1599 "from $curtoclevel to $toclevel\n";
1600 }
1601 while ($curtoclevel < $toclevel) {
1602 $curtoclevel++;
1603 }
1604
1605 if ($curtoclevel == 1) {
1606 # add the header tag
1607 print PROD "-->\n";
1608 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1609 print PROD "<!--\n";
1610 }
1611
1612 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1613
1614 print PROD "-->\n";
1615
1616 # clean up the section html
1617 $sectiontext = $self->HB_clean_section($sectiontext);
1618
1619 print PROD "$sectiontext\n";
1620
1621 } else {
1622 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1623 "\nin $input_filename\n";
1624 last;
1625 }
1626 $firstsection = 0;
1627 }
1628
1629 print PROD "<!--\n";
1630 while ($curtoclevel > 0) {
1631 $curtoclevel--;
1632 print PROD "</Section>\n";
1633 }
1634 print PROD "-->\n";
1635
1636 close (PROD) || die("Error Closing File: $tmp_filename $!");
1637
1638 return $tmp_filename;
1639}
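# Rough sketch of the old HDL -> new HDL rewrite performed above (the input
# line is made up, and in the real output the Section/Description tags are
# additionally wrapped in <!-- ... --> comment markers):
#
#   old HDL :  <p><b>&lt;&lt;TOC1&gt;&gt;</b> Chapter One<p> body text ...
#
#   new HDL :  <HTML><HEAD><TITLE>Chapter One</TITLE></HEAD><BODY>
#              <Section>
#                <Description>
#                  <Metadata name="Title">Chapter One</Metadata>
#                </Description>
#                ... cleaned body text ...
#              </Section>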
1640
1641sub shorten {
1642 my $self = shift (@_);
1643 my ($text) = @_;
1644
1645 return "\"$text\"" if (length($text) < 100);
1646
1647 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1648 substr ($text, length($text)-50) . "\"";
1649}
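# e.g. $self->shorten("a" x 300) returns the first 50 and last 50 characters
# joined by " ... ", which keeps the "leftover text" warning above readable.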
1650
1651sub convert_tidy_or_oldHDL_file
1652{
1653 my $self = shift (@_);
1654 my ($file) = @_;
1655 my $input_filename = $file;
1656
1657 if (-d $input_filename)
1658 {
1659 return $input_filename;
1660 }
1661
1662 # split the input filename into name, path and suffix
1663 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1664 my $base_dirname = $dirname;
1665 $suffix = lc($suffix);
1666
1667 # derive tmp filename from input filename
1668 # Strip whitespace, dots and hyphens from the filename itself -- little
1669 # risk of a name collision, and it makes later conversion by external
1670 # utilities simpler. Spaces in the path are left alone.
1671 $tailname =~ s/\s+//g;
1672 $tailname =~ s/\.+//g;
1673 $tailname =~ s/\-+//g;
1674 # convert to utf-8 otherwise we have problems with the doc.xml file
1675 # later on
1676 &unicode::ensure_utf8(\$tailname);
1677
1678 # use the collection's "tidytmp" directory (creating it if necessary)
1679 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1680 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1681
1682 my $test_dirname = "";
1683 my $f_separator = &util::get_os_dirsep();
1684
1685 if ($dirname =~ m/import$f_separator/)
1686 {
1687 $test_dirname = $'; #'
1688
1689 #print STDERR "init $'\n";
1690
1691 while ($test_dirname =~ m/[$f_separator]/)
1692 {
1693 my $folderdirname = $`;
1694 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1695 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1696 $test_dirname = $'; #'
1697 }
1698 }
1699
1700 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1701
1702 # tidy or convert the input file if it is an HTML-like file or is accepted by the process_exp
1703 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1704 {
1705 #convert the input file to a new style HDL
1706 my $hdl_output_filename = $input_filename;
1707 if ($self->{'old_style_HDL'})
1708 {
1709 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1710 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1711 }
1712
1713 # copy every other file from the base dir to the tmp dir if it is not already there
1714 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1715 my @files = grep {!/^\.+$/} readdir(DIR);
1716 closedir(DIR);
1717
1718 foreach my $file (@files)
1719 {
1720 my $src_file = &util::filename_cat($base_dirname,$file);
1721 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1722 if ((!-e $dest_file) && (!-d $src_file))
1723 {
1724 # just copy the original file into the tmp directory
1725 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1726 }
1727 }
1728
1729 # tidy the input file
1730 my $tidy_output_filename = $hdl_output_filename;
1731 if ($self->{'use_realistic_book'})
1732 {
1733 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1734 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1735 }
1736 $tmp_filename = $tidy_output_filename;
1737 }
1738 else
1739 {
1740 if (!-e $tmp_filename)
1741 {
1742 # just copy the original file into the tmp directory
1743 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1744 }
1745 }
1746
1747 return $tmp_filename;
1748}
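# Path mapping sketch for the copy above (paths are illustrative):
#
#   <collection>/import/books/my doc 1.html
#       -> <collection>/tidytmp/books/mydoc1.html
#
# i.e. the directory structure below "import" is mirrored under the
# collection's "tidytmp" directory, with whitespace, dots and hyphens
# stripped from the filename itself.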
1749
1750
1751 # Turns the HTML input file into a proper XML file, with the font tags removed
1752 # and image sizes added to the img tags.
1753# The tidying process takes place in a collection specific 'tmp' directory so
1754# that we don't accidentally damage the input.
1755sub tmp_tidy_file
1756{
1757 my $self = shift (@_);
1758 my ($file,$cnfile) = @_;
1759 my $input_filename = $file;
1760 my $tmp_filename = $cnfile;
1761
1762 # split the input filename into name, path and suffix
1763 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1764
1765 require HTML::TokeParser::Simple;
1766
1767 # create HTML parser to decode the input file
1768 my $parser = HTML::TokeParser::Simple->new($input_filename);
1769
1770 # write the HTML tmp file with the font tags removed and image sizes added to the img tags
1771 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1772 while (my $token = $parser->get_token())
1773 {
1774 # is it an img tag
1775 if ($token->is_start_tag('img'))
1776 {
1777 # get the attributes
1778 my $attr = $token->return_attr;
1779
1780 # get the full path to the image
1781 my $img_file = &util::filename_cat($dirname,$attr->{src});
1782
1783 # set the width and height attribute
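# (Image::Size::imgsize returns the width and height in pixels;
# if the image file cannot be read, the width comes back undefined)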
1784 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1785
1786 # recreate the tag
1787 print PROD "<img";
1788 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1789 print PROD ">";
1790 }
1791 # is it a font tag
1792 else
1793 {
1794 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1795 {
1796 # remove font tag
1797 print PROD "";
1798 }
1799 else
1800 {
1801 # print without changes
1802 print PROD $token->as_is;
1803 }
1804 }
1805 }
1806 close (PROD) || die("Error Closing File: $tmp_filename $!");
1807
1808 # run html-tidy on the tmp file to make it a proper XML file
1809
1810 my $outhandle = $self->{'outhandle'};
1811 print $outhandle "Converting HTML to be XML compliant:\n";
1812
1813 my $tidy_cmd = "tidy";
1814 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1815 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1816 if ($self->{'verbosity'} <= 2) {
1817 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1818 $tidy_cmd .= " 2>nul";
1819 }
1820 else {
1821 $tidy_cmd .= " 2>/dev/null";
1822 }
1823 print $outhandle " => $tidy_cmd\n";
1824 }
1825
1826 my $tidyfile = `$tidy_cmd`;
1827
1828 # write result back to the tmp file
1829 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1830 print PROD $tidyfile;
1831 close (PROD) || die("Error Closing File: $tmp_filename $!");
1832
1833 # return the output filename
1834 return $tmp_filename;
1835}
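# A typical command line generated above looks like this (path illustrative):
#
#   tidy -q -raw -wrap 0 -asxml "/my/collect/tidytmp/mydoc.html" 2>/dev/null
#
# i.e. run quietly, don't re-wrap long lines, and emit XML/XHTML, with tidy's
# own messages discarded at low verbosity.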
1836
1837sub associate_cover_image
1838{
1839 my $self = shift(@_);
1840 my ($doc_obj, $filename) = @_;
1841 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1842 {
1843 # the cover image will be in tidytmp, but we want the one from import
1844 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1845 }
1846 $self->SUPER::associate_cover_image($doc_obj, $filename);
1847}
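# e.g. ".../tidytmp/books/mydoc.html" is mapped back to
# ".../import/books/mydoc.html" (illustrative paths) before the inherited
# associate_cover_image goes looking for a matching cover image.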
1848
1849
1850 1;