source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 24951

Last change on this file since 24951 was 24431, checked in by ak19, 13 years ago

Dr Bainbridge fixed HTMLPlugin so that empty keywords and subject values as generated when converting PDFs to HTML don't get entered into ex.Meta anymore as escaped quoted Keywords and Subject. Added in John Rose's request that PDFPlugin automatically extract Title, Author, Subject and Keywords metadata fields and store them as ex.Metadata.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 63.2 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use XMLParser;
47
48use Image::Size;
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53}
54
55use strict; # every perl program should have this!
56no strict 'refs'; # make an exception so we can use variables as filehandles
57
58my $arguments =
59 [ { 'name' => "process_exp",
60 'desc' => "{BasePlugin.process_exp}",
61 'type' => "regexp",
62 'deft' => &get_default_process_exp() },
63 { 'name' => "block_exp",
64 'desc' => "{BasePlugin.block_exp}",
65 'type' => 'regexp',
66 'deft' => &get_default_block_exp() },
67 { 'name' => "nolinks",
68 'desc' => "{HTMLPlugin.nolinks}",
69 'type' => "flag" },
70 { 'name' => "keep_head",
71 'desc' => "{HTMLPlugin.keep_head}",
72 'type' => "flag" },
73 { 'name' => "no_metadata",
74 'desc' => "{HTMLPlugin.no_metadata}",
75 'type' => "flag" },
76 { 'name' => "metadata_fields",
77 'desc' => "{HTMLPlugin.metadata_fields}",
78 'type' => "string",
79 'deft' => "Title" },
80 { 'name' => "metadata_field_separator",
81 'desc' => "{HTMLPlugin.metadata_field_separator}",
82 'type' => "string",
83 'deft' => "" },
84 { 'name' => "hunt_creator_metadata",
85 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
86 'type' => "flag" },
87 { 'name' => "file_is_url",
88 'desc' => "{HTMLPlugin.file_is_url}",
89 'type' => "flag" },
90 { 'name' => "assoc_files",
91 'desc' => "{HTMLPlugin.assoc_files}",
92 'type' => "regexp",
93 'deft' => &get_default_block_exp() },
94 { 'name' => "rename_assoc_files",
95 'desc' => "{HTMLPlugin.rename_assoc_files}",
96 'type' => "flag" },
97 { 'name' => "title_sub",
98 'desc' => "{HTMLPlugin.title_sub}",
99 'type' => "string",
100 'deft' => "" },
101 { 'name' => "description_tags",
102 'desc' => "{HTMLPlugin.description_tags}",
103 'type' => "flag" },
104 # retain this for backward compatibility (w3mir option was replaced by
105 # file_is_url)
106 { 'name' => "w3mir",
107# 'desc' => "{HTMLPlugin.w3mir}",
108 'type' => "flag",
109 'hiddengli' => "yes"},
110 { 'name' => "no_strip_metadata_html",
111 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
112 'type' => "string",
113 'deft' => "",
114 'reqd' => "no"},
115 { 'name' => "sectionalise_using_h_tags",
116 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
117 'type' => "flag" },
118 { 'name' => "use_realistic_book",
119 'desc' => "{HTMLPlugin.tidy_html}",
120 'type' => "flag"},
121 { 'name' => "old_style_HDL",
122 'desc' => "{HTMLPlugin.old_style_HDL}",
123 'type' => "flag"},
124 {'name' => "processing_tmp_files",
125 'desc' => "{BasePlugin.processing_tmp_files}",
126 'type' => "flag",
127 'hiddengli' => "yes"}
128 ];
129
130my $options = { 'name' => "HTMLPlugin",
131 'desc' => "{HTMLPlugin.desc}",
132 'abstract' => "no",
133 'inherits' => "yes",
134 'args' => $arguments };
135
136
137sub new {
138 my ($class) = shift (@_);
139 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140 push(@$pluginlist, $class);
141
142 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143 push(@{$hashArgOptLists->{"OptList"}},$options);
144
145
146 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
147
148 if ($self->{'w3mir'}) {
149 $self->{'file_is_url'} = 1;
150 }
151 $self->{'aux_files'} = {};
152 $self->{'dir_num'} = 0;
153 $self->{'file_num'} = 0;
154
155 return bless $self, $class;
156}
157
158# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
159# if have eg <script language="javascript" src="img/lib.js@123">
160# blocking is now done by reading through the file and recording all the
161# images and other files
162sub get_default_block_exp {
163 my $self = shift (@_);
164
165 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
166 return "";
167}
168
169sub get_default_process_exp {
170 my $self = shift (@_);
171
172 # the last option is an attempt to encode the concept of an html query ...
173 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
174}
175
176sub store_block_files
177{
178 my $self =shift (@_);
179 my ($filename_full_path, $block_hash) = @_;
180
181 my $html_fname = $filename_full_path;
182
183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
185
186 # read in file ($text will be in utf8)
187 my $raw_text = "";
188 $self->read_file_no_decoding($filename_full_path, \$raw_text);
189
190 my $textref = \$raw_text;
191 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
192 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
193 $$textref =~ s/$opencom(.*?)$closecom//gs;
194
195 # Convert entities to their UTF8 equivalents
196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
199
200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
201 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
202 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
203 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
204 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
205 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
206 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
207
208 if(!defined $self->{'unicode_to_original_filename'}) {
209 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
210 $self->{'unicode_to_original_filename'} = {};
211 }
212
213 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
214
215 # remove quotes from link at start and end if necessary
216 if ($raw_link =~ m/^\"/) {
217 $raw_link =~ s/^\"//;
218 $raw_link =~ s/\"$//;
219 }
220
221 # remove any anchor names, e.g. foo.html#name becomes foo.html
222 # but watch out for any #'s that are part of entities, such as &#x3B1;
223 $raw_link =~ s/([^&])\#.*$/$1/s;
224
225 # some links may just be anchor names
226 next unless ($raw_link =~ /\S+/);
227
228 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
229 # Turn relative file path into full path
230 my $dirname = &File::Basename::dirname($filename_full_path);
231 $raw_link = &util::filename_cat($dirname, $raw_link);
232 }
233 $raw_link = $self->eval_dir_dots($raw_link);
234
235 # this is the actual filename on the filesystem (that the link refers to)
236 my $url_original_filename = $self->opt_url_decode($raw_link);
237
238 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
239
240 if ($exceeds_bytecodes) {
241 # We have a link to a file name that is more complicated than a raw byte filename
242 # What we do next depends on the operating system we are on
243
244 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
245 # Assume we're dealing with a UTF-8 encoded filename
246 $url_original_filename = encode("utf8", $url_original_filename);
247 }
248 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
249 # HFS+ is UTF8 with decompostion
250 $url_original_filename = encode("utf8", $url_original_filename);
251 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
252 }
253 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
254 # Don't need to do anything as later code maps Windows
255 # unicode filenames to DOS short filenames when needed
256 }
257 else {
258 my $outhandle = $self->{'outhandle'};
259 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
260 print $outhandle " in raw file system encoding of: $raw_link\n";
261 print $outhandle " Assuming filesystem is UTF-8 based.\n";
262 $url_original_filename = encode("utf8", $url_original_filename);
263 }
264 }
265
266 # Convert the (currently raw) link into its Unicode version.
267 # Store the Unicode link along with the url_original_filename
268 my $unicode_url_original_filename = "";
269 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
270
271
272 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
273
274
275 if ($url_original_filename ne $unicode_url_original_filename) {
276 my $outhandle = $self->{'outhandle'};
277
278 print $outhandle "URL Encoding $url_original_filename\n";
279 print $outhandle " ->$unicode_url_original_filename\n";
280
281 # Allow for possibility of raw byte version and Unicode versions of file
282 &util::block_filename($block_hash,$unicode_url_original_filename);
283 }
284
285 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
286 &util::block_filename($block_hash,$url_original_filename);
287
288 }
289}
290
291# Given a filename in any encoding, will URL decode it to get back the original filename
292# in the original encoding. Because this method is intended to work out the *original*
293# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
294# string already exists in the local folder.
295#
296sub opt_url_decode {
297 my $self = shift (@_);
298 my ($raw_link) = @_;
299
300
301 # Replace %XX's in URL with decoded value if required.
302 # Note that the filename may include the %XX in some situations
303
304## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
305
306 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
307 if (!-e $raw_link) {
308 $raw_link = &unicode::url_decode($raw_link,1);
309 }
310 }
311
312 return $raw_link;
313}
314
315sub read_into_doc_obj
316{
317 my $self = shift (@_);
318 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
319
320 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
321
322 # Lookup content_encoding worked out in file_block pass for this file
323 # Store it under the local name 'content_encoding' so its nice and
324 # easy to access
325 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
326
327 # get the input file
328 my $input_filename = $file;
329 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
330 $suffix = lc($suffix);
331 my $tidy_filename;
332 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
333 {
334 # because the document has to be sectionalized set the description tags
335 $self->{'description_tags'} = 1;
336
337 # set the file to be tidied
338 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
339
340 # get the tidied file
341 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
342 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
343
344 # derive tmp filename from input filename
345 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
346
347 # set the new input file and base_dir to be from the tidied file
348 $file = "$tailname$suffix";
349 $base_dir = $dirname;
350 }
351
352 # call the parent read_into_doc_obj
353 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
354 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
355 {
356 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
357 my $collect_file = &util::filename_within_collection($filename_full_path);
358 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
359 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
360 # build. So set it manually.
361 $doc_obj->set_source_path($filename_full_path);
362 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
363 $doc_obj->set_converted_filename($collect_conv_file);
364
365 my $plugin_filename_encoding = $self->{'filename_encoding'};
366 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
367 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
368 }
369
370 delete $self->{'store_content_encoding'}->{$filename_full_path};
371 $self->{'content_encoding'} = undef;
372
373 return ($process_status,$doc_obj);
374}
375
376# do plugin specific processing of doc_obj
377sub process {
378 my $self = shift (@_);
379 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
380 my $outhandle = $self->{'outhandle'};
381
382 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
383 # this makes life so much easier... perl can cope with unix-style '/'s.
384 $base_dir =~ s@(\\)+@/@g;
385 $file =~ s@(\\)+@/@g;
386 }
387
388 my $filename = &util::filename_cat($base_dir,$file);
389 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
390 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
391
392 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
393 # And again
394 $upgraded_base_dir =~ s@(\\)+@/@g;
395 $upgraded_filename =~ s@(\\)+@/@g;
396
397 # Need to make sure there is a '/' on the end of upgraded_base_dir
398 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
399 $upgraded_base_dir .= "/";
400 }
401 }
402 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
403
404 # reset per-doc stuff...
405 $self->{'aux_files'} = {};
406 $self->{'dir_num'} = 0;
407 $self->{'file_num'} = 0;
408
409 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
410 # you can also include metadata in the format (X can be any number)
411 # <hX>Title<!--gsdl-metadata
412 # <Metadata name="name1">value1</Metadata>
413 # ...
414 # <Metadata name="nameN">valueN</Metadata>
415 #--></hX>
416 if ($self->{'sectionalise_using_h_tags'}) {
417 # description_tags should allways be activated because we convert headings to description tags
418 $self->{'description_tags'} = 1;
419
420 my $arrSections = [];
421 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
422
423 if (scalar(@$arrSections)) {
424 my $strMetadata = $self->update_section_data($arrSections, -1);
425 if (length($strMetadata)) {
426 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
427 $$textref =~ s/<\/body>/$strMetadata/ig;
428 }
429 }
430 }
431
432 my $cursection = $doc_obj->get_top_section();
433
434 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
435 unless $self->{'no_metadata'} || $self->{'description_tags'};
436
437 # Store URL for page as metadata - this can be used for an
438 # altavista style search interface. The URL won't be valid
439 # unless the file structure contains the domain name (i.e.
440 # like when w3mir is used to download a website).
441
442 # URL metadata (even invalid ones) are used to support internal
443 # links, so even if 'file_is_url' is off, still need to store info
444
445 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
446
447# my $utf8_file = $self->filename_to_utf8_metadata($file);
448# $utf8_file =~ s/&\#095;/_/g;
449# variable below used to be utf8_file
450
451 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
452 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
453
454 my $web_url = "http://";
455 my $utf8_web_url = "http://";
456 if(defined $dirname) { # local directory
457 # Check for "ftp" in the domain name of the directory
458 # structure to determine if this URL should be a ftp:// URL
459 # This check is not infallible, but better than omitting the
460 # check, which would cause all files downloaded from ftp sites
461 # via mirroring with wget to have potentially erroneous http:// URLs
462 # assigned in their metadata
463 if ($dirname =~ /^[^\/]*ftp/i)
464 {
465 $web_url = "ftp://";
466 $utf8_web_url = "ftp://";
467 }
468 $dirname = $self->eval_dir_dots($dirname);
469 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
470
471 $web_url = $web_url.$dirname.$url_encoded_file;
472 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
473 } else {
474 $web_url = $web_url.$url_encoded_file;
475 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
476 }
477 $web_url =~ s/\\/\//g;
478 $utf8_web_url =~ s/\\/\//g;
479
480 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
481 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
482 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
483 }
484
485
486 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
487 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
488
489 if ($self->{'file_is_url'}) {
490 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
491 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
492 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
493 }
494
495 if ($self->{'description_tags'}) {
496 # remove the html header - note that doing this here means any
497 # sections defined within the header will be lost (so all <Section>
498 # tags must appear within the body of the HTML)
499 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
500
501 $$textref =~ s/^.*?<body[^>]*>//is;
502 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
503
504 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
505 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
506
507 my $lt = '(?:<|&lt;)';
508 my $gt = '(?:>|&gt;)';
509 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
510
511 my $dont_strip = '';
512 if ($self->{'no_strip_metadata_html'}) {
513 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
514 }
515
516 my $found_something = 0; my $top = 1;
517 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
518 my $text = $1;
519 my $comment = $2;
520 if (defined $text) {
521 # text before a comment - note that getting to here
522 # doesn't necessarily mean there are Section tags in
523 # the document
524 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
525 }
526 while ($comment =~ s/$lt(.*?)$gt//s) {
527 my $tag = $1;
528 if ($tag eq "Section") {
529 $found_something = 1;
530 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
531 $top = 0;
532 } elsif ($tag eq "/Section") {
533 $found_something = 1;
534 $cursection = $doc_obj->get_parent_section ($cursection);
535 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
536 my $metaname = $1;
537 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
538 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
539 my $metavalue = $1;
540 $metavalue =~ s/^\s+//;
541 $metavalue =~ s/\s+$//;
542 # assume that no metadata value intentionally includes
543 # carriage returns or HTML tags (if they're there they
544 # were probably introduced when converting to HTML from
545 # some other format).
546 # actually some people want to have html tags in their
547 # metadata.
548 $metavalue =~ s/[\cJ\cM]/ /sg;
549 $metavalue =~ s/<[^>]+>//sg
550 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
551 $metavalue =~ s/\s+/ /sg;
552 if ($metaname =~ /\./) { # has a namespace
553 $metaname = "ex.$metaname";
554 }
555 if ($accumulate) {
556 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
557 } else {
558 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
559 }
560 } elsif ($tag eq "Description" || $tag eq "/Description") {
561 # do nothing with containing Description tags
562 } else {
563 # simple HTML tag (probably created by the conversion
564 # to HTML from some other format) - we'll ignore it and
565 # hope for the best ;-)
566 }
567 }
568 }
569 if ($cursection ne "") {
570 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
571 }
572
573 $$textref =~ s/^.*?<body[^>]*>//is;
574 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
575 if ($$textref =~ m/\S/) {
576 if (!$found_something) {
577 if ($self->{'verbosity'} > 2) {
578 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
579 print $outhandle " will be processed as a single section document\n";
580 }
581
582 # go ahead and process single-section document
583 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
584
585 # if document contains no Section tags we'll go ahead
586 # and extract metadata (this won't have been done
587 # above as the -description_tags option prevents it)
588 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
589 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
590 unless $self->{'no_metadata'};
591
592 } else {
593 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
594 print $outhandle " of the final closing </Section> tag. This text will\n";
595 print $outhandle " be ignored.";
596
597 my ($text);
598 if (length($$textref) > 30) {
599 $text = substr($$textref, 0, 30) . "...";
600 } else {
601 $text = $$textref;
602 }
603 $text =~ s/\n/ /isg;
604 print $outhandle " ($text)\n";
605 }
606 } elsif (!$found_something) {
607
608 if ($self->{'verbosity'} > 2) {
609 # may get to here if document contained no valid Section
610 # tags but did contain some comments. The text will have
611 # been processed already but we should print the warning
612 # as above and extract metadata
613 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
614 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
615 }
616
617 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
618 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
619 unless $self->{'no_metadata'};
620 }
621
622 } else {
623
624 # remove header and footer
625 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
626 $$textref =~ s/^.*?<body[^>]*>//is;
627 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
628 }
629
630
631 # single section document
632 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
633 }
634
635 return 1;
636}
637
638
639sub process_heading
640{
641 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
642 $strHeadingText = '' if (!defined($strHeadingText));
643
644 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
645
646 my $strSecMetadata = '';
647 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
648 {
649 $strSecMetadata .= $1;
650 }
651
652 $strHeadingText =~ s/^\s+//g;
653 $strHeadingText =~ s/\s+$//g;
654 $strSecMetadata =~ s/^\s+//g;
655 $strSecMetadata =~ s/\s+$//g;
656
657 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
658
659 if (length($strSecMetadata)) {
660 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
661 }
662
663 $strMetadata .= "\t</Description>\n";
664
665 return "<!--" . $strMetadata . "-->";
666}
667
668
669sub update_section_data
670{
671 my ($self, $arrSections, $nCurTocNo) = @_;
672 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
673
674 if ($nSections == 0) {
675 push @$arrSections, $nCurTocNo;
676 return $strBuffer;
677 }
678 $nLast = $arrSections->[$nSections - 1];
679 if ($nCurTocNo > $nLast) {
680 push @$arrSections, $nCurTocNo;
681 return $strBuffer;
682 }
683 for(my $i = $nSections - 1; $i >= 0; $i--) {
684 if ($nCurTocNo <= $arrSections->[$i]) {
685 $strBuffer .= "\n</Section>";
686 pop @$arrSections;
687 }
688 }
689 push @$arrSections, $nCurTocNo;
690 return $strBuffer;
691}
692
693
694# note that process_section may be called multiple times for a single
695# section (relying on the fact that add_utf8_text appends the text to any
696# that may exist already).
697sub process_section {
698 my $self = shift (@_);
699 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
700 # trap links
701 if (!$self->{'nolinks'}) {
702 # usemap="./#index" not handled correctly => change to "#index"
703## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
704
705## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
706## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
707
708 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
709 $self->replace_usemap_links($1, $2, $3)/isge;
710
711 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
712 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
713
714## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
715# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
716 }
717
718 # trap images
719
720 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
721 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
722 # turned regular text succeeding images into links. That is, by embedding <imgs>
723 # inside <a href=""></a>, the text following images were no longer misbehaving.
724 # However, there would be many occasions whereby images were not meant to link
725 # to their source images but where the images would link to another web page.
726 # To allow this, the no_image_links option was introduced: it would prevent
727 # the behaviour of embedding images into links that referenced the source images.
728
729 # Somewhere along the line, the problem of normal text turning into links when
730 # such text followed images which were not embedded in <a href=""></a> ceased
731 # to occur. This is why the following lines have been commented out (as well as
732 # two lines in replace_images). They appear to no longer apply.
733
734 # If at any time, there is a need for having images embedded in <a> anchor tags,
735 # then it might be better to turn that into an HTMLPlugin option rather than make
736 # it the default behaviour. Also, eventually, no_image_links needs to become
737 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
738
739 #if(!$self->{'no_image_links'}){
740 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
741 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
742 #}
743
744 # add text to document object
745 # turn \ into \\ so that the rest of greenstone doesn't think there
746 # is an escape code following. (Macro parsing loses them...)
747 $$textref =~ s/\\/\\\\/go;
748
749 $doc_obj->add_utf8_text($cursection, $$textref);
750}
751
752sub replace_images {
753 my $self = shift (@_);
754 my ($front, $link, $back, $base_dir,
755 $file, $doc_obj, $section) = @_;
756
757 # remove quotes from link at start and end if necessary
758 if ($link=~/^[\"\']/) {
759 $link=~s/^[\"\']//;
760 $link=~s/[\"\']$//;
761 $front.='"';
762 $back="\"$back";
763 }
764
765 $link =~ s/\n/ /g;
766
767 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
768 # If the Word file path has spaces in it, wv messes up and you end up with
769 # absolute paths for the images, and without the "file://" prefix
770 # So check for this special case and massage the data to be correct
771 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
772 $link =~ s/^.*\\([^\\]+)$/$1/;
773 }
774
775 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
776
777 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
778
779# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
780
781 my $anchor_name = $img_file;
782 #$anchor_name =~ s/^.*\///;
783 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
784
785 my $image_link = $front . $img_file .$back;
786 return $image_link;
787
788 # The reasons for why the following two lines are no longer necessary can be
789 # found in subroutine process_section
790 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
791 #return $anchor_link;
792
793 #return $front . $img_file . $back . $anchor_name;
794}
795
796sub replace_href_links {
797 my $self = shift (@_);
798 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
799
800 # remove quotes from link at start and end if necessary
801 if ($link=~/^[\"\']/) {
802 $link=~s/^[\"\']//;
803 $link=~s/[\"\']$//;
804 $front.='"';
805 $back="\"$back";
806 }
807
808 # attempt to sort out targets - frames are not handled
809 # well in this plugin and some cases will screw things
810 # up - e.g. the _parent target (so we'll just remove
811 # them all ;-)
812 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
813 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
814 $front =~ s/target=\"?_parent\"?//is;
815 $back =~ s/target=\"?_parent\"?//is;
816
817 return $front . $link . $back if $link =~ m/^\#/s;
818 $link =~ s/\n/ /g;
819
820 # Find file referred to by $link on file system
821 # This is more complicated than it sounds when char encodings
822 # is taken in to account
823 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
824
825 # href may use '\'s where '/'s should be on Windows
826 $href =~ s/\\/\//g;
827 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
828
829
830 ##### leave all these links alone (they won't be picked up by intermediate
831 ##### pages). I think that's safest when dealing with frames, targets etc.
832 ##### (at least until I think of a better way to do it). Problems occur with
833 ##### mailto links from within small frames, the intermediate page is displayed
834 ##### within that frame and can't be seen. There is still potential for this to
835 ##### happen even with html pages - the solution seems to be to somehow tell
836 ##### the browser from the server side to display the page being sent (i.e.
837 ##### the intermediate page) in the top level window - I'm not sure if that's
838 ##### possible - the following line should probably be deleted if that can be done
839 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
840
841 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
842 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
843
844 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
845
846 # Don't do any encoding for now, as not clear what
847 # the right thing to do is to support filename
848 # encoding on Windows when they are not UTF16
849 #
850 }
851 else {
852 # => Unix-based system
853
854 # If web page didn't give encoding, then default to utf8
855 my $content_encoding= $self->{'content_encoding'} || "utf8";
856
857 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
858 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
859 }
860
861 # on Darwin, the unicode filenames are stored on the file
862 # system in decomposed form, so any href link (including when
863 # URL-encoded) should refer to the decomposed name of the file
864 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
865 $href = normalize('D', $href); # Normalization Form D (decomposition)
866 }
867
868 $href = encode($content_encoding,$href);
869 }
870
871 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
872 $href = &unicode::filename_to_url($href);
873
874 &ghtml::urlsafe ($href);
875
876 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
877 print STDERR "******DEBUG: href=$href\n";
878 }
879
880
881 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
882 } else {
883 # link is to some other type of file (e.g., an image) so we'll
884 # need to associate that file
885 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
886 }
887}
888
889sub add_file {
890 my $self = shift (@_);
891 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
892 my ($newname);
893
894 my $filename = $href;
895 if ($base_dir eq "") {
896 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
897 # remove http://
898 $filename =~ s/^[^:]*:\/\///;
899 }
900 else {
901 # remove http:/ thereby leaving one slash at the start as
902 # part of full pathname
903 $filename =~ s/^[^:]*:\///;
904 }
905 }
906 else {
907 # remove http://
908 $filename =~ s/^[^:]*:\/\///;
909 }
910
911 $filename = &util::filename_cat($base_dir, $filename);
912
913 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
914 # we are processing a tidytmp file - want paths to be in import
915 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
916 }
917
918 # Replace %XX's in URL with decoded value if required. Note that the
919 # filename may include the %XX in some situations. If the *original*
920 # file's name was in URL encoding, the following method will not decode
921 # it.
922 my $unicode_filename = $filename;
923 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
924
925 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
926 # (it seems) accompanying file
927 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
928
929 my $content_encoding= $self->{'content_encoding'} || "utf8";
930
931 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
932 # The filenames that come through the HTML file have been decoded
933 # into Unicode aware Perl strings. Need to convert them back
934 # to their initial raw-byte encoding to match the file that
935 # exists on the file system
936 $filename = encode($content_encoding, $opt_decode_unicode_filename);
937 }
938 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
939 # HFS+ is UTF8 with decompostion
940 $filename = encode($content_encoding, $opt_decode_unicode_filename);
941 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
942
943 }
944 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
945 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
946
947 if (defined $long_filename) {
948 my $short_filename = Win32::GetLongPathName($long_filename);
949 $filename = $short_filename;
950 }
951# else {
952# print STDERR "***** failed to map href to real file:\n";
953# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
954# }
955 }
956 else {
957 my $outhandle = $self->{'outhandle'};
958 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
959 print $outhandle " in file system encoding of href: $href\n";
960 print $outhandle " No character encoding done.\n";
961 }
962
963
964 # some special processing if the intended filename was converted to utf8, but
965 # the actual file still needs to be renamed
966 if (!&util::fd_exists($filename)) {
967 # try the original filename stored in map
968 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
969 print STDERR "******!! orig filename did not exist: $filename\n";
970 }
971
972## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
973
974 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
975
976 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
977 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
978 }
979
980 if (defined $original_filename && -e $original_filename) {
981 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
982 print STDERR "****** Found match!\n";
983 }
984 $filename = $original_filename;
985 }
986 }
987
988 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
989
990 if ($rl == 0) {
991 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
992 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
993 }
994 else {
995 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
996 }
997 }
998
999 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1000 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1001 }
1002 # add the original image file as a source file
1003 if (!$self->{'processing_tmp_files'} ) {
1004 $doc_obj->associate_source_file($filename);
1005 }
1006 if ($self->{'rename_assoc_files'}) {
1007 if (defined $self->{'aux_files'}->{$href}) {
1008 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1009 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1010 } else {
1011 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1012 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1013 $self->inc_filecount ();
1014 }
1015 $doc_obj->associate_file($filename, $newname, undef, $section);
1016 return "_httpdocimg_/$newname";
1017 } else {
1018 if(&unicode::is_url_encoded($unicode_filename)) {
1019 # use the possibly-decoded filename instead to avoid double URL encoding
1020 ($newname) = $filename =~ m/([^\/\\]*)$/;
1021 } else {
1022 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1023 }
1024
1025 # Make sure this name uses only ASCII characters.
1026 # We use either base64 or URL encoding, as these preserve original encoding
1027 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1028
1029### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1030 $doc_obj->associate_file($filename, $newname, undef, $section);
1031
1032 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1033 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1034 my $newname_url = $newname;
1035 $newname_url = &unicode::filename_to_url($newname_url);
1036 return "_httpdocimg_/$newname_url";
1037 }
1038}
1039
1040
1041sub format_link {
1042 my $self = shift (@_);
1043 my ($link, $base_dir, $file) = @_;
1044
1045 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1046 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1047
1048 $hash_part = "" if !defined $hash_part;
1049 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1050 my $outhandle = $self->{'outhandle'};
1051 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1052 if $self->{'verbosity'};
1053 return ($link, "", 0);
1054 }
1055
1056 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1057 my $type = $1;
1058
1059 if ($link =~ m/^(http|ftp):/i) {
1060 # Turn url (using /) into file name (possibly using \ on windows)
1061 my @http_dir_split = split('/', $before_hash);
1062 $before_hash = &util::filename_cat(@http_dir_split);
1063 }
1064
1065 $before_hash = $self->eval_dir_dots($before_hash);
1066
1067 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1068
1069 my $rl = 0;
1070 $rl = 1 if (-e $linkfilename);
1071
1072 # make sure there's a slash on the end if it's a directory
1073 if ($before_hash !~ m/\/$/) {
1074 $before_hash .= "/" if (-d $linkfilename);
1075 }
1076 return ($type . $before_hash, $hash_part, $rl);
1077
1078 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1079
1080 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1081
1082 # the first directory will be the domain name if file_is_url
1083 # to generate archives, otherwise we'll assume all files are
1084 # from the same site and base_dir is the root
1085
1086 if ($self->{'file_is_url'}) {
1087 my @dirs = split /[\/\\]/, $file;
1088 my $domname = shift (@dirs);
1089 $before_hash = &util::filename_cat($domname, $before_hash);
1090 $before_hash =~ s@\\@/@g; # for windows
1091 }
1092 else
1093 {
1094 # see if link shares directory with source document
1095 # => turn into relative link if this is so!
1096
1097 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1098 # too difficult doing a pattern match with embedded '\'s...
1099 my $win_before_hash=$before_hash;
1100 $win_before_hash =~ s@(\\)+@/@g;
1101 # $base_dir is already similarly "converted" on windows.
1102 if ($win_before_hash =~ s@^$base_dir/@@o) {
1103 # if this is true, we removed a prefix
1104 $before_hash=$win_before_hash;
1105 }
1106 }
1107 else {
1108 # before_hash has lost leading slash by this point,
1109 # -> add back in prior to substitution with $base_dir
1110 $before_hash = "/$before_hash";
1111
1112 $before_hash = &util::filename_cat("",$before_hash);
1113 $before_hash =~ s@^$base_dir/@@;
1114 }
1115 }
1116 } else {
1117 # Turn relative file path into full path
1118 my $dirname = &File::Basename::dirname($file);
1119 $before_hash = &util::filename_cat($dirname, $before_hash);
1120 $before_hash = $self->eval_dir_dots($before_hash);
1121 }
1122
1123 my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1124
1125
1126# print STDERR "**** linkfilename = $linkfilename\n";
1127# if (!&util::fd_exists($linkfilename)) {
1128# print STDERR "***** Warning: Could not find $linkfilename\n";
1129# }
1130
1131
1132 # make sure there's a slash on the end if it's a directory
1133 if ($before_hash !~ m/\/$/) {
1134 $before_hash .= "/" if (-d $linkfilename);
1135 }
1136
1137# print STDERR "*** returning: $before_hash\n";
1138
1139 return ("http://" . $before_hash, $hash_part, 1);
1140 } else {
1141 # mailto, news, nntp, telnet, javascript or gopher link
1142 return ($before_hash, "", 0);
1143 }
1144}
1145
1146sub extract_first_NNNN_characters {
1147 my $self = shift (@_);
1148 my ($textref, $doc_obj, $thissection) = @_;
1149
1150 foreach my $size (split /,/, $self->{'first'}) {
1151 my $tmptext = $$textref;
1152 # skip to the body
1153 $tmptext =~ s/.*<body[^>]*>//i;
1154 # remove javascript
1155 $tmptext =~ s@<script.*?</script>@ @sig;
1156 $tmptext =~ s/<[^>]*>/ /g;
1157 $tmptext =~ s/&nbsp;/ /g;
1158 $tmptext =~ s/^\s+//;
1159 $tmptext =~ s/\s+$//;
1160 $tmptext =~ s/\s+/ /gs;
1161 $tmptext = &unicode::substr ($tmptext, 0, $size);
1162 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
1163 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1164 }
1165}
1166
1167
1168sub extract_metadata {
1169 my $self = shift (@_);
1170 my ($textref, $metadata, $doc_obj, $section) = @_;
1171 my $outhandle = $self->{'outhandle'};
1172 # if we don't want metadata, we may as well not be here ...
1173 return if (!defined $self->{'metadata_fields'});
1174
1175 my $separator = $self->{'metadata_field_separator'};
1176 if ($separator eq "") {
1177 undef $separator;
1178 }
1179
1180 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1181 # html meta, 'value' is the metadata name for greenstone to use
1182 my %find_fields = ();
1183
1184 my %creator_fields = (); # short-cut for lookups
1185
1186
1187 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1188 $field =~ s/^\s+//; # remove leading whitespace
1189 $field =~ s/\s+$//; # remove trailing whitespace
1190
1191 # support tag<tagname>
1192 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1193 # "$2" is the user's preferred gs metadata name
1194 $find_fields{lc($1)}=$2; # lc = lowercase
1195 } else { # no <tagname> for mapping
1196 # "$field" is the user's preferred gs metadata name
1197 $find_fields{lc($field)}=$field; # lc = lowercase
1198 }
1199 }
1200
1201 if (defined $self->{'hunt_creator_metadata'} &&
1202 $self->{'hunt_creator_metadata'} == 1 ) {
1203 my @extra_fields =
1204 (
1205 'author',
1206 'author.email',
1207 'creator',
1208 'dc.creator',
1209 'dc.creator.corporatename',
1210 );
1211
1212 # add the creator_metadata fields to search for
1213 foreach my $field (@extra_fields) {
1214 $creator_fields{$field}=0; # add to lookup hash
1215 }
1216 }
1217
1218
1219 # find the header in the html file, which has the meta tags
1220 $$textref =~ m@<head>(.*?)</head>@si;
1221
1222 my $html_header=$1;
1223
1224 # go through every <meta... tag defined in the html and see if it is
1225 # one of the tags we want to match.
1226
1227 # special case for title - we want to remember if its been found
1228 my $found_title = 0;
1229 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1230 $html_header =~ m/^/; # match the start of the string, for \G assertion
1231
1232 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1233 my $metatag=$1;
1234 my ($tag, $value);
1235
1236 # find the tag name
1237 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1238 $tag=$2;
1239 # in case they're not using " or ', but they should...
1240 if (! $tag) {
1241 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1242 $tag=$1;
1243 }
1244
1245 if (!defined $tag) {
1246 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1247 next;
1248 }
1249
1250 # don't need to assign this field if it was passed in from a previous
1251 # (recursive) plugin
1252 if (defined $metadata->{$tag}) {next}
1253
1254 # find the tag content
1255 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1256 $value=$2;
1257
1258 # The following code assigns the metaname to value if value is
1259 # empty. Why would we do this?
1260 #if (! $value) {
1261 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1262 # $value=$1;
1263 #}
1264 if (!defined $value || $value eq "") {
1265 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1266 next;
1267 }
1268
1269 # clean up and add
1270 $value =~ s/\s+/ /gs;
1271 chomp($value); # remove trailing \n, if any
1272 if (exists $creator_fields{lc($tag)}) {
1273 # map this value onto greenstone's "Creator" metadata
1274 $tag='Creator';
1275 } elsif (!exists $find_fields{lc($tag)}) {
1276 next; # don't want this tag
1277 } else {
1278 # get the user's preferred capitalisation
1279 $tag = $find_fields{lc($tag)};
1280 }
1281 if (lc($tag) eq "title") {
1282 $found_title = 1;
1283 }
1284
1285 if ($self->{'verbosity'} > 2) {
1286 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1287 }
1288
1289 if ($tag =~ /\./) {
1290 # there is a . so has a namespace, add ex.
1291 $tag = "ex.$tag";
1292 }
1293 if (defined $separator) {
1294 my @values = split($separator, $value);
1295 foreach my $v (@values) {
1296 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1297 }
1298 }
1299 else {
1300 $doc_obj->add_utf8_metadata($section, $tag, $value);
1301 }
1302 }
1303
1304 # TITLE: extract the document title
1305 if (exists $find_fields{'title'} && !$found_title) {
1306 # we want a title, and didn't find one in the meta tags
1307 # see if there's a <title> tag
1308 my $title;
1309 my $from = ""; # for debugging output only
1310 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1311 $title = $1;
1312 $from = "<title> tags";
1313 }
1314
1315 if (!defined $title) {
1316 $from = "first 100 chars";
1317 # if no title use first 100 or so characters
1318 $title = $$textref;
1319 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1320 $title =~ s/^.*?<body>//si;
1321 # ignore javascript!
1322 $title =~ s@<script.*?</script>@ @sig;
1323 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1324 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1325 $title = substr ($title, 0, 100);
1326 $title =~ s/\s\S*$/.../;
1327 }
1328 $title =~ s/<[^>]*>/ /g; # remove html tags
1329 $title =~ s/&nbsp;/ /g;
1330 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1331 $title =~ s/\s+/ /gs; # collapse multiple spaces
1332 $title =~ s/^\s*//; # remove leading spaces
1333 $title =~ s/\s*$//; # remove trailing spaces
1334
1335 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1336 $title =~ s/^\s+//s; # in case title_sub introduced any...
1337 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1338 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1339 if ($self->{'verbosity'} > 2);
1340 }
1341
1342 # add FileFormat metadata
1343 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1344
1345 # Special, for metadata names such as tagH1 - extracts
1346 # the text between the first <H1> and </H1> tags into "H1" metadata.
1347
1348 foreach my $field (keys %find_fields) {
1349 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1350 my $tag = $1;
1351 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1352 my $content = $1;
1353 $content =~ s/&nbsp;/ /g;
1354 $content =~ s/<[^>]*>/ /g;
1355 $content =~ s/^\s+//;
1356 $content =~ s/\s+$//;
1357 $content =~ s/\s+/ /gs;
1358 if ($content) {
1359 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1360 $tag =~ s/^tag//i;
1361 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1362 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1363 if ($self->{'verbosity'} > 2);
1364 }
1365 }
1366 }
1367}
1368
1369
1370# evaluate any "../" to next directory up
1371# evaluate any "./" as here
1372sub eval_dir_dots {
1373 my $self = shift (@_);
1374 my ($filename) = @_;
1375 my $dirsep_os = &util::get_os_dirsep();
1376 my @dirsep = split(/$dirsep_os/,$filename);
1377
1378 my @eval_dirs = ();
1379 foreach my $d (@dirsep) {
1380 if ($d eq "..") {
1381 pop(@eval_dirs);
1382
1383 } elsif ($d eq ".") {
1384 # do nothing!
1385
1386 } else {
1387 push(@eval_dirs,$d);
1388 }
1389 }
1390
1391 # Need to fiddle with number of elements in @eval_dirs if the
1392 # first one is the empty string. This is because of a
1393 # modification to util::filename_cat that supresses the addition
1394 # of a leading '/' character (or \ if windows) (intended to help
1395 # filename cat with relative paths) if the first entry in the
1396 # array is the empty string. Making the array start with *two*
1397 # empty strings is a way to defeat this "smart" option.
1398 #
1399 if (scalar(@eval_dirs) > 0) {
1400 if ($eval_dirs[0] eq ""){
1401 unshift(@eval_dirs,"");
1402 }
1403 }
1404
1405 my $evaluated_filename = (scalar @eval_dirs > 0) ? &util::filename_cat(@eval_dirs) : "";
1406 return $evaluated_filename;
1407}
1408
1409sub replace_usemap_links {
1410 my $self = shift (@_);
1411 my ($front, $link, $back) = @_;
1412
1413 # remove quotes from link at start and end if necessary
1414 if ($link=~/^[\"\']/) {
1415 $link=~s/^[\"\']//;
1416 $link=~s/[\"\']$//;
1417 $front.='"';
1418 $back="\"$back";
1419 }
1420
1421 $link =~ s/^\.\///;
1422 return $front . $link . $back;
1423}
1424
1425sub inc_filecount {
1426 my $self = shift (@_);
1427
1428 if ($self->{'file_num'} == 1000) {
1429 $self->{'dir_num'} ++;
1430 $self->{'file_num'} = 0;
1431 } else {
1432 $self->{'file_num'} ++;
1433 }
1434}
1435
1436
1437# Extend read_file so that strings like &eacute; are
1438# converted to UTF8 internally.
1439#
1440# We don't convert &lt; or &gt; or &amp; or &quot; in case
1441# they interfere with the GML files
1442
1443sub read_file {
1444 my $self = shift(@_);
1445 my ($filename, $encoding, $language, $textref) = @_;
1446
1447 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1448
1449 # Convert entities to their Unicode code-point equivalents
1450 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1451 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1452 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1453
1454}
1455
1456sub HB_read_html_file {
1457 my $self = shift (@_);
1458 my ($htmlfile, $text) = @_;
1459
1460 # load in the file
1461 if (!open (FILE, $htmlfile)) {
1462 print STDERR "ERROR - could not open $htmlfile\n";
1463 return;
1464 }
1465
1466 my $foundbody = 0;
1467 $self->HB_gettext (\$foundbody, $text, "FILE");
1468 close FILE;
1469
1470 # just in case there was no <body> tag
1471 if (!$foundbody) {
1472 $foundbody = 1;
1473 open (FILE, $htmlfile) || return;
1474 $self->HB_gettext (\$foundbody, $text, "FILE");
1475 close FILE;
1476 }
1477 # text is in utf8
1478}
1479
1480# converts the text to utf8, as ghtml does that for &eacute; etc.
1481sub HB_gettext {
1482 my $self = shift (@_);
1483 my ($foundbody, $text, $handle) = @_;
1484
1485 my $line = "";
1486 while (defined ($line = <$handle>)) {
1487 # look for body tag
1488 if (!$$foundbody) {
1489 if ($line =~ s/^.*<body[^>]*>//i) {
1490 $$foundbody = 1;
1491 } else {
1492 next;
1493 }
1494 }
1495
1496 # check for symbol fonts
1497 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1498 my $font = $1;
1499 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1500 if ($font !~ m/^arial$/i);
1501 }
1502
1503 $$text .= $line;
1504 }
1505
1506 if ($self->{'input_encoding'} eq "iso_8859_1") {
1507 # convert to utf-8
1508 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1509 }
1510 # convert any alphanumeric character entities to their utf-8
1511 # equivalent for indexing purposes
1512 #&ghtml::convertcharentities ($$text);
1513
1514 $$text =~ s/\s+/ /g; # remove \n's
1515
1516 # At this point $$text is a binary byte string
1517 # => turn it into a Unicode aware string, so full
1518 # Unicode aware pattern matching can be used.
1519 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1520 #
1521
1522 $$text = decode("utf8",$$text);
1523}
1524
1525sub HB_clean_section {
1526 my $self = shift (@_);
1527 my ($section) = @_;
1528
1529 # remove tags without a starting tag from the section
1530 my ($tag, $tagstart);
1531 while ($section =~ m/<\/([^>]{1,10})>/) {
1532 $tag = $1;
1533 $tagstart = index($section, "<$tag");
1534 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1535 $section =~ s/<\/$tag>//;
1536 }
1537
1538 # remove extra paragraph tags
1539 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1540
1541 # remove extra stuff at the end of the section
1542 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1543
1544 # add a newline at the beginning of each paragraph
1545 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1546
1547 # add a newline every 80 characters at a word boundary
1548 # Note: this regular expression puts a line feed before
1549 # the last word in each section, even when it is not
1550 # needed.
1551 $section =~ s/(.{1,80})\s/$1\n/g;
1552
1553 # fix up the image links
1554 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1555 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1556 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1557 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1558
1559 return $section;
1560}
1561
1562# Will convert the oldHDL format to the new HDL format (using the Section tag)
1563sub convert_to_newHDLformat
1564{
1565 my $self = shift (@_);
1566 my ($file,$cnfile) = @_;
1567 my $input_filename = $file;
1568 my $tmp_filename = $cnfile;
1569
1570 # write HTML tmp file with new HDL format
1571 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1572
1573 # read in the file and do basic html cleaning (removing header etc)
1574 my $html = "";
1575 $self->HB_read_html_file ($input_filename, \$html);
1576
1577 # process the file one section at a time
1578 my $curtoclevel = 1;
1579 my $firstsection = 1;
1580 my $toclevel = 0;
1581 while (length ($html) > 0) {
1582 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1583 $toclevel = $3;
1584 my $title = $4;
1585 my $sectiontext = "";
1586 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1587 $sectiontext = $1;
1588 } else {
1589 $sectiontext = $html;
1590 $html = "";
1591 }
1592
1593 # remove tags and extra spaces from the title
1594 $title =~ s/<\/?[^>]+>//g;
1595 $title =~ s/^\s+|\s+$//g;
1596
1597 # close any sections below the current level and
1598 # create a new section (special case for the firstsection)
1599 print PROD "<!--\n";
1600 while (($curtoclevel > $toclevel) ||
1601 (!$firstsection && $curtoclevel == $toclevel)) {
1602 $curtoclevel--;
1603 print PROD "</Section>\n";
1604 }
1605 if ($curtoclevel+1 < $toclevel) {
1606 print STDERR "WARNING - jump in toc levels in $input_filename " .
1607 "from $curtoclevel to $toclevel\n";
1608 }
1609 while ($curtoclevel < $toclevel) {
1610 $curtoclevel++;
1611 }
1612
1613 if ($curtoclevel == 1) {
1614 # add the header tag
1615 print PROD "-->\n";
1616 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1617 print PROD "<!--\n";
1618 }
1619
1620 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1621
1622 print PROD "-->\n";
1623
1624 # clean up the section html
1625 $sectiontext = $self->HB_clean_section($sectiontext);
1626
1627 print PROD "$sectiontext\n";
1628
1629 } else {
1630 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1631 "\nin $input_filename\n";
1632 last;
1633 }
1634 $firstsection = 0;
1635 }
1636
1637 print PROD "<!--\n";
1638 while ($curtoclevel > 0) {
1639 $curtoclevel--;
1640 print PROD "</Section>\n";
1641 }
1642 print PROD "-->\n";
1643
1644 close (PROD) || die("Error Closing File: $tmp_filename $!");
1645
1646 return $tmp_filename;
1647}
1648
1649sub shorten {
1650 my $self = shift (@_);
1651 my ($text) = @_;
1652
1653 return "\"$text\"" if (length($text) < 100);
1654
1655 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1656 substr ($text, length($text)-50) . "\"";
1657}
1658
1659sub convert_tidy_or_oldHDL_file
1660{
1661 my $self = shift (@_);
1662 my ($file) = @_;
1663 my $input_filename = $file;
1664
1665 if (-d $input_filename)
1666 {
1667 return $input_filename;
1668 }
1669
1670 # get the input filename
1671 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1672 my $base_dirname = $dirname;
1673 $suffix = lc($suffix);
1674
1675 # derive tmp filename from input filename
1676 # Remove any white space from filename -- no risk of name collision, and
1677 # makes later conversion by utils simpler. Leave spaces in path...
1678 # tidy up the filename with space, dot, hyphen between
1679 $tailname =~ s/\s+//g;
1680 $tailname =~ s/\.+//g;
1681 $tailname =~ s/\-+//g;
1682 # convert to utf-8 otherwise we have problems with the doc.xml file
1683 # later on
1684 &unicode::ensure_utf8(\$tailname);
1685
1686 # softlink to collection tmp dir
1687 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1688 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1689
1690 my $test_dirname = "";
1691 my $f_separator = &util::get_os_dirsep();
1692
1693 if ($dirname =~ m/import$f_separator/)
1694 {
1695 $test_dirname = $'; #'
1696
1697 #print STDERR "init $'\n";
1698
1699 while ($test_dirname =~ m/[$f_separator]/)
1700 {
1701 my $folderdirname = $`;
1702 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
1703 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1704 $test_dirname = $'; #'
1705 }
1706 }
1707
1708 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1709
1710 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1711 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1712 {
1713 #convert the input file to a new style HDL
1714 my $hdl_output_filename = $input_filename;
1715 if ($self->{'old_style_HDL'})
1716 {
1717 $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1718 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1719 }
1720
1721 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1722 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1723 my @files = grep {!/^\.+$/} readdir(DIR);
1724 close(DIR);
1725
1726 foreach my $file (@files)
1727 {
1728 my $src_file = &util::filename_cat($base_dirname,$file);
1729 my $dest_file = &util::filename_cat($tmp_dirname,$file);
1730 if ((!-e $dest_file) && (!-d $src_file))
1731 {
1732 # just copy the original file back to the tmp directory
1733 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1734 }
1735 }
1736
1737 # tidy the input file
1738 my $tidy_output_filename = $hdl_output_filename;
1739 if ($self->{'use_realistic_book'})
1740 {
1741 $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
1742 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1743 }
1744 $tmp_filename = $tidy_output_filename;
1745 }
1746 else
1747 {
1748 if (!-e $tmp_filename)
1749 {
1750 # just copy the original file back to the tmp directory
1751 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1752 }
1753 }
1754
1755 return $tmp_filename;
1756}
1757
1758
1759# Will make the html input file as a proper XML file with removed font tag and
1760# image size added to the img tag.
1761# The tidying process takes place in a collection specific 'tmp' directory so
1762# that we don't accidentally damage the input.
1763sub tmp_tidy_file
1764{
1765 my $self = shift (@_);
1766 my ($file,$cnfile) = @_;
1767 my $input_filename = $file;
1768 my $tmp_filename = $cnfile;
1769
1770 # get the input filename
1771 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1772
1773 require HTML::TokeParser::Simple;
1774
1775 # create HTML parser to decode the input file
1776 my $parser = HTML::TokeParser::Simple->new($input_filename);
1777
1778 # write HTML tmp file without the font tag and image size are added to the img tag
1779 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1780 while (my $token = $parser->get_token())
1781 {
1782 # is it an img tag
1783 if ($token->is_start_tag('img'))
1784 {
1785 # get the attributes
1786 my $attr = $token->return_attr;
1787
1788 # get the full path to the image
1789 my $img_file = &util::filename_cat($dirname,$attr->{src});
1790
1791 # set the width and height attribute
1792 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1793
1794 # recreate the tag
1795 print PROD "<img";
1796 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1797 print PROD ">";
1798 }
1799 # is it a font tag
1800 else
1801 {
1802 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1803 {
1804 # remove font tag
1805 print PROD "";
1806 }
1807 else
1808 {
1809 # print without changes
1810 print PROD $token->as_is;
1811 }
1812 }
1813 }
1814 close (PROD) || die("Error Closing File: $tmp_filename $!");
1815
1816 # run html-tidy on the tmp file to make it a proper XML file
1817
1818 my $outhandle = $self->{'outhandle'};
1819 print $outhandle "Converting HTML to be XML compliant:\n";
1820
1821 my $tidy_cmd = "tidy";
1822 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1823 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1824 if ($self->{'verbosity'} <= 2) {
1825 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1826 $tidy_cmd .= " 2>nul";
1827 }
1828 else {
1829 $tidy_cmd .= " 2>/dev/null";
1830 }
1831 print $outhandle " => $tidy_cmd\n";
1832 }
1833
1834 my $tidyfile = `$tidy_cmd`;
1835
1836 # write result back to the tmp file
1837 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1838 print PROD $tidyfile;
1839 close (PROD) || die("Error Closing File: $tmp_filename $!");
1840
1841 # return the output filename
1842 return $tmp_filename;
1843}
1844
1845sub associate_cover_image
1846{
1847 my $self = shift(@_);
1848 my ($doc_obj, $filename) = @_;
1849 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1850 {
1851 # we will have cover image in tidytmp, but want it from import
1852 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1853 }
1854 $self->SUPER::associate_cover_image($doc_obj, $filename);
1855}
1856
1857
18581;
Note: See TracBrowser for help on using the repository browser.