source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 27703

Last change on this file since 27703 was 27703, checked in by ak19, 11 years ago

Dr Bainbridge fixed the final diffcol issue with Small-HTML on windows where links in the body of the text had backslashes on Windows. They needed to be linux style slashes. The code change and logic was slightly more complicated as the links were used to test whether referenced files existed on the file system and were therefore relative (to the GS collection) or external links.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 66.0 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use FileUtils;
47use XMLParser;
48
49use Image::Size;
50use File::Copy;
51
52sub BEGIN {
53 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
54}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasePlugin.process_exp}",
62 'type' => "regexp",
63 'deft' => &get_default_process_exp() },
64 { 'name' => "block_exp",
65 'desc' => "{BasePlugin.block_exp}",
66 'type' => 'regexp',
67 'deft' => &get_default_block_exp() },
68 { 'name' => "nolinks",
69 'desc' => "{HTMLPlugin.nolinks}",
70 'type' => "flag" },
71 { 'name' => "keep_head",
72 'desc' => "{HTMLPlugin.keep_head}",
73 'type' => "flag" },
74 { 'name' => "no_metadata",
75 'desc' => "{HTMLPlugin.no_metadata}",
76 'type' => "flag" },
77 { 'name' => "metadata_fields",
78 'desc' => "{HTMLPlugin.metadata_fields}",
79 'type' => "string",
80 'deft' => "Title" },
81 { 'name' => "metadata_field_separator",
82 'desc' => "{HTMLPlugin.metadata_field_separator}",
83 'type' => "string",
84 'deft' => "" },
85 { 'name' => "hunt_creator_metadata",
86 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
87 'type' => "flag" },
88 { 'name' => "file_is_url",
89 'desc' => "{HTMLPlugin.file_is_url}",
90 'type' => "flag" },
91 { 'name' => "assoc_files",
92 'desc' => "{HTMLPlugin.assoc_files}",
93 'type' => "regexp",
94 'deft' => &get_default_block_exp() },
95 { 'name' => "rename_assoc_files",
96 'desc' => "{HTMLPlugin.rename_assoc_files}",
97 'type' => "flag" },
98 { 'name' => "title_sub",
99 'desc' => "{HTMLPlugin.title_sub}",
100 'type' => "string",
101 'deft' => "" },
102 { 'name' => "description_tags",
103 'desc' => "{HTMLPlugin.description_tags}",
104 'type' => "flag" },
105 # retain this for backward compatibility (w3mir option was replaced by
106 # file_is_url)
107 { 'name' => "w3mir",
108# 'desc' => "{HTMLPlugin.w3mir}",
109 'type' => "flag",
110 'hiddengli' => "yes"},
111 { 'name' => "no_strip_metadata_html",
112 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
113 'type' => "string",
114 'deft' => "",
115 'reqd' => "no"},
116 { 'name' => "sectionalise_using_h_tags",
117 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
118 'type' => "flag" },
119 { 'name' => "use_realistic_book",
120 'desc' => "{HTMLPlugin.tidy_html}",
121 'type' => "flag"},
122 { 'name' => "old_style_HDL",
123 'desc' => "{HTMLPlugin.old_style_HDL}",
124 'type' => "flag"},
125 {'name' => "processing_tmp_files",
126 'desc' => "{BasePlugin.processing_tmp_files}",
127 'type' => "flag",
128 'hiddengli' => "yes"}
129 ];
130
131my $options = { 'name' => "HTMLPlugin",
132 'desc' => "{HTMLPlugin.desc}",
133 'abstract' => "no",
134 'inherits' => "yes",
135 'args' => $arguments };
136
137
138sub new {
139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141 push(@$pluginlist, $class);
142
143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
145
146
147 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
148
149 if ($self->{'w3mir'}) {
150 $self->{'file_is_url'} = 1;
151 }
152 $self->{'aux_files'} = {};
153 $self->{'dir_num'} = 0;
154 $self->{'file_num'} = 0;
155
156 return bless $self, $class;
157}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if have eg <script language="javascript" src="img/lib.js@123">
161# blocking is now done by reading through the file and recording all the
162# images and other files
163sub get_default_block_exp {
164 my $self = shift (@_);
165
166 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
167 return "";
168}
169
170sub get_default_process_exp {
171 my $self = shift (@_);
172
173 # the last option is an attempt to encode the concept of an html query ...
174 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
175}
176
177sub store_block_files
178{
179 my $self =shift (@_);
180 my ($filename_full_path, $block_hash) = @_;
181
182 my $html_fname = $filename_full_path;
183
184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187 # read in file ($text will be in utf8)
188 my $raw_text = "";
189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191 my $textref = \$raw_text;
192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
208
209 if(!defined $self->{'unicode_to_original_filename'}) {
210 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
211 $self->{'unicode_to_original_filename'} = {};
212 }
213
214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216 # remove quotes from link at start and end if necessary
217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
220 }
221
222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
224 $raw_link =~ s/([^&])\#.*$/$1/s;
225
226 # some links may just be anchor names
227 next unless ($raw_link =~ /\S+/);
228
229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
230 # Turn relative file path into full path
231 my $dirname = &File::Basename::dirname($filename_full_path);
232 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
233 }
234 $raw_link = $self->eval_dir_dots($raw_link);
235
236 # this is the actual filename on the filesystem (that the link refers to)
237 my $url_original_filename = $self->opt_url_decode($raw_link);
238
239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
244
245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decompostion
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
277 my $outhandle = $self->{'outhandle'};
278
279 print $outhandle "URL Encoding $url_original_filename\n";
280 print $outhandle " ->$unicode_url_original_filename\n";
281
282 # Allow for possibility of raw byte version and Unicode versions of file
283 &util::block_filename($block_hash,$unicode_url_original_filename);
284 }
285
286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
287 &util::block_filename($block_hash,$url_original_filename);
288
289 }
290}
291
292# Given a filename in any encoding, will URL decode it to get back the original filename
293# in the original encoding. Because this method is intended to work out the *original*
294# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
295# string already exists in the local folder.
296#
297sub opt_url_decode {
298 my $self = shift (@_);
299 my ($raw_link) = @_;
300
301
302 # Replace %XX's in URL with decoded value if required.
303 # Note that the filename may include the %XX in some situations
304
305## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
308 if (!-e $raw_link) {
309 $raw_link = &unicode::url_decode($raw_link,1);
310 }
311 }
312
313 return $raw_link;
314}
315
316sub read_into_doc_obj
317{
318 my $self = shift (@_);
319 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
321 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
322
323 # Lookup content_encoding worked out in file_block pass for this file
324 # Store it under the local name 'content_encoding' so its nice and
325 # easy to access
326 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
328 # get the input file
329 my $input_filename = $file;
330 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331 $suffix = lc($suffix);
332 my $tidy_filename;
333 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
334 {
335 # because the document has to be sectionalized set the description tags
336 $self->{'description_tags'} = 1;
337
338 # set the file to be tidied
339 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
340
341 # get the tidied file
342 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
343 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
344
345 # derive tmp filename from input filename
346 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348 # set the new input file and base_dir to be from the tidied file
349 $file = "$tailname$suffix";
350 $base_dir = $dirname;
351 }
352
353 # call the parent read_into_doc_obj
354 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
355 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
356 {
357 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
358 my $collect_file = &util::filename_within_collection($filename_full_path);
359 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
361 # build. So set it manually.
362 $doc_obj->set_source_path($filename_full_path);
363 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364 $doc_obj->set_converted_filename($collect_conv_file);
365
366 my $plugin_filename_encoding = $self->{'filename_encoding'};
367 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
369 }
370
371 delete $self->{'store_content_encoding'}->{$filename_full_path};
372 $self->{'content_encoding'} = undef;
373
374 return ($process_status,$doc_obj);
375}
376
377# do plugin specific processing of doc_obj
378sub process {
379 my $self = shift (@_);
380 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381 my $outhandle = $self->{'outhandle'};
382
383 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
384 # this makes life so much easier... perl can cope with unix-style '/'s.
385 $base_dir =~ s@(\\)+@/@g;
386 $file =~ s@(\\)+@/@g;
387 }
388
389 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
390 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394 # And again
395 $upgraded_base_dir =~ s@(\\)+@/@g;
396 $upgraded_filename =~ s@(\\)+@/@g;
397
398 # Need to make sure there is a '/' on the end of upgraded_base_dir
399 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
400 $upgraded_base_dir .= "/";
401 }
402 }
403 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
404
405 # reset per-doc stuff...
406 $self->{'aux_files'} = {};
407 $self->{'dir_num'} = 0;
408 $self->{'file_num'} = 0;
409
410 # process an HTML file where sections are divided by headings tags (H1, H2 ...)
411 # you can also include metadata in the format (X can be any number)
412 # <hX>Title<!--gsdl-metadata
413 # <Metadata name="name1">value1</Metadata>
414 # ...
415 # <Metadata name="nameN">valueN</Metadata>
416 #--></hX>
417 if ($self->{'sectionalise_using_h_tags'}) {
418 # description_tags should allways be activated because we convert headings to description tags
419 $self->{'description_tags'} = 1;
420
421 my $arrSections = [];
422 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
423
424 if (scalar(@$arrSections)) {
425 my $strMetadata = $self->update_section_data($arrSections, -1);
426 if (length($strMetadata)) {
427 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428 $$textref =~ s/<\/body>/$strMetadata/ig;
429 }
430 }
431 }
432
433 my $cursection = $doc_obj->get_top_section();
434
435 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436 unless $self->{'no_metadata'} || $self->{'description_tags'};
437
438 # Store URL for page as metadata - this can be used for an
439 # altavista style search interface. The URL won't be valid
440 # unless the file structure contains the domain name (i.e.
441 # like when w3mir is used to download a website).
442
443 # URL metadata (even invalid ones) are used to support internal
444 # links, so even if 'file_is_url' is off, still need to store info
445
446 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
447
448# my $utf8_file = $self->filename_to_utf8_metadata($file);
449# $utf8_file =~ s/&\#095;/_/g;
450# variable below used to be utf8_file
451
452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
454
455 my $web_url = "http://";
456 my $utf8_web_url = "http://";
457 if(defined $dirname) { # local directory
458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
467 $utf8_web_url = "ftp://";
468 }
469 $dirname = $self->eval_dir_dots($dirname);
470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
474 } else {
475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
477 }
478 $web_url =~ s/\\/\//g;
479 $utf8_web_url =~ s/\\/\//g;
480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
484 }
485
486
487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
489
490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
537 my $metaname = $1;
538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
552 $metavalue =~ s/\s+/ /sg;
553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
576 if ($$textref =~ m/\S/) {
577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
631 $self->{'css_assoc_files'} = {};
632
633 # single section document
634 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
635
636 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638 $self->acquire_css_associated_files($doc_obj, $cursection);
639
640 $self->{'css_assoc_files'} = {};
641 }
642
643 return 1;
644}
645
646
647sub process_heading
648{
649 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650 $strHeadingText = '' if (!defined($strHeadingText));
651
652 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654 my $strSecMetadata = '';
655 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656 {
657 $strSecMetadata .= $1;
658 }
659
660 $strHeadingText =~ s/^\s+//g;
661 $strHeadingText =~ s/\s+$//g;
662 $strSecMetadata =~ s/^\s+//g;
663 $strSecMetadata =~ s/\s+$//g;
664
665 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667 if (length($strSecMetadata)) {
668 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
669 }
670
671 $strMetadata .= "\t</Description>\n";
672
673 return "<!--" . $strMetadata . "-->";
674}
675
676
677sub update_section_data
678{
679 my ($self, $arrSections, $nCurTocNo) = @_;
680 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682 if ($nSections == 0) {
683 push @$arrSections, $nCurTocNo;
684 return $strBuffer;
685 }
686 $nLast = $arrSections->[$nSections - 1];
687 if ($nCurTocNo > $nLast) {
688 push @$arrSections, $nCurTocNo;
689 return $strBuffer;
690 }
691 for(my $i = $nSections - 1; $i >= 0; $i--) {
692 if ($nCurTocNo <= $arrSections->[$i]) {
693 $strBuffer .= "\n</Section>";
694 pop @$arrSections;
695 }
696 }
697 push @$arrSections, $nCurTocNo;
698 return $strBuffer;
699}
700
701
702# note that process_section may be called multiple times for a single
703# section (relying on the fact that add_utf8_text appends the text to any
704# that may exist already).
705sub process_section {
706 my $self = shift (@_);
707 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
708
709 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
710 if(scalar(@styleTagsText) > 0)
711 {
712 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
713 foreach my $styleText (@styleTagsText)
714 {
715 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716 }
717 }
718
719 # trap links
720 if (!$self->{'nolinks'}) {
721 # usemap="./#index" not handled correctly => change to "#index"
722## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
723
724## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
725## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
726
727 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
728 $self->replace_usemap_links($1, $2, $3)/isge;
729
730 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
731 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
732
733## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
734# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
735 }
736
737 # trap images
738
739 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
740 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
741 # turned regular text succeeding images into links. That is, by embedding <imgs>
742 # inside <a href=""></a>, the text following images were no longer misbehaving.
743 # However, there would be many occasions whereby images were not meant to link
744 # to their source images but where the images would link to another web page.
745 # To allow this, the no_image_links option was introduced: it would prevent
746 # the behaviour of embedding images into links that referenced the source images.
747
748 # Somewhere along the line, the problem of normal text turning into links when
749 # such text followed images which were not embedded in <a href=""></a> ceased
750 # to occur. This is why the following lines have been commented out (as well as
751 # two lines in replace_images). They appear to no longer apply.
752
753 # If at any time, there is a need for having images embedded in <a> anchor tags,
754 # then it might be better to turn that into an HTMLPlugin option rather than make
755 # it the default behaviour. Also, eventually, no_image_links needs to become
756 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
757
758 #if(!$self->{'no_image_links'}){
759 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
760 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
761 #}
762
763 # add text to document object
764 # turn \ into \\ so that the rest of greenstone doesn't think there
765 # is an escape code following. (Macro parsing loses them...)
766 $$textref =~ s/\\/\\\\/go;
767
768 $doc_obj->add_utf8_text($cursection, $$textref);
769}
770
771sub replace_images {
772 my $self = shift (@_);
773 my ($front, $link, $back, $base_dir,
774 $file, $doc_obj, $section) = @_;
775
776 # remove quotes from link at start and end if necessary
777 if ($link=~/^[\"\']/) {
778 $link=~s/^[\"\']//;
779 $link=~s/[\"\']$//;
780 $front.='"';
781 $back="\"$back";
782 }
783
784 $link =~ s/\n/ /g;
785
786 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787 # If the Word file path has spaces in it, wv messes up and you end up with
788 # absolute paths for the images, and without the "file://" prefix
789 # So check for this special case and massage the data to be correct
790 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
791 $link =~ s/^.*\\([^\\]+)$/$1/;
792 }
793
794 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
798# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n\n";
799
800 my $anchor_name = $img_file;
801 #$anchor_name =~ s/^.*\///;
802 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804 my $image_link = $front . $img_file .$back;
805 return $image_link;
806
807 # The reasons for why the following two lines are no longer necessary can be
808 # found in subroutine process_section
809 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810 #return $anchor_link;
811
812 #return $front . $img_file . $back . $anchor_name;
813}
814
815sub replace_href_links {
816 my $self = shift (@_);
817 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
818
819 if($front =~ m/^<link / && $link =~ m/\.css"$/)
820 {
821 my $actual_link = $link;
822 $actual_link =~ s/^"(.*)"$/$1/;
823
824 my $directory = &File::Basename::dirname($file);
825
826 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
827 $self->retrieve_css_associated_files($css_filename);
828 }
829
830 # remove quotes from link at start and end if necessary
831 if ($link=~/^[\"\']/) {
832 $link=~s/^[\"\']//;
833 $link=~s/[\"\']$//;
834 $front.='"';
835 $back="\"$back";
836 }
837
838 # attempt to sort out targets - frames are not handled
839 # well in this plugin and some cases will screw things
840 # up - e.g. the _parent target (so we'll just remove
841 # them all ;-)
842 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844 $front =~ s/target=\"?_parent\"?//is;
845 $back =~ s/target=\"?_parent\"?//is;
846
847 if($link =~ m/^\#/s)
848 {
849 return $front . "_httpsamepagelink_" . $link . $back;
850 }
851
852 $link =~ s/\n/ /g;
853
854 # Find file referred to by $link on file system
855 # This is more complicated than it sounds when char encodings
856 # is taken in to account
857 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
858
859 # href may use '\'s where '/'s should be on Windows
860 $href =~ s/\\/\//g;
861 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
862
863 ##### leave all these links alone (they won't be picked up by intermediate
864 ##### pages). I think that's safest when dealing with frames, targets etc.
865 ##### (at least until I think of a better way to do it). Problems occur with
866 ##### mailto links from within small frames, the intermediate page is displayed
867 ##### within that frame and can't be seen. There is still potential for this to
868 ##### happen even with html pages - the solution seems to be to somehow tell
869 ##### the browser from the server side to display the page being sent (i.e.
870 ##### the intermediate page) in the top level window - I'm not sure if that's
871 ##### possible - the following line should probably be deleted if that can be done
872 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
873
874 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
875 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
876
877 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
878
879 # Don't do any encoding for now, as not clear what
880 # the right thing to do is to support filename
881 # encoding on Windows when they are not UTF16
882 #
883 }
884 else {
885 # => Unix-based system
886
887 # If web page didn't give encoding, then default to utf8
888 my $content_encoding= $self->{'content_encoding'} || "utf8";
889
890 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892 }
893
894 # on Darwin, the unicode filenames are stored on the file
895 # system in decomposed form, so any href link (including when
896 # URL-encoded) should refer to the decomposed name of the file
897 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898 $href = normalize('D', $href); # Normalization Form D (decomposition)
899 }
900
901 $href = encode($content_encoding,$href);
902 }
903
904 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
905 $href = &unicode::filename_to_url($href);
906
907 &ghtml::urlsafe ($href);
908
909 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
910 print STDERR "******DEBUG: href=$href\n";
911 }
912
913
914 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
915 } else {
916 # link is to some other type of file (e.g., an image) so we'll
917 # need to associate that file
918 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919 }
920}
921
922sub retrieve_css_associated_files {
923 my $self = shift (@_);
924 my ($css_filename) = @_;
925
926 my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928 open (CSSFILE, $css_filename) || return;
929 sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933 close CSSFILE;
934}
935
936sub acquire_css_associated_files_from_text_block {
937 my $self = shift (@_);
938 my ($text, $css_filename_dirname) = @_;
939
940 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
941 foreach my $img_url (@image_urls)
942 {
943 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
944 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
945
946 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
947 }
948}
949
950sub acquire_css_associated_files {
951 my $self = shift(@_);
952
953 my ($doc_obj, $section) = @_;
954
955 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956 {
957 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958 }
959}
960
961sub add_file {
962 my $self = shift (@_);
963 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964 my ($newname);
965
966 my $filename = $href;
967 if ($base_dir eq "") {
968 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969 # remove http://
970 $filename =~ s/^[^:]*:\/\///;
971 }
972 else {
973 # remove http:/ thereby leaving one slash at the start as
974 # part of full pathname
975 $filename =~ s/^[^:]*:\///;
976 }
977 }
978 else {
979 # remove http://
980 $filename =~ s/^[^:]*:\/\///;
981 }
982
983 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
984 $filename =~ s@\/@\\@g;
985 }
986
987 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
988
989 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
990 # we are processing a tidytmp file - want paths to be in import
991 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
992 }
993
994 # Replace %XX's in URL with decoded value if required. Note that the
995 # filename may include the %XX in some situations. If the *original*
996 # file's name was in URL encoding, the following method will not decode
997 # it.
998 my $unicode_filename = $filename;
999 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
1000
1001 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
1002 # (it seems) accompanying file
1003 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1004
1005 my $content_encoding= $self->{'content_encoding'} || "utf8";
1006
1007 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1008 # The filenames that come through the HTML file have been decoded
1009 # into Unicode aware Perl strings. Need to convert them back
1010 # to their initial raw-byte encoding to match the file that
1011 # exists on the file system
1012 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1013 }
1014 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1015 # HFS+ is UTF8 with decompostion
1016 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1017 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
1018
1019 }
1020 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1021 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1022
1023 if (defined $long_filename) {
1024 my $short_filename = Win32::GetLongPathName($long_filename);
1025 $filename = $short_filename;
1026 }
1027# else {
1028# print STDERR "***** failed to map href to real file:\n";
1029# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1030# }
1031 }
1032 else {
1033 my $outhandle = $self->{'outhandle'};
1034 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1035 print $outhandle " in file system encoding of href: $href\n";
1036 print $outhandle " No character encoding done.\n";
1037 }
1038
1039
1040 # some special processing if the intended filename was converted to utf8, but
1041 # the actual file still needs to be renamed
1042 #if (!&util::fd_exists($filename)) {
1043 if (!&FileUtils::fileExists($filename)) {
1044 # try the original filename stored in map
1045 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1046 print STDERR "******!! orig filename did not exist: $filename\n";
1047 }
1048
1049## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
1050
1051 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
1052
1053 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1054 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
1055 }
1056
1057 if (defined $original_filename && -e $original_filename) {
1058 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1059 print STDERR "****** Found match!\n";
1060 }
1061 $filename = $original_filename;
1062 }
1063 }
1064
1065 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1066
1067 if ($rl == 0) {
1068 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1069 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
1070 }
1071 else {
1072 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
1073 }
1074 }
1075
1076 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1077 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1078 }
1079 # add the original image file as a source file
1080 if (!$self->{'processing_tmp_files'} ) {
1081 $doc_obj->associate_source_file($filename);
1082 }
1083 if ($self->{'rename_assoc_files'}) {
1084 if (defined $self->{'aux_files'}->{$href}) {
1085 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1086 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1087 } else {
1088 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1089 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1090 $self->inc_filecount ();
1091 }
1092 $doc_obj->associate_file($filename, $newname, undef, $section);
1093 return "_httpdocimg_/$newname";
1094 } else {
1095 if(&unicode::is_url_encoded($unicode_filename)) {
1096 # use the possibly-decoded filename instead to avoid double URL encoding
1097 ($newname) = $filename =~ m/([^\/\\]*)$/;
1098 } else {
1099 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1100 }
1101
1102 # Make sure this name uses only ASCII characters.
1103 # We use either base64 or URL encoding, as these preserve original encoding
1104 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1105
1106### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1107 $doc_obj->associate_file($filename, $newname, undef, $section);
1108
1109 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1110 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1111 my $newname_url = $newname;
1112 $newname_url = &unicode::filename_to_url($newname_url);
1113 return "_httpdocimg_/$newname_url";
1114 }
1115}
1116
1117
1118sub format_link {
1119 my $self = shift (@_);
1120 my ($link, $base_dir, $file) = @_;
1121
1122 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1123 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1124
1125 $hash_part = "" if !defined $hash_part;
1126 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1127 my $outhandle = $self->{'outhandle'};
1128 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1129 if $self->{'verbosity'};
1130 return ($link, "", 0);
1131 }
1132
1133 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1134 my $type = $1;
1135 my $before_hash_file = $before_hash;
1136
1137 if ($link =~ m/^(http|ftp):/i) {
1138
1139 # Turn url (using /) into file name (possibly using \ on windows)
1140 my @http_dir_split = split('/', $before_hash_file);
1141 $before_hash_file = &FileUtils::filenameConcatenate(@http_dir_split);
1142 }
1143
1144 # want to maintain two version of "before_hash": one representing the URL, the other using filesystem specific directory separator
1145 $before_hash_file = $self->eval_dir_dots($before_hash_file);
1146 my $before_hash_url = $before_hash_file;
1147 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1148 $before_hash_url =~ s@\\@\/@g;
1149 }
1150
1151 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
1152
1153 my $rl = 0;
1154 $rl = 1 if (-e $linkfilename);
1155
1156 # make sure there's a slash on the end if it's a directory
1157 if ($before_hash_url !~ m/\/$/) {
1158 $before_hash_url .= "/" if (-d $linkfilename);
1159 }
1160 return ($type . $before_hash_url, $hash_part, $rl);
1161
1162 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1163
1164 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1165
1166 # the first directory will be the domain name if file_is_url
1167 # to generate archives, otherwise we'll assume all files are
1168 # from the same site and base_dir is the root
1169
1170 if ($self->{'file_is_url'}) {
1171 my @dirs = split /[\/\\]/, $file;
1172 my $domname = shift (@dirs);
1173 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
1174 $before_hash =~ s@\\@/@g; # for windows
1175 }
1176 else
1177 {
1178 # see if link shares directory with source document
1179 # => turn into relative link if this is so!
1180
1181 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1182 # too difficult doing a pattern match with embedded '\'s...
1183 my $win_before_hash=$before_hash;
1184 $win_before_hash =~ s@(\\)+@/@g;
1185 # $base_dir is already similarly "converted" on windows.
1186 if ($win_before_hash =~ s@^$base_dir/@@o) {
1187 # if this is true, we removed a prefix
1188 $before_hash=$win_before_hash;
1189 }
1190 }
1191 else {
1192 # before_hash has lost leading slash by this point,
1193 # -> add back in prior to substitution with $base_dir
1194 $before_hash = "/$before_hash";
1195
1196 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
1197 $before_hash =~ s@^$base_dir/@@;
1198 }
1199 }
1200 } else {
1201 # Turn relative file path into full path
1202 my $dirname = &File::Basename::dirname($file);
1203 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
1204 $before_hash = $self->eval_dir_dots($before_hash);
1205 $before_hash =~ s@\\@/@g; # for windows
1206 }
1207
1208 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1209
1210
1211# print STDERR "**** linkfilename = $linkfilename\n";
1212# if (!&util::fd_exists($linkfilename)) {
1213# print STDERR "***** Warning: Could not find $linkfilename\n";
1214# }
1215
1216
1217 # make sure there's a slash on the end if it's a directory
1218 if ($before_hash !~ m/\/$/) {
1219 $before_hash .= "/" if (-d $linkfilename);
1220 }
1221
1222# print STDERR "*** returning: $before_hash\n";
1223
1224 return ("http://" . $before_hash, $hash_part, 1);
1225 } else {
1226 # mailto, news, nntp, telnet, javascript or gopher link
1227 return ($before_hash, "", 0);
1228 }
1229}
1230
1231sub extract_first_NNNN_characters {
1232 my $self = shift (@_);
1233 my ($textref, $doc_obj, $thissection) = @_;
1234
1235 foreach my $size (split /,/, $self->{'first'}) {
1236 my $tmptext = $$textref;
1237 # skip to the body
1238 $tmptext =~ s/.*<body[^>]*>//i;
1239 # remove javascript
1240 $tmptext =~ s@<script.*?</script>@ @sig;
1241 $tmptext =~ s/<[^>]*>/ /g;
1242 $tmptext =~ s/&nbsp;/ /g;
1243 $tmptext =~ s/^\s+//;
1244 $tmptext =~ s/\s+$//;
1245 $tmptext =~ s/\s+/ /gs;
1246 $tmptext = &unicode::substr ($tmptext, 0, $size);
1247 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
1248 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1249 }
1250}
1251
1252
1253sub extract_metadata {
1254 my $self = shift (@_);
1255 my ($textref, $metadata, $doc_obj, $section) = @_;
1256 my $outhandle = $self->{'outhandle'};
1257 # if we don't want metadata, we may as well not be here ...
1258 return if (!defined $self->{'metadata_fields'});
1259
1260 my $separator = $self->{'metadata_field_separator'};
1261 if ($separator eq "") {
1262 undef $separator;
1263 }
1264
1265 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1266 # html meta, 'value' is the metadata name for greenstone to use
1267 my %find_fields = ();
1268
1269 my %creator_fields = (); # short-cut for lookups
1270
1271
1272 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1273 $field =~ s/^\s+//; # remove leading whitespace
1274 $field =~ s/\s+$//; # remove trailing whitespace
1275
1276 # support tag<tagname>
1277 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1278 # "$2" is the user's preferred gs metadata name
1279 $find_fields{lc($1)}=$2; # lc = lowercase
1280 } else { # no <tagname> for mapping
1281 # "$field" is the user's preferred gs metadata name
1282 $find_fields{lc($field)}=$field; # lc = lowercase
1283 }
1284 }
1285
1286 if (defined $self->{'hunt_creator_metadata'} &&
1287 $self->{'hunt_creator_metadata'} == 1 ) {
1288 my @extra_fields =
1289 (
1290 'author',
1291 'author.email',
1292 'creator',
1293 'dc.creator',
1294 'dc.creator.corporatename',
1295 );
1296
1297 # add the creator_metadata fields to search for
1298 foreach my $field (@extra_fields) {
1299 $creator_fields{$field}=0; # add to lookup hash
1300 }
1301 }
1302
1303
1304 # find the header in the html file, which has the meta tags
1305 $$textref =~ m@<head>(.*?)</head>@si;
1306
1307 my $html_header=$1;
1308
1309 # go through every <meta... tag defined in the html and see if it is
1310 # one of the tags we want to match.
1311
1312 # special case for title - we want to remember if its been found
1313 my $found_title = 0;
1314 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1315 $html_header =~ m/^/; # match the start of the string, for \G assertion
1316
1317 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1318 my $metatag=$1;
1319 my ($tag, $value);
1320
1321 # find the tag name
1322 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1323 $tag=$2;
1324 # in case they're not using " or ', but they should...
1325 if (! $tag) {
1326 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1327 $tag=$1;
1328 }
1329
1330 if (!defined $tag) {
1331 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1332 next;
1333 }
1334
1335 # don't need to assign this field if it was passed in from a previous
1336 # (recursive) plugin
1337 if (defined $metadata->{$tag}) {next}
1338
1339 # find the tag content
1340 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1341 $value=$2;
1342
1343 # The following code assigns the metaname to value if value is
1344 # empty. Why would we do this?
1345 #if (! $value) {
1346 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1347 # $value=$1;
1348 #}
1349 if (!defined $value || $value eq "") {
1350 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1351 next;
1352 }
1353
1354 # clean up and add
1355 $value =~ s/\s+/ /gs;
1356 chomp($value); # remove trailing \n, if any
1357 if (exists $creator_fields{lc($tag)}) {
1358 # map this value onto greenstone's "Creator" metadata
1359 $tag='Creator';
1360 } elsif (!exists $find_fields{lc($tag)}) {
1361 next; # don't want this tag
1362 } else {
1363 # get the user's preferred capitalisation
1364 $tag = $find_fields{lc($tag)};
1365 }
1366 if (lc($tag) eq "title") {
1367 $found_title = 1;
1368 }
1369
1370 if ($self->{'verbosity'} > 2) {
1371 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1372 }
1373
1374 if ($tag =~ /\./) {
1375 # there is a . so has a namespace, add ex.
1376 $tag = "ex.$tag";
1377 }
1378 if (defined $separator) {
1379 my @values = split($separator, $value);
1380 foreach my $v (@values) {
1381 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1382 }
1383 }
1384 else {
1385 $doc_obj->add_utf8_metadata($section, $tag, $value);
1386 }
1387 }
1388
1389 # TITLE: extract the document title
1390 if (exists $find_fields{'title'} && !$found_title) {
1391 # we want a title, and didn't find one in the meta tags
1392 # see if there's a <title> tag
1393 my $title;
1394 my $from = ""; # for debugging output only
1395 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1396 $title = $1;
1397 $from = "<title> tags";
1398 }
1399
1400 if (!defined $title) {
1401 $from = "first 100 chars";
1402 # if no title use first 100 or so characters
1403 $title = $$textref;
1404 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1405 $title =~ s/^.*?<body>//si;
1406 # ignore javascript!
1407 $title =~ s@<script.*?</script>@ @sig;
1408 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1409 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1410 $title = substr ($title, 0, 100);
1411 $title =~ s/\s\S*$/.../;
1412 }
1413 $title =~ s/<[^>]*>/ /g; # remove html tags
1414 $title =~ s/&nbsp;/ /g;
1415 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1416 $title =~ s/\s+/ /gs; # collapse multiple spaces
1417 $title =~ s/^\s*//; # remove leading spaces
1418 $title =~ s/\s*$//; # remove trailing spaces
1419
1420 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1421 $title =~ s/^\s+//s; # in case title_sub introduced any...
1422 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1423 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1424 if ($self->{'verbosity'} > 2);
1425 }
1426
1427 # add FileFormat metadata
1428 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1429
1430 # Special, for metadata names such as tagH1 - extracts
1431 # the text between the first <H1> and </H1> tags into "H1" metadata.
1432
1433 foreach my $field (keys %find_fields) {
1434 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1435 my $tag = $1;
1436 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1437 my $content = $1;
1438 $content =~ s/&nbsp;/ /g;
1439 $content =~ s/<[^>]*>/ /g;
1440 $content =~ s/^\s+//;
1441 $content =~ s/\s+$//;
1442 $content =~ s/\s+/ /gs;
1443 if ($content) {
1444 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1445 $tag =~ s/^tag//i;
1446 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1447 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1448 if ($self->{'verbosity'} > 2);
1449 }
1450 }
1451 }
1452}
1453
1454
1455# evaluate any "../" to next directory up
1456# evaluate any "./" as here
1457sub eval_dir_dots {
1458 my $self = shift (@_);
1459 my ($filename) = @_;
1460 my $dirsep_os = &util::get_os_dirsep();
1461 my @dirsep = split(/$dirsep_os/,$filename);
1462
1463 my @eval_dirs = ();
1464 foreach my $d (@dirsep) {
1465 if ($d eq "..") {
1466 pop(@eval_dirs);
1467
1468 } elsif ($d eq ".") {
1469 # do nothing!
1470
1471 } else {
1472 push(@eval_dirs,$d);
1473 }
1474 }
1475
1476 # Need to fiddle with number of elements in @eval_dirs if the
1477 # first one is the empty string. This is because of a
1478 # modification to FileUtils::filenameConcatenate that supresses the addition
1479 # of a leading '/' character (or \ if windows) (intended to help
1480 # filename cat with relative paths) if the first entry in the
1481 # array is the empty string. Making the array start with *two*
1482 # empty strings is a way to defeat this "smart" option.
1483 #
1484 if (scalar(@eval_dirs) > 0) {
1485 if ($eval_dirs[0] eq ""){
1486 unshift(@eval_dirs,"");
1487 }
1488 }
1489
1490 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
1491 return $evaluated_filename;
1492}
1493
1494sub replace_usemap_links {
1495 my $self = shift (@_);
1496 my ($front, $link, $back) = @_;
1497
1498 # remove quotes from link at start and end if necessary
1499 if ($link=~/^[\"\']/) {
1500 $link=~s/^[\"\']//;
1501 $link=~s/[\"\']$//;
1502 $front.='"';
1503 $back="\"$back";
1504 }
1505
1506 $link =~ s/^\.\///;
1507 return $front . $link . $back;
1508}
1509
1510sub inc_filecount {
1511 my $self = shift (@_);
1512
1513 if ($self->{'file_num'} == 1000) {
1514 $self->{'dir_num'} ++;
1515 $self->{'file_num'} = 0;
1516 } else {
1517 $self->{'file_num'} ++;
1518 }
1519}
1520
1521
1522# Extend read_file so that strings like &eacute; are
1523# converted to UTF8 internally.
1524#
1525# We don't convert &lt; or &gt; or &amp; or &quot; in case
1526# they interfere with the GML files
1527
1528sub read_file {
1529 my $self = shift(@_);
1530 my ($filename, $encoding, $language, $textref) = @_;
1531
1532 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1533
1534 # Convert entities to their Unicode code-point equivalents
1535 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1536 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1537 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1538
1539}
1540
1541sub HB_read_html_file {
1542 my $self = shift (@_);
1543 my ($htmlfile, $text) = @_;
1544
1545 # load in the file
1546 if (!open (FILE, $htmlfile)) {
1547 print STDERR "ERROR - could not open $htmlfile\n";
1548 return;
1549 }
1550
1551 my $foundbody = 0;
1552 $self->HB_gettext (\$foundbody, $text, "FILE");
1553 close FILE;
1554
1555 # just in case there was no <body> tag
1556 if (!$foundbody) {
1557 $foundbody = 1;
1558 open (FILE, $htmlfile) || return;
1559 $self->HB_gettext (\$foundbody, $text, "FILE");
1560 close FILE;
1561 }
1562 # text is in utf8
1563}
1564
1565# converts the text to utf8, as ghtml does that for &eacute; etc.
1566sub HB_gettext {
1567 my $self = shift (@_);
1568 my ($foundbody, $text, $handle) = @_;
1569
1570 my $line = "";
1571 while (defined ($line = <$handle>)) {
1572 # look for body tag
1573 if (!$$foundbody) {
1574 if ($line =~ s/^.*<body[^>]*>//i) {
1575 $$foundbody = 1;
1576 } else {
1577 next;
1578 }
1579 }
1580
1581 # check for symbol fonts
1582 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1583 my $font = $1;
1584 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1585 if ($font !~ m/^arial$/i);
1586 }
1587
1588 $$text .= $line;
1589 }
1590
1591 if ($self->{'input_encoding'} eq "iso_8859_1") {
1592 # convert to utf-8
1593 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1594 }
1595 # convert any alphanumeric character entities to their utf-8
1596 # equivalent for indexing purposes
1597 #&ghtml::convertcharentities ($$text);
1598
1599 $$text =~ s/\s+/ /g; # remove \n's
1600
1601 # At this point $$text is a binary byte string
1602 # => turn it into a Unicode aware string, so full
1603 # Unicode aware pattern matching can be used.
1604 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1605 #
1606
1607 $$text = decode("utf8",$$text);
1608}
1609
1610sub HB_clean_section {
1611 my $self = shift (@_);
1612 my ($section) = @_;
1613
1614 # remove tags without a starting tag from the section
1615 my ($tag, $tagstart);
1616 while ($section =~ m/<\/([^>]{1,10})>/) {
1617 $tag = $1;
1618 $tagstart = index($section, "<$tag");
1619 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1620 $section =~ s/<\/$tag>//;
1621 }
1622
1623 # remove extra paragraph tags
1624 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1625
1626 # remove extra stuff at the end of the section
1627 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1628
1629 # add a newline at the beginning of each paragraph
1630 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1631
1632 # add a newline every 80 characters at a word boundary
1633 # Note: this regular expression puts a line feed before
1634 # the last word in each section, even when it is not
1635 # needed.
1636 $section =~ s/(.{1,80})\s/$1\n/g;
1637
1638 # fix up the image links
1639 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1640 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1641 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1642 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1643
1644 return $section;
1645}
1646
1647# Will convert the oldHDL format to the new HDL format (using the Section tag)
1648sub convert_to_newHDLformat
1649{
1650 my $self = shift (@_);
1651 my ($file,$cnfile) = @_;
1652 my $input_filename = $file;
1653 my $tmp_filename = $cnfile;
1654
1655 # write HTML tmp file with new HDL format
1656 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1657
1658 # read in the file and do basic html cleaning (removing header etc)
1659 my $html = "";
1660 $self->HB_read_html_file ($input_filename, \$html);
1661
1662 # process the file one section at a time
1663 my $curtoclevel = 1;
1664 my $firstsection = 1;
1665 my $toclevel = 0;
1666 while (length ($html) > 0) {
1667 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1668 $toclevel = $3;
1669 my $title = $4;
1670 my $sectiontext = "";
1671 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1672 $sectiontext = $1;
1673 } else {
1674 $sectiontext = $html;
1675 $html = "";
1676 }
1677
1678 # remove tags and extra spaces from the title
1679 $title =~ s/<\/?[^>]+>//g;
1680 $title =~ s/^\s+|\s+$//g;
1681
1682 # close any sections below the current level and
1683 # create a new section (special case for the firstsection)
1684 print PROD "<!--\n";
1685 while (($curtoclevel > $toclevel) ||
1686 (!$firstsection && $curtoclevel == $toclevel)) {
1687 $curtoclevel--;
1688 print PROD "</Section>\n";
1689 }
1690 if ($curtoclevel+1 < $toclevel) {
1691 print STDERR "WARNING - jump in toc levels in $input_filename " .
1692 "from $curtoclevel to $toclevel\n";
1693 }
1694 while ($curtoclevel < $toclevel) {
1695 $curtoclevel++;
1696 }
1697
1698 if ($curtoclevel == 1) {
1699 # add the header tag
1700 print PROD "-->\n";
1701 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1702 print PROD "<!--\n";
1703 }
1704
1705 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1706
1707 print PROD "-->\n";
1708
1709 # clean up the section html
1710 $sectiontext = $self->HB_clean_section($sectiontext);
1711
1712 print PROD "$sectiontext\n";
1713
1714 } else {
1715 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1716 "\nin $input_filename\n";
1717 last;
1718 }
1719 $firstsection = 0;
1720 }
1721
1722 print PROD "<!--\n";
1723 while ($curtoclevel > 0) {
1724 $curtoclevel--;
1725 print PROD "</Section>\n";
1726 }
1727 print PROD "-->\n";
1728
1729 close (PROD) || die("Error Closing File: $tmp_filename $!");
1730
1731 return $tmp_filename;
1732}
1733
1734sub shorten {
1735 my $self = shift (@_);
1736 my ($text) = @_;
1737
1738 return "\"$text\"" if (length($text) < 100);
1739
1740 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1741 substr ($text, length($text)-50) . "\"";
1742}
1743
1744sub convert_tidy_or_oldHDL_file
1745{
1746 my $self = shift (@_);
1747 my ($file) = @_;
1748 my $input_filename = $file;
1749
1750 if (-d $input_filename)
1751 {
1752 return $input_filename;
1753 }
1754
1755 # get the input filename
1756 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1757 my $base_dirname = $dirname;
1758 $suffix = lc($suffix);
1759
1760 # derive tmp filename from input filename
1761 # Remove any white space from filename -- no risk of name collision, and
1762 # makes later conversion by utils simpler. Leave spaces in path...
1763 # tidy up the filename with space, dot, hyphen between
1764 $tailname =~ s/\s+//g;
1765 $tailname =~ s/\.+//g;
1766 $tailname =~ s/\-+//g;
1767 # convert to utf-8 otherwise we have problems with the doc.xml file
1768 # later on
1769 &unicode::ensure_utf8(\$tailname);
1770
1771 # softlink to collection tmp dir
1772 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1773 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1774
1775 my $test_dirname = "";
1776 my $f_separator = &util::get_os_dirsep();
1777
1778 if ($dirname =~ m/import$f_separator/)
1779 {
1780 $test_dirname = $'; #'
1781
1782 #print STDERR "init $'\n";
1783
1784 while ($test_dirname =~ m/[$f_separator]/)
1785 {
1786 my $folderdirname = $`;
1787 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
1788 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1789 $test_dirname = $'; #'
1790 }
1791 }
1792
1793 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1794
1795 # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
1796 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1797 {
1798 #convert the input file to a new style HDL
1799 my $hdl_output_filename = $input_filename;
1800 if ($self->{'old_style_HDL'})
1801 {
1802 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1803 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1804 }
1805
1806 #just for checking copy all other file from the base dir to tmp dir if it is not exists
1807 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1808 my @files = grep {!/^\.+$/} readdir(DIR);
1809 close(DIR);
1810
1811 foreach my $file (@files)
1812 {
1813 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1814 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
1815 if ((!-e $dest_file) && (!-d $src_file))
1816 {
1817 # just copy the original file back to the tmp directory
1818 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1819 }
1820 }
1821
1822 # tidy the input file
1823 my $tidy_output_filename = $hdl_output_filename;
1824 if ($self->{'use_realistic_book'})
1825 {
1826 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1827 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1828 }
1829 $tmp_filename = $tidy_output_filename;
1830 }
1831 else
1832 {
1833 if (!-e $tmp_filename)
1834 {
1835 # just copy the original file back to the tmp directory
1836 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1837 }
1838 }
1839
1840 return $tmp_filename;
1841}
1842
1843
1844# Will make the html input file as a proper XML file with removed font tag and
1845# image size added to the img tag.
1846# The tidying process takes place in a collection specific 'tmp' directory so
1847# that we don't accidentally damage the input.
1848sub tmp_tidy_file
1849{
1850 my $self = shift (@_);
1851 my ($file,$cnfile) = @_;
1852 my $input_filename = $file;
1853 my $tmp_filename = $cnfile;
1854
1855 # get the input filename
1856 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1857
1858 require HTML::TokeParser::Simple;
1859
1860 # create HTML parser to decode the input file
1861 my $parser = HTML::TokeParser::Simple->new($input_filename);
1862
1863 # write HTML tmp file without the font tag and image size are added to the img tag
1864 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1865 while (my $token = $parser->get_token())
1866 {
1867 # is it an img tag
1868 if ($token->is_start_tag('img'))
1869 {
1870 # get the attributes
1871 my $attr = $token->return_attr;
1872
1873 # get the full path to the image
1874 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
1875
1876 # set the width and height attribute
1877 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1878
1879 # recreate the tag
1880 print PROD "<img";
1881 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1882 print PROD ">";
1883 }
1884 # is it a font tag
1885 else
1886 {
1887 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1888 {
1889 # remove font tag
1890 print PROD "";
1891 }
1892 else
1893 {
1894 # print without changes
1895 print PROD $token->as_is;
1896 }
1897 }
1898 }
1899 close (PROD) || die("Error Closing File: $tmp_filename $!");
1900
1901 # run html-tidy on the tmp file to make it a proper XML file
1902
1903 my $outhandle = $self->{'outhandle'};
1904 print $outhandle "Converting HTML to be XML compliant:\n";
1905
1906 my $tidy_cmd = "tidy";
1907 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1908 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1909 if ($self->{'verbosity'} <= 2) {
1910 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1911 $tidy_cmd .= " 2>nul";
1912 }
1913 else {
1914 $tidy_cmd .= " 2>/dev/null";
1915 }
1916 print $outhandle " => $tidy_cmd\n";
1917 }
1918
1919 my $tidyfile = `$tidy_cmd`;
1920
1921 # write result back to the tmp file
1922 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1923 print PROD $tidyfile;
1924 close (PROD) || die("Error Closing File: $tmp_filename $!");
1925
1926 # return the output filename
1927 return $tmp_filename;
1928}
1929
1930sub associate_cover_image
1931{
1932 my $self = shift(@_);
1933 my ($doc_obj, $filename) = @_;
1934 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1935 {
1936 # we will have cover image in tidytmp, but want it from import
1937 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1938 }
1939 $self->SUPER::associate_cover_image($doc_obj, $filename);
1940}
1941
1942
19431;
Note: See TracBrowser for help on using the repository browser.