source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@27314

Last change on this file since 27314 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc.) out of util.pm into their own proper class, FileUtils. Use of the old functions in util.pm will prompt deprecation warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering support for other filesystems (HTTP, HDFS, WebDAV, etc.). Updated some key files to use the new functions, so no deprecation messages are thrown when importing/building the demo collection 'out of the box'.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 65.6 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use FileUtils;
47use XMLParser;
48
49use Image::Size;
50use File::Copy;
51
52sub BEGIN {
53 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
54}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasePlugin.process_exp}",
62 'type' => "regexp",
63 'deft' => &get_default_process_exp() },
64 { 'name' => "block_exp",
65 'desc' => "{BasePlugin.block_exp}",
66 'type' => 'regexp',
67 'deft' => &get_default_block_exp() },
68 { 'name' => "nolinks",
69 'desc' => "{HTMLPlugin.nolinks}",
70 'type' => "flag" },
71 { 'name' => "keep_head",
72 'desc' => "{HTMLPlugin.keep_head}",
73 'type' => "flag" },
74 { 'name' => "no_metadata",
75 'desc' => "{HTMLPlugin.no_metadata}",
76 'type' => "flag" },
77 { 'name' => "metadata_fields",
78 'desc' => "{HTMLPlugin.metadata_fields}",
79 'type' => "string",
80 'deft' => "Title" },
81 { 'name' => "metadata_field_separator",
82 'desc' => "{HTMLPlugin.metadata_field_separator}",
83 'type' => "string",
84 'deft' => "" },
85 { 'name' => "hunt_creator_metadata",
86 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
87 'type' => "flag" },
88 { 'name' => "file_is_url",
89 'desc' => "{HTMLPlugin.file_is_url}",
90 'type' => "flag" },
91 { 'name' => "assoc_files",
92 'desc' => "{HTMLPlugin.assoc_files}",
93 'type' => "regexp",
94 'deft' => &get_default_block_exp() },
95 { 'name' => "rename_assoc_files",
96 'desc' => "{HTMLPlugin.rename_assoc_files}",
97 'type' => "flag" },
98 { 'name' => "title_sub",
99 'desc' => "{HTMLPlugin.title_sub}",
100 'type' => "string",
101 'deft' => "" },
102 { 'name' => "description_tags",
103 'desc' => "{HTMLPlugin.description_tags}",
104 'type' => "flag" },
105 # retain this for backward compatibility (w3mir option was replaced by
106 # file_is_url)
107 { 'name' => "w3mir",
108# 'desc' => "{HTMLPlugin.w3mir}",
109 'type' => "flag",
110 'hiddengli' => "yes"},
111 { 'name' => "no_strip_metadata_html",
112 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
113 'type' => "string",
114 'deft' => "",
115 'reqd' => "no"},
116 { 'name' => "sectionalise_using_h_tags",
117 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
118 'type' => "flag" },
119 { 'name' => "use_realistic_book",
120 'desc' => "{HTMLPlugin.tidy_html}",
121 'type' => "flag"},
122 { 'name' => "old_style_HDL",
123 'desc' => "{HTMLPlugin.old_style_HDL}",
124 'type' => "flag"},
125 {'name' => "processing_tmp_files",
126 'desc' => "{BasePlugin.processing_tmp_files}",
127 'type' => "flag",
128 'hiddengli' => "yes"}
129 ];
130
131my $options = { 'name' => "HTMLPlugin",
132 'desc' => "{HTMLPlugin.desc}",
133 'abstract' => "no",
134 'inherits' => "yes",
135 'args' => $arguments };
136
137
138sub new {
139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141 push(@$pluginlist, $class);
142
143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
145
146
147 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
148
149 if ($self->{'w3mir'}) {
150 $self->{'file_is_url'} = 1;
151 }
152 $self->{'aux_files'} = {};
153 $self->{'dir_num'} = 0;
154 $self->{'file_num'} = 0;
155
156 return bless $self, $class;
157}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if have eg <script language="javascript" src="img/lib.js@123">
161# blocking is now done by reading through the file and recording all the
162# images and other files
163sub get_default_block_exp {
164 my $self = shift (@_);
165
166 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
167 return "";
168}
169
170sub get_default_process_exp {
171 my $self = shift (@_);
172
173 # the last option is an attempt to encode the concept of an html query ...
174 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
175}
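# For example, with the default process_exp above, filenames such as
# "index.html", "page.shtml", "report.php3" and "query.cgi" are handled by this
# plugin; the final alternative (.+\?.+=.*) is what catches URL-style filenames
# that carry a query string, e.g. "lookup.asp?id=3".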
176
177sub store_block_files
178{
179 my $self =shift (@_);
180 my ($filename_full_path, $block_hash) = @_;
181
182 my $html_fname = $filename_full_path;
183
184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
186
187 # read in file ($text will be in utf8)
188 my $raw_text = "";
189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
190
191 my $textref = \$raw_text;
192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
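# Roughly, the three substitutions above take a fragment such as
#   <p>&lt;tag&gt; &eacute;</p>
# through the stages
#   <p>&zlt;tag&zgt; &eacute;</p>   (protect the five "structural" entities)
#   <p>&zlt;tag&zgt; é</p>          (convert everything else to UTF-8)
#   <p>&lt;tag&gt; é</p>            (restore the protected entities)
# so only &lt;, &gt;, &amp;, &quot; and &nbsp; survive as entities.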
200
201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
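# The $attval pattern above matches either a double-quoted value or a bare
# (unquoted, whitespace-free) token, so both of the following yield a capture:
#   <img src="images/logo.png">   captures "images/logo.png"  (quotes included)
#   <img src=images/logo.png>     captures images/logo.png
# The surrounding quotes, if any, are stripped a few lines further down.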
208
209 if(!defined $self->{'unicode_to_original_filename'}) {
210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
211 $self->{'unicode_to_original_filename'} = {};
212 }
213
214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
215
216 # remove quotes from link at start and end if necessary
217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
220 }
221
222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
224 $raw_link =~ s/([^&])\#.*$/$1/s;
225
226 # some links may just be anchor names
227 next unless ($raw_link =~ /\S+/);
228
229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
230 # Turn relative file path into full path
231 my $dirname = &File::Basename::dirname($filename_full_path);
232 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
233 }
234 $raw_link = $self->eval_dir_dots($raw_link);
235
236 # this is the actual filename on the filesystem (that the link refers to)
237 my $url_original_filename = $self->opt_url_decode($raw_link);
238
239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
240
241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
244
245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
277 my $outhandle = $self->{'outhandle'};
278
279 print $outhandle "URL Encoding $url_original_filename\n";
280 print $outhandle " ->$unicode_url_original_filename\n";
281
282 # Allow for possibility of raw byte version and Unicode versions of file
283 &util::block_filename($block_hash,$unicode_url_original_filename);
284 }
285
286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
287 &util::block_filename($block_hash,$url_original_filename);
288
289 }
290}
291
292# Given a filename in any encoding, will URL decode it to get back the original filename
293# in the original encoding. Because this method is intended to work out the *original*
294 # filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
295# string already exists in the local folder.
296#
297sub opt_url_decode {
298 my $self = shift (@_);
299 my ($raw_link) = @_;
300
301
302 # Replace %XX's in URL with decoded value if required.
303 # Note that the filename may include the %XX in some situations
304
305## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
308 if (!-e $raw_link) {
309 $raw_link = &unicode::url_decode($raw_link,1);
310 }
311 }
312
313 return $raw_link;
314}
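# For example (hypothetical filenames):
#   opt_url_decode("import/caf%C3%A9.html")  returns "import/café.html"
#     (the %XX escapes are decoded because no file literally named
#      "caf%C3%A9.html" exists on disk)
#   opt_url_decode("import/100%25.html")     returns "import/100%25.html"
#     if a file of that exact (still-encoded) name is present in the folder.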
315
316sub read_into_doc_obj
317{
318 my $self = shift (@_);
319 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
321 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
322
323 # Lookup content_encoding worked out in file_block pass for this file
324 # Store it under the local name 'content_encoding' so it's nice and
325 # easy to access
326 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
328 # get the input file
329 my $input_filename = $file;
330 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331 $suffix = lc($suffix);
332 my $tidy_filename;
333 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
334 {
335 # because the document has to be sectionalized set the description tags
336 $self->{'description_tags'} = 1;
337
338 # set the file to be tidied
339 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
340
341 # get the tidied file
342 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
343 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
344
345 # derive tmp filename from input filename
346 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348 # set the new input file and base_dir to be from the tidied file
349 $file = "$tailname$suffix";
350 $base_dir = $dirname;
351 }
352
353 # call the parent read_into_doc_obj
354 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
355 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
356 {
357 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
358 my $collect_file = &util::filename_within_collection($filename_full_path);
359 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
361 # build. So set it manually.
362 $doc_obj->set_source_path($filename_full_path);
363 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364 $doc_obj->set_converted_filename($collect_conv_file);
365
366 my $plugin_filename_encoding = $self->{'filename_encoding'};
367 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
369 }
370
371 delete $self->{'store_content_encoding'}->{$filename_full_path};
372 $self->{'content_encoding'} = undef;
373
374 return ($process_status,$doc_obj);
375}
376
377# do plugin specific processing of doc_obj
378sub process {
379 my $self = shift (@_);
380 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381 my $outhandle = $self->{'outhandle'};
382
383 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
384 # this makes life so much easier... perl can cope with unix-style '/'s.
385 $base_dir =~ s@(\\)+@/@g;
386 $file =~ s@(\\)+@/@g;
387 }
388
389 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
390 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394 # And again
395 $upgraded_base_dir =~ s@(\\)+@/@g;
396 $upgraded_filename =~ s@(\\)+@/@g;
397
398 # Need to make sure there is a '/' on the end of upgraded_base_dir
399 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
400 $upgraded_base_dir .= "/";
401 }
402 }
403 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
404
405 # reset per-doc stuff...
406 $self->{'aux_files'} = {};
407 $self->{'dir_num'} = 0;
408 $self->{'file_num'} = 0;
409
410 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
411 # you can also include metadata in the format (X can be any number)
412 # <hX>Title<!--gsdl-metadata
413 # <Metadata name="name1">value1</Metadata>
414 # ...
415 # <Metadata name="nameN">valueN</Metadata>
416 #--></hX>
417 if ($self->{'sectionalise_using_h_tags'}) {
418 # description_tags should always be activated because we convert headings to description tags
419 $self->{'description_tags'} = 1;
420
421 my $arrSections = [];
422 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
423
424 if (scalar(@$arrSections)) {
425 my $strMetadata = $self->update_section_data($arrSections, -1);
426 if (length($strMetadata)) {
427 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428 $$textref =~ s/<\/body>/$strMetadata/ig;
429 }
430 }
431 }
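# As a rough sketch of the sectionalising step above: an input heading such as
#   <h1>Chapter 1</h1>
# is rewritten by process_heading() into an embedded description-tag comment:
#   <!--
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter 1</Metadata>
#       </Description>
#   -->
# Lower-level headings open nested <Section>s, equal-or-higher headings first close
# the open ones (see update_section_data), and any sections still open are closed
# just before </body> so the description_tags handling below can parse them.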
432
433 my $cursection = $doc_obj->get_top_section();
434
435 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436 unless $self->{'no_metadata'} || $self->{'description_tags'};
437
438 # Store URL for page as metadata - this can be used for an
439 # altavista style search interface. The URL won't be valid
440 # unless the file structure contains the domain name (i.e.
441 # like when w3mir is used to download a website).
442
443 # URL metadata (even invalid ones) are used to support internal
444 # links, so even if 'file_is_url' is off, still need to store info
445
446 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
447
448# my $utf8_file = $self->filename_to_utf8_metadata($file);
449# $utf8_file =~ s/&\#095;/_/g;
450# variable below used to be utf8_file
451
452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
454
455 my $web_url = "http://";
456 my $utf8_web_url = "http://";
457 if(defined $dirname) { # local directory
458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
467 $utf8_web_url = "ftp://";
468 }
469 $dirname = $self->eval_dir_dots($dirname);
470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
474 } else {
475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
477 }
478 $web_url =~ s/\\/\//g;
479 $utf8_web_url =~ s/\\/\//g;
480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
484 }
485
486
487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
489
490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
537 my $metaname = $1;
538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
552 $metavalue =~ s/\s+/ /sg;
553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
576 if ($$textref =~ m/\S/) {
577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
631 $self->{'css_assoc_files'} = {};
632
633 # single section document
634 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
635
636 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638 $self->acquire_css_associated_files($doc_obj, $cursection);
639
640 $self->{'css_assoc_files'} = {};
641 }
642
643 return 1;
644}
645
646
647sub process_heading
648{
649 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650 $strHeadingText = '' if (!defined($strHeadingText));
651
652 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654 my $strSecMetadata = '';
655 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656 {
657 $strSecMetadata .= $1;
658 }
659
660 $strHeadingText =~ s/^\s+//g;
661 $strHeadingText =~ s/\s+$//g;
662 $strSecMetadata =~ s/^\s+//g;
663 $strSecMetadata =~ s/\s+$//g;
664
665 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667 if (length($strSecMetadata)) {
668 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
669 }
670
671 $strMetadata .= "\t</Description>\n";
672
673 return "<!--" . $strMetadata . "-->";
674}
675
676
677sub update_section_data
678{
679 my ($self, $arrSections, $nCurTocNo) = @_;
680 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682 if ($nSections == 0) {
683 push @$arrSections, $nCurTocNo;
684 return $strBuffer;
685 }
686 $nLast = $arrSections->[$nSections - 1];
687 if ($nCurTocNo > $nLast) {
688 push @$arrSections, $nCurTocNo;
689 return $strBuffer;
690 }
691 for(my $i = $nSections - 1; $i >= 0; $i--) {
692 if ($nCurTocNo <= $arrSections->[$i]) {
693 $strBuffer .= "\n</Section>";
694 pop @$arrSections;
695 }
696 }
697 push @$arrSections, $nCurTocNo;
698 return $strBuffer;
699}
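# Worked example of the heading-level stack above: for the heading sequence
# h1, h2, h3, h2 the successive calls behave as follows
#   update_section_data($arr, 1) -> ""                           stack: [1]
#   update_section_data($arr, 2) -> ""                           stack: [1,2]
#   update_section_data($arr, 3) -> ""                           stack: [1,2,3]
#   update_section_data($arr, 2) -> "\n</Section>\n</Section>"   stack: [1,2]
# i.e. an equal-or-higher heading closes every section at the same or a deeper
# level before its own <Section> is opened by process_heading().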
700
701
702# note that process_section may be called multiple times for a single
703# section (relying on the fact that add_utf8_text appends the text to any
704# that may exist already).
705sub process_section {
706 my $self = shift (@_);
707 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
708
709 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
710 if(scalar(@styleTagsText) > 0)
711 {
712 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
713 foreach my $styleText (@styleTagsText)
714 {
715 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716 }
717 }
718
719 # trap links
720 if (!$self->{'nolinks'}) {
721 # usemap="./#index" not handled correctly => change to "#index"
722## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
723
724## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
725## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
726
727 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
728 $self->replace_usemap_links($1, $2, $3)/isge;
729
730 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
731 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
732
733## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
734# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
735 }
736
737 # trap images
738
739 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
740 # i.e. <a href="image"><img src="image"></a> in order to overcome a problem that
741 # turned regular text succeeding images into links. That is, by embedding <img> tags
742 # inside <a href=""></a>, the text following images was no longer misbehaving.
743 # However, there would be many occasions whereby images were not meant to link
744 # to their source images but where the images would link to another web page.
745 # To allow this, the no_image_links option was introduced: it would prevent
746 # the behaviour of embedding images into links that referenced the source images.
747
748 # Somewhere along the line, the problem of normal text turning into links when
749 # such text followed images which were not embedded in <a href=""></a> ceased
750 # to occur. This is why the following lines have been commented out (as well as
751 # two lines in replace_images). They appear to no longer apply.
752
753 # If at any time, there is a need for having images embedded in <a> anchor tags,
754 # then it might be better to turn that into an HTMLPlugin option rather than make
755 # it the default behaviour. Also, eventually, no_image_links needs to become
756 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
757
758 #if(!$self->{'no_image_links'}){
759 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
760 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
761 #}
762
763 # add text to document object
764 # turn \ into \\ so that the rest of greenstone doesn't think there
765 # is an escape code following. (Macro parsing loses them...)
766 $$textref =~ s/\\/\\\\/go;
767
768 $doc_obj->add_utf8_text($cursection, $$textref);
769}
770
771sub replace_images {
772 my $self = shift (@_);
773 my ($front, $link, $back, $base_dir,
774 $file, $doc_obj, $section) = @_;
775
776 # remove quotes from link at start and end if necessary
777 if ($link=~/^[\"\']/) {
778 $link=~s/^[\"\']//;
779 $link=~s/[\"\']$//;
780 $front.='"';
781 $back="\"$back";
782 }
783
784 $link =~ s/\n/ /g;
785
786 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787 # If the Word file path has spaces in it, wv messes up and you end up with
788 # absolute paths for the images, and without the "file://" prefix
789 # So check for this special case and massage the data to be correct
790 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
791 $link =~ s/^.*\\([^\\]+)$/$1/;
792 }
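# e.g. a broken wv link such as
#   C:\My Documents\report_files\image001.gif
# is reduced by the substitution above to just
#   image001.gif
# which can then be resolved relative to the HTML file in the usual way.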
793
794 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
798# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
799
800 my $anchor_name = $img_file;
801 #$anchor_name =~ s/^.*\///;
802 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804 my $image_link = $front . $img_file .$back;
805 return $image_link;
806
807 # The reasons for why the following two lines are no longer necessary can be
808 # found in subroutine process_section
809 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810 #return $anchor_link;
811
812 #return $front . $img_file . $back . $anchor_name;
813}
814
815sub replace_href_links {
816 my $self = shift (@_);
817 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
818
819 if($front =~ m/^<link / && $link =~ m/\.css"$/)
820 {
821 my $actual_link = $link;
822 $actual_link =~ s/^"(.*)"$/$1/;
823
824 my $directory = &File::Basename::dirname($file);
825
826 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
827 $self->retrieve_css_associated_files($css_filename);
828 }
829
830 # remove quotes from link at start and end if necessary
831 if ($link=~/^[\"\']/) {
832 $link=~s/^[\"\']//;
833 $link=~s/[\"\']$//;
834 $front.='"';
835 $back="\"$back";
836 }
837
838 # attempt to sort out targets - frames are not handled
839 # well in this plugin and some cases will screw things
840 # up - e.g. the _parent target (so we'll just remove
841 # them all ;-)
842 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844 $front =~ s/target=\"?_parent\"?//is;
845 $back =~ s/target=\"?_parent\"?//is;
846
847 if($link =~ m/^\#/s)
848 {
849 return $front . "_httpsamepagelink_" . $link . $back;
850 }
851
852 $link =~ s/\n/ /g;
853
854 # Find file referred to by $link on file system
855 # This is more complicated than it sounds when character encodings
856 # are taken into account
857 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
858
859 # href may use '\'s where '/'s should be on Windows
860 $href =~ s/\\/\//g;
861 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
862
863 ##### leave all these links alone (they won't be picked up by intermediate
864 ##### pages). I think that's safest when dealing with frames, targets etc.
865 ##### (at least until I think of a better way to do it). Problems occur with
866 ##### mailto links from within small frames, the intermediate page is displayed
867 ##### within that frame and can't be seen. There is still potential for this to
868 ##### happen even with html pages - the solution seems to be to somehow tell
869 ##### the browser from the server side to display the page being sent (i.e.
870 ##### the intermediate page) in the top level window - I'm not sure if that's
871 ##### possible - the following line should probably be deleted if that can be done
872 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
873
874 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
875 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
876
877 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
878
879 # Don't do any encoding for now, as not clear what
880 # the right thing to do is to support filename
881 # encoding on Windows when they are not UTF16
882 #
883 }
884 else {
885 # => Unix-based system
886
887 # If web page didn't give encoding, then default to utf8
888 my $content_encoding= $self->{'content_encoding'} || "utf8";
889
890 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892 }
893
894 # on Darwin, the unicode filenames are stored on the file
895 # system in decomposed form, so any href link (including when
896 # URL-encoded) should refer to the decomposed name of the file
897 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898 $href = normalize('D', $href); # Normalization Form D (decomposition)
899 }
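# (On Darwin the normalize('D', ...) call above means a precomposed "é" (U+00E9)
# in an href becomes "e" followed by a combining acute accent (U+0301), matching
# the decomposed form that HFS+ stores on disk.)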
900
901 $href = encode($content_encoding,$href);
902 }
903
904 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
905 $href = &unicode::filename_to_url($href);
906
907 &ghtml::urlsafe ($href);
908
909 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
910 print STDERR "******DEBUG: href=$href\n";
911 }
912
913
914 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
915 } else {
916 # link is to some other type of file (e.g., an image) so we'll
917 # need to associate that file
918 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919 }
920}
921
922sub retrieve_css_associated_files {
923 my $self = shift (@_);
924 my ($css_filename) = @_;
925
926 my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928 open (CSSFILE, $css_filename) || return;
929 sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933 close CSSFILE;
934}
935
936sub acquire_css_associated_files_from_text_block {
937 my $self = shift (@_);
938 my ($text, $css_filename_dirname) = @_;
939
940 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
941 foreach my $img_url (@image_urls)
942 {
943 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
944 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
945
946 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
947 }
948}
949
950sub acquire_css_associated_files {
951 my $self = shift(@_);
952
953 my ($doc_obj, $section) = @_;
954
955 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956 {
957 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958 }
959}
960
961sub add_file {
962 my $self = shift (@_);
963 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964 my ($newname);
965
966 my $filename = $href;
967 if ($base_dir eq "") {
968 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969 # remove http://
970 $filename =~ s/^[^:]*:\/\///;
971 }
972 else {
973 # remove http:/ thereby leaving one slash at the start as
974 # part of full pathname
975 $filename =~ s/^[^:]*:\///;
976 }
977 }
978 else {
979 # remove http://
980 $filename =~ s/^[^:]*:\/\///;
981 }
982
983 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
984
985 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
986 # we are processing a tidytmp file - want paths to be in import
987 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
988 }
989
990 # Replace %XX's in URL with decoded value if required. Note that the
991 # filename may include the %XX in some situations. If the *original*
992 # file's name was in URL encoding, the following method will not decode
993 # it.
994 my $unicode_filename = $filename;
995 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
996
997 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
998 # (it seems) accompanying file
999 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1000
1001 my $content_encoding= $self->{'content_encoding'} || "utf8";
1002
1003 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1004 # The filenames that come through the HTML file have been decoded
1005 # into Unicode aware Perl strings. Need to convert them back
1006 # to their initial raw-byte encoding to match the file that
1007 # exists on the file system
1008 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1009 }
1010 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1011 # HFS+ is UTF8 with decomposition
1012 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1013 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
1014
1015 }
1016 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1017 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1018
1019 if (defined $long_filename) {
1020 my $short_filename = Win32::GetShortPathName($long_filename);
1021 $filename = $short_filename;
1022 }
1023# else {
1024# print STDERR "***** failed to map href to real file:\n";
1025# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1026# }
1027 }
1028 else {
1029 my $outhandle = $self->{'outhandle'};
1030 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1031 print $outhandle " in file system encoding of href: $href\n";
1032 print $outhandle " No character encoding done.\n";
1033 }
1034
1035
1036 # some special processing if the intended filename was converted to utf8, but
1037 # the actual file still needs to be renamed
1038 #if (!&util::fd_exists($filename)) {
1039 if (!&FileUtils::fileExists($filename)) {
1040 # try the original filename stored in map
1041 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1042 print STDERR "******!! orig filename did not exist: $filename\n";
1043 }
1044
1045## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
1046
1047 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
1048
1049 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1050 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
1051 }
1052
1053 if (defined $original_filename && -e $original_filename) {
1054 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1055 print STDERR "****** Found match!\n";
1056 }
1057 $filename = $original_filename;
1058 }
1059 }
1060
1061 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1062
1063 if ($rl == 0) {
1064 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1065 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
1066 }
1067 else {
1068 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
1069 }
1070 }
1071
1072 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1073 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1074 }
1075 # add the original image file as a source file
1076 if (!$self->{'processing_tmp_files'} ) {
1077 $doc_obj->associate_source_file($filename);
1078 }
1079 if ($self->{'rename_assoc_files'}) {
1080 if (defined $self->{'aux_files'}->{$href}) {
1081 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1082 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1083 } else {
1084 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1085 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1086 $self->inc_filecount ();
1087 }
1088 $doc_obj->associate_file($filename, $newname, undef, $section);
1089 return "_httpdocimg_/$newname";
1090 } else {
1091 if(&unicode::is_url_encoded($unicode_filename)) {
1092 # use the possibly-decoded filename instead to avoid double URL encoding
1093 ($newname) = $filename =~ m/([^\/\\]*)$/;
1094 } else {
1095 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1096 }
1097
1098 # Make sure this name uses only ASCII characters.
1099 # We use either base64 or URL encoding, as these preserve original encoding
1100 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1101
1102### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1103 $doc_obj->associate_file($filename, $newname, undef, $section);
1104
1105 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1106 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1107 my $newname_url = $newname;
1108 $newname_url = &unicode::filename_to_url($newname_url);
1109 return "_httpdocimg_/$newname_url";
1110 }
1111}
1112
1113
1114sub format_link {
1115 my $self = shift (@_);
1116 my ($link, $base_dir, $file) = @_;
1117
1118 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1119 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1120
1121 $hash_part = "" if !defined $hash_part;
1122 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1123 my $outhandle = $self->{'outhandle'};
1124 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1125 if $self->{'verbosity'};
1126 return ($link, "", 0);
1127 }
1128
1129 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1130 my $type = $1;
1131
1132 if ($link =~ m/^(http|ftp):/i) {
1133 # Turn url (using /) into file name (possibly using \ on windows)
1134 my @http_dir_split = split('/', $before_hash);
1135 $before_hash = &FileUtils::filenameConcatenate(@http_dir_split);
1136 }
1137
1138 $before_hash = $self->eval_dir_dots($before_hash);
1139
1140 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1141
1142 my $rl = 0;
1143 $rl = 1 if (-e $linkfilename);
1144
1145 # make sure there's a slash on the end if it's a directory
1146 if ($before_hash !~ m/\/$/) {
1147 $before_hash .= "/" if (-d $linkfilename);
1148 }
1149 return ($type . $before_hash, $hash_part, $rl);
1150
1151 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1152
1153 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1154
1155 # the first directory will be the domain name if file_is_url
1156 # to generate archives, otherwise we'll assume all files are
1157 # from the same site and base_dir is the root
1158
1159 if ($self->{'file_is_url'}) {
1160 my @dirs = split /[\/\\]/, $file;
1161 my $domname = shift (@dirs);
1162 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
1163 $before_hash =~ s@\\@/@g; # for windows
1164 }
1165 else
1166 {
1167 # see if link shares directory with source document
1168 # => turn into relative link if this is so!
1169
1170 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1171 # too difficult doing a pattern match with embedded '\'s...
1172 my $win_before_hash=$before_hash;
1173 $win_before_hash =~ s@(\\)+@/@g;
1174 # $base_dir is already similarly "converted" on windows.
1175 if ($win_before_hash =~ s@^$base_dir/@@o) {
1176 # if this is true, we removed a prefix
1177 $before_hash=$win_before_hash;
1178 }
1179 }
1180 else {
1181 # before_hash has lost leading slash by this point,
1182 # -> add back in prior to substitution with $base_dir
1183 $before_hash = "/$before_hash";
1184
1185 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
1186 $before_hash =~ s@^$base_dir/@@;
1187 }
1188 }
1189 } else {
1190 # Turn relative file path into full path
1191 my $dirname = &File::Basename::dirname($file);
1192 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
1193 $before_hash = $self->eval_dir_dots($before_hash);
1194 }
1195
1196 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1197
1198
1199# print STDERR "**** linkfilename = $linkfilename\n";
1200# if (!&util::fd_exists($linkfilename)) {
1201# print STDERR "***** Warning: Could not find $linkfilename\n";
1202# }
1203
1204
1205 # make sure there's a slash on the end if it's a directory
1206 if ($before_hash !~ m/\/$/) {
1207 $before_hash .= "/" if (-d $linkfilename);
1208 }
1209
1210# print STDERR "*** returning: $before_hash\n";
1211
1212 return ("http://" . $before_hash, $hash_part, 1);
1213 } else {
1214 # mailto, news, nntp, telnet, javascript or gopher link
1215 return ($before_hash, "", 0);
1216 }
1217}
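# Two illustrative calls (hypothetical paths), following the code above:
#   format_link("http://www.example.com/docs/a.html#s2", "/import", "site/index.html")
#     -> ("http://www.example.com/docs/a.html", "#s2", $rl)
#        where $rl is 1 only if /import/www.example.com/docs/a.html exists locally
#   format_link("images/pic.gif", "/import", "site/index.html")
#     -> ("http://site/images/pic.gif", "", 1)
#        i.e. relative links are resolved against the source file's directory and
#        are always treated as internal (rl == 1) links.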
1218
1219sub extract_first_NNNN_characters {
1220 my $self = shift (@_);
1221 my ($textref, $doc_obj, $thissection) = @_;
1222
1223 foreach my $size (split /,/, $self->{'first'}) {
1224 my $tmptext = $$textref;
1225 # skip to the body
1226 $tmptext =~ s/.*<body[^>]*>//i;
1227 # remove javascript
1228 $tmptext =~ s@<script.*?</script>@ @sig;
1229 $tmptext =~ s/<[^>]*>/ /g;
1230 $tmptext =~ s/&nbsp;/ /g;
1231 $tmptext =~ s/^\s+//;
1232 $tmptext =~ s/\s+$//;
1233 $tmptext =~ s/\s+/ /gs;
1234 $tmptext = &unicode::substr ($tmptext, 0, $size);
1235 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1236 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1237 }
1238}
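# For instance, if $self->{'first'} is set to "50,100" this adds "First50" and
# "First100" metadata to the section, each holding the opening characters of the
# tag-stripped body text, typically ending in an ellipsis where the text was cut
# at a word boundary.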
1239
1240
1241sub extract_metadata {
1242 my $self = shift (@_);
1243 my ($textref, $metadata, $doc_obj, $section) = @_;
1244 my $outhandle = $self->{'outhandle'};
1245 # if we don't want metadata, we may as well not be here ...
1246 return if (!defined $self->{'metadata_fields'});
1247
1248 my $separator = $self->{'metadata_field_separator'};
1249 if ($separator eq "") {
1250 undef $separator;
1251 }
1252
1253 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1254 # html meta, 'value' is the metadata name for greenstone to use
1255 my %find_fields = ();
1256
1257 my %creator_fields = (); # short-cut for lookups
1258
1259
1260 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1261 $field =~ s/^\s+//; # remove leading whitespace
1262 $field =~ s/\s+$//; # remove trailing whitespace
1263
1264 # support tag<tagname>
1265 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1266 # "$2" is the user's preferred gs metadata name
1267 $find_fields{lc($1)}=$2; # lc = lowercase
1268 } else { # no <tagname> for mapping
1269 # "$field" is the user's preferred gs metadata name
1270 $find_fields{lc($field)}=$field; # lc = lowercase
1271 }
1272 }
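# For example, metadata_fields set to "Title,Subject<dc.Subject>" produces
#   %find_fields = ( 'title' => 'Title', 'subject' => 'dc.Subject' )
# so a <meta name="SUBJECT" content="..."> tag is picked up and saved under the
# user's preferred name dc.Subject (which, having a namespace, is stored with an
# "ex." prefix), while unlisted meta tags are skipped.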
1273
1274 if (defined $self->{'hunt_creator_metadata'} &&
1275 $self->{'hunt_creator_metadata'} == 1 ) {
1276 my @extra_fields =
1277 (
1278 'author',
1279 'author.email',
1280 'creator',
1281 'dc.creator',
1282 'dc.creator.corporatename',
1283 );
1284
1285 # add the creator_metadata fields to search for
1286 foreach my $field (@extra_fields) {
1287 $creator_fields{$field}=0; # add to lookup hash
1288 }
1289 }
1290
1291
1292 # find the header in the html file, which has the meta tags
1293 $$textref =~ m@<head>(.*?)</head>@si;
1294
1295 my $html_header=$1;
1296
1297 # go through every <meta... tag defined in the html and see if it is
1298 # one of the tags we want to match.
1299
1300 # special case for title - we want to remember if it's been found
1301 my $found_title = 0;
1302 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1303 $html_header =~ m/^/; # match the start of the string, for \G assertion
1304
1305 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1306 my $metatag=$1;
1307 my ($tag, $value);
1308
1309 # find the tag name
1310 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1311 $tag=$2;
1312 # in case they're not using " or ', but they should...
1313 if (! $tag) {
1314 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1315 $tag=$1;
1316 }
1317
1318 if (!defined $tag) {
1319 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1320 next;
1321 }
1322
1323 # don't need to assign this field if it was passed in from a previous
1324 # (recursive) plugin
1325 if (defined $metadata->{$tag}) {next}
1326
1327 # find the tag content
1328 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1329 $value=$2;
1330
1331 # The following code assigns the metaname to value if value is
1332 # empty. Why would we do this?
1333 #if (! $value) {
1334 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1335 # $value=$1;
1336 #}
1337 if (!defined $value || $value eq "") {
1338 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1339 next;
1340 }
1341
1342 # clean up and add
1343 $value =~ s/\s+/ /gs;
1344 chomp($value); # remove trailing \n, if any
1345 if (exists $creator_fields{lc($tag)}) {
1346 # map this value onto greenstone's "Creator" metadata
1347 $tag='Creator';
1348 } elsif (!exists $find_fields{lc($tag)}) {
1349 next; # don't want this tag
1350 } else {
1351 # get the user's preferred capitalisation
1352 $tag = $find_fields{lc($tag)};
1353 }
1354 if (lc($tag) eq "title") {
1355 $found_title = 1;
1356 }
1357
1358 if ($self->{'verbosity'} > 2) {
1359 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1360 }
1361
1362 if ($tag =~ /\./) {
1363 # there is a . so has a namespace, add ex.
1364 $tag = "ex.$tag";
1365 }
1366 if (defined $separator) {
1367 my @values = split($separator, $value);
1368 foreach my $v (@values) {
1369 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1370 }
1371 }
1372 else {
1373 $doc_obj->add_utf8_metadata($section, $tag, $value);
1374 }
1375 }
1376
1377 # TITLE: extract the document title
1378 if (exists $find_fields{'title'} && !$found_title) {
1379 # we want a title, and didn't find one in the meta tags
1380 # see if there's a <title> tag
1381 my $title;
1382 my $from = ""; # for debugging output only
1383 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1384 $title = $1;
1385 $from = "<title> tags";
1386 }
1387
1388 if (!defined $title) {
1389 $from = "first 100 chars";
1390 # if no title use first 100 or so characters
1391 $title = $$textref;
1392 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1393 $title =~ s/^.*?<body>//si;
1394 # ignore javascript!
1395 $title =~ s@<script.*?</script>@ @sig;
1396 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1397 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1398 $title = substr ($title, 0, 100);
1399 $title =~ s/\s\S*$/.../;
1400 }
1401 $title =~ s/<[^>]*>/ /g; # remove html tags
1402 $title =~ s/&nbsp;/ /g;
1403 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1404 $title =~ s/\s+/ /gs; # collapse multiple spaces
1405 $title =~ s/^\s*//; # remove leading spaces
1406 $title =~ s/\s*$//; # remove trailing spaces
1407
1408 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1409 $title =~ s/^\s+//s; # in case title_sub introduced any...
1410 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1411 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1412 if ($self->{'verbosity'} > 2);
1413 }
1414
1415 # add FileFormat metadata
1416 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1417
1418 # Special, for metadata names such as tagH1 - extracts
1419 # the text between the first <H1> and </H1> tags into "H1" metadata.
1420
1421 foreach my $field (keys %find_fields) {
1422 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1423 my $tag = $1;
1424 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1425 my $content = $1;
1426 $content =~ s/&nbsp;/ /g;
1427 $content =~ s/<[^>]*>/ /g;
1428 $content =~ s/^\s+//;
1429 $content =~ s/\s+$//;
1430 $content =~ s/\s+/ /gs;
1431 if ($content) {
1432 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1433 $tag =~ s/^tag//i;
1434 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1435 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1436 if ($self->{'verbosity'} > 2);
1437 }
1438 }
1439 }
1440}
1441
1442
1443# evaluate any "../" to next directory up
1444# evaluate any "./" as here
1445sub eval_dir_dots {
1446 my $self = shift (@_);
1447 my ($filename) = @_;
1448 my $dirsep_os = &util::get_os_dirsep();
1449 my @dirsep = split(/$dirsep_os/,$filename);
1450
1451 my @eval_dirs = ();
1452 foreach my $d (@dirsep) {
1453 if ($d eq "..") {
1454 pop(@eval_dirs);
1455
1456 } elsif ($d eq ".") {
1457 # do nothing!
1458
1459 } else {
1460 push(@eval_dirs,$d);
1461 }
1462 }
1463
1464 # Need to fiddle with number of elements in @eval_dirs if the
1465 # first one is the empty string. This is because of a
1466 # modification to FileUtils::filenameConcatenate that suppresses the addition
1467 # of a leading '/' character (or \ if windows) (intended to help
1468 # filename cat with relative paths) if the first entry in the
1469 # array is the empty string. Making the array start with *two*
1470 # empty strings is a way to defeat this "smart" option.
1471 #
1472 if (scalar(@eval_dirs) > 0) {
1473 if ($eval_dirs[0] eq ""){
1474 unshift(@eval_dirs,"");
1475 }
1476 }
1477
1478 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
1479 return $evaluated_filename;
1480}
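# For example:
#   eval_dir_dots("import/docs/../images/./pic.gif")  ->  "import/images/pic.gif"
#   eval_dir_dots("/docs/../images/pic.gif")          ->  "/images/pic.gif"
# (the leading separator in the second case is retained via the double
#  empty-string trick described above).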
1481
1482sub replace_usemap_links {
1483 my $self = shift (@_);
1484 my ($front, $link, $back) = @_;
1485
1486 # remove quotes from link at start and end if necessary
1487 if ($link=~/^[\"\']/) {
1488 $link=~s/^[\"\']//;
1489 $link=~s/[\"\']$//;
1490 $front.='"';
1491 $back="\"$back";
1492 }
1493
1494 $link =~ s/^\.\///;
1495 return $front . $link . $back;
1496}
1497
1498sub inc_filecount {
1499 my $self = shift (@_);
1500
1501 if ($self->{'file_num'} == 1000) {
1502 $self->{'dir_num'} ++;
1503 $self->{'file_num'} = 0;
1504 } else {
1505 $self->{'file_num'} ++;
1506 }
1507}
1508
1509
1510# Extend read_file so that strings like &eacute; are
1511# converted to UTF8 internally.
1512#
1513# We don't convert &lt; or &gt; or &amp; or &quot; in case
1514# they interfere with the GML files
1515
1516sub read_file {
1517 my $self = shift(@_);
1518 my ($filename, $encoding, $language, $textref) = @_;
1519
1520 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1521
1522 # Convert entities to their Unicode code-point equivalents
1523 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1524 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1525 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1526
1527}
1528
1529sub HB_read_html_file {
1530 my $self = shift (@_);
1531 my ($htmlfile, $text) = @_;
1532
1533 # load in the file
1534 if (!open (FILE, $htmlfile)) {
1535 print STDERR "ERROR - could not open $htmlfile\n";
1536 return;
1537 }
1538
1539 my $foundbody = 0;
1540 $self->HB_gettext (\$foundbody, $text, "FILE");
1541 close FILE;
1542
1543 # just in case there was no <body> tag
1544 if (!$foundbody) {
1545 $foundbody = 1;
1546 open (FILE, $htmlfile) || return;
1547 $self->HB_gettext (\$foundbody, $text, "FILE");
1548 close FILE;
1549 }
1550 # text is in utf8
1551}
1552
1553# converts the text to utf8, as ghtml does that for &eacute; etc.
1554sub HB_gettext {
1555 my $self = shift (@_);
1556 my ($foundbody, $text, $handle) = @_;
1557
1558 my $line = "";
1559 while (defined ($line = <$handle>)) {
1560 # look for body tag
1561 if (!$$foundbody) {
1562 if ($line =~ s/^.*<body[^>]*>//i) {
1563 $$foundbody = 1;
1564 } else {
1565 next;
1566 }
1567 }
1568
1569 # check for symbol fonts
1570 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1571 my $font = $1;
1572 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1573 if ($font !~ m/^arial$/i);
1574 }
1575
1576 $$text .= $line;
1577 }
1578
1579 if ($self->{'input_encoding'} eq "iso_8859_1") {
1580 # convert to utf-8
1581 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1582 }
1583 # convert any alphanumeric character entities to their utf-8
1584 # equivalent for indexing purposes
1585 #&ghtml::convertcharentities ($$text);
1586
1587 $$text =~ s/\s+/ /g; # remove \n's
1588
1589 # At this point $$text is a binary byte string
1590 # => turn it into a Unicode aware string, so full
1591 # Unicode aware pattern matching can be used.
1592 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1593 #
1594
1595 $$text = decode("utf8",$$text);
1596}
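
# Illustrative sketch, not part of the plugin proper and never called: why
# the decode() step above matters. Unicode-aware pattern matching (e.g.
# /\x{0101}/) only behaves as expected once the raw UTF-8 bytes have been
# upgraded to a Perl character string. The sample bytes are invented;
# decode() comes from the Encode module, which this plugin already uses.
sub _example_decode_for_matching {
    my $bytes = "\xC4\x81roha";           # UTF-8 bytes for "aroha" with a macron on the first 'a'
    my $chars = decode("utf8", $bytes);   # now a Unicode-aware character string

    my $bytes_match = ($bytes =~ /\x{0101}/) ? 1 : 0;   # 0 - no such character in a byte string
    my $chars_match = ($chars =~ /\x{0101}/) ? 1 : 0;   # 1 - the macron 'a' is now one character

    return ($bytes_match, $chars_match);
}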
1597
1598sub HB_clean_section {
1599 my $self = shift (@_);
1600 my ($section) = @_;
1601
1602 # remove tags without a starting tag from the section
1603 my ($tag, $tagstart);
1604 while ($section =~ m/<\/([^>]{1,10})>/) {
1605 $tag = $1;
1606 $tagstart = index($section, "<$tag");
1607 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1608 $section =~ s/<\/$tag>//;
1609 }
1610
1611 # remove extra paragraph tags
1612 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1613
1614 # remove extra stuff at the end of the section
1615 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1616
1617 # add a newline at the beginning of each paragraph
1618 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1619
1620 # add a newline every 80 characters at a word boundary
1621 # Note: this regular expression puts a line feed before
1622 # the last word in each section, even when it is not
1623 # needed.
1624 $section =~ s/(.{1,80})\s/$1\n/g;
1625
1626 # fix up the image links
1627 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1628 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1629 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1630 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1631
1632 return $section;
1633}
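
# Illustrative sketch, not part of the plugin proper and never called: the
# first clean-up in HB_clean_section above, which strips leading closing tags
# that never had an opening tag, stopping once the earliest closing tag is
# properly matched. The fragment is invented.
sub _example_strip_unopened_closing_tags {
    my $section = "</i></font>Some <b>bold</b> text";

    my ($tag, $tagstart);
    while ($section =~ m/<\/([^>]{1,10})>/) {
        $tag      = $1;
        $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        $section =~ s/<\/$tag>//;
    }

    return $section;   # "Some <b>bold</b> text"
}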
1634
1635# Converts the old HDL format to the new HDL format (using the Section tag)
1636sub convert_to_newHDLformat
1637{
1638 my $self = shift (@_);
1639 my ($file,$cnfile) = @_;
1640 my $input_filename = $file;
1641 my $tmp_filename = $cnfile;
1642
1643 # write HTML tmp file with new HDL format
1644 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1645
1646 # read in the file and do basic html cleaning (removing header etc)
1647 my $html = "";
1648 $self->HB_read_html_file ($input_filename, \$html);
1649
1650 # process the file one section at a time
1651 my $curtoclevel = 1;
1652 my $firstsection = 1;
1653 my $toclevel = 0;
1654 while (length ($html) > 0) {
1655 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1656 $toclevel = $3;
1657 my $title = $4;
1658 my $sectiontext = "";
1659 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1660 $sectiontext = $1;
1661 } else {
1662 $sectiontext = $html;
1663 $html = "";
1664 }
1665
1666 # remove tags and extra spaces from the title
1667 $title =~ s/<\/?[^>]+>//g;
1668 $title =~ s/^\s+|\s+$//g;
1669
1670 # close any sections below the current level and
1671 # create a new section (special case for the firstsection)
1672 print PROD "<!--\n";
1673 while (($curtoclevel > $toclevel) ||
1674 (!$firstsection && $curtoclevel == $toclevel)) {
1675 $curtoclevel--;
1676 print PROD "</Section>\n";
1677 }
1678 if ($curtoclevel+1 < $toclevel) {
1679 print STDERR "WARNING - jump in toc levels in $input_filename " .
1680 "from $curtoclevel to $toclevel\n";
1681 }
1682 while ($curtoclevel < $toclevel) {
1683 $curtoclevel++;
1684 }
1685
1686 if ($curtoclevel == 1) {
1687 # add the header tag
1688 print PROD "-->\n";
1689 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1690 print PROD "<!--\n";
1691 }
1692
1693 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1694
1695 print PROD "-->\n";
1696
1697 # clean up the section html
1698 $sectiontext = $self->HB_clean_section($sectiontext);
1699
1700 print PROD "$sectiontext\n";
1701
1702 } else {
1703 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1704 "\nin $input_filename\n";
1705 last;
1706 }
1707 $firstsection = 0;
1708 }
1709
1710 print PROD "<!--\n";
1711 while ($curtoclevel > 0) {
1712 $curtoclevel--;
1713 print PROD "</Section>\n";
1714 }
1715 print PROD "-->\n";
1716
1717 close (PROD) || die("Error Closing File: $tmp_filename $!");
1718
1719 return $tmp_filename;
1720}
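
# Illustrative sketch, not part of the plugin proper and never called: the
# old HDL convention marks section headings inline as <<TOCn>> (escaped as
# &lt;&lt;TOCn&gt;&gt; in the HTML). This shows the regular expression above
# pulling the level and title out of one invented heading line, which
# convert_to_newHDLformat then turns into a nested <Section> block.
sub _example_parse_old_hdl_heading {
    my $html = "<p><b>&lt;&lt;TOC2&gt;&gt; Planting the seed</b><p>Seeds should be ...";

    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
        my $toclevel = $3;
        my $title    = $4;
        $title =~ s/<\/?[^>]+>//g;     # strip tags from the title
        $title =~ s/^\s+|\s+$//g;      # trim
        return ($toclevel, $title);    # (2, "Planting the seed")
    }
    return ();
}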
1721
1722sub shorten {
1723 my $self = shift (@_);
1724 my ($text) = @_;
1725
1726 return "\"$text\"" if (length($text) < 100);
1727
1728 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1729 substr ($text, length($text)-50) . "\"";
1730}
1731
1732sub convert_tidy_or_oldHDL_file
1733{
1734 my $self = shift (@_);
1735 my ($file) = @_;
1736 my $input_filename = $file;
1737
1738 if (-d $input_filename)
1739 {
1740 return $input_filename;
1741 }
1742
1743 # get the input filename
1744 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1745 my $base_dirname = $dirname;
1746 $suffix = lc($suffix);
1747
1748 # derive the tmp filename from the input filename
1749 # Remove any white space from the filename -- no risk of name collision, and
1750 # it makes later conversion by utils simpler. Leave spaces in the path...
1751 # also strip any dots and hyphens from the basename
1752 $tailname =~ s/\s+//g;
1753 $tailname =~ s/\.+//g;
1754 $tailname =~ s/\-+//g;
1755 # convert to utf-8 otherwise we have problems with the doc.xml file
1756 # later on
1757 &unicode::ensure_utf8(\$tailname);
1758
1759 # work under a collection-specific "tidytmp" directory
1760 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1761 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1762
1763 my $test_dirname = "";
1764 my $f_separator = &util::get_os_dirsep();
1765
1766 if ($dirname =~ m/import$f_separator/)
1767 {
1768 $test_dirname = $'; #'
1769
1770 #print STDERR "init $'\n";
1771
1772 while ($test_dirname =~ m/[$f_separator]/)
1773 {
1774 my $folderdirname = $`;
1775 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
1776 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1777 $test_dirname = $'; #'
1778 }
1779 }
1780
1781 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1782
1783 # tidy or convert the input file if it is an HTML-like file or is otherwise accepted by the process_exp
1784 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1785 {
1786 # convert the input file to the new-style HDL format
1787 my $hdl_output_filename = $input_filename;
1788 if ($self->{'old_style_HDL'})
1789 {
1790 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1791 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1792 }
1793
1794 # also copy any other files from the base dir to the tmp dir if they are not already there
1795 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1796 my @files = grep {!/^\.+$/} readdir(DIR);
1797 closedir(DIR);
1798
1799 foreach my $file (@files)
1800 {
1801 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1802 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
1803 if ((!-e $dest_file) && (!-d $src_file))
1804 {
1805 # just copy the original file back to the tmp directory
1806 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1807 }
1808 }
1809
1810 # tidy the input file
1811 my $tidy_output_filename = $hdl_output_filename;
1812 if ($self->{'use_realistic_book'})
1813 {
1814 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1815 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1816 }
1817 $tmp_filename = $tidy_output_filename;
1818 }
1819 else
1820 {
1821 if (!-e $tmp_filename)
1822 {
1823 # just copy the original file back to the tmp directory
1824 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1825 }
1826 }
1827
1828 return $tmp_filename;
1829}
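
# Illustrative sketch, not part of the plugin proper and never called: how
# convert_tidy_or_oldHDL_file above derives its working filename. The
# basename loses spaces, dots and hyphens, and the part of the path below
# "import" is mirrored under the collection's tidytmp directory. The paths
# are invented and a plain string join stands in for
# FileUtils::filenameConcatenate.
sub _example_tidytmp_filename {
    require File::Basename;

    my $input = "/gsdl/collect/demo/import/reports/Annual Report 2010-11.html";

    my ($tail, $dir, $suffix) =
        File::Basename::fileparse($input, "\\.[^\\.]+\$");
    $tail   =~ s/[\s.\-]+//g;      # "AnnualReport201011"
    $suffix  = lc($suffix);        # ".html"

    my ($below_import) = ($dir =~ m{import/(.*)$});   # "reports/"

    return "/gsdl/collect/demo/tidytmp/" . $below_import . $tail . $suffix;
    # "/gsdl/collect/demo/tidytmp/reports/AnnualReport201011.html"
}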
1830
1831
1832# Turns the HTML input file into a proper XML file, removing font tags and
1833# adding image dimensions to the img tags.
1834# The tidying process takes place in a collection-specific 'tmp' directory so
1835# that we don't accidentally damage the input.
1836sub tmp_tidy_file
1837{
1838 my $self = shift (@_);
1839 my ($file,$cnfile) = @_;
1840 my $input_filename = $file;
1841 my $tmp_filename = $cnfile;
1842
1843 # get the input filename
1844 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1845
1846 require HTML::TokeParser::Simple;
1847
1848 # create HTML parser to decode the input file
1849 my $parser = HTML::TokeParser::Simple->new($input_filename);
1850
1851 # write the HTML tmp file, dropping font tags and adding image dimensions to the img tags
1852 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1853 while (my $token = $parser->get_token())
1854 {
1855 # is it an img tag
1856 if ($token->is_start_tag('img'))
1857 {
1858 # get the attributes
1859 my $attr = $token->return_attr;
1860
1861 # get the full path to the image
1862 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
1863
1864 # set the width and height attribute
1865 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1866
1867 # recreate the tag
1868 print PROD "<img";
1869 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1870 print PROD ">";
1871 }
1872 # is it a font tag
1873 else
1874 {
1875 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1876 {
1877 # remove font tag
1878 print PROD "";
1879 }
1880 else
1881 {
1882 # print without changes
1883 print PROD $token->as_is;
1884 }
1885 }
1886 }
1887 close (PROD) || die("Error Closing File: $tmp_filename $!");
1888
1889 # run html-tidy on the tmp file to make it a proper XML file
1890
1891 my $outhandle = $self->{'outhandle'};
1892 print $outhandle "Converting HTML to be XML compliant:\n";
1893
1894 my $tidy_cmd = "tidy";
1895 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1896 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1897 if ($self->{'verbosity'} <= 2) {
1898 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1899 $tidy_cmd .= " 2>nul";
1900 }
1901 else {
1902 $tidy_cmd .= " 2>/dev/null";
1903 }
1904 print $outhandle " => $tidy_cmd\n";
1905 }
1906
1907 my $tidyfile = `$tidy_cmd`;
1908
1909 # write result back to the tmp file
1910 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1911 print PROD $tidyfile;
1912 close (PROD) || die("Error Closing File: $tmp_filename $!");
1913
1914 # return the output filename
1915 return $tmp_filename;
1916}
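
# Illustrative sketch, not part of the plugin proper and never called: the
# width/height injection above relies on Image::Size::imgsize(), which
# returns the dimensions of a local image file, or undef plus an error
# message if it cannot. The filename is invented.
sub _example_img_dimensions {
    my $img_file = "import/images/cover.jpg";

    my ($width, $height, $type) = imgsize($img_file);   # imgsize() comes from Image::Size

    if (defined $width) {
        return qq{<img src="images/cover.jpg" width="$width" height="$height">};
    }
    # on failure the third return value holds the error message
    return "could not size $img_file: $type";
}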
1917
1918sub associate_cover_image
1919{
1920 my $self = shift(@_);
1921 my ($doc_obj, $filename) = @_;
1922 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1923 {
1924 # the cover image will be in tidytmp, but we want the one from the import directory
1925 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1926 }
1927 $self->SUPER::associate_cover_image($doc_obj, $filename);
1928}
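
# Illustrative sketch, not part of the plugin proper and never called: the
# substitution in associate_cover_image above maps a tidytmp working path
# back to the matching import path, so the cover image is looked for next to
# the original source file. The path is invented.
sub _example_tidytmp_to_import {
    my $filename = "/gsdl/collect/demo/tidytmp/reports/AnnualReport201011.html";

    $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;

    return $filename;   # "/gsdl/collect/demo/import/reports/AnnualReport201011.html"
}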
1929
1930
19311;