source: gsdl/trunk/perllib/plugins/HTMLPlugin.pm@ 20774

Last change on this file since 20774 was 20774, checked in by kjdon, 15 years ago

moved some of the horrible old methods to the end of the file so that the important ones come first

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 51.7 KB
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use ReadTextFile;
39use HBPlugin;
40use ghtml;
41use unicode;
42use util;
43use XMLParser;
44
45use Image::Size;
46use File::Copy;
47
# Set up the inheritance chain at compile time: HTMLPlugin derives from
# both ReadTextFile and HBPlugin (in that order).
BEGIN {
    @HTMLPlugin::ISA = qw(ReadTextFile HBPlugin);
}
51
52use strict; # every perl program should have this!
53no strict 'refs'; # make an exception so we can use variables as filehandles
54
# Descriptors for the options this plugin accepts.  Each hash names one
# option ('name'), gives its description key ('desc'), its kind ('type')
# and, where applicable, a default ('deft').  new() appends this list to
# the caller-supplied "ArgList".
my $arguments = [
    {   'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
    },
    {   'name' => 'block_exp',
        'desc' => '{BasePlugin.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {   'name' => 'nolinks',
        'desc' => '{HTMLPlugin.nolinks}',
        'type' => 'flag',
    },
    {   'name' => 'keep_head',
        'desc' => '{HTMLPlugin.keep_head}',
        'type' => 'flag',
    },
    {   'name' => 'no_metadata',
        'desc' => '{HTMLPlugin.no_metadata}',
        'type' => 'flag',
    },
    {   'name' => 'metadata_fields',
        'desc' => '{HTMLPlugin.metadata_fields}',
        'type' => 'string',
        'deft' => 'Title',
    },
    {   'name' => 'hunt_creator_metadata',
        'desc' => '{HTMLPlugin.hunt_creator_metadata}',
        'type' => 'flag',
    },
    {   'name' => 'file_is_url',
        'desc' => '{HTMLPlugin.file_is_url}',
        'type' => 'flag',
    },
    {   'name' => 'assoc_files',
        'desc' => '{HTMLPlugin.assoc_files}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {   'name' => 'rename_assoc_files',
        'desc' => '{HTMLPlugin.rename_assoc_files}',
        'type' => 'flag',
    },
    {   'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {   'name' => 'description_tags',
        'desc' => '{HTMLPlugin.description_tags}',
        'type' => 'flag',
    },
    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url); new() maps it onto file_is_url
    {   'name' => 'w3mir',
#       'desc' => '{HTMLPlugin.w3mir}',
        'type' => 'flag',
        'hiddengli' => 'yes',
    },
    {   'name' => 'no_strip_metadata_html',
        'desc' => '{HTMLPlugin.no_strip_metadata_html}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name' => 'sectionalise_using_h_tags',
        'desc' => '{HTMLPlugin.sectionalise_using_h_tags}',
        'type' => 'flag',
    },
    {   'name' => 'use_realistic_book',
        'desc' => '{HTMLPlugin.tidy_html}',
        'type' => 'flag',
    },
    {   'name' => 'old_style_HDL',
        'desc' => '{HTMLPlugin.old_style_HDL}',
        'type' => 'flag',
    },
];

# Plugin-level descriptor; new() appends it to the caller-supplied "OptList".
my $options = {
    'name'     => 'HTMLPlugin',
    'desc'     => '{HTMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
124
125
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to ReadTextFile's constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" and its option descriptor
#                      to its "OptList"
# Returns the object built by ReadTextFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @$pluginlist, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @$arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # w3mir is the legacy spelling of file_is_url
    $self->{'file_is_url'} = 1 if $self->{'w3mir'};

    # bookkeeping for associated files (also reset per document in process)
    $self->{'aux_files'} = {};
    $self->{'dir_num'}   = 0;
    $self->{'file_num'}  = 0;

    return bless $self, $class;
}
146
# Default value for the block_exp (and assoc_files) option: the empty string.
#
# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if have eg <script language="javascript" src="img/lib.js@123">
sub get_default_block_exp {
    my ($self) = @_;    # unused; accepted so this works as a method too

    # previously: q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^
    return "";
}
155
# Default value for the process_exp option: a case-insensitive pattern
# matching common HTML-ish file extensions.
sub get_default_process_exp {
    my ($self) = @_;    # unused; accepted so this works as a method too

    # the last alternative is an attempt to encode the concept of an
    # html query (something?key=value)
    my $default_exp = q{(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$};
    return $default_exp;
}
162
# Scan an HTML file for files it references (img src/usemap, link href,
# embed src, background attributes and script src) and record each one in
# $block_hash->{'file_blocks'}.
#
#   $filename_full_path - path of the HTML file to scan
#   $block_hash         - hash ref; $block_hash->{'file_blocks'}->{FILE} is
#                         set to 1 for every referenced file found
#
# As a side effect, $self->{'utf8_to_original_filename'} is filled with a
# map from each link (after decode_text) to the filename produced by
# opt_url_decode; add_file consults this map later.
sub store_block_files
{
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    # read in the file contents without decoding them
    my $raw_text = "";
    $self->read_file_no_decoding ($filename_full_path, \$raw_text);

    # drop HTML comments (including entity-escaped ones) before scanning
    my $textref = \$raw_text;
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $$textref =~ s/$opencom(.*?)$closecom//gs;

    # An attribute value is double-quoted, single-quoted or a bare token.
    # Single-quoted values are accepted here just as process_section accepts
    # them; without that alternative the bare-token branch would keep the
    # quotes and stop at the first space.
    my $attval = "\\\"[^\\\"]+\\\"|'[^']+'|[^\\s>]+";
    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);

    if (!defined $self->{'utf8_to_original_filename'}) {
        # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
        $self->{'utf8_to_original_filename'} = {};
    }

    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {

        # remove the surrounding quotes (either kind) if present
        if ($link =~ m/^(["'])/) {
            my $quote_char = $1;
            $link =~ s/^$quote_char//;
            $link =~ s/$quote_char$//;
        }

        $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
        # some links may just be anchor names
        next unless ($link =~ /\S+/);

        if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($filename_full_path);
            $link = &util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        # this is the actual filename on the filesystem (that the link refers to)
        my $url_original_filename = $self->opt_url_decode($link);

        # Run the link through decode_text and remember which on-disk
        # filename the decoded form corresponds to
        my $utf8_link = "";
        $self->decode_text($link,$encoding,$language,\$utf8_link);

        $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
#       print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";

        if ($url_original_filename ne $utf8_link) {
            my $outhandle = $self->{'outhandle'};

            print $outhandle "URL Encoding $url_original_filename\n";
            print $outhandle " ->$utf8_link\n";
        }

        $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    }
}
234
# Work out the on-disk filename that a (possibly URL-encoded) link refers to.
# If the link contains no %XX escapes it is returned untouched.  If it does,
# it is only URL-decoded when no file with the still-encoded name exists,
# since the real filename may itself legitimately contain %XX sequences.
# Returns the resulting filename (a single value; no separate "decoded"
# flag is returned).
sub opt_url_decode {
    my ($self, $link) = @_;

    my $has_escapes = ($link =~ m/\%[A-F0-9]{2}/i);
    if ($has_escapes && !-e $link) {
        $link = &unicode::url_decode($link);
    }

    return $link;
}
255
# Wrapper around the parent class's read_into_doc_obj.
#
# When either the use_realistic_book or old_style_HDL option is set, the
# input file is first passed through convert_tidy_or_oldHDL_file and the
# parent is pointed at the converted file instead (description_tags is
# switched on too, because such documents must be sectionalised).
#
# Parameters are passed through to the parent unchanged apart from
# $base_dir and $file, which may be redirected as described above.
# Returns ($process_status, $doc_obj) exactly as the parent returns them.
sub read_into_doc_obj
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
    {
        # because the document has to be sectionalized set the description tags
        $self->{'description_tags'} = 1;

        # work out the file to be tidied; base_dir is only prepended when
        # it contains at least one word character
        my $input_filename = $file;
        $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;

        # get the tidied file
        my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);

        # split the converted file's path into name, directory and extension
        my ($tidy_tailname, $tidy_dirname, $tidy_suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");

        # set the new input file and base_dir to be from the tidied file
        $file = "$tidy_tailname$tidy_suffix";
        $base_dir = $tidy_dirname;
    }

    # call the parent read_into_doc_obj
    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);

    return ($process_status,$doc_obj);
}
291
# do plugin specific processing of doc_obj
#
#   $textref  - reference to the document's HTML text (modified in place)
#   $pluginfo - not used in this method
#   $base_dir - base directory; passed on to process_section
#   $file     - file name; used for URL metadata and in warnings
#   $metadata - passed on to extract_metadata
#   $doc_obj  - document object that sections, text and metadata are added to
#   $gli      - not used in this method
#
# Warnings are printed to $self->{'outhandle'}.  Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    #  <Metadata name="name1">value1</Metadata>
    #  ...
    #  <Metadata name="nameN">valueN</Metadata>
    # --></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # close whatever sections are still open, just before </body>
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    my $utf8_file = $self->filename_to_utf8_metadata($file);
    $utf8_file =~ s/&\#095;/_/g;
    my $web_url = "http://";
    if(defined $dirname) { # local directory
        $dirname = $self->eval_dir_dots($dirname);
        $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
        $web_url = $web_url.$dirname.$utf8_file;
    } else {
        $web_url = $web_url.$utf8_file;
    }
    $web_url =~ s/\\/\//g;
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
        # no <body> tag means there is no header to keep
        $head_keep = "" if !defined $head_keep;

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        # consume the text one comment at a time: $1 is the text before the
        # comment, $2 the comment's contents
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            # consume the comment one tag at a time
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;

                    # Only read $1 when the closing tag was really found:
                    # after a failed substitution $1 still holds the capture
                    # of an earlier match, which would be stored as a bogus
                    # metadata value.
                    my $metavalue;
                    if ($comment =~ s/^(.*?)$lt\/Metadata$gt//s) {
                        $metavalue = $1;
                    } else {
                        print $outhandle "HTMLPlugin: WARNING: $file contains a Metadata tag ($metaname) with no closing </Metadata> tag. It will be ignored.\n";
                        next;
                    }
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ m/\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                my $complete_text = $head_keep.$doc_obj->get_text($cursection);
                $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                # show at most the first 30 characters of the stray text
                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }

            my $complete_text = $head_keep.$doc_obj->get_text($cursection);
            $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
502
503
# Turn one HTML heading into the comment-embedded <Section>/<Description>
# markup that process() later parses.
#
#   $nHeadNo        - heading level (the digits following "h")
#   $strHeadingText - text between the heading tags; may contain
#                     <!--gsdl-metadata ... --> comments, which are lifted
#                     out and emitted inside the <Description>
#   $arrSections    - array ref of currently open heading levels, maintained
#                     by update_section_data
#   $file           - not used here
#
# Returns an HTML comment containing any </Section> tags needed to close
# earlier sections, followed by the opening of the new section with the
# trimmed heading text as its Title metadata.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' unless defined($strHeadingText);

    # closing tags for any sections this heading terminates
    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded metadata comment out of the heading text
    my @embedded_metadata;
    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
    {
        push @embedded_metadata, $1;
    }
    my $strSecMetadata = join('', @embedded_metadata);

    # trim leading and trailing whitespace from both pieces
    foreach my $piece ($strHeadingText, $strSecMetadata) {
        $piece =~ s/^\s+//;
        $piece =~ s/\s+$//;
    }

    $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
    $strMetadata .= "\t\t" . $strSecMetadata . "\n" if length($strSecMetadata);
    $strMetadata .= "\t</Description>\n";

    return "<!--" . $strMetadata . "-->";
}
532
533
# Maintain the stack of open heading levels and report which sections a new
# heading closes.
#
#   $arrSections - array ref used as a stack of open heading levels
#                  (modified in place)
#   $nCurTocNo   - level of the heading being opened (process() passes -1
#                  to close everything at the end of the document)
#
# One "\n</Section>" is produced, and one level popped, for every stack
# entry that is greater than or equal to $nCurTocNo; $nCurTocNo is then
# pushed.  Returns the accumulated closing tags (possibly the empty string).
sub update_section_data
{
    my ($self, $arrSections, $nCurTocNo) = @_;

    my $closing_tags = '';

    # Nothing to close when the stack is empty or the new heading nests
    # inside the most recent one.
    if (@$arrSections && $nCurTocNo <= $arrSections->[-1]) {
        # Walk a snapshot of the stack from the top down, so popping the
        # live array does not disturb the traversal.
        foreach my $open_level (reverse @$arrSections) {
            next if $nCurTocNo > $open_level;
            $closing_tags .= "\n</Section>";
            pop @$arrSections;
        }
    }

    push @$arrSections, $nCurTocNo;
    return $closing_tags;
}
557
558
# Rewrite the links and image references in one chunk of HTML and append
# the result to a section of the document object.
#
#   $textref    - reference to the HTML text (modified in place)
#   $base_dir, $file, $doc_obj, $cursection - handed on to the link/image
#                 rewriting helpers; $doc_obj/$cursection also receive the text
#
# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
sub process_section {
    my ($self, $textref, $base_dir, $file, $doc_obj, $cursection) = @_;

    # trap links (skipped altogether when the nolinks option is set)
    unless ($self->{'nolinks'}) {
        # usemap="./#index" not handled correctly => change to "#index"
        # (attribute value may be double-quoted, single-quoted or bare)
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        # href/src of a, area, frame, link and script tags
        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images
    #
    # History: HTMLPlugin used to wrap every <img> in an anchor pointing at
    # the image itself (<a href="image"><img src="image"></a>) to work around
    # text following an image being turned into a link.  A no_image_links
    # option was later added to suppress that, because images are often meant
    # to link somewhere else.  The original problem no longer occurs, so the
    # wrapping has been dropped here (and in replace_images) and images are
    # now always rewritten without an enclosing anchor.  If wrapping is ever
    # needed again it should become an explicit option rather than the
    # default, and no_image_links should eventually be deprecated.
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
612
# Callback used by process_section for every image-like reference.
#
#   $front - the tag text up to and including "src=" / "background="
#   $link  - the attribute value, possibly still wrapped in quotes
#   $back  - the rest of the tag
#   $base_dir, $file, $doc_obj, $section - handed on to format_link/add_file
#
# Returns the rebuilt tag, with the attribute value replaced by whatever
# add_file returns for the referenced file.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # Strip surrounding quotes (single or double) from the link; the rebuilt
    # tag always uses double quotes in that case.
    if ($link =~ s/^[\"\']//) {
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    # newlines inside the attribute value become spaces
    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    # absolute paths for the images, and without the "file://" prefix
    # So check for this special case and keep only the final path component
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows/i);
    if ($on_windows && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

#   print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";

    # The image is returned on its own; it is no longer wrapped in an
    # <a href="..."> anchor (see the notes in process_section).
    return $front . $img_file . $back;
}
656
# Callback used by process_section for every href/src link.
#
#   $front - the tag text up to and including "href=" / "src="
#   $link  - the attribute value, possibly still wrapped in quotes
#   $back  - the rest of the tag
#   $base_dir, $file, $doc_obj, $section - handed on to format_link/add_file
#
# Returns the rebuilt tag.  Pure "#anchor" links and mailto/news/gopher/
# nntp/telnet/javascript links keep their original target; other links are
# either routed through _httpextlink_ or turned into an associated file.
sub replace_href_links {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # Strip surrounding quotes (single or double) from the link; the rebuilt
    # tag always uses double quotes in that case.
    if ($link =~ s/^[\"\']//) {
        $link =~ s/[\"\']$//;
        $front .= '"';
        $back = "\"$back";
    }

    # attempt to sort out targets - frames are not handled well in this
    # plugin and some cases will screw things up: _top is renamed to
    # _gsdltop_ and _parent targets are removed
    foreach my $tag_part ($front, $back) {
        $tag_part =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    }
    foreach my $tag_part ($front, $back) {
        $tag_part =~ s/target=\"?_parent\"?//is;
    }

    # links within the same page are left as they are
    if ($link =~ m/^\#/s) {
        return $front . $link . $back;
    }
    $link =~ s/\n/ /g;

    # Find file referred to by $link on file system
    # This is more complicated than it sounds when char encodings
    # is taken in to account
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);

    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;
    # everything after the scheme (and optional //) of the href
    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;

    # Leave all these links alone (they won't be picked up by intermediate
    # pages).  That seems safest when dealing with frames, targets etc.:
    # e.g. with a mailto link inside a small frame the intermediate page
    # would be displayed within that frame and could not be seen.  This
    # early return could go if the server could somehow tell the browser to
    # show the intermediate page in the top level window.
    if ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is) {
        return $front . $link . $back;
    }

    my $goes_via_extlink =
           ($rl == 0)
        || ($filename =~ m/$self->{'process_exp'}/)
        || ($href =~ m/\/$/)
        || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i);

    if ($goes_via_extlink) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
    }

    # link is to some other type of file (eg image) so we'll
    # need to associate that file
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}
713
# Decide how a referenced file should be linked and, where appropriate,
# associate it with the document.
#
#   $href      - link as produced by format_link (scheme included)
#   $rl        - the relative-link flag returned by format_link; 0 means
#                the link is left as an external link
#   $hash_part - the "#..." part of the link (may be empty)
#   $base_dir  - base directory the file path is resolved against
#   $doc_obj   - document object the file is associated with
#   $section   - section the file is associated with
#
# Returns the string to put in the tag in place of the original link:
# either an _httpextlink_ reference (file not associated) or an
# _httpdocimg_ reference to the associated file.
sub add_file {
    my ($self, $href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;

    # Strip the scheme from the link to get a path relative to $base_dir.
    my $filename = $href;
    if ($base_dir ne "") {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }
    else {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }
    $filename = &util::filename_cat($base_dir, $filename);

    # Replace %XX's in URL with decoded value if required. Note that the
    # filename may include the %XX in some situations. If the *original*
    # file's name was in URL encoding, opt_url_decode will not decode it.
    my $utf8_filename = $filename;
    $filename = $self->opt_url_decode($utf8_filename);

    # some special processing if the intended filename was converted to utf8,
    # but the actual file still needs to be renamed: try the original
    # filename stored in the map
    if (!-e $filename) {
        my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
        $filename = $original_filename
            if defined $original_filename && -e $original_filename;
    }

    # The file is only associated when it has an extension that matches the
    # assoc_files expression.
    my ($ext) = $filename =~ m/(\.[^\.]*)$/;
    my $is_assoc = (defined $ext) && ($ext =~ m/$self->{'assoc_files'}/);

    if ($rl == 0) {
        if ($is_assoc) {
            return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
        }
        return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
    }

    unless ($is_assoc) {
        return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
    }

    my $newname;
    if ($self->{'rename_assoc_files'}) {
        # Associated files are renamed to <dir_num>/<file_num><ext>; a file
        # already seen in this document reuses its earlier numbers.
        my $seen = $self->{'aux_files'}->{$href};
        if (defined $seen) {
            $newname = $seen->{'dir_num'} . "/" . $seen->{'file_num'} . $ext;
        }
        else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }

    # Keep the file's own name (last path component).  When the name was
    # URL-encoded, use the possibly-decoded filename instead to avoid
    # double URL encoding.
    my $name_source = &unicode::is_url_encoded($utf8_filename) ? $filename : $utf8_filename;
    ($newname) = $name_source =~ m/([^\/\\]*)$/;

    # Make sure this name uses only ASCII characters.
    # We use either base64 or URL encoding, as these preserve original encoding
    $newname = &util::rename_file($newname, $self->{'file_rename_method'});

    $doc_obj->associate_file($filename, $newname, undef, $section);

    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
    # of filenames, URL-encode the additional percent signs of the URL-encoded filename
    my $newname_url = &unicode::filename_to_url($newname);
    return "_httpdocimg_/$newname_url";
}
791
792
# Normalise a link found in an HTML document.
#
#   $link     - the raw link text (quotes already removed)
#   $base_dir - base directory used to test whether linked files exist
#   $file     - name of the document containing the link
#
# Returns a three element list ($href, $hash_part, $rl):
#   $href      - the normalised link; relative links are returned as
#                "http://" followed by their resolved path
#   $hash_part - the "#..." part of the link, or "" if none
#   $rl        - 1 when the link is treated as relative/local, else 0
# Badly formatted links are returned unchanged as ($link, "", 0), and
# mailto/news/gopher/nntp/telnet/javascript links and links starting with
# "/" are returned as (text before any "#", "", 0).
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
        # link with an explicit scheme
        my $type = $1;

        if ($link =~ m/^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        # the link counts as local only if the file exists under base_dir
        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {

        if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else
            {
                # see if link shares directory with source document
                # => turn into relative link if this is so!
                #
                # $base_dir is a file system path, not a pattern, so it is
                # quoted with \Q..\E; no /o either, because base_dir can
                # differ from one call to the next.

                if ($ENV{'GSDLOS'} =~ m/^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash=$before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^\Q$base_dir\E/@@) {
                        # if this is true, we removed a prefix
                        $before_hash=$win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = &util::filename_cat("",$before_hash);
                    $before_hash =~ s@^\Q$base_dir\E/@@;
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ m/\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        # (links starting with "/" also end up here)
        return ($before_hash, "", 0);
    }
}
885
# Store the leading characters of the document's plain text as metadata.
#
# $self->{'first'} is a comma separated list of sizes; for each size N the
# first N characters of the tag-free body text are stored under "FirstN" in
# section $thissection of $doc_obj.
#
#   $textref     - reference to the HTML text (not modified)
#   $doc_obj     - document object, must provide add_utf8_metadata
#   $thissection - section the metadata is attached to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # The clean-up is the same for every size, so it is done once.
    my $plaintext = $$textref;
    # skip to the body; /s lets the match span the lines of the head, so
    # head content no longer leaks into the extracted text
    $plaintext =~ s/^.*?<body\b[^>]*>//is;
    # remove javascript
    $plaintext =~ s@<script.*?</script>@ @sig;
    $plaintext =~ s/<[^>]*>/ /g;
    $plaintext =~ s/&nbsp;/ /g;
    $plaintext =~ s/^\s+//;
    $plaintext =~ s/\s+$//;
    $plaintext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = &unicode::substr ($plaintext, 0, $size);
        # replace the last (possibly cut-off) word with an ellipsis (...)
        $tmptext =~ s/\s\S*$/&#8230;/;
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
906
907
# Extract metadata from an HTML document and add it to $doc_obj.
#
#   $textref  - reference to the HTML text (not modified)
#   $metadata - hash ref of metadata already assigned to the document; meta
#               tags whose name is defined in it are skipped
#   $doc_obj  - document object, must provide add_utf8_metadata and
#               add_metadata
#   $section  - section the metadata is attached to
#
# $self->{'metadata_fields'} is a comma separated list of fields to keep.
# Each entry is either "Name" or "name<GSName>" (store meta tag "name" as
# "GSName").  Entries of the form "tagXX" take the text of the first
# <XX>...</XX> element of the document instead of a meta tag.  When
# $self->{'hunt_creator_metadata'} is 1 a fixed set of author-like meta tags
# is stored as "Creator".  Nothing is done at all when 'metadata_fields' is
# undefined.
sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};
    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    my %creator_fields = (); # short-cut for lookups

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        $field =~ s/^\s+//; # remove leading whitespace
        $field =~ s/\s+$//; # remove trailing whitespace

        # support tag<tagname>
        if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)} = $2;
        } else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)} = $field;
        }
    }

    if (defined $self->{'hunt_creator_metadata'} &&
        $self->{'hunt_creator_metadata'} == 1 ) {
        # add the creator_metadata fields to search for
        foreach my $field ('author',
                           'author.email',
                           'creator',
                           'dc.creator',
                           'dc.creator.corporatename') {
            $creator_fields{$field} = 0; # add to lookup hash
        }
    }

    # find the header in the html file, which has the meta tags.  The head
    # tag may carry attributes; a document without a head gives an empty
    # header rather than whatever an earlier match left in $1.
    my ($html_header) = $$textref =~ m@<head\b[^>]*>(.*?)</head>@si;
    $html_header = "" if !defined $html_header;

    # go through every <meta... tag defined in the html and see if it is
    # one of the tags we want to match.

    # special case for title - we want to remember if its been found
    my $found_title = 0;
    # this assumes that ">" won't appear inside a meta tag
    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
        my $metatag = $1;

        # find the tag name: quoted form first, then the unquoted form
        my $tag;
        if ($metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])(.*?)\1/is) {
            $tag = $2;
        } elsif ($metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is) {
            $tag = $1;
        }

        if (!defined $tag || $tag eq "") {
            # only reported at high verbosity so that meta tags without a
            # name (eg <meta charset=...>) do not flood the output
            print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # don't need to assign this field if it was passed in from a previous
        # (recursive) plugin
        next if (defined $metadata->{$tag});

        # find the tag content: quoted form first, then the unquoted form
        my $value;
        if ($metatag =~ m/content\s*=\s*([\"\'])(.*?)\1/is) {
            $value = $2;
        } elsif ($metatag =~ m/content\s*=\s*([^\s\>]+)/is) {
            $value = $1;
        }

        if (!defined $value || $value eq "") {
            print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # clean up and add
        $value =~ s/\s+/ /gs;
        chomp($value); # remove trailing \n, if any
        if (exists $creator_fields{lc($tag)}) {
            # map this value onto greenstone's "Creator" metadata
            $tag = 'Creator';
        } elsif (!exists $find_fields{lc($tag)}) {
            next; # don't want this tag
        } else {
            # get the user's preferred capitalisation
            $tag = $find_fields{lc($tag)};
        }
        if (lc($tag) eq "title") {
            $found_title = 1;
        }

        if ($self->{'verbosity'} > 2) {
            print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
        }

        $doc_obj->add_utf8_metadata($section, $tag, $value);
    }

    # TITLE: extract the document title
    if (exists $find_fields{'title'} && !$found_title) {
        # we want a title, and didn't find one in the meta tags
        # see if there's a <title> tag
        my $title;
        my $from = ""; # for debugging output only
        if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
            $title = $1;
            $from = "<title> tags";
        }

        if (!defined $title) {
            $from = "first 100 chars";
            # if no title use first 100 or so characters
            $title = $$textref;
            $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
            $title =~ s/^.*?<body\b[^>]*>//si; # body tag may have attributes
            # ignore javascript!
            $title =~ s@<script.*?</script>@ @sig;
            $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
            $title =~ s/<[^>]*>/ /g; # remove all HTML tags
            $title = substr ($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }
        $title =~ s/<[^>]*>/ /g; # remove html tags
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # nbsp entity or its utf-8 bytes
        $title =~ s/\s+/ /gs; # collapse multiple spaces
        $title =~ s/^\s*//; # remove leading spaces
        $title =~ s/\s*$//; # remove trailing spaces

        # title_sub is a user supplied pattern, so it is not quoted
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $doc_obj->add_utf8_metadata ($section, 'Title', $title);
        print $outhandle " extracted Title metadata \"$title\" from $from\n"
            if ($self->{'verbosity'} > 2);
    }

    # add FileFormat metadata
    $doc_obj->add_metadata($section,"FileFormat", "HTML");

    # Special, for metadata names such as tagH1 - extracts
    # the text between the first <H1> and </H1> tags into "H1" metadata.
    foreach my $field (keys %find_fields) {
        next unless $field =~ m/^tag([a-z0-9]+)$/i;
        my $tag = $1;
        # The keys are lower-cased, so match the element case-insensitively;
        # /s lets the element span lines.  No /g: a scalar-context //g match
        # would leave pos() set and make the next field start searching
        # after this match instead of at the top of the document.
        if ($$textref =~ m@<\Q$tag\E\b[^>]*>(.*?)</\Q$tag\E[^>]*>@si) {
            my $content = $1;
            $content =~ s/&nbsp;/ /g;
            $content =~ s/<[^>]*>/ /g;
            $content =~ s/^\s+//;
            $content =~ s/\s+$//;
            $content =~ s/\s+/ /gs;
            if ($content) {
                $tag = $find_fields{"tag$tag"}; # get the user's capitalisation
                $tag =~ s/^tag//i;
                $doc_obj->add_utf8_metadata ($section, $tag, $content);
                print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
                    if ($self->{'verbosity'} > 2);
            }
        }
    }
}
1097
1098
# Resolve the "." and ".." components of a file name:
#   ".."  drops the component collected before it
#   "."   is skipped
# The name is split on the separator pattern returned by
# util::get_os_dirsep and re-joined with util::filename_cat.  An empty
# result is returned as "".
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $sep_pattern = &util::get_os_dirsep();

    my @kept = ();
    foreach my $part (split(/$sep_pattern/, $filename)) {
        if ($part eq "..") {
            pop(@kept);
        }
        elsif ($part ne ".") {
            push(@kept, $part);
        }
    }

    return "" if (scalar(@kept) == 0);

    # util::filename_cat suppresses the leading '/' (or '\' on windows)
    # when its first entry is the empty string, which is meant to help with
    # relative paths.  Starting the list with *two* empty strings defeats
    # this, so a name that began with a separator keeps it.
    unshift(@kept, "") if ($kept[0] eq "");

    my $evaluated_filename = &util::filename_cat(@kept);
    return $evaluated_filename;
}
1137
# Rebuild "$front$link$back" with a tidied link.
#
# A link that starts with a single or double quote loses that quote and a
# trailing quote, and is wrapped in double quotes instead (one is appended
# to $front, one prepended to $back).  A leading "./" is then removed from
# the link.  Returns the concatenated string.
sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    # normalise whatever quoting was used to double quotes
    if ($link =~ s/^[\"\']//) {
        $link =~ s/[\"\']$//;
        $front = $front . '"';
        $back  = '"' . $back;
    }

    $link =~ s/^\.\///;

    return join("", $front, $link, $back);
}
1153
# Advance the file counter kept in $self.
#
# 'file_num' is incremented; once it has reached 1000 the next call
# resets it to 0 and increments 'dir_num' instead.
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} != 1000) {
        $self->{'file_num'} ++;
    }
    else {
        # roll over: next directory, restart the file count
        $self->{'dir_num'} ++;
        $self->{'file_num'} = 0;
    }
}
1164
1165
# Extends the inherited read_file: after the parent class has filled
# $$textref, character entities such as &eacute; are replaced by whatever
# ghtml::getcharequiv returns for them.
#
# &lt; &gt; &amp; &quot; and &nbsp; are left as they are (the original
# note says converting them could interfere with the GML files).  They are
# renamed to &zlt; etc. for the duration of the conversion and renamed
# back afterwards.
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;

    $self->SUPER::read_file($filename, $encoding, $language, $textref);

    my $kept_entity = qr/lt|gt|amp|quot|nbsp/;

    # hide the entities that must survive
    $$textref =~ s/&($kept_entity);/&z$1;/g;
    # convert every remaining entity
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/ge;
    # and restore the hidden ones
    $$textref =~ s/&z($kept_entity);/&$1;/g;
}
1183
# Read $htmlfile and append its text to $$text using HB_gettext.
#
#   $htmlfile - name of the file to read
#   $text     - reference to the string the text is appended to
#
# The first pass only keeps what follows the <body> tag.  If no <body> tag
# was seen, the file is read a second time with everything kept.  A file
# that cannot be opened is reported on STDERR and leaves $$text untouched.
sub HB_read_html_file {
    my $self = shift (@_);
    my ($htmlfile, $text) = @_;

    # load in the file.  Three-argument open with a lexical handle: the
    # name is taken literally (no mode characters, no whitespace trimming)
    # and no package-wide FILE handle is shared with other code.
    my $fh;
    if (!open ($fh, '<', $htmlfile)) {
        print STDERR "ERROR - could not open $htmlfile\n";
        return;
    }

    my $foundbody = 0;
    $self->HB_gettext (\$foundbody, $text, $fh);

    # just in case there was no <body> tag: start again from the top and
    # keep every line
    if (!$foundbody) {
        $foundbody = 1;
        if (!seek ($fh, 0, 0)) {
            close ($fh);
            return;
        }
        $self->HB_gettext (\$foundbody, $text, $fh);
    }
    close ($fh);
    # text is in utf8
}
1207
# Append the lines read from $handle to $$text.
#
#   $foundbody - reference to a flag; while it is false, lines are dropped
#                until one contains a <body ...> tag, which sets the flag
#                and keeps only what follows the tag on that line
#   $text      - reference to the string the lines are appended to
#   $handle    - file handle to read from
#
# Afterwards $$text is converted to utf-8 when the plugin's input encoding
# is iso_8859_1, and every run of whitespace is collapsed to one space.
sub HB_gettext {
    my $self = shift (@_);
    my ($foundbody, $text, $handle) = @_;

    while (defined (my $line = <$handle>)) {
        # look for body tag
        if (!$$foundbody) {
            next unless ($line =~ s/^.*<body[^>]*>//i);
            $$foundbody = 1;
        }

        # check for symbol fonts; anything other than arial is only
        # reported, the line itself is kept unchanged
        if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
            my $font = $1;
            if ($font !~ m/^arial$/i) {
                print STDERR "HBPlug::HB_gettext - warning removed font $font\n";
            }
        }

        $$text .= $line;
    }

    if ($self->{'input_encoding'} eq "iso_8859_1") {
        # convert to utf-8
        my $as_unicode = &unicode::convert2unicode("iso_8859_1", $text);
        $$text = &unicode::unicode2utf8($as_unicode);
    }
    # convert any alphanumeric character entities to their utf-8
    # equivalent for indexing purposes
    #&ghtml::convertcharentities ($$text);

    $$text =~ s/\s+/ /g; # remove \n's
}
1244
# Tidy the HTML of one section and return the result.
#
#   $section - the section's HTML text
#
# Steps, in order: drop leading closing tags that have no opening tag
# before them, merge runs of <p> tags, strip dangling <u>/<i>/<b>/<p>,
# &nbsp; and whitespace from the end, put a blank line before each
# paragraph, wrap lines at whitespace after at most 80 characters, and
# rewrite images as centred <img> elements.
sub HB_clean_section {
    my $self = shift (@_);
    my ($section) = @_;

    # remove tags without a starting tag from the section
    while ($section =~ m/<\/([^>]{1,10})>/) {
        my $tag = $1;
        my $tagstart = index($section, "<$tag");
        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
        # \Q...\E: the tag text is data, not a pattern.  Without it a tag
        # containing a regex metacharacter (eg "</a+>") is never removed
        # and this loop does not terminate.
        $section =~ s/<\/\Q$tag\E>//;
    }

    # remove extra paragraph tags
    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}

    # remove extra stuff at the end of the section
    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}

    # add a newline at the beginning of each paragraph
    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;

    # add a newline every 80 characters at a word boundary
    # Note: this regular expression puts a line feed before
    # the last word in each section, even when it is not
    # needed.
    $section =~ s/(.{1,80})\s/$1\n/g;

    # fix up the image links; each image starts on a line of its own
    $section =~ s{<img[^>]*?src=\"?([^\">]+)\"?[^>]*>}
                 {\n<center><img src="$1" /></center><br/>}ig;
    $section =~ s{&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))}
                 {\n<center><img src="$1" /></center><br/>}ig;

    return $section;
}
1281
# Convert a file in the old HDL format (sections introduced by
# &lt;&lt;TOCn&gt;&gt; markers) to the new HDL format, in which the
# sections are described by <Section> tags inside HTML comments.
#
#   $file   - name of the old-style input file
#   $cnfile - name of the file to write
#
# Returns the name of the file written.  Dies when the output file cannot
# be opened or closed.
sub convert_to_newHDLformat
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # write HTML tmp file with new HDL format.  Three-argument open with a
    # lexical handle: the name is taken literally and no package-wide PROD
    # handle is shared with other subs.
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");

    # read in the file and do basic html cleaning (removing header etc)
    my $html = "";
    $self->HB_read_html_file ($input_filename, \$html);

    # process the file one section at a time
    my $curtoclevel = 1;
    my $firstsection = 1;
    my $toclevel = 0;
    while (length ($html) > 0) {
        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
            $toclevel = $3;
            my $title = $4;

            # the section runs up to the next TOC marker, or to the end
            my $sectiontext = "";
            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
                $sectiontext = $1;
            } else {
                $sectiontext = $html;
                $html = "";
            }

            # remove tags and extra spaces from the title
            $title =~ s/<\/?[^>]+>//g;
            $title =~ s/^\s+|\s+$//g;

            # close any sections below the current level and
            # create a new section (special case for the firstsection)
            print $prod "<!--\n";
            while (($curtoclevel > $toclevel) ||
                   (!$firstsection && $curtoclevel == $toclevel)) {
                $curtoclevel--;
                print $prod "</Section>\n";
            }
            if ($curtoclevel+1 < $toclevel) {
                print STDERR "WARNING - jump in toc levels in $input_filename " .
                    "from $curtoclevel to $toclevel\n";
            }
            while ($curtoclevel < $toclevel) {
                $curtoclevel++;
            }

            if ($curtoclevel == 1) {
                # add the header tag
                print $prod "-->\n";
                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
                print $prod "<!--\n";
            }

            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";

            print $prod "-->\n";

            # clean up the section html
            $sectiontext = $self->HB_clean_section($sectiontext);

            print $prod "$sectiontext\n";

        } else {
            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
                "\nin $input_filename\n";
            last;
        }
        $firstsection = 0;
    }

    # close whatever sections are still counted as open
    print $prod "<!--\n";
    while ($curtoclevel > 0) {
        $curtoclevel--;
        print $prod "</Section>\n";
    }
    print $prod "-->\n";

    close ($prod) || die("Error Closing File: $tmp_filename $!");

    return $tmp_filename;
}
1368
# Return $text in double quotes for use in messages.  Text of 100
# characters or more is abbreviated to its first 50 and last 50
# characters, each quoted, with " ... " in between.
sub shorten {
    my $self = shift (@_);
    my ($text) = @_;

    my $length = length($text);
    if ($length >= 100) {
        my $head = substr ($text, 0, 50);
        my $tail = substr ($text, $length - 50);
        return "\"$head\" ... \"$tail\"";
    }

    return "\"$text\"";
}
1378
# Prepare a working copy of $file below the collection's "tidytmp"
# directory and return the name of the file to process.
#
#   $file - input file name; a directory is returned unchanged
#
# For .htm/.html/.shtml files:
#   * with 'old_style_HDL' set the file is converted by
#     convert_to_newHDLformat,
#   * the other plain files of the input directory are copied alongside
#     (only where no copy exists yet),
#   * with 'use_realistic_book' set the result is passed through
#     tmp_tidy_file.
# With neither option set the original file name is returned.  Any other
# file is simply copied if no copy exists yet.
#
# Dies when the input directory cannot be read or a copy fails.
sub convert_tidy_or_oldHDL_file
{
    my $self = shift (@_);
    my ($file) = @_;
    my $input_filename = $file;

    if (-d $input_filename)
    {
        return $input_filename;
    }

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $base_dirname = $dirname;
    $suffix = lc($suffix);

    # derive tmp filename from input filename
    # Remove white space, dots and hyphens from the filename -- makes later
    # conversion by utils simpler. Leave spaces in path...
    # NOTE(review): names that differ only in these characters map to the
    # same tmp file -- confirm this cannot happen within one directory.
    $tailname =~ s/[\s\.\-]+//g;
    # convert to utf-8 otherwise we have problems with the doc.xml file
    # later on
    &unicode::ensure_utf8(\$tailname);

    # working directory below the collection directory
    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);

    # mirror the folders found below "import" inside the tmp directory
    my $f_separator = &util::get_os_dirsep();
    if ($dirname =~ m/import$f_separator(.*)\z/s)
    {
        my $test_dirname = $1;

        while ($test_dirname =~ m/\A(.*?)[$f_separator](.*)\z/s)
        {
            my ($folderdirname, $remainder) = ($1, $2);
            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
            $test_dirname = $remainder;
        }
    }

    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");

    # tidy or convert the input file if it is a HTML-like file
    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
    {
        # working copy of the file itself
        my $work_filename = $tmp_filename;

        # convert the input file to a new style HDL
        my $hdl_output_filename = $input_filename;
        if ($self->{'old_style_HDL'})
        {
            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$work_filename);
        }

        # copy every other file of the base dir to the tmp dir unless it
        # is already there
        opendir(my $dirhandle, $base_dirname) or die "Can't open base directory : $base_dirname!";
        my @files = grep {!/^\.+$/} readdir($dirhandle);
        # a handle from opendir has to be released with closedir; close()
        # leaves the directory open
        closedir($dirhandle);

        foreach my $entry (@files)
        {
            my $src_file = &util::filename_cat($base_dirname,$entry);
            my $dest_file = &util::filename_cat($tmp_dirname,$entry);
            if ((!-e $dest_file) && (!-d $src_file))
            {
                # just copy the original file to the tmp directory
                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
            }
        }

        # tidy the input file
        my $tidy_output_filename = $hdl_output_filename;
        if ($self->{'use_realistic_book'})
        {
            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$work_filename);
        }
        $tmp_filename = $tidy_output_filename;
    }
    else
    {
        if (!-e $tmp_filename)
        {
            # just copy the original file to the tmp directory
            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
        }
    }

    return $tmp_filename;
}
1477
1478
# Turn an HTML file into a proper XML file: font tags are removed, the
# image size is added to every img tag, and the result is run through the
# external "tidy" program.
#
#   $file   - name of the HTML file to read
#   $cnfile - name of the file to write; may be the same file as $file
#
# The work is done on the copy named by $cnfile so that the input is not
# damaged.  Returns $cnfile.  Dies when the input cannot be opened or the
# output cannot be written.
sub tmp_tidy_file
{
    my $self = shift (@_);
    my ($file,$cnfile) = @_;
    my $input_filename = $file;
    my $tmp_filename = $cnfile;

    # get the input filename
    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    require HTML::TokeParser::Simple;

    # create HTML parser to decode the input file
    my $parser = HTML::TokeParser::Simple->new($input_filename)
        || die("Error Reading File: $input_filename $!");

    # Build the cleaned document in memory first.  The parser reads the
    # input as tokens are requested, so opening the output before parsing
    # has finished would empty the input when both names are the same file.
    my $cleaned = "";
    while (my $token = $parser->get_token())
    {
        if ($token->is_start_tag('img'))
        {
            # get the attributes
            my $attr = $token->return_attr;

            if (defined $attr->{src})
            {
                # get the full path to the image
                my $img_file = &util::filename_cat($dirname,$attr->{src});

                # set the width and height attribute, but only when the
                # size could be determined
                my ($width, $height) = imgsize($img_file);
                if (defined $width && defined $height)
                {
                    $attr->{width} = $width;
                    $attr->{height} = $height;
                }
            }

            # recreate the tag.  A "/" entry is not a real attribute --
            # presumably the parser's way of reporting the slash of a
            # self-closing <img ... /> (TODO confirm) -- so it is left out.
            $cleaned .= "<img";
            foreach my $name (grep { $_ ne "/" } keys %$attr)
            {
                $cleaned .= sprintf(' %s="%s"', $name, $attr->{$name});
            }
            $cleaned .= ">";
        }
        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
        {
            # remove font tag: nothing is written
        }
        else
        {
            # keep without changes
            $cleaned .= $token->as_is;
        }
    }

    # write HTML tmp file without the font tags and with image sizes added
    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
    print $prod $cleaned;
    close ($prod) || die("Error Closing File: $tmp_filename $!");

    # run html-tidy on the tmp file to make it a proper XML file
    # NOTE(review): the file name is passed through the shell inside double
    # quotes; a name containing a double quote or shell metacharacters
    # would break this command -- confirm such names cannot reach here.
    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;

    if (defined $tidyfile && $tidyfile ne "")
    {
        # write result back to the tmp file
        open ($prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
        print $prod $tidyfile;
        close ($prod) || die("Error Closing File: $tmp_filename $!");
    }
    else
    {
        # tidy is missing or produced nothing: keep the untidied file
        # rather than replacing it with an empty one
        print STDERR "HTMLPlugin: WARNING - tidy produced no output for $tmp_filename\n";
    }

    # return the output filename
    return $tmp_filename;
}
1547
15481;
Note: See TracBrowser for help on using the repository browser.