source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil and BasePlugin to BaseImporter. The idea is that only top-level plugins that you can specify in your collection get to have "Plugin" in their name. Modified all other plugins to reflect these name changes.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 70.3 KB
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28 # Note that this plugin handles frames only in a very simple way,
29 # i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use Encode;
39use Unicode::Normalize 'normalize';
40
41use ReadTextFile;
42use HBPlugin;
43use ghtml;
44use unicode;
45use util;
46use FileUtils;
47use XMLParser;
48
49use File::Copy;
50
51sub BEGIN {
52 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
53 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
54 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); # for Image/Size.pm
55}
56
57use Image::Size;
58
59use strict; # every perl program should have this!
60no strict 'refs'; # make an exception so we can use variables as filehandles
61
62my $arguments =
63 [ { 'name' => "process_exp",
64 'desc' => "{BaseImporter.process_exp}",
65 'type' => "regexp",
66 'deft' => &get_default_process_exp() },
67 { 'name' => "block_exp",
68 'desc' => "{BaseImporter.block_exp}",
69 'type' => 'regexp',
70 'deft' => &get_default_block_exp() },
71 { 'name' => "nolinks",
72 'desc' => "{HTMLPlugin.nolinks}",
73 'type' => "flag" },
74 { 'name' => "keep_head",
75 'desc' => "{HTMLPlugin.keep_head}",
76 'type' => "flag" },
77 { 'name' => "no_metadata",
78 'desc' => "{HTMLPlugin.no_metadata}",
79 'type' => "flag" },
80 { 'name' => "metadata_fields",
81 'desc' => "{HTMLPlugin.metadata_fields}",
82 'type' => "string",
83 'deft' => "Title" },
84 { 'name' => "metadata_field_separator",
85 'desc' => "{HTMLPlugin.metadata_field_separator}",
86 'type' => "string",
87 'deft' => "" },
88 { 'name' => "hunt_creator_metadata",
89 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
90 'type' => "flag" },
91 { 'name' => "file_is_url",
92 'desc' => "{HTMLPlugin.file_is_url}",
93 'type' => "flag" },
94 { 'name' => "assoc_files",
95 'desc' => "{HTMLPlugin.assoc_files}",
96 'type' => "regexp",
97 'deft' => &get_default_block_exp() },
98 { 'name' => "rename_assoc_files",
99 'desc' => "{HTMLPlugin.rename_assoc_files}",
100 'type' => "flag" },
101 { 'name' => "title_sub",
102 'desc' => "{HTMLPlugin.title_sub}",
103 'type' => "string",
104 'deft' => "" },
105 { 'name' => "description_tags",
106 'desc' => "{HTMLPlugin.description_tags}",
107 'type' => "flag" },
108 # retain this for backward compatibility (w3mir option was replaced by
109 # file_is_url)
110 { 'name' => "w3mir",
111# 'desc' => "{HTMLPlugin.w3mir}",
112 'type' => "flag",
113 'hiddengli' => "yes"},
114 { 'name' => "no_strip_metadata_html",
115 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
116 'type' => "string",
117 'deft' => "",
118 'reqd' => "no"},
119 { 'name' => "sectionalise_using_h_tags",
120 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
121 'type' => "flag" },
122 { 'name' => "use_realistic_book",
123 'desc' => "{HTMLPlugin.tidy_html}",
124 'type' => "flag"},
125 { 'name' => "old_style_HDL",
126 'desc' => "{HTMLPlugin.old_style_HDL}",
127 'type' => "flag"},
128 {'name' => "processing_tmp_files",
129 'desc' => "{BaseImporter.processing_tmp_files}",
130 'type' => "flag",
131 'hiddengli' => "yes"}
132 ];
133
134my $options = { 'name' => "HTMLPlugin",
135 'desc' => "{HTMLPlugin.desc}",
136 'abstract' => "no",
137 'inherits' => "yes",
138 'args' => $arguments };
139
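# Illustrative usage (not part of the original file): in a collection's collect.cfg
# this plugin would typically be listed with any of the options declared above, e.g.
#
#    plugin HTMLPlugin -description_tags -metadata_fields Title,dc.Title<Title>
#
# The option names and defaults are those given in $arguments; the example values
# here are hypothetical.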
140
141sub new {
142 my ($class) = shift (@_);
143 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
144 push(@$pluginlist, $class);
145
146 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
147 push(@{$hashArgOptLists->{"OptList"}},$options);
148
149
150 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
151
152 if ($self->{'w3mir'}) {
153 $self->{'file_is_url'} = 1;
154 }
155 $self->{'aux_files'} = {};
156 $self->{'dir_num'} = 0;
157 $self->{'file_num'} = 0;
158
159 return bless $self, $class;
160}
161
162 # may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
163 # if we have e.g. <script language="javascript" src="img/lib.js@123">
164 # blocking is now done by reading through the file and recording all the
165 # images and other files it references
166sub get_default_block_exp {
167 my $self = shift (@_);
168
169 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
170 return "";
171}
172
173sub get_default_process_exp {
174 my $self = shift (@_);
175
176 # the last option is an attempt to encode the concept of an html query ...
177 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
178}
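# Illustrative examples (not from the original source) of names accepted by the
# default process_exp above: "index.html", "page.shtml", "report.php3", and
# query-style names such as "search.cgi?q=fish" (caught by the final .+\?.+=.* option).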
179
180sub store_block_files
181{
182 my $self =shift (@_);
183 my ($filename_full_path, $block_hash) = @_;
184
185 my $html_fname = $filename_full_path;
186
187 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
188 $self->{'store_content_encoding'}->{$filename_full_path} = [$content_encoding, $language];
189
190
191 # read in file ($text will be in the filesystem encoding)
192 my $raw_text = "";
193 $self->read_file_no_decoding($filename_full_path, \$raw_text);
194
195 my $textref = \$raw_text;
196 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
197 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
198 $$textref =~ s/$opencom(.*?)$closecom//gs;
199
200 # Convert entities to their UTF8 equivalents
201 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
202 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
203 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
204
205 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
206 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
207 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
208 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
209 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
210 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
211 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
212
213 if(!defined $self->{'unicode_to_original_filename'}) {
214 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
215 $self->{'unicode_to_original_filename'} = {};
216 }
217
218 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
219
220 # remove quotes from link at start and end if necessary
221 if ($raw_link =~ m/^\"/) {
222 $raw_link =~ s/^\"//;
223 $raw_link =~ s/\"$//;
224 }
225
226 # remove any anchor names, e.g. foo.html#name becomes foo.html
227 # but watch out for any #'s that are part of entities, such as &#x3B1;
228 $raw_link =~ s/([^&])\#.*$/$1/s;
229
230 # some links may just be anchor names
231 next unless ($raw_link =~ /\S+/);
232
233 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
234 # Turn relative file path into full path
235 my $dirname = &File::Basename::dirname($filename_full_path);
236 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
237 }
238 $raw_link = $self->eval_dir_dots($raw_link);
239
240 # this is the actual filename on the filesystem (that the link refers to)
241 my $url_original_filename = $self->opt_url_decode($raw_link);
242
243 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
244
245 if ($exceeds_bytecodes) {
246 # We have a link to a file name that is more complicated than a raw byte filename
247 # What we do next depends on the operating system we are on
248
249 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
250 # Assume we're dealing with a UTF-8 encoded filename
251 $url_original_filename = encode("utf8", $url_original_filename);
252 }
253 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
254 # HFS+ is UTF8 with decomposition
255 $url_original_filename = encode("utf8", $url_original_filename);
256 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
257 }
258 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
259 # Don't need to do anything as later code maps Windows
260 # unicode filenames to DOS short filenames when needed
261 }
262 else {
263 my $outhandle = $self->{'outhandle'};
264 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
265 print $outhandle " in raw file system encoding of: $raw_link\n";
266 print $outhandle " Assuming filesystem is UTF-8 based.\n";
267 $url_original_filename = encode("utf8", $url_original_filename);
268 }
269 }
270
271 # Convert the (currently raw) link into its Unicode version.
272 # Store the Unicode link along with the url_original_filename
273 my $unicode_url_original_filename = "";
274 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
275
276
277 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
278
279
280 if ($url_original_filename ne $unicode_url_original_filename) {
281 my $outhandle = $self->{'outhandle'};
282
283 print $outhandle "URL Encoding $url_original_filename\n";
284 print $outhandle " ->$unicode_url_original_filename\n";
285
286 # make sure not to block the file itself, as happens when an html file links to itself
287 # e.g. if the current file is mary-boleyn/index.html and contains <link rel="canonical" href="index.html" />
288 my $unicode_html_fname = "";
289 $self->decode_text($html_fname,$content_encoding,$language,\$unicode_html_fname);
290 if($unicode_url_original_filename ne $unicode_html_fname) {
291 # Allow for possibility of raw byte version and Unicode versions of file
292 $self->block_filename($block_hash,$unicode_url_original_filename);
293 }
294 }
295
296 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
297 # TODO now use unicode files in block, do we need this??
298 #$self->block_raw_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname;
299
300 # but only add the linked file to the blocklist if the current html file does not link to itself
301
302 }
303}
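# Illustrative example (hypothetical filenames): if import/manual/index.html contains
# <img src="images/fig%20one.png">, the link above is resolved against the HTML file's
# directory and URL-decoded by opt_url_decode() to import/manual/images/fig one.png.
# That name is recorded in 'unicode_to_original_filename' and, where the Unicode and
# raw-byte forms differ, blocked so the image is treated as an associated file rather
# than as a document in its own right.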
304
305# Given a filename in any encoding, will URL decode it to get back the original filename
306# in the original encoding. Because this method is intended to work out the *original*
307 # filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
308# string already exists in the local folder.
309#
310sub opt_url_decode {
311 my $self = shift (@_);
312 my ($raw_link) = @_;
313
314
315 # Replace %XX's in URL with decoded value if required.
316 # Note that the filename may include the %XX in some situations
317
318## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
319
320 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
321 if (!-e $raw_link) {
322 $raw_link = &unicode::url_decode($raw_link,1);
323 }
324 }
325
326 return $raw_link;
327}
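# Illustrative example (hypothetical filename): for a link such as "caf%C3%A9.png",
# opt_url_decode() returns the decoded name "café.png" unless a file literally named
# "caf%C3%A9.png" already exists on disk, in which case the link is returned unchanged.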
328
329sub read_into_doc_obj
330{
331 my $self = shift (@_);
332 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
333
334 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
335
336 # Look up the content_encoding and language worked out in the file_block pass for this file.
337 # Store them under local names so they are nice and easy to access.
338 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}[0];
339 $self->{'language'} = $self->{'store_content_encoding'}->{$filename_full_path}[1];
340
341 # get the input file
342 my $input_filename = $file;
343 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
344 $suffix = lc($suffix);
345 my $tidy_filename;
346 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
347 {
348 # because the document has to be sectionalised, set the description tags
349 $self->{'description_tags'} = 1;
350
351 # set the file to be tidied
352 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
353
354 # get the tidied file
355 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
356 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
357
358 # derive tmp filename from input filename
359 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
360
361 # set the new input file and base_dir to be from the tidied file
362 $file = "$tailname$suffix";
363 $base_dir = $dirname;
364 }
365
366 # call the parent read_into_doc_obj
367 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
368 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
369 {
370 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
371 my $collect_file = &util::filename_within_collection($filename_full_path);
372 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
373 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
374 # build. So set it manually.
375 $doc_obj->set_source_path($filename_full_path);
376 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
377 $doc_obj->set_converted_filename($collect_conv_file);
378
379 my $plugin_filename_encoding = $self->{'filename_encoding'};
380 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
381 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
382 }
383
384 delete $self->{'store_content_encoding'}->{$filename_full_path};
385 $self->{'content_encoding'} = undef;
386
387 return ($process_status,$doc_obj);
388}
389
390# do plugin specific processing of doc_obj
391sub process {
392 my $self = shift (@_);
393 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
394 my $outhandle = $self->{'outhandle'};
395
396 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
397 # this makes life so much easier... perl can cope with unix-style '/'s.
398 $base_dir =~ s@(\\)+@/@g;
399 $file =~ s@(\\)+@/@g;
400 }
401
402 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
403 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
404 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
405
406 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
407 # And again
408 $upgraded_base_dir =~ s@(\\)+@/@g;
409 $upgraded_filename =~ s@(\\)+@/@g;
410
411 # Need to make sure there is a '/' on the end of upgraded_base_dir
412 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
413 $upgraded_base_dir .= "/";
414 }
415 }
416 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
417
418 # reset per-doc stuff...
419 $self->{'aux_files'} = {};
420 $self->{'dir_num'} = 0;
421 $self->{'file_num'} = 0;
422
423 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
424 # you can also include metadata in the format (X can be any number)
425 # <hX>Title<!--gsdl-metadata
426 # <Metadata name="name1">value1</Metadata>
427 # ...
428 # <Metadata name="nameN">valueN</Metadata>
429 #--></hX>
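# An illustrative (hypothetical) heading using the convention described above:
# <h1>Chapter One<!--gsdl-metadata
# <Metadata name="Creator">J. Smith</Metadata>
# --></h1>
# opens a new Section titled "Chapter One" carrying the Creator metadata.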
430 if ($self->{'sectionalise_using_h_tags'}) {
431 # description_tags should always be activated because we convert headings to description tags
432 $self->{'description_tags'} = 1;
433
434 my $arrSections = [];
435 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
436
437 if (scalar(@$arrSections)) {
438 my $strMetadata = $self->update_section_data($arrSections, -1);
439 if (length($strMetadata)) {
440 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
441 $$textref =~ s/<\/body>/$strMetadata/ig;
442 }
443 }
444 }
445
446 my $cursection = $doc_obj->get_top_section();
447
448 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
449 unless $self->{'no_metadata'} || $self->{'description_tags'};
450
451 # Store URL for page as metadata - this can be used for an
452 # altavista style search interface. The URL won't be valid
453 # unless the file structure contains the domain name (i.e.
454 # like when w3mir is used to download a website).
455
456 # URL metadata (even invalid ones) are used to support internal
457 # links, so even if 'file_is_url' is off, still need to store info
458
459 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
460
461# my $utf8_file = $self->filename_to_utf8_metadata($file);
462# $utf8_file =~ s/&\#095;/_/g;
463# variable below used to be utf8_file
464
465 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
466 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
467
468 my $web_url = "http://";
469 my $utf8_web_url = "http://";
470
471 if(defined $dirname) { # local directory
472
473 # Check for "ftp" in the domain name of the directory
474 # structure to determine if this URL should be a ftp:// URL
475 # This check is not infallible, but better than omitting the
476 # check, which would cause all files downloaded from ftp sites
477 # via mirroring with wget to have potentially erroneous http:// URLs
478 # assigned in their metadata
479 if ($dirname =~ /^[^\/]*ftp/i)
480 {
481 $web_url = "ftp://";
482 $utf8_web_url = "ftp://";
483 }
484 $dirname = $self->eval_dir_dots($dirname);
485 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
486
487 # this local directory in import may need to be URL encoded like the file
488 my $url_encoded_dir = &unicode::raw_filename_to_url_encoded($dirname);
489 my $utf8_url_encoded_dir = &unicode::raw_filename_to_utf8_url_encoded($dirname);
490
491 # changed here
492 $web_url = $web_url.$url_encoded_dir.$url_encoded_file;
493 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_dir.$utf8_url_encoded_file;
494 } else {
495 $web_url = $web_url.$url_encoded_file;
496 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
497 }
498 $web_url =~ s/\\/\//g;
499 $utf8_web_url =~ s/\\/\//g;
500
501 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
502 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
503 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
504 print STDERR "*******DEBUG: web url: $web_url\n";
505 print STDERR "*******DEBUG: utf8 web url: $utf8_web_url\n";
506 }
507
508
509 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
510 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
511
512 if ($self->{'file_is_url'}) {
513 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
514 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
515 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
516 }
517
518 if ($self->{'description_tags'}) {
519 # remove the html header - note that doing this here means any
520 # sections defined within the header will be lost (so all <Section>
521 # tags must appear within the body of the HTML)
522 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
523
524 $$textref =~ s/^.*?<body[^>]*>//is;
525 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
526
527 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
528 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
529
530 my $lt = '(?:<|&lt;)';
531 my $gt = '(?:>|&gt;)';
532 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
533
534 my $dont_strip = '';
535 if ($self->{'no_strip_metadata_html'}) {
536 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
537 }
538
539 my $found_something = 0; my $top = 1;
540 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
541 my $text = $1;
542 my $comment = $2;
543 if (defined $text) {
544 # text before a comment - note that getting to here
545 # doesn't necessarily mean there are Section tags in
546 # the document
547 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
548 }
549 while ($comment =~ s/$lt(.*?)$gt//s) {
550 my $tag = $1;
551 if ($tag eq "Section") {
552 $found_something = 1;
553 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
554 $top = 0;
555 } elsif ($tag eq "/Section") {
556 $found_something = 1;
557 $cursection = $doc_obj->get_parent_section ($cursection);
558 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
559 my $metaname = $1;
560 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
561 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
562 my $metavalue = $1;
563 $metavalue =~ s/^\s+//;
564 $metavalue =~ s/\s+$//;
565 # assume that no metadata value intentionally includes
566 # carriage returns or HTML tags (if they're there they
567 # were probably introduced when converting to HTML from
568 # some other format).
569 # actually some people want to have html tags in their
570 # metadata.
571 $metavalue =~ s/[\cJ\cM]/ /sg;
572 $metavalue =~ s/<[^>]+>//sg
573 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
574 $metavalue =~ s/\s+/ /sg;
575 if ($metaname =~ /\./) { # has a namespace
576 $metaname = "ex.$metaname";
577 }
578 if ($accumulate) {
579 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
580 } else {
581 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
582 }
583 } elsif ($tag eq "Description" || $tag eq "/Description") {
584 # do nothing with containing Description tags
585 } else {
586 # simple HTML tag (probably created by the conversion
587 # to HTML from some other format) - we'll ignore it and
588 # hope for the best ;-)
589 }
590 }
591 }
592 if ($cursection ne "") {
593 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
594 }
595
596 $$textref =~ s/^.*?<body[^>]*>//is;
597 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
598 if ($$textref =~ m/\S/) {
599 if (!$found_something) {
600 if ($self->{'verbosity'} > 2) {
601 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
602 print $outhandle " will be processed as a single section document\n";
603 }
604
605 # go ahead and process single-section document
606 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
607
608 # if document contains no Section tags we'll go ahead
609 # and extract metadata (this won't have been done
610 # above as the -description_tags option prevents it)
611 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
612 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
613 unless $self->{'no_metadata'};
614
615 } else {
616 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
617 print $outhandle " of the final closing </Section> tag. This text will\n";
618 print $outhandle " be ignored.";
619
620 my ($text);
621 if (length($$textref) > 30) {
622 $text = substr($$textref, 0, 30) . "...";
623 } else {
624 $text = $$textref;
625 }
626 $text =~ s/\n/ /isg;
627 print $outhandle " ($text)\n";
628 }
629 } elsif (!$found_something) {
630
631 if ($self->{'verbosity'} > 2) {
632 # may get to here if document contained no valid Section
633 # tags but did contain some comments. The text will have
634 # been processed already but we should print the warning
635 # as above and extract metadata
636 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
637 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
638 }
639
640 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
641 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
642 unless $self->{'no_metadata'};
643 }
644 $self->replace_section_links($doc_obj);
645 } else {
646
647 # remove header and footer
648 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
649 $$textref =~ s/^.*?<body[^>]*>//is;
650 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
651 }
652
653 $self->{'css_assoc_files'} = {};
654
655 # single section document
656 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
657
658 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
659
660 $self->acquire_css_associated_files($doc_obj, $cursection);
661
662 $self->{'css_assoc_files'} = {};
663 }
664
665 return 1;
666}
667
668
669sub process_heading
670{
671 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
672 $strHeadingText = '' if (!defined($strHeadingText));
673
674 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
675
676 my $strSecMetadata = '';
677 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
678 {
679 $strSecMetadata .= $1;
680 }
681
682 $strHeadingText =~ s/^\s+//g;
683 $strHeadingText =~ s/\s+$//g;
684 $strSecMetadata =~ s/^\s+//g;
685 $strSecMetadata =~ s/\s+$//g;
686
687 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
688
689 if (length($strSecMetadata)) {
690 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
691 }
692
693 $strMetadata .= "\t</Description>\n";
694
695 return "<!--" . $strMetadata . "-->";
696}
697
698
699sub update_section_data
700{
701 my ($self, $arrSections, $nCurTocNo) = @_;
702 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
703
704 if ($nSections == 0) {
705 push @$arrSections, $nCurTocNo;
706 return $strBuffer;
707 }
708 $nLast = $arrSections->[$nSections - 1];
709 if ($nCurTocNo > $nLast) {
710 push @$arrSections, $nCurTocNo;
711 return $strBuffer;
712 }
713 for(my $i = $nSections - 1; $i >= 0; $i--) {
714 if ($nCurTocNo <= $arrSections->[$i]) {
715 $strBuffer .= "\n</Section>";
716 pop @$arrSections;
717 }
718 }
719 push @$arrSections, $nCurTocNo;
720 return $strBuffer;
721}
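# Worked example (illustrative): if $arrSections holds [1, 2] (an open <h1> section with
# an open <h2> inside it) and the next heading is another <h2>, the loop above emits one
# closing </Section>, pops the old level and pushes the new one, leaving [1, 2]. A
# following <h1> would instead close both open sections before being pushed.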
722
723
724# note that process_section may be called multiple times for a single
725# section (relying on the fact that add_utf8_text appends the text to any
726# that may exist already).
727sub process_section {
728 my $self = shift (@_);
729 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
730
731 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
732 if(scalar(@styleTagsText) > 0)
733 {
734 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
735 foreach my $styleText (@styleTagsText)
736 {
737 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
738 }
739 }
740
741 # trap links
742 if (!$self->{'nolinks'}) {
743 # usemap="./#index" not handled correctly => change to "#index"
744## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
745
746## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
747## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
748
749 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
750 $self->replace_usemap_links($1, $2, $3)/isge;
751
752 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
753 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
754
755## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
756# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
757 }
758
759 # trap images
760
761 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags,
762 # i.e. <a href="image"><img src="image"></a>, in order to overcome a problem that
763 # turned regular text following images into links. That is, by embedding <img>s
764 # inside <a href=""></a>, the text following images no longer misbehaved.
765 # However, there were many occasions where an image was not meant to link
766 # to its source image but to another web page instead.
767 # To allow this, the no_image_links option was introduced: it prevented
768 # the behaviour of embedding images into links that referenced the source images.
769 
770 # Somewhere along the line, the problem of normal text turning into links when
771 # that text followed images not embedded in <a href=""></a> ceased
772 # to occur. This is why the following lines have been commented out (as well as
773 # two lines in replace_images). They appear to no longer apply.
774 
775 # If, at any time, there is a need for having images embedded in <a> anchor tags,
776 # then it might be better to turn that into an HTMLPlugin option rather than make
777 # it the default behaviour. Also, eventually, no_image_links needs to become
778 # a deprecated option for HTMLPlugin, as it has now become the default behaviour.
779
780 #if(!$self->{'no_image_links'}){
781 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
782 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
783 #}
784
785 # add text to document object
786 # turn \ into \\ so that the rest of greenstone doesn't think there
787 # is an escape code following. (Macro parsing loses them...)
788 $$textref =~ s/\\/\\\\/go;
789
790 $doc_obj->add_utf8_text($cursection, $$textref);
791}
792
793sub replace_images {
794 my $self = shift (@_);
795 my ($front, $link, $back, $base_dir,
796 $file, $doc_obj, $section) = @_;
797
798 # remove quotes from link at start and end if necessary
799 if ($link=~/^[\"\']/) {
800 $link=~s/^[\"\']//;
801 $link=~s/[\"\']$//;
802 $front.='"';
803 $back="\"$back";
804 }
805
806 $link =~ s/\n/ /g;
807
808 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
809 # If the Word file path has spaces in it, wv messes up and you end up with
810 # absolute paths for the images, and without the "file://" prefix
811 # So check for this special case and massage the data to be correct
812 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
813 $link =~ s/^.*\\([^\\]+)$/$1/;
814 }
815
816 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
817
818 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
819
820# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n\n";
821
822 my $anchor_name = $img_file;
823 #$anchor_name =~ s/^.*\///;
824 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
825
826 my $image_link = $front . $img_file .$back;
827 return $image_link;
828
829 # The reasons for why the following two lines are no longer necessary can be
830 # found in subroutine process_section
831 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
832 #return $anchor_link;
833
834 #return $front . $img_file . $back . $anchor_name;
835}
836
837sub replace_href_links {
838 my $self = shift (@_);
839 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
840
841 if($front =~ m/^<link / && $link =~ m/\.css"$/)
842 {
843 my $actual_link = $link;
844 $actual_link =~ s/^"(.*)"$/$1/;
845
846 my $directory = &File::Basename::dirname($file);
847
848 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
849 $self->retrieve_css_associated_files($css_filename);
850 }
851
852 # remove quotes from link at start and end if necessary
853 if ($link=~/^[\"\']/) {
854 $link=~s/^[\"\']//;
855 $link=~s/[\"\']$//;
856 $front.='"';
857 $back="\"$back";
858 }
859
860 # can't remember adding this :-( must have had a reason though...
861 if ($link =~ /^\_http/ || $link =~ /^\_libraryname\_/) {
862 # assume it is a greenstone one and leave alone
863 return $front . $link . $back;
864 }
865
866 # attempt to sort out targets - frames are not handled
867 # well in this plugin and some cases will screw things
868 # up - e.g. the _parent target (so we'll just remove
869 # them all ;-)
870 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
871 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
872 $front =~ s/target=\"?_parent\"?//is;
873 $back =~ s/target=\"?_parent\"?//is;
874
875 if($link =~ m/^\#/s)
876 {
877 return $front . "_httpsamepagelink_" . $link . $back;
878 }
879
880 $link =~ s/\n/ /g;
881
882 # Find file referred to by $link on file system
883 # This is more complicated than it sounds when char encodings
884 # is taken in to account
885 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
886
887 # href may use '\'s where '/'s should be on Windows
888 $href =~ s/\\/\//g;
889 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
890
891 ##### leave all these links alone (they won't be picked up by intermediate
892 ##### pages). I think that's safest when dealing with frames, targets etc.
893 ##### (at least until I think of a better way to do it). Problems occur with
894 ##### mailto links from within small frames, the intermediate page is displayed
895 ##### within that frame and can't be seen. There is still potential for this to
896 ##### happen even with html pages - the solution seems to be to somehow tell
897 ##### the browser from the server side to display the page being sent (i.e.
898 ##### the intermediate page) in the top level window - I'm not sure if that's
899 ##### possible - the following line should probably be deleted if that can be done
900 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
901
902 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
903 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
904
905 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
906
907 # Don't do any encoding for now, as not clear what
908 # the right thing to do is to support filename
909 # encoding on Windows when they are not UTF16
910 #
911 }
912 else {
913 # => Unix-based system
914
915 # If web page didn't give encoding, then default to utf8
916 my $content_encoding= $self->{'content_encoding'} || "utf8";
917
918 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
919 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
920 }
921
922 # on Darwin, the unicode filenames are stored on the file
923 # system in decomposed form, so any href link (including when
924 # URL-encoded) should refer to the decomposed name of the file
925 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
926 $href = normalize('D', $href); # Normalization Form D (decomposition)
927 }
928
929 $href = encode($content_encoding,$href);
930 }
931
932 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
933 $href = &unicode::filename_to_url($href);
934
935 &ghtml::urlsafe ($href);
936
937 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
938 print STDERR "******DEBUG: href=$href\n";
939 }
940
941 #TODO here
942# if ($rl ==1) {
943 # have a relative link, we need to do URL encoding etc so it matches what has happened for that file
944 #$href = &util::rename_file($href, $self->{'file_rename_method'});
945# $href = &unicode::raw_filename_to_url_encoded($href);
946 # then, this might be url encoded, so we replace % with %25
947# $href = &unicode::filename_to_url($href);
948# print STDERR "DEBUG: url encoded href = $href\n";
949# }
950
951 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
952 } else {
953 # link is to some other type of file (e.g., an image) so we'll
954 # need to associate that file
955 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
956 }
957}
958
959sub retrieve_css_associated_files {
960 my $self = shift (@_);
961 my ($css_filename) = @_;
962
963 my $css_filename_dirname = &File::Basename::dirname($css_filename);
964
965 open (CSSFILE, $css_filename) || return;
966 sysread (CSSFILE, my $file_string, -s CSSFILE);
967
968 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
969
970 close CSSFILE;
971}
972
973sub acquire_css_associated_files_from_text_block {
974 my $self = shift (@_);
975 my ($text, $css_filename_dirname) = @_;
976
977 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
978 foreach my $img_url (@image_urls)
979 {
980 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
981 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
982
983 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
984 }
985}
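# Illustrative example (hypothetical stylesheet content): a rule such as
#    body { background-image: url("images/bg.png"); }
# found in a <style> block or a linked .css file causes images/bg.png (resolved against
# the stylesheet's directory) to be remembered in 'css_assoc_files' and later attached
# to the document by acquire_css_associated_files() below.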
986
987sub acquire_css_associated_files {
988 my $self = shift(@_);
989
990 my ($doc_obj, $section) = @_;
991
992 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
993 {
994 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
995 }
996}
997
998sub add_file {
999 my $self = shift (@_);
1000 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1001 my ($newname);
1002
1003 my $filename = $href;
1004 if ($base_dir eq "") {
1005 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1006 # remove http://
1007 $filename =~ s/^[^:]*:\/\///;
1008 }
1009 else {
1010 # remove http:/ thereby leaving one slash at the start as
1011 # part of full pathname
1012 $filename =~ s/^[^:]*:\///;
1013 }
1014 }
1015 else {
1016 # remove http://
1017 $filename =~ s/^[^:]*:\/\///;
1018 }
1019
1020 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1021 $filename =~ s@\/@\\@g;
1022 }
1023
1024 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
1025
1026 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
1027 # we are processing a tidytmp file - want paths to be in import
1028 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1029 }
1030
1031 # Replace %XX's in URL with decoded value if required. Note that the
1032 # filename may include the %XX in some situations. If the *original*
1033 # file's name was in URL encoding, the following method will not decode
1034 # it.
1035 my $unicode_filename = $filename;
1036 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
1037
1038 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
1039 # with no accompanying file
1040 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1041
1042 my $content_encoding= $self->{'content_encoding'} || "utf8";
1043
1044 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1045 # The filenames that come through the HTML file have been decoded
1046 # into Unicode aware Perl strings. Need to convert them back
1047 # to their initial raw-byte encoding to match the file that
1048 # exists on the file system
1049 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1050
1051 }
1052 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1053 # HFS+ is UTF8 with decomposition
1054 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1055 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
1056
1057 }
1058 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1059 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1060 
1061 if (defined $long_filename) {
1062 my $short_filename = Win32::GetShortPathName($long_filename); # DOS 8.3 short form, as noted in store_block_files
1063 $filename = $short_filename;
1064 }
1065# else {
1066# print STDERR "***** failed to map href to real file:\n";
1067# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1068# }
1069 }
1070 else {
1071 my $outhandle = $self->{'outhandle'};
1072 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1073 print $outhandle " in file system encoding of href: $href\n";
1074 print $outhandle " No character encoding done.\n";
1075 }
1076
1077
1078 # some special processing if the intended filename was converted to utf8, but
1079 # the actual file still needs to be renamed
1080 if (!&FileUtils::fileExists($filename)) {
1081 # try the original filename stored in map
1082 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1083 print STDERR "******!! orig filename did not exist: $filename\n";
1084 }
1085
1086## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
1087
1088 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
1089
1090 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1091 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
1092 }
1093
1094 if (defined $original_filename && -e $original_filename) {
1095 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
1096 print STDERR "****** Found match!\n";
1097 }
1098 $filename = $original_filename;
1099 }
1100 }
1101
1102 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1103
1104 if ($rl == 0) {
1105 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1106 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
1107 }
1108 else {
1109 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
1110 }
1111 }
1112
1113 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1114 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
1115 }
1116 # add the original image file as a source file
1117 if (!$self->{'processing_tmp_files'} ) {
1118 $doc_obj->associate_source_file($filename);
1119 }
1120 if ($self->{'rename_assoc_files'}) {
1121 if (defined $self->{'aux_files'}->{$href}) {
1122 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1123 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1124 } else {
1125 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1126 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1127 $self->inc_filecount ();
1128 }
1129 $doc_obj->associate_file($filename, $newname, undef, $section);
1130 return "_httpdocimg_/$newname";
1131 } else {
1132 if(&unicode::is_url_encoded($unicode_filename)) {
1133 # use the possibly-decoded filename instead to avoid double URL encoding
1134 ($newname) = $filename =~ m/([^\/\\]*)$/;
1135 } else {
1136 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
1137 }
1138
1139 # Make sure this name uses only ASCII characters.
1140 # We use either base64 or URL encoding, as these preserve original encoding
1141 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
1142
1143### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
1144 $doc_obj->associate_file($filename, $newname, undef, $section);
1145
1146 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1147 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1148 my $newname_url = $newname;
1149 $newname_url = &unicode::filename_to_url($newname_url);
1150 return "_httpdocimg_/$newname_url";
1151 }
1152}
1153
1154sub replace_section_links {
1155 my $self = shift(@_);
1156 my ($doc_obj) = @_;
1157 my %anchors;
1158 my $top_section = $doc_obj->get_top_section();
1159 my $thissection = $doc_obj->get_next_section($top_section);
1160 while ( defined $thissection ) {
1161 my $text = $doc_obj->get_text($thissection);
1162 while ( $text =~ /(?:(?:id|name)\s*=\s*[\'\"])([^\'\"]+)/gi ) {
1163 $anchors{$1} = $thissection;
1164 }
1165 $thissection = $doc_obj->get_next_section($thissection);
1166 }
1167 $thissection = $top_section;
1168 while (defined $thissection) {
1169 my $text = $doc_obj->get_text($thissection);
1170 $text =~ s/(href\s*=\s*[\"\'])(_httpsamepagelink_#)([^\'\"]+)/$self->replace_link_to_anchor($1,$2,$3,$thissection,$anchors{$3})/ige;
1171 $doc_obj->delete_text( $thissection);
1172 $doc_obj->add_utf8_text( $thissection, $text );
1173 $thissection = $doc_obj->get_next_section ($thissection);
1174 }
1175}
1176sub replace_link_to_anchor {
1177 my $self = shift(@_);
1178 my ($href_part,$old_link,$identifier,$current_section,$target_section) = @_;
1179 if (length $target_section && $current_section ne $target_section){
1180 return $href_part . "javascript:goToAnchor(\'" . $target_section . "\',\'" . $identifier . "\');" ;
1181 }
1182 return $href_part . $old_link . $identifier ;
1183}
1184
1185sub format_link {
1186 my $self = shift (@_);
1187 my ($link, $base_dir, $file) = @_;
1188
1189 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1190 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
1191
1192 $hash_part = "" if !defined $hash_part;
1193 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1194 my $outhandle = $self->{'outhandle'};
1195 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1196 if $self->{'verbosity'};
1197 return ($link, "", 0);
1198 }
1199
1200# my $dirname;
1201 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
1202 my $type = $1;
1203 my $before_hash_file = $before_hash;
1204
1205 if ($link =~ m/^(http|ftp):/i) {
1206
1207 # Turn url (using /) into file name (possibly using \ on windows)
1208 my @http_dir_split = split('/', $before_hash_file);
1209 $before_hash_file = &FileUtils::filenameConcatenate(@http_dir_split);
1210 }
1211
1212 # want to maintain two versions of "before_hash": one representing the URL, the other using the filesystem-specific directory separator
1213 $before_hash_file = $self->eval_dir_dots($before_hash_file);
1214 my $before_hash_url = $before_hash_file;
1215 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1216 $before_hash_url =~ s@\\@\/@g;
1217 }
1218
1219 ######## TODO need to check this for encoding stuff
1220 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
1221 print STDERR "checking for existence whether relative link or not $linkfilename\n";
1222 my $rl = 0;
1223 $rl = 1 if (-e $linkfilename);
1224 if (-e $linkfilename) {
1225
1226 print STDERR "DOES exist $linkfilename\n";
1227 } else {
1228 print STDERR "DOESN'T exist $linkfilename\n";
1229 }
1230 # make sure there's a slash on the end if it's a directory
1231 if ($before_hash_url !~ m/\/$/) {
1232 $before_hash_url .= "/" if (-d $linkfilename);
1233 }
1234 return ($type . $before_hash_url, $hash_part, $rl);
1235
1236 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1237
1238 #### TODO what is this test doing???
1239 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1240
1241 # the first directory will be the domain name if file_is_url
1242 # to generate archives, otherwise we'll assume all files are
1243 # from the same site and base_dir is the root
1244
1245 if ($self->{'file_is_url'}) {
1246 my @dirs = split /[\/\\]/, $file;
1247 my $domname = shift (@dirs);
1248 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
1249 $before_hash =~ s@\\@/@g; # for windows
1250 }
1251 else
1252 {
1253 # see if link shares directory with source document
1254 # => turn into relative link if this is so!
1255
1256 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1257 # too difficult doing a pattern match with embedded '\'s...
1258 my $win_before_hash=$before_hash;
1259 $win_before_hash =~ s@(\\)+@/@g;
1260 # $base_dir is already similarly "converted" on windows.
1261 if ($win_before_hash =~ s@^$base_dir/@@o) {
1262 # if this is true, we removed a prefix
1263 $before_hash=$win_before_hash;
1264 }
1265 }
1266 else {
1267 # before_hash has lost leading slash by this point,
1268 # -> add back in prior to substitution with $base_dir
1269 $before_hash = "/$before_hash";
1270
1271 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
1272 $before_hash =~ s@^$base_dir/@@;
1273 }
1274 }
1275 } else {
1276
1277 # Turn relative file path into full path (inside import dir)
1278 my $dirname = &File::Basename::dirname($file);
1279
1280 # we want to add dirname (which is a raw filesystem path) onto $before_hash (which is a Perl unicode-aware string), so convert dirname to a Perl string first
1281 
1282 my $unicode_dirname ="";
1283 # we need to turn the raw filesystem dirname into a Perl unicode-aware string
1284 my $filename_encoding = $self->guess_filesystem_encoding();
1285
1286 # copied this from set_Source_metadata in BaseImporter
1287 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
1288 # Use filename_encoding to map raw filename to a Perl unicode-aware string
1289 $unicode_dirname = decode($filename_encoding,$dirname);
1290 }
1291 else {
1292 # otherwise generate %xx encoded version of filename for char > 127
1293 $unicode_dirname = &unicode::raw_filename_to_url_encoded($dirname);
1294 }
1295 print STDERR "unicode dirname = ".&unicode::debug_unicode_string($unicode_dirname)."\n";
1296 $before_hash = &FileUtils::filenameConcatenate($unicode_dirname, $before_hash);
1297 $before_hash = $self->eval_dir_dots($before_hash);
1298 $before_hash =~ s@\\@/@g; # for windows
1299 }
1300
1301 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
1302
1303 # make sure there's a slash on the end if it's a directory
1304 if ($before_hash !~ m/\/$/) {
1305 $before_hash .= "/" if (-d $linkfilename);
1306 }
1307 return ("http://" . $before_hash, $hash_part, 1);
1308 } else {
1309 # mailto, news, nntp, telnet, javascript or gopher link
1310 return ($before_hash, "", 0);
1311 }
1312}
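# Illustrative examples (hypothetical inputs) of what format_link() returns:
#    "http://www.example.com/a/b.html#sec" -> ("http://www.example.com/a/b.html", "#sec", $rl),
#        where $rl is 1 only if the corresponding file exists under $base_dir (a mirrored site);
#    a relative link "images/pic.png" inside mysite/index.html
#        -> ("http://mysite/images/pic.png", "", 1), i.e. treated as an internal link.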
1313
1314sub extract_first_NNNN_characters {
1315 my $self = shift (@_);
1316 my ($textref, $doc_obj, $thissection) = @_;
1317
1318 foreach my $size (split /,/, $self->{'first'}) {
1319 my $tmptext = $$textref;
1320 # skip to the body
1321 $tmptext =~ s/.*<body[^>]*>//i;
1322 # remove javascript
1323 $tmptext =~ s@<script.*?</script>@ @sig;
1324 $tmptext =~ s/<[^>]*>/ /g;
1325 $tmptext =~ s/&nbsp;/ /g;
1326 $tmptext =~ s/^\s+//;
1327 $tmptext =~ s/\s+$//;
1328 $tmptext =~ s/\s+/ /gs;
1329 $tmptext = &unicode::substr ($tmptext, 0, $size);
1330 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1331 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1332 }
1333}
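# Illustrative example (assuming the inherited 'first' option, declared elsewhere, is set
# to e.g. "50,100"): the loop above adds "First50" and "First100" metadata, each holding
# roughly the first N characters of the body text with tags, scripts and extra whitespace
# stripped out.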
1334
1335
1336sub extract_metadata {
1337 my $self = shift (@_);
1338 my ($textref, $metadata, $doc_obj, $section) = @_;
1339 my $outhandle = $self->{'outhandle'};
1340 # if we don't want metadata, we may as well not be here ...
1341 return if (!defined $self->{'metadata_fields'});
1342
1343 my $separator = $self->{'metadata_field_separator'};
1344 if ($separator eq "") {
1345 undef $separator;
1346 }
1347
1348 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1349 # html meta, 'value' is the metadata name for greenstone to use
1350 my %find_fields = ();
1351
1352 my %creator_fields = (); # short-cut for lookups
1353
1354
1355 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1356 $field =~ s/^\s+//; # remove leading whitespace
1357 $field =~ s/\s+$//; # remove trailing whitespace
1358
1359 # support tag<tagname>
1360 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
1361 # "$2" is the user's preferred gs metadata name
1362 $find_fields{lc($1)}=$2; # lc = lowercase
1363 } else { # no <tagname> for mapping
1364 # "$field" is the user's preferred gs metadata name
1365 $find_fields{lc($field)}=$field; # lc = lowercase
1366 }
1367 }
1368
1369 if (defined $self->{'hunt_creator_metadata'} &&
1370 $self->{'hunt_creator_metadata'} == 1 ) {
1371 my @extra_fields =
1372 (
1373 'author',
1374 'author.email',
1375 'creator',
1376 'dc.creator',
1377 'dc.creator.corporatename',
1378 );
1379
1380 # add the creator_metadata fields to search for
1381 foreach my $field (@extra_fields) {
1382 $creator_fields{$field}=0; # add to lookup hash
1383 }
1384 }
1385
1386
1387 # find the header in the html file, which has the meta tags
1388 $$textref =~ m@<head>(.*?)</head>@si;
1389
1390 my $html_header=$1;
1391
1392 # go through every <meta... tag defined in the html and see if it is
1393 # one of the tags we want to match.
1394
1395 # special case for title - we want to remember if its been found
1396 my $found_title = 0;
1397 # this assumes that ">" won't appear. (I don't think it's allowed to...)
1398 $html_header =~ m/^/; # match the start of the string, for \G assertion
1399
1400 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1401 my $metatag=$1;
1402 my ($tag, $value);
1403
1404 # find the tag name
1405 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1406 $tag=$2;
1407 # in case they're not using " or ', but they should...
1408 if (! $tag) {
1409 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1410 $tag=$1;
1411 }
1412
1413 if (!defined $tag) {
1414 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1415 next;
1416 }
1417
1418 # don't need to assign this field if it was passed in from a previous
1419 # (recursive) plugin
1420 if (defined $metadata->{$tag}) {next}
1421
1422 # find the tag content
1423 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1424 $value=$2;
1425
1426 # The following code assigns the metaname to value if value is
1427 # empty. Why would we do this?
1428 #if (! $value) {
1429 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1430 # $value=$1;
1431 #}
1432 if (!defined $value || $value eq "") {
1433 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
1434 next;
1435 }
1436
1437 # clean up and add
1438 $value =~ s/\s+/ /gs;
1439 chomp($value); # remove trailing \n, if any
1440 if (exists $creator_fields{lc($tag)}) {
1441 # map this value onto greenstone's "Creator" metadata
1442 $tag='Creator';
1443 } elsif (!exists $find_fields{lc($tag)}) {
1444 next; # don't want this tag
1445 } else {
1446 # get the user's preferred capitalisation
1447 $tag = $find_fields{lc($tag)};
1448 }
1449 if (lc($tag) eq "title") {
1450 $found_title = 1;
1451 }
1452
1453 if ($self->{'verbosity'} > 2) {
1454 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
1455 }
1456
1457 if ($tag =~ /\./) {
1458 # there is a . so has a namespace, add ex.
1459 $tag = "ex.$tag";
1460 }
1461 if (defined $separator) {
1462 my @values = split($separator, $value);
1463 foreach my $v (@values) {
1464 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1465 }
1466 }
1467 else {
1468 $doc_obj->add_utf8_metadata($section, $tag, $value);
1469 }
1470 }
1471
1472 # TITLE: extract the document title
1473 if (exists $find_fields{'title'} && !$found_title) {
1474 # we want a title, and didn't find one in the meta tags
1475 # see if there's a <title> tag
1476 my $title;
1477 my $from = ""; # for debugging output only
1478 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1479 $title = $1;
1480 $from = "<title> tags";
1481 }
1482
1483 if (!defined $title) {
1484 $from = "first 100 chars";
1485 # if no title use first 100 or so characters
1486 $title = $$textref;
1487 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1488 $title =~ s/^.*?<body>//si;
1489 # ignore javascript!
1490 $title =~ s@<script.*?</script>@ @sig;
1491 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1492 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1493 $title =~ s@\r@@g; # remove Windows carriage returns to ensure that titles of pdftohtml docs are consistent (the same 100 chars) across windows and linux
1494 $title = substr ($title, 0, 100);
1495 $title =~ s/\s\S*$/.../;
1496 }
1497 $title =~ s/<[^>]*>/ /g; # remove html tags
1498 $title =~ s/&nbsp;/ /g;
1499 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1500 $title =~ s/\s+/ /gs; # collapse multiple spaces
1501 $title =~ s/^\s*//; # remove leading spaces
1502 $title =~ s/\s*$//; # remove trailing spaces
1503
1504 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1505 $title =~ s/^\s+//s; # in case title_sub introduced any...
1506 $doc_obj->add_utf8_metadata ($section, "Title", $title);
1507 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1508 if ($self->{'verbosity'} > 2);
1509 }
1510
1511 # add FileFormat metadata
1512 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1513
1514 # Special case for metadata names such as tagH1 - this extracts
1515 # the text between the first <H1> and </H1> tags into "H1" metadata.
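# e.g. (illustrative only): configuring the plugin with
#   -metadata_fields "Title,tagH1"
# stores the contents of the document's first <H1>...</H1> pair as "H1"
# metadata, in addition to the normal Title handling above.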
1516
1517 foreach my $field (keys %find_fields) {
1518 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1519 my $tag = $1;
1520 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1521 my $content = $1;
1522 $content =~ s/&nbsp;/ /g;
1523 $content =~ s/<[^>]*>/ /g;
1524 $content =~ s/^\s+//;
1525 $content =~ s/\s+$//;
1526 $content =~ s/\s+/ /gs;
1527 if ($content) {
1528 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1529 $tag =~ s/^tag//i;
1530 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1531 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1532 if ($self->{'verbosity'} > 2);
1533 }
1534 }
1535 }
1536}
1537
1538
1539 # evaluate any "../" as a step up to the parent directory
1540 # evaluate any "./" as the current directory
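# e.g. (illustrative only):
#   $self->eval_dir_dots("import/tmp/../html/./page.html")
# returns "import/html/page.html" (using the platform's directory separator)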
1541sub eval_dir_dots {
1542 my $self = shift (@_);
1543 my ($filename) = @_;
1544 my $dirsep_os = &util::get_os_dirsep();
1545 my @dirsep = split(/$dirsep_os/,$filename);
1546
1547 my @eval_dirs = ();
1548 foreach my $d (@dirsep) {
1549 if ($d eq "..") {
1550 pop(@eval_dirs);
1551
1552 } elsif ($d eq ".") {
1553 # do nothing!
1554
1555 } else {
1556 push(@eval_dirs,$d);
1557 }
1558 }
1559
1560 # Need to fiddle with number of elements in @eval_dirs if the
1561 # first one is the empty string. This is because of a
1562 # modification to FileUtils::filenameConcatenate that suppresses the addition
1563 # of a leading '/' character (or '\' if windows) (intended to help
1564 # filename concatenation with relative paths) if the first entry in the
1565 # array is the empty string. Making the array start with *two*
1566 # empty strings is a way to defeat this "smart" option.
1567 #
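    # e.g. (illustrative only): "/home/user/doc.html" splits into
    # ("", "home", "user", "doc.html"); prepending a second "" makes
    # filenameConcatenate put the leading "/" back on.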
1568 if (scalar(@eval_dirs) > 0) {
1569 if ($eval_dirs[0] eq ""){
1570 unshift(@eval_dirs,"");
1571 }
1572 }
1573
1574 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
1575 return $evaluated_filename;
1576}
1577
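# Tidies up a usemap link: any surrounding quotes are stripped from the link
# and re-attached to the surrounding text, and a leading "./" is removed.
# e.g. (illustrative only): called with ('usemap=', '"./map1"', '>') it
# returns 'usemap="map1">'.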
1578sub replace_usemap_links {
1579 my $self = shift (@_);
1580 my ($front, $link, $back) = @_;
1581
1582 # remove quotes from link at start and end if necessary
1583 if ($link=~/^[\"\']/) {
1584 $link=~s/^[\"\']//;
1585 $link=~s/[\"\']$//;
1586 $front.='"';
1587 $back="\"$back";
1588 }
1589
1590 $link =~ s/^\.\///;
1591 return $front . $link . $back;
1592}
1593
1594sub inc_filecount {
1595 my $self = shift (@_);
1596
1597 if ($self->{'file_num'} == 1000) {
1598 $self->{'dir_num'} ++;
1599 $self->{'file_num'} = 0;
1600 } else {
1601 $self->{'file_num'} ++;
1602 }
1603}
1604
1605
1606# Extend read_file so that strings like &eacute; are
1607# converted to UTF8 internally.
1608#
1609 # We don't convert &lt; or &gt; or &amp; or &quot; or &nbsp; in case
1610# they interfere with the GML files
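# (The temporary &z... renaming below is just a guard so those entities
# survive the general conversion.)
# e.g. (illustrative only): "caf&eacute; &lt;b&gt;" comes back from this sub
# as "café &lt;b&gt;": the accented character is decoded, the markup
# entities are left alone.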
1611
1612sub read_file {
1613 my $self = shift(@_);
1614 my ($filename, $encoding, $language, $textref) = @_;
1615
1616 $self->SUPER::read_file($filename, $encoding, $language, $textref);
1617
1618 # Convert entities to their Unicode code-point equivalents
1619 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1620 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
1621 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1622
1623}
1624
1625sub HB_read_html_file {
1626 my $self = shift (@_);
1627 my ($htmlfile, $text) = @_;
1628
1629 # load in the file
1630 if (!open (FILE, $htmlfile)) {
1631 print STDERR "ERROR - could not open $htmlfile\n";
1632 return;
1633 }
1634
1635 my $foundbody = 0;
1636 $self->HB_gettext (\$foundbody, $text, "FILE");
1637 close FILE;
1638
1639 # just in case there was no <body> tag
1640 if (!$foundbody) {
1641 $foundbody = 1;
1642 open (FILE, $htmlfile) || return;
1643 $self->HB_gettext (\$foundbody, $text, "FILE");
1644 close FILE;
1645 }
1646 # text is in utf8
1647}
1648
1649 # converts the text to utf8, as ghtml does for &eacute; etc.
1650sub HB_gettext {
1651 my $self = shift (@_);
1652 my ($foundbody, $text, $handle) = @_;
1653
1654 my $line = "";
1655 while (defined ($line = <$handle>)) {
1656 # look for body tag
1657 if (!$$foundbody) {
1658 if ($line =~ s/^.*<body[^>]*>//i) {
1659 $$foundbody = 1;
1660 } else {
1661 next;
1662 }
1663 }
1664
1665 # check for symbol fonts
1666 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1667 my $font = $1;
1668 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1669 if ($font !~ m/^arial$/i);
1670 }
1671
1672 $$text .= $line;
1673 }
1674
1675 if ($self->{'input_encoding'} eq "iso_8859_1") {
1676 # convert to utf-8
1677 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1678 }
1679 # convert any alphanumeric character entities to their utf-8
1680 # equivalent for indexing purposes
1681 #&ghtml::convertcharentities ($$text);
1682
1683 $$text =~ s/\s+/ /g; # collapse whitespace runs (removes \n's)
1684
1685 # At this point $$text is a binary byte string
1686 # => turn it into a Unicode aware string, so full
1687 # Unicode aware pattern matching can be used.
1688 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1689 #
1690
1691 $$text = decode("utf8",$$text);
1692}
1693
1694sub HB_clean_section {
1695 my $self = shift (@_);
1696 my ($section) = @_;
1697
1698 # remove tags without a starting tag from the section
1699 my ($tag, $tagstart);
1700 while ($section =~ m/<\/([^>]{1,10})>/) {
1701 $tag = $1;
1702 $tagstart = index($section, "<$tag");
1703 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1704 $section =~ s/<\/$tag>//;
1705 }
1706
1707 # remove extra paragraph tags
1708 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1709
1710 # remove extra stuff at the end of the section
1711 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1712
1713 # add a newline at the beginning of each paragraph
1714 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1715
1716 # add a newline every 80 characters at a word boundary
1717 # Note: this regular expression puts a line feed before
1718 # the last word in each section, even when it is not
1719 # needed.
1720 $section =~ s/(.{1,80})\s/$1\n/g;
1721
1722 # fix up the image links
1723 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1724 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1725 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1726 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1727
1728 return $section;
1729}
1730
1731# Will convert the oldHDL format to the new HDL format (using the Section tag)
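# In the old format, section boundaries are marked in the body text with
# escaped tokens such as "&lt;&lt;TOC2&gt;&gt; Chapter One".
# e.g. (illustrative only) such a heading produces output along the lines of:
#   <!--
#   <Section>
#       <Description>
#           <Metadata name="Title">Chapter One</Metadata>
#       </Description>
#   -->
#   ...the section's cleaned-up text...
# with a matching </Section>, also inside an HTML comment, printed when the
# section is closed.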
1732sub convert_to_newHDLformat
1733{
1734 my $self = shift (@_);
1735 my ($file,$cnfile) = @_;
1736 my $input_filename = $file;
1737 my $tmp_filename = $cnfile;
1738
1739 # write HTML tmp file with new HDL format
1740 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1741
1742 # read in the file and do basic html cleaning (removing header etc)
1743 my $html = "";
1744 $self->HB_read_html_file ($input_filename, \$html);
1745
1746 # process the file one section at a time
1747 my $curtoclevel = 1;
1748 my $firstsection = 1;
1749 my $toclevel = 0;
1750 while (length ($html) > 0) {
1751 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1752 $toclevel = $3;
1753 my $title = $4;
1754 my $sectiontext = "";
1755 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1756 $sectiontext = $1;
1757 } else {
1758 $sectiontext = $html;
1759 $html = "";
1760 }
1761
1762 # remove tags and extra spaces from the title
1763 $title =~ s/<\/?[^>]+>//g;
1764 $title =~ s/^\s+|\s+$//g;
1765
1766 # close any sections below the current level and
1767 # create a new section (special case for the firstsection)
1768 print PROD "<!--\n";
1769 while (($curtoclevel > $toclevel) ||
1770 (!$firstsection && $curtoclevel == $toclevel)) {
1771 $curtoclevel--;
1772 print PROD "</Section>\n";
1773 }
1774 if ($curtoclevel+1 < $toclevel) {
1775 print STDERR "WARNING - jump in toc levels in $input_filename " .
1776 "from $curtoclevel to $toclevel\n";
1777 }
1778 while ($curtoclevel < $toclevel) {
1779 $curtoclevel++;
1780 }
1781
1782 if ($curtoclevel == 1) {
1783 # add the header tag
1784 print PROD "-->\n";
1785 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1786 print PROD "<!--\n";
1787 }
1788
1789 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1790
1791 print PROD "-->\n";
1792
1793 # clean up the section html
1794 $sectiontext = $self->HB_clean_section($sectiontext);
1795
1796 print PROD "$sectiontext\n";
1797
1798 } else {
1799 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1800 "\nin $input_filename\n";
1801 last;
1802 }
1803 $firstsection = 0;
1804 }
1805
1806 print PROD "<!--\n";
1807 while ($curtoclevel > 0) {
1808 $curtoclevel--;
1809 print PROD "</Section>\n";
1810 }
1811 print PROD "-->\n";
1812
1813 close (PROD) || die("Error Closing File: $tmp_filename $!");
1814
1815 return $tmp_filename;
1816}
1817
1818sub shorten {
1819 my $self = shift (@_);
1820 my ($text) = @_;
1821
1822 return "\"$text\"" if (length($text) < 100);
1823
1824 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1825 substr ($text, length($text)-50) . "\"";
1826}
1827
1828sub convert_tidy_or_oldHDL_file
1829{
1830 my $self = shift (@_);
1831 my ($file) = @_;
1832 my $input_filename = $file;
1833
1834 if (-d $input_filename)
1835 {
1836 return $input_filename;
1837 }
1838
1839 # get the input filename
1840 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1841 my $base_dirname = $dirname;
1842 $suffix = lc($suffix);
1843
1844 # derive tmp filename from input filename
1845 # Remove any white space from the filename -- no risk of name collision, and
1846 # it makes later conversion by utils simpler. Leave spaces in the path...
1847 # also strip out any dots and hyphens from the filename tail
1848 $tailname =~ s/\s+//g;
1849 $tailname =~ s/\.+//g;
1850 $tailname =~ s/\-+//g;
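    # e.g. (illustrative only): a tail of "My Report-v1.2" becomes "MyReportv12"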
1851 # convert to utf-8 otherwise we have problems with the doc.xml file
1852 # later on
1853 &unicode::ensure_utf8(\$tailname);
1854
1855 # set up a collection specific tmp dir (tidytmp) to work in
1856 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
1857 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
1858
1859 my $test_dirname = "";
1860 my $f_separator = &util::get_os_dirsep();
1861
1862 if ($dirname =~ m/import$f_separator/)
1863 {
1864 $test_dirname = $'; #'
1865
1866 #print STDERR "init $'\n";
1867
1868 while ($test_dirname =~ m/[$f_separator]/)
1869 {
1870 my $folderdirname = $`;
1871 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
1872 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
1873 $test_dirname = $'; #'
1874 }
1875 }
1876
1877 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1878
1879 # tidy or convert the input file if it is an HTML-like file or it is accepted by the process_exp
1880 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1881 {
1882 # convert the input file to a new style HDL
1883 my $hdl_output_filename = $input_filename;
1884 if ($self->{'old_style_HDL'})
1885 {
1886 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1887 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1888 }
1889
1890 # copy all the other files from the base dir to the tmp dir if they do not already exist there
1891 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1892 my @files = grep {!/^\.+$/} readdir(DIR);
1893 closedir(DIR);
1894
1895 foreach my $file (@files)
1896 {
1897 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1898 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
1899 if ((!-e $dest_file) && (!-d $src_file))
1900 {
1901 # just copy the original file back to the tmp directory
1902 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1903 }
1904 }
1905
1906 # tidy the input file
1907 my $tidy_output_filename = $hdl_output_filename;
1908 if ($self->{'use_realistic_book'})
1909 {
1910 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
1911 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1912 }
1913 $tmp_filename = $tidy_output_filename;
1914 }
1915 else
1916 {
1917 if (!-e $tmp_filename)
1918 {
1919 # just copy the original file back to the tmp directory
1920 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1921 }
1922 }
1923
1924 return $tmp_filename;
1925}
1926
1927
1928 # Will make the html input file into a proper XML file, with font tags removed and
1929 # image sizes added to the img tags.
1930# The tidying process takes place in a collection specific 'tmp' directory so
1931# that we don't accidentally damage the input.
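# e.g. (illustrative only): <font face="Symbol"><img src="fig1.png"></font>
# is rewritten (before tidying) as something like
#   <img src="fig1.png" width="320" height="240">
# with the real dimensions read from the image file via Image::Size, and the
# font tags dropped.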
1932sub tmp_tidy_file
1933{
1934 my $self = shift (@_);
1935 my ($file,$cnfile) = @_;
1936 my $input_filename = $file;
1937 my $tmp_filename = $cnfile;
1938
1939 # get the input filename
1940 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1941
1942 require HTML::TokeParser::Simple;
1943
1944 # create HTML parser to decode the input file
1945 my $parser = HTML::TokeParser::Simple->new($input_filename);
1946
1947 # write the HTML tmp file with font tags removed and image sizes added to the img tags
1948 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1949 while (my $token = $parser->get_token())
1950 {
1951 # is it an img tag
1952 if ($token->is_start_tag('img'))
1953 {
1954 # get the attributes
1955 my $attr = $token->return_attr;
1956
1957 # get the full path to the image
1958 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
1959
1960 # set the width and height attribute
1961 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1962
1963 # recreate the tag
1964 print PROD "<img";
1965 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1966 print PROD ">";
1967 }
1968 # is it a font tag
1969 else
1970 {
1971 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1972 {
1973 # remove font tag
1974 print PROD "";
1975 }
1976 else
1977 {
1978 # print without changes
1979 print PROD $token->as_is;
1980 }
1981 }
1982 }
1983 close (PROD) || die("Error Closing File: $tmp_filename $!");
1984
1985 # run html-tidy on the tmp file to make it a proper XML file
1986
1987 my $outhandle = $self->{'outhandle'};
1988 print $outhandle "Converting HTML to be XML compliant:\n";
1989
1990 my $tidy_cmd = "tidy";
1991 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
1992 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
1993 if ($self->{'verbosity'} <= 2) {
1994 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1995 $tidy_cmd .= " 2>nul";
1996 }
1997 else {
1998 $tidy_cmd .= " 2>/dev/null";
1999 }
2000 print $outhandle " => $tidy_cmd\n";
2001 }
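    # e.g. (illustrative only) at low verbosity on linux the command ends up as:
    #   tidy -q -raw -wrap 0 -asxml "<tmp_filename>" 2>/dev/null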
2002
2003 my $tidyfile = `$tidy_cmd`;
2004
2005 # write result back to the tmp file
2006 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
2007 print PROD $tidyfile;
2008 close (PROD) || die("Error Closing File: $tmp_filename $!");
2009
2010 # return the output filename
2011 return $tmp_filename;
2012}
2013
2014sub associate_cover_image
2015{
2016 my $self = shift(@_);
2017 my ($doc_obj, $filename) = @_;
2018 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
2019 {
2020 # the filename will point into tidytmp, but we want the cover image from import
2021 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
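    # e.g. (illustrative only): a path like collect/demo/tidytmp/doc1/doc1.html
    # is mapped back to collect/demo/import/doc1/doc1.html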
2022 }
2023 $self->SUPER::associate_cover_image($doc_obj, $filename);
2024}
2025
2026
2027 1;