source: main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm@27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc.) out of util.pm into their own proper class, FileUtils. Use of the old functions in util.pm will now prompt deprecation warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering support for other filesystems (HTTP, HDFS, WebDAV, etc.). Some key files have been updated to use the new functions, so no deprecation messages are thrown when importing/building the demo collection 'out of the box'.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 65.6 KB
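The commit message above describes migrating plugin code from the deprecated util.pm helpers to the new FileUtils class. A minimal before/after sketch (illustrative only, not part of the file listed below): the FileUtils calls are the two this revision of HTMLPlugin.pm actually uses (filenameConcatenate and fileExists); util::fd_exists is the old call that still appears, commented out, further down the file, and util::filename_cat is assumed here to be the old concatenation helper.

use strict;
use util;       # old helpers; calling them now emits deprecation warnings
use FileUtils;  # new class holding the critical file operations

my ($base_dir, $file) = ("/tmp/collect", "doc.html");   # hypothetical example paths

# Old (deprecated) style:
my $old_path = &util::filename_cat($base_dir, $file);
print "found (old API)\n" if &util::fd_exists($old_path);

# New style, as used throughout this revision of HTMLPlugin.pm:
my $new_path = &FileUtils::filenameConcatenate($base_dir, $file);
print "found (new API)\n" if &FileUtils::fileExists($new_path);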
Rev  Line
[14665]1###########################################################################
2#
[15872]3# HTMLPlugin.pm -- basic html plugin
[14665]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
[15872]36package HTMLPlugin;
[14665]37
[22842]38use Encode;
[23387]39use Unicode::Normalize 'normalize';
[22842]40
[15872]41use ReadTextFile;
42use HBPlugin;
[14665]43use ghtml;
44use unicode;
45use util;
[27306]46use FileUtils;
[14665]47use XMLParser;
48
49use Image::Size;
[14913]50use File::Copy;
[14665]51
52sub BEGIN {
[15872]53 @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
[14665]54}
55
56use strict; # every perl program should have this!
57no strict 'refs'; # make an exception so we can use variables as filehandles
58
59my $arguments =
60 [ { 'name' => "process_exp",
[15872]61 'desc' => "{BasePlugin.process_exp}",
[14665]62 'type' => "regexp",
63 'deft' => &get_default_process_exp() },
64 { 'name' => "block_exp",
[15872]65 'desc' => "{BasePlugin.block_exp}",
[14665]66 'type' => 'regexp',
67 'deft' => &get_default_block_exp() },
68 { 'name' => "nolinks",
[15872]69 'desc' => "{HTMLPlugin.nolinks}",
[14665]70 'type' => "flag" },
71 { 'name' => "keep_head",
[15872]72 'desc' => "{HTMLPlugin.keep_head}",
[14665]73 'type' => "flag" },
74 { 'name' => "no_metadata",
[15872]75 'desc' => "{HTMLPlugin.no_metadata}",
[14665]76 'type' => "flag" },
77 { 'name' => "metadata_fields",
[15872]78 'desc' => "{HTMLPlugin.metadata_fields}",
[14665]79 'type' => "string",
80 'deft' => "Title" },
[21800]81 { 'name' => "metadata_field_separator",
82 'desc' => "{HTMLPlugin.metadata_field_separator}",
83 'type' => "string",
84 'deft' => "" },
[14665]85 { 'name' => "hunt_creator_metadata",
[15872]86 'desc' => "{HTMLPlugin.hunt_creator_metadata}",
[14665]87 'type' => "flag" },
88 { 'name' => "file_is_url",
[15872]89 'desc' => "{HTMLPlugin.file_is_url}",
[14665]90 'type' => "flag" },
91 { 'name' => "assoc_files",
[15872]92 'desc' => "{HTMLPlugin.assoc_files}",
[14665]93 'type' => "regexp",
94 'deft' => &get_default_block_exp() },
95 { 'name' => "rename_assoc_files",
[15872]96 'desc' => "{HTMLPlugin.rename_assoc_files}",
[14665]97 'type' => "flag" },
98 { 'name' => "title_sub",
[15872]99 'desc' => "{HTMLPlugin.title_sub}",
[14665]100 'type' => "string",
101 'deft' => "" },
102 { 'name' => "description_tags",
[15872]103 'desc' => "{HTMLPlugin.description_tags}",
[14665]104 'type' => "flag" },
105 # retain this for backward compatibility (w3mir option was replaced by
106 # file_is_url)
107 { 'name' => "w3mir",
[15872]108# 'desc' => "{HTMLPlugin.w3mir}",
[14665]109 'type' => "flag",
110 'hiddengli' => "yes"},
111 { 'name' => "no_strip_metadata_html",
[15872]112 'desc' => "{HTMLPlugin.no_strip_metadata_html}",
[14665]113 'type' => "string",
114 'deft' => "",
115 'reqd' => "no"},
116 { 'name' => "sectionalise_using_h_tags",
[15872]117 'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
[14665]118 'type' => "flag" },
[14913]119 { 'name' => "use_realistic_book",
[15872]120 'desc' => "{HTMLPlugin.tidy_html}",
[14665]121 'type' => "flag"},
[15872]122 { 'name' => "old_style_HDL",
123 'desc' => "{HTMLPlugin.old_style_HDL}",
[20791]124 'type' => "flag"},
125 {'name' => "processing_tmp_files",
126 'desc' => "{BasePlugin.processing_tmp_files}",
127 'type' => "flag",
128 'hiddengli' => "yes"}
[14665]129 ];
130
[15872]131my $options = { 'name' => "HTMLPlugin",
132 'desc' => "{HTMLPlugin.desc}",
[14665]133 'abstract' => "no",
134 'inherits' => "yes",
135 'args' => $arguments };
136
137
138sub new {
139 my ($class) = shift (@_);
140 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
141 push(@$pluginlist, $class);
142
[15872]143 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
144 push(@{$hashArgOptLists->{"OptList"}},$options);
[16024]145
[14665]146
[15872]147 my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
[14665]148
149 if ($self->{'w3mir'}) {
150 $self->{'file_is_url'} = 1;
151 }
152 $self->{'aux_files'} = {};
153 $self->{'dir_num'} = 0;
154 $self->{'file_num'} = 0;
155
156 return bless $self, $class;
157}
158
159# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
160# if have eg <script language="javascript" src="img/lib.js@123">
[20791]161# blocking is now done by reading through the file and recording all the
162# images and other files
[14665]163sub get_default_block_exp {
164 my $self = shift (@_);
165
[16392]166 #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
167 return "";
[14665]168}
169
170sub get_default_process_exp {
171 my $self = shift (@_);
172
173 # the last option is an attempt to encode the concept of an html query ...
174 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
175}
176
177sub store_block_files
178{
179 my $self =shift (@_);
[16392]180 my ($filename_full_path, $block_hash) = @_;
181
182 my $html_fname = $filename_full_path;
[14665]183
[23335]184 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
185 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
[14665]186
187 # read in file ($text will be in utf8)
[16769]188 my $raw_text = "";
[23363]189 $self->read_file_no_decoding($filename_full_path, \$raw_text);
[16769]190
191 my $textref = \$raw_text;
[14665]192 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
193 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
194 $$textref =~ s/$opencom(.*?)$closecom//gs;
195
[23363]196 # Convert entities to their UTF8 equivalents
197 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
198 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
199 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
200
[14665]201 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
202 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
203 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
204 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
205 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[17127]206 my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
[16638]207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
[14665]208
[23387]209 if(!defined $self->{'unicode_to_original_filename'}) {
[16769]210 # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
[23387]211 $self->{'unicode_to_original_filename'} = {};
[16769]212 }
213
[23387]214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
[14665]215
216 # remove quotes from link at start and end if necessary
[23387]217 if ($raw_link =~ m/^\"/) {
218 $raw_link =~ s/^\"//;
219 $raw_link =~ s/\"$//;
[14665]220 }
221
[23371]222 # remove any anchor names, e.g. foo.html#name becomes foo.html
223 # but watch out for any #'s that are part of entities, such as &#x3B1;
[23387]224 $raw_link =~ s/([^&])\#.*$/$1/s;
[23371]225
[16638]226 # some links may just be anchor names
[23387]227 next unless ($raw_link =~ /\S+/);
[14665]228
[23415]229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/i) {
[14665]230 # Turn relative file path into full path
[16392]231 my $dirname = &File::Basename::dirname($filename_full_path);
[27306]232 $raw_link = &FileUtils::filenameConcatenate($dirname, $raw_link);
[14665]233 }
[23387]234 $raw_link = $self->eval_dir_dots($raw_link);
[16638]235
[16769]236 # this is the actual filename on the filesystem (that the link refers to)
[23387]237 my $url_original_filename = $self->opt_url_decode($raw_link);
[16769]238
[23387]239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
[16769]240
[23387]241 if ($exceeds_bytecodes) {
242 # We have a link to a file name that is more complicated than a raw byte filename
243 # What we do next depends on the operating system we are on
[16769]244
[23387]245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
246 # Assume we're dealing with a UTF-8 encoded filename
247 $url_original_filename = encode("utf8", $url_original_filename);
248 }
249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
250 # HFS+ is UTF8 with decomposition
251 $url_original_filename = encode("utf8", $url_original_filename);
252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
253 }
254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
255 # Don't need to do anything as later code maps Windows
256 # unicode filenames to DOS short filenames when needed
257 }
258 else {
259 my $outhandle = $self->{'outhandle'};
260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
261 print $outhandle " in raw file system encoding of: $raw_link\n";
262 print $outhandle " Assuming filesystem is UTF-8 based.\n";
263 $url_original_filename = encode("utf8", $url_original_filename);
264 }
265 }
266
267 # Convert the (currently raw) link into its Unicode version.
268 # Store the Unicode link along with the url_original_filename
269 my $unicode_url_original_filename = "";
270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
271
272
273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
274
275
276 if ($url_original_filename ne $unicode_url_original_filename) {
[17088]277 my $outhandle = $self->{'outhandle'};
[23387]278
[17088]279 print $outhandle "URL Encoding $url_original_filename\n";
[23387]280 print $outhandle " ->$unicode_url_original_filename\n";
[17088]281
[23387]282 # Allow for possibility of raw byte version and Unicode versions of file
[23561]283 &util::block_filename($block_hash,$unicode_url_original_filename);
[23387]284 }
[23363]285
[23418]286 # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
[23561]287 &util::block_filename($block_hash,$url_original_filename);
[23418]288
[14665]289 }
290}
291
[16769]292# Given a filename in any encoding, will URL decode it to get back the original filename
293# in the original encoding. Because this method is intended to work out the *original*
[18320]294 # filename, it does not URL decode any filename if a file by the name of the *URL-encoded*
[16769]295# string already exists in the local folder.
[23363]296#
[16769]297sub opt_url_decode {
298 my $self = shift (@_);
[23387]299 my ($raw_link) = @_;
[16024]300
[23387]301
[16769]302 # Replace %XX's in URL with decoded value if required.
303 # Note that the filename may include the %XX in some situations
[23387]304
305## if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
306
307 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
308 if (!-e $raw_link) {
309 $raw_link = &unicode::url_decode($raw_link,1);
[16769]310 }
311 }
[23387]312
313 return $raw_link;
[16769]314}
315
[20774]316sub read_into_doc_obj
317{
318 my $self = shift (@_);
319 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
320
[22330]321 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
[23335]322
323 # Lookup content_encoding worked out in file_block pass for this file
324 # Store it under the local name 'content_encoding' so it's nice and
325 # easy to access
326 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
327
[20774]328 # get the input file
329 my $input_filename = $file;
330 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
331 $suffix = lc($suffix);
[22330]332 my $tidy_filename;
[20774]333 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
334 {
335 # because the document has to be sectionalized, set the description tags
336 $self->{'description_tags'} = 1;
337
338 # set the file to be tidied
[27306]339 $input_filename = &FileUtils::filenameConcatenate($base_dir,$file) if $base_dir =~ m/\w/;
[20774]340
341 # get the tidied file
342 #my $tidy_filename = $self->tmp_tidy_file($input_filename);
[22330]343 $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
[20774]344
345 # derive tmp filename from input filename
346 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
347
348 # set the new input file and base_dir to be from the tidied file
349 $file = "$tailname$suffix";
350 $base_dir = $dirname;
351 }
352
353 # call the parent read_into_doc_obj
354 my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
[22330]355 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
356 {
357 # now we need to reset the filenames in the doc obj so that the converted filenames are not used
358 my $collect_file = &util::filename_within_collection($filename_full_path);
359 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
360 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
[23363]361 # build. So set it manually.
362 $doc_obj->set_source_path($filename_full_path);
[22330]363 my $collect_conv_file = &util::filename_within_collection($tidy_filename);
364 $doc_obj->set_converted_filename($collect_conv_file);
[23349]365
366 my $plugin_filename_encoding = $self->{'filename_encoding'};
[23352]367 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
368 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
[22330]369 }
[23335]370
371 delete $self->{'store_content_encoding'}->{$filename_full_path};
372 $self->{'content_encoding'} = undef;
373
[20774]374 return ($process_status,$doc_obj);
375}
[16769]376
[14665]377# do plugin specific processing of doc_obj
378sub process {
379 my $self = shift (@_);
380 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
381 my $outhandle = $self->{'outhandle'};
382
[16769]383 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[16024]384 # this makes life so much easier... perl can cope with unix-style '/'s.
[23371]385 $base_dir =~ s@(\\)+@/@g;
386 $file =~ s@(\\)+@/@g;
[14665]387 }
[23371]388
[27306]389 my $filename = &FileUtils::filenameConcatenate($base_dir,$file);
[23371]390 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
391 my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
392
393 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
394 # And again
395 $upgraded_base_dir =~ s@(\\)+@/@g;
396 $upgraded_filename =~ s@(\\)+@/@g;
397
398 # Need to make sure there is a '/' on the end of upgraded_base_dir
[23387]399 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
[23371]400 $upgraded_base_dir .= "/";
401 }
402 }
403 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
[14665]404
405 # reset per-doc stuff...
406 $self->{'aux_files'} = {};
407 $self->{'dir_num'} = 0;
408 $self->{'file_num'} = 0;
409
410 # process an HTML file where sections are divided by heading tags (H1, H2 ...)
411 # you can also include metadata in the format (X can be any number)
412 # <hX>Title<!--gsdl-metadata
413 # <Metadata name="name1">value1</Metadata>
414 # ...
415 # <Metadata name="nameN">valueN</Metadata>
416 #--></hX>
417 if ($self->{'sectionalise_using_h_tags'}) {
418 # description_tags should always be activated because we convert headings to description tags
419 $self->{'description_tags'} = 1;
420
421 my $arrSections = [];
[23371]422 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
[14665]423
424 if (scalar(@$arrSections)) {
425 my $strMetadata = $self->update_section_data($arrSections, -1);
426 if (length($strMetadata)) {
427 $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
428 $$textref =~ s/<\/body>/$strMetadata/ig;
429 }
430 }
431 }
432
433 my $cursection = $doc_obj->get_top_section();
434
435 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
436 unless $self->{'no_metadata'} || $self->{'description_tags'};
437
438 # Store URL for page as metadata - this can be used for an
439 # altavista style search interface. The URL won't be valid
440 # unless the file structure contains the domain name (i.e.
441 # like when w3mir is used to download a website).
442
443 # URL metadata (even invalid ones) are used to support internal
444 # links, so even if 'file_is_url' is off, still need to store info
445
[23371]446 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
[23347]447
[23335]448# my $utf8_file = $self->filename_to_utf8_metadata($file);
449# $utf8_file =~ s/&\#095;/_/g;
[23835]450# variable below used to be utf8_file
451
[23387]452 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
453 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
[23335]454
[16735]455 my $web_url = "http://";
[23387]456 my $utf8_web_url = "http://";
[16735]457 if(defined $dirname) { # local directory
[22689]458 # Check for "ftp" in the domain name of the directory
459 # structure to determine if this URL should be a ftp:// URL
460 # This check is not infallible, but better than omitting the
461 # check, which would cause all files downloaded from ftp sites
462 # via mirroring with wget to have potentially erroneous http:// URLs
463 # assigned in their metadata
464 if ($dirname =~ /^[^\/]*ftp/i)
465 {
466 $web_url = "ftp://";
[23387]467 $utf8_web_url = "ftp://";
[22689]468 }
[16836]469 $dirname = $self->eval_dir_dots($dirname);
[18626]470 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
[23387]471
472 $web_url = $web_url.$dirname.$url_encoded_file;
473 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
[16735]474 } else {
[23387]475 $web_url = $web_url.$url_encoded_file;
476 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
[16735]477 }
[19983]478 $web_url =~ s/\\/\//g;
[23387]479 $utf8_web_url =~ s/\\/\//g;
[23371]480
481 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]482 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n";
483 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
[23371]484 }
485
486
[15872]487 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
[23387]488 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
[15872]489
[14665]490 if ($self->{'file_is_url'}) {
491 $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
492 $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
493 $doc_obj->add_metadata($cursection, "/weblink", "</a>");
494 }
495
496 if ($self->{'description_tags'}) {
497 # remove the html header - note that doing this here means any
498 # sections defined within the header will be lost (so all <Section>
499 # tags must appear within the body of the HTML)
500 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
501
502 $$textref =~ s/^.*?<body[^>]*>//is;
503 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
504
505 my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
506 my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
507
508 my $lt = '(?:<|&lt;)';
509 my $gt = '(?:>|&gt;)';
510 my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
511
512 my $dont_strip = '';
513 if ($self->{'no_strip_metadata_html'}) {
514 ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
515 }
516
517 my $found_something = 0; my $top = 1;
518 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
519 my $text = $1;
520 my $comment = $2;
521 if (defined $text) {
522 # text before a comment - note that getting to here
523 # doesn't necessarily mean there are Section tags in
524 # the document
[23371]525 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]526 }
527 while ($comment =~ s/$lt(.*?)$gt//s) {
528 my $tag = $1;
529 if ($tag eq "Section") {
530 $found_something = 1;
531 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
532 $top = 0;
533 } elsif ($tag eq "/Section") {
534 $found_something = 1;
535 $cursection = $doc_obj->get_parent_section ($cursection);
[16769]536 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
[14665]537 my $metaname = $1;
[16769]538 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
[14665]539 $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
540 my $metavalue = $1;
541 $metavalue =~ s/^\s+//;
542 $metavalue =~ s/\s+$//;
543 # assume that no metadata value intentionally includes
544 # carriage returns or HTML tags (if they're there they
545 # were probably introduced when converting to HTML from
546 # some other format).
547 # actually some people want to have html tags in their
548 # metadata.
549 $metavalue =~ s/[\cJ\cM]/ /sg;
550 $metavalue =~ s/<[^>]+>//sg
[16769]551 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
[14665]552 $metavalue =~ s/\s+/ /sg;
[22348]553 if ($metaname =~ /\./) { # has a namespace
554 $metaname = "ex.$metaname";
555 }
[14665]556 if ($accumulate) {
557 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
558 } else {
559 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
560 }
561 } elsif ($tag eq "Description" || $tag eq "/Description") {
562 # do nothing with containing Description tags
563 } else {
564 # simple HTML tag (probably created by the conversion
565 # to HTML from some other format) - we'll ignore it and
566 # hope for the best ;-)
567 }
568 }
569 }
570 if ($cursection ne "") {
[23371]571 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
[14665]572 }
573
574 $$textref =~ s/^.*?<body[^>]*>//is;
575 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
[16769]576 if ($$textref =~ m/\S/) {
[14665]577 if (!$found_something) {
578 if ($self->{'verbosity'} > 2) {
[23371]579 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
[14665]580 print $outhandle " will be processed as a single section document\n";
581 }
582
583 # go ahead and process single-section document
[23371]584 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[14665]585
586 # if document contains no Section tags we'll go ahead
587 # and extract metadata (this won't have been done
588 # above as the -description_tags option prevents it)
589 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
590 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
591 unless $self->{'no_metadata'};
592
593 } else {
[23371]594 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
[14665]595 print $outhandle " of the final closing </Section> tag. This text will\n";
596 print $outhandle " be ignored.";
597
598 my ($text);
599 if (length($$textref) > 30) {
600 $text = substr($$textref, 0, 30) . "...";
601 } else {
602 $text = $$textref;
603 }
604 $text =~ s/\n/ /isg;
605 print $outhandle " ($text)\n";
606 }
607 } elsif (!$found_something) {
608
609 if ($self->{'verbosity'} > 2) {
610 # may get to here if document contained no valid Section
611 # tags but did contain some comments. The text will have
612 # been processed already but we should print the warning
613 # as above and extract metadata
[23371]614 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
[14665]615 print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
616 }
617
618 my $complete_text = $head_keep.$doc_obj->get_text($cursection);
619 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
620 unless $self->{'no_metadata'};
621 }
622
623 } else {
624
625 # remove header and footer
626 if (!$self->{'keep_head'} || $self->{'description_tags'}) {
627 $$textref =~ s/^.*?<body[^>]*>//is;
628 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
629 }
630
[25555]631 $self->{'css_assoc_files'} = {};
632
[14665]633 # single section document
[23371]634 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
[25555]635
636 #my $upgraded_filename_dirname = &File::Basename::dirname($upgraded_filename);
637
638 $self->acquire_css_associated_files($doc_obj, $cursection);
639
640 $self->{'css_assoc_files'} = {};
[14665]641 }
[23335]642
[14665]643 return 1;
644}
645
646
647sub process_heading
648{
649 my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
650 $strHeadingText = '' if (!defined($strHeadingText));
651
652 my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
653
654 my $strSecMetadata = '';
655 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
656 {
657 $strSecMetadata .= $1;
658 }
659
660 $strHeadingText =~ s/^\s+//g;
661 $strHeadingText =~ s/\s+$//g;
662 $strSecMetadata =~ s/^\s+//g;
663 $strSecMetadata =~ s/\s+$//g;
664
665 $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
666
667 if (length($strSecMetadata)) {
668 $strMetadata .= "\t\t" . $strSecMetadata . "\n";
669 }
670
671 $strMetadata .= "\t</Description>\n";
672
673 return "<!--" . $strMetadata . "-->";
674}
675
676
677sub update_section_data
678{
679 my ($self, $arrSections, $nCurTocNo) = @_;
680 my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
681
682 if ($nSections == 0) {
683 push @$arrSections, $nCurTocNo;
684 return $strBuffer;
685 }
686 $nLast = $arrSections->[$nSections - 1];
687 if ($nCurTocNo > $nLast) {
688 push @$arrSections, $nCurTocNo;
689 return $strBuffer;
690 }
691 for(my $i = $nSections - 1; $i >= 0; $i--) {
692 if ($nCurTocNo <= $arrSections->[$i]) {
693 $strBuffer .= "\n</Section>";
694 pop @$arrSections;
695 }
696 }
697 push @$arrSections, $nCurTocNo;
698 return $strBuffer;
699}
700
701
702# note that process_section may be called multiple times for a single
703# section (relying on the fact that add_utf8_text appends the text to any
704# that may exist already).
705sub process_section {
706 my $self = shift (@_);
707 my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
[25555]708
709 my @styleTagsText = ($$textref =~ m/<style[^>]*>([^<]*)<\/style>/sg);
710 if(scalar(@styleTagsText) > 0)
711 {
[27306]712 my $css_filename_dirname = &File::Basename::dirname(&FileUtils::filenameConcatenate($base_dir, $file));
[25555]713 foreach my $styleText (@styleTagsText)
714 {
715 $self->acquire_css_associated_files_from_text_block($styleText, $css_filename_dirname);
716 }
717 }
718
[14665]719 # trap links
720 if (!$self->{'nolinks'}) {
721 # usemap="./#index" not handled correctly => change to "#index"
[16769]722## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
723
[23392]724## my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
725## my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
[23387]726
[16769]727 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[14665]728 $self->replace_usemap_links($1, $2, $3)/isge;
729
[23463]730 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
731 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[23387]732
[23392]733## $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
734# $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
[14665]735 }
736
737 # trap images
738
[15872]739 # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
[15176]740 # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
741 # turned regular text succeeding images into links. That is, by embedding <imgs>
742 # inside <a href=""></a>, the text following images were no longer misbehaving.
743 # However, there would be many occasions whereby images were not meant to link
744 # to their source images but where the images would link to another web page.
745 # To allow this, the no_image_links option was introduced: it would prevent
746 # the behaviour of embedding images into links that referenced the source images.
747
748 # Somewhere along the line, the problem of normal text turning into links when
749 # such text followed images which were not embedded in <a href=""></a> ceased
750 # to occur. This is why the following lines have been commented out (as well as
751 # two lines in replace_images). They appear to no longer apply.
752
753 # If at any time, there is a need for having images embedded in <a> anchor tags,
[15872]754 # then it might be better to turn that into an HTMLPlugin option rather than make
[15176]755 # it the default behaviour. Also, eventually, no_image_links needs to become
[15872]756 # a deprecated option for HTMLPlugin as it has now become the default behaviour.
[15176]757
758 #if(!$self->{'no_image_links'}){
[16247]759 $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
[15872]760 $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[15176]761 #}
762
[14665]763 # add text to document object
764 # turn \ into \\ so that the rest of greenstone doesn't think there
765 # is an escape code following. (Macro parsing loses them...)
766 $$textref =~ s/\\/\\\\/go;
767
768 $doc_obj->add_utf8_text($cursection, $$textref);
769}
770
771sub replace_images {
772 my $self = shift (@_);
773 my ($front, $link, $back, $base_dir,
774 $file, $doc_obj, $section) = @_;
775
776 # remove quotes from link at start and end if necessary
777 if ($link=~/^[\"\']/) {
[15838]778 $link=~s/^[\"\']//;
779 $link=~s/[\"\']$//;
[14665]780 $front.='"';
781 $back="\"$back";
782 }
[15872]783
[14665]784 $link =~ s/\n/ /g;
785
786 # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
787 # If the Word file path has spaces in it, wv messes up and you end up with
788 # absolute paths for the images, and without the "file://" prefix
789 # So check for this special case and massage the data to be correct
[16769]790 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
[14665]791 $link =~ s/^.*\\([^\\]+)$/$1/;
792 }
[16632]793
[14665]794 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
795
796 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
797
[17127]798# print STDERR "**** link = $link\n**** href = $href\n**** img_file = $img_file, rl = $rl\n";
[16632]799
[14665]800 my $anchor_name = $img_file;
801 #$anchor_name =~ s/^.*\///;
802 #$anchor_name = "<a name=\"$anchor_name\" ></a>";
803
804 my $image_link = $front . $img_file .$back;
[15176]805 return $image_link;
[14665]806
[15176]807 # The reasons for why the following two lines are no longer necessary can be
808 # found in subroutine process_section
809 #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>";
810 #return $anchor_link;
811
[14665]812 #return $front . $img_file . $back . $anchor_name;
813}
814
815sub replace_href_links {
816 my $self = shift (@_);
[23392]817 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
[25555]818
819 if($front =~ m/^<link / && $link =~ m/\.css"$/)
820 {
821 my $actual_link = $link;
822 $actual_link =~ s/^"(.*)"$/$1/;
823
824 my $directory = &File::Basename::dirname($file);
825
[27306]826 my $css_filename = &FileUtils::filenameConcatenate($base_dir, $directory, $actual_link);
[25555]827 $self->retrieve_css_associated_files($css_filename);
828 }
829
[16769]830 # remove quotes from link at start and end if necessary
831 if ($link=~/^[\"\']/) {
832 $link=~s/^[\"\']//;
833 $link=~s/[\"\']$//;
834 $front.='"';
835 $back="\"$back";
836 }
837
[14665]838 # attempt to sort out targets - frames are not handled
839 # well in this plugin and some cases will screw things
840 # up - e.g. the _parent target (so we'll just remove
841 # them all ;-)
842 $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
843 $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
844 $front =~ s/target=\"?_parent\"?//is;
845 $back =~ s/target=\"?_parent\"?//is;
846
[25673]847 if($link =~ m/^\#/s)
848 {
849 return $front . "_httpsamepagelink_" . $link . $back;
850 }
851
[14665]852 $link =~ s/\n/ /g;
853
[16769]854 # Find file referred to by $link on file system
855 # This is more complicated than it sounds when char encodings
856 # are taken into account
[14665]857 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[23835]858
[14665]859 # href may use '\'s where '/'s should be on Windows
860 $href =~ s/\\/\//g;
[16769]861 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
[14665]862
863 ##### leave all these links alone (they won't be picked up by intermediate
864 ##### pages). I think that's safest when dealing with frames, targets etc.
865 ##### (at least until I think of a better way to do it). Problems occur with
866 ##### mailto links from within small frames, the intermediate page is displayed
867 ##### within that frame and can't be seen. There is still potential for this to
868 ##### happen even with html pages - the solution seems to be to somehow tell
869 ##### the browser from the server side to display the page being sent (i.e.
870 ##### the intermediate page) in the top level window - I'm not sure if that's
871 ##### possible - the following line should probably be deleted if that can be done
[16769]872 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
[14665]873
[16769]874 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
875 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
[23335]876
[23371]877 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
[23335]878
[23371]879 # Don't do any encoding for now, as not clear what
880 # the right thing to do is to support filename
881 # encoding on Windows when they are not UTF16
882 #
[23347]883 }
[23371]884 else {
885 # => Unix-based system
[23335]886
[23371]887 # If web page didn't give encoding, then default to utf8
888 my $content_encoding= $self->{'content_encoding'} || "utf8";
[23835]889
[23371]890 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
891 print STDERR "**** Encoding with '$content_encoding', href: $href\n";
892 }
[23335]893
[23835]894 # on Darwin, the unicode filenames are stored on the file
895 # system in decomposed form, so any href link (including when
896 # URL-encoded) should refer to the decomposed name of the file
897 if ($ENV{'GSDLOS'} =~ /^darwin$/i) {
898 $href = normalize('D', $href); # Normalization Form D (decomposition)
899 }
900
[23371]901 $href = encode($content_encoding,$href);
902 }
903
[23835]904 $href = &unicode::raw_filename_to_utf8_url_encoded($href);
[23335]905 $href = &unicode::filename_to_url($href);
906
[16812]907 &ghtml::urlsafe ($href);
[23371]908
[23347]909 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]910 print STDERR "******DEBUG: href=$href\n";
[23347]911 }
[23335]912
[23347]913
[18521]914 return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
[14665]915 } else {
[23335]916 # link is to some other type of file (e.g., an image) so we'll
[14665]917 # need to associate that file
918 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
919 }
920}
921
[25555]922sub retrieve_css_associated_files {
923 my $self = shift (@_);
924 my ($css_filename) = @_;
925
926 my $css_filename_dirname = &File::Basename::dirname($css_filename);
927
928 open (CSSFILE, $css_filename) || return;
929 sysread (CSSFILE, my $file_string, -s CSSFILE);
930
931 $self->acquire_css_associated_files_from_text_block($file_string, $css_filename_dirname) unless !defined $file_string;
932
933 close CSSFILE;
934}
935
936sub acquire_css_associated_files_from_text_block {
937 my $self = shift (@_);
938 my ($text, $css_filename_dirname) = @_;
939
940 my @image_urls = ($text =~ m/background-image:\s*url[^;]*;/sg);
941 foreach my $img_url (@image_urls)
942 {
943 $img_url =~ s/^.*url.*\((.*)\).*$/$1/;
944 $img_url =~ s/^\s*"?([^"]*)"?\s*$/$1/;
945
[27306]946 $self->{'css_assoc_files'}->{&FileUtils::filenameConcatenate($css_filename_dirname, $img_url)} = $img_url;
[25555]947 }
948}
949
950sub acquire_css_associated_files {
951 my $self = shift(@_);
952
953 my ($doc_obj, $section) = @_;
954
955 foreach my $image_filename (keys %{$self->{'css_assoc_files'}})
956 {
957 $doc_obj->associate_file($image_filename, $self->{'css_assoc_files'}->{$image_filename}, undef, $section);
958 }
959}
960
[14665]961sub add_file {
962 my $self = shift (@_);
963 my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
964 my ($newname);
965
966 my $filename = $href;
967 if ($base_dir eq "") {
[23387]968 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
969 # remove http://
970 $filename =~ s/^[^:]*:\/\///;
971 }
972 else {
973 # remove http:/ thereby leaving one slash at the start as
974 # part of full pathname
975 $filename =~ s/^[^:]*:\///;
976 }
[14665]977 }
978 else {
979 # remove http://
980 $filename =~ s/^[^:]*:\/\///;
981 }
982
[27306]983 $filename = &FileUtils::filenameConcatenate($base_dir, $filename);
[23363]984
[22355]985 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
986 # we are processing a tidytmp file - want paths to be in import
987 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
988 }
[23335]989
990 # Replace %XX's in URL with decoded value if required. Note that the
991 # filename may include the %XX in some situations. If the *original*
992 # file's name was in URL encoding, the following method will not decode
993 # it.
[23387]994 my $unicode_filename = $filename;
995 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
[16769]996
[23387]997 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but (it seems)
998 # with no accompanying file
999 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
1000
[23335]1001 my $content_encoding= $self->{'content_encoding'} || "utf8";
1002
[23387]1003 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
1004 # The filenames that come through the HTML file have been decoded
1005 # into Unicode aware Perl strings. Need to convert them back
1006 # to their initial raw-byte encoding to match the file that
1007 # exists on the file system
1008 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1009 }
1010 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
1011 # HFS+ is UTF8 with decomposition
1012 $filename = encode($content_encoding, $opt_decode_unicode_filename);
1013 $filename = normalize('D', $filename); # Normalization Form D (decomposition)
[23335]1014
[23387]1015 }
1016 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
1017 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
1018
1019 if (defined $long_filename) {
1020 my $short_filename = Win32::GetShortPathName($long_filename);
1021 $filename = $short_filename;
1022 }
1023# else {
1024# print STDERR "***** failed to map href to real file:\n";
1025# print STDERR "****** $href -> $opt_decode_unicode_filename\n";
1026# }
1027 }
1028 else {
1029 my $outhandle = $self->{'outhandle'};
1030 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
1031 print $outhandle " in file system encoding of href: $href\n";
1032 print $outhandle " No character encoding done.\n";
1033 }
1034
1035
[16769]1036 # some special processing if the intended filename was converted to utf8, but
1037 # the actual file still needs to be renamed
[27306]1038 #if (!&util::fd_exists($filename)) {
1039 if (!&FileUtils::fileExists($filename)) {
[16769]1040 # try the original filename stored in map
[23347]1041 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1042 print STDERR "******!! orig filename did not exist: $filename\n";
[23347]1043 }
[23335]1044
[23387]1045## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
[23363]1046
[23387]1047 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
[23335]1048
[23347]1049 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23387]1050 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n";
[23347]1051 }
[23335]1052
[16920]1053 if (defined $original_filename && -e $original_filename) {
[23347]1054 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
[23363]1055 print STDERR "****** Found match!\n";
[23347]1056 }
[16769]1057 $filename = $original_filename;
[14665]1058 }
1059 }
[16769]1060
1061 my ($ext) = $filename =~ m/(\.[^\.]*)$/;
[14665]1062
1063 if ($rl == 0) {
[16769]1064 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1065 return "_httpextlink_&amp;rl=0&amp;el=prompt&amp;href=" . $href . $hash_part;
[14665]1066 }
1067 else {
[18521]1068 return "_httpextlink_&amp;rl=0&amp;el=direct&amp;href=" . $href . $hash_part;
[14665]1069 }
1070 }
1071
[16769]1072 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
[18521]1073 return "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part;
[14665]1074 }
[20778]1075 # add the original image file as a source file
[20791]1076 if (!$self->{'processing_tmp_files'} ) {
1077 $doc_obj->associate_source_file($filename);
1078 }
[14665]1079 if ($self->{'rename_assoc_files'}) {
1080 if (defined $self->{'aux_files'}->{$href}) {
1081 $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1082 $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1083 } else {
1084 $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1085 $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1086 $self->inc_filecount ();
1087 }
1088 $doc_obj->associate_file($filename, $newname, undef, $section);
1089 return "_httpdocimg_/$newname";
1090 } else {
[23387]1091 if(&unicode::is_url_encoded($unicode_filename)) {
[16904]1092 # use the possibly-decoded filename instead to avoid double URL encoding
1093 ($newname) = $filename =~ m/([^\/\\]*)$/;
1094 } else {
[23387]1095 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
[16904]1096 }
[16935]1097
[18320]1098 # Make sure this name uses only ASCII characters.
1099 # We use either base64 or URL encoding, as these preserve original encoding
1100 $newname = &util::rename_file($newname, $self->{'file_rename_method'});
[16632]1101
[23363]1102### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
[14665]1103 $doc_obj->associate_file($filename, $newname, undef, $section);
[16632]1104
[16769]1105 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1106 # of filenames, URL-encode the additional percent signs of the URL-encoded filename
[16632]1107 my $newname_url = $newname;
[18404]1108 $newname_url = &unicode::filename_to_url($newname_url);
[16769]1109 return "_httpdocimg_/$newname_url";
[14665]1110 }
1111}
1112
1113
1114sub format_link {
1115 my $self = shift (@_);
1116 my ($link, $base_dir, $file) = @_;
1117
[23371]1118 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
1119 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
[23463]1120
[14665]1121 $hash_part = "" if !defined $hash_part;
[16769]1122 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
[23463]1123 my $outhandle = $self->{'outhandle'};
1124 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1125 if $self->{'verbosity'};
1126 return ($link, "", 0);
[14665]1127 }
[23463]1128
[20576]1129 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
[23463]1130 my $type = $1;
[14665]1131
[16769]1132 if ($link =~ m/^(http|ftp):/i) {
[14665]1133 # Turn url (using /) into file name (possibly using \ on windows)
1134 my @http_dir_split = split('/', $before_hash);
[27306]1135 $before_hash = &FileUtils::filenameConcatenate(@http_dir_split);
[14665]1136 }
1137
1138 $before_hash = $self->eval_dir_dots($before_hash);
[16024]1139
[27306]1140 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
[14665]1141
1142 my $rl = 0;
1143 $rl = 1 if (-e $linkfilename);
1144
1145 # make sure there's a slash on the end if it's a directory
[16769]1146 if ($before_hash !~ m/\/$/) {
[14665]1147 $before_hash .= "/" if (-d $linkfilename);
1148 }
1149 return ($type . $before_hash, $hash_part, $rl);
[16024]1150
[16769]1151 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
[14665]1152
[16769]1153 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1154
[14665]1155 # the first directory will be the domain name if file_is_url
1156 # to generate archives, otherwise we'll assume all files are
1157 # from the same site and base_dir is the root
1158
1159 if ($self->{'file_is_url'}) {
1160 my @dirs = split /[\/\\]/, $file;
1161 my $domname = shift (@dirs);
[27306]1162 $before_hash = &FileUtils::filenameConcatenate($domname, $before_hash);
[14665]1163 $before_hash =~ s@\\@/@g; # for windows
1164 }
1165 else
1166 {
1167 # see if link shares directory with source document
1168 # => turn into relative link if this is so!
1169
[16769]1170 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
[14665]1171 # too difficult doing a pattern match with embedded '\'s...
1172 my $win_before_hash=$before_hash;
1173 $win_before_hash =~ s@(\\)+@/@g;
1174 # $base_dir is already similarly "converted" on windows.
1175 if ($win_before_hash =~ s@^$base_dir/@@o) {
[16024]1176 # if this is true, we removed a prefix
1177 $before_hash=$win_before_hash;
[14665]1178 }
1179 }
1180 else {
1181 # before_hash has lost leading slash by this point,
1182 # -> add back in prior to substitution with $base_dir
1183 $before_hash = "/$before_hash";
1184
[27306]1185 $before_hash = &FileUtils::filenameConcatenate("",$before_hash);
[14665]1186 $before_hash =~ s@^$base_dir/@@;
1187 }
1188 }
1189 } else {
1190 # Turn relative file path into full path
1191 my $dirname = &File::Basename::dirname($file);
[27306]1192 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
[16769]1193 $before_hash = $self->eval_dir_dots($before_hash);
[14665]1194 }
1195
[27306]1196 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
[23387]1197
1198
1199# print STDERR "**** linkfilename = $linkfilename\n";
1200# if (!&util::fd_exists($linkfilename)) {
1201# print STDERR "***** Warning: Could not find $linkfilename\n";
1202# }
1203
1204
[14665]1205 # make sure there's a slash on the end if it's a directory
[16769]1206 if ($before_hash !~ m/\/$/) {
[14665]1207 $before_hash .= "/" if (-d $linkfilename);
1208 }
[23387]1209
1210# print STDERR "*** returning: $before_hash\n";
1211
[14665]1212 return ("http://" . $before_hash, $hash_part, 1);
1213 } else {
1214 # mailto, news, nntp, telnet, javascript or gopher link
1215 return ($before_hash, "", 0);
1216 }
1217}
1218
1219sub extract_first_NNNN_characters {
1220 my $self = shift (@_);
1221 my ($textref, $doc_obj, $thissection) = @_;
1222
1223 foreach my $size (split /,/, $self->{'first'}) {
1224 my $tmptext = $$textref;
1225 # skip to the body
1226 $tmptext =~ s/.*<body[^>]*>//i;
1227 # remove javascript
1228 $tmptext =~ s@<script.*?</script>@ @sig;
1229 $tmptext =~ s/<[^>]*>/ /g;
1230 $tmptext =~ s/&nbsp;/ /g;
1231 $tmptext =~ s/^\s+//;
1232 $tmptext =~ s/\s+$//;
1233 $tmptext =~ s/\s+/ /gs;
1234 $tmptext = &unicode::substr ($tmptext, 0, $size);
1235 $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipsis (...)
1236 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1237 }
1238}
1239
1240
1241sub extract_metadata {
1242 my $self = shift (@_);
1243 my ($textref, $metadata, $doc_obj, $section) = @_;
1244 my $outhandle = $self->{'outhandle'};
1245 # if we don't want metadata, we may as well not be here ...
1246 return if (!defined $self->{'metadata_fields'});
[22348]1247
[21800]1248 my $separator = $self->{'metadata_field_separator'};
1249 if ($separator eq "") {
1250 undef $separator;
1251 }
1252
[14665]1253 # metadata fields to extract/save. 'key' is the (lowercase) name of the
1254 # html meta, 'value' is the metadata name for greenstone to use
1255 my %find_fields = ();
1256
1257 my %creator_fields = (); # short-cut for lookups
1258
1259
1260 foreach my $field (split /,/, $self->{'metadata_fields'}) {
1261 $field =~ s/^\s+//; # remove leading whitespace
1262 $field =~ s/\s+$//; # remove trailing whitespace
[22348]1263
[14665]1264 # support tag<tagname>
[20689]1265 if ($field =~ m/^(.*?)\s*<(.*?)>$/) {
[14665]1266 # "$2" is the user's preferred gs metadata name
1267 $find_fields{lc($1)}=$2; # lc = lowercase
1268 } else { # no <tagname> for mapping
1269 # "$field" is the user's preferred gs metadata name
1270 $find_fields{lc($field)}=$field; # lc = lowercase
1271 }
1272 }
1273
1274 if (defined $self->{'hunt_creator_metadata'} &&
1275 $self->{'hunt_creator_metadata'} == 1 ) {
1276 my @extra_fields =
1277 (
1278 'author',
1279 'author.email',
1280 'creator',
1281 'dc.creator',
1282 'dc.creator.corporatename',
1283 );
1284
1285 # add the creator_metadata fields to search for
1286 foreach my $field (@extra_fields) {
1287 $creator_fields{$field}=0; # add to lookup hash
1288 }
1289 }
1290
1291
1292 # find the header in the html file, which has the meta tags
1293 $$textref =~ m@<head>(.*?)</head>@si;
1294
1295 my $html_header=$1;
1296
1297 # go through every <meta... tag defined in the html and see if it is
1298 # one of the tags we want to match.
1299
1300 # special case for title - we want to remember if its been found
1301 my $found_title = 0;
1302 # this assumes that ">" won't appear. (I don't think it's allowed to...)
[16769]1303 $html_header =~ m/^/; # match the start of the string, for \G assertion
[16024]1304
[14665]1305 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1306 my $metatag=$1;
1307 my ($tag, $value);
1308
1309 # find the tag name
[16769]1310 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1311 $tag=$2;
1312 # in case they're not using " or ', but they should...
1313 if (! $tag) {
[16769]1314 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
[14665]1315 $tag=$1;
1316 }
1317
1318 if (!defined $tag) {
[15872]1319 print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
[14665]1320 next;
1321 }
1322
1323 # don't need to assign this field if it was passed in from a previous
1324 # (recursive) plugin
1325 if (defined $metadata->{$tag}) {next}
1326
1327 # find the tag content
[16769]1328 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
[14665]1329 $value=$2;
1330
[24431]1331 # The following code assigns the metaname to value if value is
1332 # empty. Why would we do this?
1333 #if (! $value) {
1334 # $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1335 # $value=$1;
1336 #}
1337 if (!defined $value || $value eq "") {
1338 print $outhandle "HTMLPlugin: can't find VALUE in <meta $metatag >\n" if ($self->{'verbosity'} > 2);
[14665]1339 next;
1340 }
[22348]1341
[14665]1342 # clean up and add
1343 $value =~ s/\s+/ /gs;
1344 chomp($value); # remove trailing \n, if any
1345 if (exists $creator_fields{lc($tag)}) {
1346 # map this value onto greenstone's "Creator" metadata
1347 $tag='Creator';
1348 } elsif (!exists $find_fields{lc($tag)}) {
[16024]1349 next; # don't want this tag
[14665]1350 } else {
1351 # get the user's preferred capitalisation
1352 $tag = $find_fields{lc($tag)};
1353 }
1354 if (lc($tag) eq "title") {
1355 $found_title = 1;
1356 }
[18521]1357
1358 if ($self->{'verbosity'} > 2) {
1359 print $outhandle " extracted \"$tag\" metadata \"$value\"\n";
[14665]1360 }
[18521]1361
[22348]1362 if ($tag =~ /\./) {
1363 # there is a . so has a namespace, add ex.
1364 $tag = "ex.$tag";
1365 }
[21800]1366 if (defined $separator) {
1367 my @values = split($separator, $value);
1368 foreach my $v (@values) {
1369 $doc_obj->add_utf8_metadata($section, $tag, $v) if $v =~ /\S/;
1370 }
1371 }
1372 else {
1373 $doc_obj->add_utf8_metadata($section, $tag, $value);
1374 }
[14665]1375 }
1376
1377 # TITLE: extract the document title
1378 if (exists $find_fields{'title'} && !$found_title) {
1379 # we want a title, and didn't find one in the meta tags
1380 # see if there's a <title> tag
1381 my $title;
1382 my $from = ""; # for debugging output only
[16769]1383 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
[14665]1384 $title = $1;
1385 $from = "<title> tags";
1386 }
1387
1388 if (!defined $title) {
1389 $from = "first 100 chars";
1390 # if no title use first 100 or so characters
1391 $title = $$textref;
1392 $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
1393 $title =~ s/^.*?<body>//si;
1394 # ignore javascript!
1395 $title =~ s@<script.*?</script>@ @sig;
1396 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1397 $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1398 $title = substr ($title, 0, 100);
1399 $title =~ s/\s\S*$/.../;
1400 }
1401 $title =~ s/<[^>]*>/ /g; # remove html tags
1402 $title =~ s/&nbsp;/ /g;
1403 $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1404 $title =~ s/\s+/ /gs; # collapse multiple spaces
1405 $title =~ s/^\s*//; # remove leading spaces
1406 $title =~ s/\s*$//; # remove trailing spaces
1407
1408 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1409 $title =~ s/^\s+//s; # in case title_sub introduced any...
[23335]1410 $doc_obj->add_utf8_metadata ($section, "Title", $title);
[14665]1411 print $outhandle " extracted Title metadata \"$title\" from $from\n"
1412 if ($self->{'verbosity'} > 2);
1413 }
1414
1415 # add FileFormat metadata
1416 $doc_obj->add_metadata($section,"FileFormat", "HTML");
1417
1418 # Special, for metadata names such as tagH1 - extracts
1419 # the text between the first <H1> and </H1> tags into "H1" metadata.
1420
1421 foreach my $field (keys %find_fields) {
[16769]1422 if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
[14665]1423 my $tag = $1;
1424 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1425 my $content = $1;
1426 $content =~ s/&nbsp;/ /g;
1427 $content =~ s/<[^>]*>/ /g;
1428 $content =~ s/^\s+//;
1429 $content =~ s/\s+$//;
1430 $content =~ s/\s+/ /gs;
1431 if ($content) {
1432 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1433 $tag =~ s/^tag//i;
1434 $doc_obj->add_utf8_metadata ($section, $tag, $content);
1435 print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1436 if ($self->{'verbosity'} > 2);
1437 }
1438 }
1439 }
1440}
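# Illustrative example (not part of the original code): with a collection
# configuration line such as
#   plugin HTMLPlugin -metadata_fields Title,tagH1
# a document containing
#   <h1>Chapter One</h1>
# gains an "H1" metadata value of "Chapter One" via the tagXX handling in
# the loop above.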
1441
1442
1443# evaluate any "../" to next directory up
1444# evaluate any "./" as here
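# e.g. (illustrative) "import/html/../images/./pic.png" evaluates to
# "import/images/pic.png"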
1445sub eval_dir_dots {
1446 my $self = shift (@_);
1447 my ($filename) = @_;
1448 my $dirsep_os = &util::get_os_dirsep();
1449 my @dirsep = split(/$dirsep_os/,$filename);
1450
1451 my @eval_dirs = ();
1452 foreach my $d (@dirsep) {
1453 if ($d eq "..") {
1454 pop(@eval_dirs);
1455
1456 } elsif ($d eq ".") {
1457 # do nothing!
1458
1459 } else {
1460 push(@eval_dirs,$d);
1461 }
1462 }
1463
1464 # Need to fiddle with number of elements in @eval_dirs if the
1465 # first one is the empty string. This is because of a
[27306]1466 # modification to FileUtils::filenameConcatenate that suppresses the addition
[14665]1467 # of a leading '/' character (or \ if windows) (intended to help
1468 # filename cat with relative paths) if the first entry in the
1469 # array is the empty string. Making the array start with *two*
1470 # empty strings is a way to defeat this "smart" option.
1471 #
1472 if (scalar(@eval_dirs) > 0) {
1473 if ($eval_dirs[0] eq ""){
1474 unshift(@eval_dirs,"");
1475 }
1476 }
[16836]1477
[27306]1478 my $evaluated_filename = (scalar @eval_dirs > 0) ? &FileUtils::filenameConcatenate(@eval_dirs) : "";
[16836]1479 return $evaluated_filename;
[14665]1480}
1481
1482sub replace_usemap_links {
1483 my $self = shift (@_);
1484 my ($front, $link, $back) = @_;
1485
[16769]1486 # remove quotes from link at start and end if necessary
1487 if ($link=~/^[\"\']/) {
1488 $link=~s/^[\"\']//;
1489 $link=~s/[\"\']$//;
1490 $front.='"';
1491 $back="\"$back";
1492 }
1493
[14665]1494 $link =~ s/^\.\///;
1495 return $front . $link . $back;
1496}
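# Illustrative example (not part of the original code): called with
# front='usemap=', link='"./map1"' and back=' ...', this returns
# 'usemap="map1" ...' -- the quotes are re-attached to front and back,
# and any leading "./" is dropped from the link itself.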
1497
1498sub inc_filecount {
1499 my $self = shift (@_);
1500
1501 if ($self->{'file_num'} == 1000) {
1502 $self->{'dir_num'} ++;
1503 $self->{'file_num'} = 0;
1504 } else {
1505 $self->{'file_num'} ++;
1506 }
1507}
1508
1509
[15872]1510# Extend read_file so that strings like &eacute; are
[14665]1511# converted to UTF8 internally.
1512#
1513# We don't convert &lt; or &gt; or &amp; or &quot; in case
1514# they interfere with the GML files
1515
1516sub read_file {
[15872]1517 my $self = shift(@_);
1518 my ($filename, $encoding, $language, $textref) = @_;
[14665]1519
[15872]1520 $self->SUPER::read_file($filename, $encoding, $language, $textref);
[14665]1521
[23363]1522 # Convert entities to their Unicode code-point equivalents
[14665]1523 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
[22951]1524 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
[14665]1525 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
[22842]1526
[14665]1527}
1528
[20774]1529sub HB_read_html_file {
1530 my $self = shift (@_);
1531 my ($htmlfile, $text) = @_;
1532
1533 # load in the file
1534 if (!open (FILE, $htmlfile)) {
1535 print STDERR "ERROR - could not open $htmlfile\n";
1536 return;
1537 }
1538
1539 my $foundbody = 0;
1540 $self->HB_gettext (\$foundbody, $text, "FILE");
1541 close FILE;
1542
1543 # just in case there was no <body> tag
1544 if (!$foundbody) {
1545 $foundbody = 1;
1546 open (FILE, $htmlfile) || return;
1547 $self->HB_gettext (\$foundbody, $text, "FILE");
1548 close FILE;
1549 }
1550 # text is in utf8
1551}
1552
1553 # converts the text to utf8 (ghtml already converts entities such as &eacute; to utf8)
1554sub HB_gettext {
1555 my $self = shift (@_);
1556 my ($foundbody, $text, $handle) = @_;
1557
1558 my $line = "";
1559 while (defined ($line = <$handle>)) {
1560 # look for body tag
1561 if (!$$foundbody) {
1562 if ($line =~ s/^.*<body[^>]*>//i) {
1563 $$foundbody = 1;
1564 } else {
1565 next;
1566 }
1567 }
1568
1569 # check for symbol fonts
1570 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
1571 my $font = $1;
1572 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
1573 if ($font !~ m/^arial$/i);
1574 }
1575
1576 $$text .= $line;
1577 }
1578
1579 if ($self->{'input_encoding'} eq "iso_8859_1") {
1580 # convert to utf-8
1581 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
1582 }
1583 # convert any alphanumeric character entities to their utf-8
1584 # equivalent for indexing purposes
1585 #&ghtml::convertcharentities ($$text);
1586
1587 $$text =~ s/\s+/ /g; # remove \n's
[22857]1588
1589 # At this point $$text is a binary byte string
1590 # => turn it into a Unicode aware string, so full
1591 # Unicode aware pattern matching can be used.
1592 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
1593 #
1594
1595 $$text = decode("utf8",$$text);
[20774]1596}
1597
1598sub HB_clean_section {
1599 my $self = shift (@_);
1600 my ($section) = @_;
1601
1602 # remove tags without a starting tag from the section
1603 my ($tag, $tagstart);
1604 while ($section =~ m/<\/([^>]{1,10})>/) {
1605 $tag = $1;
1606 $tagstart = index($section, "<$tag");
1607 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
1608 $section =~ s/<\/$tag>//;
1609 }
1610
1611 # remove extra paragraph tags
1612 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
1613
1614 # remove extra stuff at the end of the section
1615 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
1616
1617 # add a newline at the beginning of each paragraph
1618 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
1619
1620 # add a newline every 80 characters at a word boundary
1621 # Note: this regular expression puts a line feed before
1622 # the last word in each section, even when it is not
1623 # needed.
1624 $section =~ s/(.{1,80})\s/$1\n/g;
1625
1626 # fix up the image links
1627 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
1628 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1629 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
1630 <center><img src=\"$1\" \/><\/center><br\/>/ig;
1631
1632 return $section;
1633}
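# Illustrative example (not part of the original code): the image fix-ups
# above turn either
#   <img src="pic.png">   or   &lt;&lt;I&gt;&gt; pic.png
# into
#   <center><img src="pic.png" /></center><br/>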
1634
1635# Will convert the oldHDL format to the new HDL format (using the Section tag)
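# Illustrative sketch of the old HDL markup handled below (not part of
# the original code): a heading line such as
#   <p><b>&lt;&lt;TOC2&gt;&gt; Some chapter title</b>
# becomes a <Section> element (wrapped in an HTML comment) carrying a
# Title Metadata entry, nested according to its TOC level (2 here).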
1636sub convert_to_newHDLformat
1637{
1638 my $self = shift (@_);
1639 my ($file,$cnfile) = @_;
1640 my $input_filename = $file;
1641 my $tmp_filename = $cnfile;
1642
1643 # write HTML tmp file with new HDL format
1644 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1645
1646 # read in the file and do basic html cleaning (removing header etc)
1647 my $html = "";
1648 $self->HB_read_html_file ($input_filename, \$html);
1649
1650 # process the file one section at a time
1651 my $curtoclevel = 1;
1652 my $firstsection = 1;
1653 my $toclevel = 0;
1654 while (length ($html) > 0) {
1655 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
1656 $toclevel = $3;
1657 my $title = $4;
1658 my $sectiontext = "";
1659 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
1660 $sectiontext = $1;
1661 } else {
1662 $sectiontext = $html;
1663 $html = "";
1664 }
1665
1666 # remove tags and extra spaces from the title
1667 $title =~ s/<\/?[^>]+>//g;
1668 $title =~ s/^\s+|\s+$//g;
1669
1670 # close any sections below the current level and
1671 # create a new section (special case for the firstsection)
1672 print PROD "<!--\n";
1673 while (($curtoclevel > $toclevel) ||
1674 (!$firstsection && $curtoclevel == $toclevel)) {
1675 $curtoclevel--;
1676 print PROD "</Section>\n";
1677 }
1678 if ($curtoclevel+1 < $toclevel) {
1679 print STDERR "WARNING - jump in toc levels in $input_filename " .
1680 "from $curtoclevel to $toclevel\n";
1681 }
1682 while ($curtoclevel < $toclevel) {
1683 $curtoclevel++;
1684 }
1685
1686 if ($curtoclevel == 1) {
1687 # add the header tag
1688 print PROD "-->\n";
1689 print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
1690 print PROD "<!--\n";
1691 }
1692
1693 print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
1694
1695 print PROD "-->\n";
1696
1697 # clean up the section html
1698 $sectiontext = $self->HB_clean_section($sectiontext);
1699
1700 print PROD "$sectiontext\n";
1701
1702 } else {
1703 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
1704 "\nin $input_filename\n";
1705 last;
1706 }
1707 $firstsection = 0;
1708 }
1709
1710 print PROD "<!--\n";
1711 while ($curtoclevel > 0) {
1712 $curtoclevel--;
1713 print PROD "</Section>\n";
1714 }
1715 print PROD "-->\n";
1716
1717 close (PROD) || die("Error Closing File: $tmp_filename $!");
1718
1719 return $tmp_filename;
1720}
1721
1722sub shorten {
1723 my $self = shift (@_);
1724 my ($text) = @_;
1725
1726 return "\"$text\"" if (length($text) < 100);
1727
1728 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
1729 substr ($text, length($text)-50) . "\"";
1730}
1731
1732sub convert_tidy_or_oldHDL_file
1733{
1734 my $self = shift (@_);
1735 my ($file) = @_;
1736 my $input_filename = $file;
1737
1738 if (-d $input_filename)
1739 {
1740 return $input_filename;
1741 }
1742
1743 # get the input filename
1744 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1745 my $base_dirname = $dirname;
1746 $suffix = lc($suffix);
1747
1748 # derive tmp filename from input filename
1749 # Remove any white space from filename -- no risk of name collision, and
1750 # makes later conversion by utils simpler. Leave spaces in path...
1751 # strip any spaces, dots and hyphens from the filename itself
1752 $tailname =~ s/\s+//g;
1753 $tailname =~ s/\.+//g;
1754 $tailname =~ s/\-+//g;
1755 # convert to utf-8 otherwise we have problems with the doc.xml file
1756 # later on
1757 &unicode::ensure_utf8(\$tailname);
1758
1759 # work in a 'tidytmp' directory within the collection directory
[27306]1760 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
[20774]1761 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1762
1763 my $test_dirname = "";
1764 my $f_separator = &util::get_os_dirsep();
1765
1766 if ($dirname =~ m/import$f_separator/)
1767 {
1768 $test_dirname = $'; #'
1769
1770 #print STDERR "init $'\n";
1771
1772 while ($test_dirname =~ m/[$f_separator]/)
1773 {
1774 my $folderdirname = $`;
[27306]1775 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname,$folderdirname);
[20774]1776 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1777 $test_dirname = $'; #'
1778 }
1779 }
1780
[27306]1781 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1782
1783 # tidy or convert the input file if it is an HTML-like file or is accepted by the process_exp
1784 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
1785 {
1786 #convert the input file to a new style HDL
1787 my $hdl_output_filename = $input_filename;
1788 if ($self->{'old_style_HDL'})
1789 {
[27306]1790 $hdl_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1791 $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
1792 }
1793
1794 # copy every other file from the base dir to the tmp dir if it does not already exist there
1795 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
1796 my @files = grep {!/^\.+$/} readdir(DIR);
1797 closedir(DIR);
1798
1799 foreach my $file (@files)
1800 {
[27306]1801 my $src_file = &FileUtils::filenameConcatenate($base_dirname,$file);
1802 my $dest_file = &FileUtils::filenameConcatenate($tmp_dirname,$file);
[20774]1803 if ((!-e $dest_file) && (!-d $src_file))
1804 {
1805 # just copy the original file back to the tmp directory
1806 copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
1807 }
1808 }
1809
1810 # tidy the input file
1811 my $tidy_output_filename = $hdl_output_filename;
1812 if ($self->{'use_realistic_book'})
1813 {
[27306]1814 $tidy_output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$suffix");
[20774]1815 $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
1816 }
1817 $tmp_filename = $tidy_output_filename;
1818 }
1819 else
1820 {
1821 if (!-e $tmp_filename)
1822 {
1823 # just copy the original file back to the tmp directory
1824 copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
1825 }
1826 }
1827
1828 return $tmp_filename;
1829}
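# Illustrative example (not part of the original code): for an input file
# such as import/my docs/ch 1.html (hypothetical path), the routine works
# on a copy at <collection>/tidytmp/my docs/ch1.html -- spaces, dots and
# hyphens are stripped from the filename (but not from the path) before
# tidying.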
1830
1831
1832 # Converts the html input file into a proper XML file, with font tags removed
1833 # and image sizes added to the img tags.
1834 # The tidying process takes place in a collection-specific 'tmp' directory so
1835# that we don't accidentally damage the input.
1836sub tmp_tidy_file
1837{
1838 my $self = shift (@_);
1839 my ($file,$cnfile) = @_;
1840 my $input_filename = $file;
1841 my $tmp_filename = $cnfile;
1842
1843 # get the input filename
1844 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1845
1846 require HTML::TokeParser::Simple;
1847
1848 # create HTML parser to decode the input file
1849 my $parser = HTML::TokeParser::Simple->new($input_filename);
1850
1851 # write the HTML tmp file with font tags removed and width/height attributes added to each img tag
1852 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1853 while (my $token = $parser->get_token())
1854 {
1855 # is it an img tag
1856 if ($token->is_start_tag('img'))
1857 {
1858 # get the attributes
1859 my $attr = $token->return_attr;
1860
1861 # get the full path to the image
[27306]1862 my $img_file = &FileUtils::filenameConcatenate($dirname,$attr->{src});
[20774]1863
1864 # set the width and height attribute
1865 ($attr->{width}, $attr->{height}) = imgsize($img_file);
1866
1867 # recreate the tag
1868 print PROD "<img";
1869 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
1870 print PROD ">";
1871 }
1872 # is it a font tag
1873 else
1874 {
1875 if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
1876 {
1877 # remove font tag
1878 print PROD "";
1879 }
1880 else
1881 {
1882 # print without changes
1883 print PROD $token->as_is;
1884 }
1885 }
1886 }
1887 close (PROD) || die("Error Closing File: $tmp_filename $!");
1888
1889 # run html-tidy on the tmp file to make it a proper XML file
1890
[22594]1891 my $outhandle = $self->{'outhandle'};
1892 print $outhandle "Converting HTML to be XML compliant:\n";
1893
1894 my $tidy_cmd = "tidy";
1895 $tidy_cmd .= " -q" if ($self->{'verbosity'} <= 2);
[22636]1896 $tidy_cmd .= " -raw -wrap 0 -asxml \"$tmp_filename\"";
[22594]1897 if ($self->{'verbosity'} <= 2) {
1898 if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1899 $tidy_cmd .= " 2>nul";
1900 }
1901 else {
1902 $tidy_cmd .= " 2>/dev/null";
1903 }
1904 print $outhandle " => $tidy_cmd\n";
1905 }
1906
1907 my $tidyfile = `$tidy_cmd`;
1908
[20774]1909 # write result back to the tmp file
1910 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
1911 print PROD $tidyfile;
1912 close (PROD) || die("Error Closing File: $tmp_filename $!");
1913
1914 # return the output filename
1915 return $tmp_filename;
1916}
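# Illustrative example (not part of the original code): a tag such as
#   <img src="pic.jpg">
# is rewritten with its measured dimensions, e.g.
#   <img src="pic.jpg" width="640" height="480">   (sizes hypothetical)
# and <font>...</font> tags are dropped, before tidy converts the result
# to XHTML.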
1917
[22355]1918sub associate_cover_image
1919{
1920 my $self = shift(@_);
1921 my ($doc_obj, $filename) = @_;
1922 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'}))
1923 {
1924 # the cover image will be in tidytmp, but we want the one from import
1925 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
1926 }
1927 $self->SUPER::associate_cover_image($doc_obj, $filename);
1928}
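# Illustrative note (not part of the original code): for a realistic book
# the document filename passed in points into tidytmp, so the substitution
# above makes the base class look for the cover image alongside the
# original file under import instead (hypothetical example:
# .../tidytmp/doc/ch1.html -> .../import/doc/ch1.html).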
1929
1930
[14665]19311;