root/gsdl/trunk/perllib/plugins/HTMLPlugin.pm @ 16812

Revision 16812, 51.8 KB (checked in by ak19, 11 years ago)

Undid the changes introduced in the last commit which url encode the href_link and web_url, since the database can manage UTF8 keys after all after making some small change to GDBMWrapper. While previous changes got interlinking htmls with multilingual filenames working for GS3 they failed for GS2 (for which they were previously working). Now it should hopefully work for both.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# HTMLPlugin.pm -- basic html plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# Note that this plugin handles frames only in a very simple way
29# i.e. each frame is treated as a separate document. This means
30# search results will contain links to individual frames rather
31# than linking to the top level frameset.
32# There may also be some problems caused by the _parent target
33# (it's removed by this plugin)
34#
35
36package HTMLPlugin;
37
38use ReadTextFile;
39use HBPlugin;
40use ghtml;
41use unicode;
42use util;
43use XMLParser;
44
45use Image::Size;
46use File::Copy;
47
48sub BEGIN {
49    @HTMLPlugin::ISA = ('ReadTextFile', 'HBPlugin');
50}
51
52use strict; # every perl program should have this!
53no strict 'refs'; # make an exception so we can use variables as filehandles
54
55my $arguments =
56    [ { 'name' => "process_exp",
57    'desc' => "{BasePlugin.process_exp}",
58    'type' => "regexp",
59    'deft' =>  &get_default_process_exp() },
60      { 'name' => "block_exp",
61    'desc' => "{BasePlugin.block_exp}",
62    'type' => 'regexp',
63    'deft' =>  &get_default_block_exp() },
64      { 'name' => "nolinks",
65    'desc' => "{HTMLPlugin.nolinks}",
66    'type' => "flag" },
67      { 'name' => "keep_head",
68    'desc' => "{HTMLPlugin.keep_head}",
69    'type' => "flag" },
70      { 'name' => "no_metadata",
71    'desc' => "{HTMLPlugin.no_metadata}",
72    'type' => "flag" },
73      { 'name' => "metadata_fields",
74    'desc' => "{HTMLPlugin.metadata_fields}",
75    'type' => "string",
76    'deft' => "Title" },
77      { 'name' => "hunt_creator_metadata",
78    'desc' => "{HTMLPlugin.hunt_creator_metadata}",
79    'type' => "flag" },
80      { 'name' => "file_is_url",
81    'desc' => "{HTMLPlugin.file_is_url}",
82    'type' => "flag" },
83      { 'name' => "assoc_files",
84    'desc' => "{HTMLPlugin.assoc_files}",
85    'type' => "regexp",
86    'deft' => &get_default_block_exp() },
87      { 'name' => "rename_assoc_files",
88    'desc' => "{HTMLPlugin.rename_assoc_files}",
89    'type' => "flag" },
90      { 'name' => "title_sub",
91    'desc' => "{HTMLPlugin.title_sub}",
92    'type' => "string",
93    'deft' => "" },
94      { 'name' => "description_tags",
95    'desc' => "{HTMLPlugin.description_tags}",
96    'type' => "flag" },
97      # retain this for backward compatibility (w3mir option was replaced by
98      # file_is_url)
99      { 'name' => "w3mir",
100#   'desc' => "{HTMLPlugin.w3mir}",
101    'type' => "flag",
102    'hiddengli' => "yes"},
103      { 'name' => "no_strip_metadata_html",
104    'desc' => "{HTMLPlugin.no_strip_metadata_html}",
105    'type' => "string",
106    'deft' => "",
107    'reqd' => "no"},
108      { 'name' => "sectionalise_using_h_tags",
109    'desc' => "{HTMLPlugin.sectionalise_using_h_tags}",
110    'type' => "flag" },
111      { 'name' => "use_realistic_book",
112        'desc' => "{HTMLPlugin.tidy_html}",
113    'type' => "flag"},
114      { 'name' => "old_style_HDL",
115        'desc' => "{HTMLPlugin.old_style_HDL}",
116    'type' => "flag"}
117      ];
118
119my $options = { 'name'     => "HTMLPlugin",
120        'desc'     => "{HTMLPlugin.desc}",
121        'abstract' => "no",
122        'inherits' => "yes",
123        'args'     => $arguments };
124
125
126sub HB_read_html_file {
127    my $self = shift (@_);
128    my ($htmlfile, $text) = @_;
129   
130    # load in the file
131    if (!open (FILE, $htmlfile)) {
132    print STDERR "ERROR - could not open $htmlfile\n";
133    return;
134    }
135
136    my $foundbody = 0;
137    $self->HB_gettext (\$foundbody, $text, "FILE");
138    close FILE;
139   
140    # just in case there was no <body> tag
141    if (!$foundbody) {
142    $foundbody = 1;
143    open (FILE, $htmlfile) || return;
144    $self->HB_gettext (\$foundbody, $text, "FILE");
145    close FILE;
146    }
147    # text is in utf8
148}       
149
150# converts the text to utf8, as ghtml does that for &eacute; etc.
151sub HB_gettext {
152    my $self = shift (@_);
153    my ($foundbody, $text, $handle) = @_;
154   
155    my $line = "";
156    while (defined ($line = <$handle>)) {
157    # look for body tag
158    if (!$$foundbody) {
159        if ($line =~ s/^.*<body[^>]*>//i) {
160        $$foundbody = 1;
161        } else {
162        next;
163        }
164    }
165   
166    # check for symbol fonts
167    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
168        my $font = $1;
169        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
170        if ($font !~ m/^arial$/i);
171    }
172
173    $$text .= $line;
174    }
175
176    if ($self->{'input_encoding'} eq "iso_8859_1") {
177    # convert to utf-8
178    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
179    }
180    # convert any alphanumeric character entities to their utf-8
181    # equivalent for indexing purposes
182    #&ghtml::convertcharentities ($$text);
183
184    $$text =~ s/\s+/ /g; # remove \n's
185}
186
187sub HB_clean_section {
188    my $self = shift (@_);
189    my ($section) = @_;
190
191    # remove tags without a starting tag from the section
192    my ($tag, $tagstart);
193    while ($section =~ m/<\/([^>]{1,10})>/) {
194    $tag = $1;
195    $tagstart = index($section, "<$tag");
196    last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
197    $section =~ s/<\/$tag>//;
198    }
199   
200    # remove extra paragraph tags
201    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
202   
203    # remove extra stuff at the end of the section
204    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
205   
206    # add a newline at the beginning of each paragraph
207    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
208   
209    # add a newline every 80 characters at a word boundary
210    # Note: this regular expression puts a line feed before
211    # the last word in each section, even when it is not
212    # needed.
213    $section =~ s/(.{1,80})\s/$1\n/g;
214   
215    # fix up the image links
216    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
217    <center><img src=\"$1\" \/><\/center><br\/>/ig;
218    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
219    <center><img src=\"$1\" \/><\/center><br\/>/ig;
220
221    return $section;
222}
223
224# Will convert the oldHDL format to the new HDL format (using the Section tag) 
225sub convert_to_newHDLformat
226{
227    my $self = shift (@_);
228    my ($file,$cnfile) = @_;
229    my $input_filename = $file;
230    my $tmp_filename = $cnfile;
231   
232    # write HTML tmp file with new HDL format
233    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
234   
235    # read in the file and do basic html cleaning (removing header etc)
236    my $html = "";
237    $self->HB_read_html_file ($input_filename, \$html);
238   
239    # process the file one section at a time
240    my $curtoclevel = 1;
241    my $firstsection = 1;
242    my $toclevel = 0;
243    while (length ($html) > 0) {
244    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
245        $toclevel = $3;
246        my $title = $4;
247        my $sectiontext = "";
248        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
249        $sectiontext = $1;
250        } else {
251        $sectiontext = $html;
252        $html = "";
253        }
254
255        # remove tags and extra spaces from the title
256        $title =~ s/<\/?[^>]+>//g;
257        $title =~ s/^\s+|\s+$//g;
258
259        # close any sections below the current level and
260        # create a new section (special case for the firstsection)
261        print PROD "<!--\n";
262        while (($curtoclevel > $toclevel) ||
263           (!$firstsection && $curtoclevel == $toclevel)) {
264        $curtoclevel--;
265        print PROD "</Section>\n";
266        }
267        if ($curtoclevel+1 < $toclevel) {
268        print STDERR "WARNING - jump in toc levels in $input_filename " .
269            "from $curtoclevel to $toclevel\n";
270        }
271        while ($curtoclevel < $toclevel) {
272        $curtoclevel++;
273        }
274
275        if ($curtoclevel == 1) {
276            # add the header tag
277        print PROD "-->\n";
278            print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
279        print PROD "<!--\n";
280        }
281       
282        print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
283       
284        print PROD "-->\n";
285       
286        # clean up the section html
287        $sectiontext = $self->HB_clean_section($sectiontext);
288
289        print PROD "$sectiontext\n";     
290
291    } else {
292        print STDERR "WARNING - leftover text\n" , $self->shorten($html),
293        "\nin $input_filename\n";
294        last;
295    }
296    $firstsection = 0;
297    }
298   
299    print PROD "<!--\n";
300    while ($curtoclevel > 0) {
301    $curtoclevel--;
302    print PROD "</Section>\n";
303    }
304    print PROD "-->\n";
305   
306    close (PROD) || die("Error Closing File: $tmp_filename $!");
307   
308    return $tmp_filename;
309}       
310
311sub shorten {
312    my $self = shift (@_);
313    my ($text) = @_;
314
315    return "\"$text\"" if (length($text) < 100);
316
317    return "\"" . substr ($text, 0, 50) . "\" ... \"" .
318    substr ($text, length($text)-50) . "\"";
319}
320
321sub convert_tidy_or_oldHDL_file
322{
323    my $self = shift (@_);
324    my ($file) = @_;
325    my $input_filename = $file;
326   
327    if (-d $input_filename)
328    {
329        return $input_filename;
330    }
331   
332    # get the input filename
333    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
334    my $base_dirname = $dirname;
335    $suffix = lc($suffix);   
336   
337    # derive tmp filename from input filename
338    # Remove any white space from filename -- no risk of name collision, and
339    # makes later conversion by utils simpler. Leave spaces in path...
340    # tidy up the filename with space, dot, hyphen between
341    $tailname =~ s/\s+//g;
342    $tailname =~ s/\.+//g;
343    $tailname =~ s/\-+//g;
344    # convert to utf-8 otherwise we have problems with the doc.xml file
345    # later on
346    &unicode::ensure_utf8(\$tailname);
347   
348    # softlink to collection tmp dir
349    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
350    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
351   
352    my $test_dirname = "";
353    my $f_separator = &util::get_os_dirsep();
354   
355    if ($dirname =~ m/import$f_separator/)
356    {
357        $test_dirname = $'; #'
358   
359    #print STDERR "init $'\n";
360   
361    while ($test_dirname =~ m/[$f_separator]/)
362    {
363        my $folderdirname = $`;
364        $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
365        &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
366        $test_dirname = $'; #'
367    }
368    }
369   
370    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
371   
372    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
373    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
374    {   
375        #convert the input file to a new style HDL
376        my $hdl_output_filename = $input_filename;
377        if ($self->{'old_style_HDL'})
378        {
379        $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
380        $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
381        }
382   
383    #just for checking copy all other file from the base dir to tmp dir if it is not exists
384    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
385    my @files = grep {!/^\.+$/} readdir(DIR);
386    close(DIR);
387
388    foreach my $file (@files)
389    {
390        my $src_file = &util::filename_cat($base_dirname,$file);
391        my $dest_file = &util::filename_cat($tmp_dirname,$file);
392        if ((!-e $dest_file) && (!-d $src_file))
393        {
394        # just copy the original file back to the tmp directory
395        copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
396        }
397    }
398   
399    # tidy the input file
400    my $tidy_output_filename = $hdl_output_filename;
401    if ($self->{'tidy_html'})
402    {
403        $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
404        $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
405    }
406    $tmp_filename = $tidy_output_filename;
407    }
408    else
409    {
410        if (!-e $tmp_filename)
411    {
412        # just copy the original file back to the tmp directory
413        copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
414    }
415    }
416   
417    return $tmp_filename;
418}
419
420
421# Will make the html input file as a proper XML file with removed font tag and
422# image size added to the img tag.
423# The tidying process takes place in a collection specific 'tmp' directory so
424# that we don't accidentally damage the input.
425sub tmp_tidy_file
426{
427    my $self = shift (@_);
428    my ($file,$cnfile) = @_;
429    my $input_filename = $file;
430    my $tmp_filename = $cnfile;
431   
432    # get the input filename
433    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
434
435    require HTML::TokeParser::Simple;
436   
437    # create HTML parser to decode the input file
438    my $parser = HTML::TokeParser::Simple->new($input_filename);
439
440    # write HTML tmp file without the font tag and image size are added to the img tag
441    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
442    while (my $token = $parser->get_token())
443    {
444    # is it an img tag
445    if ($token->is_start_tag('img'))
446    {
447        # get the attributes
448        my $attr = $token->return_attr;
449
450        # get the full path to the image
451        my $img_file = &util::filename_cat($dirname,$attr->{src});
452       
453        # set the width and height attribute
454        ($attr->{width}, $attr->{height}) = imgsize($img_file);
455
456        # recreate the tag
457        print PROD "<img";
458        print PROD map { qq { $_="$attr->{$_}"} } keys %$attr;
459        print PROD ">";
460    }
461    # is it a font tag
462    else
463    {
464        if (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
465        {
466        # remove font tag
467        print PROD "";
468        }
469        else
470        {
471        # print without changes
472        print PROD $token->as_is;
473        }
474    }
475    }
476    close (PROD) || die("Error Closing File: $tmp_filename $!");
477
478    # run html-tidy on the tmp file to make it a proper XML file
479    my $tidyfile = `tidy -utf8 -wrap 0 -asxml $tmp_filename`;
480
481    # write result back to the tmp file
482    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
483    print PROD $tidyfile;
484    close (PROD) || die("Error Closing File: $tmp_filename $!");
485   
486    # return the output filename
487    return $tmp_filename;
488}
489
490sub read_into_doc_obj
491{
492    my $self = shift (@_); 
493    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
494       
495    # get the input file
496    my $input_filename = $file;
497    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
498    $suffix = lc($suffix);   
499   
500    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
501    {
502    # because the document has to be sectionalized set the description tags
503    $self->{'description_tags'} = 1;
504   
505    # set the file to be tidied
506    $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
507   
508    # get the tidied file
509    #my $tidy_filename = $self->tmp_tidy_file($input_filename);
510    my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
511   
512    # derive tmp filename from input filename
513    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$");
514   
515    # set the new input file and base_dir to be from the tidied file
516    $file = "$tailname$suffix";
517    $base_dir = $dirname;
518    }
519   
520    # call the parent read_into_doc_obj
521    my ($process_status,$doc_obj) = $self->SUPER::read_into_doc_obj($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
522   
523    return ($process_status,$doc_obj);
524}
525
526sub new {
527    my ($class) = shift (@_);
528    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
529    push(@$pluginlist, $class);
530   
531    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
532    push(@{$hashArgOptLists->{"OptList"}},$options);
533   
534
535    my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
536   
537    if ($self->{'w3mir'}) {
538    $self->{'file_is_url'} = 1;
539    }
540    $self->{'aux_files'} = {};
541    $self->{'dir_num'} = 0;
542    $self->{'file_num'} = 0;
543   
544    return bless $self, $class;
545}
546
547# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
548# if have eg <script language="javascript" src="img/lib.js@123">
549sub get_default_block_exp {
550    my $self = shift (@_);
551   
552    #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
553    return "";
554}
555
556sub get_default_process_exp {
557    my $self = shift (@_);
558   
559    # the last option is an attempt to encode the concept of an html query ...
560    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
561}
562
563sub store_block_files
564{
565    my $self =shift (@_);
566    my ($filename_full_path, $block_hash) = @_;
567
568    my $html_fname = $filename_full_path;
569    my @file_blocks;
570   
571    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
572
573    # read in file ($text will be in utf8)
574    my $raw_text = "";
575    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
576
577    my $textref = \$raw_text;
578    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
579    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
580    $$textref =~ s/$opencom(.*?)$closecom//gs;
581
582    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
583    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
584    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
585    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
586    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
587    my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
588    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
589
590    if(!defined $self->{'utf8_to_original_filename'}) {
591    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
592    $self->{'utf8_to_original_filename'} = {};
593    }
594
595    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
596
597    # remove quotes from link at start and end if necessary
598    if ($link=~/^\"/) {
599        $link=~s/^\"//;
600        $link=~s/\"$//;
601    }
602
603    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
604    # some links may just be anchor names
605    next unless ($link =~ /\S+/);
606
607    if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
608        # Turn relative file path into full path
609        my $dirname = &File::Basename::dirname($filename_full_path);
610        $link = &util::filename_cat($dirname, $link);
611    }
612    $link = $self->eval_dir_dots($link);
613   
614
615    # this is the actual filename on the filesystem (that the link refers to)
616    my $url_original_filename = $self->opt_url_decode($link);
617
618    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
619    my $utf8_link = "";
620    $self->decode_text($link,$encoding,$language,\$utf8_link);
621### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link);
622
623    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
624### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename;
625
626#   print STDERR "**** Storing block link: $link\n";
627#   print STDERR "**** URL original filename: $url_original_filename\n";
628    print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
629
630    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
631    }
632}
633
634# Given a filename in any encoding, will URL decode it to get back the original filename
635# in the original encoding. Because this method is intended to work out the *original*
636# filename*, it not URL decode any filename if a file by the name of the *URL-encoded*
637# string already exists in the local folder.
638# Return the original filename corresponding to the parameter URL-encoded filename, and
639# a decoded flag that is set to true iff URL-decoding had to be applied.
640sub opt_url_decode {
641    my $self = shift (@_);
642    my ($link) = @_;
643
644    # Replace %XX's in URL with decoded value if required.
645    # Note that the filename may include the %XX in some situations
646    if ($link =~ m/\%[A-F0-9]{2}/i) {
647    if (!-e $link) {
648        $link = &unicode::url_decode($link);
649    }
650    }
651
652    return $link;
653}
654
655
656# do plugin specific processing of doc_obj
657sub process {
658    my $self = shift (@_);
659    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
660    my $outhandle = $self->{'outhandle'};
661
662    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
663    # this makes life so much easier... perl can cope with unix-style '/'s.
664    $base_dir =~ s@(\\)+@/@g;
665    $file =~ s@(\\)+@/@g;
666    }
667   
668    # reset per-doc stuff...
669    $self->{'aux_files'} = {};
670    $self->{'dir_num'} = 0;
671    $self->{'file_num'} = 0;
672
673    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
674    # you can also include metadata in the format (X can be any number)
675    # <hX>Title<!--gsdl-metadata
676    #   <Metadata name="name1">value1</Metadata>
677    #   ...
678    #   <Metadata name="nameN">valueN</Metadata>
679    #--></hX>
680    if ($self->{'sectionalise_using_h_tags'}) {
681    # description_tags should allways be activated because we convert headings to description tags
682    $self->{'description_tags'} = 1;
683
684    my $arrSections = [];
685    $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
686
687    if (scalar(@$arrSections)) {
688        my $strMetadata = $self->update_section_data($arrSections, -1);
689        if (length($strMetadata)) {
690        $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
691        $$textref =~ s/<\/body>/$strMetadata/ig;
692        }
693    }
694    }
695
696    my $cursection = $doc_obj->get_top_section();
697
698    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
699    unless $self->{'no_metadata'} || $self->{'description_tags'};
700
701    # Store URL for page as metadata - this can be used for an
702    # altavista style search interface. The URL won't be valid
703    # unless the file structure contains the domain name (i.e.
704    # like when w3mir is used to download a website).
705
706    # URL metadata (even invalid ones) are used to support internal
707    # links, so even if 'file_is_url' is off, still need to store info
708
709#    print STDERR "#### file: $file\n";
710    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
711    my $utf8_file = $self->filename_to_utf8_metadata($file);
712
713    my $web_url = "http://";
714    if(defined $dirname) { # local directory
715    $web_url = $web_url.$dirname.$utf8_file;
716    } else {
717    $web_url = $web_url.$utf8_file;
718    }
719#    print STDERR "#### weburl: $web_url\n";
720
721    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
722
723    if ($self->{'file_is_url'}) {
724    $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
725    $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
726    $doc_obj->add_metadata($cursection, "/weblink", "</a>");
727    }
728
729    if ($self->{'description_tags'}) {
730    # remove the html header - note that doing this here means any
731    # sections defined within the header will be lost (so all <Section>
732    # tags must appear within the body of the HTML)
733    my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
734
735    $$textref =~ s/^.*?<body[^>]*>//is;
736    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
737
738    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
739    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
740
741    my $lt = '(?:<|&lt;)';
742    my $gt = '(?:>|&gt;)';
743    my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
744
745    my $dont_strip = '';
746    if ($self->{'no_strip_metadata_html'}) {
747        ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
748    }
749
750    my $found_something = 0; my $top = 1;
751    while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
752        my $text = $1;
753        my $comment = $2;
754        if (defined $text) {
755        # text before a comment - note that getting to here
756        # doesn't necessarily mean there are Section tags in
757        # the document
758        $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
759        }
760        while ($comment =~ s/$lt(.*?)$gt//s) {
761        my $tag = $1;
762        if ($tag eq "Section") {
763            $found_something = 1;
764            $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
765            $top = 0;
766        } elsif ($tag eq "/Section") {
767            $found_something = 1;
768            $cursection = $doc_obj->get_parent_section ($cursection);
769        } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
770            my $metaname = $1;
771            my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
772            $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
773            my $metavalue = $1;
774            $metavalue =~ s/^\s+//;
775            $metavalue =~ s/\s+$//;
776                    # assume that no metadata value intentionally includes
777                    # carriage returns or HTML tags (if they're there they
778                    # were probably introduced when converting to HTML from
779                    # some other format).
780            # actually some people want to have html tags in their
781            # metadata.
782            $metavalue =~ s/[\cJ\cM]/ /sg;
783            $metavalue =~ s/<[^>]+>//sg
784            unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
785            $metavalue =~ s/\s+/ /sg;
786            if ($accumulate) {
787            $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
788            } else {
789            $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);   
790            }
791        } elsif ($tag eq "Description" || $tag eq "/Description") {
792            # do nothing with containing Description tags
793        } else {
794            # simple HTML tag (probably created by the conversion
795            # to HTML from some other format) - we'll ignore it and
796            # hope for the best ;-)
797        }
798        }
799    }
800    if ($cursection ne "") {
801        print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
802    }
803
804    $$textref =~ s/^.*?<body[^>]*>//is;
805    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
806    if ($$textref =~ m/\S/) {
807        if (!$found_something) {
808        if ($self->{'verbosity'} > 2) {
809            print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
810            print $outhandle "          will be processed as a single section document\n";
811        }
812
813        # go ahead and process single-section document
814        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
815
816        # if document contains no Section tags we'll go ahead
817        # and extract metadata (this won't have been done
818        # above as the -description_tags option prevents it)
819        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
820        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
821            unless $self->{'no_metadata'};
822
823        } else {
824        print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
825        print $outhandle "          of the final closing </Section> tag. This text will\n";
826        print $outhandle "          be ignored.";
827
828        my ($text);
829        if (length($$textref) > 30) {
830            $text = substr($$textref, 0, 30) . "...";
831        } else {
832            $text = $$textref;
833        }
834        $text =~ s/\n/ /isg;
835        print $outhandle " ($text)\n";
836        }
837    } elsif (!$found_something) {
838
839        if ($self->{'verbosity'} > 2) {
840        # may get to here if document contained no valid Section
841        # tags but did contain some comments. The text will have
842        # been processed already but we should print the warning
843        # as above and extract metadata
844        print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
845        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
846        }
847
848        my $complete_text = $head_keep.$doc_obj->get_text($cursection);
849        $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
850        unless $self->{'no_metadata'};
851    }
852   
853    } else {
854
855    # remove header and footer
856    if (!$self->{'keep_head'} || $self->{'description_tags'}) {
857        $$textref =~ s/^.*?<body[^>]*>//is;
858        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
859    }
860
861    # single section document
862    $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
863    }
864    return 1;
865}
866
867
868sub process_heading
869{
870    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
871    $strHeadingText = '' if (!defined($strHeadingText));
872
873    my $strMetadata = $self->update_section_data($arrSections, int($nHeadNo));
874
875    my $strSecMetadata = '';
876    while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is)
877    {
878    $strSecMetadata .= $1;
879    }
880
881    $strHeadingText =~ s/^\s+//g;
882    $strHeadingText =~ s/\s+$//g;
883    $strSecMetadata =~ s/^\s+//g;
884    $strSecMetadata =~ s/\s+$//g;
885
886    $strMetadata .= "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">" . $strHeadingText . "</Metadata>\n";
887
888    if (length($strSecMetadata)) {
889    $strMetadata .= "\t\t" . $strSecMetadata . "\n";
890    }
891
892    $strMetadata .= "\t</Description>\n";
893
894    return "<!--" . $strMetadata . "-->";
895}
896
897
898sub update_section_data
899{
900    my ($self, $arrSections, $nCurTocNo) = @_;
901    my ($strBuffer, $nLast, $nSections) = ('', 0, scalar(@$arrSections));
902
903    if ($nSections == 0) {
904    push @$arrSections, $nCurTocNo;
905    return $strBuffer;
906    }
907    $nLast = $arrSections->[$nSections - 1];
908    if ($nCurTocNo > $nLast) {
909    push @$arrSections, $nCurTocNo;
910    return $strBuffer;
911    }
912    for(my $i = $nSections - 1; $i >= 0; $i--) {
913    if ($nCurTocNo <= $arrSections->[$i]) {
914        $strBuffer .= "\n</Section>";
915        pop @$arrSections;
916    }
917    }
918    push @$arrSections, $nCurTocNo;
919    return $strBuffer;
920}
921
922
923# note that process_section may be called multiple times for a single
924# section (relying on the fact that add_utf8_text appends the text to any
925# that may exist already).
926sub process_section {
927    my $self = shift (@_);
928    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
929    # trap links
930    if (!$self->{'nolinks'}) {
931    # usemap="./#index" not handled correctly => change to "#index"
932##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
933
934    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
935        $self->replace_usemap_links($1, $2, $3)/isge;
936
937##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
938
939    $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
940        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
941    }
942
943    # trap images
944
945    # Previously, by default, HTMLPlugin would embed <img> tags inside anchor tags
946    # i.e. <a href="image><img src="image"></a> in order to overcome a problem that
947    # turned regular text succeeding images into links. That is, by embedding <imgs>
948    # inside <a href=""></a>, the text following images were no longer misbehaving.
949    # However, there would be many occasions whereby images were not meant to link
950    # to their source images but where the images would link to another web page.
951    # To allow this, the no_image_links option was introduced: it would prevent
952    # the behaviour of embedding images into links that referenced the source images.
953
954    # Somewhere along the line, the problem of normal text turning into links when
955    # such text followed images which were not embedded in <a href=""></a> ceased
956    # to occur. This is why the following lines have been commented out (as well as
957    # two lines in replace_images). They appear to no longer apply.
958
959    # If at any time, there is a need for having images embedded in <a> anchor tags,
960    # then it might be better to turn that into an HTMLPlugin option rather than make
961    # it the default behaviour. Also, eventually, no_image_links needs to become
962    # a deprecated option for HTMLPlugin as it has now become the default behaviour.
963
964    #if(!$self->{'no_image_links'}){
965    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
966    $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
967    #}
968
969    # add text to document object
970    # turn \ into \\ so that the rest of greenstone doesn't think there
971    # is an escape code following. (Macro parsing loses them...)
972    $$textref =~ s/\\/\\\\/go;
973   
974    $doc_obj->add_utf8_text($cursection, $$textref);
975}
976
977sub replace_images {
978    my $self = shift (@_);
979    my ($front, $link, $back, $base_dir,
980    $file, $doc_obj, $section) = @_;
981
982    # remove quotes from link at start and end if necessary
983    if ($link=~/^[\"\']/) {
984    $link=~s/^[\"\']//;
985    $link=~s/[\"\']$//;
986    $front.='"';
987    $back="\"$back";
988    }
989
990    $link =~ s/\n/ /g;
991
992    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
993    # If the Word file path has spaces in it, wv messes up and you end up with
994    #   absolute paths for the images, and without the "file://" prefix
995    # So check for this special case and massage the data to be correct
996    if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
997    $link =~ s/^.*\\([^\\]+)$/$1/;
998    }
999   
1000    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
1001
1002    my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
1003
1004#    print STDERR "**** link = $link\n";
1005#    print STDERR "**** href = $href\n";
1006#    print STDERR "**** img_file = $img_file\n";
1007
1008    my $anchor_name = $img_file;
1009    #$anchor_name =~ s/^.*\///;
1010    #$anchor_name = "<a name=\"$anchor_name\" ></a>";
1011
1012    my $image_link = $front . $img_file .$back;
1013    return $image_link;
1014
1015    # The reasons for why the following two lines are no longer necessary can be
1016    # found in subroutine process_section
1017    #my $anchor_link = "<a href=\"$img_file\" >".$image_link."</a>"; 
1018    #return $anchor_link;   
1019   
1020    #return $front . $img_file . $back . $anchor_name;
1021}
1022
1023sub replace_href_links {
1024    my $self = shift (@_);
1025    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
1026
1027    # remove quotes from link at start and end if necessary
1028    if ($link=~/^[\"\']/) {
1029    $link=~s/^[\"\']//;
1030    $link=~s/[\"\']$//;
1031    $front.='"';
1032    $back="\"$back";
1033    }
1034
1035    # attempt to sort out targets - frames are not handled
1036    # well in this plugin and some cases will screw things
1037    # up - e.g. the _parent target (so we'll just remove
1038    # them all ;-)
1039    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
1040    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
1041    $front =~ s/target=\"?_parent\"?//is;
1042    $back =~ s/target=\"?_parent\"?//is;
1043
1044    return $front . $link . $back if $link =~ m/^\#/s;
1045    $link =~ s/\n/ /g;
1046
1047    # Find file referred to by $link on file system
1048    # This is more complicated than it sounds when char encodings
1049    # is taken in to account
1050##    &unicode::ensure_utf8(\$link);
1051##    $link = &unicode::url_encode($link);
1052#    print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n";
1053#    print STDERR "#### link: $link\n";
1054
1055    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
1056#    print STDERR "#### href: $href\n";
1057 
1058    # href may use '\'s where '/'s should be on Windows
1059    $href =~ s/\\/\//g;
1060
1061##    $href = &unicode::url_decode($href);
1062#    print STDERR "#### href again: $href\n";
1063    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
1064   
1065
1066    ##### leave all these links alone (they won't be picked up by intermediate
1067    ##### pages). I think that's safest when dealing with frames, targets etc.
1068    ##### (at least until I think of a better way to do it). Problems occur with
1069    ##### mailto links from within small frames, the intermediate page is displayed
1070    ##### within that frame and can't be seen. There is still potential for this to
1071    ##### happen even with html pages - the solution seems to be to somehow tell
1072    ##### the browser from the server side to display the page being sent (i.e.
1073    ##### the intermediate page) in the top level window - I'm not sure if that's
1074    ##### possible - the following line should probably be deleted if that can be done
1075    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
1076
1077
1078    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
1079    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
1080    &ghtml::urlsafe ($href);
1081    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
1082    } else {
1083    # link is to some other type of file (eg image) so we'll
1084    # need to associate that file
1085    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
1086    }
1087}
1088
1089sub add_file {
1090    my $self = shift (@_);
1091    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
1092    my ($newname);
1093
1094    my $filename = $href;
1095    if ($base_dir eq "") {
1096    # remove http:/ thereby leaving one slash at the start
1097    $filename =~ s/^[^:]*:\///;
1098    }
1099    else {
1100    # remove http://
1101    $filename =~ s/^[^:]*:\/\///;
1102    }
1103
1104    $filename = &util::filename_cat($base_dir, $filename);
1105#    print STDERR "**** filename: $filename\n";
1106    # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
1107    # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
1108    my $utf8_filename = $filename;
1109   
1110#    print STDERR "*** filename before URL decoding: $filename\n";
1111    $filename = $self->opt_url_decode($utf8_filename);
1112#    print STDERR "*** filename after URL decoding:  $filename\n\n";
1113
1114    # some special processing if the intended filename was converted to utf8, but
1115    # the actual file still needs to be renamed
1116    if (!-e $filename) {
1117    # try the original filename stored in map
1118    my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
1119    if (-e $original_filename) {
1120        $filename = $original_filename;
1121    }
1122    }
1123   
1124    my ($ext) = $filename =~ m/(\.[^\.]*)$/;
1125
1126    if ($rl == 0) {
1127    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1128        return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
1129    }
1130    else {
1131        return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
1132    }
1133    }
1134
1135    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
1136    return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
1137    }
1138    if ($self->{'rename_assoc_files'}) {
1139    if (defined $self->{'aux_files'}->{$href}) {
1140        $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
1141        $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
1142    } else {
1143        $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
1144        $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
1145        $self->inc_filecount ();
1146    }
1147    $doc_obj->associate_file($filename, $newname, undef, $section);
1148    return "_httpdocimg_/$newname";
1149    } else {
1150    ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
1151   
1152#   print STDERR "Before url encoding newname: $newname\n";
1153    # Make sure this name uses only ASCII characters
1154    # => use URL encoding, as it preserves original encoding
1155    $newname = &unicode::url_encode($newname);
1156#   print STDERR "After url encoding newname: $newname\n";
1157#   print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n";
1158
1159    $doc_obj->associate_file($filename, $newname, undef, $section);
1160
1161    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
1162    # of filenames, URL-encode the additional percent signs of the URL-encoded filename
1163    my $newname_url = $newname;
1164    $newname_url =~ s/%/%25/g;
1165    return "_httpdocimg_/$newname_url";
1166    }
1167}
1168
1169
1170sub format_link {
1171    my $self = shift (@_);
1172    my ($link, $base_dir, $file) = @_;
1173
1174    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
1175   
1176    $hash_part = "" if !defined $hash_part;
1177    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
1178    my $outhandle = $self->{'outhandle'};
1179    print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
1180        if $self->{'verbosity'};
1181    return ($link, "", 0);
1182    }
1183
1184    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
1185    my $type = $1;
1186
1187    if ($link =~ m/^(http|ftp):/i) {
1188        # Turn url (using /) into file name (possibly using \ on windows)
1189        my @http_dir_split = split('/', $before_hash);
1190        $before_hash = &util::filename_cat(@http_dir_split);
1191    }
1192
1193    $before_hash = $self->eval_dir_dots($before_hash);
1194   
1195    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1196   
1197    my $rl = 0;
1198    $rl = 1 if (-e $linkfilename);
1199
1200    # make sure there's a slash on the end if it's a directory
1201    if ($before_hash !~ m/\/$/) {
1202        $before_hash .= "/" if (-d $linkfilename);
1203    }
1204    return ($type . $before_hash, $hash_part, $rl);
1205   
1206    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
1207
1208    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
1209
1210        # the first directory will be the domain name if file_is_url
1211        # to generate archives, otherwise we'll assume all files are
1212        # from the same site and base_dir is the root
1213
1214        if ($self->{'file_is_url'}) {
1215        my @dirs = split /[\/\\]/, $file;
1216        my $domname = shift (@dirs);
1217        $before_hash = &util::filename_cat($domname, $before_hash);
1218        $before_hash =~ s@\\@/@g; # for windows
1219        }
1220        else
1221        {
1222        # see if link shares directory with source document
1223        # => turn into relative link if this is so!
1224       
1225        if ($ENV{'GSDLOS'} =~ m/^windows/i) {
1226            # too difficult doing a pattern match with embedded '\'s...
1227            my $win_before_hash=$before_hash;
1228            $win_before_hash =~ s@(\\)+@/@g;
1229            # $base_dir is already similarly "converted" on windows.
1230            if ($win_before_hash =~ s@^$base_dir/@@o) {
1231            # if this is true, we removed a prefix
1232            $before_hash=$win_before_hash;
1233            }
1234        }
1235        else {
1236            # before_hash has lost leading slash by this point,
1237            # -> add back in prior to substitution with $base_dir
1238            $before_hash = "/$before_hash";
1239
1240            $before_hash = &util::filename_cat("",$before_hash);
1241            $before_hash =~ s@^$base_dir/@@;
1242        }
1243        }
1244    } else {
1245        # Turn relative file path into full path
1246        my $dirname = &File::Basename::dirname($file);
1247        $before_hash = &util::filename_cat($dirname, $before_hash);
1248        $before_hash = $self->eval_dir_dots($before_hash);             
1249       
1250#       print STDERR "#### before_hash: $before_hash\n";
1251    }
1252
1253    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
1254    # make sure there's a slash on the end if it's a directory
1255    if ($before_hash !~ m/\/$/) {
1256        $before_hash .= "/" if (-d $linkfilename);
1257    }
1258    return ("http://" . $before_hash, $hash_part, 1);
1259    } else {
1260    # mailto, news, nntp, telnet, javascript or gopher link
1261    return ($before_hash, "", 0);
1262    }
1263}
1264
1265sub extract_first_NNNN_characters {
1266    my $self = shift (@_);
1267    my ($textref, $doc_obj, $thissection) = @_;
1268   
1269    foreach my $size (split /,/, $self->{'first'}) {
1270    my $tmptext =  $$textref;
1271    # skip to the body
1272    $tmptext =~ s/.*<body[^>]*>//i;
1273    # remove javascript
1274    $tmptext =~ s@<script.*?</script>@ @sig;
1275    $tmptext =~ s/<[^>]*>/ /g;
1276    $tmptext =~ s/&nbsp;/ /g;
1277    $tmptext =~ s/^\s+//;
1278    $tmptext =~ s/\s+$//;
1279    $tmptext =~ s/\s+/ /gs;
1280    $tmptext = &unicode::substr ($tmptext, 0, $size);
1281    $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
1282    $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1283    }
1284}
1285
1286
1287sub extract_metadata {
1288    my $self = shift (@_);
1289    my ($textref, $metadata, $doc_obj, $section) = @_;
1290    my $outhandle = $self->{'outhandle'};
1291    # if we don't want metadata, we may as well not be here ...
1292    return if (!defined $self->{'metadata_fields'});
1293
1294    # metadata fields to extract/save. 'key' is the (lowercase) name of the
1295    # html meta, 'value' is the metadata name for greenstone to use
1296    my %find_fields = ();
1297
1298    my %creator_fields = (); # short-cut for lookups
1299
1300
1301    foreach my $field (split /,/, $self->{'metadata_fields'}) {
1302        $field =~ s/^\s+//; # remove leading whitespace
1303        $field =~ s/\s+$//; # remove trailing whitespace
1304
1305    # support tag<tagname>
1306    if ($field =~ m/^(.*?)<(.*?)>$/) {
1307        # "$2" is the user's preferred gs metadata name
1308        $find_fields{lc($1)}=$2; # lc = lowercase
1309    } else { # no <tagname> for mapping
1310        # "$field" is the user's preferred gs metadata name
1311        $find_fields{lc($field)}=$field; # lc = lowercase
1312    }
1313    }
1314
1315    if (defined $self->{'hunt_creator_metadata'} &&
1316    $self->{'hunt_creator_metadata'} == 1 ) {
1317    my @extra_fields =
1318        (
1319         'author',
1320         'author.email',
1321         'creator',
1322         'dc.creator',
1323         'dc.creator.corporatename',
1324         );
1325
1326    # add the creator_metadata fields to search for
1327    foreach my $field (@extra_fields) {
1328        $creator_fields{$field}=0; # add to lookup hash
1329    }
1330    }
1331
1332
1333    # find the header in the html file, which has the meta tags
1334    $$textref =~ m@<head>(.*?)</head>@si;
1335
1336    my $html_header=$1;
1337
1338    # go through every <meta... tag defined in the html and see if it is
1339    # one of the tags we want to match.
1340   
1341    # special case for title - we want to remember if its been found
1342    my $found_title = 0;
1343    # this assumes that ">" won't appear. (I don't think it's allowed to...)
1344    $html_header =~ m/^/; # match the start of the string, for \G assertion
1345   
1346    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
1347    my $metatag=$1;
1348    my ($tag, $value);
1349
1350    # find the tag name
1351    $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
1352    $tag=$2;
1353    # in case they're not using " or ', but they should...
1354    if (! $tag) {
1355        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1356        $tag=$1;
1357    }
1358
1359    if (!defined $tag) {
1360        print $outhandle "HTMLPlugin: can't find NAME in \"$metatag\"\n";
1361        next;
1362    }
1363   
1364    # don't need to assign this field if it was passed in from a previous
1365    # (recursive) plugin
1366    if (defined $metadata->{$tag}) {next}
1367
1368    # find the tag content
1369    $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
1370    $value=$2;
1371
1372    if (! $value) {
1373        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
1374        $value=$1;
1375    }
1376    if (!defined $value) {
1377        print $outhandle "HTMLPlugin: can't find VALUE in \"$metatag\"\n";
1378        next;
1379    }
1380
1381    # clean up and add
1382    $value =~ s/\s+/ /gs;
1383    chomp($value); # remove trailing \n, if any
1384    if (exists $creator_fields{lc($tag)}) {
1385        # map this value onto greenstone's "Creator" metadata
1386        $tag='Creator';
1387    } elsif (!exists $find_fields{lc($tag)}) {
1388        next; # don't want this tag
1389    } else {
1390        # get the user's preferred capitalisation
1391        $tag = $find_fields{lc($tag)};
1392    }
1393    if (lc($tag) eq "title") {
1394        $found_title = 1;
1395    }
1396    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
1397        if ($self->{'verbosity'} > 2);
1398    if ($tag =~ m/date.*/i){
1399        $tag = lc($tag);
1400    }
1401    $doc_obj->add_utf8_metadata($section, $tag, $value);
1402
1403    }
1404   
1405    # TITLE: extract the document title
1406    if (exists $find_fields{'title'} && !$found_title) {
1407    # we want a title, and didn't find one in the meta tags
1408    # see if there's a <title> tag
1409    my $title;
1410    my $from = ""; # for debugging output only
1411    if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
1412        $title = $1;
1413        $from = "<title> tags";
1414    }
1415
1416    if (!defined $title) {
1417        $from = "first 100 chars";
1418        # if no title use first 100 or so characters
1419        $title = $$textref;
1420        $title =~ s/^\xFE\xFF//; # Remove unicode byte order mark       
1421        $title =~ s/^.*?<body>//si;
1422        # ignore javascript!
1423        $title =~ s@<script.*?</script>@ @sig;
1424        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
1425        $title =~ s/<[^>]*>/ /g; # remove all HTML tags
1426        $title = substr ($title, 0, 100);
1427        $title =~ s/\s\S*$/.../;
1428    }
1429    $title =~ s/<[^>]*>/ /g; # remove html tags
1430    $title =~ s/&nbsp;/ /g;
1431    $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
1432    $title =~ s/\s+/ /gs; # collapse multiple spaces
1433    $title =~ s/^\s*//;   # remove leading spaces
1434    $title =~ s/\s*$//;   # remove trailing spaces
1435
1436    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
1437    $title =~ s/^\s+//s; # in case title_sub introduced any...
1438    $doc_obj->add_utf8_metadata ($section, 'Title', $title);
1439    print $outhandle " extracted Title metadata \"$title\" from $from\n"
1440        if ($self->{'verbosity'} > 2);
1441    }
1442   
1443    # add FileFormat metadata
1444    $doc_obj->add_metadata($section,"FileFormat", "HTML");
1445
1446    # Special, for metadata names such as tagH1 - extracts
1447    # the text between the first <H1> and </H1> tags into "H1" metadata.
1448
1449    foreach my $field (keys %find_fields) {
1450    if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
1451    my $tag = $1;
1452    if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
1453        my $content = $1;
1454        $content =~ s/&nbsp;/ /g;
1455        $content =~ s/<[^>]*>/ /g;
1456        $content =~ s/^\s+//;
1457        $content =~ s/\s+$//;
1458        $content =~ s/\s+/ /gs;
1459        if ($content) {
1460        $tag=$find_fields{"tag$tag"}; # get the user's capitalisation
1461        $tag =~ s/^tag//i;
1462        $doc_obj->add_utf8_metadata ($section, $tag, $content);
1463        print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
1464            if ($self->{'verbosity'} > 2);
1465        }
1466    }
1467    }   
1468}
1469
1470
1471# evaluate any "../" to next directory up
1472# evaluate any "./" as here
1473sub eval_dir_dots {
1474    my $self = shift (@_);
1475    my ($filename) = @_;
1476    my $dirsep_os = &util::get_os_dirsep();
1477    my @dirsep = split(/$dirsep_os/,$filename);
1478
1479    my @eval_dirs = ();
1480    foreach my $d (@dirsep) {
1481    if ($d eq "..") {
1482        pop(@eval_dirs);
1483       
1484    } elsif ($d eq ".") {
1485        # do nothing!
1486
1487    } else {
1488        push(@eval_dirs,$d);
1489    }
1490    }
1491
1492    # Need to fiddle with number of elements in @eval_dirs if the
1493    # first one is the empty string.  This is because of a
1494    # modification to util::filename_cat that supresses the addition
1495    # of a leading '/' character (or \ if windows) (intended to help
1496    # filename cat with relative paths) if the first entry in the
1497    # array is the empty string.  Making the array start with *two*
1498    # empty strings is a way to defeat this "smart" option.
1499    #
1500    if (scalar(@eval_dirs) > 0) {
1501    if ($eval_dirs[0] eq ""){
1502        unshift(@eval_dirs,"");
1503    }
1504    }
1505    return &util::filename_cat(@eval_dirs);
1506}
1507
1508sub replace_usemap_links {
1509    my $self = shift (@_);
1510    my ($front, $link, $back) = @_;
1511
1512    # remove quotes from link at start and end if necessary
1513    if ($link=~/^[\"\']/) {
1514    $link=~s/^[\"\']//;
1515    $link=~s/[\"\']$//;
1516    $front.='"';
1517    $back="\"$back";
1518    }
1519
1520    $link =~ s/^\.\///;
1521    return $front . $link . $back;
1522}
1523
1524sub inc_filecount {
1525    my $self = shift (@_);
1526
1527    if ($self->{'file_num'} == 1000) {
1528    $self->{'dir_num'} ++;
1529    $self->{'file_num'} = 0;
1530    } else {
1531    $self->{'file_num'} ++;
1532    }
1533}
1534
1535
1536# Extend read_file so that strings like &eacute; are
1537# converted to UTF8 internally. 
1538#
1539# We don't convert &lt; or &gt; or &amp; or &quot; in case
1540# they interfere with the GML files
1541
1542sub read_file {
1543    my $self = shift(@_);
1544    my ($filename, $encoding, $language, $textref) = @_;
1545
1546    $self->SUPER::read_file($filename, $encoding, $language, $textref);
1547
1548    # Convert entities to their UTF8 equivalents
1549    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
1550    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
1551    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
1552}
1553
15541;
Note: See TracBrowser for help on using the browser.