Context Navigation

← Previous Changeset
Next Changeset →

Changeset 20774

Timestamp:

2009-10-05T15:43:00+13:00 (15 years ago)

Author:

kjdon

Message:

moved some of the horrible old methods to the end of the file so that the important ones come first

File:

1 edited

gsdl/trunk/perllib/plugins/HTMLPlugin.pm (modified) (3 diffs)

Legend:

Unmodified
Added
Removed

gsdl/trunk/perllib/plugins/HTMLPlugin.pm

-              r20689
+              r20774
+# Reads the HTML file $htmlfile and appends its text to the scalar referenced
+# by $text, using HB_gettext to do the actual reading.
+#
+# The file is first read with the "body found" flag cleared, so HB_gettext
+# skips everything up to and including the <body> tag. If no <body> tag was
+# seen, the file is read a second time with the flag pre-set so that the
+# whole file is kept.
+#
+# Parameters:
+#   $htmlfile - path of the HTML file to read
+#   $text     - reference to the scalar that receives the text
+# The return value is not meaningful. If the file cannot be opened, an error
+# is printed to STDERR and $$text is left untouched.
+sub HB_read_html_file {
+    my $self = shift (@_);
+    my ($htmlfile, $text) = @_;
+    # load in the file
+    # A lexical handle with three-argument open is used so that characters in
+    # the file name (leading '>' or '|', surrounding spaces) can never be
+    # interpreted as an open mode, and so that no symbolic handle name has to
+    # be passed around.
+    my $fh;
+    if (!open ($fh, '<', $htmlfile)) {
+        print STDERR "ERROR - could not open $htmlfile\n";
+        return;
+    }
+    my $foundbody = 0;
+    $self->HB_gettext (\$foundbody, $text, $fh);
+    close $fh;
+    # just in case there was no <body> tag
+    if (!$foundbody) {
+        $foundbody = 1;
+        open ($fh, '<', $htmlfile) || return;
+        $self->HB_gettext (\$foundbody, $text, $fh);
+        close $fh;
+    }
+    # HB_gettext has converted iso_8859_1 input to utf8 at this point
+}
+# Reads all lines from $handle and appends them to the scalar referenced by
+# $text.
+#
+# While the flag referenced by $foundbody is false, lines are discarded until
+# one contains a <body ...> tag; that line is kept from just after the tag and
+# the flag is set. A warning is printed to STDERR for the first <font face=...>
+# on a line whose face is not "arial" (the tag itself is left in the text).
+#
+# After reading, the accumulated text is converted to utf8 when the plugin's
+# 'input_encoding' is "iso_8859_1" (ghtml does the same for &eacute; etc.),
+# and every run of whitespace is collapsed to a single space.
+sub HB_gettext {
+    my ($self, $foundbody, $text, $handle) = @_;
+    LINE: while (defined (my $line = <$handle>)) {
+        # still looking for the body tag: drop everything up to and including it
+        unless ($$foundbody) {
+            next LINE unless ($line =~ s/^.*<body[^>]*>//i);
+            $$foundbody = 1;
+        }
+        # check for symbol fonts
+        if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
+            my $font = $1;
+            unless ($font =~ m/^arial$/i) {
+                print STDERR "HBPlug::HB_gettext - warning removed font $font\n";
+            }
+        }
+        $$text .= $line;
+    }
+    # convert to utf-8
+    if ($self->{'input_encoding'} eq "iso_8859_1") {
+        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
+    }
+    # character entities are deliberately left alone here:
+    #&ghtml::convertcharentities ($$text);
+    # remove \n's (and any other runs of whitespace)
+    $$text =~ s/\s+/ /g;
+}
+# Tidies the HTML of a single section and returns the cleaned copy.
+#
+# Steps, in order: strip closing tags that have no earlier opening tag,
+# collapse repeated <p> tags, strip trailing <u>/<i>/<b>/<p>/&nbsp;/whitespace,
+# put a blank line before each paragraph, wrap lines at about 80 characters,
+# and rewrite <img> tags and "&lt;&lt;I&gt;&gt; file.png|jpg|gif" markers as
+# centred images.
+#
+# Parameters:
+#   $section - the section's HTML text
+# Returns the cleaned text; the argument itself is not modified.
+sub HB_clean_section {
+    my $self = shift (@_);
+    my ($section) = @_;
+    # remove tags without a starting tag from the section
+    my ($tag, $tagstart);
+    while ($section =~ m/<\/([^>]{1,10})>/) {
+        $tag = $1;
+        $tagstart = index($section, "<$tag");
+        # stop at the first closing tag that has an opening tag before it
+        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
+        # \Q...\E makes the tag name match literally. Without it a name
+        # containing regex metacharacters (e.g. "</a+>") would not be removed
+        # and this loop would never terminate.
+        $section =~ s/<\/\Q$tag\E>//;
+    }
+    # remove extra paragraph tags
+    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
+    # remove extra stuff at the end of the section
+    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
+    # add a newline at the beginning of each paragraph
+    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
+    # add a newline every 80 characters at a word boundary
+    # Note: this regular expression puts a line feed before
+    # the last word in each section, even when it is not
+    # needed.
+    $section =~ s/(.{1,80})\s/$1\n/g;
+    # fix up the image links
+    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
+    <center><img src=\"$1\" \/><\/center><br\/>/ig;
+    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
+    <center><img src=\"$1\" \/><\/center><br\/>/ig;
+    return $section;
+}
+# Will convert the oldHDL format to the new HDL format (using the Section tag)
+sub convert_to_newHDLformat
+# Constructor.
+#
+# Registers this class in $pluginlist, adds this plugin's $arguments and
+# $options to the "ArgList"/"OptList" entries of $hashArgOptLists, builds the
+# object through the ReadTextFile constructor and initialises the per-plugin
+# state ('aux_files', 'dir_num', 'file_num'). When the 'w3mir' option is set,
+# 'file_is_url' is switched on as well.
+#
+# Returns the object blessed into $class.
+sub new {
+    my ($class) = shift (@_);
+    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
+    push(@$pluginlist, $class);
+    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+    push(@{$hashArgOptLists->{"OptList"}},$options);
+    # Explicit class-method call: the indirect-object form "new ReadTextFile(...)"
+    # is parsed heuristically and is ambiguous in a package that defines its
+    # own new().
+    my $self = ReadTextFile->new($pluginlist,$inputargs,$hashArgOptLists);
+    if ($self->{'w3mir'}) {
+        $self->{'file_is_url'} = 1;
+    }
+    $self->{'aux_files'} = {};
+    $self->{'dir_num'} = 0;
+    $self->{'file_num'} = 0;
+    return bless $self, $class;
+}
+# Returns the default block expression for this plugin: the empty string.
+# A pattern that may be wanted instead is
+#   (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
+# e.g. to cope with <script language="javascript" src="img/lib.js@123">
+sub get_default_block_exp {
+    my ($self) = @_;
+    # formerly: q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^
+    my $block_exp = "";
+    return $block_exp;
+}
+# Returns the default process expression: a case-insensitive pattern matching
+# names that end in .htm/.html, .shtml, .shm, .asp, .php (optionally followed
+# by one digit) or .cgi. The final alternative is an attempt to encode the
+# concept of an html query (something?key=value).
+sub get_default_process_exp {
+    my ($self) = @_;
+    my $process_exp = q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
+    return $process_exp;
+}
+sub store_block_files
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+    # write HTML tmp file with new HDL format
+    open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!");
+    # read in the file and do basic html cleaning (removing header etc)
+    my $html = "";
+    $self->HB_read_html_file ($input_filename, \$html);
+    # process the file one section at a time
+    my $curtoclevel = 1;
+    my $firstsection = 1;
+    my $toclevel = 0;
+    while (length ($html) > 0) {
+    if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
+        $toclevel = $3;
+        my $title = $4;
+        my $sectiontext = "";
+        if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
+        $sectiontext = $1;
+        } else {
+        $sectiontext = $html;
+        $html = "";
+        }
+        # remove tags and extra spaces from the title
+        $title =~ s/<\/?[^>]+>//g;
+        $title =~ s/^\s+|\s+$//g;
+        # close any sections below the current level and
+        # create a new section (special case for the firstsection)
+        print PROD "<!--\n";
+        while (($curtoclevel > $toclevel) ||
+           (!$firstsection && $curtoclevel == $toclevel)) {
+        $curtoclevel--;
+        print PROD "</Section>\n";
+        }
+        if ($curtoclevel+1 < $toclevel) {
+        print STDERR "WARNING - jump in toc levels in $input_filename " .
+            "from $curtoclevel to $toclevel\n";
+        }
+        while ($curtoclevel < $toclevel) {
+        $curtoclevel++;
+        }
+        if ($curtoclevel == 1) {
+            # add the header tag
+        print PROD "-->\n";
+            print PROD "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
+        print PROD "<!--\n";
+        }
+        print PROD "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
+        print PROD "-->\n";
+        # clean up the section html
+        $sectiontext = $self->HB_clean_section($sectiontext);
+        print PROD "$sectiontext\n";
+    } else {
+        print STDERR "WARNING - leftover text\n" , $self->shorten($html),
+        "\nin $input_filename\n";
+        last;
+    }
+    $firstsection = 0;
+    }
+    print PROD "<!--\n";
+    while ($curtoclevel > 0) {
+    $curtoclevel--;
+    print PROD "</Section>\n";
+    }
+    print PROD "-->\n";
+    close (PROD) || die("Error Closing File: $tmp_filename $!");
+    return $tmp_filename;
+}
+# Returns $text wrapped in double quotes for use in messages. Text of 100
+# characters or more is abbreviated to its first 50 and last 50 characters,
+# each part quoted and joined by " ... ".
+sub shorten {
+    my ($self, $text) = @_;
+    my $len = length($text);
+    if ($len >= 100) {
+        my $head = substr ($text, 0, 50);
+        my $tail = substr ($text, $len - 50);
+        return "\"" . $head . "\" ... \"" . $tail . "\"";
+    }
+    return "\"$text\"";
+}
+# Works out which file should actually be processed for $file, doing any
+# conversion in the collection's "tidytmp" directory (below
+# $ENV{'GSDLCOLLECTDIR'}) rather than on the input itself.
+#
+# - A directory is returned unchanged.
+# - For .htm/.html/.shtml files: when 'old_style_HDL' is set the file is first
+#   converted with convert_to_newHDLformat; the plain files of the source
+#   directory are copied into the tmp directory (existing copies are kept);
+#   when 'use_realistic_book' is set the result is passed through
+#   tmp_tidy_file. With neither option set the original file name is returned.
+# - Any other file is copied into the tmp directory (unless a copy is already
+#   there) and the name of the copy is returned.
+#
+# Dies if the source directory cannot be read or a copy fails.
+sub convert_tidy_or_oldHDL_file
+{
+    my $self = shift (@_);
+    my ($file) = @_;
+    my $input_filename = $file;
+    if (-d $input_filename)
+    {
+        return $input_filename;
+    }
+    # get the input filename
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    my $base_dirname = $dirname;
+    $suffix = lc($suffix);
+    # derive tmp filename from input filename:
+    # remove whitespace, dots and hyphens from the file name (makes later
+    # conversion by utils simpler). Spaces in the path are left alone.
+    $tailname =~ s/\s+//g;
+    $tailname =~ s/\.+//g;
+    $tailname =~ s/\-+//g;
+    # convert to utf-8 otherwise we have problems with the doc.xml file
+    # later on
+    &unicode::ensure_utf8(\$tailname);
+    # working directory inside the collection
+    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
+    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+    # recreate, below the tmp directory, the folders that follow "import"
+    # in the input file's path
+    my $test_dirname = "";
+    my $f_separator = &util::get_os_dirsep();
+    if ($dirname =~ m/import$f_separator/)
+    {
+        $test_dirname = $'; #'
+        #print STDERR "init $'\n";
+        while ($test_dirname =~ m/[$f_separator]/)
+        {
+            my $folderdirname = $`;
+            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
+            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+            $test_dirname = $'; #'
+        }
+    }
+    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+    # tidy or convert the input file if it is a HTML-like file
+    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
+    {
+        # convert the input file to a new style HDL
+        my $hdl_output_filename = $input_filename;
+        if ($self->{'old_style_HDL'})
+        {
+            $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
+        }
+        # copy every other plain file of the source directory into the tmp
+        # directory unless a file of that name already exists there
+        opendir(my $dirhandle, $base_dirname) or die "Can't open base directory : $base_dirname!";
+        my @files = grep {!/^\.+$/} readdir($dirhandle);
+        # a directory handle has to be released with closedir(); close() only
+        # acts on file handles and would leave the directory open
+        closedir($dirhandle);
+        foreach my $file (@files)
+        {
+            my $src_file = &util::filename_cat($base_dirname,$file);
+            my $dest_file = &util::filename_cat($tmp_dirname,$file);
+            if ((!-e $dest_file) && (!-d $src_file))
+            {
+                # just copy the original file to the tmp directory
+                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
+            }
+        }
+        # tidy the input file
+        my $tidy_output_filename = $hdl_output_filename;
+        if ($self->{'use_realistic_book'})
+        {
+            $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
+        }
+        $tmp_filename = $tidy_output_filename;
+    }
+    else
+    {
+        if (!-e $tmp_filename)
+        {
+            # just copy the original file to the tmp directory
+            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
+        }
+    }
+    return $tmp_filename;
+}
+# Turns the HTML input file into a proper XML file with the font tags removed
+# and width/height attributes added to every img tag.
+# The tidying process takes place in a collection specific 'tmp' directory so
+# that we don't accidentally damage the input.
+#
+# Parameters:
+#   $file   - the HTML file to read
+#   $cnfile - the file to write; may be the same path as $file
+# Returns $cnfile. Dies if the input cannot be read or the output cannot be
+# written.
+sub tmp_tidy_file
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+    # get the input filename; its directory is used to locate the images
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    require HTML::TokeParser::Simple;
+    # create HTML parser to decode the input file
+    my $parser = HTML::TokeParser::Simple->new($input_filename);
+    die("Error Reading File: $input_filename $!") if (!defined $parser);
+    # Build the rewritten HTML in memory and only open the output once the
+    # input has been read completely: the caller may pass the same path for
+    # input and output, and opening it with '>' first would truncate the file
+    # while the parser is still reading from it.
+    my $cleaned_html = "";
+    while (my $token = $parser->get_token())
+    {
+        if ($token->is_start_tag('img'))
+        {
+            # get the attributes
+            my $attr = $token->return_attr;
+            # get the full path to the image
+            my $img_file = &util::filename_cat($dirname,$attr->{src});
+            # set the width and height attribute
+            # NOTE(review): presumably imgsize yields undefined values when
+            # the image cannot be read, which would give empty attributes -
+            # confirm against the imgsize implementation in use
+            ($attr->{width}, $attr->{height}) = imgsize($img_file);
+            # recreate the tag
+            $cleaned_html .= "<img";
+            $cleaned_html .= join("", map { qq { $_="$attr->{$_}"} } keys %$attr);
+            $cleaned_html .= ">";
+        }
+        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
+        {
+            # remove font tag: nothing is written for it
+        }
+        else
+        {
+            # keep everything else without changes
+            $cleaned_html .= $token->as_is;
+        }
+    }
+    # write HTML tmp file (three-argument open: the file name is never
+    # interpreted as part of the open mode)
+    open (my $prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+    print $prod $cleaned_html;
+    close ($prod) or die("Error Closing File: $tmp_filename $!");
+    # run html-tidy on the tmp file to make it a proper XML file
+    # NOTE(review): the file name is interpolated into a shell command here;
+    # names containing quotes or shell metacharacters are not safe.
+    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
+    # write result back to the tmp file
+    open ($prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+    print $prod $tidyfile;
+    close ($prod) or die("Error Closing File: $tmp_filename $!");
+    # return the output filename
+    return $tmp_filename;
+}
+    my $self =shift (@_);
+    my ($filename_full_path, $block_hash) = @_;
+    my $html_fname = $filename_full_path;
+    my @file_blocks;
+    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
+    # read in file ($text will be in utf8)
+    my $raw_text = "";
+    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
+    my $textref = \$raw_text;
+    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
+    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
+    $$textref =~ s/$opencom(.*?)$closecom//gs;
+    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
+    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
+    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
+    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
+    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    if(!defined $self->{'utf8_to_original_filename'}) {
+    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
+    $self->{'utf8_to_original_filename'} = {};
+    }
+    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
+    # remove quotes from link at start and end if necessary
+    if ($link=~/^\"/) {
+        $link=~s/^\"//;
+        $link=~s/\"$//;
+    }
+    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
+    # some links may just be anchor names
+    next unless ($link =~ /\S+/);
+    if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
+        # Turn relative file path into full path
+        my $dirname = &File::Basename::dirname($filename_full_path);
+        $link = &util::filename_cat($dirname, $link);
+    }
+    $link = $self->eval_dir_dots($link);
+    # this is the actual filename on the filesystem (that the link refers to)
+    my $url_original_filename = $self->opt_url_decode($link);
+    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
+    my $utf8_link = "";
+    $self->decode_text($link,$encoding,$language,\$utf8_link);
+    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
+#   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
+    if ($url_original_filename ne $utf8_link) {
+        my $outhandle = $self->{'outhandle'};
+        print $outhandle "URL Encoding $url_original_filename\n";
+        print $outhandle " ->$utf8_link\n";
+    }
+    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
+    }
+}
+# Given a (possibly URL-encoded) filename in any encoding, works out the
+# original filename it refers to.
+# The name is URL-decoded only when it contains %XX escapes AND no file with
+# the still-encoded name exists: a file whose real name contains "%XX" must be
+# left as it is.
+# Returns the resulting filename (a single value).
+sub opt_url_decode {
+    my ($self, $link) = @_;
+    # Note that the filename may include the %XX in some situations, hence the
+    # existence test before decoding
+    my $has_escapes = ($link =~ m/\%[A-F0-9]{2}/i);
+    if ($has_escapes && !-e $link) {
+        $link = &unicode::url_decode($link);
+    }
+    return $link;
+}
 sub read_into_doc_obj
 …
     return ($process_status,$doc_obj);
+}
-sub new {
-    my ($class) = shift (@_);
-    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
-    push(@$pluginlist, $class);
-    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
-    push(@{$hashArgOptLists->{"OptList"}},$options);
-    my $self = new ReadTextFile($pluginlist,$inputargs,$hashArgOptLists);
-    if ($self->{'w3mir'}) {
-    $self->{'file_is_url'} = 1;
+    }
-    $self->{'aux_files'} = {};
-    $self->{'dir_num'} = 0;
-    $self->{'file_num'} = 0;
-    return bless $self, $class;
+}
-# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
-# if have eg <script language="javascript" src="img/lib.js@123">
-sub get_default_block_exp {
-    my $self = shift (@_);
-    #return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
-    return "";
+}
-sub get_default_process_exp {
-    my $self = shift (@_);
-    # the last option is an attempt to encode the concept of an html query ...
-    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
+}
-sub store_block_files
+{
-    my $self =shift (@_);
-    my ($filename_full_path, $block_hash) = @_;
-    my $html_fname = $filename_full_path;
-    my @file_blocks;
-    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
-    # read in file ($text will be in utf8)
-    my $raw_text = "";
-    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
-    my $textref = \$raw_text;
-    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
-    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
-    $$textref =~ s/$opencom(.*?)$closecom//gs;
-    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
-    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
-    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
-    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
-    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
-    my @tabbg_matches = ($$textref =~ m/<(?:body|table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
-    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
-    if(!defined $self->{'utf8_to_original_filename'}) {
-    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
-    $self->{'utf8_to_original_filename'} = {};
+    }
-    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
-    # remove quotes from link at start and end if necessary
-    if ($link=~/^\"/) {
-        $link=~s/^\"//;
-        $link=~s/\"$//;
+    }
-    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
-    # some links may just be anchor names
-    next unless ($link =~ /\S+/);
-    if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
-        # Turn relative file path into full path
-        my $dirname = &File::Basename::dirname($filename_full_path);
-        $link = &util::filename_cat($dirname, $link);
+    }
-    $link = $self->eval_dir_dots($link);
-    # this is the actual filename on the filesystem (that the link refers to)
-    my $url_original_filename = $self->opt_url_decode($link);
-    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
-    my $utf8_link = "";
-    $self->decode_text($link,$encoding,$language,\$utf8_link);
-    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
-#   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
-    if ($url_original_filename ne $utf8_link) {
-        my $outhandle = $self->{'outhandle'};
-        print $outhandle "URL Encoding $url_original_filename\n";
-        print $outhandle " ->$utf8_link\n";
+    }
-    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
+    }
+}
-# Given a filename in any encoding, will URL decode it to get back the original filename
-# in the original encoding. Because this method is intended to work out the *original*
-# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
-# string already exists in the local folder.
-# Return the original filename corresponding to the parameter URL-encoded filename, and
-# a decoded flag that is set to true iff URL-decoding had to be applied.
-sub opt_url_decode {
-    my $self = shift (@_);
-    my ($link) = @_;
-    # Replace %XX's in URL with decoded value if required.
-    # Note that the filename may include the %XX in some situations
-    if ($link =~ m/\%[A-F0-9]{2}/i) {
-    if (!-e $link) {
-        $link = &unicode::url_decode($link);
+    }
+    }
-    return $link;
+}
 # do plugin specific processing of doc_obj
 …
+}
+# Reads the HTML file $htmlfile and appends its text to the scalar referenced
+# by $text, using HB_gettext to do the actual reading.
+#
+# The file is first read with the "body found" flag cleared, so HB_gettext
+# skips everything up to and including the <body> tag. If no <body> tag was
+# seen, the file is read a second time with the flag pre-set so that the
+# whole file is kept.
+#
+# Parameters:
+#   $htmlfile - path of the HTML file to read
+#   $text     - reference to the scalar that receives the text
+# The return value is not meaningful. If the file cannot be opened, an error
+# is printed to STDERR and $$text is left untouched.
+sub HB_read_html_file {
+    my $self = shift (@_);
+    my ($htmlfile, $text) = @_;
+    # load in the file
+    # A lexical handle with three-argument open is used so that characters in
+    # the file name (leading '>' or '|', surrounding spaces) can never be
+    # interpreted as an open mode, and so that no symbolic handle name has to
+    # be passed around.
+    my $fh;
+    if (!open ($fh, '<', $htmlfile)) {
+        print STDERR "ERROR - could not open $htmlfile\n";
+        return;
+    }
+    my $foundbody = 0;
+    $self->HB_gettext (\$foundbody, $text, $fh);
+    close $fh;
+    # just in case there was no <body> tag
+    if (!$foundbody) {
+        $foundbody = 1;
+        open ($fh, '<', $htmlfile) || return;
+        $self->HB_gettext (\$foundbody, $text, $fh);
+        close $fh;
+    }
+    # HB_gettext has converted iso_8859_1 input to utf8 at this point
+}
+# Reads all lines from $handle and appends them to the scalar referenced by
+# $text.
+#
+# While the flag referenced by $foundbody is false, lines are discarded until
+# one contains a <body ...> tag; that line is kept from just after the tag and
+# the flag is set. A warning is printed to STDERR for the first <font face=...>
+# on a line whose face is not "arial" (the tag itself is left in the text).
+#
+# After reading, the accumulated text is converted to utf8 when the plugin's
+# 'input_encoding' is "iso_8859_1" (ghtml does the same for &eacute; etc.),
+# and every run of whitespace is collapsed to a single space.
+sub HB_gettext {
+    my ($self, $foundbody, $text, $handle) = @_;
+    LINE: while (defined (my $line = <$handle>)) {
+        # still looking for the body tag: drop everything up to and including it
+        unless ($$foundbody) {
+            next LINE unless ($line =~ s/^.*<body[^>]*>//i);
+            $$foundbody = 1;
+        }
+        # check for symbol fonts
+        if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
+            my $font = $1;
+            unless ($font =~ m/^arial$/i) {
+                print STDERR "HBPlug::HB_gettext - warning removed font $font\n";
+            }
+        }
+        $$text .= $line;
+    }
+    # convert to utf-8
+    if ($self->{'input_encoding'} eq "iso_8859_1") {
+        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
+    }
+    # character entities are deliberately left alone here:
+    #&ghtml::convertcharentities ($$text);
+    # remove \n's (and any other runs of whitespace)
+    $$text =~ s/\s+/ /g;
+}
+# Tidies the HTML of a single section and returns the cleaned copy.
+#
+# Steps, in order: strip closing tags that have no earlier opening tag,
+# collapse repeated <p> tags, strip trailing <u>/<i>/<b>/<p>/&nbsp;/whitespace,
+# put a blank line before each paragraph, wrap lines at about 80 characters,
+# and rewrite <img> tags and "&lt;&lt;I&gt;&gt; file.png|jpg|gif" markers as
+# centred images.
+#
+# Parameters:
+#   $section - the section's HTML text
+# Returns the cleaned text; the argument itself is not modified.
+sub HB_clean_section {
+    my $self = shift (@_);
+    my ($section) = @_;
+    # remove tags without a starting tag from the section
+    my ($tag, $tagstart);
+    while ($section =~ m/<\/([^>]{1,10})>/) {
+        $tag = $1;
+        $tagstart = index($section, "<$tag");
+        # stop at the first closing tag that has an opening tag before it
+        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
+        # \Q...\E makes the tag name match literally. Without it a name
+        # containing regex metacharacters (e.g. "</a+>") would not be removed
+        # and this loop would never terminate.
+        $section =~ s/<\/\Q$tag\E>//;
+    }
+    # remove extra paragraph tags
+    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
+    # remove extra stuff at the end of the section
+    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
+    # add a newline at the beginning of each paragraph
+    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
+    # add a newline every 80 characters at a word boundary
+    # Note: this regular expression puts a line feed before
+    # the last word in each section, even when it is not
+    # needed.
+    $section =~ s/(.{1,80})\s/$1\n/g;
+    # fix up the image links
+    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
+    <center><img src=\"$1\" \/><\/center><br\/>/ig;
+    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
+    <center><img src=\"$1\" \/><\/center><br\/>/ig;
+    return $section;
+}
+# Converts a file in the old HDL format (sections marked in the text with
+# &lt;&lt;TOCn&gt;&gt; followed by the section title) to the new HDL format,
+# in which each section is delimited by <Section> tags written inside HTML
+# comments.
+#
+# Parameters:
+#   $file   - the old-style HDL HTML file to read
+#   $cnfile - the file to write the converted HTML to
+# Returns $cnfile. Dies if the output file cannot be opened or closed; text
+# that does not start with a TOC marker is reported on STDERR and dropped.
+sub convert_to_newHDLformat
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+    # write HTML tmp file with new HDL format
+    # (lexical handle, three-argument open: the file name can never be taken
+    # as part of the open mode)
+    open (my $prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+    # read in the file and do basic html cleaning (removing header etc)
+    my $html = "";
+    $self->HB_read_html_file ($input_filename, \$html);
+    # process the file one section at a time
+    my $curtoclevel = 1;
+    my $firstsection = 1;
+    my $toclevel = 0;
+    while (length ($html) > 0) {
+        # consume everything up to and including the next TOC marker and its
+        # title; $3 is the level, $4 the title (up to the following <p)
+        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
+            $toclevel = $3;
+            my $title = $4;
+            my $sectiontext = "";
+            # the section text runs up to the next TOC marker, or to the end
+            # of the document if there is none
+            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
+                $sectiontext = $1;
+            } else {
+                $sectiontext = $html;
+                $html = "";
+            }
+            # remove tags and extra spaces from the title
+            $title =~ s/<\/?[^>]+>//g;
+            $title =~ s/^\s+|\s+$//g;
+            # close any sections below the current level and
+            # create a new section (special case for the firstsection)
+            print $prod "<!--\n";
+            while (($curtoclevel > $toclevel) ||
+                   (!$firstsection && $curtoclevel == $toclevel)) {
+                $curtoclevel--;
+                print $prod "</Section>\n";
+            }
+            if ($curtoclevel+1 < $toclevel) {
+                print STDERR "WARNING - jump in toc levels in $input_filename " .
+                    "from $curtoclevel to $toclevel\n";
+            }
+            while ($curtoclevel < $toclevel) {
+                $curtoclevel++;
+            }
+            if ($curtoclevel == 1) {
+                # add the header tag (outside of the comment)
+                print $prod "-->\n";
+                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
+                print $prod "<!--\n";
+            }
+            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
+            print $prod "-->\n";
+            # clean up the section html
+            $sectiontext = $self->HB_clean_section($sectiontext);
+            print $prod "$sectiontext\n";
+        } else {
+            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
+                "\nin $input_filename\n";
+            last;
+        }
+        $firstsection = 0;
+    }
+    # close the sections that are still open
+    print $prod "<!--\n";
+    while ($curtoclevel > 0) {
+        $curtoclevel--;
+        print $prod "</Section>\n";
+    }
+    print $prod "-->\n";
+    close ($prod) or die("Error Closing File: $tmp_filename $!");
+    return $tmp_filename;
+}
+# Returns $text wrapped in double quotes for use in messages. Text of 100
+# characters or more is abbreviated to its first 50 and last 50 characters,
+# each part quoted and joined by " ... ".
+sub shorten {
+    my ($self, $text) = @_;
+    my $len = length($text);
+    if ($len >= 100) {
+        my $head = substr ($text, 0, 50);
+        my $tail = substr ($text, $len - 50);
+        return "\"" . $head . "\" ... \"" . $tail . "\"";
+    }
+    return "\"$text\"";
+}
+# Works out which file should actually be processed for $file, doing any
+# conversion in the collection's "tidytmp" directory (below
+# $ENV{'GSDLCOLLECTDIR'}) rather than on the input itself.
+#
+# - A directory is returned unchanged.
+# - For .htm/.html/.shtml files: when 'old_style_HDL' is set the file is first
+#   converted with convert_to_newHDLformat; the plain files of the source
+#   directory are copied into the tmp directory (existing copies are kept);
+#   when 'use_realistic_book' is set the result is passed through
+#   tmp_tidy_file. With neither option set the original file name is returned.
+# - Any other file is copied into the tmp directory (unless a copy is already
+#   there) and the name of the copy is returned.
+#
+# Dies if the source directory cannot be read or a copy fails.
+sub convert_tidy_or_oldHDL_file
+{
+    my $self = shift (@_);
+    my ($file) = @_;
+    my $input_filename = $file;
+    if (-d $input_filename)
+    {
+        return $input_filename;
+    }
+    # get the input filename
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    my $base_dirname = $dirname;
+    $suffix = lc($suffix);
+    # derive tmp filename from input filename:
+    # remove whitespace, dots and hyphens from the file name (makes later
+    # conversion by utils simpler). Spaces in the path are left alone.
+    $tailname =~ s/\s+//g;
+    $tailname =~ s/\.+//g;
+    $tailname =~ s/\-+//g;
+    # convert to utf-8 otherwise we have problems with the doc.xml file
+    # later on
+    &unicode::ensure_utf8(\$tailname);
+    # working directory inside the collection
+    my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp");
+    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+    # recreate, below the tmp directory, the folders that follow "import"
+    # in the input file's path
+    my $test_dirname = "";
+    my $f_separator = &util::get_os_dirsep();
+    if ($dirname =~ m/import$f_separator/)
+    {
+        $test_dirname = $'; #'
+        #print STDERR "init $'\n";
+        while ($test_dirname =~ m/[$f_separator]/)
+        {
+            my $folderdirname = $`;
+            $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
+            &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
+            $test_dirname = $'; #'
+        }
+    }
+    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+    # tidy or convert the input file if it is a HTML-like file
+    if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
+    {
+        # convert the input file to a new style HDL
+        my $hdl_output_filename = $input_filename;
+        if ($self->{'old_style_HDL'})
+        {
+            $hdl_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+            $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
+        }
+        # copy every other plain file of the source directory into the tmp
+        # directory unless a file of that name already exists there
+        opendir(my $dirhandle, $base_dirname) or die "Can't open base directory : $base_dirname!";
+        my @files = grep {!/^\.+$/} readdir($dirhandle);
+        # a directory handle has to be released with closedir(); close() only
+        # acts on file handles and would leave the directory open
+        closedir($dirhandle);
+        foreach my $file (@files)
+        {
+            my $src_file = &util::filename_cat($base_dirname,$file);
+            my $dest_file = &util::filename_cat($tmp_dirname,$file);
+            if ((!-e $dest_file) && (!-d $src_file))
+            {
+                # just copy the original file to the tmp directory
+                copy($src_file,$dest_file) or die "Can't copy file $src_file to $dest_file $!";
+            }
+        }
+        # tidy the input file
+        my $tidy_output_filename = $hdl_output_filename;
+        if ($self->{'use_realistic_book'})
+        {
+            $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+            $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
+        }
+        $tmp_filename = $tidy_output_filename;
+    }
+    else
+    {
+        if (!-e $tmp_filename)
+        {
+            # just copy the original file to the tmp directory
+            copy($input_filename,$tmp_filename) or die "Can't copy file $input_filename to $tmp_filename $!";
+        }
+    }
+    return $tmp_filename;
+}
+# Turns the HTML input file into a proper XML file with the font tags removed
+# and width/height attributes added to every img tag.
+# The tidying process takes place in a collection specific 'tmp' directory so
+# that we don't accidentally damage the input.
+#
+# Parameters:
+#   $file   - the HTML file to read
+#   $cnfile - the file to write; may be the same path as $file
+# Returns $cnfile. Dies if the input cannot be read or the output cannot be
+# written.
+sub tmp_tidy_file
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+    # get the input filename; its directory is used to locate the images
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    require HTML::TokeParser::Simple;
+    # create HTML parser to decode the input file
+    my $parser = HTML::TokeParser::Simple->new($input_filename);
+    die("Error Reading File: $input_filename $!") if (!defined $parser);
+    # Build the rewritten HTML in memory and only open the output once the
+    # input has been read completely: the caller may pass the same path for
+    # input and output, and opening it with '>' first would truncate the file
+    # while the parser is still reading from it.
+    my $cleaned_html = "";
+    while (my $token = $parser->get_token())
+    {
+        if ($token->is_start_tag('img'))
+        {
+            # get the attributes
+            my $attr = $token->return_attr;
+            # get the full path to the image
+            my $img_file = &util::filename_cat($dirname,$attr->{src});
+            # set the width and height attribute
+            # NOTE(review): presumably imgsize yields undefined values when
+            # the image cannot be read, which would give empty attributes -
+            # confirm against the imgsize implementation in use
+            ($attr->{width}, $attr->{height}) = imgsize($img_file);
+            # recreate the tag
+            $cleaned_html .= "<img";
+            $cleaned_html .= join("", map { qq { $_="$attr->{$_}"} } keys %$attr);
+            $cleaned_html .= ">";
+        }
+        elsif (($token->is_start_tag('font')) || ($token->is_end_tag('font')))
+        {
+            # remove font tag: nothing is written for it
+        }
+        else
+        {
+            # keep everything else without changes
+            $cleaned_html .= $token->as_is;
+        }
+    }
+    # write HTML tmp file (three-argument open: the file name is never
+    # interpreted as part of the open mode)
+    open (my $prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+    print $prod $cleaned_html;
+    close ($prod) or die("Error Closing File: $tmp_filename $!");
+    # run html-tidy on the tmp file to make it a proper XML file
+    # NOTE(review): the file name is interpolated into a shell command here;
+    # names containing quotes or shell metacharacters are not safe.
+    my $tidyfile = `tidy -utf8 -wrap 0 -asxml "$tmp_filename"`;
+    # write result back to the tmp file
+    open ($prod, '>', $tmp_filename) or die("Error Writing to File: $tmp_filename $!");
+    print $prod $tidyfile;
+    close ($prod) or die("Error Closing File: $tmp_filename $!");
+    # return the output filename
+    return $tmp_filename;
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 20774

Legend:

gsdl/trunk/perllib/plugins/HTMLPlugin.pm

Download in other formats: