Context Navigation

← Previous Changeset
Next Changeset →

Changeset 14012

Timestamp:

2007-04-16T15:43:20+12:00 (17 years ago)

Author:

cvs_anon

Message:

modify HTMLPlug to convert old HDL section style to new HDL section style

Location:

trunk/gsdl/perllib

Files:

: 2 edited

plugins/HTMLPlug.pm (modified) (7 diffs)
strings.properties (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r13968
+              r14012
         'desc' => "{HTMLPlug.tidy_html}",
     'type' => "flag"},
+      { 'name' => "old_style_HDL",
+        'desc' => "{HTMLPlug.old_style_HDL}",
+    'type' => "flag"}
       ];
 …
         'args'     => $arguments };
+# Will make the html input file as a proper XML file with removed font tag and
+# image size added to the img tag.
+# The tidying process takes place in a collection specific 'tmp' directory so
+# that we don't accidentally damage the input.
+sub tmp_tidy_file
+# Reads $htmlfile and appends its text to the scalar referenced by $text.
+#
+# Parameters:
+#   $htmlfile - path of the HTML file to read
+#   $text     - reference to the scalar that receives the text
+#
+# The per-line work is delegated to HB_gettext.  The first pass starts with
+# the "found body" flag cleared; if no <body> tag turned up, the file is read
+# a second time with the flag preset so that the whole file is passed through.
+# Returns nothing.  A file that cannot be opened is reported on STDERR (first
+# pass) or silently skipped (second pass).
+sub HB_read_html_file {
+    my $self = shift (@_);
+    my ($htmlfile, $text) = @_;
+
+    # load in the file; the three-argument form of open makes sure that a
+    # file name starting with '>' or '|' (or ending in '|') is never taken
+    # as an open mode or a command to run
+    my $fh;
+    if (!open ($fh, '<', $htmlfile)) {
+        print STDERR "ERROR - could not open $htmlfile\n";
+        return;
+    }
+
+    my $foundbody = 0;
+    $self->HB_gettext (\$foundbody, $text, $fh);
+    close ($fh);
+
+    # just in case there was no <body> tag
+    if (!$foundbody) {
+        $foundbody = 1;
+        open ($fh, '<', $htmlfile) || return;
+        $self->HB_gettext (\$foundbody, $text, $fh);
+        close ($fh);
+    }
+    # text is in utf8
+}
+# Appends the lines read from $handle to $$text, then normalises the result.
+#
+# Parameters:
+#   $foundbody - reference to a flag; while it is false, lines are skipped
+#                until one contains a <body ...> tag (only the remainder of
+#                that line is kept) and the flag is then set to 1
+#   $text      - reference to the scalar the text is appended to
+#   $handle    - filehandle to read from
+#
+# Once the handle is exhausted the text is converted to utf8 when the
+# plugin's input_encoding is "iso_8859_1" (ghtml does the same for &eacute;
+# etc.), character entities are converted and every run of whitespace is
+# squeezed to a single space.
+sub HB_gettext {
+    my $self = shift (@_);
+    my ($foundbody, $text, $handle) = @_;
+
+    while (defined (my $line = <$handle>)) {
+        # look for body tag
+        if (!$$foundbody) {
+            next unless ($line =~ s/^.*<body[^>]*>//i);
+            $$foundbody = 1;
+        }
+
+        # check for symbol fonts; this only reports the font, the line is
+        # still appended exactly as read.  The warning is labelled with this
+        # plugin's name so that it can be traced to the right module.
+        if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
+            my $font = $1;
+            print STDERR "HTMLPlug::HB_gettext - warning removed font $font\n"
+                if ($font !~ /^arial$/i);
+        }
+
+        $$text .= $line;
+    }
+
+    if ($self->{'input_encoding'} eq "iso_8859_1") {
+        # convert to utf-8
+        $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
+    }
+
+    # convert any alphanumeric character entities to their utf-8
+    # equivalent for indexing purposes
+    &ghtml::convertcharentities ($$text);
+
+    $$text =~ s/\s+/ /g; # remove \n's
+}
+# Tidies the HTML of one section and returns the cleaned copy.
+#
+# Parameters:
+#   $section - the HTML text of the section (the caller's copy is not changed)
+#
+# Steps, in order: drop closing tags that have no opening tag in front of
+# them, collapse runs of <p> tags, trim tags/&nbsp;/whitespace from the end,
+# put a blank line before each paragraph, wrap at about 80 characters and
+# rewrite image references as centred <img> tags.
+sub HB_clean_section {
+    my $self = shift (@_);
+    my ($section) = @_;
+
+    # remove tags without a starting tag from the section
+    my ($tag, $tagstart);
+    while ($section =~ /<\/([^>]{1,10})>/) {
+        $tag = $1;
+        $tagstart = index($section, "<$tag");
+        last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
+        # \Q..\E: the tag name is literal text taken from the document, not
+        # a pattern.  Unescaped, a name such as "c++" never matches itself
+        # (the loop would then never finish) and a name such as "(" makes
+        # the substitution die with a regex compile error.
+        $section =~ s/<\/\Q$tag\E>//;
+    }
+
+    # remove extra paragraph tags
+    while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
+
+    # remove extra stuff at the end of the section
+    while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
+
+    # add a newline at the beginning of each paragraph
+    $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
+
+    # add a newline every 80 characters at a word boundary
+    # Note: this regular expression puts a line feed before
+    # the last word in each section, even when it is not
+    # needed.
+    $section =~ s/(.{1,80})\s/$1\n/g;
+
+    # fix up the image links (the replacement text deliberately starts on a
+    # new line, so the generated <center> block begins with a line break)
+    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
+    <center><img src=\"$1\"><\/center><br>/ig;
+    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
+    <center><img src=\"$1\"><\/center><br>/ig;
+
+    return $section;
+}
+# Converts a file in the old HDL format to the new HDL format, in which
+# sections are marked with <Section> tags held inside HTML comments.
+#
+# Parameters:
+#   $file   - path of the old-style input file
+#   $cnfile - path of the converted file to write
+#
+# Old-style sections start with a &lt;&lt;TOCn&gt;&gt; marker followed by the
+# section title, where n is the section level.  For every marker a <Section>
+# block carrying a Title metadata element is written, followed by the cleaned
+# section text; a level 1 section is also preceded by an HTML header.
+#
+# Returns $cnfile.  Dies if the output file cannot be opened or closed.
+sub convert_to_newHDLformat
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+
+    # write HTML tmp file with new HDL format (three-argument open, so the
+    # file name can never be mistaken for an open mode)
+    open (my $prod, '>', $tmp_filename) || die("Error Writing to File: $tmp_filename $!");
+
+    # read in the file and do basic html cleaning (removing header etc)
+    my $html = "";
+    $self->HB_read_html_file ($input_filename, \$html);
+
+    # process the file one section at a time
+    my $curtoclevel = 1;
+    my $firstsection = 1;
+    my $toclevel = 0;
+    # number of <Section> tags written that have not been closed yet
+    my $opensections = 0;
+    while (length ($html) > 0) {
+        if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
+            $toclevel = $3;
+            my $title = $4;
+            my $sectiontext = "";
+            # the section runs up to the next TOC marker, or to the end of
+            # the text if there is none
+            if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
+                $sectiontext = $1;
+            } else {
+                $sectiontext = $html;
+                $html = "";
+            }
+
+            # remove tags and extra spaces from the title
+            $title =~ s/<\/?[^>]+>//g;
+            $title =~ s/^\s+|\s+$//g;
+
+            # close any sections below the current level and
+            # create a new section (special case for the firstsection)
+            print $prod "<!--\n";
+            while (($curtoclevel > $toclevel) ||
+                   (!$firstsection && $curtoclevel == $toclevel)) {
+                $curtoclevel--;
+                print $prod "</Section>\n";
+                $opensections-- if ($opensections > 0);
+            }
+            if ($curtoclevel+1 < $toclevel) {
+                print STDERR "WARNING - jump in toc levels in $input_filename " .
+                    "from $curtoclevel to $toclevel\n";
+            }
+            # a jump of several levels still opens just one section
+            while ($curtoclevel < $toclevel) {
+                $curtoclevel++;
+            }
+
+            if ($curtoclevel == 1) {
+                # add the header tag
+                print $prod "-->\n";
+                print $prod "<HTML>\n<HEAD>\n<TITLE>$title</TITLE>\n</HEAD>\n<BODY>\n";
+                print $prod "<!--\n";
+            }
+
+            print $prod "<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">$title</Metadata>\n\t</Description>\n";
+            $opensections++;
+            print $prod "-->\n";
+
+            # clean up the section html
+            $sectiontext = $self->HB_clean_section($sectiontext);
+            print $prod "$sectiontext\n";
+        } else {
+            print STDERR "WARNING - leftover text\n" , $self->shorten($html),
+                "\nin $input_filename\n";
+            last;
+        }
+        $firstsection = 0;
+    }
+
+    # close exactly the sections that are still open, so the closing tags
+    # match the opening ones whatever level the last section was on
+    print $prod "<!--\n";
+    while ($opensections > 0) {
+        $opensections--;
+        print $prod "</Section>\n";
+    }
+    print $prod "-->\n";
+
+    close ($prod) || die("Error Closing File: $tmp_filename $!");
+
+    return $tmp_filename;
+}
+sub convert_tidy_or_oldHDL_file
+{
     my $self = shift (@_);
 …
     $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname);
     &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     # tidy the input file if it is a HTML-like file
+    # tidy or convert the input file if it is a HTML-like file or it is accepted by the process_exp
     if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
+    {
+        # convert the input file to a new style HDL
+        my $hdl_output_filename = $input_filename;
+        if ($self->{'old_style_HDL'})
+        {
+        $hdl_output_filename = &util::filename_cat($tmp_dirname, "newHDL_$tailname$suffix");
+        $hdl_output_filename = $self->convert_to_newHDLformat($input_filename,$hdl_output_filename);
+        }
+    # tidy the input file
+    my $tidy_output_filename = $hdl_output_filename;
+    if ($self->{'tidy_html'})
+    {
+        $tidy_output_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+        $tidy_output_filename = $self->tmp_tidy_file($hdl_output_filename,$tidy_output_filename);
+    }
+    $tmp_filename = $tidy_output_filename;
+    # as a precaution, copy every other (non-directory) file from the base dir to the tmp dir if it does not exist there yet
+    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
+    my @files = grep {!/^\.+$/} readdir(DIR);
+    close(DIR);
+    foreach my $file (@files)
+    {
+        my $src_file = &util::filename_cat($base_dirname,$file);
+        my $dest_file = &util::filename_cat($tmp_dirname,$file);
+        if ((!-e $dest_file) && (!-d $src_file))
+        {
+            # just copy the original file back to the tmp directory
+                open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";
+                open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";
+                print TIDYOUT <TIDYIN>;
+                close TIDYIN;
+                close TIDYOUT;
+        }
+    }
+    }
+    else
+    {
+        if (!-e $tmp_filename)
+    {
+            # just copy the original file back to the tmp directory
+            open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";
+            open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";
+            print TIDYOUT <TIDYIN>;
+            close TIDYIN;
+            close TIDYOUT;
+    }
+    }
+    return $tmp_filename;
+}
+# Turns the HTML input file into a proper XML file, with font tags removed and
+# the image size added to each img tag.
+# The tidying process takes place in a collection-specific 'tmp' directory so
+# that we don't accidentally damage the input.
+sub tmp_tidy_file
+{
+    my $self = shift (@_);
+    my ($file,$cnfile) = @_;
+    my $input_filename = $file;
+    my $tmp_filename = $cnfile;
+    # get the input filename
+    my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
         # create HTML parser to decode the input file
         my $parser = HTML::TokeParser::Simple->new($input_filename);
 …
         print PROD $tidyfile;
         close (PROD) || die("Error Closing File: $tmp_filename $!");
-    # just for checking copy all other file from the base dir to tmp dir if it is not exists
-    opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!";
-    my @files = grep {!/^\.+$/} readdir(DIR);
-    close(DIR);
-    foreach my $file (@files)
+    {
-        my $src_file = &util::filename_cat($base_dirname,$file);
-        my $dest_file = &util::filename_cat($tmp_dirname,$file);
-        if ((!-e $dest_file) && (!-d $src_file))
+        {
-            # just copy the original file back to the tmp directory
-                open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!";
-                open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!";
-                print TIDYOUT <TIDYIN>;
-                close TIDYIN;
-                close TIDYOUT;
+        }
+    }
+    }
-    else
+    {
-        if (!-e $tmp_filename)
+    {
-            # just copy the original file back to the tmp directory
-            open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!";
-            open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!";
-            print TIDYOUT <TIDYIN>;
-            close TIDYIN;
-            close TIDYOUT;
+    }
+    }
     # return the output filename
 …
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
+    # check the process_exp and block_exp thing
+    my ($block_status,$filename) = $self->read_block(@_);
+    return $block_status if ((!defined $block_status) || ($block_status==0));
     # get the input file
     my $input_filename = $file;
 …
     $suffix = lc($suffix);
     if ($self->{'tidy_html'})
+    if (($self->{'tidy_html'}) || ($self->{'old_style_HDL'}))
+    {
-        # tidy the input file if it is a HTML-like file
-        #if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml"))
-        #{
         # set the file to be tidied
             $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
             # get the tidied file
+            my $tidy_filename = $self->tmp_tidy_file($input_filename);
+            #my $tidy_filename = $self->tmp_tidy_file($input_filename);
+        my $tidy_filename = $self->convert_tidy_or_oldHDL_file($input_filename);
             # derive tmp filename from input filename
 …
         $file = "$tailname$suffix";
         $base_dir = $dirname;
-    #}
+    }

trunk/gsdl/perllib/strings.properties

r13968	r14012
819	819	HTMLPlug.tidy_html:If set, converts a HTML document to a well-formed XHTML. It enable users to view the document in the book format.
820	820
	821	HTMLPlug.old_style_HDL:To mark whether the file in this collection is sectionalized using the old HDL's section style.
	822
821	823	ImagePlug.converttotype:Convert main image to format 's'.
822	824

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 14012

Legend:

trunk/gsdl/perllib/plugins/HTMLPlug.pm

trunk/gsdl/perllib/strings.properties

Download in other formats: