Context Navigation

← Previous Changeset
Next Changeset →

Changeset 14251

Timestamp:

2007-07-16T10:22:59+12:00 (17 years ago)

Author:

anna

Message:

updated version, added comments

File:

: 1 edited

gsdl/trunk/perllib/plugins/MediaWikiPlug.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/plugins/MediaWikiPlug.pm

-              r14108
+              r14251
+#
 ###########################################################################
+# This plugin processes an HTML file whose sections are divided by
+# user-defined heading tags. Because it is difficult to predict how users define their headings,
+# this plugin can detect user-defined titles at up to three levels (level1, level2, level3...)
+# and can also remove a user-defined Table of Contents (TOC)...
+# Format, e.g.: level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
+# This plugin processes an HTML file from a MediaWiki website that was downloaded by
+# the MediaWikiDownload plug. It trims MediaWiki functional sections such as
+# login, discussion, history, etc. Only the navigation and search sections can be preserved.
+# The searchbox is modified to search the Greenstone collection instead of the website.
+# The plugin can also automatically add the table of contents from the website's Main_Page
+# to the collection's Home page.
 package MediaWikiPlug;
 use HTMLPlug;
+use ImagePlug;
+use File::Copy;
+# use ImagePlug;
+# use File::Copy;
+use unicode;
 #use strict; # every perl program should have this!
 …
 sub BEGIN {
     @MediaWikiPlug::ISA = ('HTMLPlug');
+    @MediaWikiPlug::ISA = ('HTMLPlug');
+}
 my $arguments =
+    [
+     # show the table of contents on collection's home page
      { 'name' => "show_toc",
        'desc' => "{MediaWikiPlug.show_toc}",
        'type' => "flag",
        'reqd' => "no"},
+     # set to delete the table of contents section on each MediaWiki page
+     { 'name' => "delete_toc",
+       'desc' => "{MediaWikiPlug.delete_toc}",
+       'type' => "flag",
+       'reqd' => "no"},
+     # regexp to match the table of contents
      { 'name' => "toc_exp",
        'desc' => "{MediaWikiPlug.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
+       'deft' => "" },
+     { 'name' => "delete_toc",
+       'desc' => "{MediaWikiPlug.delete_toc}",
+       'type' => "flag",
+       'reqd' => "no"},
+       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" },
+     # set to delete the navigation section
      { 'name' => "delete_nav",
        'desc' => "{MediaWikiPlug.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
+       'deft' => ""},
+     { 'name' => "nav_exp",
+       'desc' => "{MediaWikiPlug.nav_exp}",
+       'deft' => ""},
+     # regexp to match the navigation section
+     { 'name' => "nav_div_exp",
+       'desc' => "{MediaWikiPlug.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
+       'deft' => "" },
+     { 'name' => "tag_sections",
+       'desc' => "{MediaWikiPlug.tag_sections}",
+       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
+     # set to delete the searchbox section
+     { 'name' => "delete_searchbox",
+       'desc' => "{MediaWikiPlug.delete_searchbox}",
        'type' => "flag",
+       'reqd' => "no"},
+     { 'name' => "description_tags",
+       'desc' => "{HTMLPlug.description_tags}",
+       'type' => "flag",
+       'reqd' => "no"}
+       'reqd' => "no",
+       'deft' => ""},
+     # regexp to match the searchbox section
+     { 'name' => "searchbox_div_exp",
+       'desc' => "{MediaWikiPlug.searchbox_div_exp}",
+       'type' => "regexp",
+       'reqd' => "no",
+       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
+     # regexp to match title suffix
+     # can't use the title_sub option in HTMLPlug instead
+     # because title_sub always matches from the beginning
+     { 'name' => "remove_title_suffix_exp",
+       'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
+       'type' => "regexp",
+       'reqd' => "no",
+       'deft' => ""}
      ];
 my $options = { 'name'     => "MediaWikiPlug",
 …
         'args'     => $arguments };
 sub new {
     my ($class) = shift (@_);
 …
     $head =~ m/<title>(.+)<\/title>/i;
     my $doctitle = $1 if defined $1;
+    my $doctitle = $1 if defined $1;
     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
 …
     # set the title here if we haven't found it yet
     if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
+    if (defined $doctitle && $doctitle =~ /\S/) {
+        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
+    if (defined $doctitle && $doctitle =~ /\S/) {
+            # remove suffix in title if required
+            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
+        if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
+           $doctitle =~ s/$remove_suffix_exp//i;
+        }
+        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
     } else {
         $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
+        $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
+    }
+    }
+    if(defined $base_dir && $base_dir ne ""){
+    # find and download stylesheet
+    }
+    # we are only interested in the column-contents div <div id="column-content">
+    # remove header section, it may contain header images or additional search boxes
+    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
+    $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
+    # remove timeline
+    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;
+    # remove extra bits
+    my $extra_bits = "Retrieved from(.+)</a>\"";
+    $body_text =~ s/$extra_bits//isg;
+    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
+    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
+    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
+    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;
+    # get rid of the [edit] buttons
+    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
+    # get rid of the last time edit information at the bottom
+    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
+    # get rid of the (Redirected from ...)
+    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;
+    # escape texts macros
+    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
+    # may change the links, like Greenstone_Documentation_All.html, then change back
+    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
+    # define file delimiter for different platforms
+    my $file_delimiter;
+    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
+       $file_delimiter = "\\";
+    } else {
+       $file_delimiter = "/";
+    }
+    # IMPORTANT: different delimiter for $base_dir and $file
+    # $base_dir use forward slash for both windows and linux
+    # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
+                                        # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
+    # $file use different delimiters : forward slash for linux; backward slash for windows
+    # print "\nfile : $file\n\n";         # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html
+                                        # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
+    # get the base url for the MediaWiki website
+    my $safe_delimiter = &safe_escape_regexp($file_delimiter);
+    my @url_dirs=split($safe_delimiter, $file);
+    my $url_base = $url_dirs[0];
+    # Re-check css files associated with MediaWiki pages
+    if(defined $base_dir && $base_dir ne ""){
     my @css_files;
     my $css_file_count = 0;
+    # find all the style sheets imported with import statement
+    # find all the stylesheets imported with @import statement
     while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
+        $css_files[$css_file_count++] = $2 if defined $2;
+    }
+        $css_files[$css_file_count++] = $2 if defined $2;
+    }
+    # download the stylesheets if we haven't downloaded them yet
+        # add prefix to each style element, comment out the body element
+        # and copy the files to collection's images folder
+    for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
+        my $css_file = $css_files[$css_file_count];
+        # remove prefix gli/cache directory
+            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;
+            # change the \ delimiter in $css_file to / for consistency
+            $css_file =~ s/\\/\//isg;
+            if($css_file !~ /$url_base/) {
+              $css_file = $url_base . $css_file;
+            }
+            # trim the ? mark append to the end of a stylesheet
+        $css_file =~ s/\?(.+)$//isg;
+            my $css_file_path = &util::filename_cat($base_dir, $css_file);
+        # do nothing if we have already downloaded the css files
+        if (! -e $css_file_path) {
+             # check the stylesheet's directory in the import folder
+             # if the directory doesn't exist, create one
+         my @dirs = split(/\//i,$css_file);
+         my $path_check = "$base_dir/";
+         for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
+        $path_check .= $dirs[$i] . "/";
+        mkdir($path_check) if (! -d $path_check );
+         }
+             # NOTE: wget needs to be configured to access the Internet directly
+             # These files should already have been downloaded if we used the MediaWikiDownload
+         # downloading
+         $css_file = "http://$css_file";
+             print "\ndownloading : " . $css_file . "\n\n";
+         system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
+         if ($? != 0) {
+              print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
+              print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
+              unlink("$css_file_path");
+             }
+            } # done with download
+        # add a prefix "#wikispecificstyle" to each element
+        # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
+            # so we will wrap the web page with a div with id = wikispecificstyle
+            my $css_content;
+        if(open(INPUT, "<$css_file_path")){
+        while(my $line = <INPUT>){
+                    # comment out the body element because we change the body to div
+                    $line =~ s/^(\s*)body(\s*){(\s*)$/$1\/*body$2*\/{$3/isg;
+            if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){
+            $line = "#wikispecificstyle " . $line;
+            }
+            $css_content .= $line;
+        }
+        close(INPUT);
+        open(OUTPUT, ">$css_file_path");
+        print OUTPUT $css_content;
+        close(OUTPUT);
+        }
+            # Copy the modified stylesheets to collection's images folder
+            # for future customization
+            my $images_dir = $base_dir;
+            $images_dir =~ s/import$/images/;
+            $css_file =~ m/(.*)\/(.*)$/;
+            $images_dir = &util::filename_cat($images_dir, $2);
+            if(open(OUTPUT, ">$images_dir")){
+              print OUTPUT $css_content;
+              close(OUTPUT);
+            }
+    }
+    }
+    # by default, only preserve navigation box and search box
+    # others like toolbox, interaction, languages box, will be removed
+    # extract the larger part -- footer section
+    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
+    $body_text =~ /$print_footer/;
+    my $footer = "";
+    $footer = $& if defined $&;
+    $footer =~ s/<\/body>//isg;
+    # trim the comments first
+    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;
+    # contain sections that are to be preserved
+    my $preserve_sections = "";
+    # process the navigation section
+    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
+    if (defined $self->{'nav_div_exp'}) {
+      $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
+    }
+    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
+        # do nothing
+    } else {
+      if ($footer =~ m/$nav_match_exp/ig) {
+        $preserve_sections = $& ;
+      } else {
+        print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
+      }
+      # if($preserve_sections =~/\S/){
+      #  $preserve_sections .= "</div>";
+      # }
+    }
+    # process the searchbox section
+    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
+    if(defined $self->{'searchbox_div_exp'}) {
+        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
+    }
+    my $searchbox_section = "";
+    $footer =~ m/$searchbox_exp/ig;
+    $searchbox_section = $& if defined $&;
+    # make the searchbox form work in Greenstone
+    if($searchbox_section =~ /\S/){
+        # replace action
+        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
+        # remove buttons
+        $searchbox_section =~ s/name="search"/name="q"/isg;
+        $searchbox_section =~ s/name="go"//isg;
+        $searchbox_section =~ s/name="fulltext"//isg;
+        # get collection name from $base_dir for c param
+        $base_dir =~ m/\/collect\/(.+)\//i;
+        my $collection_name = "";
+        $collection_name = $1 if defined $1;
+        # add Greenstone search params
+        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
+            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
+            # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
+            # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
+        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
+        # $searchbox_section .= "</div>";
+    } else {
+      print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
+    }
+    # either delete or replace the searchbox
+    if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
+        # do nothing
+    } else {
+        $preserve_sections .= "\n$searchbox_section\n";
+    }
+    if($preserve_sections ne ""){
+      $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
+    }
+    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
+    $body_text =~ s/$print_footer/$preserve_sections/isg;
+    # delete other forms in the page
+    my @forms;
+    my $form_count = 0;
+    while($body_text =~ m/<form([^>]*)name=("|')([^>]*)("|')/isg){
+        next if($3 eq "q");
+        $forms[$form_count++] = $&;
+    }
+    foreach my $form (@forms) {
+      $body_text =~ s/$form[\s\S]*?<\/form>//m;
+    }
+    # process links.
+    # because in the current WGET 1.10 the -k and -E options don't work together,
+    # we need to 'manually' convert the links to relative links
+    # Dealing with 3 types of links:
+    # -- outgoing links
+    #   -- if we have downloaded the target files, link to the internal version (relative link)
+    #   -- otherwise, link to the external version (absolute links)
+    # -- in-page links (relative link)
+    # NOTE: (important)
+    #   must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
+    #   otherwise, the internal links may have problems
+    # remove the title attribute of <a> tag
+    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
+    # extract all the links
+    my @links;
+    my $link_count = 0;
+    while($body_text =~ m/(href|src)="([^>\s]*)$url_base\/([^>\s]*)"/ig){
+        $links[$link_count++] = "$1=\"$2$url_base/$3\"";
+    }
+    foreach my $cur_link (@links) {
+        # escape greedy match + character
+        $cur_link =~ s/\+/\\+/isg;
+        $cur_link =~ m/(.+)"([^>]*)$url_base\/([^>\s]*)"/;
+        my $external_file_path = "$1\"http://$url_base/$3\"";
+        $body_text =~ s/$cur_link/$external_file_path/i;
+    }
+    # tag links to new wiki pages as red
+    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
+    # tag links to pages external of the MediaWiki website as blue
+    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
+    # process the table-of-contents section
+    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
+    # 1. read _content_ macro from about.dm
+    # 2. append the toc, change all links to the Greenstone internal format for relative links
+    # 3. write to the extra.dm
+    # TODO: we assume the _about:content_ hasn't been specified before
+    #       so needs to add function to handle when the macro is already in the extra.dm
+    if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
+      # extract toc of the Main_Page
+      my $mainpage_toc = "";
+      my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
+      if($self->{'toc_exp'} =~ /\S/){
+         $toc_exp = $self->{'toc_exp'};
+      }
+      if($body_text =~ /$toc_exp/){
+        $mainpage_toc = $&;
+      }
+      if($mainpage_toc =~ /\S/) {
+        # change the in-page links to relative links, for example, change <a href="#section1"> to
+        # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
+        my $file_url_format = $file;
+        $file_url_format =~ s/\\/\//isg;
+    $file_url_format = "http://" . $file_url_format;
+        # encode as URL, otherwise doesn't work on Windows
+        $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
+    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
+        # read the collection's extra.dm
+        my $macro_path = $base_dir;
+        $macro_path =~ s/import$/macros/;
+        my $extradm_file = &util::filename_cat($macro_path, "extra.dm");
+        my $extra_dm = "";
+        if(open(INPUT, "<$extradm_file")){
+        while(my $line = <INPUT>){
+        $extra_dm .= $line;
+        }
+        } else {
+            print $outhandle "can't open file $extradm_file\n";
+        }
+        close(INPUT);
+        # check whether we have changed the macros
+        my @packages = split("package ", $extra_dm);
+        my $about_package = "";
+        foreach my $package (@packages) {
+          $about_package = "package " . $package if($package =~ /^about/);
+        }
+        my $update_extra_dm = 0;
+        if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){
+       print $outhandle "_content_ macro already changed!!!!\n";
+    }
+        # if extra.dm doesn't have an "about package"
+        elsif ($about_package !~ /\S/) {
+          # read _content_ macro from $GSDLHOME/macros/about.dm file
+      my $global_about_package = &read_content_from_about_dm();
+          # create the extra _content_ macro for this collection
+          # add the original content of the _content_ macro
+          $global_about_package =~ m/{(.|\n)*<\/div>\n\n/;
+          # append the new about package to extra.dm
+          $extra_dm .= "\n\npackage about\n_content_$&\n\n";
+          $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
+          $update_extra_dm = 1;
+        }
+        # the about package exists, but either doesn't have the _content_ macro or
+        # the _content_ macro doesn't contain the toc
+        else {
+          # check if there is a content macro
+          my $content_macro_existed = 0;
+          $content_macro_existed = ($about_package =~ /(\s*|\n)_content_(\s*){/);
+          # if there is one
+          # append a new section div for toc to the end of the document section
+          if($content_macro_existed ==1) {
+            $about_package =~ /(\s*|\n)_content_(\s*){(.|\n)*?}/;
+            my $content_macro = $&;
+            my $new_content_macro = $content_macro;
+            $new_content_macro =~ s/<div[^>]*class="document">(.|\n)*<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
+            $extra_dm =~ s/$content_macro/$new_content_macro/mg;
+          }
+          # otherwise, append _content_ macro to the about package
+          else {
+            my $new_about_package = $about_package;
+            $content_macro = &read_content_from_about_dm();
+            $content_macro =~ m/{(.|\n)*<\/div>\n\n/;
+            $new_about_package .= "\n\n_content_$&\n\n";
+            $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
+            $extra_dm =~ s/$about_package/$new_about_package/mg;
+          }
+          # in either case, we need to update the extra.dm
+          $update_extra_dm = 1;
+         }
+         if($update_extra_dm==1){
+            # write to the extra.dm file of the collection
+            if (open(OUTPUT, ">$extradm_file")) {
+                print OUTPUT $extra_dm;
+            } else {
+                print "can't open $extradm_file\n";
+            }
+            close(OUTPUT);
+         }
+      } else {
+        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
+      }
+    }
+    # check whether the stylesheet exists
+    # if not, download it and copy to the collection's images folder
+    for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
+        my $css_file = $css_files[$css_file_count];
+        $css_file =~ s/^(.+)gli\/cache\///i;
+        my $css_file_path = "$base_dir/$css_file";
+        if (-e $css_file_path){ # the file already exists
+            next;
+        }
+        # check the css directory and create one if it's not there
+        my @dirs = split(/\//i,$css_file);
+        my $path_check = "$base_dir/";
+        for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
+            $path_check .= $dirs[$i] . "/";
+            if(! -d $path_check ){
+                mkdir($path_check);
+            }
+        }
+        # download
+        $css_file = "http://$css_file";
+        system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
+        if ($? != 0) {unlink("$css_file_path");}
+        # change every style element to #wikispecificstyle ...
+        if(open(INPUT, "<$css_file_path")){
+            my $css_content;
+            while(my $line = <INPUT>){
+                if($line =~ m/^(.+)\{/i){
+                    $line = "#wikispecificstyle " . $line;
+                }
+                $css_content .= $line;
+            }
+            close(INPUT);
+            open(OUTPUT, ">$css_file_path");
+            print OUTPUT $css_content;
+            close(OUTPUT);
+        }
+        # copy to images folder
+        # do not copy, because collection can only have one specific stylesheet
+        # better to add and modify the style sheets manually
+        # @dirs = split(/\//i,$base_dir);
+        # my $collection_base_dir;
+        # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
+        #   $collection_base_dir .= $dirs[$i] . "/";
+        # }
+        # my $images_folder = $collection_base_dir . "images/";
+        # copy($css_file_path, $images_folder) || die "File cannot be copied.";
+    # If delete_toc is set, remove toc and tof contents.
+    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
+    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
+          # print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
+          if ($body_text =~ /$self->{'toc_exp'}/) {
+        $body_text =~ s/$self->{'toc_exp'}//i;
+          }
+    }
+    }
+    # add sections around h2 tag
+    # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
+    # add search box with each section
+    if ($self->{'tag_sections'}) {
+    my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
+    for(my $i=1; $i < scalar(@sections); $i++){
+        my $section_title = $sections[$i];
+        $section_title =~ s/<([^>]*)>//g;
+        $section_title =~ s/(^\s|\s$)//g;
+        my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
+        if($i !=1){
+            $section_metadata = "</Section>\n" . $section_metadata;
+        }
+        $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
+        $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
+        $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
+        $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
+        if($i==scalar(@sections)-1) {
+            # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
+            $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
+        }
+    }
+     }
+    # If delete_nav is enabled, it means to get rid of navigation contents.
+    # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
+    #   if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
+    #       print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
+    #       $body_text =~ s/$self->{'nav_exp'}//isg;
+    #   }
+    #}
+    my $searchbox = "";
+    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
+    my $nav_match_express;
+    if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
+        $nav_match_express = $self->{'nav_exp'} ;
+    } else { # default setting for mediawiki
+        $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
+    }
+    print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
+    # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;
+    $body_text =~ m/$nav_match_express/isg;
+    my $navigate = $& if defined $&;
+    # find the search box and add it to the document page
+    if(defined $navigate && $navigate =~ /\S/){
+        $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
+        $searchbox = $& . "\n<\/div>\n<\/div>";
+        $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
+        $searchbox =~ s/name="search"/name="q"/isg;
+        $searchbox =~ s/name="go"//isg;
+        $searchbox =~ s/name="fulltext"//isg;
+        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
+                    ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
+                    ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
+                    ."<input type=\"hidden\" name=\"t\" value=\"1\">";
+        $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
+        $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
+    }
+    # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
+    $body_text =~ s/$nav_match_express/$searchbox/isg;
+    }
+    if ($self->{'tag_sections'}) {
+        $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
+    }
+    # Tidy up extra new lines
+    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
+    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
+    $section_text .= "<!--\n<Section>\n-->\n";
+    my $body = "<body".$body_text;
+    $$textref = $body;
+    # get the base dir for convert absolute links to relative links
+    $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
+    my $basedir = $2;
+    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
+    $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
+    # get rid of the [edit] button
+    $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
+    # get rid of the last time edit information at the bottom
+    $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
+    # get rid of the (Redirected from ...)
+    $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
+    # escape macros
+    $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
+    # may change the links, like Greenstone_Documentation_All.html, then change back
+    $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
+    # convert all the urls to relative urls, because in the current wget 1.10 the -k and -E options don't work together
+    # get rid of the title attribute of a tag
+    $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
+    # find the relative path of current directory
+    if($basedir ne ""){
+        my @dirs=split("\/", $file);
+        my $dirnum = scalar(@dirs);
+        my $replace = "";
+        for(my $i=0; $i<$dirnum-2; $i++){
+            $replace .= "../";
+        }
+        # test if the linked relative file exists, if not, link to the internet version
+        $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;
+        # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
+        # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
+        # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
+        #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
+            #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
+        #   my $prefix = $1;
+        #   my $link = $&;
+        #   my $rel_file_name = $3;
+        #   my $rel_link = "$replace$rel_file_name";
+            # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
+        #   if(-e $rel_link){
+        #       $rel_link = "$prefix=\"$rel_link\"";
+        #       $$textref =~ s/$link/$rel_link/i;
+        #   }else{
+        #       my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
+                # print $outhandle "external link==> $ext_link\n";
+        #       $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
+        #   }
+        #}
+        # tag the link to new wiki pages as red
+        $$textref =~ s/(href|src)="$replace([^>]*)&amp;action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&amp;action=edit$3"/gi;
+        $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
+        # tag the link to external pages as blue
+        $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
+        #print $outhandle $$textref;
+    }
+    # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
+    # 1. read _content_ macro from about.dm
+    # 2. append the toc, change all links to the Greenstone internal format for relative links
+    # 3. write to the extra.dm
+    # TODO: currently we suppose the _about:content_ hasn't been specified before
+    #       so needs to add function to handle when the macro is already in the extra.dm
+    if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
+        my $macro_path = $base_dir;
+        $macro_path =~ s/import$/macros/;
+        my $extra_dm;
+        my $extradm_file = "$macro_path/extra.dm";
+        if(open(INPUT, "<$extradm_file")){
+            while(my $line = <INPUT>){
+                $extra_dm .= $line;
+        }
+            close(INPUT);
+            if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
+                print $outhandle "already changed!!!!\n";
+            } else {
+                # read _content_ macro from about.dm file
+                my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
+                my $about_page_content = "";
+                if(open(INPUT, "<$about_macro")){
+                    while(my $line=<INPUT>){
+                        $about_page_content .= $line;
+                    }
+                }else{
+                    print $outhandle "can't open file $about_macro\n";
+                }
+                close(INPUT);
+                # extract the _content_ macro
+                $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
+                $about_page_content = $&;
+                # extract toc of the Main_Page
+                my $mainpage_content = "";
+                if($self->{'toc_exp'} =~ /\S/){
+                    $$textref =~ /$self->{'toc_exp'}/;
+                    $mainpage_content = $&;
+                } else {
+                    # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
+                }
+                # print $outhandle "---------\n$$textref\n--------\n\n";
+                # print $outhandle "==========\n$mainpage_content\n==========\n\n";
+                # add toc to the _content_ macro
+                $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
+                $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
+                # change all links to the internal Greenstone relative link format
+                $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
+                $extra_dm =~ s/(\.\.\/)+/\//isg;
+                # print $outhandle "to add---------\n$extra_dm\n--------\n";
+                # write to the extra.dm file of the collection
+                open(OUTPUT, ">$extradm_file");
+                print OUTPUT $extra_dm;
+                close(OUTPUT);
+            }
+        } else {
+            print $outhandle "can't open file $extradm_file\n";
+        }
+    }
+    # If delete_toc is enabled, it means to get rid of toc and tof contents.
+    # get rid of TOC and TOF sections and their title
+    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
+        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
+            # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
+            # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
+            # $body_text =~ s/$self->{'toc_exp'}//i;
+            print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
+            $$textref =~ s/$self->{'toc_exp'}//i;
+        }
+    }
+    # To add a layer on top of the wiki page
+    # so as to keep the wiki style inside the wiki page
+    # and keep the Greenstone style at the same time
+    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
+    $$textref =~ s/<\/body>/<\/div><\/body>/is;
+    # tag with sections
+    $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
+    $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
+    #print $outhandle "\n\n$$textref\n\n";
+    # use description tags
+    if ($self->{'description_tags'}) {
+        my $cursection = $doc_obj->get_top_section();
+        # remove the html header - note that doing this here means any
+        # sections defined within the header will be lost (so all <Section>
+        # tags must appear within the body of the HTML)
+        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
+        $$textref =~ s/^.*?<body[^>]*>//is;
+        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
+        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
+        my $lt = '(?:<|&lt;)';
+        my $gt = '(?:>|&gt;)';
+        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
+        # my $dont_strip = '';
+        # if ($self->{'no_strip_metadata_html'}) {
+        #    ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
+        # }
+        my $found_something = 0;
+        my $top = 1;
+        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
+            my $text = $1;
+            my $comment = $2;
+            if (defined $text) {
+                # text before a comment - note that getting to here
+                # doesn't necessarily mean there are Section tags in
+                # the document
+                # print $outhandle "section text:\n$text\n";
+                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
+            }
+            while ($comment =~ s/$lt(.*?)$gt//s) {
+                my $tag = $1;
+                if ($tag eq "Section") {
+                    $found_something = 1;
+                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
+                    $top = 0;
+                } elsif ($tag eq "/Section") {
+                    $found_something = 1;
+                    $cursection = $doc_obj->get_parent_section ($cursection);
+                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
+                    my $metaname = $1;
+                    my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
+                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
+                    my $metavalue = $1;
+                    $metavalue =~ s/^\s+//;
+                    $metavalue =~ s/\s+$//;
+                    # assume that no metadata value intentionally includes
+                    # carriage returns or HTML tags (if they're there they
+                    # were probably introduced when converting to HTML from
+                    # some other format).
+                    # actually some people want to have html tags in their
+                    # metadata.
+                    $metavalue =~ s/[\cJ\cM]/ /sg;
+                    # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
+                    $metavalue =~ s/\s+/ /sg;
+                    # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
+                    if ($accumulate) {
+                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
+                    } else {
+                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
+                    }
+                } elsif ($tag eq "Description" || $tag eq "/Description") {
+                    # do nothing with containing Description tags
+                } else {
+                    # simple HTML tag (probably created by the conversion
+                    # to HTML from some other format) - we'll ignore it and
+                    # hope for the best ;-)
+                }
+            }
+        }# end while
+        if ($cursection ne "") {
+            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
+        }
+        $$textref =~ s/^.*?<body[^>]*>//is;
+        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        if ($$textref =~ /\S/) {
+            if (!$found_something) {
+                if ($self->{'verbosity'} > 2) {
+                    print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
+                    print $outhandle "          will be processed as a single section document\n";
+                }
+                # go ahead and process single-section document
+                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+            } else {
+                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
+                print $outhandle "          of the final closing </Section> tag. This text will\n";
+                print $outhandle "          be ignored.";
+                my ($text);
+                if (length($$textref) > 30) {
+                    $text = substr($$textref, 0, 30) . "...";
+                } else {
+                    $text = $$textref;
+                }
+                $text =~ s/\n/ /isg;
+                print $outhandle " ($text)\n";
+            }
+        } elsif (!$found_something) {
+            if ($self->{'verbosity'} > 2) {
+            # may get to here if document contained no valid Section
+            # tags but did contain some comments. The text will have
+            # been processed already but we should print the warning
+            # as above and extract metadata
+            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
+            print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+            }
+        }
+    } # if $self->{'description_tags'}
+    else {
+        # remove header and footer
+        # if (!$self->{'keep_head'}) {
+        #    $$textref =~ s/^.*?<body[^>]*>//is;
+        #    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        # }
+        # single section document
+        # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+        # Important: to get the relative links to work,
+        # 1: use the below statement instead of the above one
+        # 2. cannot have process_section method.
+        # why?????
+        $self->SUPER::process(@_);
+    }
+    return 1;
+    }
+    $$textref = "<body" . $body_text;
+    # Wrap the whole page with <div id="wikispecificstyle"></div>
+    # keep the style of this website and don't mess up with the Greenstone styles
+    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
+    $$textref =~ s/<\/body>/<\/div><\/body>/is;
+    #$self->SUPER::process(@_);
+    $self->SUPER::process(@_);
+    return 1;
+}
-# note that process_section may be called multiple times for a single
-# section (relying on the fact that add_utf8_text appends the text to any
-# that may exist already).
-# sub process_section {
-#    my $self = shift (@_);
-#    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
-    # trap links
-    # if (!$self->{'nolinks'}) {
-    # usemap="./#index" not handled correctly => change to "#index"
-    # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
-        #$self->replace_usemap_links($1, $2, $3)/isge;
-    #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
-        #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
-    #}
-    # trap images
-    # allow spaces if inside quotes - jrm21
-    #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
-    #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
-    # add text to document object
-    # turn \ into \\ so that the rest of greenstone doesn't think there
-    # is an escape code following. (Macro parsing loses them...)
-#    $$textref =~ s/\\/\\\\/go;
-#    $doc_obj->add_utf8_text($cursection, $$textref);
-#}
 …
+}
+# Backslash-escape the path separator characters in a string.
+#
+# Every backslash is doubled and every forward slash gets a backslash put in
+# front of it; all other characters (including the remaining regexp
+# metacharacters) pass through untouched.  Presumably the result is meant to
+# be interpolated into a /.../-delimited pattern -- confirm against callers.
+#
+# Takes the string to escape as its only argument and returns the escaped
+# copy; the caller's variable is not modified.
+sub safe_escape_regexp
+{
+  my ($escaped) = @_;
+  # One pass is enough: both characters receive the same treatment (a
+  # backslash placed in front), and handling them together means the
+  # backslashes introduced for "/" are never themselves doubled.
+  $escaped =~ s{([\\/])}{\\$1}g;
+  return $escaped;
+}
+# Return the text of the _content_ macro found in Greenstone's about.dm.
+#
+# The macro file is $GSDLHOME/macros/about.dm (path assembled with
+# util::filename_cat).  Takes no arguments.
+#
+# Returns the matched macro text, running from "_content_ {" up to and
+# including the last "</div>\n\n</div>\n}" sequence in the file.  Returns the
+# empty string when the file cannot be opened or holds no such macro.
+sub read_content_from_about_dm
+{
+  my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
+
+  # This sub declares no $outhandle of its own, so the name refers to
+  # whatever variable of that name is in scope for the package.  Fall back
+  # to STDERR when it is unset instead of dying on an undefined filehandle.
+  my $out = defined $outhandle ? $outhandle : \*STDERR;
+
+  my $input;
+  if (!open($input, "<", $about_macro_file)) {
+    print $out "can't open file $about_macro_file\n";
+    return "";
+  }
+
+  # Slurp the whole file in one read; an empty file yields undef.
+  my $about_page_content = do { local $/ = undef; <$input> };
+  close($input);
+  $about_page_content = "" unless defined $about_page_content;
+
+  # Extract the _content_ macro.  The braces are escaped because a bare
+  # literal "{" in a pattern is rejected by newer perls, and the match is
+  # captured explicitly so that a failed match can never hand back stale
+  # text left in $& by some earlier, unrelated match.
+  if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
+    return $1;
+  }
+  return "";
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 14251

Legend:

gsdl/trunk/perllib/plugins/MediaWikiPlug.pm

Download in other formats: