Changeset 14251


Ignore:
Timestamp:
2007-07-16T10:22:59+12:00 (17 years ago)
Author:
anna
Message:

updated version, added comments

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/MediaWikiPlug.pm

    r14108 r14251  
    2424#
    2525###########################################################################
    26 # This plugin is to process an HTML file where sections are divided by
    27 # user-defined headings tags. As it is difficult to predict what user's definition
    28 # this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
    29 # as well as allows to get rid of user-defined Table of Content (TOC)...
    30 # format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
     26# This plugin is to process an HTML file from a MediaWiki website which was downloaded by
     27# the MediaWikiDownload plug. This plugin will trim MediaWiki functional sections like
     28# login, discussion, history, etc. Only the navigation and search section could be preserved.
     29# Searchbox will be modified to search the Greenstone collection instead of the website.
     30# It also can automatically add the table of contents on the website's Main_Page to the
     31# collection's Home page.
    3132
    3233package MediaWikiPlug;
    3334
    3435use HTMLPlug;
    35 use ImagePlug;
    36 use File::Copy;
     36# use ImagePlug;
     37# use File::Copy;
     38use unicode;
     39
    3740
    3841#use strict; # every perl program should have this!
     
    4043
    4144sub BEGIN {
    42     @MediaWikiPlug::ISA = ('HTMLPlug');
     45    @MediaWikiPlug::ISA = ('HTMLPlug');       
    4346}
    4447
    4548my $arguments =
    4649    [         
     50     # show the table of contents on collection's home page
    4751     { 'name' => "show_toc",
    4852       'desc' => "{MediaWikiPlug.show_toc}",
    4953       'type' => "flag",
    5054       'reqd' => "no"},
     55     # set to delete the table of contents section on each MediaWiki page
     56     { 'name' => "delete_toc",
     57       'desc' => "{MediaWikiPlug.delete_toc}",
     58       'type' => "flag",
     59       'reqd' => "no"},
     60     # regexp to match the table of contents
    5161     { 'name' => "toc_exp",
    5262       'desc' => "{MediaWikiPlug.toc_exp}",
    5363       'type' => "regexp",
    5464       'reqd' => "no",
    55        'deft' => "" },   
    56      { 'name' => "delete_toc",
    57        'desc' => "{MediaWikiPlug.delete_toc}",
    58        'type' => "flag",
    59        'reqd' => "no"},
     65       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" },       
     66     # set to delete the navigation section
    6067     { 'name' => "delete_nav",
    6168       'desc' => "{MediaWikiPlug.delete_nav}",
    6269       'type' => "flag",
    6370       'reqd' => "no",
    64        'deft' => ""},     
    65      { 'name' => "nav_exp",
    66        'desc' => "{MediaWikiPlug.nav_exp}",
     71       'deft' => ""},
     72     # regexp to match the navigation section   
     73     { 'name' => "nav_div_exp",
     74       'desc' => "{MediaWikiPlug.nav_div_exp}",
    6775       'type' => "regexp",
    6876       'reqd' => "no",
    69        'deft' => "" },
    70      { 'name' => "tag_sections",
    71        'desc' => "{MediaWikiPlug.tag_sections}",
     77       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
     78     # set to delete the searchbox section
     79     { 'name' => "delete_searchbox",
     80       'desc' => "{MediaWikiPlug.delete_searchbox}",
    7281       'type' => "flag",
    73        'reqd' => "no"},
    74      { 'name' => "description_tags",
    75        'desc' => "{HTMLPlug.description_tags}",
    76        'type' => "flag",
    77        'reqd' => "no"}       
     82       'reqd' => "no",
     83       'deft' => ""},
     84     # regexp to match the searchbox section
     85     { 'name' => "searchbox_div_exp",
     86       'desc' => "{MediaWikiPlug.searchbox_div_exp}",
     87       'type' => "regexp",
     88       'reqd' => "no",
     89       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},     
     90     # regexp to match title suffix
     91     # can't use the title_sub option in HTMLPlug instead
     92     # because title_sub always matches from the beginning     
     93     { 'name' => "remove_title_suffix_exp",
     94       'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
     95       'type' => "regexp",
     96       'reqd' => "no",
     97       'deft' => ""}
    7898     ];
    79 
    8099
    81100my $options = { 'name'     => "MediaWikiPlug",
     
    85104        'args'     => $arguments };
    86105
    87 
    88106sub new {
    89107    my ($class) = shift (@_);
     
    112130   
    113131    $head =~ m/<title>(.+)<\/title>/i;
    114     my $doctitle = $1 if defined $1;
     132    my $doctitle = $1 if defined $1;   
    115133   
    116134    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
     
    126144    # set the title here if we haven't found it yet
    127145    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {   
    128     if (defined $doctitle && $doctitle =~ /\S/) {
    129         $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
     146    if (defined $doctitle && $doctitle =~ /\S/) {               
     147            # remove suffix in title if required
     148            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
     149        if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
     150           $doctitle =~ s/$remove_suffix_exp//i;
     151        }       
     152        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
    130153    } else {
    131         $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
     154        $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
    132155    }
    133     }
    134    
    135     if(defined $base_dir && $base_dir ne ""){
    136     # find and download stylesheet
     156    }
     157
     158    # we are only interested in the column-contents div <div id="column-content">
     159    # remove header section, it may contain header images or additional search boxes
     160    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
     161    $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
     162   
     163    # remove timeline
     164    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;
     165   
     166    # remove extra bits
     167    my $extra_bits = "Retrieved from(.+)</a>\"";
     168    $body_text =~ s/$extra_bits//isg;
     169   
     170    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
     171    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
     172    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
     173    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;
     174   
     175    # get rid of the [edit] buttons
     176    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
     177    # get rid of the last time edit information at the bottom
     178    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;   
     179    # get rid of the (Redirected from ...)
     180    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg; 
     181   
     182    # escape texts macros
     183    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
     184    # may change the links, like Greenstone_Documentation_All.html, then change back
     185    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
     186   
     187    # define file delimiter for different platforms
     188    my $file_delimiter;
     189    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     190       $file_delimiter = "\\";
     191    } else {
     192       $file_delimiter = "/";           
     193    }   
     194   
     195    # IMPORTANT: different delimiter for $base_dir and $file
     196    # $base_dir use forward slash for both windows and linux
     197    # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import   
     198                                        # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
     199    # $file use different delimiters : forward slash for linux; backward slash for windows
     200    # print "\nfile : $file\n\n";         # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html   
     201                                        # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
     202   
     203    # get the base url for the MediaWiki website
     204    my $safe_delimiter = &safe_escape_regexp($file_delimiter);
     205    my @url_dirs=split($safe_delimiter, $file);
     206    my $url_base = $url_dirs[0];   
     207       
     208    # Re-check css files associated with MediaWiki pages
     209    if(defined $base_dir && $base_dir ne ""){   
    137210    my @css_files;
    138211    my $css_file_count = 0;
    139     # find all the style sheets imported with import statement
     212   
     213    # find all the stylesheets imported with @import statement 
    140214    while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
    141         $css_files[$css_file_count++] = $2 if defined $2;
    142     }   
     215        $css_files[$css_file_count++] = $2 if defined $2;
     216    }
     217       
     218    # download the stylesheets if we haven't downloaded them yet
     219        # add prefix to each style element, comment out the body element
     220        # and copy the files to collection's images folder
     221    for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {       
     222       
     223        my $css_file = $css_files[$css_file_count];       
     224       
     225        # remove prefix gli/cache directory                 
     226            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;
     227                       
     228            # change the \ delimiter in $css_file to / for consistency
     229            $css_file =~ s/\\/\//isg;
     230            if($css_file !~ /$url_base/) {
     231              $css_file = $url_base . $css_file; 
     232            }
     233           
     234            # trim the ? mark appended to the end of a stylesheet
     235        $css_file =~ s/\?(.+)$//isg; 
     236       
     237            my $css_file_path = &util::filename_cat($base_dir, $css_file);     
     238       
     239        # do nothing if we have already downloaded the css files
     240        if (! -e $css_file_path) {     
     241         
     242             # check the stylesheet's directory in the import folder
     243             # if the directory doesn't exist, create one           
     244         my @dirs = split(/\//i,$css_file);     
     245         my $path_check = "$base_dir/";           
     246         for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
     247        $path_check .= $dirs[$i] . "/";
     248        mkdir($path_check) if (! -d $path_check );
     249         }
     250         
     251             # NOTE: wget needs configuration to directly access Internet
     252             # These files should already be downloaded if we used the MediaWikiDownload             
     253         # downloading           
     254         $css_file = "http://$css_file";       
     255             print "\ndownloading : " . $css_file . "\n\n";
     256         system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
     257         if ($? != 0) {
     258              print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
     259              print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
     260              unlink("$css_file_path");
     261             }
     262            } # done with download
     263       
     264        # add a prefix "#wikispecificstyle" to each element
     265        # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
     266            # so we will wrap the web page with a div with id = wikispecificstyle
     267            my $css_content;
     268        if(open(INPUT, "<$css_file_path")){     
     269        while(my $line = <INPUT>){
     270                    # comment out the body element because we change the body to div
     271                    $line =~ s/^(\s*)body(\s*){(\s*)$/$1\/*body$2*\/{$3/isg;
     272                                       
     273            if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){                   
     274            $line = "#wikispecificstyle " . $line;
     275            }
     276            $css_content .= $line;
     277        }
     278        close(INPUT);           
     279        open(OUTPUT, ">$css_file_path");
     280        print OUTPUT $css_content;
     281        close(OUTPUT);
     282        }
     283           
     284            # Copy the modified stylesheets to collection's images folder
     285            # for future customization
     286            my $images_dir = $base_dir;
     287            $images_dir =~ s/import$/images/;
     288            $css_file =~ m/(.*)\/(.*)$/;
     289            $images_dir = &util::filename_cat($images_dir, $2);           
     290           
     291            if(open(OUTPUT, ">$images_dir")){   
     292              print OUTPUT $css_content;
     293              close(OUTPUT);
     294            }
     295    }
     296    }   
     297   
     298   
     299    # by default, only preserve navigation box and search box
     300    # others like toolbox, interaction, languages box, will be removed 
     301   
     302    # extract the larger part -- footer section
     303    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
     304    $body_text =~ /$print_footer/;
     305    my $footer = "";
     306    $footer = $& if defined $&;
     307    $footer =~ s/<\/body>//isg;
     308   
     309    # trim the comments first   
     310    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;
     311   
     312    # contain sections that are to be preserved
     313    my $preserve_sections = "";   
     314   
     315    # process the navigation section   
     316    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
     317    if (defined $self->{'nav_div_exp'}) {
     318      $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
     319    }
     320       
     321    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) { 
     322        # do nothing   
     323    } else {     
     324      if ($footer =~ m/$nav_match_exp/ig) {
     325        $preserve_sections = $& ;
     326      } else {
     327        print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
     328      }
     329      # if($preserve_sections =~/\S/){
     330      #  $preserve_sections .= "</div>";
     331      # }           
     332    }         
     333           
     334    # process the searchbox section       
     335    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
     336    if(defined $self->{'searchbox_div_exp'}) {               
     337        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
     338    }   
     339                       
     340    my $searchbox_section = "";   
     341    $footer =~ m/$searchbox_exp/ig;
     342    $searchbox_section = $& if defined $&;   
     343   
     344    # make the searchbox form work in Greenstone
     345    if($searchbox_section =~ /\S/){       
     346        # replace action
     347        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
     348               
     349        # remove buttons
     350        $searchbox_section =~ s/name="search"/name="q"/isg;
     351        $searchbox_section =~ s/name="go"//isg;
     352        $searchbox_section =~ s/name="fulltext"//isg;
     353               
     354        # get collection name from $base_dir for c param       
     355        $base_dir =~ m/\/collect\/(.+)\//i;
     356        my $collection_name = "";
     357        $collection_name = $1 if defined $1;
     358       
     359        # add Greenstone search params
     360        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
     361            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
     362            # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
     363            # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
     364       
     365        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;         
     366       
     367        # $searchbox_section .= "</div>";
     368    } else {
     369      print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
     370    }       
     371   
     372    # either delete or replace the searchbox
     373    if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
     374        # do nothing       
     375    } else {
     376        $preserve_sections .= "\n$searchbox_section\n";
     377    }   
     378   
     379   
     380    if($preserve_sections ne ""){
     381      $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
     382    }
     383    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";   
     384   
     385    $body_text =~ s/$print_footer/$preserve_sections/isg;
     386   
     387   
     388    # delete other forms in the page
     389    my @forms;
     390    my $form_count = 0;
     391    while($body_text =~ m/<form([^>]*)name=("|')([^>]*)("|')/isg){
     392        next if($3 eq "q");
     393        $forms[$form_count++] = $&;
     394    }
     395    foreach my $form (@forms) {     
     396      $body_text =~ s/$form[\s\S]*?<\/form>//m;
     397    }   
     398   
     399   
     400    # process links.
     401    # because current WGET 1.10 the -k and -E option doesn't work together
     402    # need to 'manually' convert the links to relative links
     403    # Dealing with 3 types of links:
     404    # -- outgoing links
     405    #   -- if we have downloaded the target files, link to the internal version (relative link)
     406    #   -- otherwise, link to the external version (absolute links)
     407    # -- in-page links (relative link)
     408   
     409    # NOTE: (important)
     410    #   must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
     411    #   otherwise, the internal links may have problems
     412   
     413    # remove the title attribute of <a> tag
     414    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
     415   
     416    # extract all the links
     417    my @links;
     418    my $link_count = 0;   
     419    while($body_text =~ m/(href|src)="([^>\s]*)$url_base\/([^>\s]*)"/ig){       
     420        $links[$link_count++] = "$1=\"$2$url_base/$3\"";       
     421    }
     422   
     423    foreach my $cur_link (@links) {     
     424        # escape greedy match + character
     425        $cur_link =~ s/\+/\\+/isg;
     426       
     427        $cur_link =~ m/(.+)"([^>]*)$url_base\/([^>\s]*)"/;         
     428        my $external_file_path = "$1\"http://$url_base/$3\"";
     429           
     430        $body_text =~ s/$cur_link/$external_file_path/i;
     431    }
     432             
     433    # tag links to new wiki pages as red   
     434    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
     435   
     436    # tag links to pages external of the MediaWiki website as blue
     437    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
     438       
     439   
     440    # process the table-of-contents section
     441    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file     
     442    # 1. read _content_ macro from about.dm
     443    # 2. append the toc, change all links to the Greenstone internal format for relative links
     444    # 3. write to the extra.dm
     445    # TODO: we assume the _about:content_ hasn't been specified before
     446    #       so needs to add function to handle when the macro is already in the extra.dm       
     447    if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
     448   
     449      # extract toc of the Main_Page             
     450      my $mainpage_toc = ""; 
     451      my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
     452      if($self->{'toc_exp'} =~ /\S/){
     453         $toc_exp = $self->{'toc_exp'};     
     454      }
     455      if($body_text =~ /$toc_exp/){                         
     456        $mainpage_toc = $&;
     457      }
     458       
     459      if($mainpage_toc =~ /\S/) {
     460       
     461        # change the in-page links to relative links, for example, change <a href="#section1"> to
     462        # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">           
     463        my $file_url_format = $file;
     464        $file_url_format =~ s/\\/\//isg;
     465    $file_url_format = "http://" . $file_url_format;
     466       
     467        # encode as URL, otherwise doesn't work on Windows
     468        $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
     469    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
     470       
     471       
     472        # read the collection's extra.dm   
     473        my $macro_path = $base_dir;
     474        $macro_path =~ s/import$/macros/;       
     475        my $extradm_file = &util::filename_cat($macro_path, "extra.dm");       
     476       
     477        my $extra_dm = "";
     478        if(open(INPUT, "<$extradm_file")){                 
     479        while(my $line = <INPUT>){
     480        $extra_dm .= $line;
     481        }           
     482        } else {
     483            print $outhandle "can't open file $extradm_file\n";
     484        }
     485        close(INPUT);
     486       
     487        # check whether we have changed the macros
     488        my @packages = split("package ", $extra_dm);
     489        my $about_package = "";
     490        foreach my $package (@packages) {
     491          $about_package = "package " . $package if($package =~ /^about/);
     492        }     
     493               
     494        my $update_extra_dm = 0;       
     495       
     496        if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){ 
     497       print $outhandle "_content_ macro already changed!!!!\n";
     498    }
     499        # if extra.dm doesn't have an "about package"
     500        elsif ($about_package !~ /\S/) {         
     501          # read _content_ macro from $GSDLHOME/macros/about.dm file         
     502      my $global_about_package = &read_content_from_about_dm();     
     503           
     504          # create the extra _content_ macro for this collection           
     505          # add the original content of the _content_ macro
     506          $global_about_package =~ m/{(.|\n)*<\/div>\n\n/;
     507         
     508          # append the new about package to extra.dm
     509          $extra_dm .= "\n\npackage about\n_content_$&\n\n";
     510          $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
     511         
     512          $update_extra_dm = 1;
     513        }
     514        # the about package exists, but either doesn't have the _content_ macro or
     515        # the _content_ macro doesn't contain the toc
     516        else {       
     517          # check if there is a content macro   
     518          my $content_macro_existed = 0;
     519          $content_macro_existed = ($about_package =~ /(\s*|\n)_content_(\s*){/);
     520           
     521          # if there is one
     522          # append a new section div for toc to the end of the document section                   
     523          if($content_macro_existed ==1) {
     524            $about_package =~ /(\s*|\n)_content_(\s*){(.|\n)*?}/;
     525            my $content_macro = $&;                         
     526            my $new_content_macro = $content_macro;
     527            $new_content_macro =~ s/<div[^>]*class="document">(.|\n)*<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;             
     528            $extra_dm =~ s/$content_macro/$new_content_macro/mg;                                   
     529          }
     530          # otherwise, append _content_ macro to the about package
     531          else {
     532            my $new_about_package = $about_package;           
     533            $content_macro = &read_content_from_about_dm();
     534            $content_macro =~ m/{(.|\n)*<\/div>\n\n/;           
     535           
     536            $new_about_package .= "\n\n_content_$&\n\n";
     537            $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";             
     538            $extra_dm =~ s/$about_package/$new_about_package/mg;   
     539          }
     540         
     541          # in either case, we need to update the extra.dm         
     542          $update_extra_dm = 1;
     543         }         
     544                 
     545         if($update_extra_dm==1){
     546            # write to the extra.dm file of the collection
     547            if (open(OUTPUT, ">$extradm_file")) {
     548                print OUTPUT $extra_dm;
     549            } else {
     550                print "can't open $extradm_file\n";
     551            }
     552            close(OUTPUT);
     553         }
     554      } else {
     555        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
     556      }
     557    }
    143558   
    144     # check whether the stylesheet exists
    145     # if not, download it and copy to the collection's images folder
    146     for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
    147         my $css_file = $css_files[$css_file_count];             
    148         $css_file =~ s/^(.+)gli\/cache\///i;
    149            
    150         my $css_file_path = "$base_dir/$css_file";     
    151        
    152         if (-e $css_file_path){ # the file already exists
    153             next;
    154         } 
    155        
    156         # check the css directory and create one if it's not there
    157         my @dirs = split(/\//i,$css_file);
    158         my $path_check = "$base_dir/";
    159         for(my $i = 0; $i < (scalar(@dirs)-1); $i++){                           
    160             $path_check .= $dirs[$i] . "/";         
    161             if(! -d $path_check ){
    162                 mkdir($path_check);
    163             }
    164         }       
    165        
    166         # download
    167         $css_file = "http://$css_file";     
    168         system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
    169         if ($? != 0) {unlink("$css_file_path");}
    170        
    171         # change every style element to #wikispecificstyle ...
    172         if(open(INPUT, "<$css_file_path")){
    173             my $css_content;
    174             while(my $line = <INPUT>){             
    175                 if($line =~ m/^(.+)\{/i){
    176                     $line = "#wikispecificstyle " . $line;             
    177                 }
    178                 $css_content .= $line;
    179             }
    180             close(INPUT);           
    181             open(OUTPUT, ">$css_file_path");
    182             print OUTPUT $css_content;
    183             close(OUTPUT);
    184         }
    185        
    186         # copy to images folder
    187         # do not copy, because collection can only have one specific stylesheet
    188         # better to add and modify the style sheets manually
    189         # @dirs = split(/\//i,$base_dir);                       
    190         # my $collection_base_dir;     
    191         # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
    192         #   $collection_base_dir .= $dirs[$i] . "/";           
    193         # }
    194         # my $images_folder = $collection_base_dir . "images/";
    195         # copy($css_file_path, $images_folder) || die "File cannot be copied.";
     559    # If delete_toc is set, remove the toc and its contents.   
     560    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
     561    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
     562          # print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
     563          if ($body_text =~ /$self->{'toc_exp'}/) {
     564        $body_text =~ s/$self->{'toc_exp'}//i;
     565          }
    196566    }
    197     }
    198    
    199     # add sections around h2 tag
    200     # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
    201     # add search box with each section     
    202     if ($self->{'tag_sections'}) {
    203     my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
    204     for(my $i=1; $i < scalar(@sections); $i++){
    205         my $section_title = $sections[$i];     
    206         $section_title =~ s/<([^>]*)>//g;
    207         $section_title =~ s/(^\s|\s$)//g;
    208         my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
    209         if($i !=1){
    210             $section_metadata = "</Section>\n" . $section_metadata;
    211         }
    212         $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
    213        
    214         $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
    215         $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
    216        
    217         $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
    218        
    219         if($i==scalar(@sections)-1) {
    220             # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
    221             $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
    222         }
    223     }
    224      }   
    225    
    226     # If delete_nav is enabled, it means to get rid of navigation contents.
    227     # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
    228     #   if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
    229     #       print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
    230     #       $body_text =~ s/$self->{'nav_exp'}//isg;       
    231     #   }
    232     #}
    233     my $searchbox = "";
    234     if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
    235     my $nav_match_express;
    236     if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
    237         $nav_match_express = $self->{'nav_exp'} ;
    238     } else { # default setting for mediawiki
    239         $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
    240     }
    241    
    242     print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
    243    
    244     # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;   
    245     $body_text =~ m/$nav_match_express/isg;
    246     my $navigate = $& if defined $&;     
    247    
    248     # find the search box and add it to the document page
    249     if(defined $navigate && $navigate =~ /\S/){
    250         $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
    251         $searchbox = $& . "\n<\/div>\n<\/div>";
    252         $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
    253         $searchbox =~ s/name="search"/name="q"/isg;
    254         $searchbox =~ s/name="go"//isg;
    255         $searchbox =~ s/name="fulltext"//isg;
    256         my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
    257                     ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
    258                     ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
    259                     ."<input type=\"hidden\" name=\"t\" value=\"1\">";
    260         $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;   
    261         $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
    262     }
    263    
    264     # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
    265     $body_text =~ s/$nav_match_express/$searchbox/isg;
    266     }
    267    
    268     if ($self->{'tag_sections'}) {
    269         $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
    270     }
    271    
    272     # Tidy up extra new lines
    273     $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    274     $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    275    
    276     $section_text .= "<!--\n<Section>\n-->\n";
    277     my $body = "<body".$body_text;
    278    
    279     $$textref = $body;
    280    
    281     # get the base dir for convert absolute links to relative links
    282     $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
    283     my $basedir = $2;   
    284        
    285     $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;   
    286     $$textref =~ s/(&nbsp;)+/&nbsp;/sg;     
    287    
    288     # get rid of the [edit] button
    289     $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
    290 
    291     # get rid of the last time edit information at the bottom
    292     $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
    293    
    294     # get rid of the (Redirected from ...)
    295     $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
    296    
    297     # escape macros
    298     $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    299     # may change the links, like Greenstone_Documentation_All.html, then change back
    300     $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;   
    301    
    302     # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
    303     # get rid of the title attribute of a tag
    304     $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
    305     # find the relative path of current directory         
    306     if($basedir ne ""){
    307         my @dirs=split("\/", $file);
    308         my $dirnum = scalar(@dirs);
    309         my $replace = "";
    310         for(my $i=0; $i<$dirnum-2; $i++){
    311             $replace .= "../";
    312         }
    313         # test if the linked relative file exists, if not, link to the internet version
    314         $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;               
    315         # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
    316         # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
    317         # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
    318        
    319         #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
    320             #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
    321         #   my $prefix = $1;
    322         #   my $link = $&;
    323         #   my $rel_file_name = $3;                     
    324         #   my $rel_link = "$replace$rel_file_name";
    325             # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
    326         #   if(-e $rel_link){
    327         #       $rel_link = "$prefix=\"$rel_link\"";
    328         #       $$textref =~ s/$link/$rel_link/i;
    329         #   }else{
    330         #       my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
    331                 # print $outhandle "external link==> $ext_link\n";
    332         #       $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
    333         #   }
    334         #}         
    335            
    336        
    337         # tag the link to new wiki pages as red
    338         $$textref =~ s/(href|src)="$replace([^>]*)&amp;action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&amp;action=edit$3"/gi;     
    339         $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
    340        
    341         # tag the link to external pages as blue
    342         $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;   
    343        
    344         #print $outhandle $$textref;
    345     }
    346    
    347     # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
    348     # 1. read _content_ macro from about.dm
    349     # 2. append the toc, change all links to the Greenstone internal format for relative links
    350     # 3. write to the extra.dm
    351     # TODO: currently we suppose the _about:content_ hasn't been specified before
    352     #       so needs to add function to handle when the macro is already in the extra.dm   
    353     if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
    354         my $macro_path = $base_dir;
    355         $macro_path =~ s/import$/macros/;
    356         my $extra_dm;
    357         my $extradm_file = "$macro_path/extra.dm";
    358         if(open(INPUT, "<$extradm_file")){     
    359             while(my $line = <INPUT>){
    360                 $extra_dm .= $line;
    361         }
    362             close(INPUT);                           
    363            
    364             if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){ 
    365                 print $outhandle "already changed!!!!\n";               
    366             } else {
    367                 # read _content_ macro from about.dm file
    368                 my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";               
    369                 my $about_page_content = "";
    370                 if(open(INPUT, "<$about_macro")){
    371                     while(my $line=<INPUT>){
    372                         $about_page_content .= $line;
    373                     }
    374                 }else{
    375                     print $outhandle "can't open file $about_macro\n";
    376                 }           
    377                 close(INPUT);
    378                
    379                 # extract the _content_ macro
    380                 $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
    381                 $about_page_content = $&;
    382                    
    383                 # extract toc of the Main_Page
    384                 my $mainpage_content = "";
    385                 if($self->{'toc_exp'} =~ /\S/){
    386                     $$textref =~ /$self->{'toc_exp'}/;
    387                     $mainpage_content = $&;
    388                 } else {
    389                     # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
    390                 }
    391                 # print $outhandle "---------\n$$textref\n--------\n\n";               
    392                 # print $outhandle "==========\n$mainpage_content\n==========\n\n";
    393                    
    394                 # add toc to the _content_ macro
    395                 $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
    396                 $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
    397                    
    398                 # change all links to the internal Greenstone relative link format
    399                 $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
    400                 $extra_dm =~ s/(\.\.\/)+/\//isg;
    401                 # print $outhandle "to add---------\n$extra_dm\n--------\n";
    402                        
    403                 # write to the extra.dm file of the collection
    404                 open(OUTPUT, ">$extradm_file");
    405                 print OUTPUT $extra_dm;
    406                 close(OUTPUT);         
    407             }   
    408         } else {
    409             print $outhandle "can't open file $extradm_file\n";
    410         }
    411     }
    412    
    413     # If delete_toc is enabled, it means to get rid of toc and tof contents.
    414     # get rid of TOC and TOF sections and their title
    415     if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
    416         if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
    417             # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;     
    418             # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;     
    419             # $body_text =~ s/$self->{'toc_exp'}//i;
    420             print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;   
    421             $$textref =~ s/$self->{'toc_exp'}//i;
    422         }   
    423     }
    424    
    425     # To add a layer on top of the wiki page
    426     # so as to keep the wiki style inside the wiki page
    427     # and keep the Greenstone style at the same time   
    428     $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;   
    429     $$textref =~ s/<\/body>/<\/div><\/body>/is;
    430    
    431     # tag with sections   
    432     $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
    433     $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
    434    
    435     #print $outhandle "\n\n$$textref\n\n";
    436    
    437     # use description tags   
    438     if ($self->{'description_tags'}) {
    439         my $cursection = $doc_obj->get_top_section();
    440         # remove the html header - note that doing this here means any
    441         # sections defined within the header will be lost (so all <Section>
    442         # tags must appear within the body of the HTML)
    443         my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
    444 
    445         $$textref =~ s/^.*?<body[^>]*>//is;
    446         $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    447 
    448         my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    449         my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    450 
    451         my $lt = '(?:<|&lt;)';
    452         my $gt = '(?:>|&gt;)';
    453         my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
    454 
    455         # my $dont_strip = '';
    456         # if ($self->{'no_strip_metadata_html'}) {
    457         #    ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
    458         # }
    459 
    460         my $found_something = 0;
    461         my $top = 1;
    462         while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
    463             my $text = $1;
    464             my $comment = $2;
    465             if (defined $text) {
    466                 # text before a comment - note that getting to here
    467                 # doesn't necessarily mean there are Section tags in
    468                 # the document
    469                 # print $outhandle "section text:\n$text\n";
    470                 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
    471             }
    472             while ($comment =~ s/$lt(.*?)$gt//s) {
    473                 my $tag = $1;
    474                 if ($tag eq "Section") {
    475                     $found_something = 1;
    476                     $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
    477                     $top = 0;
    478                 } elsif ($tag eq "/Section") {
    479                     $found_something = 1;
    480                     $cursection = $doc_obj->get_parent_section ($cursection);
    481                 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
    482                     my $metaname = $1;
    483                     my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
    484                     $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
    485                     my $metavalue = $1;
    486                     $metavalue =~ s/^\s+//;
    487                     $metavalue =~ s/\s+$//;
    488                     # assume that no metadata value intentionally includes
    489                     # carriage returns or HTML tags (if they're there they
    490                     # were probably introduced when converting to HTML from
    491                     # some other format).
    492                     # actually some people want to have html tags in their
    493                     # metadata.
    494                     $metavalue =~ s/[\cJ\cM]/ /sg;
    495                     # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
    496                     $metavalue =~ s/\s+/ /sg;
    497                     # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
    498                     if ($accumulate) {
    499                         $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
    500                     } else {
    501                         $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);   
    502                     }
    503                 } elsif ($tag eq "Description" || $tag eq "/Description") {
    504                     # do nothing with containing Description tags
    505                 } else {
    506                     # simple HTML tag (probably created by the conversion
    507                     # to HTML from some other format) - we'll ignore it and
    508                     # hope for the best ;-)
    509                 }
    510             }
    511         }# end while
    512 
    513         if ($cursection ne "") {
    514             print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
    515         }
    516 
    517         $$textref =~ s/^.*?<body[^>]*>//is;
    518         $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    519         if ($$textref =~ /\S/) {
    520             if (!$found_something) {
    521                 if ($self->{'verbosity'} > 2) {
    522                     print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
    523                     print $outhandle "          will be processed as a single section document\n";
    524                 }
    525    
    526                 # go ahead and process single-section document
    527                 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    528 
    529             } else {
    530                 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
    531                 print $outhandle "          of the final closing </Section> tag. This text will\n";
    532                 print $outhandle "          be ignored.";
    533 
    534                 my ($text);
    535                 if (length($$textref) > 30) {
    536                     $text = substr($$textref, 0, 30) . "...";
    537                 } else {
    538                     $text = $$textref;
    539                 }
    540                 $text =~ s/\n/ /isg;
    541                 print $outhandle " ($text)\n";
    542             }
    543         } elsif (!$found_something) {
    544             if ($self->{'verbosity'} > 2) {
    545             # may get to here if document contained no valid Section
    546             # tags but did contain some comments. The text will have
    547             # been processed already but we should print the warning
    548             # as above and extract metadata
    549             print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
    550             print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    551             }
    552         }
    553     } # if $self->{'description_tags'}
    554     else {
    555         # remove header and footer
    556         # if (!$self->{'keep_head'}) {
    557         #    $$textref =~ s/^.*?<body[^>]*>//is;
    558         #    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    559         # }
    560 
    561         # single section document
    562         # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);       
    563        
    564         # Important: to get the relative links to work,
    565         # 1: use the below statement instead of the above one
    566         # 2. cannot have process_section method.
    567         # why?????
    568         $self->SUPER::process(@_);
    569     }
    570     return 1;
     567    }       
     568   
     569    $$textref = "<body" . $body_text;
     570   
     571    # Wrap the whole page with <div id="wikispecificstyle"></div>
     572    # keep the style of this website and don't mess up with the Greenstone styles
     573    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
     574    $$textref =~ s/<\/body>/<\/div><\/body>/is;     
    571575           
    572     #$self->SUPER::process(@_);
     576    $self->SUPER::process(@_);
     577   
     578    return 1;
    573579}
    574 
    575 
    576 
    577 # note that process_section may be called multiple times for a single
    578 # section (relying on the fact that add_utf8_text appends the text to any
    579 # that may exist already).
    580 # sub process_section {
    581 #    my $self = shift (@_);
    582 #    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
    583 
    584     # trap links
    585     # if (!$self->{'nolinks'}) {
    586     # usemap="./#index" not handled correctly => change to "#index"
    587     # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    588         #$self->replace_usemap_links($1, $2, $3)/isge;
    589 
    590     #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    591         #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    592     #}
    593 
    594     # trap images
    595 
    596     # allow spaces if inside quotes - jrm21
    597     #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
    598     #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    599 
    600     # add text to document object
    601     # turn \ into \\ so that the rest of greenstone doesn't think there
    602     # is an escape code following. (Macro parsing loses them...)
    603 #    $$textref =~ s/\\/\\\\/go;
    604    
    605 #    $doc_obj->add_utf8_text($cursection, $$textref);
    606 #}
    607580
    608581
     
    651624}
    652625
     626sub safe_escape_regexp
     627{
     628  my $regexp = shift (@_);
     629 
     630  # if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     631    $regexp =~ s/\\/\\\\/isg;   
     632  #} else {
     633    $regexp =~ s/\//\\\//isg;         
     634  #}
     635  return $regexp;
     636}
     637
     638sub read_content_from_about_dm
     639{
     640  my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
     641  my $about_page_content = "";
     642  if (open(INPUT, "<$about_macro_file")){
     643    while (my $line=<INPUT>){
     644      $about_page_content .= $line;
     645    }
     646  } else {
     647    print $outhandle "can't open file $about_macro_file\n";
     648  }         
     649  close(INPUT);
     650           
     651  # extract the _content_ macro
     652  $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
     653  $about_page_content = $&;
     654 
     655  return $about_page_content;
     656}
     657
    6536581;
Note: See TracChangeset for help on using the changeset viewer.