Changeset 14270

gsdl/branches/gsdl-2.74/bin/script/buildcol.pl

-              r14197
+              r14270
 my $arguments =
+    [ { 'name' => "disable_OAI",
+    'desc' => "{buildcol.disable_OAI}",
+    'type' => "flag",
+    'reqd' => "no",
+    'modegli' => "2" },
+      { 'name' => "remove_empty_classifications",
+    [ { 'name' => "remove_empty_classifications",
     'desc' => "{buildcol.remove_empty_classifications}",
     'type' => "flag",
 …
     'type' => "flag",
     'reqd' => "no",
+    'hiddengli' => "yes" }
+    'hiddengli' => "yes" },
+      { 'name' => "disable_OAI",
+          'desc' => "{buildcol.disable_OAI}",
+          'type' => "flag",
+          'reqd' => "no",
+          'modegli' => "2",
+      'hiddengli' => "yes" }
 #      { 'name' => "incremental_dlc",
 …
     unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");
+    # read the configuration file (for gs2)
+    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
+    # Read in the collection configuration file.
     my ($collectcfg, $buildtype);
+    if (-e $configfilename) {
+      $collectcfg = &colcfg::read_collect_cfg ($configfilename);
+      $gs_mode = "gs2";
+    }
+    else {
+      # If it is gs3
+      $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collectionConfig.xml");
+      if (!-e $configfilename) {
+    &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename) && die;
+      }
+      else {
+    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
+    if ($gs_mode eq "gs2") {
+        $collectcfg = &colcfg::read_collect_cfg ($configfilename);
+    } elsif ($gs_mode eq "gs3") {
     $collectcfg = &colcfg::read_collection_cfg_xml ($configfilename);
+    $gs_mode = "gs3";
+      }
+    }
+    }
     if ($verbosity !~ /\d+/) {
     if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
 …
         $remove_empty_classifications = 1;
+    }
+    }
+    }
     if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
 …
     $gli = 0 unless defined $gli;
+    # If the disable_OAI flag is not present, the option $disable_OAI with the value of 0 will be passed to basebuilder.pm
     $disable_OAI = 0 unless defined $disable_OAI;
     # New argument to track whether build is incremental
     $incremental = 0 unless defined $incremental;
 …
     # if a builder class has been created for this collection, use it
     # otherwise, use the mg or mgpp builder
+    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
+    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
+    $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
+    $buildertype = "custombuilder";
+    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
+    $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
+    $buildertype = "custombuilder";
+    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
     $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
     $buildertype = "${collection}builder";

gsdl/branches/gsdl-2.74/bin/script/downloadfrom.pl

-              r12903
+              r14270
         'desc' => "{downloadfrom.download_mode.Web}",
     'downloadname' => "WebDownload" },
+      { 'name' => "MediaWiki",
+        'desc' => "{downloadfrom.download_mode.MediaWiki}",
+        'downloadname' => "MediaWikiDownload" },
       { 'name' => "OAI",
         'desc' => "{downloadfrom.download_mode.OAI}",

gsdl/branches/gsdl-2.74/bin/script/gti.pl

-              r13948
+              r14270
+my $anonymous_cvs_root = ":pserver:cvs_anon\@cvs.scms.waikato.ac.nz:2402/usr/local/global-cvs/gsdl-src";
+#my $anonymous_cvs_root = ":pserver:cvs_anon\@cvs.scms.waikato.ac.nz:2402/usr/local/global-cvs/gsdl-src";
+#my $anonymous_svn_root = "http://svn.greenstone.org/gsdl/trunk/";
 my $gsdl_root_directory = "$ENV{'GSDLHOME'}";
 my $gti_log_file = &util::filename_cat($gsdl_root_directory, "etc", "gti.log");
 …
     # 'target_file' => "gsdl-documentation/tutorials/xml-source/tutorial_{target_language_code}.xml" },
       # Greenstone.org
+      # new Greenstone.org
       { 'key' => "greenorg",
+    'file_type' => "macrofile",
+    'source_file' => "greenorg/macros/english.dm",
+    'target_file' => "greenorg/macros/{iso_639_1_target_language_name}.dm" }
+        'file_type' => "resource_bundle",
+        'source_file' => "greenstoneorg/website/classes/Gsc.properties",
+        'target_file' => "greenstoneorg/website/classes/Gsc_{iso_639_1_target_language_name}.properties"
+    # 'file_type' => "macrofile",
+    # 'source_file' => "greenorg/macros/english.dm",
+    # 'target_file' => "greenorg/macros/{iso_639_1_target_language_name}.dm"
+      }
     ];
 …
+    }
     if ($gti_command =~ /^get-first-n-chunks-requiring-work$/i) {
     print &get_first_n_chunks_requiring_work(@gti_command_arguments);
+    print &get_first_n_chunks_requiring_work(@gti_command_arguments);
+    }
     if ($gti_command =~ /^get-language-status$/i) {
     print &get_language_status(@gti_command_arguments);
+    print &get_language_status(@gti_command_arguments);
+    }
     if ($gti_command =~ /^search-chunks$/i) {
 …
     my @source_file_lines = &read_file_lines($source_file_path);
     my %source_file_key_to_line_mapping = &build_key_to_line_mapping(\@source_file_lines, $translation_file_type);
     my $target_file_path = &util::filename_cat($gsdl_root_directory, $target_file);
     my @target_file_lines = &read_file_lines($target_file_path);
 …
     my $source_file_chunk_date = $source_file_key_to_last_update_date_mapping{$chunk_key};
     my $source_file_chunk_text = &make_text_xml_safe($source_file_key_to_text_mapping{$chunk_key});
+    if(!defined $source_file_chunk_date){
+        $source_file_chunk_date = "";
+    }
     $xml_response .= "    <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n";
     $xml_response .= "      <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n";
+    $xml_response .= "      <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n";
     $xml_response .= "      <TargetFileText></TargetFileText>\n";
     $xml_response .= "    </Chunk>\n";
 …
     my $target_file_chunk_date = $target_file_key_to_last_update_date_mapping{$chunk_key};
     my $target_file_chunk_text = &make_text_xml_safe($target_file_key_to_text_mapping{$chunk_key});
+    $xml_response .= "    <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n";
+    if(!defined $source_file_chunk_date){
+        $source_file_chunk_date = "";
+    }
+    $xml_response .= "    <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n";
     $xml_response .= "      <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n";
     $xml_response .= "      <TargetFileText date=\"$target_file_chunk_date\">$target_file_chunk_text</TargetFileText>\n";
 …
     # The "2>/dev/null" is very important! If it is missing this will never return when run from the receptionist
     # unless ($translation_file_is_not_in_cvs) {
+    my $source_file_cvs_status = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root update $source_file 2>/dev/null`;
+    #my $source_file_cvs_status = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root update $source_file 2>/dev/null`;
+        my $source_file_cvs_status = `cd $gsdl_root_directory; svn status $source_file 2>/dev/null`;
     if ($source_file_cvs_status =~ /^C /) {
         &throw_fatal_error("Source file $source_file_path conflicts with the repository.");
 …
     my $chunk_cvs_date = $key_to_cvs_date_mapping{$chunk_key};
     $key_to_last_update_date_mapping{$chunk_key} = $chunk_cvs_date;
     # If a comment date exists and it is after the CVS date, use that instead
+        # need to convert the comment date format to SVN format
     my $chunk_gti_comment = $key_to_gti_comment_mapping{$chunk_key};
     if (defined($chunk_gti_comment) && $chunk_gti_comment =~ /(\d?\d-\D\D\D-\d\d\d\d)/) {
         my $chunk_comment_date = $1;
+        my $chunk_comment_date = $1;
         if ((!defined($chunk_cvs_date) || &is_date_after($chunk_comment_date, $chunk_cvs_date))) {
         $key_to_last_update_date_mapping{$chunk_key} = $chunk_comment_date;
 …
     # Use CVS to annotate each line of the file with the date it was last edited
     # The "2>/dev/null" is very important! If it is missing this will never return when run from the receptionist
+    my $cvs_annotated_file = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root annotate -F $filename 2>/dev/null`;
+    # my $cvs_annotated_file = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root annotate -F $filename 2>/dev/null`;
+    # my $cvs_annotated_file = `cd $gsdl_root_directory; export PATH=.:/research/lh92/programs/subversion/bin; svn annotate -v --force $filename`;
+    my $cvs_annotated_file = `cd $gsdl_root_directory; svn annotate -v $filename`;
     my @cvs_annotated_file_lines = split(/\n/, $cvs_annotated_file);
 …
     foreach my $cvs_annotated_file_line (@cvs_annotated_file_lines) {
     # Extract the date from the CVS annotation at the front
+    $cvs_annotated_file_line =~ s/^\S+\s+\(\S+\s+(\S+)\):\s//;
+    push(@cvs_annotated_file_lines_date, $1);
+    }
+        # cvs format : 07-Jun-02
+        # svn format : 2007-07-16
+    # $cvs_annotated_file_line =~ s/^\S+\s+\(\S+\s+(\S+)\):\s//;
+        $cvs_annotated_file_line =~ s/^\s+\S+\s+\S+\s(\S+)//;
+        push(@cvs_annotated_file_lines_date, $1);
+        # trim extra date information in svn annotation format
+        # 15:42:49 +1200 (Wed, 21 Jun 2006)
+        $cvs_annotated_file_line =~ s/^\s+\S+\s\S+\s\((.+?)\)\s//;
+    }
     # Build a key to line mapping for the CVS annotated file, for matching the chunk key to the CVS date
     my %key_to_line_mapping = &build_key_to_line_mapping(\@cvs_annotated_file_lines, $translation_file_type);
     my %key_to_cvs_date_mapping = ();
     foreach my $chunk_key (keys(%key_to_line_mapping)) {
     my $chunk_starting_line = (split(/-/, $key_to_line_mapping{$chunk_key}))[0];
     my $chunk_finishing_line = (split(/-/, $key_to_line_mapping{$chunk_key}))[1];
     # Find the date this chunk was last edited, from the CVS annotation
     my $chunk_date = $cvs_annotated_file_lines_date[$chunk_starting_line];
+    my $chunk_date = $cvs_annotated_file_lines_date[$chunk_starting_line];
     for (my $l = ($chunk_starting_line + 1); $l <= $chunk_finishing_line; $l++) {
         if (&is_date_after($cvs_annotated_file_lines_date[$l], $chunk_date)) {
         # This part of the chunk has been updated more recently
         $chunk_date = $cvs_annotated_file_lines_date[$l];
+        }
+    }
 …
     my $source_chunk_last_update_date = $source_file_key_to_last_update_date_mapping->{$chunk_key};
     my $target_chunk_last_update_date = $target_file_key_to_last_update_date_mapping->{$chunk_key};
+    if (defined($target_chunk_last_update_date) && &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date)) {
+        # print "key: $chunk_key\nsource date : $source_chunk_last_update_date\ntarget date : $target_chunk_last_update_date\nafter? ". &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date) . "\n\n";
+        if (defined($target_chunk_last_update_date) && &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date)) {
         # &log_message("Chunk with key $chunk_key needs updating.");
         push(@target_file_keys_requiring_updating, $chunk_key);
 …
 # Returns 1 if $date1 is after $date2, 0 otherwise
 sub is_date_after
+sub is_date_after_cvs
+{
     my ($date1, $date2) = @_;
 …
           "Jul", 7, "Aug", 8, "Sep", 9, "Oct", 10, "Nov", 11, "Dec", 12);
+    if(!defined $date1) {
+        return 1;
+    }
     my @date1parts = split(/-/, $date1);
     my @date2parts = split(/-/, $date2);
 …
     my $year1 = $date1parts[2];
     if ($year1 < 80) {
     $year1 += 2000;
+        $year1 += 2000;
+    }
     my $year2 = $date2parts[2];
     if ($year2 < 80) {
     $year2 += 2000;
+        $year2 += 2000;
+    }
 …
+    }
+    return 0;
+}
+sub is_date_after  # Returns 1 if $date1 is later than $date2 (compared by year, then month, then day), 0 otherwise
+{
+    my ($date1, $date2) = @_;
+    if(!defined $date1) {
+      return 1;  # an undefined $date1 is reported as "after", whatever $date2 is
+    }
+    if(!defined $date2) {
+      return 0;  # $date1 is defined but there is no $date2 to compare it with
+    }
+    # A date written day-Mon-year (e.g. 16-Aug-2006) is rewritten as year-month-day so both dates share one layout
+    if($date1=~ /(\d+?)-(\S\S\S)-(\d\d\d\d)/){
+       my %months = ("Jan", "01", "Feb", "02", "Mar", "03", "Apr",  "04", "May",  "05", "Jun",  "06",
+          "Jul", "07", "Aug", "08", "Sep", "09", "Oct", "10", "Nov", "11", "Dec", "12");
+       $date1=$3 . "-" . $months{$2} . "-" . $1;
+       # print "** converted date1: $date1\n";
+    }
+    if($date2=~ /(\d+?)-(\S\S\S)-(\d\d\d\d)/){
+       my %months = ("Jan", "01", "Feb", "02", "Mar", "03", "Apr",  "04", "May",  "05", "Jun",  "06",
+          "Jul", "07", "Aug", "08", "Sep", "09", "Oct", "10", "Nov", "11", "Dec", "12");
+       $date2=$3 . "-" . $months{$2} . "-" . $1;
+       # print "** converted date2: $date2\n";
+    }
+    # From here on both dates are treated as year-month-day (e.g. 2006-08-16) and split on "-" into [year, month, day]
+    my @date1parts = split(/-/, $date1);
+    my @date2parts = split(/-/, $date2);
+    # Compare year (all comparisons below are numeric)
+    if ($date1parts[0] > $date2parts[0]) {
+    return 1;
+    }
+    elsif ($date1parts[0] == $date2parts[0]) {
+    # Year is the same, so compare month
+    if ($date1parts[1] > $date2parts[1]) {
+        return 1;
+    }
+    elsif ($date1parts[1] == $date2parts[1]) {
+        # Month is the same, so compare day
+        if ($date1parts[2] > $date2parts[2]) {
+        return 1;
+        }
+    }
+    }
     return 0;  # $date1 is the same day as, or earlier than, $date2
+}

gsdl/branches/gsdl-2.74/bin/script/mkcol.pl

r14032	r14270
76	76	'reqd' => "no" },
77	77	{ 'name' => "gs3mode",
78		'desc' => "",
	78	'desc' => "mkcol.gs3mode",
79	79	'type' => "flag",
80	80	'reqd' => "no" },

gsdl/branches/gsdl-2.74/cgi-bin/gliserver.pl

-              r14025
+              r14270
 #!perl -w
 # Need to specify the full path of Perl above
-use gsdlCGI;
 use strict;
+# Set this to 1 to work around IIS 6 craziness
+my $iis6_mode = 0;
+# IIS 6: for some reason, IIS runs this script with the working directory set to the Greenstone
+#   directory rather than the cgi-bin directory, causing lots of stuff to fail
+if ($iis6_mode)
+{
+    # Change into cgi-bin directory
+    chdir("cgi-bin");
+}
+# We use require and an eval here (instead of "use") to catch any errors loading the module (for IIS)
+eval("require \"gsdlCGI.pm\"");
+if ($@)
+{
+    print STDOUT "Content-type:text/plain\n\n";
+    print STDOUT "ERROR: $@\n";
+    exit 0;
+}
 …
 sub authenticate_user
+{
     my $gsdl_cgi = shift(@_);
     my $username = shift(@_);
 …
     my $installation_status = "";
+    print STDOUT "Content-type:text/plain\n\n";
     # Check that Java is installed and accessible
     my $java = $gsdl_cgi->get_java_path();
     my $java_command = "$java -version 2>&1";
+    # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go
+    #   directly out to the page
+    if ($iis6_mode)
+    {
+    $java_command = "java -version";
+    }
     my $java_output = `$java_command`;
     my $java_status = $?;
 …
     if ($installation_ok) {
     $gsdl_cgi->generate_ok_message($installation_status . "\nInstallation OK!");
+    print STDOUT $installation_status . "\nInstallation OK!";
+    }
     else {
     $gsdl_cgi->generate_error($installation_status);
+    print STDOUT $installation_status;
+    }
+}
 …
+    }
-    print STDOUT "Content-type:text/plain\n\n";
     foreach my $cgi_arg_name ($gsdl_cgi->param) {
     my $cgi_arg_value = $gsdl_cgi->clean_param($cgi_arg_name) || "";
 …
+    }
+    print STDOUT "Content-type:text/plain\n\n";
     my $perl_command = "perl -S $script $perl_args 2>&1";
+    # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go
+    #   directly out to the page
+    if ($iis6_mode)
+    {
+    $perl_command = "perl -S $script $perl_args";
+    }
     my $perl_output = `$perl_command`;
     my $perl_status = $?;
 …
+    }
+    print STDOUT "Content-type:text/plain\n\n";
+    print STDOUT $perl_output;
+    if (defined($perl_output))
+    {
+    print STDOUT $perl_output;
+    }
+}
 …
+    }
+    print STDOUT "Content-type:text/plain\n\n";
     my $perl_command = "perl -S $script $perl_args 2>&1";
+    # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go
+    #   directly out to the page
+    if ($iis6_mode)
+    {
+    $perl_command = "perl -S $script $perl_args";
+    }
     if (!open(PIN, "$perl_command |")) {
     $gsdl_cgi->generate_error("Unable to execute command: $perl_command");
+    }
-    print STDOUT "Content-type:text/plain\n\n";
     while (defined (my $perl_output_line = <PIN>)) {
     print STDOUT $perl_output_line;
 …
     # Read the uploaded data and write it out to file
+    # We have to pass the size of the uploaded data in the "fs" argument because IIS 6 seems to be
+    #   completely incapable of working this out otherwise (causing the old code to crash)
     my $buf;
     my $num_bytes = 0;
+    my $num_bytes_remaining = $gsdl_cgi->clean_param("fs");
+    my $bytes_to_read = $num_bytes_remaining;
+    if ($bytes_to_read > 1024) { $bytes_to_read = 1024; }
     binmode(FOUT);
     while (read(STDIN, $buf, 1024) > 0) {
+    while (read(STDIN, $buf, $bytes_to_read) > 0) {
     print FOUT $buf;
     $num_bytes += length($buf);
+    $num_bytes_remaining -= length($buf);
+    $bytes_to_read = $num_bytes_remaining;
+    if ($bytes_to_read > 1024) { $bytes_to_read = 1024; }
+    }
     close(FOUT);

gsdl/branches/gsdl-2.74/cgi-bin/gsdlCGI.pm

r14024	r14270
101	101	print STDOUT $full_mess;
102	102
103		~~die $full_mess~~;
	103	exit 0;
104	104	}
105	105

gsdl/branches/gsdl-2.74/macros/style.dm

-              r13429
+              r14270
 # _pagetitle_
 # _globalscripts_
+_htmlhead_ {<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
+_htmlhead_ {<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+"http://www.w3.org/TR/html4/loose.dtd">
 <html_htmlextra_>

gsdl/branches/gsdl-2.74/perllib/basebuilder.pm

-              r14212
+              r14270
 package basebuilder;
+use strict;
+no strict 'refs'; # allow filehandles to be variables and viceversa
 use classify;
 use cfgread;
 …
     $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;
     $outhandle = STDERR unless defined $outhandle;
+    $outhandle = *STDERR unless defined $outhandle;
     $no_text = 0 unless defined $no_text;
     $failhandle = STDERR unless defined $failhandle;
+    $failhandle = *STDERR unless defined $failhandle;
     # create a builder object
 …
     $self->{'gli'} = 0 unless defined $self->{'gli'};
+    # disable_OAI applies to Greenstone 3 only and is only passed to &colcfg::write_build_cfg_xml (then cfgread4gs3::write_cfg_file) when writing the buildConfig.xml
     $self->{'disable_OAI'} = 0 unless defined $self->{'disable_OAI'};
+    # read in the collection configuration file
+    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
+    if (-e $colcfgname) {
+      ##$self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);
+      $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
+      $gs_mode = "gs2";
+    }
+    else {
+      my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collectionConfig.xml";
+      if (!-e $colcfgname) {
+    die "mgbuilder::new - couldn't find collectionConfig.xml for collection $collection\n";
+      }
+      else {
+    #$self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
+        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);
+    $gs_mode = "gs3";
+      }
+    # Read in the collection configuration file.
+    my ($colcfgname);
+    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
+    if ($gs_mode eq "gs2") {
+        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
+    } elsif ($gs_mode eq "gs3") {
+    $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);
+    }
 …
     my ($buildprocdir, $buildproctype);
     my $collection = $self->{'collection'};
+    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
+    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
+    $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
+    $buildproctype = "custombuildproc";
+    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
+    $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
+    $buildproctype = "custombuildproc";
+    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
     $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
     $buildproctype = "${collection}buildproc";
 …
     $self->{'maxnumeric'} = $maxnumeric;
+}
-# It seems we don't need this sub
-#sub set_disable_OAI {
-#    my $disable_OAI = shift (@_);
-#    my ($disable_OAI = @_;
+#
-#    $self->{'disable_OAI'} = $disable_OAI;
-#}
 sub set_strip_html {
     my $self = shift (@_);
 …
     # and their directory names (includes subcolls and langs)
     $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
+    my $indexmap = $self->{'index_mapping'}->{'indexmap'};
     # build each of the indexes
     foreach my $index (@$indexes) {
 …
     my ($handle);
     if ($self->{'debug'}) {
     $handle = STDOUT;
+    $handle = *STDOUT;
     } else {
     if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
 …
         die "builder::make_infodatabase - couldn't run $txt2db_exe\n";
+    }
     $handle = basebuilder::PIPEOUT;
+    $handle = *PIPEOUT;
+    }
 …
     $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
     $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();
+    # store whether to disable OAI service
+    $build_cfg->{'disable_OAI'} = $self->{'disable_OAI'};
     # store the mapping between the index names and the directory names
     # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
 …
     if ($gs_mode eq "gs2") {
-      #&colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});
       &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
+    }
     if ($gs_mode eq "gs3") {
+      #&colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
+      &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});
+      &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'}, $self->{'disable_OAI'});
+    }

gsdl/branches/gsdl-2.74/perllib/cfgread4gs3.pm

-              r14200
+              r14270
+    }
     print "*** collectionConfig.xml internal ***\n";
     &Display;
+    #print "*** collectionConfig.xml internal ***\n";
+    #&Display;
     return $data;
+}
 …
 # Create the buildConfig.xml file for a specific collection
 sub write_cfg_file {
     # this sub is called make_auxiliary_files() in basebuilder.pm
+    # this sub is called in make_auxiliary_files() in basebuilder.pm
     # the received args: $buildoutfile - destination file: buildConfig.xml
     #                    $buildcfg - all build options, eg, disable_OAI
     #                    $collectcfg - contents of collectionConfig.xml read in by read_cfg_file sub in cfgread4gs3.pm.
     my ($buildoutfile, $buildcfg, $collectcfg) = @_;
+    my ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI) = @_;
     my $line = [];
     if (!open (COLCFG, ">$buildoutfile")) {
     print STDERR "cfgread::write_cfg_file couldn't write the cfg file $buildoutfile\n";
+    print STDERR "cfgread4gs3::write_cfg_file couldn't write the build config file $buildoutfile\n";
     die;
+    }
 …
     # This serviceRack enables the collection to provide the oai metadata retrieve service, which is served by the OAIPMH.java class
     # For each collection, we write the following serviceRack in the collection's buildConfig.xml file as follows if the 'disable_OAI' argument is not ticked in GLI (or equivalently, a 'disable_OAI' flag is not specified on the command line). There are also other configurations in the OAIConfig.xml.
     if ($buildcfg->{'disable_OAI'} == 0) {
+    # For each collection, we write the following serviceRack in the collection's buildConfig.xml file if the 'disable_OAI' argument is not checked in the GLI (or equivalently, a 'disable_OAI' flag is not specified on the command line). There are also other configurations in the OAIConfig.xml.
+    if ($disable_OAI == 0) {
       &write_line('COLCFG', ["<serviceRack name=\"OAIPMH\">"]);
       if (defined $buildcfg->{'indexstem'}) {

gsdl/branches/gsdl-2.74/perllib/classify.pm

-              r14112
+              r14270
     # find the classifier
+    my $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
+    my $customclassname;
+    if (defined($ENV{'GSDLCOLLECTION'}))
+    {
+    $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                                               "perllib", "classify", "${classifier}.pm");
+    }
     my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classifier}.pm");
     my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classifier}.pm");
     if (-e $customclassname) { require $customclassname; }
+    if (defined($customclassname) && -e $customclassname) { require $customclassname; }
     elsif (-e $colclassname) { require $colclassname; }
     elsif (-e $mainclassname) { require $mainclassname; }

gsdl/branches/gsdl-2.74/perllib/colcfg.pm

-              r14115
+              r14270
+}
 sub write_build_cfg_xml {
     my ($buildoutfile, $buildcfg, $collectcfg) = @_;
+    my ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI) = @_;
     return &cfgread4gs3::write_cfg_file ($buildoutfile, $buildcfg, $collectcfg);
+    return &cfgread4gs3::write_cfg_file ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI);
+}
 …
     return &cfgread::read_cfg_file ($filename,
            q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/,
+           q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/,
            q/^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields|indexfieldmap|indexlevels|levelmap)$/);
 …
     &cfgread::write_cfg_file($filename, $data,
            q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/,
+           q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/,
            q/^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields|indexfieldmap|indexlevels|levelmap)$/);
+}

gsdl/branches/gsdl-2.74/perllib/plugin.pm

-              r14112
+              r14270
     # find the plugin
+    my $customplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
+    my $customplugname;
+    if (defined($ENV{'GSDLCOLLECTION'}))
+    {
+    $customplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'},
                                              'perllib', 'plugins', "${pluginname}.pm");
+    }
     my $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'plugins',
                       "${pluginname}.pm");
     my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'}, 'perllib', 'plugins',
                        "${pluginname}.pm");
     if (-e $customplugname) { require $customplugname; }
+    if (defined($customplugname) && -e $customplugname) { require $customplugname; }
     elsif (-e $colplugname) { require $colplugname; }
     elsif (-e $mainplugname) { require $mainplugname; }

gsdl/branches/gsdl-2.74/perllib/plugins/HTMLPlug.pm

-              r14089
+              r14270
     foreach my $field (split /,/, $self->{'metadata_fields'}) {
+        $field =~ s/^\s+//; # remove leading whitespace
+        $field =~ s/\s+$//; # remove trailing whitespace
     # support tag<tagname>
     if ($field =~ /^(.*?)<(.*?)>$/) {

gsdl/branches/gsdl-2.74/perllib/plugins/MediaWikiPlug.pm

-              r14108
+              r14270
+#
 ###########################################################################
+# This plugin is to process an HTML file where sections are divided by
+# user-defined headings tags. As it is difficult to predict what user's definition
+# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
+# as well as allows to get rid of user-defined Table of Content (TOC)...
+# format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
+# This plugin processes an HTML file from a MediaWiki website that was downloaded by
+# the MediaWikiDownload plug. It trims MediaWiki functional sections such as
+# login, discussion, history, etc. Only the navigation and search sections can be preserved.
+# The search box is modified to search the Greenstone collection instead of the website.
+# It can also automatically add the table of contents on the website's Main_Page to the
+# collection's Home page.
 package MediaWikiPlug;
 use HTMLPlug;
+use ImagePlug;
+use File::Copy;
+# use ImagePlug;
+# use File::Copy;
+use unicode;
 #use strict; # every perl program should have this!
 …
 sub BEGIN {
     @MediaWikiPlug::ISA = ('HTMLPlug');
+    @MediaWikiPlug::ISA = ('HTMLPlug');
+}
 my $arguments =
+    [
+     # show the table of contents on collection's home page
      { 'name' => "show_toc",
        'desc' => "{MediaWikiPlug.show_toc}",
        'type' => "flag",
        'reqd' => "no"},
+     # set to delete the table of contents section on each MediaWiki page
+     { 'name' => "delete_toc",
+       'desc' => "{MediaWikiPlug.delete_toc}",
+       'type' => "flag",
+       'reqd' => "no"},
+     # regexp to match the table of contents
      { 'name' => "toc_exp",
        'desc' => "{MediaWikiPlug.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
+       'deft' => "" },
+     { 'name' => "delete_toc",
+       'desc' => "{MediaWikiPlug.delete_toc}",
+       'type' => "flag",
+       'reqd' => "no"},
+       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" },
+     # set to delete the navigation section
      { 'name' => "delete_nav",
        'desc' => "{MediaWikiPlug.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
+       'deft' => ""},
+     { 'name' => "nav_exp",
+       'desc' => "{MediaWikiPlug.nav_exp}",
+       'deft' => ""},
+     # regexp to match the navigation section
+     { 'name' => "nav_div_exp",
+       'desc' => "{MediaWikiPlug.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
+       'deft' => "" },
+     { 'name' => "tag_sections",
+       'desc' => "{MediaWikiPlug.tag_sections}",
+       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
+     # set to delete the searchbox section
+     { 'name' => "delete_searchbox",
+       'desc' => "{MediaWikiPlug.delete_searchbox}",
        'type' => "flag",
+       'reqd' => "no"},
+     { 'name' => "description_tags",
+       'desc' => "{HTMLPlug.description_tags}",
+       'type' => "flag",
+       'reqd' => "no"}
+       'reqd' => "no",
+       'deft' => ""},
+     # regexp to match the searchbox section
+     { 'name' => "searchbox_div_exp",
+       'desc' => "{MediaWikiPlug.searchbox_div_exp}",
+       'type' => "regexp",
+       'reqd' => "no",
+       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
+     # regexp to match title suffix
+     # can't use the title_sub option in HTMLPlug instead
+     # because title_sub always matches from the beginning
+     { 'name' => "remove_title_suffix_exp",
+       'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
+       'type' => "regexp",
+       'reqd' => "no",
+       'deft' => ""}
      ];
 my $options = { 'name'     => "MediaWikiPlug",
 …
         'args'     => $arguments };
 sub new {
     my ($class) = shift (@_);
 …
     $head =~ m/<title>(.+)<\/title>/i;
     my $doctitle = $1 if defined $1;
+    my $doctitle = $1 if defined $1;
     if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
 …
     # set the title here if we haven't found it yet
     if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
+    if (defined $doctitle && $doctitle =~ /\S/) {
+        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
+    if (defined $doctitle && $doctitle =~ /\S/) {
+            # remove suffix in title if required
+            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
+        if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
+           $doctitle =~ s/$remove_suffix_exp//i;
+        }
+        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
     } else {
         $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
+        $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
+    }
+    }
+    if(defined $base_dir && $base_dir ne ""){
+    # find and download stylesheet
+    }
+    # we are only interested in the column-contents div <div id="column-content">
+    # remove header section, it may contain header images or additional search boxes
+    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
+    $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
+    # remove timeline
+    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;
+    # remove extra bits
+    my $extra_bits = "Retrieved from(.+)</a>\"";
+    $body_text =~ s/$extra_bits//isg;
+    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
+    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
+    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
+    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;
+    # get rid of the [edit] buttons
+    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
+    # get rid of the last time edit information at the bottom
+    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
+    # get rid of the (Redirected from ...)
+    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;
+    # escape texts macros
+    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
+    # may change the links, like Greenstone_Documentation_All.html, then change back
+    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
+    # define file delimiter for different platforms
+    my $file_delimiter;
+    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
+       $file_delimiter = "\\";
+    } else {
+       $file_delimiter = "/";
+    }
+    # IMPORTANT: different delimiter for $base_dir and $file
+    # $base_dir use forward slash for both windows and linux
+    # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
+                                        # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
+    # $file use different delimiters : forward slash for linux; backward slash for windows
+    # print "\nfile : $file\n\n";         # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html
+                                        # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
+    # get the base url for the MediaWiki website
+    my $safe_delimiter = &safe_escape_regexp($file_delimiter);
+    my @url_dirs=split($safe_delimiter, $file);
+    my $url_base = $url_dirs[0];
+    # Re-check css files associated with MediaWiki pages
+    if(defined $base_dir && $base_dir ne ""){
     my @css_files;
     my $css_file_count = 0;
+    # find all the style sheets imported with import statement
+    # find all the stylesheets imported with @import statement
     while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
+        $css_files[$css_file_count++] = $2 if defined $2;
+    }
+        $css_files[$css_file_count++] = $2 if defined $2;
+    }
+    # download the stylesheets if we haven't downloaded them yet
+        # add prefix to each style element, comment out the body element
+        # and copy the files to collection's images folder
+    for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
+        my $css_file = $css_files[$css_file_count];
+        # remove prefix gli/cache directory
+            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;
+            # change the \ delimiter in $css_file to / for consistency
+            $css_file =~ s/\\/\//isg;
+            if($css_file !~ /$url_base/) {
+              $css_file = $url_base . $css_file;
+            }
+            # trim the ? mark append to the end of a stylesheet
+        $css_file =~ s/\?(.+)$//isg;
+            my $css_file_path = &util::filename_cat($base_dir, $css_file);
+        # do nothing if we have already downloaded the css files
+        if (! -e $css_file_path) {
+             # check the stylesheet's directory in the import folder
+             # if the directory doesn't exist, create one
+         my @dirs = split(/\//i,$css_file);
+         my $path_check = "$base_dir/";
+         for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
+        $path_check .= $dirs[$i] . "/";
+        mkdir($path_check) if (! -d $path_check );
+         }
+             # NOTE: wget needs configuration to directly access Internet
+             # These files should already be downloaded if we used the MediaWikiDownload
+         # downloading
+         $css_file = "http://$css_file";
+             print "\ndownloading : " . $css_file . "\n\n";
+         system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
+         if ($? != 0) {
+              print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
+              print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
+              unlink("$css_file_path");
+             }
+            } # done with download
+        # add a prefix "#wikispecificstyle" to each element
+        # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
+            # so we will wrap the web page with a div with id = wikispecificstyle
+            my $css_content;
+        if(open(INPUT, "<$css_file_path")){
+        while(my $line = <INPUT>){
+                    # comment out the body element because we change the body to div
+                    $line =~ s/^(\s*)body(\s*){(\s*)$/$1\/*body$2*\/{$3/isg;
+            if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){
+            $line = "#wikispecificstyle " . $line;
+            }
+            $css_content .= $line;
+        }
+        close(INPUT);
+        open(OUTPUT, ">$css_file_path");
+        print OUTPUT $css_content;
+        close(OUTPUT);
+        }
+            # Copy the modified stylesheets to collection's images folder
+            # for future customization
+            my $images_dir = $base_dir;
+            $images_dir =~ s/import$/images/;
+            $css_file =~ m/(.*)\/(.*)$/;
+            $images_dir = &util::filename_cat($images_dir, $2);
+            if(open(OUTPUT, ">$images_dir")){
+              print OUTPUT $css_content;
+              close(OUTPUT);
+            }
+    }
+    }
+    # by default, only preserve navigation box and search box
+    # others like toolbox, interaction, languages box, will be removed
+    # extract the larger part -- footer section
+    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
+    $body_text =~ /$print_footer/;
+    my $footer = "";
+    $footer = $& if defined $&;
+    $footer =~ s/<\/body>//isg;
+    # trim the comments first
+    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;
+    # contain sections that are to be preserved
+    my $preserve_sections = "";
+    # process the navigation section
+    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
+    if (defined $self->{'nav_div_exp'}) {
+      $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
+    }
+    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
+        # do nothing
+    } else {
+      if ($footer =~ m/$nav_match_exp/ig) {
+        $preserve_sections = $& ;
+      } else {
+        print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
+      }
+      # if($preserve_sections =~/\S/){
+      #  $preserve_sections .= "</div>";
+      # }
+    }
+    # process the searchbox section
+    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
+    if(defined $self->{'searchbox_div_exp'}) {
+        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
+    }
+    my $searchbox_section = "";
+    $footer =~ m/$searchbox_exp/ig;
+    $searchbox_section = $& if defined $&;
+    # make the searchbox form work in Greenstone
+    if($searchbox_section =~ /\S/){
+        # replace action
+        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
+        # remove buttons
+        $searchbox_section =~ s/name="search"/name="q"/isg;
+        $searchbox_section =~ s/name="go"//isg;
+        $searchbox_section =~ s/name="fulltext"//isg;
+        # get collection name from $base_dir for c param
+        $base_dir =~ m/\/collect\/(.+)\//i;
+        my $collection_name = "";
+        $collection_name = $1 if defined $1;
+        # add Greenstone search params
+        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
+            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
+            # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
+            # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
+        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
+        # $searchbox_section .= "</div>";
+    } else {
+      print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
+    }
+    # either delete or replace the searchbox
+    if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
+        # do nothing
+    } else {
+        $preserve_sections .= "\n$searchbox_section\n";
+    }
+    if($preserve_sections ne ""){
+      $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
+    }
+    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
+    $body_text =~ s/$print_footer/$preserve_sections/isg;
+    # delete other forms in the page
+    my @forms;
+    my $form_count = 0;
+    while($body_text =~ m/<form([^>]*)name=("|')([^>]*)("|')/isg){
+        next if($3 eq "q");
+        $forms[$form_count++] = $&;
+    }
+    foreach my $form (@forms) {
+      $body_text =~ s/$form[\s\S]*?<\/form>//m;
+    }
+    # process links.
+    # because current WGET 1.10 the -k and -E option doesn't work together
+    # need to 'manually' convert the links to relative links
+    # Dealing with 3 types of links:
+    # -- outgoing links
+    #   -- if we have downloaded the target files, link to the internal version (relative link)
+    #   -- otherwise, link to the external version (absolute links)
+    # -- in-page links (relative link)
+    # NOTE: (important)
+    #   must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
+    #   otherwise, the internal links may have problems
+    # remove the title attribute of <a> tag
+    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
+    # extract all the links
+    my @links;
+    my $link_count = 0;
+    while($body_text =~ m/(href|src)="([^>\s]*)$url_base\/([^>\s]*)"/ig){
+        $links[$link_count++] = "$1=\"$2$url_base/$3\"";
+    }
+    foreach my $cur_link (@links) {
+        # escape greedy match + character
+        $cur_link =~ s/\+/\\+/isg;
+        $cur_link =~ m/(.+)"([^>]*)$url_base\/([^>\s]*)"/;
+        my $external_file_path = "$1\"http://$url_base/$3\"";
+        $body_text =~ s/$cur_link/$external_file_path/i;
+    }
+    # tag links to new wiki pages as red
+    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
+    # tag links to pages external of the MediaWiki website as blue
+    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
+    # process the table-of-contents section
+    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
+    # 1. read _content_ macro from about.dm
+    # 2. append the toc, change all links to the Greenstone internal format for relative links
+    # 3. write to the extra.dm
+    # TODO: we assume the _about:content_ hasn't been specified before
+    #       so needs to add function to handle when the macro is already in the extra.dm
+    if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
+      # extract toc of the Main_Page
+      my $mainpage_toc = "";
+      my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
+      if($self->{'toc_exp'} =~ /\S/){
+         $toc_exp = $self->{'toc_exp'};
+      }
+      if($body_text =~ /$toc_exp/){
+        $mainpage_toc = $&;
+      }
+      if($mainpage_toc =~ /\S/) {
+        # change the in-page links to relative links, for example, change <a href="#section1"> to
+        # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
+        my $file_url_format = $file;
+        $file_url_format =~ s/\\/\//isg;
+    $file_url_format = "http://" . $file_url_format;
+        # encode as URL, otherwise doesn't work on Windows
+        $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
+    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
+        # read the collection's extra.dm
+        my $macro_path = $base_dir;
+        $macro_path =~ s/import$/macros/;
+        my $extradm_file = &util::filename_cat($macro_path, "extra.dm");
+        my $extra_dm = "";
+        if(open(INPUT, "<$extradm_file")){
+        while(my $line = <INPUT>){
+        $extra_dm .= $line;
+        }
+        } else {
+            print $outhandle "can't open file $extradm_file\n";
+        }
+        close(INPUT);
+        # check whether we have changed the macros
+        my @packages = split("package ", $extra_dm);
+        my $about_package = "";
+        foreach my $package (@packages) {
+          $about_package = "package " . $package if($package =~ /^about/);
+        }
+        my $update_extra_dm = 0;
+        if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){
+       print $outhandle "_content_ macro already changed!!!!\n";
+    }
+        # if extra.dm doesn't have an "about package"
+        elsif ($about_package !~ /\S/) {
+          # read _content_ macro from $GSDLHOME/macros/about.dm file
+      my $global_about_package = &read_content_from_about_dm();
+          # create the extra _content_ macro for this collection
+          # add the original content of the _content_ macro
+          $global_about_package =~ m/{(.|\n)*<\/div>\n\n/;
+          # append the new about package to extra.dm
+          $extra_dm .= "\n\npackage about\n_content_$&\n\n";
+          $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
+          $update_extra_dm = 1;
+        }
+        # the about package exists, but either doesn't have the _content_ macro or
+        # the _content_ macro doesn't contain the toc
+        else {
+          # check if there is a content macro
+          my $content_macro_existed = 0;
+          $content_macro_existed = ($about_package =~ /(\s*|\n)_content_(\s*){/);
+          # if there is one
+          # append a new section div for toc to the end of the document section
+          if($content_macro_existed ==1) {
+            $about_package =~ /(\s*|\n)_content_(\s*){(.|\n)*?}/;
+            my $content_macro = $&;
+            my $new_content_macro = $content_macro;
+            $new_content_macro =~ s/<div[^>]*class="document">(.|\n)*<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
+            $extra_dm =~ s/$content_macro/$new_content_macro/mg;
+          }
+          # otherwise, append _content_ macro to the about package
+          else {
+            my $new_about_package = $about_package;
+            $content_macro = &read_content_from_about_dm();
+            $content_macro =~ m/{(.|\n)*<\/div>\n\n/;
+            $new_about_package .= "\n\n_content_$&\n\n";
+            $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
+            $extra_dm =~ s/$about_package/$new_about_package/mg;
+          }
+          # in either case, we need to update the extra.dm
+          $update_extra_dm = 1;
+         }
+         if($update_extra_dm==1){
+            # write to the extra.dm file of the collection
+            if (open(OUTPUT, ">$extradm_file")) {
+                print OUTPUT $extra_dm;
+            } else {
+                print "can't open $extradm_file\n";
+            }
+            close(OUTPUT);
+         }
+      } else {
+        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
+      }
+    }
+    # check whether the stylesheet exists
+    # if not, download it and copy to the collection's images folder
+    for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
+        my $css_file = $css_files[$css_file_count];
+        $css_file =~ s/^(.+)gli\/cache\///i;
+        my $css_file_path = "$base_dir/$css_file";
+        if (-e $css_file_path){ # the file already exists
+            next;
+        }
+        # check the css directory and create one if it's not there
+        my @dirs = split(/\//i,$css_file);
+        my $path_check = "$base_dir/";
+        for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
+            $path_check .= $dirs[$i] . "/";
+            if(! -d $path_check ){
+                mkdir($path_check);
+            }
+        }
+        # download
+        $css_file = "http://$css_file";
+        system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
+        if ($? != 0) {unlink("$css_file_path");}
+        # change every style element to #wikispecificstyle ...
+        if(open(INPUT, "<$css_file_path")){
+            my $css_content;
+            while(my $line = <INPUT>){
+                if($line =~ m/^(.+)\{/i){
+                    $line = "#wikispecificstyle " . $line;
+                }
+                $css_content .= $line;
+            }
+            close(INPUT);
+            open(OUTPUT, ">$css_file_path");
+            print OUTPUT $css_content;
+            close(OUTPUT);
+        }
+        # copy to images folder
+        # do not copy, because collection can only have one specific stylesheet
+        # better to add and modify the style sheets manually
+        # @dirs = split(/\//i,$base_dir);
+        # my $collection_base_dir;
+        # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
+        #   $collection_base_dir .= $dirs[$i] . "/";
+        # }
+        # my $images_folder = $collection_base_dir . "images/";
+        # copy($css_file_path, $images_folder) || die "File cannot be copied.";
+    # If delete_toc is set, remove toc and tof contents.
+    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
+    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
+          # print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
+          if ($body_text =~ /$self->{'toc_exp'}/) {
+        $body_text =~ s/$self->{'toc_exp'}//i;
+          }
+    }
+    }
+    # add sections around h2 tag
+    # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
+    # add search box with each section
+    if ($self->{'tag_sections'}) {
+    my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
+    for(my $i=1; $i < scalar(@sections); $i++){
+        my $section_title = $sections[$i];
+        $section_title =~ s/<([^>]*)>//g;
+        $section_title =~ s/(^\s|\s$)//g;
+        my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
+        if($i !=1){
+            $section_metadata = "</Section>\n" . $section_metadata;
+        }
+        $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
+        $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
+        $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
+        $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
+        if($i==scalar(@sections)-1) {
+            # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
+            $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
+        }
+    }
+     }
+    # If delete_nav is enabled, it means to get rid of navigation contents.
+    # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
+    #   if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
+    #       print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
+    #       $body_text =~ s/$self->{'nav_exp'}//isg;
+    #   }
+    #}
+    my $searchbox = "";
+    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
+    my $nav_match_express;
+    if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
+        $nav_match_express = $self->{'nav_exp'} ;
+    } else { # default setting for mediawiki
+        $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
+    }
+    print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
+    # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;
+    $body_text =~ m/$nav_match_express/isg;
+    my $navigate = $& if defined $&;
+    # find the search box and add it to the document page
+    if(defined $navigate && $navigate =~ /\S/){
+        $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
+        $searchbox = $& . "\n<\/div>\n<\/div>";
+        $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
+        $searchbox =~ s/name="search"/name="q"/isg;
+        $searchbox =~ s/name="go"//isg;
+        $searchbox =~ s/name="fulltext"//isg;
+        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
+                    ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
+                    ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
+                    ."<input type=\"hidden\" name=\"t\" value=\"1\">";
+        $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
+        $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
+    }
+    # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
+    $body_text =~ s/$nav_match_express/$searchbox/isg;
+    }
+    if ($self->{'tag_sections'}) {
+        $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
+    }
+    # Tidy up extra new lines
+    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
+    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
+    $section_text .= "<!--\n<Section>\n-->\n";
+    my $body = "<body".$body_text;
+    $$textref = $body;
+    # get the base dir for convert absolute links to relative links
+    $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
+    my $basedir = $2;
+    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
+    $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
+    # get rid of the [edit] button
+    $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
+    # get rid of the last time edit information at the bottom
+    $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
+    # get rid of the (Redirected from ...)
+    $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
+    # escape macros
+    $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
+    # may change the links, like Greenstone_Documentation_All.html, then change back
+    $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
+    # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
+    # get rid of the title attribute of a tag
+    $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
+    # find the relative path of current directory
+    if($basedir ne ""){
+        my @dirs=split("\/", $file);
+        my $dirnum = scalar(@dirs);
+        my $replace = "";
+        for(my $i=0; $i<$dirnum-2; $i++){
+            $replace .= "../";
+        }
+        # test if the linked relative file exists, if not, link to the internet version
+        $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;
+        # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
+        # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
+        # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
+        #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
+            #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
+        #   my $prefix = $1;
+        #   my $link = $&;
+        #   my $rel_file_name = $3;
+        #   my $rel_link = "$replace$rel_file_name";
+            # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
+        #   if(-e $rel_link){
+        #       $rel_link = "$prefix=\"$rel_link\"";
+        #       $$textref =~ s/$link/$rel_link/i;
+        #   }else{
+        #       my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
+                # print $outhandle "external link==> $ext_link\n";
+        #       $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
+        #   }
+        #}
+        # tag the link to new wiki pages as red
+        $$textref =~ s/(href|src)="$replace([^>]*)&amp;action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&amp;action=edit$3"/gi;
+        $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
+        # tag the link to external pages as blue
+        $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
+        #print $outhandle $$textref;
+    }
+    # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
+    # 1. read _content_ macro from about.dm
+    # 2. append the toc, change all links to the Greenstone internal format for relative links
+    # 3. write to the extra.dm
+    # TODO: currently we suppose the _about:content_ hasn't been specified before
+    #       so needs to add function to handle when the macro is already in the extra.dm
+    if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
+        my $macro_path = $base_dir;
+        $macro_path =~ s/import$/macros/;
+        my $extra_dm;
+        my $extradm_file = "$macro_path/extra.dm";
+        if(open(INPUT, "<$extradm_file")){
+            while(my $line = <INPUT>){
+                $extra_dm .= $line;
+        }
+            close(INPUT);
+            if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
+                print $outhandle "already changed!!!!\n";
+            } else {
+                # read _content_ macro from about.dm file
+                my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
+                my $about_page_content = "";
+                if(open(INPUT, "<$about_macro")){
+                    while(my $line=<INPUT>){
+                        $about_page_content .= $line;
+                    }
+                }else{
+                    print $outhandle "can't open file $about_macro\n";
+                }
+                close(INPUT);
+                # extract the _content_ macro
+                $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
+                $about_page_content = $&;
+                # extract toc of the Main_Page
+                my $mainpage_content = "";
+                if($self->{'toc_exp'} =~ /\S/){
+                    $$textref =~ /$self->{'toc_exp'}/;
+                    $mainpage_content = $&;
+                } else {
+                    # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
+                }
+                # print $outhandle "---------\n$$textref\n--------\n\n";
+                # print $outhandle "==========\n$mainpage_content\n==========\n\n";
+                # add toc to the _content_ macro
+                $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
+                $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
+                # change all links to the internal Greenstone relative link format
+                $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
+                $extra_dm =~ s/(\.\.\/)+/\//isg;
+                # print $outhandle "to add---------\n$extra_dm\n--------\n";
+                # write to the extra.dm file of the collection
+                open(OUTPUT, ">$extradm_file");
+                print OUTPUT $extra_dm;
+                close(OUTPUT);
+            }
+        } else {
+            print $outhandle "can't open file $extradm_file\n";
+        }
+    }
+    # If delete_toc is enabled, it means to get rid of toc and tof contents.
+    # get rid of TOC and TOF sections and their title
+    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
+        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
+            # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
+            # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
+            # $body_text =~ s/$self->{'toc_exp'}//i;
+            print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
+            $$textref =~ s/$self->{'toc_exp'}//i;
+        }
+    }
+    # To add a layer on top of the wiki page
+    # so as to keep the wiki style inside the wiki page
+    # and keep the Greenstone style at the same time
+    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
+    $$textref =~ s/<\/body>/<\/div><\/body>/is;
+    # tag with sections
+    $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
+    $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
+    #print $outhandle "\n\n$$textref\n\n";
+    # use description tags
+    if ($self->{'description_tags'}) {
+        my $cursection = $doc_obj->get_top_section();
+        # remove the html header - note that doing this here means any
+        # sections defined within the header will be lost (so all <Section>
+        # tags must appear within the body of the HTML)
+        my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
+        $$textref =~ s/^.*?<body[^>]*>//is;
+        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
+        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
+        my $lt = '(?:<|&lt;)';
+        my $gt = '(?:>|&gt;)';
+        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';
+        # my $dont_strip = '';
+        # if ($self->{'no_strip_metadata_html'}) {
+        #    ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
+        # }
+        my $found_something = 0;
+        my $top = 1;
+        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
+            my $text = $1;
+            my $comment = $2;
+            if (defined $text) {
+                # text before a comment - note that getting to here
+                # doesn't necessarily mean there are Section tags in
+                # the document
+                # print $outhandle "section text:\n$text\n";
+                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
+            }
+            while ($comment =~ s/$lt(.*?)$gt//s) {
+                my $tag = $1;
+                if ($tag eq "Section") {
+                    $found_something = 1;
+                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
+                    $top = 0;
+                } elsif ($tag eq "/Section") {
+                    $found_something = 1;
+                    $cursection = $doc_obj->get_parent_section ($cursection);
+                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
+                    my $metaname = $1;
+                    my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
+                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
+                    my $metavalue = $1;
+                    $metavalue =~ s/^\s+//;
+                    $metavalue =~ s/\s+$//;
+                    # assume that no metadata value intentionally includes
+                    # carriage returns or HTML tags (if they're there they
+                    # were probably introduced when converting to HTML from
+                    # some other format).
+                    # actually some people want to have html tags in their
+                    # metadata.
+                    $metavalue =~ s/[\cJ\cM]/ /sg;
+                    # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
+                    $metavalue =~ s/\s+/ /sg;
+                    # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
+                    if ($accumulate) {
+                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
+                    } else {
+                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
+                    }
+                } elsif ($tag eq "Description" || $tag eq "/Description") {
+                    # do nothing with containing Description tags
+                } else {
+                    # simple HTML tag (probably created by the conversion
+                    # to HTML from some other format) - we'll ignore it and
+                    # hope for the best ;-)
+                }
+            }
+        }# end while
+        if ($cursection ne "") {
+            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
+        }
+        $$textref =~ s/^.*?<body[^>]*>//is;
+        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        if ($$textref =~ /\S/) {
+            if (!$found_something) {
+                if ($self->{'verbosity'} > 2) {
+                    print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
+                    print $outhandle "          will be processed as a single section document\n";
+                }
+                # go ahead and process single-section document
+                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+            } else {
+                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
+                print $outhandle "          of the final closing </Section> tag. This text will\n";
+                print $outhandle "          be ignored.";
+                my ($text);
+                if (length($$textref) > 30) {
+                    $text = substr($$textref, 0, 30) . "...";
+                } else {
+                    $text = $$textref;
+                }
+                $text =~ s/\n/ /isg;
+                print $outhandle " ($text)\n";
+            }
+        } elsif (!$found_something) {
+            if ($self->{'verbosity'} > 2) {
+            # may get to here if document contained no valid Section
+            # tags but did contain some comments. The text will have
+            # been processed already but we should print the warning
+            # as above and extract metadata
+            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
+            print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+            }
+        }
+    } # if $self->{'description_tags'}
+    else {
+        # remove header and footer
+        # if (!$self->{'keep_head'}) {
+        #    $$textref =~ s/^.*?<body[^>]*>//is;
+        #    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+        # }
+        # single section document
+        # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+        # Important: to get the relative links to work,
+        # 1: use the below statement instead of the above one
+        # 2. cannot have process_section method.
+        # why?????
+        $self->SUPER::process(@_);
+    }
+    return 1;
+    }
+    $$textref = "<body" . $body_text;
+    # Wrap the whole page with <div id="wikispecificstyle"></div>
+    # keep the style of this website and don't mess up with the Greenstone styles
+    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
+    $$textref =~ s/<\/body>/<\/div><\/body>/is;
+    #$self->SUPER::process(@_);
+    $self->SUPER::process(@_);
+    return 1;
+}
-# note that process_section may be called multiple times for a single
-# section (relying on the fact that add_utf8_text appends the text to any
-# that may exist already).
-# sub process_section {
-#    my $self = shift (@_);
-#    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
-    # trap links
-    # if (!$self->{'nolinks'}) {
-    # usemap="./#index" not handled correctly => change to "#index"
-    # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
-        #$self->replace_usemap_links($1, $2, $3)/isge;
-    #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
-        #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
-    #}
-    # trap images
-    # allow spaces if inside quotes - jrm21
-    #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
-    #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
-    # add text to document object
-    # turn \ into \\ so that the rest of greenstone doesn't think there
-    # is an escape code following. (Macro parsing loses them...)
-#    $$textref =~ s/\\/\\\\/go;
-#    $doc_obj->add_utf8_text($cursection, $$textref);
-#}
 …
+}
+# Return a copy of the given string made safe for literal use inside a
+# slash-delimited regular expression: every backslash is doubled and every
+# forward slash is prefixed with a backslash.
+sub safe_escape_regexp
+{
+  my ($pattern) = @_;
+  # Both escapes are applied unconditionally (there is no per-OS branch).
+  # Backslashes must be doubled first so that the backslashes introduced
+  # when escaping the slashes are not themselves doubled.
+  $pattern =~ s{\\}{\\\\}isg;
+  $pattern =~ s{/}{\\/}isg;
+  return $pattern;
+}
+# Read $GSDLHOME/macros/about.dm and return the text of its _content_
+# macro (from "_content_ {" up to the closing "</div>\n\n</div>\n}").
+# Returns the empty string when the file cannot be opened or when the
+# macro is not found in it.
+sub read_content_from_about_dm
+{
+  my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
+  my $about_page_content = "";
+  # Three-argument open with a lexical handle: the file name can never be
+  # mistaken for an open mode, and the handle is only closed if it was opened.
+  if (open(my $input, "<", $about_macro_file)) {
+    local $/;    # slurp the whole file in one read
+    $about_page_content = <$input>;
+    $about_page_content = "" unless defined $about_page_content;
+    close($input);
+  } else {
+    # NOTE(review): $outhandle is not assigned anywhere in this sub; it is
+    # presumably a file-level variable - confirm. Fall back to STDERR when
+    # it is undefined so the message is not lost.
+    my $log = defined $outhandle ? $outhandle : \*STDERR;
+    print $log "can't open file $about_macro_file\n";
+  }
+  # Extract the _content_ macro. The braces are escaped because they are
+  # literal characters, not a quantifier, and the capture group is used
+  # instead of $& so a failed match can never return a stale earlier match.
+  if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
+    return $1;
+  }
+  return "";
+}
 ;

gsdl/branches/gsdl-2.74/perllib/strings.properties

-              r14198
+              r14270
 # -- buildcol.pl --
-buildcol.disable_OAI:tick to make it not providing the OAI service for this collection.
 buildcol.archivedir:Where the archives live.
 …
 downloadfrom.download_mode:The type of server to download from
 downloadfrom.download_mode.Web:HTTP
+downloadfrom.download_mode.MediaWiki:MediaWiki website
 downloadfrom.download_mode.OAI: Open Archives Initiative
 downloadfrom.download_mode.z3950:z3950 server
 …
 GenericList.desc:A general and flexible list classifier with most of the abilities of AZCompactList, but with better Unicode, metadata and sorting capabilities.
 GenericList.metadata:Metadata fields used for classification. Use '/' to separate the levels in the hierarchy and ';' to separate metadata fields within each level.
+GenericList.partition_name_length:The length of the partition name; defaults to a variable length from 1 up to 3 characters, depending on how many are required to distinguish the partition start from its end. This option only applies when partition_type_within_level is set to 'constant_size'.
 GenericList.partition_size_within_level:The number of items in each partition (only applies when partition_type_within_level is set to 'constant_size').
 GenericList.partition_type_within_level:The type of partitioning done: either 'per_letter', 'constant_size', or 'none'.
 …
 MARCXMLPlug.metadata_mapping_file:Name of file that includes mapping details from MARC values to Greenstone metadata names. Defaults to 'marctodc.txt' found in the site's etc directory.
+MediaWikiPlug.desc:Plugin for importing MediaWiki web pages
+MediaWikiPlug.show_toc: Add the 'table of contents' from the MediaWiki website's main page to the collection's About page. Requires a Perl regular expression in toc_exp below that matches the 'table of contents' section.
+MediaWikiPlug.delete_toc:Delete the 'table of contents' section on each HTML page. Requires a Perl regular expression in toc_exp below that matches the 'table of contents' section.
+MediaWikiPlug.toc_exp:A Perl regular expression to match the 'table of contents'. The default value matches common MediaWiki web pages.
+MediaWikiPlug.delete_nav:Delete the navigation section. Requires a Perl regular expression in nav_div_exp below.
+MediaWikiPlug.nav_div_exp:A Perl regular expression to match the navigation section. The default value matches common MediaWiki web pages.
+MediaWikiPlug.delete_searchbox:Delete the searchbox section. Requires a Perl regular expression in searchbox_div_exp below.
+MediaWikiPlug.searchbox_div_id:A Perl regular expression to match the searchbox section. The default value matches common MediaWiki web pages.
+MediaWikiPlug.remove_title_suffix_exp:A Perl regular expression to trim the extracted title. For example, \\s-(.+) will trim title contents after "-".
 MetadataCSVPlug.desc:A plugin for metadata in comma-separated value format. The Filename field in the CSV file is used to determine which document the metadata belongs to.
 …
 BasDownload.desc:Base class for Download modules
+MediaWikiDownload.desc:A module for downloading from MediaWiki websites
+MediaWikiDownload.reject_filetype:Comma-separated list of URL patterns to ignore, e.g. *cgi-bin*,*.ppt ignores hyperlinks that contain either 'cgi-bin' or '.ppt'.
+MediaWikiDownload.reject_filetype_disp:URL patterns to ignore, separated by commas
+MediaWikiDownload.exclude_directories:Comma-separated list of directories to exclude (each must be an absolute path), e.g. /people,/documentation will exclude the 'people' and 'documentation' subdirectories of the site currently being crawled.
+MediaWikiDownload.exclude_directories_disp:Directories to exclude, separated by commas
 OAIDownload.desc:A module for downloading from OAI repositories

gsdl/branches/gsdl-2.74/src/recpt/authenaction.cpp

-              r14014
+              r14270
 #include "infodbclass.h"
 #include "gsdltimes.h"
-#include "userdb.h"
 …
 void authenaction::configure (const text_t &key, const text_tarray &cfgline) {
-  // get the password filename
-  if (cfgline.size() == 1) {
-    if (key == "usersfile") usersfile = cfgline[0];
-    else if (key == "keyfile") keyfile = cfgline[0];
-    else if (key == "keydecay") keydecay = cfgline[0].getint();
+  }
   action::configure (key, cfgline);
+}
 bool authenaction::init (ostream &logout) {
   if (gdbmhome.empty()) {
     logout << "ERROR (authenaction::init) gdbmhome is not set\n";
     return false;
+  }
-  if (usersfile.empty()) usersfile = filename_cat (gdbmhome, "etc", "users.db");
-  if (keyfile.empty()) keyfile = filename_cat (gdbmhome, "etc", "key.db");
   return action::init (logout);
 …
   if (args["uan"].empty()) return true;
-  userdbclass *user_database = new userdbclass(usersfile);
-  keydbclass *key_database = new keydbclass(keyfile);
   // failure means we have to redirect to this action to get authentication
   // (if we are not already doing this)
 …
   else args_us = "failed";
+  // make sure we have a username
+  if (!args_un.empty() && (user_database->get_user_info (args_un, thisuser) == ERRNO_SUCCEED)) {
+  // make sure we have a username
+  int status = user_database->get_user_info (args_un, thisuser);
+  if (!args_un.empty() && (status == ERRNO_SUCCEED)) {
     if (!args_pw.empty()) {
       // we are authenticating using a password
 …
+  }
-  //close the database
-  user_database->closedatabase();
-  key_database->closedatabase();
   return true;
+}

gsdl/branches/gsdl-2.74/src/recpt/authenaction.h

-              r7432
+              r14270
 #include "action.h"
 #include "text_t.h"
+#include "userdb.h"
 #include "receptionist.h"
 …
 class authenaction : public action {
 protected:
   text_t usersfile;
   text_t keyfile;
+  userdbclass *user_database;
+  keydbclass *key_database;
   int keydecay;
 …
   authenaction ();
   virtual ~authenaction () {}
+  void set_userdb(userdbclass *udb) {user_database = udb;}
+  void set_keydb (keydbclass *kdb) {key_database = kdb;}
   void set_receptionist (receptionist *therecpt) {recpt=therecpt;}

gsdl/branches/gsdl-2.74/src/recpt/librarymain.cpp

-              r12517
+              r14270
   recpt.add_action (adocumentaction);
+  text_t userdbfile = filename_cat(gsdlhome, "etc", "users.db");
+  userdbclass *udb = new userdbclass(userdbfile);
+  text_t keydbfile = filename_cat(gsdlhome, "etc", "key.db");
+  keydbclass *kdb = new keydbclass(keydbfile);
 #ifdef GSDL_USE_USERS_ACTION
+  recpt.add_action (new usersaction());
+  usersaction *ausersaction = new usersaction();
+  ausersaction->set_userdb(udb);
+  recpt.add_action (ausersaction);
 #endif
 …
 #ifdef GSDL_USE_AUTHEN_ACTION
   authenaction *aauthenaction = new authenaction();
+  aauthenaction->set_userdb(udb);
+  aauthenaction->set_keydb(kdb);
   aauthenaction->set_receptionist(&recpt);
   recpt.add_action (aauthenaction);
 …
   cgiwrapper (recpt, "");
   delete cservers;
+  delete udb;
+  delete kdb;
   // clean up the actions

gsdl/branches/gsdl-2.74/src/recpt/userdb.cpp

-              r14013
+              r14270
 userdbclass::userdbclass(const text_t &userdbfilename)
+{
+  // Constructor: opens the user database at userdbfilename and records in
+  // 'activated' whether a handle could be obtained.
+  //
+  // Keep the path: the update/delete code in this class closes the handle
+  // and reopens this same file around each write.
+  storeduserdbfilename = userdbfilename;
+  // Open read-only first. The earlier unconditional GDBM_WRCREAT open that
+  // preceded this call has been dropped: it opened the file in write mode
+  // and was never closed before the read-only open below.
+  activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true)) ? false : true;
+  if (activated == false)
+    {
+      // NOTE(review): presumably the read-only open fails when the file does
+      // not exist yet - confirm. Create it in write mode, then drop back to
+      // a read-only handle.
+      activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true)) ? false : true;
+      if (activated == true)
+        {
+          userdb.closedatabase();
+          activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true)) ? false : true;
+        }
+    }
   external_db = false;
+}
 …
       info["groups"] = userinfo.groups;
       info["comment"] = userinfo.comment;
+      return (userdb.setinfo (username, info)) ? ERRNO_SUCCEED : ERRNO_GDBMACTIONFILED ;
+      userdb.closedatabase();
+      userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true);
+      int result = (userdb.setinfo (username, info)) ? ERRNO_SUCCEED : ERRNO_GDBMACTIONFILED;
+      userdb.closedatabase();
+      userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true);
+      return  result;
+    }
   return ERRNO_CONNECTIONFAILED;
 …
   if (activated == true)
+    {
+      userdb.closedatabase();
+      userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true);
       userdb.deletekey (username);
+      userdb.closedatabase();
+      userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true);
       return ERRNO_SUCCEED;
+    }
 …
   return ERRNO_CONNECTIONFAILED;
+}
-//an alernative way to colse the database if the class can't reach the destructor
-void userdbclass::closedatabase()
+{
-   userdb.closedatabase();
+}
 //==========================================//
 //       userdbclass functions (End)        //
 …
 keydbclass::keydbclass(const text_t &keydbfilename)
+{
+  // Constructor: opens the key database at keydbfilename and records in
+  // 'activated' whether a handle could be obtained.
+  //
+  // Keep the path: the key-generation and key-refresh code in this class
+  // closes the handle and reopens this same file around each write.
+  storedkeydbfilename = keydbfilename;
+  // Open read-only first. The earlier unconditional GDBM_WRCREAT open that
+  // preceded this call has been dropped: it opened the file in write mode
+  // and was never closed before the read-only open below.
+  activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true)) ? false : true;
+  if (activated == false)
+    {
+      // NOTE(review): presumably the read-only open fails when the file does
+      // not exist yet - confirm. Create it in write mode, then drop back to
+      // a read-only handle.
+      activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true)) ? false : true;
+      if (activated == true)
+        {
+          keydb.closedatabase();
+          activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true)) ? false : true;
+        }
+    }
   external_db = false;
+}
 …
       keydata["time"] = time2text(time(NULL));
+      keydb.closedatabase();
+      keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true);
       if (!keydb.setinfo (crypt_userkey, keydata))
+        {
           userkey.clear(); // failed
+        }
+      keydb.closedatabase();
+      keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true);
       return userkey;
 …
             // succeeded, update the key's time
             info["time"] = time2text(time(NULL));
+            keydb.closedatabase();
+            keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true);
             keydb.setinfo (crypt_key, info);
+            keydb.closedatabase();
+            keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true);
             return true;
+          }
 …
+    }
+}
-//an alernative way to colse the database if the class can't reach the destructor
-void keydbclass::closedatabase()
+{
-   keydb.closedatabase();
+}
 //==========================================//
 //       keydbclass functions (End)         //

gsdl/branches/gsdl-2.74/src/recpt/userdb.h

-              r14015
+              r14270
   bool external_db;
   bool activated;
+  text_t storeduserdbfilename;
  public:
 …
   // on success
   int get_user_list (text_tarray &userlist);
-  //an alernative way to colse the database if the class can't reach the destructor
-  void closedatabase();
 };
 …
   bool external_db;
   bool activated;
+   text_t storedkeydbfilename;
  public:
 …
   // use sparingly, it can be quite an expensive function
   void remove_old_keys (int keydecay);
- //an alernative way to colse the database if the class can't reach the destructor
-  void closedatabase();
 };

gsdl/branches/gsdl-2.74/src/recpt/usersaction.cpp

-              r13844
+              r14270
 void usersaction::configure (const text_t &key, const text_tarray &cfgline) {
-  // get the password filename
-  if (cfgline.size() == 1) {
-    if (key == "usersfile") usersfile = cfgline[0];
+  }
   action::configure (key, cfgline);
+}
 …
     return false;
+  }
-  if (usersfile.empty()) usersfile = filename_cat (gdbmhome, "etc", "users.db");
   return action::init (logout);
 …
                  outconvertclass &outconvert, ostream &textout,
                  ostream &logout) {
-  // open the user database (it will be used a lot)
-  user_database = new userdbclass(usersfile);
   if (args["uma"] == "adduser" || args["uma"] == "edituser") {

gsdl/branches/gsdl-2.74/src/recpt/usersaction.h

-              r13844
+              r14270
 #include "gsdlconf.h"
 #include "action.h"
+#include "userdb.h"
 #include "text_t.h"
-#include "userdb.h"
 class usersaction : public action {
 protected:
+  text_t usersfile;
+  userdbclass* user_database;
+  userdbclass *user_database;
 public:
 …
   bool init (ostream &logout);
+  void set_userdb(userdbclass *udb) {user_database = udb;}
   text_t get_action_name () {return "um";}

Context Navigation

Legend:

gsdl/branches/gsdl-2.74/bin/script/buildcol.pl

gsdl/branches/gsdl-2.74/bin/script/downloadfrom.pl

gsdl/branches/gsdl-2.74/bin/script/gti.pl

gsdl/branches/gsdl-2.74/bin/script/mkcol.pl

gsdl/branches/gsdl-2.74/cgi-bin/gliserver.pl

gsdl/branches/gsdl-2.74/cgi-bin/gsdlCGI.pm

gsdl/branches/gsdl-2.74/macros/style.dm

gsdl/branches/gsdl-2.74/perllib/basebuilder.pm

gsdl/branches/gsdl-2.74/perllib/cfgread4gs3.pm

gsdl/branches/gsdl-2.74/perllib/classify.pm

gsdl/branches/gsdl-2.74/perllib/colcfg.pm

gsdl/branches/gsdl-2.74/perllib/plugin.pm

gsdl/branches/gsdl-2.74/perllib/plugins/HTMLPlug.pm

gsdl/branches/gsdl-2.74/perllib/plugins/MediaWikiPlug.pm

gsdl/branches/gsdl-2.74/perllib/strings.properties

gsdl/branches/gsdl-2.74/src/recpt/authenaction.cpp

gsdl/branches/gsdl-2.74/src/recpt/authenaction.h

gsdl/branches/gsdl-2.74/src/recpt/librarymain.cpp

gsdl/branches/gsdl-2.74/src/recpt/userdb.cpp

gsdl/branches/gsdl-2.74/src/recpt/userdb.h

gsdl/branches/gsdl-2.74/src/recpt/usersaction.cpp

gsdl/branches/gsdl-2.74/src/recpt/usersaction.h

Download in other formats: