Changeset 14270
- Timestamp:
- 2007-07-25T13:37:52+12:00 (17 years ago)
- Location:
- gsdl/branches/gsdl-2.74
- Files:
-
- 22 edited
- 1 copied
Legend:
- Unmodified
- Added
- Removed
-
gsdl/branches/gsdl-2.74/bin/script/buildcol.pl
r14197 r14270 73 73 74 74 my $arguments = 75 [ { 'name' => "disable_OAI", 76 'desc' => "{buildcol.disable_OAI}", 77 'type' => "flag", 78 'reqd' => "no", 79 'modegli' => "2" }, 80 { 'name' => "remove_empty_classifications", 75 [ { 'name' => "remove_empty_classifications", 81 76 'desc' => "{buildcol.remove_empty_classifications}", 82 77 'type' => "flag", … … 205 200 'type' => "flag", 206 201 'reqd' => "no", 207 'hiddengli' => "yes" } 202 'hiddengli' => "yes" }, 203 { 'name' => "disable_OAI", 204 'desc' => "{buildcol.disable_OAI}", 205 'type' => "flag", 206 'reqd' => "no", 207 'modegli' => "2", 208 'hiddengli' => "yes" } 208 209 209 210 # { 'name' => "incremental_dlc", … … 353 354 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins"); 354 355 355 # read the configuration file (for gs2) 356 $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg"); 356 # Read in the collection configuration file. 357 357 my ($collectcfg, $buildtype); 358 359 if (-e $configfilename) { 360 $collectcfg = &colcfg::read_collect_cfg ($configfilename); 361 $gs_mode = "gs2"; 362 } 363 else { 364 365 # If it is gs3 366 $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collectionConfig.xml"); 367 368 if (!-e $configfilename) { 369 &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename) && die; 370 } 371 else { 358 ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out); 359 if ($gs_mode eq "gs2") { 360 $collectcfg = &colcfg::read_collect_cfg ($configfilename); 361 } elsif ($gs_mode eq "gs3") { 372 362 $collectcfg = &colcfg::read_collection_cfg_xml ($configfilename); 373 $gs_mode = "gs3"; 374 } 375 } 376 363 } 364 377 365 if ($verbosity !~ /\d+/) { 378 366 if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) { … … 441 429 $remove_empty_classifications = 1; 442 430 } 443 } 431 } 432 444 433 445 434 if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) { … … 465 
454 $gli = 0 unless defined $gli; 466 455 456 # If the disable_OAI flag is not present, the option $disable_OAI with the value of 0 will be passed to basebuilder.pm 467 457 $disable_OAI = 0 unless defined $disable_OAI; 468 458 469 459 # New argument to track whether build is incremental 470 460 $incremental = 0 unless defined $incremental; … … 537 527 # if a builder class has been created for this collection, use it 538 528 # otherwise, use the mg or mgpp builder 539 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") { 529 if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") { 530 $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib"; 531 $buildertype = "custombuilder"; 532 } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") { 533 $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; 534 $buildertype = "custombuilder"; 535 } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") { 540 536 $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; 541 537 $buildertype = "${collection}builder"; -
gsdl/branches/gsdl-2.74/bin/script/downloadfrom.pl
r12903 r14270 47 47 'desc' => "{downloadfrom.download_mode.Web}", 48 48 'downloadname' => "WebDownload" }, 49 { 'name' => "MediaWiki", 50 'desc' => "{downloadfrom.download_mode.MediaWiki}", 51 'downloadname' => "MediaWikiDownload" }, 49 52 { 'name' => "OAI", 50 53 'desc' => "{downloadfrom.download_mode.OAI}", -
gsdl/branches/gsdl-2.74/bin/script/gti.pl
r13948 r14270 39 39 40 40 41 my $anonymous_cvs_root = ":pserver:cvs_anon\@cvs.scms.waikato.ac.nz:2402/usr/local/global-cvs/gsdl-src"; 41 #my $anonymous_cvs_root = ":pserver:cvs_anon\@cvs.scms.waikato.ac.nz:2402/usr/local/global-cvs/gsdl-src"; 42 #my $anonymous_svn_root = "http://http://svn.greenstone.org/gsdl/trunk/"; 42 43 my $gsdl_root_directory = "$ENV{'GSDLHOME'}"; 43 44 my $gti_log_file = &util::filename_cat($gsdl_root_directory, "etc", "gti.log"); … … 79 80 # 'target_file' => "gsdl-documentation/tutorials/xml-source/tutorial_{target_language_code}.xml" }, 80 81 81 # Greenstone.org82 # new Greenstone.org 82 83 { 'key' => "greenorg", 83 'file_type' => "macrofile", 84 'source_file' => "greenorg/macros/english.dm", 85 'target_file' => "greenorg/macros/{iso_639_1_target_language_name}.dm" } 84 'file_type' => "resource_bundle", 85 'source_file' => "greenstoneorg/website/classes/Gsc.properties", 86 'target_file' => "greenstoneorg/website/classes/Gsc_{iso_639_1_target_language_name}.properties" 87 # 'file_type' => "macrofile", 88 # 'source_file' => "greenorg/macros/english.dm", 89 # 'target_file' => "greenorg/macros/{iso_639_1_target_language_name}.dm" 90 } 86 91 ]; 87 92 … … 111 116 } 112 117 if ($gti_command =~ /^get-first-n-chunks-requiring-work$/i) { 113 print &get_first_n_chunks_requiring_work(@gti_command_arguments); 118 print &get_first_n_chunks_requiring_work(@gti_command_arguments); 114 119 } 115 120 if ($gti_command =~ /^get-language-status$/i) { 116 print &get_language_status(@gti_command_arguments); 121 print &get_language_status(@gti_command_arguments); 117 122 } 118 123 if ($gti_command =~ /^search-chunks$/i) { … … 248 253 my @source_file_lines = &read_file_lines($source_file_path); 249 254 my %source_file_key_to_line_mapping = &build_key_to_line_mapping(\@source_file_lines, $translation_file_type); 250 255 251 256 my $target_file_path = &util::filename_cat($gsdl_root_directory, $target_file); 252 257 my @target_file_lines = 
&read_file_lines($target_file_path); … … 299 304 my $source_file_chunk_date = $source_file_key_to_last_update_date_mapping{$chunk_key}; 300 305 my $source_file_chunk_text = &make_text_xml_safe($source_file_key_to_text_mapping{$chunk_key}); 301 306 307 if(!defined $source_file_chunk_date){ 308 $source_file_chunk_date = ""; 309 } 310 302 311 $xml_response .= " <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n"; 303 $xml_response .= " <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n"; 312 $xml_response .= " <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n"; 304 313 $xml_response .= " <TargetFileText></TargetFileText>\n"; 305 314 $xml_response .= " </Chunk>\n"; … … 325 334 my $target_file_chunk_date = $target_file_key_to_last_update_date_mapping{$chunk_key}; 326 335 my $target_file_chunk_text = &make_text_xml_safe($target_file_key_to_text_mapping{$chunk_key}); 327 328 $xml_response .= " <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n"; 336 337 if(!defined $source_file_chunk_date){ 338 $source_file_chunk_date = ""; 339 } 340 341 $xml_response .= " <Chunk key=\"" . &make_text_xml_safe($chunk_key) . "\">\n"; 329 342 $xml_response .= " <SourceFileText date=\"$source_file_chunk_date\">$source_file_chunk_text</SourceFileText>\n"; 330 343 $xml_response .= " <TargetFileText date=\"$target_file_chunk_date\">$target_file_chunk_text</TargetFileText>\n"; … … 636 649 # The "2>/dev/null" is very important! 
If it is missing this will never return when run from the receptionist 637 650 # unless ($translation_file_is_not_in_cvs) { 638 my $source_file_cvs_status = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root update $source_file 2>/dev/null`; 651 #my $source_file_cvs_status = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root update $source_file 2>/dev/null`; 652 my $source_file_cvs_status = `cd $gsdl_root_directory; svn status $source_file 2>/dev/null`; 639 653 if ($source_file_cvs_status =~ /^C /) { 640 654 &throw_fatal_error("Source file $source_file_path conflicts with the repository."); … … 753 767 my $chunk_cvs_date = $key_to_cvs_date_mapping{$chunk_key}; 754 768 $key_to_last_update_date_mapping{$chunk_key} = $chunk_cvs_date; 755 769 756 770 # If a comment date exists and it is after the CVS date, use that instead 771 # need to convert the comment date format to SVN format 757 772 my $chunk_gti_comment = $key_to_gti_comment_mapping{$chunk_key}; 758 773 if (defined($chunk_gti_comment) && $chunk_gti_comment =~ /(\d?\d-\D\D\D-\d\d\d\d)/) { 759 my $chunk_comment_date = $1; 774 my $chunk_comment_date = $1; 760 775 if ((!defined($chunk_cvs_date) || &is_date_after($chunk_comment_date, $chunk_cvs_date))) { 761 776 $key_to_last_update_date_mapping{$chunk_key} = $chunk_comment_date; … … 774 789 # Use CVS to annotate each line of the file with the date it was last edited 775 790 # The "2>/dev/null" is very important! 
If it is missing this will never return when run from the receptionist 776 my $cvs_annotated_file = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root annotate -F $filename 2>/dev/null`; 791 # my $cvs_annotated_file = `cd $gsdl_root_directory; cvs -d $anonymous_cvs_root annotate -F $filename 2>/dev/null`; 792 # my $cvs_annotated_file = `cd $gsdl_root_directory; export PATH=.:/research/lh92/programs/subversion/bin; svn annotate -v --force $filename`; 793 my $cvs_annotated_file = `cd $gsdl_root_directory; svn annotate -v $filename`; 794 777 795 my @cvs_annotated_file_lines = split(/\n/, $cvs_annotated_file); 778 796 … … 780 798 foreach my $cvs_annotated_file_line (@cvs_annotated_file_lines) { 781 799 # Extract the date from the CVS annotation at the front 782 $cvs_annotated_file_line =~ s/^\S+\s+\(\S+\s+(\S+)\):\s//; 783 push(@cvs_annotated_file_lines_date, $1); 784 } 785 800 # cvs format : 07-Jun-02 801 # svn format : 2007-07-16 802 # $cvs_annotated_file_line =~ s/^\S+\s+\(\S+\s+(\S+)\):\s//; 803 $cvs_annotated_file_line =~ s/^\s+\S+\s+\S+\s(\S+)//; 804 805 push(@cvs_annotated_file_lines_date, $1); 806 807 # trim extra date information in svn annotation format 808 # 15:42:49 +1200 (Wed, 21 Jun 2006) 809 $cvs_annotated_file_line =~ s/^\s+\S+\s\S+\s\((.+?)\)\s//; 810 } 811 786 812 # Build a key to line mapping for the CVS annotated file, for matching the chunk key to the CVS date 787 813 my %key_to_line_mapping = &build_key_to_line_mapping(\@cvs_annotated_file_lines, $translation_file_type); 788 814 789 815 my %key_to_cvs_date_mapping = (); 790 816 foreach my $chunk_key (keys(%key_to_line_mapping)) { 791 817 my $chunk_starting_line = (split(/-/, $key_to_line_mapping{$chunk_key}))[0]; 792 818 my $chunk_finishing_line = (split(/-/, $key_to_line_mapping{$chunk_key}))[1]; 793 819 794 820 # Find the date this chunk was last edited, from the CVS annotation 795 my $chunk_date = $cvs_annotated_file_lines_date[$chunk_starting_line]; 821 my $chunk_date = 
$cvs_annotated_file_lines_date[$chunk_starting_line]; 796 822 for (my $l = ($chunk_starting_line + 1); $l <= $chunk_finishing_line; $l++) { 797 823 if (&is_date_after($cvs_annotated_file_lines_date[$l], $chunk_date)) { 798 824 # This part of the chunk has been updated more recently 799 825 $chunk_date = $cvs_annotated_file_lines_date[$l]; 826 800 827 } 801 828 } … … 861 888 my $source_chunk_last_update_date = $source_file_key_to_last_update_date_mapping->{$chunk_key}; 862 889 my $target_chunk_last_update_date = $target_file_key_to_last_update_date_mapping->{$chunk_key}; 863 if (defined($target_chunk_last_update_date) && &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date)) { 890 891 # print "key: $chunk_key\nsource date : $source_chunk_last_update_date\ntarget date : $target_chunk_last_update_date\nafter? ". &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date) . "\n\n"; 892 893 if (defined($target_chunk_last_update_date) && &is_date_after($source_chunk_last_update_date, $target_chunk_last_update_date)) { 864 894 # &log_message("Chunk with key $chunk_key needs updating."); 865 895 push(@target_file_keys_requiring_updating, $chunk_key); … … 903 933 904 934 # Returns 1 if $date1 is after $date2, 0 otherwise 905 sub is_date_after 935 sub is_date_after_cvs 906 936 { 907 937 my ($date1, $date2) = @_; … … 909 939 "Jul", 7, "Aug", 8, "Sep", 9, "Oct", 10, "Nov", 11, "Dec", 12); 910 940 941 if(!defined $date1) { 942 return 1; 943 } 944 911 945 my @date1parts = split(/-/, $date1); 912 946 my @date2parts = split(/-/, $date2); … … 915 949 my $year1 = $date1parts[2]; 916 950 if ($year1 < 80) { 917 $year1 += 2000;951 $year1 += 2000; 918 952 } 919 953 my $year2 = $date2parts[2]; 920 954 if ($year2 < 80) { 921 $year2 += 2000;955 $year2 += 2000; 922 956 } 923 957 … … 939 973 } 940 974 975 return 0; 976 } 977 978 sub is_date_after 979 { 980 my ($date1, $date2) = @_; 981 982 if(!defined $date1) { 983 return 1; 984 } 985 if(!defined 
$date2) { 986 return 0; 987 } 988 989 # 16-Aug-2006 990 if($date1=~ /(\d+?)-(\S\S\S)-(\d\d\d\d)/){ 991 my %months = ("Jan", "01", "Feb", "02", "Mar", "03", "Apr", "04", "May", "05", "Jun", "06", 992 "Jul", "07", "Aug", "08", "Sep", "09", "Oct", "10", "Nov", "11", "Dec", "12"); 993 $date1=$3 . "-" . $months{$2} . "-" . $1; 994 # print "** converted date1: $date1\n"; 995 } 996 if($date2=~ /(\d+?)-(\S\S\S)-(\d\d\d\d)/){ 997 my %months = ("Jan", "01", "Feb", "02", "Mar", "03", "Apr", "04", "May", "05", "Jun", "06", 998 "Jul", "07", "Aug", "08", "Sep", "09", "Oct", "10", "Nov", "11", "Dec", "12"); 999 $date2=$3 . "-" . $months{$2} . "-" . $1; 1000 # print "** converted date2: $date2\n"; 1001 } 1002 1003 1004 # 2006-08-16 1005 my @date1parts = split(/-/, $date1); 1006 my @date2parts = split(/-/, $date2); 1007 1008 # Compare year 1009 if ($date1parts[0] > $date2parts[0]) { 1010 return 1; 1011 } 1012 elsif ($date1parts[0] == $date2parts[0]) { 1013 # Year is the same, so compare month 1014 if ($date1parts[1] > $date2parts[1]) { 1015 return 1; 1016 } 1017 elsif ($date1parts[1] == $date2parts[1]) { 1018 # Month is the same, so compare day 1019 if ($date1parts[2] > $date2parts[2]) { 1020 return 1; 1021 } 1022 } 1023 } 1024 941 1025 return 0; 942 1026 } -
gsdl/branches/gsdl-2.74/bin/script/mkcol.pl
r14032 r14270 76 76 'reqd' => "no" }, 77 77 { 'name' => "gs3mode", 78 'desc' => " ", 78 'desc' => "mkcol.gs3mode", 79 79 'type' => "flag", 80 80 'reqd' => "no" }, -
gsdl/branches/gsdl-2.74/cgi-bin/gliserver.pl
r14025 r14270 1 1 #!perl -w 2 3 2 # Need to specify the full path of Perl above 4 3 5 4 6 use gsdlCGI;7 5 use strict; 6 7 8 # Set this to 1 to work around IIS 6 craziness 9 my $iis6_mode = 0; 10 11 12 # IIS 6: for some reason, IIS runs this script with the working directory set to the Greenstone 13 # directory rather than the cgi-bin directory, causing lots of stuff to fail 14 if ($iis6_mode) 15 { 16 # Change into cgi-bin directory 17 chdir("cgi-bin"); 18 } 19 20 21 # We use require and an eval here (instead of "use") to catch any errors loading the module (for IIS) 22 eval("require \"gsdlCGI.pm\""); 23 if ($@) 24 { 25 print STDOUT "Content-type:text/plain\n\n"; 26 print STDOUT "ERROR: $@\n"; 27 exit 0; 28 } 8 29 9 30 … … 109 130 sub authenticate_user 110 131 { 111 112 132 my $gsdl_cgi = shift(@_); 113 133 my $username = shift(@_); … … 244 264 my $installation_status = ""; 245 265 266 print STDOUT "Content-type:text/plain\n\n"; 267 246 268 # Check that Java is installed and accessible 247 269 my $java = $gsdl_cgi->get_java_path(); 248 270 my $java_command = "$java -version 2>&1"; 271 272 # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go 273 # directly out to the page 274 if ($iis6_mode) 275 { 276 $java_command = "java -version"; 277 } 278 249 279 my $java_output = `$java_command`; 250 280 my $java_status = $?; … … 265 295 266 296 if ($installation_ok) { 267 $gsdl_cgi->generate_ok_message($installation_status . "\nInstallation OK!");297 print STDOUT $installation_status . 
"\nInstallation OK!"; 268 298 } 269 299 else { 270 $gsdl_cgi->generate_error($installation_status);300 print STDOUT $installation_status; 271 301 } 272 302 } … … 563 593 } 564 594 565 print STDOUT "Content-type:text/plain\n\n";566 595 foreach my $cgi_arg_name ($gsdl_cgi->param) { 567 596 my $cgi_arg_value = $gsdl_cgi->clean_param($cgi_arg_name) || ""; … … 575 604 } 576 605 606 print STDOUT "Content-type:text/plain\n\n"; 607 577 608 my $perl_command = "perl -S $script $perl_args 2>&1"; 609 610 # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go 611 # directly out to the page 612 if ($iis6_mode) 613 { 614 $perl_command = "perl -S $script $perl_args"; 615 } 616 578 617 my $perl_output = `$perl_command`; 579 618 my $perl_status = $?; … … 582 621 } 583 622 584 print STDOUT "Content-type:text/plain\n\n"; 585 print STDOUT $perl_output; 586 623 if (defined($perl_output)) 624 { 625 print STDOUT $perl_output; 626 } 587 627 } 588 628 … … 728 768 } 729 769 770 print STDOUT "Content-type:text/plain\n\n"; 771 730 772 my $perl_command = "perl -S $script $perl_args 2>&1"; 773 774 # IIS 6: redirecting output from STDERR to STDOUT just doesn't work, so we have to let it go 775 # directly out to the page 776 if ($iis6_mode) 777 { 778 $perl_command = "perl -S $script $perl_args"; 779 } 780 731 781 if (!open(PIN, "$perl_command |")) { 732 782 $gsdl_cgi->generate_error("Unable to execute command: $perl_command"); 733 783 } 734 784 735 print STDOUT "Content-type:text/plain\n\n";736 785 while (defined (my $perl_output_line = <PIN>)) { 737 786 print STDOUT $perl_output_line; … … 799 848 800 849 # Read the uploaded data and write it out to file 850 # We have to pass the size of the uploaded data in the "fs" argument because IIS 6 seems to be 851 # completely incapable of working this out otherwise (causing the old code to crash) 801 852 my $buf; 802 853 my $num_bytes = 0; 854 my $num_bytes_remaining = $gsdl_cgi->clean_param("fs"); 855 my 
$bytes_to_read = $num_bytes_remaining; 856 if ($bytes_to_read > 1024) { $bytes_to_read = 1024; } 803 857 binmode(FOUT); 804 while (read(STDIN, $buf, 1024) > 0) {858 while (read(STDIN, $buf, $bytes_to_read) > 0) { 805 859 print FOUT $buf; 806 860 $num_bytes += length($buf); 861 $num_bytes_remaining -= length($buf); 862 $bytes_to_read = $num_bytes_remaining; 863 if ($bytes_to_read > 1024) { $bytes_to_read = 1024; } 807 864 } 808 865 close(FOUT); -
gsdl/branches/gsdl-2.74/cgi-bin/gsdlCGI.pm
r14024 r14270 101 101 print STDOUT $full_mess; 102 102 103 die $full_mess; 103 exit 0; 104 104 } 105 105 -
gsdl/branches/gsdl-2.74/macros/style.dm
r13429 r14270 100 100 # _pagetitle_ 101 101 # _globalscripts_ 102 _htmlhead_ {<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"> 102 _htmlhead_ {<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 103 "http://www.w3.org/TR/html4/loose.dtd"> 103 104 104 105 <html_htmlextra_> -
gsdl/branches/gsdl-2.74/perllib/basebuilder.pm
r14212 r14270 26 26 package basebuilder; 27 27 28 use strict; 29 no strict 'refs'; # allow filehandles to be variables and viceversa 30 28 31 use classify; 29 32 use cfgread; … … 56 59 $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_; 57 60 58 $outhandle = STDERR unless defined $outhandle;61 $outhandle = *STDERR unless defined $outhandle; 59 62 $no_text = 0 unless defined $no_text; 60 $failhandle = STDERR unless defined $failhandle;63 $failhandle = *STDERR unless defined $failhandle; 61 64 62 65 # create a builder object … … 80 83 81 84 $self->{'gli'} = 0 unless defined $self->{'gli'}; 85 86 # disable_OIA applies to greenstone 3 only and is only passed to &colcfg::write_build_cfg_xml (then cfgread4gs3::write_cfg_file) when writing the buildConfig.xml 82 87 $self->{'disable_OAI'} = 0 unless defined $self->{'disable_OAI'}; 83 84 # read in the collection configuration file 85 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg"; 86 if (-e $colcfgname) { 87 ##$self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname); 88 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 89 $gs_mode = "gs2"; 90 } 91 else { 92 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collectionConfig.xml"; 93 if (!-e $colcfgname) { 94 die "mgbuilder::new - couldn't find collectionConfig.xml for collection $collection\n"; 95 } 96 else { 97 #$self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 98 $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname); 99 $gs_mode = "gs3"; 100 } 88 89 # Read in the collection configuration file. 
90 my ($colcfgname); 91 ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle); 92 if ($gs_mode eq "gs2") { 93 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 94 } elsif ($gs_mode eq "gs3") { 95 $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname); 101 96 } 102 97 … … 196 191 my ($buildprocdir, $buildproctype); 197 192 my $collection = $self->{'collection'}; 198 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") { 193 if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") { 194 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib"; 195 $buildproctype = "custombuildproc"; 196 } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") { 197 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; 198 $buildproctype = "custombuildproc"; 199 } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") { 199 200 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; 200 201 $buildproctype = "${collection}buildproc"; … … 240 241 $self->{'maxnumeric'} = $maxnumeric; 241 242 } 242 # It seems we don't need this sub243 #sub set_disable_OAI {244 # my $disable_OAI = shift (@_);245 # my ($disable_OAI = @_;246 #247 # $self->{'disable_OAI'} = $disable_OAI;248 #}249 243 sub set_strip_html { 250 244 my $self = shift (@_); … … 279 273 # and their directory names (includes subcolls and langs) 280 274 $self->{'index_mapping'} = $self->create_index_mapping ($indexes); 281 282 my $indexmap = $self->{'index_mapping'}->{'indexmap'}; 283 275 284 276 # build each of the indexes 285 277 foreach my $index (@$indexes) { … … 351 343 my ($handle); 352 344 if ($self->{'debug'}) { 353 $handle = STDOUT;345 $handle = *STDOUT; 354 346 } else { 355 347 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) { … … 357 349 die "builder::make_infodatabase - couldn't run $txt2db_exe\n"; 358 350 } 359 $handle = basebuilder::PIPEOUT;351 $handle = *PIPEOUT; 360 352 } 361 353 … … 
439 431 $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections(); 440 432 $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes(); 441 442 # store whether to disable OAI service 443 $build_cfg->{'disable_OAI'} = $self->{'disable_OAI'}; 444 433 445 434 # store the mapping between the index names and the directory names 446 435 # the index map is used to determine what indexes there are, so any that are not built should not be put into the map. … … 478 467 479 468 if ($gs_mode eq "gs2") { 480 #&colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'});481 469 &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg); 482 470 } 483 471 if ($gs_mode eq "gs3") { 484 #&colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg); 485 &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'}); 472 &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg'}, $self->{'disable_OAI'}); 486 473 } 487 474 -
gsdl/branches/gsdl-2.74/perllib/cfgread4gs3.pm
r14200 r14270 337 337 } 338 338 339 print "*** collectionConfig.xml internal ***\n";340 &Display;339 #print "*** collectionConfig.xml internal ***\n"; 340 #&Display; 341 341 return $data; 342 342 } … … 350 350 # Create the buildConfig.xml file for a specific collection 351 351 sub write_cfg_file { 352 # this sub is called make_auxiliary_files() in basebuilder.pm352 # this sub is called in make_auxiliary_files() in basebuilder.pm 353 353 # the received args: $buildoutfile - destination file: buildConfig.xml 354 354 # $buildcfg - all build options, eg, disable_OAI 355 355 # $collectcfg - contents of collectionConfig.xml read in by read_cfg_file sub in cfgread4gs3.pm. 356 my ($buildoutfile, $buildcfg, $collectcfg ) = @_;356 my ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI) = @_; 357 357 my $line = []; 358 358 359 359 if (!open (COLCFG, ">$buildoutfile")) { 360 print STDERR "cfgread ::write_cfg_file couldn't write the cfg file $buildoutfile\n";360 print STDERR "cfgread4gs3::write_cfg_file couldn't write the build config file $buildoutfile\n"; 361 361 die; 362 362 } … … 391 391 392 392 # This serviceRack enables the collection to provide the oai metadata retrieve service, which is served by the OAIPMH.java class 393 # For each collection, we write the following serviceRack in the collection's buildConfig.xml file as follows if the 'disable_OAI' argument is not ticked inGLI (or equivalently, a 'disable_OAI' flag is not specified on the command line). There are also other configurations in the OAIConfig.xml.394 if ($ buildcfg->{'disable_OAI'}== 0) {393 # For each collection, we write the following serviceRack in the collection's buildConfig.xml file if the 'disable_OAI' argument is not checked in the GLI (or equivalently, a 'disable_OAI' flag is not specified on the command line). There are also other configurations in the OAIConfig.xml. 
394 if ($disable_OAI == 0) { 395 395 &write_line('COLCFG', ["<serviceRack name=\"OAIPMH\">"]); 396 396 if (defined $buildcfg->{'indexstem'}) { -
gsdl/branches/gsdl-2.74/perllib/classify.pm
r14112 r14270 49 49 50 50 # find the classifier 51 my $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'}, 51 my $customclassname; 52 if (defined($ENV{'GSDLCOLLECTION'})) 53 { 54 $customclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'}, 52 55 "perllib", "classify", "${classifier}.pm"); 56 } 53 57 my $colclassname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib", "classify", "${classifier}.pm"); 54 58 my $mainclassname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib", "classify", "${classifier}.pm"); 55 59 56 if ( -e $customclassname) { require $customclassname; }60 if (defined($customclassname) && -e $customclassname) { require $customclassname; } 57 61 elsif (-e $colclassname) { require $colclassname; } 58 62 elsif (-e $mainclassname) { require $mainclassname; } -
gsdl/branches/gsdl-2.74/perllib/colcfg.pm
r14115 r14270 100 100 } 101 101 sub write_build_cfg_xml { 102 my ($buildoutfile, $buildcfg, $collectcfg ) = @_;102 my ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI) = @_; 103 103 104 return &cfgread4gs3::write_cfg_file ($buildoutfile, $buildcfg, $collectcfg );104 return &cfgread4gs3::write_cfg_file ($buildoutfile, $buildcfg, $collectcfg, $disable_OAI); 105 105 } 106 106 … … 148 148 149 149 return &cfgread::read_cfg_file ($filename, 150 q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/, 150 q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/, 151 151 q/^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields|indexfieldmap|indexlevels|levelmap)$/); 152 152 … … 157 157 158 158 &cfgread::write_cfg_file($filename, $data, 159 q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/, 159 q/^(builddate|buildtype|numdocs|numsections|numwords|numbytes|maxnumeric|textlevel|indexstem|stemindexes)$/, 160 160 q/^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields|indexfieldmap|indexlevels|levelmap)$/); 161 161 } -
gsdl/branches/gsdl-2.74/perllib/plugin.pm
r14112 r14270 48 48 49 49 # find the plugin 50 my $customplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'}, 50 my $customplugname; 51 if (defined($ENV{'GSDLCOLLECTION'})) 52 { 53 $customplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "custom", $ENV{'GSDLCOLLECTION'}, 51 54 'perllib', 'plugins', "${pluginname}.pm"); 55 } 52 56 my $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'plugins', 53 57 "${pluginname}.pm"); 54 58 my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'}, 'perllib', 'plugins', 55 59 "${pluginname}.pm"); 56 if ( -e $customplugname) { require $customplugname; }60 if (defined($customplugname) && -e $customplugname) { require $customplugname; } 57 61 elsif (-e $colplugname) { require $colplugname; } 58 62 elsif (-e $mainplugname) { require $mainplugname; } -
gsdl/branches/gsdl-2.74/perllib/plugins/HTMLPlug.pm
r14089 r14270 1187 1187 1188 1188 foreach my $field (split /,/, $self->{'metadata_fields'}) { 1189 $field =~ s/^\s+//; # remove leading whitespace 1190 $field =~ s/\s+$//; # remove trailing whitespace 1191 1189 1192 # support tag<tagname> 1190 1193 if ($field =~ /^(.*?)<(.*?)>$/) { -
gsdl/branches/gsdl-2.74/perllib/plugins/MediaWikiPlug.pm
r14108 r14270 24 24 # 25 25 ########################################################################### 26 # This plugin is to process an HTML file where sections are divided by 27 # user-defined headings tags. As it is difficult to predict what user's definition 28 # this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...) 29 # as well as allows to get rid of user-defined Table of Content (TOC)... 30 # format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)... 26 # This plugin is to process an HTML file from a MediaWiki website which downloaded by 27 # the MediaWikiDownload plug. This plugin will trim MediaWiki functional sections like 28 # login, discussion, history, etc. Only the navigation and search section could be preserved. 29 # Searchbox will be modified to search the Greenstone collection instead of the website. 30 # It also can automatically add the table of contents on the website's Main_Page to the 31 # collection's Home page. 31 32 32 33 package MediaWikiPlug; 33 34 34 35 use HTMLPlug; 35 use ImagePlug; 36 use File::Copy; 36 # use ImagePlug; 37 # use File::Copy; 38 use unicode; 39 37 40 38 41 #use strict; # every perl program should have this! 
… … 40 43 41 44 sub BEGIN { 42 @MediaWikiPlug::ISA = ('HTMLPlug'); 45 @MediaWikiPlug::ISA = ('HTMLPlug'); 43 46 } 44 47 45 48 my $arguments = 46 49 [ 50 # show the table of contents on collection's home page 47 51 { 'name' => "show_toc", 48 52 'desc' => "{MediaWikiPlug.show_toc}", 49 53 'type' => "flag", 50 54 'reqd' => "no"}, 55 # set to delete the table of contents section on each MediaWiki page 56 { 'name' => "delete_toc", 57 'desc' => "{MediaWikiPlug.delete_toc}", 58 'type' => "flag", 59 'reqd' => "no"}, 60 # regexp to match the table of contents 51 61 { 'name' => "toc_exp", 52 62 'desc' => "{MediaWikiPlug.toc_exp}", 53 63 'type' => "regexp", 54 64 'reqd' => "no", 55 'deft' => "" }, 56 { 'name' => "delete_toc", 57 'desc' => "{MediaWikiPlug.delete_toc}", 58 'type' => "flag", 59 'reqd' => "no"}, 65 'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" }, 66 # set to delete the navigation section 60 67 { 'name' => "delete_nav", 61 68 'desc' => "{MediaWikiPlug.delete_nav}", 62 69 'type' => "flag", 63 70 'reqd' => "no", 64 'deft' => ""}, 65 { 'name' => "nav_exp", 66 'desc' => "{MediaWikiPlug.nav_exp}", 71 'deft' => ""}, 72 # regexp to match the navigation section 73 { 'name' => "nav_div_exp", 74 'desc' => "{MediaWikiPlug.nav_div_exp}", 67 75 'type' => "regexp", 68 76 'reqd' => "no", 69 'deft' => "" }, 70 { 'name' => "tag_sections", 71 'desc' => "{MediaWikiPlug.tag_sections}", 77 'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" }, 78 # set to delete the searchbox section 79 { 'name' => "delete_searchbox", 80 'desc' => "{MediaWikiPlug.delete_searchbox}", 72 81 'type' => "flag", 73 'reqd' => "no"}, 74 { 'name' => "description_tags", 75 'desc' => "{HTMLPlug.description_tags}", 76 'type' => "flag", 77 'reqd' => "no"} 82 'reqd' => "no", 83 'deft' => ""}, 84 # regexp to match the searchbox section 85 { 'name' => "searchbox_div_exp", 86 'desc' => "{MediaWikiPlug.searchbox_div_exp}", 87 'type' => "regexp", 88 'reqd' => "no", 89 'deft' => 
"<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"}, 90 # regexp to match title suffix 91 # can't use the title_sub option in HTMLPlug instead 92 # because title_sub always matches from the begining 93 { 'name' => "remove_title_suffix_exp", 94 'desc' => "{MediaWikiPlug.remove_title_suffix_exp}", 95 'type' => "regexp", 96 'reqd' => "no", 97 'deft' => ""} 78 98 ]; 79 80 99 81 100 my $options = { 'name' => "MediaWikiPlug", … … 85 104 'args' => $arguments }; 86 105 87 88 106 sub new { 89 107 my ($class) = shift (@_); … … 112 130 113 131 $head =~ m/<title>(.+)<\/title>/i; 114 my $doctitle = $1 if defined $1; 132 my $doctitle = $1 if defined $1; 115 133 116 134 if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) { … … 126 144 # set the title here if we haven't found it yet 127 145 if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) { 128 if (defined $doctitle && $doctitle =~ /\S/) { 129 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle); 146 if (defined $doctitle && $doctitle =~ /\S/) { 147 # remove suffix in title if required 148 my $remove_suffix_exp = $self->{'remove_title_suffix_exp'}; 149 if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){ 150 $doctitle =~ s/$remove_suffix_exp//i; 151 } 152 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle); 130 153 } else { 131 154 $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file); 132 155 } 133 } 134 135 if(defined $base_dir && $base_dir ne ""){ 136 # find and download stylesheet 156 } 157 158 # we are only interested in the column-contents div <div id="column-content"> 159 # remove header section, it may contain header images or additional search boxes 160 my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content"; 161 $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg; 162 163 # remove timeline 164 $body_text =~ 
s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg; 165 166 # remove extra bits 167 my $extra_bits = "Retrieved from(.+)</a>\""; 168 $body_text =~ s/$extra_bits//isg; 169 170 $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg; 171 $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg; 172 $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g; 173 $body_text =~ s/( )+/ /sg; 174 175 # get rid of the [edit] buttons 176 $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g; 177 # get rid of the last time edit information at the bottom 178 $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g; 179 # get rid of the (Redirected from ...) 180 $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg; 181 182 # escape texts macros 183 $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg; 184 # may change the links, like Greenstone_Documentation_All.html, then change back 185 $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg; 186 187 # define file delimiter for different platforms 188 my $file_delimiter; 189 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 190 $file_delimiter = "\\"; 191 } else { 192 $file_delimiter = "/"; 193 } 194 195 # IMPORTANT: different delimiter for $base_dir and $file 196 # $base_dir use forward slash for both windows and linux 197 # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import 198 # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import 199 # $file use different delimiters : forward slash for linux; backward slash for windows 200 # print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html 201 # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html 202 203 # get the base url for the MediaWiki website 204 my $safe_delimiter = &safe_escape_regexp($file_delimiter); 205 my @url_dirs=split($safe_delimiter, $file); 206 my $url_base = $url_dirs[0]; 207 208 # 
Re-check css files associated with MediaWiki pages 209 if(defined $base_dir && $base_dir ne ""){ 137 210 my @css_files; 138 211 my $css_file_count = 0; 139 # find all the style sheets imported with import statement 212 213 # find all the stylesheets imported with @import statement 140 214 while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){ 141 $css_files[$css_file_count++] = $2 if defined $2; 142 } 215 $css_files[$css_file_count++] = $2 if defined $2; 216 } 217 218 # download the stylesheets if we haven't downloaded them yet 219 # add prefix to each style elmement, comment out the body element 220 # and copy the files to collection's images folder 221 for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) { 222 223 my $css_file = $css_files[$css_file_count]; 224 225 # remove prefix gli/cache directory 226 $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i; 227 228 # change the \ delimiter in $css_file to / for consistency 229 $css_file =~ s/\\/\//isg; 230 if($css_file !~ /$url_base/) { 231 $css_file = $url_base . $css_file; 232 } 233 234 # trim the ? mark append to the end of a stylesheet 235 $css_file =~ s/\?(.+)$//isg; 236 237 my $css_file_path = &util::filename_cat($base_dir, $css_file); 238 239 # do nothing if we have already downloaded the css files 240 if (! -e $css_file_path) { 241 242 # check the stylesheet's directory in the import folder 243 # if the directory doesn't exist, create one 244 my @dirs = split(/\//i,$css_file); 245 my $path_check = "$base_dir/"; 246 for (my $i = 0; $i < (scalar(@dirs)-1); $i++) { 247 $path_check .= $dirs[$i] . "/"; 248 mkdir($path_check) if (! -d $path_check ); 249 } 250 251 # NOTE: wget needs configuration to directly access Internet 252 # These files should already downloaded if we used the MediaWikiDownload 253 # downloading 254 $css_file = "http://$css_file"; 255 print "\ndownloading : " . $css_file . 
"\n\n"; 256 system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path"); 257 if ($? != 0) { 258 print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n"; 259 print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n"; 260 unlink("$css_file_path"); 261 } 262 } # done with download 263 264 # add a prefix "#wikispecificstyle" to each element 265 # because we want to preserve this website's formats and don't want to mess up with Greenstone formats 266 # so we will wrap the web page with a div with id = wikispecificstyle 267 my $css_content; 268 if(open(INPUT, "<$css_file_path")){ 269 while(my $line = <INPUT>){ 270 # comment out the body element because we change the body to div 271 $line =~ s/^(\s*)body(\s*){(\s*)$/$1\/*body$2*\/{$3/isg; 272 273 if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){ 274 $line = "#wikispecificstyle " . $line; 275 } 276 $css_content .= $line; 277 } 278 close(INPUT); 279 open(OUTPUT, ">$css_file_path"); 280 print OUTPUT $css_content; 281 close(OUTPUT); 282 } 283 284 # Copy the modified stylesheets to collection's images folder 285 # for future customization 286 my $images_dir = $base_dir; 287 $images_dir =~ s/import$/images/; 288 $css_file =~ m/(.*)\/(.*)$/; 289 $images_dir = &util::filename_cat($images_dir, $2); 290 291 if(open(OUTPUT, ">$images_dir")){ 292 print OUTPUT $css_content; 293 close(OUTPUT); 294 } 295 } 296 } 297 298 299 # by default, only preserve navigation box and search box 300 # others like toolbox, interaction, languages box, will be removed 301 302 # extract the larger part -- footer section 303 my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>"; 304 $body_text =~ /$print_footer/; 305 my $footer = ""; 306 $footer = $& if defined $&; 307 $footer =~ s/<\/body>//isg; 308 309 # trim the comments first 310 $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg; 311 312 # contain sections that are to be preserved 313 my 
$preserve_sections = ""; 314 315 # process the navigation section 316 my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>"; 317 if (defined $self->{'nav_div_exp'}) { 318 $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ; 319 } 320 321 if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) { 322 # do nothing 323 } else { 324 if ($footer =~ m/$nav_match_exp/ig) { 325 $preserve_sections = $& ; 326 } else { 327 print $outhandle "Can't find the navigation section with : $nav_match_exp\n"; 328 } 329 # if($preserve_sections =~/\S/){ 330 # $preserve_sections .= "</div>"; 331 # } 332 } 333 334 # process the searchbox section 335 my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>"; 336 if(defined $self->{'searchbox_div_exp'}) { 337 $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/); 338 } 339 340 my $searchbox_section = ""; 341 $footer =~ m/$searchbox_exp/ig; 342 $searchbox_section = $& if defined $&; 343 344 # make the searchbox form work in Greenstone 345 if($searchbox_section =~ /\S/){ 346 # replace action 347 $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg; 348 349 # remove buttons 350 $searchbox_section =~ s/name="search"/name="q"/isg; 351 $searchbox_section =~ s/name="go"//isg; 352 $searchbox_section =~ s/name="fulltext"//isg; 353 354 # get collection name from $base_dir for c param 355 $base_dir =~ m/\/collect\/(.+)\//i; 356 my $collection_name = ""; 357 $collection_name = $1 if defined $1; 358 359 # add Greenstone search params 360 my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n" 361 ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n"; 362 # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n" 363 # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n"; 364 365 $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg; 366 367 # $searchbox_section .= "</div>"; 368 } else { 369 print 
$outhandle "Can't find the searchbox section with : $searchbox_section\n"; 370 } 371 372 # either delete or replace the searchbox 373 if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") { 374 # do nothing 375 } else { 376 $preserve_sections .= "\n$searchbox_section\n"; 377 } 378 379 380 if($preserve_sections ne ""){ 381 $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n"; 382 } 383 $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>"; 384 385 $body_text =~ s/$print_footer/$preserve_sections/isg; 386 387 388 # delete other forms in the page 389 my @forms; 390 my $form_count = 0; 391 while($body_text =~ m/<form([^>]*)name=("|')([^>]*)("|')/isg){ 392 next if($3 eq "q"); 393 $forms[$form_count++] = $&; 394 } 395 foreach my $form (@forms) { 396 $body_text =~ s/$form[\s\S]*?<\/form>//m; 397 } 398 399 400 # process links. 401 # because current WGET 1.10 the -k and -E option doesn't work together 402 # need to 'manually' convert the links to relative links 403 # Dealing with 3 types of links: 404 # -- outgoing links 405 # -- if we have downloaded the target files, link to the internal version (relative link) 406 # -- otherwise, link to the external version (absolute links) 407 # -- in-page links (relative link) 408 409 # NOTE: (important) 410 # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website 411 # otherwise, the internal links may have problems 412 413 # remove the title attribute of <a> tag 414 $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg; 415 416 # extract all the links 417 my @links; 418 my $link_count = 0; 419 while($body_text =~ m/(href|src)="([^>\s]*)$url_base\/([^>\s]*)"/ig){ 420 $links[$link_count++] = "$1=\"$2$url_base/$3\""; 421 } 422 423 foreach my $cur_link (@links) { 424 # escape greedy match + character 425 $cur_link =~ s/\+/\\+/isg; 426 427 $cur_link =~ m/(.+)"([^>]*)$url_base\/([^>\s]*)"/; 428 my $external_file_path = 
"$1\"http://$url_base/$3\""; 429 430 $body_text =~ s/$cur_link/$external_file_path/i; 431 } 432 433 # tag links to new wiki pages as red 434 $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi; 435 436 # tag links to pages external of the MediaWiki website as blue 437 $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi; 438 439 440 # process the table-of-contents section 441 # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file 442 # 1. read _content_ macro from about.dm 443 # 2. append the toc, change all links to the Greenstone internal format for relative links 444 # 3. write to the extra.dm 445 # TODO: we assume the _about:content_ hasn't been specified before 446 # so needs to add function to handle when the macro is already in the extra.dm 447 if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){ 448 449 # extract toc of the Main_Page 450 my $mainpage_toc = ""; 451 my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n"; 452 if($self->{'toc_exp'} =~ /\S/){ 453 $toc_exp = $self->{'toc_exp'}; 454 } 455 if($body_text =~ /$toc_exp/){ 456 $mainpage_toc = $&; 457 } 458 459 if($mainpage_toc =~ /\S/) { 460 461 # change the in-page links to relative links, for example, change <a href="#section1"> to 462 # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1"> 463 my $file_url_format = $file; 464 $file_url_format =~ s/\\/\//isg; 465 $file_url_format = "http://" . 
$file_url_format; 466 467 # encode as URL, otherwise doesn't work on Windows 468 $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; 469 $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg; 470 471 472 # read the collection's extra.dm 473 my $macro_path = $base_dir; 474 $macro_path =~ s/import$/macros/; 475 my $extradm_file = &util::filename_cat($macro_path, "extra.dm"); 476 477 my $extra_dm = ""; 478 if(open(INPUT, "<$extradm_file")){ 479 while(my $line = <INPUT>){ 480 $extra_dm .= $line; 481 } 482 } else { 483 print $outhandle "can't open file $extradm_file\n"; 484 } 485 close(INPUT); 486 487 # check whether we have changed the macros 488 my @packages = split("package ", $extra_dm); 489 my $about_package = ""; 490 foreach my $package (@packages) { 491 $about_package = "package " . $package if($package =~ /^about/); 492 } 493 494 my $update_extra_dm = 0; 495 496 if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){ 497 print $outhandle "_content_ macro already changed!!!!\n"; 498 } 499 # if extra.dm doesn't have an "about package" 500 elsif ($about_package !~ /\S/) { 501 # read _content_ macro from $GSDLHOME/macros/about.dm file 502 my $global_about_package = &read_content_from_about_dm(); 503 504 # create the extra _content_ macro for this collection 505 # add the original content of the _content_ macro 506 $global_about_package =~ m/{(.|\n)*<\/div>\n\n/; 507 508 # append the new about package to extra.dm 509 $extra_dm .= "\n\npackage about\n_content_$&\n\n"; 510 $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}"; 511 512 $update_extra_dm = 1; 513 } 514 # the about package exists, but either doesn't have the _content_ macro or 515 # the _content_ macro doesn't contain the toc 516 else { 517 # check if there is a content macro 518 my $content_macro_existed = 0; 519 $content_macro_existed = ($about_package =~ 
/(\s*|\n)_content_(\s*){/); 520 521 # if there is one 522 # append a new section div for toc to the end of the document section 523 if($content_macro_existed ==1) { 524 $about_package =~ /(\s*|\n)_content_(\s*){(.|\n)*?}/; 525 my $content_macro = $&; 526 my $new_content_macro = $content_macro; 527 $new_content_macro =~ s/<div[^>]*class="document">(.|\n)*<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/; 528 $extra_dm =~ s/$content_macro/$new_content_macro/mg; 529 } 530 # otherwise, append _content_ macro to the about package 531 else { 532 my $new_about_package = $about_package; 533 $content_macro = &read_content_from_about_dm(); 534 $content_macro =~ m/{(.|\n)*<\/div>\n\n/; 535 536 $new_about_package .= "\n\n_content_$&\n\n"; 537 $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}"; 538 $extra_dm =~ s/$about_package/$new_about_package/mg; 539 } 540 541 # either the case, we need to update the extra.dm 542 $update_extra_dm = 1; 543 } 544 545 if($update_extra_dm==1){ 546 # write to the extra.dm file of the collection 547 if (open(OUTPUT, ">$extradm_file")) { 548 print OUTPUT $extra_dm; 549 } else { 550 print "can't open $extradm_file\n"; 551 } 552 close(OUTPUT); 553 } 554 } else { 555 print $outhandle "Main_Page doesn't have a table-of-contents section\n"; 556 } 557 } 143 558 144 # check whether the stylesheet exists 145 # if not, download it and copy to the collection's images folder 146 for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){ 147 my $css_file = $css_files[$css_file_count]; 148 $css_file =~ s/^(.+)gli\/cache\///i; 149 150 my $css_file_path = "$base_dir/$css_file"; 151 152 if (-e $css_file_path){ # the file already exists 153 next; 154 } 155 156 # check the css directory and create one if it's not there 157 my @dirs = split(/\//i,$css_file); 158 my $path_check = "$base_dir/"; 159 for(my $i = 0; $i < (scalar(@dirs)-1); $i++){ 160 $path_check .= $dirs[$i] 
. "/"; 161 if(! -d $path_check ){ 162 mkdir($path_check); 163 } 164 } 165 166 # download 167 $css_file = "http://$css_file"; 168 system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path"); 169 if ($? != 0) {unlink("$css_file_path");} 170 171 # change every style element to #wikispecificstyle ... 172 if(open(INPUT, "<$css_file_path")){ 173 my $css_content; 174 while(my $line = <INPUT>){ 175 if($line =~ m/^(.+)\{/i){ 176 $line = "#wikispecificstyle " . $line; 177 } 178 $css_content .= $line; 179 } 180 close(INPUT); 181 open(OUTPUT, ">$css_file_path"); 182 print OUTPUT $css_content; 183 close(OUTPUT); 184 } 185 186 # copy to images folder 187 # do not copy, because collection can only have one specific stylesheet 188 # better to add and modify the style sheets manually 189 # @dirs = split(/\//i,$base_dir); 190 # my $collection_base_dir; 191 # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){ 192 # $collection_base_dir .= $dirs[$i] . "/"; 193 # } 194 # my $images_folder = $collection_base_dir . "images/"; 195 # copy($css_file_path, $images_folder) || die "File cannot be copied."; 559 # If delete_toc is set, remove toc and tof contents. 
560 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){ 561 if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){ 562 # print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/; 563 if ($body_text =~ /$self->{'toc_exp'}/) { 564 $body_text =~ s/$self->{'toc_exp'}//i; 565 } 196 566 } 197 } 198 199 # add sections around h2 tag 200 # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles 201 # add search box with each section 202 if ($self->{'tag_sections'}) { 203 my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi); 204 for(my $i=1; $i < scalar(@sections); $i++){ 205 my $section_title = $sections[$i]; 206 $section_title =~ s/<([^>]*)>//g; 207 $section_title =~ s/(^\s|\s$)//g; 208 my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n"; 209 if($i !=1){ 210 $section_metadata = "</Section>\n" . $section_metadata; 211 } 212 $section_metadata = "\n<!--\n" . $section_metadata . "-->\n"; 213 214 $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n"; 215 $section_metadata = "</div></div>\n" . $section_metadata if $i !=1; 216 217 $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i; 218 219 if($i==scalar(@sections)-1) { 220 # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i; 221 $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i; 222 } 223 } 224 } 225 226 # If delete_nav is enabled, it means to get rid of navigation contents. 
227 # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){ 228 # if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){ 229 # print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/; 230 # $body_text =~ s/$self->{'nav_exp'}//isg; 231 # } 232 #} 233 my $searchbox = ""; 234 if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){ 235 my $nav_match_express; 236 if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) { 237 $nav_match_express = $self->{'nav_exp'} ; 238 } else { # default setting for mediawiki 239 $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->"; 240 } 241 242 print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/; 243 244 # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg; 245 $body_text =~ m/$nav_match_express/isg; 246 my $navigate = $& if defined $&; 247 248 # find the search box and add it to the document page 249 if(defined $navigate && $navigate =~ /\S/){ 250 $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/; 251 $searchbox = $& . "\n<\/div>\n<\/div>"; 252 $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg; 253 $searchbox =~ s/name="search"/name="q"/isg; 254 $searchbox =~ s/name="go"//isg; 255 $searchbox =~ s/name="fulltext"//isg; 256 my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n" 257 ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n" 258 ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>" 259 ."<input type=\"hidden\" name=\"t\" value=\"1\">"; 260 $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg; 261 $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>"; 262 } 263 264 # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. 
-->/$searchbox/isg; 265 $body_text =~ s/$nav_match_express/$searchbox/isg; 266 } 267 268 if ($self->{'tag_sections'}) { 269 $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig; 270 } 271 272 # Tidy up extra new lines 273 $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg; 274 $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg; 275 276 $section_text .= "<!--\n<Section>\n-->\n"; 277 my $body = "<body".$body_text; 278 279 $$textref = $body; 280 281 # get the base dir for convert absolute links to relative links 282 $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i; 283 my $basedir = $2; 284 285 $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g; 286 $$textref =~ s/( )+/ /sg; 287 288 # get rid of the [edit] button 289 $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g; 290 291 # get rid of the last time edit information at the bottom 292 $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g; 293 294 # get rid of the (Redirected from ...) 295 $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg; 296 297 # escape macros 298 $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg; 299 # may change the links, like Greenstone_Documentation_All.html, then change back 300 $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg; 301 302 # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together 303 # get rid of the title attribute of a tag 304 $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg; 305 # find the relative path of current directory 306 if($basedir ne ""){ 307 my @dirs=split("\/", $file); 308 my $dirnum = scalar(@dirs); 309 my $replace = ""; 310 for(my $i=0; $i<$dirnum-2; $i++){ 311 $replace .= "../"; 312 } 313 # test if the linked relative file exists, if not, link to the internet version 314 $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi; 315 # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi); 316 # print $outhandle "\nnumber of 
total links: " . scalar(@total_links)."\n"; 317 # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){ 318 319 #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){ 320 #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i; 321 # my $prefix = $1; 322 # my $link = $&; 323 # my $rel_file_name = $3; 324 # my $rel_link = "$replace$rel_file_name"; 325 # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n"; 326 # if(-e $rel_link){ 327 # $rel_link = "$prefix=\"$rel_link\""; 328 # $$textref =~ s/$link/$rel_link/i; 329 # }else{ 330 # my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\""; 331 # print $outhandle "external link==> $ext_link\n"; 332 # $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i; 333 # } 334 #} 335 336 337 # tag the link to new wiki pages as red 338 $$textref =~ s/(href|src)="$replace([^>]*)&action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi; 339 $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi; 340 341 # tag the link to external pages as blue 342 $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi; 343 344 #print $outhandle $$textref; 345 } 346 347 # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection 348 # 1. read _content_ macro from about.dm 349 # 2. append the toc, change all links to the Greenstone internal format for relative links 350 # 3. 
write to the extra.dm 351 # TODO: currently we suppose the _about:content_ hasn't been specified before 352 # so needs to add function to handle when the macro is already in the extra.dm 353 if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){ 354 my $macro_path = $base_dir; 355 $macro_path =~ s/import$/macros/; 356 my $extra_dm; 357 my $extradm_file = "$macro_path/extra.dm"; 358 if(open(INPUT, "<$extradm_file")){ 359 while(my $line = <INPUT>){ 360 $extra_dm .= $line; 361 } 362 close(INPUT); 363 364 if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){ 365 print $outhandle "already changed!!!!\n"; 366 } else { 367 # read _content_ macro from about.dm file 368 my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm"; 369 my $about_page_content = ""; 370 if(open(INPUT, "<$about_macro")){ 371 while(my $line=<INPUT>){ 372 $about_page_content .= $line; 373 } 374 }else{ 375 print $outhandle "can't open file $about_macro\n"; 376 } 377 close(INPUT); 378 379 # extract the _content_ macro 380 $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i; 381 $about_page_content = $&; 382 383 # extract toc of the Main_Page 384 my $mainpage_content = ""; 385 if($self->{'toc_exp'} =~ /\S/){ 386 $$textref =~ /$self->{'toc_exp'}/; 387 $mainpage_content = $&; 388 } else { 389 # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs; 390 } 391 # print $outhandle "---------\n$$textref\n--------\n\n"; 392 # print $outhandle "==========\n$mainpage_content\n==========\n\n"; 393 394 # add toc to the _content_ macro 395 $about_page_content =~ m/{(.|\n)*<\/div>\n\n/; 396 $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}"; 397 398 # change all links to the internal Greenstone relative link format 399 $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg; 400 $extra_dm =~ s/(\.\.\/)+/\//isg; 401 # print $outhandle "to 
add---------\n$extra_dm\n--------\n"; 402 403 # write to the extra.dm file of the collection 404 open(OUTPUT, ">$extradm_file"); 405 print OUTPUT $extra_dm; 406 close(OUTPUT); 407 } 408 } else { 409 print $outhandle "can't open file $extradm_file\n"; 410 } 411 } 412 413 # If delete_toc is enabled, it means to get rid of toc and tof contents. 414 # get rid of TOC and TOF sections and their title 415 if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){ 416 if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){ 417 # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg; 418 # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/; 419 # $body_text =~ s/$self->{'toc_exp'}//i; 420 print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/; 421 $$textref =~ s/$self->{'toc_exp'}//i; 422 } 423 } 424 425 # To add a layer on top of the wiki page 426 # so as to keep the wiki style inside the wiki page 427 # and keep the Greenstone style at the same time 428 $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is; 429 $$textref =~ s/<\/body>/<\/div><\/body>/is; 430 431 # tag with sections 432 $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is; 433 $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is; 434 435 #print $outhandle "\n\n$$textref\n\n"; 436 437 # use description tags 438 if ($self->{'description_tags'}) { 439 my $cursection = $doc_obj->get_top_section(); 440 # remove the html header - note that doing this here means any 441 # sections defined within the header will be lost (so all <Section> 442 # tags must appear within the body of the HTML) 443 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is); 444 445 $$textref =~ s/^.*?<body[^>]*>//is; 446 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 447 448 my $opencom = '(?:<!--|<!(?:—|—|--))'; 449 my $closecom = '(?:-->|(?:—|—|--)>)'; 450 451 my $lt = 
'(?:<|<)'; 452 my $gt = '(?:>|>)'; 453 my $quot = '(?:"|"|”|“)'; 454 455 # my $dont_strip = ''; 456 # if ($self->{'no_strip_metadata_html'}) { 457 # ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g; 458 # } 459 460 my $found_something = 0; 461 my $top = 1; 462 while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) { 463 my $text = $1; 464 my $comment = $2; 465 if (defined $text) { 466 # text before a comment - note that getting to here 467 # doesn't necessarily mean there are Section tags in 468 # the document 469 # print $outhandle "section text:\n$text\n"; 470 $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection); 471 } 472 while ($comment =~ s/$lt(.*?)$gt//s) { 473 my $tag = $1; 474 if ($tag eq "Section") { 475 $found_something = 1; 476 $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top; 477 $top = 0; 478 } elsif ($tag eq "/Section") { 479 $found_something = 1; 480 $cursection = $doc_obj->get_parent_section ($cursection); 481 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) { 482 my $metaname = $1; 483 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0; 484 $comment =~ s/^(.*?)$lt\/Metadata$gt//s; 485 my $metavalue = $1; 486 $metavalue =~ s/^\s+//; 487 $metavalue =~ s/\s+$//; 488 # assume that no metadata value intentionally includes 489 # carriage returns or HTML tags (if they're there they 490 # were probably introduced when converting to HTML from 491 # some other format). 492 # actually some people want to have html tags in their 493 # metadata. 
494 $metavalue =~ s/[\cJ\cM]/ /sg; 495 # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/); 496 $metavalue =~ s/\s+/ /sg; 497 # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n"; 498 if ($accumulate) { 499 $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue); 500 } else { 501 $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue); 502 } 503 } elsif ($tag eq "Description" || $tag eq "/Description") { 504 # do nothing with containing Description tags 505 } else { 506 # simple HTML tag (probably created by the conversion 507 # to HTML from some other format) - we'll ignore it and 508 # hope for the best ;-) 509 } 510 } 511 }# end while 512 513 if ($cursection ne "") { 514 print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n"; 515 } 516 517 $$textref =~ s/^.*?<body[^>]*>//is; 518 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 519 if ($$textref =~ /\S/) { 520 if (!$found_something) { 521 if ($self->{'verbosity'} > 2) { 522 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n"; 523 print $outhandle " will be processed as a single section document\n"; 524 } 525 526 # go ahead and process single-section document 527 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 528 529 } else { 530 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n"; 531 print $outhandle " of the final closing </Section> tag. This text will\n"; 532 print $outhandle " be ignored."; 533 534 my ($text); 535 if (length($$textref) > 30) { 536 $text = substr($$textref, 0, 30) . "..."; 537 } else { 538 $text = $$textref; 539 } 540 $text =~ s/\n/ /isg; 541 print $outhandle " ($text)\n"; 542 } 543 } elsif (!$found_something) { 544 if ($self->{'verbosity'} > 2) { 545 # may get to here if document contained no valid Section 546 # tags but did contain some comments. 
The text will have 547 # been processed already but we should print the warning 548 # as above and extract metadata 549 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n"; 550 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 551 } 552 } 553 } # if $self->{'description_tags'} 554 else { 555 # remove header and footer 556 # if (!$self->{'keep_head'}) { 557 # $$textref =~ s/^.*?<body[^>]*>//is; 558 # $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 559 # } 560 561 # single section document 562 # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 563 564 # Important: to get the relative links to work, 565 # 1: use the below statement instead of the above one 566 # 2. cannot have process_section method. 567 # why????? 568 $self->SUPER::process(@_); 569 } 570 return 1; 567 } 568 569 $$textref = "<body" . $body_text; 570 571 # Wrap the whole page with <div id="wikispecificstyle"></div> 572 # keep the style of this website and don't mess up with the Greenstone styles 573 $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is; 574 $$textref =~ s/<\/body>/<\/div><\/body>/is; 571 575 572 #$self->SUPER::process(@_); 576 $self->SUPER::process(@_); 577 578 return 1; 573 579 } 574 575 576 577 # note that process_section may be called multiple times for a single578 # section (relying on the fact that add_utf8_text appends the text to any579 # that may exist already).580 # sub process_section {581 # my $self = shift (@_);582 # my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;583 584 # trap links585 # if (!$self->{'nolinks'}) {586 # usemap="./#index" not handled correctly => change to "#index"587 # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/588 #$self->replace_usemap_links($1, $2, $3)/isge;589 590 #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/591 
#$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;592 #}593 594 # trap images595 596 # allow spaces if inside quotes - jrm21597 #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/598 #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;599 600 # add text to document object601 # turn \ into \\ so that the rest of greenstone doesn't think there602 # is an escape code following. (Macro parsing loses them...)603 # $$textref =~ s/\\/\\\\/go;604 605 # $doc_obj->add_utf8_text($cursection, $$textref);606 #}607 580 608 581 … … 651 624 } 652 625 626 sub safe_escape_regexp 627 { 628 my $regexp = shift (@_); 629 630 # if ($ENV{'GSDLOS'} =~ /^windows$/i) { 631 $regexp =~ s/\\/\\\\/isg; 632 #} else { 633 $regexp =~ s/\//\\\//isg; 634 #} 635 return $regexp; 636 } 637 638 sub read_content_from_about_dm 639 { 640 my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm"); 641 my $about_page_content = ""; 642 if (open(INPUT, "<$about_macro_file")){ 643 while (my $line=<INPUT>){ 644 $about_page_content .= $line; 645 } 646 } else { 647 print $outhandle "can't open file $about_macro_file\n"; 648 } 649 close(INPUT); 650 651 # extract the _content_ macro 652 $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i; 653 $about_page_content = $&; 654 655 return $about_page_content; 656 } 657 653 658 1; -
gsdl/branches/gsdl-2.74/perllib/strings.properties
r14198 r14270 60 60 61 61 # -- buildcol.pl -- 62 buildcol.disable_OAI:tick to make it not providing the OAI service for this collection.63 62 64 63 buildcol.archivedir:Where the archives live. … … 153 152 downloadfrom.download_mode:The type of server to download from 154 153 downloadfrom.download_mode.Web:HTTP 154 downloadfrom.download_mode.MediaWiki:MediaWiki website 155 155 downloadfrom.download_mode.OAI: Open Archives Initiative 156 156 downloadfrom.download_mode.z3950:z3950 server … … 547 547 GenericList.desc:A general and flexible list classifier with most of the abilities of AZCompactList, but with better Unicode, metadata and sorting capabilities. 548 548 GenericList.metadata:Metadata fields used for classification. Use '/' to separate the levels in the hierarchy and ';' to separate metadata fields within each level. 549 GenericList.partition_name_length:The length of the partition name; defaults to a variable length from 1 up to 3 characters, depending on how many are required to distinguish the partition start from its end. This option only applies when partition_type_within_level is set to 'constant_size'. 549 550 GenericList.partition_size_within_level:The number of items in each partition (only applies when partition_type_within_level is set to 'constant_size'). 550 551 GenericList.partition_type_within_level:The type of partitioning done: either 'per_letter', 'constant_size', or 'none'. … … 861 862 862 863 MARCXMLPlug.metadata_mapping_file:Name of file that includes mapping details from MARC values to Greenstone metadata names. Defaults to 'marctodc.txt' found in the site's etc directory. 864 865 MediaWikiPlug.desc:Plugin for importing MediaWiki web pages 866 867 MediaWikiPlug.show_toc: Add to the collection's About page the 'table of contents' on the MediaWiki website's main page. Needs to specify a Perl regular expression in toc_exp below to match the 'table of contents' section. 
868 869 MediaWikiPlug.delete_toc:Delete the 'table of contents' section on each HTML page. Requires a Perl regular expression in toc_exp below to match the 'table of contents' section. 870 871 MediaWikiPlug.toc_exp:A Perl regular expression to match the 'table of contents'. The default value matches common MediaWiki web pages. 872 873 MediaWikiPlug.delete_nav:Delete the navigation section. Requires a Perl regular expression in nav_div_exp below. 874 875 MediaWikiPlug.nav_div_exp:A Perl regular expression to match the navigation section. The default value matches common MediaWiki web pages. 876 877 MediaWikiPlug.delete_searchbox:Delete the searchbox section. Requires a Perl regular expression in searchbox_div_exp below. 878 879 MediaWikiPlug.searchbox_div_id:A Perl regular expression to match the searchbox section. The default value matches common MediaWiki web pages. 880 881 MediaWikiPlug.remove_title_suffix_exp:A Perl regular expression to trim the extracted title. For example, \\s-(.+) will trim the title contents after "-". 863 882 864 883 MetadataCSVPlug.desc:A plugin for metadata in comma-separated value format. The Filename field in the CSV file is used to determine which document the metadata belongs to. … … 1047 1066 1048 1067 BasDownload.desc:Base class for Download modules 1068 1069 MediaWikiDownload.desc:A module for downloading from MediaWiki websites 1070 MediaWikiDownload.reject_filetype:List of URL patterns to ignore, separated by commas, e.g. *cgi-bin*,*.ppt ignores hyperlinks that contain either 'cgi-bin' or '.ppt' 1071 MediaWikiDownload.reject_filetype_disp:List of URL patterns to ignore, separated by commas 1072 MediaWikiDownload.exclude_directories:List of directories to exclude (each must be an absolute path), e.g. /people,/documentation will exclude the 'people' and 'documentation' subdirectories of the site currently being crawled. 
1073 MediaWikiDownload.exclude_directories_disp:List of directories to exclude, separated by commas 1049 1074 1050 1075 OAIDownload.desc:A module for downloading from OAI repositories -
gsdl/branches/gsdl-2.74/src/recpt/authenaction.cpp
r14014 r14270 33 33 #include "infodbclass.h" 34 34 #include "gsdltimes.h" 35 #include "userdb.h"36 35 37 36 … … 129 128 130 129 void authenaction::configure (const text_t &key, const text_tarray &cfgline) { 131 // get the password filename132 if (cfgline.size() == 1) {133 if (key == "usersfile") usersfile = cfgline[0];134 else if (key == "keyfile") keyfile = cfgline[0];135 else if (key == "keydecay") keydecay = cfgline[0].getint();136 }137 138 130 action::configure (key, cfgline); 139 131 } 140 132 141 133 bool authenaction::init (ostream &logout) { 142 143 134 if (gdbmhome.empty()) { 144 135 logout << "ERROR (authenaction::init) gdbmhome is not set\n"; 145 136 return false; 146 137 } 147 148 if (usersfile.empty()) usersfile = filename_cat (gdbmhome, "etc", "users.db");149 if (keyfile.empty()) keyfile = filename_cat (gdbmhome, "etc", "key.db");150 138 151 139 return action::init (logout); … … 169 157 if (args["uan"].empty()) return true; 170 158 171 userdbclass *user_database = new userdbclass(usersfile);172 keydbclass *key_database = new keydbclass(keyfile);173 174 159 // failure means we have to redirect to this action to get authentication 175 160 // (if we are not already doing this) … … 188 173 else args_us = "failed"; 189 174 190 // make sure we have a username 191 if (!args_un.empty() && (user_database->get_user_info (args_un, thisuser) == ERRNO_SUCCEED)) { 175 // make sure we have a username 176 int status = user_database->get_user_info (args_un, thisuser); 177 if (!args_un.empty() && (status == ERRNO_SUCCEED)) { 192 178 if (!args_pw.empty()) { 193 179 // we are authenticating using a password … … 286 272 } 287 273 288 //close the database289 user_database->closedatabase();290 key_database->closedatabase();291 274 return true; 292 275 } -
gsdl/branches/gsdl-2.74/src/recpt/authenaction.h
r7432 r14270 33 33 #include "action.h" 34 34 #include "text_t.h" 35 #include "userdb.h" 35 36 #include "receptionist.h" 36 37 … … 41 42 class authenaction : public action { 42 43 protected: 43 text_t usersfile;44 text_t keyfile;44 userdbclass *user_database; 45 keydbclass *key_database; 45 46 int keydecay; 46 47 … … 50 51 authenaction (); 51 52 virtual ~authenaction () {} 53 54 void set_userdb(userdbclass *udb) {user_database = udb;} 55 56 void set_keydb (keydbclass *kdb) {key_database = kdb;} 52 57 53 58 void set_receptionist (receptionist *therecpt) {recpt=therecpt;} -
gsdl/branches/gsdl-2.74/src/recpt/librarymain.cpp
r12517 r14270 178 178 recpt.add_action (adocumentaction); 179 179 180 text_t userdbfile = filename_cat(gsdlhome, "etc", "users.db"); 181 userdbclass *udb = new userdbclass(userdbfile); 182 183 text_t keydbfile = filename_cat(gsdlhome, "etc", "key.db"); 184 keydbclass *kdb = new keydbclass(keydbfile); 185 180 186 #ifdef GSDL_USE_USERS_ACTION 181 recpt.add_action (new usersaction()); 187 usersaction *ausersaction = new usersaction(); 188 ausersaction->set_userdb(udb); 189 recpt.add_action (ausersaction); 182 190 #endif 183 191 … … 190 198 #ifdef GSDL_USE_AUTHEN_ACTION 191 199 authenaction *aauthenaction = new authenaction(); 200 aauthenaction->set_userdb(udb); 201 aauthenaction->set_keydb(kdb); 192 202 aauthenaction->set_receptionist(&recpt); 193 203 recpt.add_action (aauthenaction); … … 272 282 cgiwrapper (recpt, ""); 273 283 delete cservers; 284 delete udb; 285 delete kdb; 274 286 275 287 // clean up the actions -
gsdl/branches/gsdl-2.74/src/recpt/userdb.cpp
r14013 r14270 77 77 userdbclass::userdbclass(const text_t &userdbfilename) 78 78 { 79 activated = (!userdb.opendatabase(userdbfilename, GDBM_WRCREAT, 1000, true)) ? false : true; 79 storeduserdbfilename = userdbfilename; 80 activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true)) ? false : true; 81 if (activated == false) 82 { 83 activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true)) ? false : true; 84 if (activated == true) 85 { 86 userdb.closedatabase(); 87 activated = (!userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true)) ? false : true; 88 } 89 } 90 80 91 external_db = false; 81 92 } … … 227 238 info["groups"] = userinfo.groups; 228 239 info["comment"] = userinfo.comment; 229 230 return (userdb.setinfo (username, info)) ? ERRNO_SUCCEED : ERRNO_GDBMACTIONFILED ; 240 userdb.closedatabase(); 241 userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true); 242 int result = (userdb.setinfo (username, info)) ? ERRNO_SUCCEED : ERRNO_GDBMACTIONFILED; 243 userdb.closedatabase(); 244 userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true); 245 return result; 231 246 } 232 247 return ERRNO_CONNECTIONFAILED; … … 290 305 if (activated == true) 291 306 { 307 userdb.closedatabase(); 308 userdb.opendatabase(storeduserdbfilename, GDBM_WRCREAT, 1000, true); 292 309 userdb.deletekey (username); 310 userdb.closedatabase(); 311 userdb.opendatabase(storeduserdbfilename, GDBM_READER, 1000, true); 293 312 return ERRNO_SUCCEED; 294 313 } … … 335 354 return ERRNO_CONNECTIONFAILED; 336 355 } 337 338 //an alernative way to colse the database if the class can't reach the destructor339 void userdbclass::closedatabase()340 {341 userdb.closedatabase();342 }343 344 356 //==========================================// 345 357 // userdbclass functions (End) // … … 351 363 keydbclass::keydbclass(const text_t &keydbfilename) 352 364 { 353 activated = (!keydb.opendatabase(keydbfilename, GDBM_WRCREAT, 1000, true)) ? 
false : true; 365 storedkeydbfilename = keydbfilename; 366 activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true)) ? false : true; 367 if (activated == false) 368 { 369 activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true)) ? false : true; 370 if (activated == true) 371 { 372 keydb.closedatabase(); 373 activated = (!keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true)) ? false : true; 374 } 375 } 354 376 external_db = false; 355 377 } … … 399 421 keydata["time"] = time2text(time(NULL)); 400 422 423 keydb.closedatabase(); 424 keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true); 401 425 if (!keydb.setinfo (crypt_userkey, keydata)) 402 426 { 403 427 userkey.clear(); // failed 404 428 } 429 keydb.closedatabase(); 430 keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true); 405 431 406 432 return userkey; … … 434 460 // succeeded, update the key's time 435 461 info["time"] = time2text(time(NULL)); 462 keydb.closedatabase(); 463 keydb.opendatabase(storedkeydbfilename, GDBM_WRCREAT, 1000, true); 436 464 keydb.setinfo (crypt_key, info); 465 keydb.closedatabase(); 466 keydb.opendatabase(storedkeydbfilename, GDBM_READER, 1000, true); 437 467 return true; 438 468 } … … 477 507 } 478 508 } 479 480 //an alernative way to colse the database if the class can't reach the destructor481 void keydbclass::closedatabase()482 {483 keydb.closedatabase();484 }485 509 //==========================================// 486 510 // keydbclass functions (End) // -
gsdl/branches/gsdl-2.74/src/recpt/userdb.h
r14015 r14270 64 64 bool external_db; 65 65 bool activated; 66 text_t storeduserdbfilename; 66 67 67 68 public: … … 114 115 // on success 115 116 int get_user_list (text_tarray &userlist); 116 117 //an alernative way to colse the database if the class can't reach the destructor118 void closedatabase();119 117 }; 120 118 … … 126 124 bool external_db; 127 125 bool activated; 126 text_t storedkeydbfilename; 128 127 129 128 public: … … 146 145 // use sparingly, it can be quite an expensive function 147 146 void remove_old_keys (int keydecay); 148 149 //an alernative way to colse the database if the class can't reach the destructor150 void closedatabase();151 147 }; 152 148 -
gsdl/branches/gsdl-2.74/src/recpt/usersaction.cpp
r13844 r14270 147 147 148 148 void usersaction::configure (const text_t &key, const text_tarray &cfgline) { 149 // get the password filename150 if (cfgline.size() == 1) {151 if (key == "usersfile") usersfile = cfgline[0];152 }153 154 149 action::configure (key, cfgline); 155 150 } … … 161 156 return false; 162 157 } 163 164 if (usersfile.empty()) usersfile = filename_cat (gdbmhome, "etc", "users.db");165 158 166 159 return action::init (logout); … … 193 186 outconvertclass &outconvert, ostream &textout, 194 187 ostream &logout) { 195 196 // open the user database (it will be used a lot)197 user_database = new userdbclass(usersfile);198 188 199 189 if (args["uma"] == "adduser" || args["uma"] == "edituser") { -
gsdl/branches/gsdl-2.74/src/recpt/usersaction.h
r13844 r14270 32 32 #include "gsdlconf.h" 33 33 #include "action.h" 34 #include "userdb.h" 34 35 #include "text_t.h" 35 #include "userdb.h"36 36 37 37 38 38 class usersaction : public action { 39 39 protected: 40 text_t usersfile; 41 userdbclass* user_database; 40 userdbclass *user_database; 42 41 43 42 public: … … 48 47 49 48 bool init (ostream &logout); 49 50 void set_userdb(userdbclass *udb) {user_database = udb;} 50 51 51 52 text_t get_action_name () {return "um";}
Note:
See TracChangeset
for help on using the changeset viewer.