Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlug.pm@ 14337

Last change on this file since 14337 was 14337, checked in by anna, 17 years ago
Fixed a bug in extracting search box.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Rev	Line
[14251]	1	###########################################################################
	2	#
	3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
	27	# the MediaWikiDownload plug. It trims MediaWiki functional sections such as
	28	# login, discussion, history, etc. Only the navigation and search sections can be preserved.
	29	# The search box is modified to search the Greenstone collection instead of the website.
	30	# It can also automatically add the table of contents from the website's Main_Page to the
	31	# collection's Home page.
	32
	33	package MediaWikiPlug;
	34
	35	use HTMLPlug;
	36	# use ImagePlug;
	37	# use File::Copy;
	38	use unicode;
	39
	40
	41	#use strict; # every perl program should have this!
	42	#no strict 'refs'; # make an exception so we can use variables as filehandles
	43
	# Declare HTMLPlug as the parent class at compile time, so inherited
	# methods can be resolved as soon as this package has been loaded.
	BEGIN {
	    @MediaWikiPlug::ISA = qw(HTMLPlug);
	}
	47
	# Plugin-specific arguments. Each entry describes one option:
	#   name - option name as given on the plugin line
	#   desc - resource key of the description string
	#   type - "flag" (boolean switch) or "regexp" (pattern supplied by the user)
	#   reqd - whether the option is mandatory
	#   deft - default value (optional)
	#
	# The regexp defaults are written with q{} so that the backslashes reach
	# the regex engine unchanged: \" matches a double quote and \n a newline.
	my $arguments =
	    [
	      # show the table of contents on collection's home page
	      { 'name' => "show_toc",
	        'desc' => "{MediaWikiPlug.show_toc}",
	        'type' => "flag",
	        'reqd' => "no" },
	      # set to delete the table of contents section on each MediaWiki page
	      { 'name' => "delete_toc",
	        'desc' => "{MediaWikiPlug.delete_toc}",
	        'type' => "flag",
	        'reqd' => "no" },
	      # regexp to match the table of contents:
	      # everything from <table ... id="toc" up to the last </table> + newline
	      { 'name' => "toc_exp",
	        'desc' => "{MediaWikiPlug.toc_exp}",
	        'type' => "regexp",
	        'reqd' => "no",
	        'deft' => q{<table([^>]*)id=(\"|')toc(\"|')(.|\n)*</table>\n} },
	      # set to delete the navigation section
	      { 'name' => "delete_nav",
	        'desc' => "{MediaWikiPlug.delete_nav}",
	        'type' => "flag",
	        'reqd' => "no",
	        'deft' => "" },
	      # regexp to match the navigation section:
	      # from <div ... id="p-navigation" up to the first </div>
	      { 'name' => "nav_div_exp",
	        'desc' => "{MediaWikiPlug.nav_div_exp}",
	        'type' => "regexp",
	        'reqd' => "no",
	        'deft' => q{<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?</div>} },
	      # set to delete the searchbox section
	      { 'name' => "delete_searchbox",
	        'desc' => "{MediaWikiPlug.delete_searchbox}",
	        'type' => "flag",
	        'reqd' => "no",
	        'deft' => "" },
	      # regexp to match the searchbox section:
	      # from <div ... id="p-search" up to the first </div>
	      { 'name' => "searchbox_div_exp",
	        'desc' => "{MediaWikiPlug.searchbox_div_exp}",
	        'type' => "regexp",
	        'reqd' => "no",
	        'deft' => q{<div([^>]*)id=(\"|')p-search(\"|')(.|\n)*?</div>} },
	      # regexp to match title suffix
	      # can't use the title_sub option in HTMLPlug instead
	      # because title_sub always matches from the beginning
	      { 'name' => "remove_title_suffix_exp",
	        'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
	        'type' => "regexp",
	        'reqd' => "no",
	        'deft' => "" }
	    ];

	# Plugin description record; 'args' points at the argument list above.
	my $options = { 'name'     => "MediaWikiPlug",
	                'desc'     => "{MediaWikiPlug.desc}",
	                'abstract' => "no",
	                'inherits' => "yes",
	                'args'     => $arguments };
	105
	# Constructor.
	#
	#   $class           - class name being instantiated
	#   $pluginlist      - array ref; this class name is appended to it
	#   $inputargs       - passed through unchanged to the HTMLPlug constructor
	#   $hashArgOptLists - hash ref; this plugin's arguments are appended to
	#                      its "ArgList" and its option record to its "OptList"
	#
	# Returns the object built by HTMLPlug, re-blessed into $class.
	sub new {
	    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

	    push(@{$pluginlist}, $class);

	    if (defined $arguments) {
	        push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
	    }
	    if (defined $options) {
	        push(@{ $hashArgOptLists->{"OptList"} }, $options);
	    }

	    # Explicit method call rather than the indirect-object form
	    # "new HTMLPlug(...)", which is ambiguous to parse.
	    my $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
	    return bless $self, $class;
	}
	117
	118
	119
	# Build the text of a _content_ macro that consists of the default
	# macro body read from about.dm followed by a section holding the toc.
	# The result starts with "_content_" and ends with the closing "}".
	sub _toc_content_macro {
	    my ($mainpage_toc) = @_;

	    my $default_macro = read_content_from_about_dm();
	    my $macro_body = "";
	    # keep everything from the opening brace up to the last "</div>\n\n"
	    if (defined $default_macro && $default_macro =~ m/\{(?:.|\n)*<\/div>\n\n/) {
	        $macro_body = $&;
	    }

	    return "_content_$macro_body\n\n"
	         . "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
	}

	# Clean up one downloaded MediaWiki page and hand it on to HTMLPlug.
	#
	#   $textref  - reference to the page text; rewritten in place
	#   $pluginfo - not used here, passed on to the parent class
	#   $base_dir - the collection's import directory (forward slashes)
	#   $file     - file name relative to $base_dir (platform delimiter)
	#   $metadata - passed to extract_metadata and on to the parent class
	#   $doc_obj  - document object that receives the Title metadata
	#   $gli      - not used here, passed on to the parent class
	#
	# Steps: set the title, strip wiki-only markup, fix up the stylesheets,
	# replace the page footer with the preserved navigation/search sections,
	# remove other forms, make site links absolute, optionally publish the
	# Main_Page table of contents in the collection's extra.dm, optionally
	# delete the toc, and wrap the page in <div id="wikispecificstyle">.
	# Everything before the first <body is dropped from the text.
	#
	# Returns 1.
	sub process {
	    # shift (rather than list-assign) so that @_ still holds the
	    # original arguments for the SUPER::process call at the end
	    my $self = shift (@_);
	    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	    my $outhandle = $self->{'outhandle'};

	    print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;

	    my @head_and_body = split(/<body/i, $$textref);
	    my $head = shift(@head_and_body);
	    $head = "" unless defined $head;
	    my $body_text = join("<body", @head_and_body);

	    my $doctitle;
	    if ($head =~ m/<title>(.+)<\/title>/i) {
	        $doctitle = $1;
	    }

	    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
	        # the metadata lives between <xml> and </xml> in the head
	        my @doc_properties = split(/<xml>/i, $head);
	        shift(@doc_properties);
	        my $rest_doc_properties = join(" ", @doc_properties);

	        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	        my $extracted_metadata = shift(@extracted_metadata);
	        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	    }

	    # set the title here if we haven't found it yet
	    if (!defined $doc_obj->get_metadata_element($doc_obj->get_top_section(), "Title")) {
	        if (defined $doctitle && $doctitle =~ /\S/) {
	            # remove suffix in title if required
	            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
	            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
	                $doctitle =~ s/$remove_suffix_exp//i;
	            }
	            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	        } else {
	            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
	        }
	    }

	    # we are only interested in the column-contents div <div id="column-content">
	    # remove header section, it may contain header images or additional search boxes
	    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
	    $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;

	    # remove timeline
	    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

	    # remove extra bits
	    my $extra_bits = "Retrieved from(.+)</a>\"";
	    $body_text =~ s/$extra_bits//isg;

	    # drop empty office paragraphs and collapse runs of non-breaking spaces
	    # NOTE(review): the &nbsp; entities are restored on the assumption that
	    # they were decoded to plain spaces when this listing was captured --
	    # confirm against the repository copy.
	    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
	    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
	    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
	    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

	    # get rid of the [edit] buttons
	    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
	    # get rid of the last time edit information at the bottom
	    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
	    # get rid of the (Redirected from ...)
	    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

	    # escape texts macros
	    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
	    # may change the links, like Greenstone_Documentation_All.html, then change back
	    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

	    # define file delimiter for different platforms
	    # IMPORTANT: different delimiter for $base_dir and $file
	    #   $base_dir uses forward slash on both windows and linux
	    #   $file uses forward slash on linux and backward slash on windows
	    my $file_delimiter = "/";
	    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
	        $file_delimiter = "\\";
	    }

	    # get the base url for the MediaWiki website: the first path
	    # component of $file, e.g. greenstone.sourceforge.net
	    my @url_dirs = split(/\Q$file_delimiter\E/, $file);
	    my $url_base = $url_dirs[0];
	    $url_base = "" unless defined $url_base;

	    # Re-check css files associated with MediaWiki pages
	    if (defined $base_dir && $base_dir ne "") {
	        my @css_files;

	        # find all the stylesheets imported with @import statement
	        while ($head =~ m{<style type="text/css"(.+)import "(.+)"}ig) {
	            push(@css_files, $2) if defined $2;
	        }

	        # download the stylesheets if we haven't downloaded them yet
	        # add prefix to each style element, comment out the body element
	        # and copy the files to collection's images folder
	        foreach my $imported_css (@css_files) {
	            my $css_file = $imported_css;

	            # remove prefix gli/cache directory
	            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

	            # change the \ delimiter in $css_file to / for consistency
	            $css_file =~ s/\\/\//g;
	            if ($css_file !~ /\Q$url_base\E/) {
	                $css_file = $url_base . $css_file;
	            }

	            # trim the ? mark append to the end of a stylesheet
	            $css_file =~ s/\?(.+)$//isg;

	            my $css_file_path = util::filename_cat($base_dir, $css_file);

	            # do nothing if we have already downloaded the css files
	            if (! -e $css_file_path) {
	                # check the stylesheet's directory in the import folder
	                # if the directory doesn't exist, create one
	                my @dirs = split(/\//, $css_file);
	                pop(@dirs);    # last component is the file name itself
	                my $path_check = "$base_dir/";
	                foreach my $dir (@dirs) {
	                    $path_check .= $dir . "/";
	                    mkdir($path_check) if (! -d $path_check);
	                }

	                # NOTE: wget needs configuration to directly access Internet
	                # These files should already be downloaded if we used the
	                # MediaWikiDownload downloading
	                $css_file = "http://$css_file";
	                print "\ndownloading : " . $css_file . "\n\n";
	                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
	                if ($? != 0) {
	                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
	                    print "[ERROR] OR use the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
	                    unlink("$css_file_path");
	                }
	            } # done with download

	            # add a prefix "#wikispecificstyle" to each element
	            # because we want to preserve this website's formats and don't
	            # want to mess up with Greenstone formats, so we will wrap the
	            # web page with a div with id = wikispecificstyle
	            my $css_content;
	            if (open(my $css_in, '<', $css_file_path)) {
	                $css_content = "";
	                while (my $line = <$css_in>) {
	                    # comment out the body element because we change the body to div
	                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

	                    # The stylesheet is rewritten for every page processed,
	                    # so skip lines that already carry the prefix.
	                    if ($line !~ m/^\s*#wikispecificstyle\b/
	                        && ($line =~ m/^(.+)\{/ || $line =~ m/^(\s)*#/)) {
	                        $line = "#wikispecificstyle " . $line;
	                    }
	                    $css_content .= $line;
	                }
	                close($css_in);

	                if (open(my $css_out, '>', $css_file_path)) {
	                    print $css_out $css_content;
	                    close($css_out);
	                } else {
	                    print $outhandle "can't write file $css_file_path\n";
	                }
	            }

	            # nothing to copy if the stylesheet could not be read
	            next unless defined $css_content;

	            # Copy the modified stylesheets to collection's images folder
	            # for future customization
	            my $images_dir = $base_dir;
	            $images_dir =~ s/import$/images/;
	            my ($css_basename) = $css_file =~ m/([^\/]*)$/;
	            my $images_css_path = util::filename_cat($images_dir, $css_basename);

	            if (open(my $images_out, '>', $images_css_path)) {
	                print $images_out $css_content;
	                close($images_out);
	            }
	        }
	    }

	    # by default, only preserve navigation box and search box
	    # others like toolbox, interaction, languages box, will be removed

	    # extract the larger part -- footer section
	    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
	    my $footer = "";
	    if ($body_text =~ /$print_footer/) {
	        $footer = $&;
	    }
	    $footer =~ s/<\/body>//isg;

	    # trim the comments first
	    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

	    # contain sections that are to be preserved
	    my $preserve_sections = "";

	    # process the navigation section
	    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
	    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
	        $nav_match_exp = $self->{'nav_div_exp'};
	    }

	    my $delete_nav = (defined $self->{'delete_nav'} && $self->{'delete_nav'} eq "1");
	    if (!$delete_nav) {
	        # no /g here: a scalar //g match would leave pos($footer) set and
	        # make the searchbox match below start after the navigation div
	        if ($footer =~ m/$nav_match_exp/i) {
	            $preserve_sections = $&;
	        } else {
	            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
	        }
	    }

	    # process the searchbox section
	    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
	    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
	        $searchbox_exp = $self->{'searchbox_div_exp'};
	    }

	    my $searchbox_section = "";
	    if ($footer =~ m/$searchbox_exp/i) {
	        $searchbox_section = $&;
	    }

	    # make the searchbox form work in Greenstone
	    if ($searchbox_section =~ /\S/) {
	        # replace action
	        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

	        # remove buttons
	        $searchbox_section =~ s/name="search"/name="q"/isg;
	        $searchbox_section =~ s/name="go"//isg;
	        $searchbox_section =~ s/name="fulltext"//isg;

	        # get collection name from $base_dir for c param
	        my $collection_name = "";
	        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
	            $collection_name = $1;
	        }

	        # add Greenstone search params
	        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
	            . "<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

	        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
	    } else {
	        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
	    }

	    # either delete or replace the searchbox
	    my $delete_searchbox = (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1");
	    if (!$delete_searchbox) {
	        $preserve_sections .= "\n$searchbox_section\n";
	    }

	    if ($preserve_sections ne "") {
	        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
	    }
	    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

	    $body_text =~ s/$print_footer/$preserve_sections/isg;

	    # delete other forms in the page
	    my @forms;
	    while ($body_text =~ m/<form([^>]*)name=("|')([^>"']*)("|')/isg) {
	        next if ($3 eq "searchform");
	        push(@forms, $&);
	    }
	    foreach my $form (@forms) {
	        # $form is literal page text, not a pattern
	        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
	    }

	    # process links.
	    # because current WGET 1.10 the -k and -E option doesn't work together
	    # need to 'manually' convert the links
	    # NOTE: (important)
	    # must use the MediaWikiDownload in GLI Download Panel to download files
	    # from a MediaWiki website, otherwise the internal links may have problems

	    # remove the title attribute of <a> tag
	    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

	    # rewrite every href/src that points into the wiki site so that it
	    # becomes an absolute http:// link on $url_base
	    if ($url_base ne "") {
	        $body_text =~ s/(href|src)="[^>\s]*\Q$url_base\E\/([^>\s]*)"/$1="http:\/\/$url_base\/$2"/ig;
	    }

	    # tag links to new wiki pages as red
	    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

	    # tag links to pages external of the MediaWiki website as blue
	    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;

	    # process the table-of-contents section
	    # if 'show_toc' is set, add Main_Page's toc to the collection's About page,
	    # change extra.dm file
	    # 1. read _content_ macro from about.dm
	    # 2. append the toc, change all links to the Greenstone internal format
	    # 3. write to the extra.dm
	    my $show_toc = (defined $self->{'show_toc'} && $self->{'show_toc'} eq "1");
	    if ($show_toc && $file =~ m/Main_Page\.(?:html|htm)$/) {

	        # extract toc of the Main_Page
	        my $mainpage_toc = "";
	        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
	        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
	            $toc_exp = $self->{'toc_exp'};
	        }
	        if ($body_text =~ /$toc_exp/) {
	            $mainpage_toc = $&;
	        }

	        if ($mainpage_toc =~ /\S/) {

	            # change the in-page links to relative links, for example, change <a href="#section1"> to
	            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
	            my $file_url_format = $file;
	            $file_url_format =~ s/\\/\//g;
	            $file_url_format = "http://" . $file_url_format;

	            # encode as URL, otherwise doesn't work on Windows
	            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
	            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

	            # read the collection's extra.dm
	            my $macro_path = $base_dir;
	            $macro_path =~ s/import$/macros/;
	            my $extradm_file = util::filename_cat($macro_path, "extra.dm");

	            my $extra_dm = "";
	            if (open(my $dm_in, '<', $extradm_file)) {
	                while (my $line = <$dm_in>) {
	                    $extra_dm .= $line;
	                }
	                close($dm_in);
	            } else {
	                print $outhandle "can't open file $extradm_file\n";
	            }

	            # check whether we have changed the macros
	            my $about_package = "";
	            foreach my $package (split("package ", $extra_dm)) {
	                $about_package = "package " . $package if ($package =~ /^about/);
	            }

	            my $update_extra_dm = 0;

	            if ($about_package =~ /\S/
	                && $about_package =~ m/_content_\s*\{/
	                && index($about_package, $mainpage_toc) >= 0) {
	                print $outhandle "_content_ macro already changed!!!!\n";
	            }
	            # if extra.dm doesn't have an "about package"
	            elsif ($about_package !~ /\S/) {
	                # append a new about package holding the default content
	                # of the _content_ macro plus the toc
	                $extra_dm .= "\n\npackage about\n" . _toc_content_macro($mainpage_toc);
	                $update_extra_dm = 1;
	            }
	            # the about package exists, but either doesn't have the
	            # _content_ macro or the _content_ macro doesn't contain the toc
	            else {
	                if ($about_package =~ /(?:\s|\n)*_content_\s*\{(?:.|\n)*?\}/) {
	                    # append a new section div for the toc to the end of
	                    # the document section of the existing macro
	                    my $content_macro = $&;
	                    my $new_content_macro = $content_macro;
	                    $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
	                    $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
	                }
	                # otherwise, append _content_ macro to the about package
	                else {
	                    my $new_about_package = $about_package
	                        . "\n\n" . _toc_content_macro($mainpage_toc);
	                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
	                }

	                # either the case, we need to update the extra.dm
	                $update_extra_dm = 1;
	            }

	            if ($update_extra_dm == 1) {
	                # write to the extra.dm file of the collection
	                if (open(my $dm_out, '>', $extradm_file)) {
	                    print $dm_out $extra_dm;
	                    close($dm_out);
	                } else {
	                    print $outhandle "can't open $extradm_file\n";
	                }
	            }
	        } else {
	            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
	        }
	    }

	    # If delete_toc is set, remove the table of contents.
	    if (defined $self->{'delete_toc'} && $self->{'delete_toc'} eq "1") {
	        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
	            $body_text =~ s/$self->{'toc_exp'}//i;
	        }
	    }

	    $$textref = "<body" . $body_text;

	    # Wrap the whole page with <div id="wikispecificstyle"></div>
	    # keep the style of this website and don't mess up with the Greenstone styles
	    $$textref =~ s/(<body[^>]*>)/$1\n<div id="wikispecificstyle">\n/is;
	    $$textref =~ s/<\/body>/<\/div><\/body>/is;

	    $self->SUPER::process(@_);

	    return 1;
	}
	578
	579
	# Extract <o:FIELD>value</o:FIELD> metadata from a chunk of text and add
	# it to the top section of the document.
	#
	#   $textref  - the text to search. Despite the name it is used as a plain
	#               string, not a reference. Nothing happens when undefined.
	#   $metadata - not used here
	#   $doc_obj  - document object receiving the metadata
	#
	# $self->{'metadata_fields'} is a comma separated list. Each entry is
	# either "name" (look for <o:name>, store as "name") or "tag<name>"
	# (look for <o:tag>, store as "name"). Tag matching is case-insensitive.
	sub extract_metadata
	{
	    my $self = shift (@_);
	    my ($textref, $metadata, $doc_obj) = @_;

	    return if (!defined $textref);

	    foreach my $field (split /,/, $self->{'metadata_fields'}) {
	        # by default the tag to look for is also the greenstone name
	        my ($source_tag, $gs_name) = ($field, $field);

	        # support tag<tagname>: the part in angle brackets is the
	        # user's preferred greenstone metadata name
	        if ($field =~ /^(.*?)<(.*?)>$/) {
	            ($source_tag, $gs_name) = ($1, $2);
	        }

	        # the tag name is literal text, so quote it inside the pattern
	        if ($textref =~ m/<o:\Q$source_tag\E>(.*)<\/o:\Q$source_tag\E>/i) {
	            my $value = $1;
	            chomp($value); # remove trailing \n, if any
	            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
	        }
	    }

	    return;
	}
	623
	# Return a copy of the given string in which every backslash and every
	# forward slash is preceded by a backslash, so that a file delimiter
	# can be used inside a regular expression.
	sub safe_escape_regexp
	{
	    my ($pattern) = @_;

	    # one pass over both delimiter characters
	    $pattern =~ s{([\\/])}{\\$1}g;

	    return $pattern;
	}
	635
	# Read the _content_ macro from $GSDLHOME/macros/about.dm.
	#
	# Takes no arguments. Returns the text of the macro, from "_content_ {"
	# up to the "}" that follows the last "</div>\n\n</div>\n" sequence, or
	# the empty string when the file cannot be opened or holds no such macro.
	sub read_content_from_about_dm
	{
	    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");

	    my $about_page_content = "";
	    if (open(my $about_in, '<', $about_macro_file)) {
	        while (my $line = <$about_in>) {
	            $about_page_content .= $line;
	        }
	        close($about_in);
	    } else {
	        # the plugin's output handle is not available in this plain
	        # function, so report on STDERR
	        print STDERR "can't open file $about_macro_file\n";
	    }

	    # extract the _content_ macro
	    if ($about_page_content =~ m/(_content_ \{(?:.|\n)*<\/div>\n\n<\/div>\n\})/i) {
	        return $1;
	    }

	    return "";
	}
	655
	656	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: