Context Navigation

MediaWikiPlugin.pm@ 32589

Last change on this file since 32589 was 32129, checked in by kjdon, 6 years ago
After () in a regex, {} signifys quantifiers. eg (xx){2,4} - 2-4 occurrences. In later perl versions, it is illegal to have an unescaped { after a ) in a regex. If you actually want to match { you need to escape it. So I have escaped all { following ) in regex
Property svn:keywords set to `Author Date Id Revision`
File size: 26.7 KB

Rev	Line
[14662]	1	###########################################################################
	2	#
[15872]	3	# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
[14662]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
	27	# the MediaWikiDownload plug. This plugin will trim MediaWiki functional sections like
	28	# login, discussion, history, etc. Only the navigation and search section could be preserved.
	29	# Searchbox will be modified to search the Greenstone collection instead of the website.
	30	# It also can automatically add the table of contents on the website's Main_Page to the
	31	# collection's Home page.
	32
[15872]	33	package MediaWikiPlugin;
[14662]	34
[15872]	35	use HTMLPlugin;
[14662]	36	use unicode;
[28560]	37	use util;
	38	use FileUtils;
[14662]	39
[15887]	40	use strict; # every perl program should have this!
	41	no strict 'refs'; # make an exception so we can use variables as filehandles
[14662]	42
	43
	# Establish inheritance at compile time: MediaWikiPlugin derives from
	# HTMLPlugin.
	BEGIN {
	    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
	}
	47
	48	my $arguments =
		# Option descriptors specific to this plugin; new() appends them to the
		# "ArgList" entry it hands on to HTMLPlugin. Each entry gives the option
		# name, a 'desc' key in "{...}" form (presumably resolved from a strings
		# resource -- confirm), its type, whether it is required, and optionally
		# a default ('deft').
		# NOTE(review): the regexp defaults below show "\|" and bracket groups
		# without quantifiers, which may be extraction damage rather than the real
		# patterns -- verify against the repository copy.
	49	[
	50	# show the table of contents on collection's home page
	51	{ 'name' => "show_toc",
[15872]	52	'desc' => "{MediaWikiPlugin.show_toc}",
[14662]	53	'type' => "flag",
	54	'reqd' => "no"},
	55	# set to delete the table of contents section on each MediaWiki page
	56	{ 'name' => "delete_toc",
[15872]	57	'desc' => "{MediaWikiPlugin.delete_toc}",
[14662]	58	'type' => "flag",
	59	'reqd' => "no"},
	60	# regexp to match the table of contents
	61	{ 'name' => "toc_exp",
[15872]	62	'desc' => "{MediaWikiPlugin.toc_exp}",
[14662]	63	'type' => "regexp",
	64	'reqd' => "no",
	65	'deft' => "<table([^>])id=(\\\"\|')toc(\\\"\|')(.\|\\n)?</table>\\n" },
	66	# set to delete the navigation section
	67	{ 'name' => "delete_nav",
[15872]	68	'desc' => "{MediaWikiPlugin.delete_nav}",
[14662]	69	'type' => "flag",
	70	'reqd' => "no",
	71	'deft' => ""},
	72	# regexp to match the navigation section
	73	{ 'name' => "nav_div_exp",
[15872]	74	'desc' => "{MediaWikiPlugin.nav_div_exp}",
[14662]	75	'type' => "regexp",
	76	'reqd' => "no",
	77	'deft' => "<div([^>])id=(\\\"\|')p-navigation(\\\"\|')(.\|\\n)?<\/div>" },
	78	# set to delete the searchbox section
	79	{ 'name' => "delete_searchbox",
[15872]	80	'desc' => "{MediaWikiPlugin.delete_searchbox}",
[14662]	81	'type' => "flag",
	82	'reqd' => "no",
	83	'deft' => ""},
	84	# regexp to match the searchbox section
	85	{ 'name' => "searchbox_div_exp",
[15872]	86	'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
[14662]	87	'type' => "regexp",
	88	'reqd' => "no",
	89	'deft' => "<div([^>])id=(\\\"\|')p-search(\\\"\|')(.\|\\n)?<\/div>"},
	90	# regexp to match title suffix
[15872]	91	# can't use the title_sub option in HTMLPlugin instead
[14662]	92	# because title_sub always matches from the beginning
	93	{ 'name' => "remove_title_suffix_exp",
[15872]	94	'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
[14662]	95	'type' => "regexp",
	96	'reqd' => "no",
	97	'deft' => ""}
	98	];
	99
		# Plugin description record; new() appends it to the "OptList" entry and
		# it carries the argument list above under 'args'.
[15872]	100	my $options = { 'name' => "MediaWikiPlugin",
	101	'desc' => "{MediaWikiPlugin.desc}",
[14662]	102	'abstract' => "no",
	103	'inherits' => "yes",
	104	'args' => $arguments };
	105
	# Constructor.
	#
	# Arguments (after the class name):
	#   $pluginlist      - array ref; this class name is appended to it
	#   $inputargs       - passed through unchanged to HTMLPlugin's constructor
	#   $hashArgOptLists - hash ref; this plugin's argument descriptors and its
	#                      option record are appended to the "ArgList" and
	#                      "OptList" entries before delegating
	#
	# Returns the object built by HTMLPlugin->new, re-blessed into $class.
	sub new {
	    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

	    push @{$pluginlist}, $class;
	    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
	    push @{ $hashArgOptLists->{"OptList"} }, $options;

	    # Explicit class-method call instead of the indirect-object form
	    # ("new HTMLPlugin(...)"), which perl parses heuristically and which
	    # can silently bind to a local sub named "new".
	    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
	    return bless $self, $class;
	}
	117
	118
	119
	120	sub process {
		# Cleans up one downloaded MediaWiki HTML page and then hands it to the
		# parent class's process(). In order, it: sets the Title metadata if none
		# is present, strips the page header, timeline, edit links and similar
		# extras, fetches/prefixes the @import-ed stylesheets, replaces the footer
		# with only the navigation and search boxes, removes other forms, rewrites
		# links, optionally copies the Main_Page table of contents into the
		# collection's extra.dm, optionally deletes the page's own toc, and wraps
		# the body in <div id="wikispecificstyle">.
		# The text behind $textref is modified in place. Always returns 1.
		# NOTE(review): many patterns in this sub contain "\|" and bracket groups
		# with no quantifier (e.g. "([^>])id="), and two contain "$...$" where an
		# escaped parenthesis pair would be expected; this looks like damage from
		# how the file was extracted -- confirm against the repository copy.
	121	my $self = shift (@_);
	122	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	123	my $outhandle = $self->{'outhandle'};
	124
		# everything before the first "<body" is treated as the head
	125	my @head_and_body = split(/<body/i,$$textref);
	126	my $head = shift(@head_and_body);
	127	my $body_text = join("<body", @head_and_body);
	128
	129	$head =~ m/<title>(.+)<\/title>/i;
		# NOTE(review): the match above is not checked, so $1 may be left over
		# from an earlier successful match; "my ... if" is also a construct whose
		# behaviour perlsyn describes as undefined -- confirm and restructure.
	130	my $doctitle = $1 if defined $1;
	131
	132	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
		# take the text between the first <xml> and the following </xml> in the head
	133	my @doc_properties = split(/<xml>/i,$head);
	134	my $doc_heading = shift(@doc_properties);
	135	my $rest_doc_properties = join(" ", @doc_properties);
	136
	137	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	138	my $extracted_metadata = shift (@extracted_metadata);
	139	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	140	}
	141
	142	# set the title here if we haven't found it yet
	143	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	144	if (defined $doctitle && $doctitle =~ /\S/) {
	145	# remove suffix in title if required
	146	my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
	147	if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
	148	$doctitle =~ s/$remove_suffix_exp//i;
	149	}
	150	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	151	} else {
	152	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	153	}
	154	}
	155
	156	# we are only interested in the column-contents div <div id="column-content">
	157	# remove header section, it may contain header images or additional search boxes
	158	my $header_exp = "<div([^>])id=(\"\|')container(\"\|')([^>])>(.\|\\n)<div([^>])id=(\"\|')column-content";
	159	if($body_text =~ /$header_exp/){
	160	$body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
	161	} else {
	162	$header_exp = "(.\|\\n)?<div([^>])?id=(\"\|')column-content";
	163	if($body_text =~ /$header_exp/){
	164	$body_text =~ s/$header_exp/<div$2id='column-content/i;
	165	}
	166	}
	167
	168	# remove timeline
	169	$body_text =~ s/<div([^>])class=("\|')smwtimeline("\|')[\s\S]?<\/div>//mg;
	170
	171	# remove extra bits
	172	my $extra_bits = "Retrieved from(.+)</a>\"";
	173	$body_text =~ s/$extra_bits//isg;
	174
	175	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	176	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
	177	$body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
		# NOTE(review): as shown this only collapses runs of plain spaces; an
		# entity such as &nbsp; may have been lost in extraction -- confirm.
	178	$body_text =~ s/( )+/ /sg;
	179
	180	# get rid of the [edit] buttons
	181	$body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
	182	# get rid of the last time edit information at the bottom
	183	$body_text =~ s/<a href="([^>])edit([^>])"([^>]?)>(\w+)<\/a> \d\d:\d\d,([\s\|\w]?)$PST$//g;
	184	# get rid of the (Redirected from ...)
	185	$body_text =~ s/$Redirected from <a ([^>])>(\w\|\s)?<\/a>$//isg;
	186
	187	# escape texts macros
	188	$body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
	189	# may change the links, like Greenstone_Documentation_All.html, then change back
	190	$body_text =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
	191
	192	# define file delimiter for different platforms
	193	my $file_delimiter;
	194	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	195	$file_delimiter = "\\";
	196	} else {
	197	$file_delimiter = "/";
	198	}
	199
	200	# IMPORTANT: different delimiter for $base_dir and $file
	201	# $base_dir use forward slash for both windows and linux
	202	# print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
	203	# linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
	204	# $file use different delimiters : forward slash for linux; backward slash for windows
[15872]	205	# print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
[14662]	206	# linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
	207
	208	# get the base url for the MediaWiki website
		# (the first path component of $file)
	209	my $safe_delimiter = &safe_escape_regexp($file_delimiter);
	210	my @url_dirs=split($safe_delimiter, $file);
	211	my $url_base = $url_dirs[0];
	212
	213	# Re-check css files associated with MediaWiki pages
	214	if(defined $base_dir && $base_dir ne ""){
	215	my @css_files;
	216	my $css_file_count = 0;
	217
	218	# find all the stylesheets imported with @import statement
	219	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
	220	$css_files[$css_file_count++] = $2 if defined $2;
	221	}
[28560]	222
	223	# Set the env for wget once, outside the for loop
	224	# the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
	225	&util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
[14662]	226
	227	# download the stylesheets if we haven't downloaded them yet
	228	# add prefix to each style element, comment out the body element
[19123]	229	# and copy the files to collection's style folder
[14662]	230	for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
	231
	232	my $css_file = $css_files[$css_file_count];
	233
	234	# remove prefix gli/cache directory
	235	$css_file =~ s/^(.+)gli(\\\|\/)cache(\\\|\/)//i;
	236
	237	# change the \ delimiter in $css_file to / for consistency
	238	$css_file =~ s/\\/\//isg;
	239	if($css_file !~ /$url_base/) {
	240	$css_file = $url_base . $css_file;
	241	}
	242
	243	# trim the ? mark append to the end of a stylesheet
	244	$css_file =~ s/\?(.+)$//isg;
	245
[28560]	246	my $css_file_path = &FileUtils::filenameConcatenate($base_dir, $css_file);
[14662]	247
	248	# do nothing if we have already downloaded the css files
	249	if (! -e $css_file_path) {
	250
	251	# check the stylesheet's directory in the import folder
	252	# if the directory doesn't exist, create one
	253	my @dirs = split(/\//i,$css_file);
	254	my $path_check = "$base_dir/";
	255	for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
	256	$path_check .= $dirs[$i] . "/";
	257	mkdir($path_check) if (! -d $path_check );
	258	}
[28560]	259
[14662]	260	# NOTE: wget needs configuration to directly access Internet
	261	# These files should already be downloaded if we used the MediaWikiDownload
	262	# downloading
	263	$css_file = "http://$css_file";
	264	print "\ndownloading : " . $css_file . "\n\n";
		# list-form system(): the arguments are handed to wget as-is, not via a shell
	265	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
	266	if ($? != 0) {
	267	print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
	268	print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
	269	unlink("$css_file_path");
	270	}
	271	} # done with download
	272
	273	# add a prefix "#wikispecificstyle" to each element
	274	# because we want to preserve this website's formats and don't want to mess up with Greenstone formats
	275	# so we will wrap the web page with a div with id = wikispecificstyle
		# NOTE(review): if the open below fails, $css_content stays undefined but
		# is still printed to the style-folder copy further down.
	276	my $css_content;
	277	if(open(INPUT, "<$css_file_path")){
	278	while(my $line = <INPUT>){
	279	# comment out the body element because we change the body to div
[32129]	280	$line =~ s/^(\s)body(\s)\{(\s)$/$1\/body$2*\/{$3/isg;
[14662]	281
	282	if($line =~ m/^(.+)\{/i \|\| $line =~ m/^(\s)*#/i){
	283	if($line !~ m/wikispecificstyle/i){
	284	$line = "#wikispecificstyle " . $line;
	285	}
	286	}
	287
	288	$css_content .= $line;
	289	}
	290	close(INPUT);
		# the prefixed stylesheet overwrites the file it was read from
	291	open(OUTPUT, ">$css_file_path");
	292	print OUTPUT $css_content;
	293	close(OUTPUT);
	294	}
	295
[19123]	296	# Copy the modified stylesheets to collection's style folder
[14662]	297	# for future customization
[19123]	298	my $style_dir = $base_dir;
	299	$style_dir =~ s/import$/style/;
[14662]	300	$css_file =~ m/(.)\/(.)$/;
[28560]	301	$style_dir = &FileUtils::filenameConcatenate($style_dir, $2);
[14662]	302
[19123]	303	if(open(OUTPUT, ">$style_dir")){
[14662]	304	print OUTPUT $css_content;
	305	close(OUTPUT);
	306	}
	307	}
	308	}
	309
	310
	311	# by default, only preserve navigation box and search box
	312	# others like toolbox, interaction, languages box, will be removed
	313
	314	# extract the larger part -- footer section
	315	my $print_footer = "<div class=\"printfooter\">(.\|\n)+</body>";
	316	$body_text =~ /$print_footer/;
	317	my $footer = "";
	318	$footer = $& if defined $&;
	319	$footer =~ s/<\/body>//isg;
	320
	321	# trim the comments first
	322	$footer =~ s/<!--[\s\S]?--[ \t\n\r]>//isg;
	323
	324	# contain sections that are to be preserved
	325	my $preserve_sections = "";
	326
	327	# process the navigation section
		# the built-in pattern is used unless the nav_div_exp option is non-blank
	328	my $nav_match_exp = "<div([^>])id=(\"\|')p-navigation(\"\|')(.\|\n)?<\/div>";
	329	if (defined $self->{'nav_div_exp'}) {
	330	$nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
	331	}
	332
	333	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
	334	# do nothing
	335	} else {
	336	if ($footer =~ m/$nav_match_exp/ig) {
	337	$preserve_sections = $& ;
	338	} else {
	339	print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
	340	}
	341	# if($preserve_sections =~/\S/){
	342	# $preserve_sections .= "</div>";
	343	# }
	344	}
	345
	346	# process the searchbox section
	347	my $searchbox_exp = "<div([^>])id=(\"\|')p-search(\"\|')(.\|\\n)?<\/div>";
	348	if(defined $self->{'searchbox_div_exp'}) {
	349	$searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
	350	}
	351
	352	my $searchbox_section = "";
	353	$footer =~ m/$searchbox_exp/ig;
	354	$searchbox_section = $& if defined $&;
	355
	356	# make the searchbox form work in Greenstone
	357	if($searchbox_section =~ /\S/){
	358	# replace action
	359	$searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
	360
	361	# remove buttons
	362	$searchbox_section =~ s/name="search"/name="q"/isg;
	363	$searchbox_section =~ s/name="go"//isg;
	364	$searchbox_section =~ s/name="fulltext"//isg;
	365
	366	# get collection name from $base_dir for c param
	367	$base_dir =~ m/\/collect\/(.+)\//i;
	368	my $collection_name = "";
	369	$collection_name = $1 if defined $1;
	370
	371	# add Greenstone search params
	372	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
	373	."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
	374	# ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
	375	# ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
	376
	377	$searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
	378
	379	# $searchbox_section .= "</div>";
	380	} else {
		# NOTE(review): this prints $searchbox_section, which has no non-blank
		# content on this branch; $searchbox_exp was presumably intended.
	381	print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
	382	}
	383
	384	# either delete or replace the searchbox
	385	if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
	386	# do nothing
	387	} else {
	388	$preserve_sections .= "\n$searchbox_section\n";
	389	}
	390
	391	if($preserve_sections ne ""){
	392	$preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
	393	}
	394	$preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
	395
		# swap the whole printfooter..</body> span for the preserved sections
	396	$body_text =~ s/$print_footer/$preserve_sections/isg;
	397
	398
	399	# delete other forms in the page
	400	my @forms;
	401	my $form_count = 0;
	402	while($body_text =~ m/<form([^>])name=("\|')([^>"'])?("\|')/isg){
	403	next if($3 eq "searchform");
	404	$forms[$form_count++] = $&;
	405	}
	406	foreach my $form (@forms) {
	407	$body_text =~ s/$form[\s\S]*?<\/form>//m;
	408	}
	409
	410	# process links.
	411	# because current WGET 1.10 the -k and -E option doesn't work together
	412	# need to 'manually' convert the links to relative links
	413	# Dealing with 3 types of links:
	414	# -- outgoing links
	415	# -- if we have downloaded the target files, link to the internal version (relative link)
	416	# -- otherwise, link to the external version (absolute links)
	417	# -- in-page links (relative link)
	418
	419	# NOTE: (important)
	420	# must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
	421	# otherwise, the internal links may have problems
	422
	423	# remove the title attribute of <a> tag
	424	$body_text =~ s/<a([^>])title="(.?)"/<a$1/isg;
	425
	426	# extract all the links
	427	my @links;
	428	my $link_count = 0;
	429	while($body_text =~ m/(href\|src)="([^>\s])$url_base\/([^>\s])"/ig){
	430	$links[$link_count++] = "$1=\"$2$url_base/$3\"";
	431	}
	432
	433	foreach my $cur_link (@links) {
	434	# escape greedy match + character
	435	$cur_link =~ s/\+/\\+/isg;
	436
	437	$cur_link =~ m/(.+)"([^>])$url_base\/([^>\s])"/;
	438	my $external_file_path = "$1\"http://$url_base/$3\"";
	439
	440	$body_text =~ s/$cur_link/$external_file_path/i;
	441	}
	442
	443	# tag links to new wiki pages as red
		# NOTE(review): the replacement text has a ")" after $2, which ends up in
		# the emitted tag -- looks unintended; the 'external text' rule that
		# follows has the same ")".
	444	$body_text =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
	445
	446	# tag links to pages external of the MediaWiki website as blue
	447	$body_text =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
	448
	449
	450	# process the table-of-contents section
	451	# if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
	452	# 1. read _content_ macro from about.dm
	453	# 2. append the toc, change all links to the Greenstone internal format for relative links
	454	# 3. write to the extra.dm
	455	# TODO: we assume the _about:content_ hasn't been specified before
	456	# so needs to add function to handle when the macro is already in the extra.dm
	457	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
	458
	459	# extract toc of the Main_Page
	460	my $mainpage_toc = "";
	461	my $toc_exp = "<table([^>])id=(\"\|')toc(\"\|')(.\|\\n)</table>\\n";
	462	if($self->{'toc_exp'} =~ /\S/){
	463	$toc_exp = $self->{'toc_exp'};
	464	}
	465	if($body_text =~ /$toc_exp/){
	466	$mainpage_toc = $&;
	467	}
	468
	469	if($mainpage_toc =~ /\S/) {
	470
	471	# change the in-page links to relative links, for example, change <a href="#section1"> to
	472	# <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
	473	my $file_url_format = $file;
	474	$file_url_format =~ s/\\/\//isg;
	475	$file_url_format = "http://" . $file_url_format;
	476
	477	# encode as URL, otherwise doesn't work on Windows
	478	$file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
	479	$mainpage_toc =~ s/<a href="([^>"#])#([^>"])"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
	480
	481
	482	# read the collection's extra.dm
	483	my $macro_path = $base_dir;
	484	$macro_path =~ s/import$/macros/;
[28560]	485	my $extradm_file = &FileUtils::filenameConcatenate($macro_path, "extra.dm");
[14662]	486
	487	my $extra_dm = "";
	488	if(open(INPUT, "<$extradm_file")){
	489	while(my $line = <INPUT>){
	490	$extra_dm .= $line;
	491	}
	492	} else {
	493	print $outhandle "can't open file $extradm_file\n";
	494	}
	495	close(INPUT);
	496
	497	# check whether we have changed the macros
		# (keeps the last chunk of extra.dm that starts with "about" after "package ")
	498	my @packages = split("package ", $extra_dm);
	499	my $about_package = "";
	500	foreach my $package (@packages) {
	501	$about_package = "package " . $package if($package =~ /^about/);
	502	}
	503
	504	my $update_extra_dm = 0;
	505
[32129]	506	if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/ && $about_package =~ m/$mainpage_toc/){
[14662]	507	print $outhandle "_content_ macro already changed!!!!\n";
	508	}
	509	# if extra.dm doesn't have an "about package"
	510	elsif ($about_package !~ /\S/) {
	511	# read _content_ macro from $GSDLHOME/macros/about.dm file
[15887]	512	my $global_about_package = $self->read_content_from_about_dm();
[14662]	513
	514	# create the extra _content_ macro for this collection
	515	# add the original content of the _content_ macro
	516	$global_about_package =~ m/{(.\|\n)*<\/div>\n\n/;
	517
	518	# append the new about package to extra.dm
	519	$extra_dm .= "\n\npackage about\n_content_$&\n\n";
	520	$extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
	521
	522	$update_extra_dm = 1;
	523	}
	524	# the about package exists, but either doesn't have the _content_ macro or
	525	# the _content_ macro doesn't contain the toc
	526	else {
	527	# check if there is a content macro
	528	my $content_macro_existed = 0;
[32129]	529	$content_macro_existed = ($about_package =~ /(\s\|\n)_content_(\s)\{/);
[14662]	530
	531	# if there is one
	532	# append a new section div for toc to the end of the document section
	533	if($content_macro_existed ==1) {
[32129]	534	$about_package =~ /(\s\|\n)_content_(\s)\{(.\|\n)*?}/;
[14662]	535	my $content_macro = $&;
	536	my $new_content_macro = $content_macro;
	537	$new_content_macro =~ s/<div[^>]class="document">(.\|\n)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
	538	$extra_dm =~ s/$content_macro/$new_content_macro/mg;
	539	}
	540	# otherwise, append _content_ macro to the about package
	541	else {
	542	my $new_about_package = $about_package;
		# NOTE(review): called as a plain function here (no $self), unlike the
		# $self->read_content_from_about_dm() call in the earlier branch.
[15887]	543	my $content_macro = &read_content_from_about_dm();
[14662]	544	$content_macro =~ m/{(.\|\n)*<\/div>\n\n/;
	545
	546	$new_about_package .= "\n\n_content_$&\n\n";
	547	$new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
	548	$extra_dm =~ s/$about_package/$new_about_package/mg;
	549	}
	550
	551	# in either case, we need to update the extra.dm
	552	$update_extra_dm = 1;
	553	}
	554
	555	if($update_extra_dm==1){
	556	# write to the extra.dm file of the collection
	557	if (open(OUTPUT, ">$extradm_file")) {
	558	print OUTPUT $extra_dm;
	559	} else {
	560	print "can't open $extradm_file\n";
	561	}
	562	close(OUTPUT);
	563	}
	564	} else {
	565	print $outhandle "Main_Page doesn't have a table-of-contents section\n";
	566	}
	567	}
	568
	569	# If delete_toc is set, remove the toc and its contents.
	570	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
	571	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
	572	# print "\nit matches toc_exp !!\n" if $body_text =~ /$self->{'toc_exp'}/;
	573	if ($body_text =~ /$self->{'toc_exp'}/) {
	574	$body_text =~ s/$self->{'toc_exp'}//i;
	575	}
	576	}
	577	}
	578
		# the head split off at the top is dropped; only the body goes on
	579	$$textref = "<body" . $body_text;
	580
	581	# Wrap the whole page with <div id="wikispecificstyle"></div>
	582	# keep the style of this website and don't mess up with the Greenstone styles
	583	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
	584	$$textref =~ s/<\/body>/<\/div><\/body>/is;
	585
		# the parent's return value is not inspected
	586	$self->SUPER::process(@_);
	587
	588	return 1;
	589	}
	590
	591
	# Pulls Office-style "<o:Name>value</o:Name>" properties out of a chunk of
	# header text and stores each one as metadata on the top section of $doc_obj.
	#
	#   $textref  - the text to scan (used as a plain string, despite the name);
	#               nothing is done when it is undefined
	#   $metadata - accepted for interface compatibility; not used here
	#   $doc_obj  - document object; get_top_section() and add_utf8_metadata()
	#               are called on it
	#
	# $self->{'metadata_fields'} is a comma-separated list. Each entry is either
	# "Name" (stored under that same name) or "Name<GsName>" (looked up as Name,
	# stored as GsName). Lookup of the <o:...> element is case-insensitive.
	sub extract_metadata
	{
	    my $self = shift (@_);
	    my ($textref, $metadata, $doc_obj) = @_;

	    return if (!defined $textref);
	    return if (!defined $self->{'metadata_fields'});

	    foreach my $field (split /,/, $self->{'metadata_fields'}) {
	        # by default the element name doubles as the metadata name
	        my ($source_name, $gs_name) = ($field, $field);

	        # support tag<tagname>: look for "tag", store as "tagname"
	        if ($field =~ /^(.*?)<(.*?)>$/) {
	            ($source_name, $gs_name) = ($1, $2);
	        }

	        # \Q..\E keeps regex metacharacters in the field name literal, and
	        # the non-greedy capture stops at the first closing tag instead of
	        # swallowing everything up to the last one on the line.
	        if ($textref =~ m/<o:\Q$source_name\E>(.*?)<\/o:\Q$source_name\E>/i) {
	            my $value = $1;
	            chomp($value); # remove trailing \n, if any
	            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
	        }
	    }

	    return;
	}
	635
	# Returns a copy of the given string in which every backslash and every
	# forward slash is preceded by a backslash, so that a path delimiter can
	# be used as a pattern (e.g. for split).
	sub safe_escape_regexp
	{
	    my ($escaped) = @_;

	    # Both characters are handled in one pass; the inserted backslashes are
	    # never revisited, so the result equals escaping "\" first and "/" second.
	    $escaped =~ s{([\\/])}{\\$1}g;

	    return $escaped;
	}
	647
	# Reads $GSDLHOME/macros/about.dm and returns the text of its _content_
	# macro: everything from "_content_ {" through the last
	# "</div>\n\n</div>\n}" in the file.
	#
	# The plugin object is the (optional) first argument. Its 'outhandle' is
	# used for the diagnostic printed when the file cannot be opened; STDERR
	# is used when no object or handle is available, so the sub also works
	# when called as a plain function.
	#
	# Returns the macro text, or the empty string when the file cannot be
	# opened or contains no matching _content_ macro.
	sub read_content_from_about_dm
	{
	    my $self = shift(@_);

	    my $about_macro_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "macros", "about.dm");

	    # three-argument open with a lexical handle: the file name can never be
	    # read as an open mode, and the handle cannot collide with other users
	    # of a shared bareword handle
	    my $about_fh;
	    if (!open($about_fh, '<', $about_macro_file)) {
	        my $outhandle = (ref($self) && defined $self->{'outhandle'})
	            ? $self->{'outhandle'}
	            : \*STDERR;
	        print $outhandle "can't open file $about_macro_file\n";
	        return "";
	    }

	    # slurp the whole file
	    my $about_page_content = do { local $/; <$about_fh> };
	    close($about_fh);
	    $about_page_content = "" if (!defined $about_page_content);

	    # extract the _content_ macro; under /s "." also matches newlines.
	    # Capturing explicitly avoids returning a stale $& when nothing matches.
	    if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
	        return $1;
	    }

	    return "";
	}
	670
	671	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/plugins/MediaWikiPlugin.pm@ 32589

Download in other formats: