Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 15918

Last change on this file since 15918 was 15887, checked in by mdewsnip, 16 years ago
Added "use strict" to the few files that were missing it, and fixing resulting problems in MediaWikiPlug.pm.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.4 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes HTML files from a MediaWiki website that were downloaded
27	# with MediaWikiDownload. It trims MediaWiki functional sections such as
28	# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29	# The search box is modified to search the Greenstone collection instead of the website.
30	# The plugin can also automatically add the table of contents from the website's Main_Page
31	# to the collection's home page.
32
33	package MediaWikiPlugin;
34
35	use HTMLPlugin;
36	use unicode;
37
38	use strict; # every perl program should have this!
39	no strict 'refs'; # make an exception so we can use variables as filehandles
40
41
# Declare HTMLPlugin as this package's parent class. Done in a BEGIN block
# so @ISA is populated while the file is still being compiled.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
45
# Plugin-specific arguments. new() appends this list to the "ArgList" entry of
# the hash it is handed. Each record has name, desc, type and reqd keys, and
# optionally a default value under 'deft'.
# NOTE(review): the default patterns below use "([^>])" and "(...)?" groups, which
# match at most one character each; they look as if "*" quantifiers were lost at
# some point. Confirm against the repository copy before relying on them.
46	my $arguments =
47	[
48	# show the table of contents on collection's home page
49	{ 'name' => "show_toc",
50	'desc' => "{MediaWikiPlugin.show_toc}",
51	'type' => "flag",
52	'reqd' => "no"},
53	# set to delete the table of contents section on each MediaWiki page
54	{ 'name' => "delete_toc",
55	'desc' => "{MediaWikiPlugin.delete_toc}",
56	'type' => "flag",
57	'reqd' => "no"},
58	# regexp to match the table of contents
59	{ 'name' => "toc_exp",
60	'desc' => "{MediaWikiPlugin.toc_exp}",
61	'type' => "regexp",
62	'reqd' => "no",
63	'deft' => "<table([^>])id=(\\\"\|')toc(\\\"\|')(.\|\\n)?</table>\\n" },
64	# set to delete the navigation section
65	{ 'name' => "delete_nav",
66	'desc' => "{MediaWikiPlugin.delete_nav}",
67	'type' => "flag",
68	'reqd' => "no",
69	'deft' => ""},
70	# regexp to match the navigation section
71	{ 'name' => "nav_div_exp",
72	'desc' => "{MediaWikiPlugin.nav_div_exp}",
73	'type' => "regexp",
74	'reqd' => "no",
75	'deft' => "<div([^>])id=(\\\"\|')p-navigation(\\\"\|')(.\|\\n)?<\/div>" },
76	# set to delete the searchbox section
77	{ 'name' => "delete_searchbox",
78	'desc' => "{MediaWikiPlugin.delete_searchbox}",
79	'type' => "flag",
80	'reqd' => "no",
81	'deft' => ""},
82	# regexp to match the searchbox section
83	{ 'name' => "searchbox_div_exp",
84	'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
85	'type' => "regexp",
86	'reqd' => "no",
87	'deft' => "<div([^>])id=(\\\"\|')p-search(\\\"\|')(.\|\\n)?<\/div>"},
88	# regexp to match title suffix
89	# can't use the title_sub option in HTMLPlugin instead
90	# because title_sub always matches from the beginning
91	{ 'name' => "remove_title_suffix_exp",
92	'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
93	'type' => "regexp",
94	'reqd' => "no",
95	'deft' => ""}
96	];
97
# Option record describing this plugin; new() appends it to the "OptList"
# entry of the hash it is handed. 'args' points at the argument list above.
98	my $options = { 'name' => "MediaWikiPlugin",
99	'desc' => "{MediaWikiPlugin.desc}",
100	'abstract' => "no",
101	'inherits' => "yes",
102	'args' => $arguments };
103
# Constructor.
#
# Parameters (positional, after the class name):
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - handed on unchanged to the HTMLPlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by the HTMLPlugin constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow method call rather than the indirect-object form
    # ("new HTMLPlugin(...)"), which perl can mis-parse when a sub
    # named "new" is in scope.
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
115
116
117
# Main processing routine for one downloaded MediaWiki page.
#
# Arguments (after $self): $textref (reference to the page's HTML text, which is
# rewritten in place), $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli.
#
# Steps, in order:
#   - split the text at "<body", read the title from the head part and set the
#     Title metadata if the document does not have one yet
#   - strip the header, timeline, edit links and similar MediaWiki furniture
#   - fetch (with wget) and rewrite the stylesheets pulled in by @import in the head
#   - rebuild the footer so that only the navigation and search boxes may remain
#   - remove other forms, rewrite links that contain the site's base url, optionally
#     copy the Main_Page table of contents into the collection's extra.dm, and
#     optionally delete the table of contents from the page
#   - wrap the body in <div id="wikispecificstyle"> and call SUPER::process
#
# Returns 1.
#
# NOTE(review): several patterns in this sub look damaged -- e.g. "([^>])" and
# "(.\|\n)?" sit next to intact forms such as "[^>]*" -- as if "*" quantifiers were
# lost and "|" gained a backslash at some point. Inside a regex literal "\|" matches
# a literal pipe. Compare with the repository copy before relying on them.
118	sub process {
119	my $self = shift (@_);
120	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
121	my $outhandle = $self->{'outhandle'};
122
123	print $outhandle "MediaWikiPlugin: processing $file\n" if $self->{'verbosity'} > 1;
124
125	my @head_and_body = split(/<body/i,$$textref);
126	my $head = shift(@head_and_body);
127	my $body_text = join("<body", @head_and_body);
128
129	$head =~ m/<title>(.+)<\/title>/i;
130	my $doctitle = $1 if defined $1;
131
132	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
133	my @doc_properties = split(/<xml>/i,$head);
134	my $doc_heading = shift(@doc_properties);
135	my $rest_doc_properties = join(" ", @doc_properties);
136
137	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
138	my $extracted_metadata = shift (@extracted_metadata);
139	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
140	}
141
142	# set the title here if we haven't found it yet
143	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
144	if (defined $doctitle && $doctitle =~ /\S/) {
145	# remove suffix in title if required
146	my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
147	if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
148	$doctitle =~ s/$remove_suffix_exp//i;
149	}
150	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
151	} else {
152	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
153	}
154	}
155
156	# we are only interested in the column-contents div <div id="column-content">
157	# remove header section, it may contain header images or additional search boxes
158	my $header_exp = "<div([^>])id=(\"\|')container(\"\|')([^>])>(.\|\\n)<div([^>])id=(\"\|')column-content";
159	if($body_text =~ /$header_exp/){
160	$body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
161	} else {
162	$header_exp = "(.\|\\n)?<div([^>])?id=(\"\|')column-content";
163	if($body_text =~ /$header_exp/){
164	$body_text =~ s/$header_exp/<div$2id='column-content/i;
165	}
166	}
167
168	# remove timeline
169	$body_text =~ s/<div([^>])class=("\|')smwtimeline("\|')[\s\S]?<\/div>//mg;
170
171	# remove extra bits
172	my $extra_bits = "Retrieved from(.+)</a>\"";
173	$body_text =~ s/$extra_bits//isg;
174
175	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
176	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
177	$body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
178	$body_text =~ s/( )+/ /sg;
179
180	# get rid of the [edit] buttons
181	$body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
182	# get rid of the last time edit information at the bottom
183	$body_text =~ s/<a href="([^>])edit([^>])"([^>]?)>(\w+)<\/a> \d\d:\d\d,([\s\|\w]?)\(PST\)//g;
184	# get rid of the (Redirected from ...)
185	$body_text =~ s/\(Redirected from <a ([^>])>(\w\|\s)?<\/a>\)//isg;
186
187	# escape texts macros
188	$body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
189	# may change the links, like Greenstone_Documentation_All.html, then change back
190	$body_text =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
191
192	# define file delimiter for different platforms
193	my $file_delimiter;
194	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
195	$file_delimiter = "\\";
196	} else {
197	$file_delimiter = "/";
198	}
199
200	# IMPORTANT: different delimiter for $base_dir and $file
201	# $base_dir use forward slash for both windows and linux
202	# print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
203	# linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
204	# $file use different delimiters : forward slash for linux; backward slash for windows
205	# print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
206	# linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
207
208	# get the base url for the MediaWiki website
# (the first path component of $file is taken to be the site's host name)
209	my $safe_delimiter = &safe_escape_regexp($file_delimiter);
210	my @url_dirs=split($safe_delimiter, $file);
211	my $url_base = $url_dirs[0];
212
213	# Re-check css files associated with MediaWiki pages
214	if(defined $base_dir && $base_dir ne ""){
215	my @css_files;
216	my $css_file_count = 0;
217
218	# find all the stylesheets imported with @import statement
219	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
220	$css_files[$css_file_count++] = $2 if defined $2;
221	}
222
223	# download the stylesheets if we haven't downloaded them yet
224	# add prefix to each style element, comment out the body element
225	# and copy the files to collection's images folder
226	for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
227
228	my $css_file = $css_files[$css_file_count];
229
230	# remove prefix gli/cache directory
231	$css_file =~ s/^(.+)gli(\\\|\/)cache(\\\|\/)//i;
232
233	# change the \ delimiter in $css_file to / for consistency
234	$css_file =~ s/\\/\//isg;
235	if($css_file !~ /$url_base/) {
236	$css_file = $url_base . $css_file;
237	}
238
239	# trim the ? mark append to the end of a stylesheet
240	$css_file =~ s/\?(.+)$//isg;
241
242	my $css_file_path = &util::filename_cat($base_dir, $css_file);
243
244	# do nothing if we have already downloaded the css files
245	if (! -e $css_file_path) {
246
247	# check the stylesheet's directory in the import folder
248	# if the directory doesn't exist, create one
249	my @dirs = split(/\//i,$css_file);
250	my $path_check = "$base_dir/";
251	for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
252	$path_check .= $dirs[$i] . "/";
253	mkdir($path_check) if (! -d $path_check );
254	}
255
256	# NOTE: wget needs configuration to directly access Internet
257	# These files should already have been downloaded if we used the MediaWikiDownload
258	# for downloading
259	$css_file = "http://$css_file";
260	print "\ndownloading : " . $css_file . "\n\n";
261	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
262	if ($? != 0) {
263	print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
264	print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
265	unlink("$css_file_path");
266	}
267	} # done with download
268
269	# add a prefix "#wikispecificstyle" to each element
270	# because we want to preserve this website's formats and don't want to mess up with Greenstone formats
271	# so we will wrap the web page with a div with id = wikispecificstyle
272	my $css_content;
273	if(open(INPUT, "<$css_file_path")){
274	while(my $line = <INPUT>){
275	# comment out the body element because we change the body to div
276	$line =~ s/^(\s)body(\s){(\s)$/$1\/body$2*\/{$3/isg;
277
278	if($line =~ m/^(.+)\{/i \|\| $line =~ m/^(\s)*#/i){
279	if($line !~ m/wikispecificstyle/i){
280	$line = "#wikispecificstyle " . $line;
281	}
282	}
283
284	$css_content .= $line;
285	}
286	close(INPUT);
# the rewritten stylesheet replaces the downloaded file in place
287	open(OUTPUT, ">$css_file_path");
288	print OUTPUT $css_content;
289	close(OUTPUT);
290	}
291
292	# Copy the modified stylesheets to collection's images folder
293	# for future customization
294	my $images_dir = $base_dir;
295	$images_dir =~ s/import$/images/;
296	$css_file =~ m/(.)\/(.)$/;
297	$images_dir = &util::filename_cat($images_dir, $2);
298
299	if(open(OUTPUT, ">$images_dir")){
300	print OUTPUT $css_content;
301	close(OUTPUT);
302	}
303	}
304	}
305
306
307	# by default, only preserve navigation box and search box
308	# others like toolbox, interaction, languages box, will be removed
309
310	# extract the larger part -- footer section
311	my $print_footer = "<div class=\"printfooter\">(.\|\n)+</body>";
312	$body_text =~ /$print_footer/;
313	my $footer = "";
# NOTE(review): $& holds the text of the last successful match, so if the
# pattern above fails this may pick up the result of an earlier match.
314	$footer = $& if defined $&;
315	$footer =~ s/<\/body>//isg;
316
317	# trim the comments first
318	$footer =~ s/<!--[\s\S]?--[ \t\n\r]>//isg;
319
320	# contain sections that are to be preserved
321	my $preserve_sections = "";
322
323	# process the navigation section
324	my $nav_match_exp = "<div([^>])id=(\"\|')p-navigation(\"\|')(.\|\n)?<\/div>";
325	if (defined $self->{'nav_div_exp'}) {
326	$nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
327	}
328
329	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
330	# do nothing
331	} else {
332	if ($footer =~ m/$nav_match_exp/ig) {
333	$preserve_sections = $& ;
334	} else {
335	print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
336	}
337	# if($preserve_sections =~/\S/){
338	# $preserve_sections .= "</div>";
339	# }
340	}
341
342	# process the searchbox section
343	my $searchbox_exp = "<div([^>])id=(\"\|')p-search(\"\|')(.\|\\n)?<\/div>";
344	if(defined $self->{'searchbox_div_exp'}) {
345	$searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
346	}
347
348	my $searchbox_section = "";
349	$footer =~ m/$searchbox_exp/ig;
350	$searchbox_section = $& if defined $&;
351
352	# make the searchbox form work in Greenstone
353	if($searchbox_section =~ /\S/){
354	# replace action
355	$searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
356
357	# remove buttons
358	$searchbox_section =~ s/name="search"/name="q"/isg;
359	$searchbox_section =~ s/name="go"//isg;
360	$searchbox_section =~ s/name="fulltext"//isg;
361
362	# get collection name from $base_dir for c param
363	$base_dir =~ m/\/collect\/(.+)\//i;
364	my $collection_name = "";
365	$collection_name = $1 if defined $1;
366
367	# add Greenstone search params
368	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
369	."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
370	# ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
371	# ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
372
373	$searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
374
375	# $searchbox_section .= "</div>";
376	} else {
377	print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
378	}
379
380	# either delete or replace the searchbox
381	if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
382	# do nothing
383	} else {
384	$preserve_sections .= "\n$searchbox_section\n";
385	}
386
387	if($preserve_sections ne ""){
388	$preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
389	}
390	$preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
391
392	$body_text =~ s/$print_footer/$preserve_sections/isg;
393
394
395	# delete other forms in the page
396	my @forms;
397	my $form_count = 0;
398	while($body_text =~ m/<form([^>])name=("\|')([^>"'])?("\|')/isg){
399	next if($3 eq "searchform");
400	$forms[$form_count++] = $&;
401	}
402	foreach my $form (@forms) {
403	$body_text =~ s/$form[\s\S]*?<\/form>//m;
404	}
405
406	# process links.
407	# because current WGET 1.10 the -k and -E option doesn't work together
408	# need to 'manually' convert the links to relative links
409	# Dealing with 3 types of links:
410	# -- outgoing links
411	# -- if we have downloaded the target files, link to the internal version (relative link)
412	# -- otherwise, link to the external version (absolute links)
413	# -- in-page links (relative link)
414
415	# NOTE: (important)
416	# must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
417	# otherwise, the internal links may have problems
418
419	# remove the title attribute of <a> tag
420	$body_text =~ s/<a([^>])title="(.?)"/<a$1/isg;
421
422	# extract all the links
423	my @links;
424	my $link_count = 0;
425	while($body_text =~ m/(href\|src)="([^>\s])$url_base\/([^>\s])"/ig){
426	$links[$link_count++] = "$1=\"$2$url_base/$3\"";
427	}
428
429	foreach my $cur_link (@links) {
430	# escape greedy match + character
431	$cur_link =~ s/\+/\\+/isg;
432
433	$cur_link =~ m/(.+)"([^>])$url_base\/([^>\s])"/;
434	my $external_file_path = "$1\"http://$url_base/$3\"";
435
436	$body_text =~ s/$cur_link/$external_file_path/i;
437	}
438
439	# tag links to new wiki pages as red
440	$body_text =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
441
442	# tag links to pages external of the MediaWiki website as blue
443	$body_text =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
444
445
446	# process the table-of-contents section
447	# if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
448	# 1. read _content_ macro from about.dm
449	# 2. append the toc, change all links to the Greenstone internal format for relative links
450	# 3. write to the extra.dm
451	# TODO: we assume the _about:content_ hasn't been specified before
452	# so needs to add function to handle when the macro is already in the extra.dm
453	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
454
455	# extract toc of the Main_Page
456	my $mainpage_toc = "";
457	my $toc_exp = "<table([^>])id=(\"\|')toc(\"\|')(.\|\\n)</table>\\n";
458	if($self->{'toc_exp'} =~ /\S/){
459	$toc_exp = $self->{'toc_exp'};
460	}
461	if($body_text =~ /$toc_exp/){
462	$mainpage_toc = $&;
463	}
464
465	if($mainpage_toc =~ /\S/) {
466
467	# change the in-page links to relative links, for example, change <a href="#section1"> to
468	# <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
469	my $file_url_format = $file;
470	$file_url_format =~ s/\\/\//isg;
471	$file_url_format = "http://" . $file_url_format;
472
473	# encode as URL, otherwise doesn't work on Windows
474	$file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
475	$mainpage_toc =~ s/<a href="([^>"#])#([^>"])"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
476
477
478	# read the collection's extra.dm
479	my $macro_path = $base_dir;
480	$macro_path =~ s/import$/macros/;
481	my $extradm_file = &util::filename_cat($macro_path, "extra.dm");
482
483	my $extra_dm = "";
484	if(open(INPUT, "<$extradm_file")){
485	while(my $line = <INPUT>){
486	$extra_dm .= $line;
487	}
488	} else {
489	print $outhandle "can't open file $extradm_file\n";
490	}
491	close(INPUT);
492
493	# check whether we have changed the macros
494	my @packages = split("package ", $extra_dm);
495	my $about_package = "";
496	foreach my $package (@packages) {
497	$about_package = "package " . $package if($package =~ /^about/);
498	}
499
500	my $update_extra_dm = 0;
501
502	if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){
503	print $outhandle "_content_ macro already changed!!!!\n";
504	}
505	# if extra.dm doesn't have an "about package"
506	elsif ($about_package !~ /\S/) {
507	# read _content_ macro from $GSDLHOME/macros/about.dm file
508	my $global_about_package = $self->read_content_from_about_dm();
509
510	# create the extra _content_ macro for this collection
511	# add the original content of the _content_ macro
512	$global_about_package =~ m/{(.\|\n)*<\/div>\n\n/;
513
514	# append the new about package to extra.dm
515	$extra_dm .= "\n\npackage about\n_content_$&\n\n";
516	$extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
517
518	$update_extra_dm = 1;
519	}
520	# the about package exists, but either doesn't have the _content_ macro or
521	# the _content_ macro doesn't contain the toc
522	else {
523	# check if there is a content macro
524	my $content_macro_existed = 0;
525	$content_macro_existed = ($about_package =~ /(\s\|\n)_content_(\s){/);
526
527	# if there is one
528	# append a new section div for toc to the end of the document section
529	if($content_macro_existed ==1) {
530	$about_package =~ /(\s\|\n)_content_(\s){(.\|\n)*?}/;
531	my $content_macro = $&;
532	my $new_content_macro = $content_macro;
533	$new_content_macro =~ s/<div[^>]class="document">(.\|\n)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
534	$extra_dm =~ s/$content_macro/$new_content_macro/mg;
535	}
536	# otherwise, append _content_ macro to the about package
537	else {
538	my $new_about_package = $about_package;
# NOTE(review): called here as a plain function, so the sub receives no $self;
# the other call site in this sub uses the method form -- confirm which is intended
539	my $content_macro = &read_content_from_about_dm();
540	$content_macro =~ m/{(.\|\n)*<\/div>\n\n/;
541
542	$new_about_package .= "\n\n_content_$&\n\n";
543	$new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
544	$extra_dm =~ s/$about_package/$new_about_package/mg;
545	}
546
547	# either the case, we need to update the extra.dm
548	$update_extra_dm = 1;
549	}
550
551	if($update_extra_dm==1){
552	# write to the extra.dm file of the collection
553	if (open(OUTPUT, ">$extradm_file")) {
554	print OUTPUT $extra_dm;
555	} else {
556	print "can't open $extradm_file\n";
557	}
558	close(OUTPUT);
559	}
560	} else {
561	print $outhandle "Main_Page doesn't have a table-of-contents section\n";
562	}
563	}
564
565	# If delete_toc is set, remove the table-of-contents section.
566	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
567	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
568	# print "\nit matches toc_exp !!\n" if $body_text =~ /$self->{'toc_exp'}/;
569	if ($body_text =~ /$self->{'toc_exp'}/) {
570	$body_text =~ s/$self->{'toc_exp'}//i;
571	}
572	}
573	}
574
# put back the "<body" that the split at the top of this sub consumed
575	$$textref = "<body" . $body_text;
576
577	# Wrap the whole page with <div id="wikispecificstyle"></div>
578	# keep the style of this website and don't mess up with the Greenstone styles
579	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
580	$$textref =~ s/<\/body>/<\/div><\/body>/is;
581
# @_ still holds the original arguments (only $self was shifted off), so the
# parent class sees the rewritten text through $textref
582	$self->SUPER::process(@_);
583
584	return 1;
585	}
586
587
# Pull "<o:NAME>value</o:NAME>" properties out of a piece of the page head
# and store them as metadata on the document's top section.
#
# Parameters (after $self):
#   $textref  - despite its name, the text itself (a plain string) to search;
#               nothing is done when it is undefined
#   $metadata - not used by this sub
#   $doc_obj  - document object; receives the add_utf8_metadata() calls
#
# $self->{'metadata_fields'} is a comma-separated list. Each entry is either
# "name" (stored under that same name) or "name<gsname>" (property "name" is
# stored as metadata "gsname"). Property names are matched case-insensitively.
#
# Returns nothing.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);

    my $field_list = $self->{'metadata_fields'};
    return if (!defined $field_list);

    foreach my $field (split /,/, $field_list) {
        # by default the property is stored under its own name
        my ($source_name, $gs_name) = ($field, $field);

        # support name<gsname>: "$2" is the user's preferred gs metadata name.
        # The names may be any length, hence the non-greedy ".*?" groups.
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }

        # an empty name (e.g. from "a,,b") has nothing sensible to look for
        next if ($source_name eq "");

        # \Q...\E: the property name is literal text, not a pattern
        if ($textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
631
# Put a backslash in front of every backslash and every forward slash in the
# given string, so that those two characters are taken literally when the
# result is used as a pattern. The escaped copy is returned.
sub safe_escape_regexp
{
    my ($text) = @_;

    # one pass over the string: "\" becomes "\\" and "/" becomes "\/"
    (my $escaped = $text) =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
643
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# from "_content_ {" through the closing "</div>\n\n</div>\n}".
#
# May be called as a method or as a plain function with no arguments. A
# failure to open the file is reported on $self->{'outhandle'} when that is
# available, otherwise on STDERR.
#
# Returns the macro text, or "" when the file cannot be read or holds no
# such macro.
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";

    # three-argument open on a lexical handle: the file name can never be
    # read as an open mode, and the handle is only closed if it was opened
    if (open(my $about_fh, "<", $about_macro_file)) {
        local $/ = undef; # read the whole file in one go
        $about_page_content = <$about_fh>;
        $about_page_content = "" if (!defined $about_page_content);
        close($about_fh);
    } else {
        # $self is absent when the sub is called as a plain function
        my $outhandle;
        $outhandle = $self->{'outhandle'} if (ref($self) && defined $self->{'outhandle'});
        $outhandle = \*STDERR if (!defined $outhandle);
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; "(?:.|\n)*" spans lines, and the braces are
    # escaped so they are always read as literal characters
    if ($about_page_content =~ m/(_content_ \{(?:.|\n)*<\/div>\n\n<\/div>\n\})/i) {
        return $1;
    }

    # no match: return an empty string rather than whatever an earlier,
    # unrelated match left behind in $&
    return "";
}
666
667	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: