Context Navigation

source: main/trunk/greenstone2/perllib/plugins/MediaWikiPlugin.pm@ 22597

Last change on this file since 22597 was 19123, checked in by kjdon, 15 years ago
copy css files to collection's style dir, not images dir
Property svn:keywords set to `Author Date Id Revision`
File size: 26.3 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded with
27	# MediaWikiDownload. It trims MediaWiki functional sections such as login, discussion,
28	# history, etc. Only the navigation and search sections can be preserved.
29	# The search box is modified to search the Greenstone collection instead of the website.
30	# It can also automatically add the table of contents from the website's Main_Page to the
31	# collection's Home page.
32
33	package MediaWikiPlugin;
34
35	use HTMLPlugin;
36	use unicode;
37
38	use strict; # every perl program should have this!
39	no strict 'refs'; # make an exception so we can use variables as filehandles
40
41
# Declare HTMLPlugin as the parent class. Done in a BEGIN block so the
# inheritance chain is in place before the rest of the file is compiled.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
45
# Plugin-specific arguments; new() appends them to the argument list that
# is handed to HTMLPlugin.
#
# The regexp defaults are double-quoted strings, so "\\n" reaches the
# regexp engine as the two characters \n and "\\\"" as \" .
# Each default matches lazily ("*?") from the opening tag that carries the
# relevant id up to the first closing tag.
my $arguments =
    [
      # show the table of contents on collection's home page
      { 'name' => "show_toc",
        'desc' => "{MediaWikiPlugin.show_toc}",
        'type' => "flag",
        'reqd' => "no"},
      # set to delete the table of contents section on each MediaWiki page
      { 'name' => "delete_toc",
        'desc' => "{MediaWikiPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no"},
      # regexp to match the table of contents
      { 'name' => "toc_exp",
        'desc' => "{MediaWikiPlugin.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n" },
      # set to delete the navigation section
      { 'name' => "delete_nav",
        'desc' => "{MediaWikiPlugin.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""},
      # regexp to match the navigation section
      { 'name' => "nav_div_exp",
        'desc' => "{MediaWikiPlugin.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
      # set to delete the searchbox section
      { 'name' => "delete_searchbox",
        'desc' => "{MediaWikiPlugin.delete_searchbox}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""},
      # regexp to match the searchbox section
      { 'name' => "searchbox_div_exp",
        'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
      # regexp to match title suffix
      # can't use the title_sub option in HTMLPlugin instead
      # because title_sub always matches from the beginning
      { 'name' => "remove_title_suffix_exp",
        'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => ""}
      ];

# Plugin description record; new() pushes it onto the option list.
my $options = { 'name'     => "MediaWikiPlugin",
                'desc'     => "{MediaWikiPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
103
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to HTMLPlugin's constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its option record to "OptList"
# Returns the object built by HTMLPlugin, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless($self, $class);
}
115
116
117
# Clean up one downloaded MediaWiki HTML page and pass it on to
# HTMLPlugin's process().
#
# Arguments (after $self):
#   $textref  - reference to the page text; rewritten in place
#   $pluginfo - not used here
#   $base_dir - directory the page was imported from; it is also used to
#               derive the collection's "style" and "macros" directories
#               and the collection name
#   $file     - path of the page relative to $base_dir; its first path
#               component is taken as the wiki's host name
#   $metadata - passed through to extract_metadata()
#   $doc_obj  - document object that receives the Title metadata
#   $gli      - not used here
# Always returns 1.
#
# Steps, in order: title/metadata extraction, removal of the page header
# and other MediaWiki-only markup, stylesheet localisation, footer
# reduction to the navigation and search boxes, form and link clean-up,
# optional table-of-contents handling, and finally wrapping the body in
# <div id="wikispecificstyle">.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # Everything before the first "<body" is the head; any later "<body"
    # is glued back into the body text.
    my ($head, @body_parts) = split(/<body/i, $$textref);
    $head = "" unless defined $head;
    my $body_text = join("<body", @body_parts);

    # Only trust $1 when the match succeeded.
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        # The properties sit between <xml> and </xml> in the head.
        my (undef, @doc_properties) = split(/<xml>/i, $head);
        my ($xml_block) = split(/<\/xml>/i, join(" ", @doc_properties));
        $self->extract_metadata($xml_block, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    my $top_section = $doc_obj->get_top_section();
    if (!defined $doc_obj->get_metadata_element($top_section, "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($top_section, "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $top_section, $file);
        }
    }

    # We are only interested in <div id="column-content">.
    # Remove the header section in front of it; it may contain header
    # images or additional search boxes.
    my $container_exp =
        qr/<div([^>]*)id=(?:"|')container(?:"|')([^>]*)>[\s\S]*<div([^>]*)id=(?:"|')column-content/i;
    unless ($body_text =~ s/$container_exp/<div$1id='container'$2><div$3id='column-content/) {
        # No container div: drop everything in front of column-content.
        $body_text =~ s/\A[\s\S]*?<div([^>]*)id=(?:"|')column-content/<div$1id='column-content/i;
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//g;

    # Remove the 'Retrieved from "<a ...>...</a>"' line. Lazy match, so
    # that with /s it cannot run on to a later </a>" in the page.
    $body_text =~ s/Retrieved from(.+?)<\/a>"//isg;

    # Remove empty MS Office paragraphs.
    # NOTE(review): the filler between <o:p> tags may be &nbsp; in real
    # pages; both that and plain whitespace are accepted - confirm.
    $body_text =~ s/<p[^>]*><span[^>]*><o:p>(?:&nbsp;|\s)*<\/o:p><\/span><\/p>//ig;
    $body_text =~ s/<p[^>]*><o:p>(?:&nbsp;|\s)*<\/o:p><\/p>//ig;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/( )+/ /g;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s\w|]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>[\w\s]*?<\/a>\)//ig;

    # Escape text that looks like a _macro_ ...
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/g;
    # ... which may have changed links such as
    # Greenstone_Documentation_All.html, so change those back.
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/ig;

    # $base_dir uses forward slashes on every platform, but $file uses
    # backslashes on windows and forward slashes elsewhere.
    my $file_delimiter =
        (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";

    # The first path component of $file is the MediaWiki site's host name.
    my @url_dirs = split(/\Q$file_delimiter\E/, (defined $file ? $file : ""));
    my $url_base = defined $url_dirs[0] ? $url_dirs[0] : "";

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        $self->_mw_localise_stylesheets($head, $base_dir, $url_base);
    }

    # By default only the navigation box and the search box are preserved;
    # toolbox, interaction, languages box etc. are removed.

    # Extract the footer section (printfooter div up to </body>).
    my $print_footer = qr/<div class="printfooter">[\s\S]+<\/body>/i;
    my $footer = ($body_text =~ /($print_footer)/) ? $1 : "";
    $footer =~ s/<\/body>//ig;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//g;

    # Sections of the footer that are to be preserved.
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
        $nav_match_exp = $self->{'nav_div_exp'};
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    } else {
        # No /g here: a scalar-context m//g would leave pos($footer) set
        # and make the searchbox search below start after this match.
        if ($footer =~ m/($nav_match_exp)/i) {
            $preserve_sections = $1;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
        $searchbox_exp = $self->{'searchbox_div_exp'};
    }

    my $searchbox_section = ($footer =~ m/($searchbox_exp)/i) ? $1 : "";

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # rename the query field and neutralise the MediaWiki buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/g;

    # Delete every named form except the search form. Done in one pass
    # so no page text has to be re-used as a regexp.
    $body_text =~ s{(<form[^>]*name=(?:"|')([^>"']*)(?:"|')[\s\S]*?</form>)}
                   { $2 eq "searchform" ? $1 : "" }gie;

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # the links are converted 'manually' here.
    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download
    # files from a MediaWiki website, otherwise the internal links may
    # have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # Rewrite every href/src that points somewhere below the wiki's host
    # directory into an absolute http:// link on that host.
    if ($url_base ne "") {
        $body_text =~ s{(href|src)="[^>\s"]*\Q$url_base\E/([^>\s"]*)"}
                       {$1="http://$url_base/$2"}gi;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;

    # If 'show_toc' is set, add Main_Page's toc to the collection's About
    # page by changing the collection's extra.dm file.
    if ($self->{'show_toc'} && defined $file && $file =~ m/Main_Page\.(html|htm)$/) {
        $self->_mw_add_toc_to_about_page($body_text, $base_dir, $file);
    }

    # If delete_toc is set, remove the table of contents from the page.
    my $toc_exp = $self->{'toc_exp'};
    if ($self->{'delete_toc'} && defined $toc_exp && $toc_exp =~ /\S/) {
        $body_text =~ s/$toc_exp//i;
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/(<body[^>]*>)/$1\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    $self->SUPER::process(@_);

    return 1;
}

# Private helper of process(): write $content to $path, replacing the file.
# Returns 1 on success and 0 if the file could not be opened or closed.
sub _mw_write_file {
    my ($path, $content) = @_;

    open(my $fh, ">", $path) or return 0;
    print $fh (defined $content ? $content : "");
    return close($fh) ? 1 : 0;
}

# Private helper of process(): make sure every stylesheet pulled in by an
# @import statement in $head exists below $base_dir (fetching it with wget
# when it does not), prefix its rules with "#wikispecificstyle", and copy
# the result into the sibling "style" directory of $base_dir.
sub _mw_localise_stylesheets {
    my ($self, $head, $base_dir, $url_base) = @_;
    my $outhandle = $self->{'outhandle'};

    # find all the stylesheets imported with @import statement
    my @css_files;
    while ($head =~ m{<style type="text/css"(.+)import "([^"]+)"}ig) {
        push(@css_files, $2);
    }

    my $style_dir = $base_dir;
    $style_dir =~ s/import$/style/;

    foreach my $imported (@css_files) {
        my $css_file = $imported;

        # remove prefix gli/cache directory
        $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

        # change the \ delimiter in $css_file to / for consistency
        $css_file =~ s/\\/\//g;
        if ($url_base ne "" && index($css_file, $url_base) < 0) {
            $css_file = $url_base . $css_file;
        }

        # trim the ? mark append to the end of a stylesheet
        $css_file =~ s/\?(.+)$//s;

        my $css_file_path = &util::filename_cat($base_dir, $css_file);

        # do nothing if we have already downloaded the css files
        if (! -e $css_file_path) {
            # create the stylesheet's directories below the import folder
            my @dirs = split(/\//, $css_file);
            pop(@dirs); # the last component is the file itself
            my $path_check = "$base_dir/";
            foreach my $dir (@dirs) {
                $path_check .= $dir . "/";
                mkdir($path_check) if (! -d $path_check);
            }

            # NOTE: wget needs configuration to directly access Internet
            # These files should already be downloaded if
            # MediaWikiDownload was used for downloading
            my $css_url = "http://$css_file";
            print $outhandle "\ndownloading : " . $css_url . "\n\n";
            system("wget", "--non-verbose", $css_url, "--output-document=$css_file_path");
            if ($? != 0) {
                print $outhandle "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                print $outhandle "[ERROR] OR use the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                unlink($css_file_path);
                # nothing to rewrite or copy for this stylesheet
                next;
            }
        }

        # Add a prefix "#wikispecificstyle" to each rule, because the
        # page is wrapped in a div with that id so the website's formats
        # don't interfere with the Greenstone formats.
        my $css_fh;
        if (!open($css_fh, "<", $css_file_path)) {
            print $outhandle "can't open file $css_file_path\n";
            next;
        }
        my $css_content = "";
        while (my $line = <$css_fh>) {
            # comment out the body element because we change the body to div
            $line =~ s!^(\s*)body(\s*)\{(\s*)$!$1/*body$2*/{$3!i;

            if ($line =~ m/^(.+)\{/ || $line =~ m/^(\s)*#/) {
                if ($line !~ m/wikispecificstyle/i) {
                    $line = "#wikispecificstyle " . $line;
                }
            }

            $css_content .= $line;
        }
        close($css_fh);

        if (!_mw_write_file($css_file_path, $css_content)) {
            print $outhandle "can't open $css_file_path\n";
        }

        # Copy the modified stylesheet to the collection's style folder
        # for future customization.
        my ($css_name) = $css_file =~ m{([^/]*)\z};
        my $style_copy = &util::filename_cat($style_dir, $css_name);
        if (!_mw_write_file($style_copy, $css_content)) {
            print $outhandle "can't open $style_copy\n";
        }
    }
    return;
}

# Private helper of process(): take the table of contents found in
# $body_text (the Main_Page) and append it to the _content_ macro of the
# "about" package in the collection's extra.dm, which lives in the sibling
# "macros" directory of $base_dir.
#   1. read the _content_ macro (from extra.dm, else from about.dm)
#   2. append the toc, with its in-page links changed to the Greenstone
#      format for relative links
#   3. write extra.dm
sub _mw_add_toc_to_about_page {
    my ($self, $body_text, $base_dir, $file) = @_;
    my $outhandle = $self->{'outhandle'};

    # extract toc of the Main_Page
    my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*?</table>\\n";
    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
        $toc_exp = $self->{'toc_exp'};
    }
    my $mainpage_toc = ($body_text =~ /($toc_exp)/) ? $1 : "";

    if ($mainpage_toc !~ /\S/) {
        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        return;
    }

    # change the in-page links to relative links, for example, change <a href="#section1"> to
    # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
    my $file_url_format = $file;
    $file_url_format =~ s/\\/\//g;
    $file_url_format = "http://" . $file_url_format;

    # encode as URL, otherwise doesn't work on Windows
    $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

    # read the collection's extra.dm
    my $macro_path = defined $base_dir ? $base_dir : "";
    $macro_path =~ s/import$/macros/;
    my $extradm_file = &util::filename_cat($macro_path, "extra.dm");

    my $extra_dm = "";
    if (open(my $extra_fh, "<", $extradm_file)) {
        local $/ = undef;
        my $slurped = <$extra_fh>;
        $extra_dm = $slurped if defined $slurped;
        close($extra_fh);
    } else {
        print $outhandle "can't open file $extradm_file\n";
    }

    # find the (last) "about" package in extra.dm
    my $about_package = "";
    foreach my $package (split("package ", $extra_dm)) {
        $about_package = "package " . $package if ($package =~ /^about/);
    }

    # Literal containment test: the toc is page text, not a regexp.
    if ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/
        && index($about_package, $mainpage_toc) >= 0) {
        print $outhandle "_content_ macro already changed!!!!\n";
        return;
    }

    # The new section, followed by what closes the document div and the macro.
    my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

    # Opening part of the global _content_ macro from about.dm: from its
    # "{" up to, but not including, the closing of the document div.
    my $default_content_start = sub {
        my $macro = $self->read_content_from_about_dm();
        return $1 if (defined $macro && $macro =~ m/(\{[\s\S]*<\/div>\n\n)/);
        print $outhandle "can't find the _content_ macro in about.dm\n";
        return;
    };

    if ($about_package !~ /\S/) {
        # extra.dm doesn't have an "about" package: append a new one
        my $content_start = $default_content_start->();
        return unless defined $content_start;
        $extra_dm .= "\n\npackage about\n_content_$content_start\n\n" . $toc_section;
    }
    elsif ($about_package =~ /(\s*_content_\s*\{[\s\S]*?\})/) {
        # The about package has a _content_ macro: append a new section
        # div for the toc to the end of its document div.
        my $content_macro = $1;
        my $new_content_macro = $content_macro;
        $new_content_macro =~ s/<div([^>]*)class="document">([\s\S]*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
        $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
    }
    else {
        # The about package has no _content_ macro: append one to it
        my $content_start = $default_content_start->();
        return unless defined $content_start;
        my $new_about_package = $about_package . "\n\n_content_$content_start\n\n" . $toc_section;
        $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
    }

    # write to the extra.dm file of the collection
    if (!_mw_write_file($extradm_file, $extra_dm)) {
        print $outhandle "can't open $extradm_file\n";
    }
    return;
}
584
585
# Copy selected <o:NAME>value</o:NAME> properties from a block of text
# into the document's top-section metadata.
#
#   $textref  - the text to search (a plain string despite the name);
#               nothing is done when it is undefined
#   $metadata - not used here
#   $doc_obj  - document object that receives the metadata
#
# $self->{'metadata_fields'} is a comma-separated list. Each entry is
# either "name", stored under the same name, or "name<gsname>", where
# <o:name> is looked up and the value is stored as "gsname". The lookup
# is case-insensitive and does not cross line boundaries.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # By default the html name doubles as the greenstone name.
        my ($html_name, $gs_name) = ($field, $field);

        # support tag<tagname>
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($html_name, $gs_name) = ($1, $2);
        }
        next if ($html_name eq "");

        # \Q..\E: the field name is data, not a regexp. The value is
        # matched lazily so a repeated tag on one line is not swallowed.
        if ($textref =~ m/<o:\Q$html_name\E>(.*?)<\/o:\Q$html_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }
    return;
}
629
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a file delimiter can
# be used inside a regular expression.
sub safe_escape_regexp
{
    my ($pattern) = @_;

    # One pass over both delimiters; nothing that was inserted is
    # escaped a second time.
    $pattern =~ s{([\\/])}{\\$1}g;

    return $pattern;
}
641
# Read $GSDLHOME/macros/about.dm and return the text of its _content_
# macro, from "_content_ {" up to and including the last
# "</div>\n\n</div>\n}" in the file.
#
# Returns the empty string when the file cannot be opened or holds no
# such macro. A failure to open the file is reported on
# $self->{'outhandle'}, or on STDERR when the sub is called without an
# object.
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_fh, "<", $about_macro_file)) {
        local $/ = undef; # slurp the whole file
        my $slurped = <$about_fh>;
        $about_page_content = $slurped if defined $slurped;
        close($about_fh);
    } else {
        my $outhandle = (ref($self) && defined $self->{'outhandle'})
            ? $self->{'outhandle'} : \*STDERR;
        print $outhandle "can't open file $about_macro_file\n";
    }

    # Extract the _content_ macro. The capture is only used when the
    # match succeeded, so no stale match data can leak out.
    if ($about_page_content =~ m/(_content_ \{[\s\S]*<\/div>\n\n<\/div>\n\})/i) {
        return $1;
    }
    return "";
}
664
665	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: