Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 26.4 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27	# the MediaWikiDownload plugin. It trims MediaWiki functional sections such as
28	# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29	# The searchbox is modified to search the Greenstone collection instead of the website.
30	# The plugin can also automatically add the table of contents from the website's Main_Page
31	# to the collection's Home page.
32
33	package MediaWikiPlugin;
34
35	use HTMLPlugin;
36	# use ImagePlugin;
37	# use File::Copy;
38	use unicode;
39
40
41	#use strict; # every perl program should have this!
42	#no strict 'refs'; # make an exception so we can use variables as filehandles
43
# Runs at compile time: make MediaWikiPlugin a subclass of HTMLPlugin so
# that SUPER:: calls resolve to the HTML plugin's implementation.
sub BEGIN {
    @MediaWikiPlugin::ISA = ('HTMLPlugin');
}
47
# Plugin-specific options. Each entry describes one argument: its name, the
# key of its description string, its type, whether it is required and
# (optionally) its default value.
#
# The *_exp defaults are regular expressions written as double-quoted
# strings, so "\\n" reaches the regex engine as \n and "\\\"" as \".
my $arguments =
    [
     # show the table of contents on collection's home page
     { 'name' => "show_toc",
       'desc' => "{MediaWikiPlugin.show_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # set to delete the table of contents section on each MediaWiki page
     { 'name' => "delete_toc",
       'desc' => "{MediaWikiPlugin.delete_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # regexp to match the table of contents
     { 'name' => "toc_exp",
       'desc' => "{MediaWikiPlugin.toc_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n" },
     # set to delete the navigation section
     { 'name' => "delete_nav",
       'desc' => "{MediaWikiPlugin.delete_nav}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the navigation section
     { 'name' => "nav_div_exp",
       'desc' => "{MediaWikiPlugin.nav_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
     # set to delete the searchbox section
     { 'name' => "delete_searchbox",
       'desc' => "{MediaWikiPlugin.delete_searchbox}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the searchbox section
     { 'name' => "searchbox_div_exp",
       'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
     # regexp to match title suffix
     # can't use the title_sub option in HTMLPlugin instead
     # because title_sub always matches from the beginning
     { 'name' => "remove_title_suffix_exp",
       'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => ""}
    ];

# Plugin description record; 'args' points at the argument list above.
my $options = { 'name'     => "MediaWikiPlugin",
                'desc'     => "{MediaWikiPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
105
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to HTMLPlugin's constructor
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptions
#
# Returns the object built by HTMLPlugin's constructor, re-blessed into
# $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # direct method call instead of the ambiguous indirect-object form
    # "new HTMLPlugin(...)"
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# Process one HTML page that was downloaded from a MediaWiki website.
#
# The text is split at its first "<body". The head part supplies the title,
# optional <o:...> metadata and the list of @import-ed stylesheets. The body
# part is stripped of MediaWiki chrome (header, timeline, edit links, footer
# portlets other than navigation/search, foreign forms), its links into the
# wiki are made absolute, and the result is written back through $textref
# wrapped in <div id="wikispecificstyle">. Finally the parent class's
# process() is called with the same arguments.
#
# Arguments:
#   $textref  - reference to the page text; rewritten in place
#   $pluginfo - passed through to the parent's process()
#   $base_dir - import directory (forward slashes on every platform)
#   $file     - file path relative to $base_dir; its first component is
#               taken as the wiki's host name
#   $metadata - passed to extract_metadata() and the parent's process()
#   $doc_obj  - document object that receives Title and other metadata
#   $gli      - passed through to the parent's process()
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "MediaWikiPlugin: processing $file\n" if $self->{'verbosity'} > 1;

    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # only trust $1 when the match actually succeeded
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        # the metadata lives between <xml> and </xml> in the head
        my @doc_properties = split(/<xml>/i, $head);
        shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    # ([\s\S]* is used for "anything" so long pages cannot hit the recursion
    # limit that a quantified (.|\n) group can; group numbering is unchanged)
    my $header_exp = '<div([^>]*)id=("|\')container("|\')([^>]*)>([\s\S]*)<div([^>]*)id=("|\')column-content';
    if ($body_text =~ /$header_exp/) {
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = '([\s\S]*?)<div([^>]*)?id=("|\')column-content';
        if ($body_text =~ /$header_exp/) {
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    # NOTE(review): the copy of this file that was reviewed shows a plain
    # space inside <o:p> </o:p> and in the collapse below; the upstream
    # source may have used an HTML entity there -- confirm against the
    # repository.
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/( )+/ /sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter = "/";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir uses forward slash for both windows and linux, e.g.
    #   windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #   linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file uses the platform delimiter, e.g.
    #   windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #   linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        $self->_localise_stylesheets($head, $base_dir, $url_base);
    }

    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = '<div class="printfooter">([\s\S]+)</body>';
    my $footer = "";
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'}) {
        $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/);
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    } else {
        # no /g here: a scalar-context //g match would leave pos() set and
        # make the searchbox search below start after the navigation section
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $&;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'}) {
        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
    }

    # $& is only read after a successful match; otherwise it would still
    # hold the navigation section matched above
    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # rename the query field and neutralise the buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the pattern that failed, not the (empty) result
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;

    # delete other forms in the page
    my @forms;
    while ($body_text =~ m/<form([^>]*)name=("|')([^>"']*?)("|')/isg) {
        next if ($3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # the opening tag is literal text (it usually contains ? and .),
        # so it must be quoted before being used as a pattern
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links
    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files
    # from a MediaWiki website, otherwise the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # rewrite every href/src that points into the wiki as an absolute
    # http:// link. Done in one pass with the host name quoted, so that
    # metacharacters in links (+ ? . etc.) neither break the match nor leak
    # escape characters into the output.
    $body_text =~ s/(href|src)="[^>\s]*\Q$url_base\E\/([^>\s]*)"/$1="http:\/\/$url_base\/$2"/ig;

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;

    # if 'show_toc' is set, add Main_Page's toc to the collection's About page
    if (defined $self->{'show_toc'} && $self->{'show_toc'} eq "1"
        && $file =~ m/Main_Page\.(html|htm)$/) {
        $self->_add_toc_to_about_page($body_text, $base_dir, $file);
    }

    # If delete_toc is set, remove the table of contents.
    if (defined $self->{'delete_toc'} && $self->{'delete_toc'} eq "1") {
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    $self->SUPER::process(@_);

    return 1;
}

# Make sure every stylesheet @import-ed in $head exists below $base_dir
# (downloading it with wget if necessary), prefix its rules with
# "#wikispecificstyle", and copy the modified sheet to the collection's
# images folder for future customization.
sub _localise_stylesheets {
    my ($self, $head, $base_dir, $url_base) = @_;
    my $outhandle = $self->{'outhandle'};

    # find all the stylesheets imported with @import statement
    my @css_files;
    while ($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig) {
        push(@css_files, $2) if defined $2;
    }

    foreach my $imported (@css_files) {
        my $css_file = $imported;

        # remove prefix gli/cache directory
        $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

        # change the \ delimiter in $css_file to / for consistency
        $css_file =~ s/\\/\//g;
        if ($css_file !~ /\Q$url_base\E/) {
            $css_file = $url_base . $css_file;
        }

        # trim the ? mark appended to the end of a stylesheet
        $css_file =~ s/\?(.+)$//s;

        my $css_file_path = util::filename_cat($base_dir, $css_file);

        # do nothing if we have already downloaded the css file
        if (! -e $css_file_path) {
            # create the stylesheet's directory in the import folder;
            # the last path element is the file name itself
            my @dirs = split(/\//, $css_file);
            pop(@dirs);
            my $path_check = "$base_dir/";
            foreach my $dir (@dirs) {
                $path_check .= $dir . "/";
                mkdir($path_check) if (! -d $path_check);
            }

            # NOTE: wget needs configuration to directly access Internet
            # These files should already be there if MediaWikiDownload was used
            my $css_url = "http://$css_file";
            print "\ndownloading : " . $css_url . "\n\n";
            system("wget", "--non-verbose", $css_url, "--output-document=$css_file_path");
            if ($? != 0) {
                print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                unlink($css_file_path);
            }
        }

        # add a prefix "#wikispecificstyle" to each rule, because the page
        # will be wrapped in a div with id = wikispecificstyle and we don't
        # want the wiki's styles to interfere with Greenstone's
        my $css_in;
        if (!open($css_in, "<", $css_file_path)) {
            # nothing to rewrite or copy for this stylesheet
            next;
        }
        my $css_content = "";
        while (my $line = <$css_in>) {
            # comment out the body selector because we change the body to div
            $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

            if ($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i) {
                if ($line !~ m/wikispecificstyle/i) {
                    $line = "#wikispecificstyle " . $line;
                }
            }

            $css_content .= $line;
        }
        close($css_in);

        if (open(my $css_out, ">", $css_file_path)) {
            print $css_out $css_content;
            close($css_out);
        } else {
            print $outhandle "can't open file $css_file_path\n";
        }

        # Copy the modified stylesheet to collection's images folder
        my $images_dir = $base_dir;
        $images_dir =~ s/import$/images/;
        my ($css_name) = $css_file =~ m/([^\/]*)$/;
        my $images_copy = util::filename_cat($images_dir, $css_name);

        if (open(my $copy_out, ">", $images_copy)) {
            print $copy_out $css_content;
            close($copy_out);
        }
    }
    return;
}

# Add Main_Page's table of contents to the _content_ macro of the "about"
# package in the collection's macros/extra.dm:
#   1. find the toc in $body_text
#   2. turn its in-page links into Greenstone extlink URLs
#   3. append it to (or create) the about package's _content_ macro
# TODO: we assume the _about:content_ hasn't been specified before
sub _add_toc_to_about_page {
    my ($self, $body_text, $base_dir, $file) = @_;
    my $outhandle = $self->{'outhandle'};
    $base_dir = "" unless defined $base_dir;

    # extract toc of the Main_Page; the fallback mirrors the toc_exp default
    my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*?</table>\\n";
    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
        $toc_exp = $self->{'toc_exp'};
    }
    my $mainpage_toc = "";
    if ($body_text =~ /$toc_exp/) {
        $mainpage_toc = $&;
    }

    if ($mainpage_toc !~ /\S/) {
        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        return;
    }

    # change the in-page links, for example, change <a href="#section1"> to
    # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
    my $file_url_format = $file;
    $file_url_format =~ s/\\/\//g;
    $file_url_format = "http://" . $file_url_format;

    # encode as URL, otherwise doesn't work on Windows
    $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

    # read the collection's extra.dm
    my $macro_path = $base_dir;
    $macro_path =~ s/import$/macros/;
    my $extradm_file = util::filename_cat($macro_path, "extra.dm");

    my $extra_dm = "";
    if (open(my $extra_in, "<", $extradm_file)) {
        while (my $line = <$extra_in>) {
            $extra_dm .= $line;
        }
        close($extra_in);
    } else {
        print $outhandle "can't open file $extradm_file\n";
    }

    # locate the (last) "about" package in extra.dm
    my $about_package = "";
    foreach my $package (split("package ", $extra_dm)) {
        $about_package = "package " . $package if ($package =~ /^about/);
    }

    # the toc and the macro text are literal text, so they are compared and
    # substituted with index()/\Q..\E rather than used as patterns
    my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

    if ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/
        && index($about_package, $mainpage_toc) >= 0) {
        print $outhandle "_content_ macro already changed!!!!\n";
        return;
    }

    if ($about_package !~ /\S/) {
        # extra.dm doesn't have an "about" package: append a new one built
        # from the _content_ macro of $GSDLHOME/macros/about.dm
        $extra_dm .= "\n\npackage about\n_content_" . _default_about_content_body() . "\n\n";
        $extra_dm .= $toc_section;
    }
    elsif ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?\}/) {
        # the about package has a _content_ macro without the toc:
        # append a new section div for the toc to the end of the document div
        my $content_macro = $&;
        my $new_content_macro = $content_macro;
        # NOTE(review): the capture groups of this pattern were damaged in
        # the reviewed copy; this form implements the evident intent
        # (keep the div's attributes and content, append the toc section)
        $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
        $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
    }
    else {
        # the about package has no _content_ macro: append one
        my $new_about_package = $about_package;
        $new_about_package .= "\n\n_content_" . _default_about_content_body() . "\n\n";
        $new_about_package .= $toc_section;
        $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
    }

    # write to the extra.dm file of the collection
    if (open(my $extra_out, ">", $extradm_file)) {
        print $extra_out $extra_dm;
        close($extra_out);
    } else {
        print $outhandle "can't open $extradm_file\n";
    }
    return;
}

# Return the body of the global about.dm _content_ macro, from its opening
# brace up to (not including) the final closing </div> and brace, or ""
# when it cannot be found.
sub _default_about_content_body {
    my $about_macro = read_content_from_about_dm();
    return "" unless defined $about_macro;
    return ($about_macro =~ m/\{(.|\n)*<\/div>\n\n/) ? $& : "";
}
588
589
# Copy <o:NAME>value</o:NAME> properties found in $textref into $doc_obj.
#
#   $textref  - despite its name, the text itself (not a reference) that is
#               searched for <o:...> elements; nothing is done when undefined
#   $metadata - not used here
#   $doc_obj  - receives each value via add_utf8_metadata() on its top section
#
# Which properties are looked for comes from $self->{'metadata_fields'}, a
# comma separated list. An entry is either "name" (stored under the same
# name) or "name<gsname>" (looked up as "name", stored as "gsname").
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # $source_name is the element name in the page,
        # $gs_name the user's preferred greenstone metadata name
        my ($source_name, $gs_name) = ($field, $field);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # support tag<tagname>
            ($source_name, $gs_name) = ($1, $2);
        }

        # the field name is literal text, so quote it inside the pattern
        if ($textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }
    return;
}
633
# Return a copy of $regexp in which every backslash and every forward slash
# is preceded by a backslash, so that a file delimiter can be used as a
# pattern. Both delimiters are escaped regardless of the platform.
sub safe_escape_regexp
{
    my $regexp = shift (@_);

    # backslashes first, so the ones added for "/" are not doubled
    $regexp =~ s{\\}{\\\\}g;
    $regexp =~ s{/}{\\/}g;

    return $regexp;
}
645
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# from "_content_ {" through the closing "</div>\n\n</div>\n}".
#
# Returns "" when the file cannot be opened (a message is printed to STDERR,
# as this function has no plugin output handle) or when the macro is not
# found.
sub read_content_from_about_dm
{
    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");

    my $about_page_content = "";
    if (open(my $about_fh, "<", $about_macro_file)) {
        while (my $line = <$about_fh>) {
            $about_page_content .= $line;
        }
        close($about_fh);
    } else {
        print STDERR "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; only use $& when the match succeeded
    if ($about_page_content =~ m/_content_ \{(.|\n)*<\/div>\n\n<\/div>\n\}/i) {
        return $&;
    }
    return "";
}
665
666	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: