Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 17739

Last change on this file since 17739 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 26.3 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27	# MediaWikiDownload. It trims MediaWiki functional sections such as
28	# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29	# The search box is modified to search the Greenstone collection instead of the website.
30	# The plugin can also automatically add the table of contents from the website's Main_Page to the
31	# collection's Home page.
32
33	package MediaWikiPlugin;
34
35	use HTMLPlugin;
36	use unicode;
37
38	use strict; # every perl program should have this!
39	no strict 'refs'; # make an exception so we can use variables as filehandles
40
41
# Runs at compile time: declare HTMLPlugin as the parent class of
# MediaWikiPlugin so that inherited methods resolve as soon as the
# package is loaded.
BEGIN {
    @MediaWikiPlugin::ISA = ('HTMLPlugin');
}
45
# Plugin-specific arguments. new() appends this list to the "ArgList" it
# receives. Each entry gives the argument name, the key of its description
# string, its type, whether it is required and (optionally) its default.
#
# The three *_exp defaults are regular expressions held in double-quoted
# strings, so a doubled backslash here is a single backslash in the pattern.
# Each one matches from the opening tag carrying the relevant id up to the
# first closing tag that follows it (non-greedy).
my $arguments =
    [
     # show the table of contents on collection's home page
     { 'name' => "show_toc",
       'desc' => "{MediaWikiPlugin.show_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # set to delete the table of contents section on each MediaWiki page
     { 'name' => "delete_toc",
       'desc' => "{MediaWikiPlugin.delete_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # regexp to match the table of contents
     { 'name' => "toc_exp",
       'desc' => "{MediaWikiPlugin.toc_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n" },
     # set to delete the navigation section
     { 'name' => "delete_nav",
       'desc' => "{MediaWikiPlugin.delete_nav}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the navigation section
     { 'name' => "nav_div_exp",
       'desc' => "{MediaWikiPlugin.nav_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
     # set to delete the searchbox section
     { 'name' => "delete_searchbox",
       'desc' => "{MediaWikiPlugin.delete_searchbox}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the searchbox section
     { 'name' => "searchbox_div_exp",
       'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
     # regexp to match title suffix
     # can't use the title_sub option in HTMLPlugin instead
     # because title_sub always matches from the beginning
     { 'name' => "remove_title_suffix_exp",
       'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => ""}
     ];

# Plugin description record. new() appends it to the "OptList" it receives.
my $options = { 'name' => "MediaWikiPlugin",
		'desc' => "{MediaWikiPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };
103
# Constructor.
#
# Arguments: the class name, then $pluginlist (array ref that receives this
# class name), $inputargs and $hashArgOptLists (hash ref whose "ArgList" and
# "OptList" arrays receive this plugin's argument and option definitions).
# All three are passed on to the HTMLPlugin constructor.
#
# Returns the object built by HTMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit method-call syntax: "new HTMLPlugin(...)" is parse-ambiguous
    # in a package that defines its own new().
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
115
116
117
# Clean up one downloaded MediaWiki HTML page and pass it on to the parent
# class.
#
# Arguments (after $self): $textref (reference to the page's HTML, which is
# rewritten in place), $pluginfo, $base_dir, $file, $metadata, $doc_obj and
# $gli. The same argument list is forwarded to SUPER::process at the end.
#
# In order, the method:
#   - sets the Title metadata from <title> (unless already set),
#   - strips header, timeline, edit links and other wiki-only markup,
#   - prefixes the rules of @import'ed stylesheets with "#wikispecificstyle"
#     (fetching them with wget if they are missing) and copies them to the
#     collection's images directory,
#   - keeps only the navigation and search boxes of the footer, rewiring the
#     search form to Greenstone,
#   - removes other forms, makes links to the wiki host absolute,
#   - optionally copies the Main_Page table of contents into macros/extra.dm
#     (show_toc) and/or removes the table of contents (delete_toc),
#   - wraps the body in <div id="wikispecificstyle">.
#
# Side effects: may run wget, and may write files under $base_dir and under
# its sibling "images" and "macros" directories.
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my @head_and_body = split(/<body/i,$$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # Only trust $1 when the match actually succeeded.
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
        my @doc_properties = split(/<xml>/i,$head);
        my $doc_heading = shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if($body_text =~ /$header_exp/){
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if($body_text =~ /$header_exp/){
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    # NOTE(review): the single spaces inside <o:p> </o:p> and in the ( )+
    # substitution below were presumably "&nbsp;" entities before this copy
    # of the file was extracted -- confirm against the repository version.
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/( )+/ /sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    } else {
        $file_delimiter = "/";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir use forward slash for both windows and linux
    # print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #                                     # linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file use different delimiters : forward slash for linux; backward slash for windows
    # print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #                             # linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs=split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if(defined $base_dir && $base_dir ne ""){
        my @css_files;

        # find all the stylesheets imported with @import statement
        while($head =~ m{<style type="text/css"(.+)import "(.+)"}ig){
            push(@css_files, $2);
        }

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's images folder
        foreach my $css_entry (@css_files) {

            my $css_file = $css_entry;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;
            # plain substring test: the host name must not be read as a regexp
            if(index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = util::filename_cat($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//i,$css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check );
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already downloaded if we used the MediaWikiDownload
                # downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            my $css_content = "";
            my $css_loaded = 0;
            if(open(my $css_in, '<', $css_file_path)){
                $css_loaded = 1;
                while(my $line = <$css_in>){
                    # comment out the body element because we change the body to div
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                    if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){
                        if($line !~ m/wikispecificstyle/i){
                            $line = "#wikispecificstyle " . $line;
                        }
                    }

                    $css_content .= $line;
                }
                close($css_in);
                if(open(my $css_out, '>', $css_file_path)){
                    print $css_out $css_content;
                    close($css_out);
                } else {
                    print $outhandle "can't open file $css_file_path\n";
                }
            }

            # Copy the modified stylesheets to collection's images folder
            # for future customization
            my $images_dir = $base_dir;
            $images_dir =~ s/import$/images/;
            # the file name is whatever follows the last slash
            my $css_basename = ($css_file =~ m/(.*)\/(.*)$/) ? $2 : $css_file;
            $images_dir = util::filename_cat($images_dir, $css_basename);

            # nothing to copy when the stylesheet could not be read
            if($css_loaded && open(my $images_out, '>', $images_dir)){
                print $images_out $css_content;
                close($images_out);
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'}) {
        $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    } else {
        # no /g here: a scalar //g match would leave pos($footer) set and
        # make the searchbox match below start after the navigation box
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $& ;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
        # if($preserve_sections =~/\S/){
        #     $preserve_sections .= "</div>";
        # }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if(defined $self->{'searchbox_div_exp'}) {
        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
    }

    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if($searchbox_section =~ /\S/){
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
        # ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
        # ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;

        # $searchbox_section .= "</div>";
    } else {
        # report the pattern that failed (the section itself is empty here)
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if($preserve_sections ne ""){
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg){
        next if($3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # \Q..\E: match the opening tag literally (and keep "$form[" from
        # being read as an array subscript)
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//m;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    #  -- outgoing links
    #     -- if we have downloaded the target files, link to the internal version (relative link)
    #     -- otherwise, link to the external version (absolute links)
    #  -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # extract all the links that point at the wiki host, together with
    # their absolute (http://) replacement
    my @links;
    while($body_text =~ m/(href|src)="([^>\s]*)\Q$url_base\E\/([^>\s]*)"/ig){
        push(@links, [ "$1=\"$2$url_base/$3\"", "$1=\"http://$url_base/$3\"" ]);
    }

    foreach my $link_pair (@links) {
        my ($cur_link, $external_file_path) = @$link_pair;
        # the link is matched literally, so '+', '?', '.' etc. need no
        # hand-written escaping and nothing leaks into the replacement
        $body_text =~ s/\Q$cur_link\E/$external_file_path/i;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    # TODO: we assume the _about:content_ hasn't been specified before
    # so needs to add function to handle when the macro is already in the extra.dm
    if(defined $self->{'show_toc'} && $self->{'show_toc'}==1 && $file =~ m/Main_Page\.(html|htm)$/){

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        # fallback pattern, same as the declared default of the toc_exp argument
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*?</table>\\n";
        if(defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
            $toc_exp = $self->{'toc_exp'};
        }
        if($body_text =~ /$toc_exp/){
            $mainpage_toc = $&;
        }

        if($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;


            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = util::filename_cat($macro_path, "extra.dm");

            my $extra_dm = "";
            if(open(my $extradm_in, '<', $extradm_file)){
                while(my $line = <$extradm_in>){
                    $extra_dm .= $line;
                }
                close($extradm_in);
            } else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my @packages = split("package ", $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if($package =~ /^about/);
            }

            my $update_extra_dm = 0;

            if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/ && index($about_package, $mainpage_toc) >= 0){
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # read _content_ macro from $GSDLHOME/macros/about.dm file
                my $global_about_package = $self->read_content_from_about_dm();

                # create the extra _content_ macro for this collection
                # add the original content of the _content_ macro
                my $original_content = "";
                if (defined $global_about_package && $global_about_package =~ m/\{.*<\/div>\n\n/s) {
                    $original_content = $&;
                }

                # append the new about package to extra.dm
                $extra_dm .= "\n\npackage about\n_content_$original_content\n\n";
                $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

                $update_extra_dm = 1;
            }
            # the about package exists, but either doesn't have the _content_ macro or
            # the _content_ macro doesn't contain the toc
            else {
                # if there is a content macro
                # append a new section div for toc to the end of the document section
                if($about_package =~ /(\s|\n)*_content_(\s*)\{(.|\n)*?\}/) {
                    my $content_macro = $&;
                    my $new_content_macro = $content_macro;
                    # $1 = attributes of the document div, $2 = its contents
                    $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                    $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/mg;
                }
                # otherwise, append _content_ macro to the about package
                else {
                    my $new_about_package = $about_package;
                    my $content_macro = $self->read_content_from_about_dm();
                    my $original_content = "";
                    if (defined $content_macro && $content_macro =~ m/\{.*<\/div>\n\n/s) {
                        $original_content = $&;
                    }

                    $new_about_package .= "\n\n_content_$original_content\n\n";
                    $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/mg;
                }

                # either the case, we need to update the extra.dm
                $update_extra_dm = 1;
            }

            if($update_extra_dm==1){
                # write to the extra.dm file of the collection
                if (open(my $extradm_out, '>', $extradm_file)) {
                    print $extradm_out $extra_dm;
                    close($extradm_out);
                } else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
            # print "\nit matches toc_exp !!\n" if $body_text =~ /$self->{'toc_exp'}/;
            $body_text =~ s/$self->{'toc_exp'}//i;
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    $self->SUPER::process(@_);

    return 1;
}
584
585
# Copy <o:NAME>value</o:NAME> properties found in a piece of text into the
# document's top-section metadata.
#
# Arguments (after $self):
#   $textref  - the text to search. Despite its name it is used as a plain
#               string, not a reference.
#   $metadata - accepted but not used here.
#   $doc_obj  - object providing get_top_section() and add_utf8_metadata().
#
# $self->{'metadata_fields'} is a comma-separated list of field names. An
# entry written as tag<name> looks for <o:tag> and stores the value under
# "name"; a plain entry uses the same name for both. Matching is
# case-insensitive and a field that is not found is skipped silently.
#
# Returns nothing.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # $source_tag is the name looked for in the text,
        # $gs_name is the user's preferred gs metadata name
        my ($source_tag, $gs_name);

        # support tag<tagname>
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_tag, $gs_name) = ($1, $2);
        } else { # no <tagname> for mapping
            $source_tag = $field;
            $gs_name = $field;
        }

        # \Q..\E: the field name is literal text, not a regexp
        if ($textref =~ m/<o:\Q$source_tag\E>(.*)<\/o:\Q$source_tag\E>/i) {
            my $value = $1;

            # clean up and add
            chomp($value); # remove trailing \n, if any
            #print $outhandle " extracted \"$gs_name\" metadata \"$value\"\n"
            # if ($self->{'verbosity'} > 2);
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
629
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a path delimiter can be
# used as a pattern. No other characters are changed.
sub safe_escape_regexp
{
    my ($escaped) = @_;

    # one pass over the string: "\" becomes "\\" and "/" becomes "\/"
    $escaped =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
641
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro.
#
# The macro is taken to run from "_content_ {" to the last occurrence of
# "</div>", a blank line, "</div>" and a closing "}" on the next line.
#
# Returns the macro text, or the empty string when the file cannot be opened
# or the macro is not found. A failure to open the file is reported on
# $self->{'outhandle'} (STDERR when no usable $self was passed).
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, '<', $about_macro_file)){
        while (my $line = <$about_in>){
            $about_page_content .= $line;
        }
        close($about_in);
    } else {
        # tolerate being called as a plain function, without an object
        my $outhandle = (ref($self) && $self->{'outhandle'}) || \*STDERR;
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; only use $& when the match succeeded
    if ($about_page_content =~ m/_content_ \{.*<\/div>\n\n<\/div>\n\}/is) {
        return $&;
    }

    return "";
}
664
665	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: