Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlug.pm@ 14496

Last change on this file since 14496 was 14496, checked in by anna, 17 years ago
Trimmed contents before column-content div and fixed the css modifications.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.4 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27	# the MediaWikiDownload plug. This plugin will trim MediaWiki functional sections like
28	# login, discussion, history, etc. Only the navigation and search section could be preserved.
29	# Searchbox will be modified to search the Greenstone collection instead of the website.
30	# It also can automatically add the table of contents on the website's Main_Page to the
31	# collection's Home page.
32
33	package MediaWikiPlug;
34
35	use HTMLPlug;
36	# use ImagePlug;
37	# use File::Copy;
38	use unicode;
39
40
41	#use strict; # every perl program should have this!
42	#no strict 'refs'; # make an exception so we can use variables as filehandles
43
# Declare HTMLPlug as the parent class at compile time, so that method
# resolution (including SUPER:: calls made by this package) goes through it.
BEGIN {
    @MediaWikiPlug::ISA = ('HTMLPlug');
}
47
# Plugin-specific options. Each entry is a hash with 'name', 'desc' (a
# "{MediaWikiPlug.xxx}" key, presumably looked up in a strings resource --
# TODO confirm), 'type', 'reqd' and, where one exists, a default in 'deft'.
#
# The regexp defaults are double-quoted strings, so "\\n" and "\\\"" reach the
# regex engine as \n and \". The patterns deliberately use lazy "(.|\\n)*?" so
# that a match stops at the first closing tag after the element's opening tag.
my $arguments =
    [
     # show the table of contents on collection's home page
     { 'name' => "show_toc",
       'desc' => "{MediaWikiPlug.show_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # set to delete the table of contents section on each MediaWiki page
     { 'name' => "delete_toc",
       'desc' => "{MediaWikiPlug.delete_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # regexp to match the table of contents
     { 'name' => "toc_exp",
       'desc' => "{MediaWikiPlug.toc_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n" },
     # set to delete the navigation section
     { 'name' => "delete_nav",
       'desc' => "{MediaWikiPlug.delete_nav}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the navigation section
     { 'name' => "nav_div_exp",
       'desc' => "{MediaWikiPlug.nav_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
     # set to delete the searchbox section
     { 'name' => "delete_searchbox",
       'desc' => "{MediaWikiPlug.delete_searchbox}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the searchbox section
     { 'name' => "searchbox_div_exp",
       'desc' => "{MediaWikiPlug.searchbox_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
     # regexp to match title suffix
     # can't use the title_sub option in HTMLPlug instead
     # because title_sub always matches from the beginning
     { 'name' => "remove_title_suffix_exp",
       'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => ""}
     ];

# Description of the plugin itself; 'args' points at the list above.
my $options = { 'name'     => "MediaWikiPlug",
                'desc'     => "{MediaWikiPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
105
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through to HTMLPlug->new unchanged
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the object built by HTMLPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Arrow form instead of the indirect-object "new HTMLPlug(...)", which
    # Perl can mis-parse when a sub called "new" is in scope (as it is here).
    my $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# process -- clean up one downloaded MediaWiki HTML page, then hand it to
# the parent class via SUPER::process.
#
# Arguments (after $self): $textref (reference to the page text; rewritten in
# place), $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli.
# Options read from $self in this sub: outhandle, verbosity, metadata_fields,
# remove_title_suffix_exp, nav_div_exp, delete_nav, searchbox_div_exp,
# delete_searchbox, show_toc, toc_exp, delete_toc.
# Side effects visible in this sub: may run wget to fetch stylesheets, rewrites
# stylesheet files below $base_dir, writes a copy of each stylesheet into the
# sibling "images" directory, and may rewrite the collection's macros/extra.dm.
# Always returns 1; the return value of SUPER::process is discarded.
#
# NOTE(review): this listing looks like it was captured from a repository
# browser. Every line carries a line-number gutter, and many patterns appear
# to have lost "*" quantifiers and to show "\|" where "|" was probably meant
# (for example "([^>])id=" was presumably "([^>]*)id="). Confirm every
# pattern against the repository copy before relying on it.
120	sub process {
121	my $self = shift (@_);
122	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
123	my $outhandle = $self->{'outhandle'};
124
125	print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
126
# Everything before the first "<body" is treated as the head; the remainder
# is re-joined so later "<body" occurrences stay inside $body_text.
127	my @head_and_body = split(/<body/i,$$textref);
128	my $head = shift(@head_and_body);
129	my $body_text = join("<body", @head_and_body);
130
# NOTE(review): the match result is not tested, so when there is no <title>
# $1 may still hold a capture from an earlier successful match. The
# "my ... if ..." form is also documented by perlsyn as having undefined
# behaviour.
131	$head =~ m/<title>(.+)<\/title>/i;
132	my $doctitle = $1 if defined $1;
133
# Only when metadata_fields is set: take the text between the first <xml> and
# the following </xml> in the head and pass it to extract_metadata.
134	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
135	my @doc_properties = split(/<xml>/i,$head);
136	my $doc_heading = shift(@doc_properties);
137	my $rest_doc_properties = join(" ", @doc_properties);
138
139	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
140	my $extracted_metadata = shift (@extracted_metadata);
141	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
142	}
143
144	# set the title here if we haven't found it yet
145	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
146	if (defined $doctitle && $doctitle =~ /\S/) {
147	# remove suffix in title if required
148	my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
149	if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
150	$doctitle =~ s/$remove_suffix_exp//i;
151	}
152	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
153	} else {
154	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
155	}
156	}
157
158	# we are only interested in the column-contents div <div id="column-content">
159	# remove header section, it may contain header images or additional search boxes
160	my $header_exp = "<div([^>])id=(\"\|')container(\"\|')([^>])>(.\|\\n)<div([^>])id=(\"\|')column-content";
161	if($body_text =~ /$header_exp/){
162	$body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
163	} else {
164	$header_exp = "(.\|\\n)?<div([^>])?id=(\"\|')column-content";
165	if($body_text =~ /$header_exp/){
166	$body_text =~ s/$header_exp/<div$2id='column-content/i;
167	}
168	}
169
170	# remove timeline
171	$body_text =~ s/<div([^>])class=("\|')smwtimeline("\|')[\s\S]?<\/div>//mg;
172
173	# remove extra bits
174	my $extra_bits = "Retrieved from(.+)</a>\"";
175	$body_text =~ s/$extra_bits//isg;
176
# NOTE(review): the single spaces in the next four patterns may originally
# have been an HTML entity that the page capture decoded -- TODO confirm.
177	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
178	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
179	$body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
180	$body_text =~ s/( )+/ /sg;
181
182	# get rid of the [edit] buttons
183	$body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
184	# get rid of the last time edit information at the bottom
185	$body_text =~ s/<a href="([^>])edit([^>])"([^>]?)>(\w+)<\/a> \d\d:\d\d,([\s\|\w]?)\(PST\)//g;
186	# get rid of the (Redirected from ...)
187	$body_text =~ s/\(Redirected from <a ([^>])>(\w\|\s)?<\/a>\)//isg;
188
189	# escape texts macros
190	$body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
191	# may change the links, like Greenstone_Documentation_All.html, then change back
192	$body_text =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
193
194	# define file delimiter for different platforms
# NOTE(review): $ENV{'GSDLOS'} is matched without a defined() check.
195	my $file_delimiter;
196	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
197	$file_delimiter = "\\";
198	} else {
199	$file_delimiter = "/";
200	}
201
202	# IMPORTANT: different delimiter for $base_dir and $file
203	# $base_dir use forward slash for both windows and linux
204	# print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
205	# linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
206	# $file use different delimiters : forward slash for linux; backward slash for windows
207	# print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html
208	# linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
209
210	# get the base url for the MediaWiki website
# The first path component of $file is taken as the site's host name.
211	my $safe_delimiter = &safe_escape_regexp($file_delimiter);
212	my @url_dirs=split($safe_delimiter, $file);
213	my $url_base = $url_dirs[0];
214
215	# Re-check css files associated with MediaWiki pages
216	if(defined $base_dir && $base_dir ne ""){
217	my @css_files;
218	my $css_file_count = 0;
219
220	# find all the stylesheets imported with @import statement
221	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
222	$css_files[$css_file_count++] = $2 if defined $2;
223	}
224
225	# download the stylesheets if we haven't downloaded them yet
226	# add prefix to each style elmement, comment out the body element
227	# and copy the files to collection's images folder
228	for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
229
230	my $css_file = $css_files[$css_file_count];
231
232	# remove prefix gli/cache directory
233	$css_file =~ s/^(.+)gli(\\\|\/)cache(\\\|\/)//i;
234
235	# change the \ delimiter in $css_file to / for consistency
236	$css_file =~ s/\\/\//isg;
# NOTE(review): $url_base is interpolated as a pattern without \Q..\E, so the
# dots in a host name match any character.
237	if($css_file !~ /$url_base/) {
238	$css_file = $url_base . $css_file;
239	}
240
241	# trim the ? mark append to the end of a stylesheet
242	$css_file =~ s/\?(.+)$//isg;
243
244	my $css_file_path = &util::filename_cat($base_dir, $css_file);
245
246	# do nothing if we have already downloaded the css files
247	if (! -e $css_file_path) {
248
249	# check the stylesheet's directory in the import folder
250	# if the directory doesn't exist, create one
251	my @dirs = split(/\//i,$css_file);
252	my $path_check = "$base_dir/";
253	for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
254	$path_check .= $dirs[$i] . "/";
255	mkdir($path_check) if (! -d $path_check );
256	}
257
258	# NOTE: wget needs configuration to directly access Internet
259	# These files should already downloaded if we used the MediaWikiDownload
260	# downloading
# From here on $css_file holds a full http:// URL, not a relative path.
# wget is run in list form, so no shell is involved.
261	$css_file = "http://$css_file";
262	print "\ndownloading : " . $css_file . "\n\n";
263	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
264	if ($? != 0) {
265	print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
266	print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
267	unlink("$css_file_path");
268	}
269	} # done with download
270
271	# add a prefix "#wikispecificstyle" to each element
272	# because we want to preserve this website's formats and don't want to mess up with Greenstone formats
273	# so we will wrap the web page with a div with id = wikispecificstyle
# NOTE(review): two-argument open on bareword handles; the re-open for
# writing below is not checked, and the file is rewritten in place.
274	my $css_content;
275	if(open(INPUT, "<$css_file_path")){
276	while(my $line = <INPUT>){
277	# comment out the body element because we change the body to div
278	$line =~ s/^(\s)body(\s){(\s)$/$1\/body$2*\/{$3/isg;
279
280	if($line =~ m/^(.+)\{/i \|\| $line =~ m/^(\s)*#/i){
281	if($line !~ m/wikispecificstyle/i){
282	$line = "#wikispecificstyle " . $line;
283	}
284	}
285
286	$css_content .= $line;
287	}
288	close(INPUT);
289	open(OUTPUT, ">$css_file_path");
290	print OUTPUT $css_content;
291	close(OUTPUT);
292	}
293
294	# Copy the modified stylesheets to collection's images folder
295	# for future customization
# NOTE(review): $2 is used without checking that the match succeeded, and
# $css_content is undefined here if the stylesheet could not be opened.
296	my $images_dir = $base_dir;
297	$images_dir =~ s/import$/images/;
298	$css_file =~ m/(.)\/(.)$/;
299	$images_dir = &util::filename_cat($images_dir, $2);
300
301	if(open(OUTPUT, ">$images_dir")){
302	print OUTPUT $css_content;
303	close(OUTPUT);
304	}
305	}
306	}
307
308
309	# by default, only preserve navigation box and search box
310	# others like toolbox, interaction, languages box, will be removed
311
312	# extract the larger part -- footer section
# NOTE(review): $& is read without testing the match, so a page without a
# printfooter div would pick up the text of an earlier successful match.
313	my $print_footer = "<div class=\"printfooter\">(.\|\n)+</body>";
314	$body_text =~ /$print_footer/;
315	my $footer = "";
316	$footer = $& if defined $&;
317	$footer =~ s/<\/body>//isg;
318
319	# trim the comments first
320	$footer =~ s/<!--[\s\S]?--[ \t\n\r]>//isg;
321
322	# contain sections that are to be preserved
323	my $preserve_sections = "";
324
325	# process the navigation section
326	my $nav_match_exp = "<div([^>])id=(\"\|')p-navigation(\"\|')(.\|\n)?<\/div>";
327	if (defined $self->{'nav_div_exp'}) {
328	$nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
329	}
330
331	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
332	# do nothing
333	} else {
334	if ($footer =~ m/$nav_match_exp/ig) {
335	$preserve_sections = $& ;
336	} else {
337	print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
338	}
339	# if($preserve_sections =~/\S/){
340	# $preserve_sections .= "</div>";
341	# }
342	}
343
344	# process the searchbox section
345	my $searchbox_exp = "<div([^>])id=(\"\|')p-search(\"\|')(.\|\\n)?<\/div>";
346	if(defined $self->{'searchbox_div_exp'}) {
347	$searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
348	}
349
# NOTE(review): same unchecked $& pattern as for the footer above.
350	my $searchbox_section = "";
351	$footer =~ m/$searchbox_exp/ig;
352	$searchbox_section = $& if defined $&;
353
354	# make the searchbox form work in Greenstone
355	if($searchbox_section =~ /\S/){
356	# replace action
357	$searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
358
359	# remove buttons
360	$searchbox_section =~ s/name="search"/name="q"/isg;
361	$searchbox_section =~ s/name="go"//isg;
362	$searchbox_section =~ s/name="fulltext"//isg;
363
364	# get collection name from $base_dir for c param
365	$base_dir =~ m/\/collect\/(.+)\//i;
366	my $collection_name = "";
367	$collection_name = $1 if defined $1;
368
369	# add Greenstone search params
370	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
371	."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
372	# ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
373	# ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
374
375	$searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
376
377	# $searchbox_section .= "</div>";
378	} else {
# NOTE(review): this branch is only reached when $searchbox_section has no
# non-space text, so the message prints nothing useful; $searchbox_exp was
# presumably intended (compare the navigation message above).
379	print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
380	}
381
382	# either delete or replace the searchbox
383	if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
384	# do nothing
385	} else {
386	$preserve_sections .= "\n$searchbox_section\n";
387	}
388
389	if($preserve_sections ne ""){
390	$preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
391	}
392	$preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
393
# The whole printfooter-to-</body> span is replaced by the preserved
# navigation/search sections built above.
394	$body_text =~ s/$print_footer/$preserve_sections/isg;
395
396
397	# delete other forms in the page
# NOTE(review): each collected opening tag is later interpolated as a pattern
# without \Q..\E, and "$form[" relies on Perl guessing that a character class
# (not an element of @form) is meant.
398	my @forms;
399	my $form_count = 0;
400	while($body_text =~ m/<form([^>])name=("\|')([^>"'])?("\|')/isg){
401	next if($3 eq "searchform");
402	$forms[$form_count++] = $&;
403	}
404	foreach my $form (@forms) {
405	$body_text =~ s/$form[\s\S]*?<\/form>//m;
406	}
407
408	# process links.
409	# because current WGET 1.10 the -k and -E option doesn't work together
410	# need to 'manually' convert the links to relative links
411	# Dealing with 3 types of links:
412	# -- outgoing links
413	# -- if we have downloaded the target files, link to the internal version (relative link)
414	# -- otherwise, link to the external version (absolute links)
415	# -- in-page links (relative link)
416
417	# NOTE: (important)
418	# must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
419	# otherwise, the internal links may have problems
420
421	# remove the title attribute of <a> tag
422	$body_text =~ s/<a([^>])title="(.?)"/<a$1/isg;
423
424	# extract all the links
425	my @links;
426	my $link_count = 0;
427	while($body_text =~ m/(href\|src)="([^>\s])$url_base\/([^>\s])"/ig){
428	$links[$link_count++] = "$1=\"$2$url_base/$3\"";
429	}
430
# Each collected href/src that contains $url_base is rewritten to an absolute
# http://$url_base/... URL. Only "+" is escaped before the link text is used
# as a pattern.
431	foreach my $cur_link (@links) {
432	# escape greedy match + character
433	$cur_link =~ s/\+/\\+/isg;
434
435	$cur_link =~ m/(.+)"([^>])$url_base\/([^>\s])"/;
436	my $external_file_path = "$1\"http://$url_base/$3\"";
437
438	$body_text =~ s/$cur_link/$external_file_path/i;
439	}
440
441	# tag links to new wiki pages as red
# NOTE(review): the replacement here and in the "external text" substitution
# below ends in ")>" -- the ")" looks unintended and would be emitted into
# the tag.
442	$body_text =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
443
444	# tag links to pages external of the MediaWiki website as blue
445	$body_text =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
446
447
448	# process the table-of-contents section
449	# if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
450	# 1. read _content_ macro from about.dm
451	# 2. append the toc, change all links to the Greenstone internal format for relative links
452	# 3. write to the extra.dm
453	# TODO: we assume the _about:content_ hasn't been specified before
454	# so needs to add function to handle when the macro is already in the extra.dm
# NOTE(review): show_toc is compared with == without a defined() check, and
# the "." before (html|htm) is unescaped.
455	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
456
457	# extract toc of the Main_Page
458	my $mainpage_toc = "";
459	my $toc_exp = "<table([^>])id=(\"\|')toc(\"\|')(.\|\\n)</table>\\n";
460	if($self->{'toc_exp'} =~ /\S/){
461	$toc_exp = $self->{'toc_exp'};
462	}
463	if($body_text =~ /$toc_exp/){
464	$mainpage_toc = $&;
465	}
466
467	if($mainpage_toc =~ /\S/) {
468
469	# change the in-page links to relative links, for example, change <a href="#section1"> to
470	# <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
471	my $file_url_format = $file;
472	$file_url_format =~ s/\\/\//isg;
473	$file_url_format = "http://" . $file_url_format;
474
475	# encode as URL, otherwise doesn't work on Windows
476	$file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
477	$mainpage_toc =~ s/<a href="([^>"#])#([^>"])"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
478
479
480	# read the collection's extra.dm
481	my $macro_path = $base_dir;
482	$macro_path =~ s/import$/macros/;
483	my $extradm_file = &util::filename_cat($macro_path, "extra.dm");
484
# NOTE(review): INPUT is closed even when the open failed.
485	my $extra_dm = "";
486	if(open(INPUT, "<$extradm_file")){
487	while(my $line = <INPUT>){
488	$extra_dm .= $line;
489	}
490	} else {
491	print $outhandle "can't open file $extradm_file\n";
492	}
493	close(INPUT);
494
495	# check whether we have changed the macros
# The last chunk of extra.dm that starts with "about" (after splitting on
# "package ") is kept as the about package.
496	my @packages = split("package ", $extra_dm);
497	my $about_package = "";
498	foreach my $package (@packages) {
499	$about_package = "package " . $package if($package =~ /^about/);
500	}
501
502	my $update_extra_dm = 0;
503
# NOTE(review): $mainpage_toc (raw HTML) is interpolated as a pattern here,
# and $content_macro / $about_package are used as patterns further down, all
# without \Q..\E, so regex metacharacters in them change what is matched.
504	if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){
505	print $outhandle "_content_ macro already changed!!!!\n";
506	}
507	# if extra.dm doesn't have an "about package"
508	elsif ($about_package !~ /\S/) {
509	# read _content_ macro from $GSDLHOME/macros/about.dm file
510	my $global_about_package = &read_content_from_about_dm();
511
512	# create the extra _content_ macro for this collection
513	# add the original content of the _content_ macro
514	$global_about_package =~ m/{(.\|\n)*<\/div>\n\n/;
515
516	# append the new about package to extra.dm
517	$extra_dm .= "\n\npackage about\n_content_$&\n\n";
518	$extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
519
520	$update_extra_dm = 1;
521	}
522	# the about package exists, but either doesn't have the _content_ macro or
523	# the _content_ macro doesn't contain the toc
524	else {
525	# check if there is a content macro
526	my $content_macro_existed = 0;
527	$content_macro_existed = ($about_package =~ /(\s\|\n)_content_(\s){/);
528
529	# if there is one
530	# append a new section div for toc to the end of the document section
531	if($content_macro_existed ==1) {
532	$about_package =~ /(\s\|\n)_content_(\s){(.\|\n)*?}/;
533	my $content_macro = $&;
534	my $new_content_macro = $content_macro;
# NOTE(review): as shown, this pattern has a single capture group, yet the
# replacement uses both $1 and $2 -- TODO confirm against the repository copy.
535	$new_content_macro =~ s/<div[^>]class="document">(.\|\n)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
536	$extra_dm =~ s/$content_macro/$new_content_macro/mg;
537	}
538	# otherwise, append _content_ macro to the about package
539	else {
540	my $new_about_package = $about_package;
# NOTE(review): $content_macro has no "my" in this branch (the declaration
# above is scoped to the sibling branch), so this assigns a package variable.
541	$content_macro = &read_content_from_about_dm();
542	$content_macro =~ m/{(.\|\n)*<\/div>\n\n/;
543
544	$new_about_package .= "\n\n_content_$&\n\n";
545	$new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
546	$extra_dm =~ s/$about_package/$new_about_package/mg;
547	}
548
549	# either the case, we need to update the extra.dm
550	$update_extra_dm = 1;
551	}
552
553	if($update_extra_dm==1){
554	# write to the extra.dm file of the collection
# NOTE(review): OUTPUT is closed even when the open failed, and this error
# goes to the default output handle rather than $outhandle.
555	if (open(OUTPUT, ">$extradm_file")) {
556	print OUTPUT $extra_dm;
557	} else {
558	print "can't open $extradm_file\n";
559	}
560	close(OUTPUT);
561	}
562	} else {
563	print $outhandle "Main_Page doesn't have a table-of-contents section\n";
564	}
565	}
566
567	# If delete_toc is set, remove toc and tof contents.
568	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
569	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
570	# print "\nit matches toc_exp !!\n" if $body_text =~ /$self->{'toc_exp'}/;
571	if ($body_text =~ /$self->{'toc_exp'}/) {
572	$body_text =~ s/$self->{'toc_exp'}//i;
573	}
574	}
575	}
576
# The head is dropped at this point: only "<body" plus the cleaned body text
# is written back through $textref.
577	$$textref = "<body" . $body_text;
578
579	# Wrap the whole page with <div id="wikispecificstyle"></div>
580	# keep the style of this website and don't mess up with the Greenstone styles
581	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
582	$$textref =~ s/<\/body>/<\/div><\/body>/is;
583
# @_ no longer contains $self (it was shifted off at the top), so the parent
# receives the original argument list.
584	$self->SUPER::process(@_);
585
586	return 1;
587	}
588
589
# Look for <o:FIELD>value</o:FIELD> elements in a piece of text and store each
# value found as metadata on the document's top section.
#
# Arguments:
#   $self     - plugin object; $self->{'metadata_fields'} is a comma separated
#               list whose entries are either "field" or "field<gsname>",
#               where gsname is the metadata name to store the value under
#   $textref  - despite its name, a plain string holding the text to search
#   $metadata - not used here; kept so existing callers need no change
#   $doc_obj  - document object providing get_top_section() and
#               add_utf8_metadata()
#
# Returns nothing. Does nothing when $textref or metadata_fields is undefined.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # support tag<tagname>: the part before "<" is the element to look
        # for, the part inside "<...>" is the user's preferred metadata name.
        # Without a mapping the field name serves as both.
        my ($source_name, $gs_name) = ($field, $field);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }

        # an empty entry (e.g. from a trailing comma) would otherwise
        # search for the element "<o:>"
        next if ($source_name !~ /\S/);

        # \Q..\E: the field name is user-supplied text, not a pattern
        if ($textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
633
# Return a copy of the given string with every backslash and every forward
# slash preceded by a backslash, so that a file delimiter ("\" or "/") can be
# used as a pattern. No other characters are escaped.
sub safe_escape_regexp
{
    my $regexp = shift (@_);

    # One pass over both characters gives the same result as doubling the
    # backslashes first and escaping the slashes afterwards.
    $regexp =~ s{([\\/])}{\\$1}g;

    return $regexp;
}
645
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# from "_content_ {" up to and including the closing "</div>\n\n</div>\n}".
#
# Arguments:
#   $outhandle - optional filehandle for the error message; defaults to
#                STDERR. Existing callers pass no arguments.
#
# Returns the macro text, or "" when the file cannot be opened or holds no
# such macro.
sub read_content_from_about_dm
{
    my ($outhandle) = @_;
    $outhandle = \*STDERR if (!defined $outhandle);

    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");

    my $input;
    if (!open($input, "<", $about_macro_file)) {
        print $outhandle "can't open file $about_macro_file\n";
        return "";
    }

    my $about_page_content = "";
    while (my $line = <$input>) {
        $about_page_content .= $line;
    }
    close($input);

    # extract the _content_ macro
    # The result is taken from an explicit capture of a tested match, so a
    # failed match cannot hand back text left over from an earlier one.
    # The braces are escaped because they are literal characters here.
    if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
        return $1;
    }

    return "";
}
665
666	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: