Context Navigation

source: gsdl/branches/gsdl-2.74/perllib/plugins/MediaWikiPlug.pm@ 14270

Last change on this file since 14270 was 14270, checked in by oranfry, 17 years ago
merged selected changes to the gsdl trunk since r14217 into the 2.74 branch
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27	# the MediaWikiDownload plug. It trims MediaWiki functional sections such as
28	# login, discussion, history, etc.; only the navigation and search sections can be preserved.
29	# The searchbox is modified to search the Greenstone collection instead of the website.
30	# The plugin can also automatically add the table of contents from the website's Main_Page
31	# to the collection's Home page.
32
33	package MediaWikiPlug;
34
35	use HTMLPlug;
36	# use ImagePlug;
37	# use File::Copy;
38	use unicode;
39
40
41	#use strict; # every perl program should have this!
42	#no strict 'refs'; # make an exception so we can use variables as filehandles
43
# Establish the inheritance chain at compile time: MediaWikiPlug derives
# from HTMLPlug.
BEGIN {
    @MediaWikiPlug::ISA = qw(HTMLPlug);
}
47
# Plugin-specific options. Each entry describes one option (name, description
# key, type, whether it is required, and an optional default); new() pushes
# this list onto the shared "ArgList".
#
# The default regular expressions carry explicit quantifiers ("*" / "*?") so
# that they span a whole element; without them "([^>])" and "(.|\n)" would
# each match exactly one character and never match real MediaWiki markup.
my $arguments =
    [
      # show the table of contents on collection's home page
      { 'name' => "show_toc",
        'desc' => "{MediaWikiPlug.show_toc}",
        'type' => "flag",
        'reqd' => "no"},
      # set to delete the table of contents section on each MediaWiki page
      { 'name' => "delete_toc",
        'desc' => "{MediaWikiPlug.delete_toc}",
        'type' => "flag",
        'reqd' => "no"},
      # regexp to match the table of contents
      # (non-greedy, so it stops at the first </table> after the toc table)
      { 'name' => "toc_exp",
        'desc' => "{MediaWikiPlug.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n" },
      # set to delete the navigation section
      { 'name' => "delete_nav",
        'desc' => "{MediaWikiPlug.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""},
      # regexp to match the navigation section
      { 'name' => "nav_div_exp",
        'desc' => "{MediaWikiPlug.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?</div>" },
      # set to delete the searchbox section
      { 'name' => "delete_searchbox",
        'desc' => "{MediaWikiPlug.delete_searchbox}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""},
      # regexp to match the searchbox section
      { 'name' => "searchbox_div_exp",
        'desc' => "{MediaWikiPlug.searchbox_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?</div>"},
      # regexp to match title suffix
      # can't use the title_sub option in HTMLPlug instead
      # because title_sub always matches from the beginning
      { 'name' => "remove_title_suffix_exp",
        'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => ""}
    ];

# Plugin description record; new() pushes it onto the shared "OptList".
my $options = { 'name'     => "MediaWikiPlug",
                'desc'     => "{MediaWikiPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };
105
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the HTMLPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by HTMLPlug's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method call: the indirect form "new HTMLPlug(...)" is
    # ambiguous inside a package that defines its own sub new.
    my $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# Return the text of the first match of $pattern in $text, or "" when there
# is no match. Uses @- / @+ instead of $& so that a failed match can never
# hand back the result of some earlier, unrelated match.
sub _mwp_first_match {
    my ($text, $pattern, $ignore_case) = @_;
    return "" unless defined $text && defined $pattern && $pattern =~ /\S/;
    my $matched = $ignore_case ? ($text =~ m/$pattern/i) : ($text =~ m/$pattern/);
    return "" unless $matched;
    return substr($text, $-[0], $+[0] - $-[0]);
}

# Read a whole file into a string. Returns undef when it cannot be opened.
sub _mwp_read_file {
    my ($path) = @_;
    open(my $in, '<', $path) or return undef;
    local $/;
    my $content = <$in>;
    close($in);
    return defined $content ? $content : "";
}

# Replace the contents of a file. Returns 1 on success, 0 on failure.
sub _mwp_write_file {
    my ($path, $content) = @_;
    open(my $out, '>', $path) or return 0;
    print $out $content;
    return close($out) ? 1 : 0;
}

# Clean up one downloaded MediaWiki HTML page and hand it on to the parent
# class's process().
#
# Arguments (after $self):
#   $textref  - reference to the page text; it is replaced with the cleaned
#               "<body ...>" part of the page (the head is dropped)
#   $pluginfo - not used here; forwarded to SUPER::process
#   $base_dir - import directory; also used to locate the collection's
#               images/ and macros/ directories and the collection name
#   $file     - file name relative to $base_dir; its first path component is
#               taken as the wiki's base URL
#   $metadata - forwarded to extract_metadata and SUPER::process
#   $doc_obj  - document object that receives the Title metadata
#   $gli      - not used here; forwarded to SUPER::process
#
# Side effects visible in this sub: may download stylesheets with wget,
# rewrites stylesheets in the import directory, copies them into images/,
# and may rewrite the collection's macros/extra.dm.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;

    # everything before the first <body is the head, the rest is the body
    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # only trust $1 when the match succeeded
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        my @doc_properties = split(/<xml>/i, $head);
        my $doc_heading = shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    $body_text =~ s{<div([^>]*)id=(?:"|')container(?:"|')([^>]*)>.*<div([^>]*)id=(?:"|')column-content}
                   {<div$1id='container'$2><div$3id='column-content}isg;

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    # remove empty MS-Office style paragraphs
    # NOTE(review): the filler inside <o:p> may be a plain space, a
    # non-breaking space or "&nbsp;" -- all three are accepted here; confirm.
    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>(?:&nbsp;|\xA0|\s)<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>(?:&nbsp;|\xA0|\s)<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    # collapse runs of spaces
    $body_text =~ s/ +/ /g;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    } else {
        $file_delimiter = "/";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir use forward slash for both windows and linux
    #   windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #   linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file use different delimiters : forward slash for linux; backward slash for windows
    #   windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html
    #   linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        my @css_files;

        # find all the stylesheets imported with @import statement
        while ($head =~ m{<style type="text/css"(.+)import "(.+)"}ig) {
            push(@css_files, $2) if defined $2;
        }

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's images folder
        foreach my $css_entry (@css_files) {
            my $css_file = $css_entry;

            # remove prefix gli/cache directory
            $css_file =~ s{^.+gli[\\/]cache[\\/]}{}i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//g;
            # literal substring test: $url_base is a host name, not a regexp
            if (index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = util::filename_cat($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//, $css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs) - 1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check);
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already downloaded if we used the MediaWikiDownload
                # downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            my $css_content;
            if (open(my $css_in, '<', $css_file_path)) {
                $css_content = "";
                while (my $line = <$css_in>) {
                    # comment out the body element because we change the body to div
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/i;

                    # This sub runs once per page, so the same stylesheet is
                    # visited many times: never prefix a line twice.
                    if (($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i)
                        && $line !~ m/^\s*#wikispecificstyle\b/) {
                        $line = "#wikispecificstyle " . $line;
                    }
                    $css_content .= $line;
                }
                close($css_in);
                if (!_mwp_write_file($css_file_path, $css_content)) {
                    print $outhandle "can't write file $css_file_path\n";
                }
            }

            # Copy the modified stylesheets to collection's images folder
            # for future customization (skipped when the stylesheet could
            # not be read, so an existing copy is not truncated)
            if (defined $css_content && $css_file =~ m/\/([^\/]*)$/) {
                my $images_dir = $base_dir;
                $images_dir =~ s/import$/images/;
                my $images_css_path = util::filename_cat($images_dir, $1);
                _mwp_write_file($images_css_path, $css_content);
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = qr{<div class="printfooter">[\s\S]+</body>}i;
    my $footer = _mwp_first_match($body_text, $print_footer, 0);
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\\n)*?</div>";
    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
        $nav_match_exp = $self->{'nav_div_exp'};
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    } else {
        $preserve_sections = _mwp_first_match($footer, $nav_match_exp, 1);
        if ($preserve_sections eq "") {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?</div>";
    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
        $searchbox_exp = $self->{'searchbox_div_exp'};
    }

    # searched from the start of the footer, independent of where the
    # navigation section was found
    my $searchbox_section = _mwp_first_match($footer, $searchbox_exp, 1);

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the pattern that failed, not the (empty) result
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while ($body_text =~ m/(<form([^>]*)name=("|')([^>]*)("|'))/isg) {
        next if ($4 eq "q");
        push(@forms, $1);
    }
    foreach my $form (@forms) {
        # the collected opening tag is literal text, so quote it
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }


    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    #  -- outgoing links
    #  -- if we have downloaded the target files, link to the internal version (relative link)
    #  -- otherwise, link to the external version (absolute links)
    #  -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # rewrite every href/src that points below the wiki's base url to an
    # absolute http:// link (single pass; the base url is matched literally)
    if ($url_base ne "") {
        $body_text =~ s{(href|src)="[^>\s]*\Q$url_base\E/([^>\s]*)"}{$1="http://$url_base/$2"}ig;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    if (($self->{'show_toc'} || 0) == 1 && $file =~ m/Main_Page\.(html|htm)$/) {

        # extract toc of the Main_Page
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*?</table>\\n";
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            $toc_exp = $self->{'toc_exp'};
        }
        my $mainpage_toc = _mwp_first_match($body_text, $toc_exp, 0);

        if ($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//g;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = util::filename_cat($macro_path, "extra.dm");

            my $extra_dm = _mwp_read_file($extradm_file);
            if (!defined $extra_dm) {
                print $outhandle "can't open file $extradm_file\n";
                $extra_dm = "";
            }

            # check whether we have changed the macros
            my @packages = split("package ", $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if ($package =~ /^about/);
            }

            my $update_extra_dm = 0;

            # closing part shared by every newly written _content_ macro
            my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

            # the toc is HTML text, so look for it literally
            if ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/
                && index($about_package, $mainpage_toc) >= 0) {
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # the about package exists and already has a _content_ macro:
            # append a new section div for toc to the end of the document section
            elsif ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/) {
                my $content_macro = _mwp_first_match($about_package, '\s*_content_\s*\{[\s\S]*?\}', 0);
                if ($content_macro ne "") {
                    my $new_content_macro = $content_macro;
                    $new_content_macro =~ s{<div([^>]*)class="document">([\s\S]*)</div>}
                                           {<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n</div>\n</div>};
                    $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
                    $update_extra_dm = 1;
                }
            }
            # either extra.dm has no about package, or the about package
            # has no _content_ macro: build one from $GSDLHOME/macros/about.dm
            else {
                my $global_content = read_content_from_about_dm();
                # everything from the opening brace up to the last "</div>\n\n"
                my $content_start = _mwp_first_match($global_content, '\{[\s\S]*</div>\n\n', 0);

                if ($content_start eq "") {
                    print $outhandle "can't find the _content_ macro in about.dm; extra.dm not changed\n";
                }
                elsif ($about_package !~ /\S/) {
                    # append the new about package to extra.dm
                    $extra_dm .= "\n\npackage about\n_content_" . $content_start . "\n\n";
                    $extra_dm .= $toc_section;
                    $update_extra_dm = 1;
                }
                else {
                    # append _content_ macro to the existing about package
                    my $new_about_package = $about_package;
                    $new_about_package .= "\n\n_content_" . $content_start . "\n\n";
                    $new_about_package .= $toc_section;
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
                    $update_extra_dm = 1;
                }
            }

            if ($update_extra_dm == 1) {
                # write to the extra.dm file of the collection
                if (!_mwp_write_file($extradm_file, $extra_dm)) {
                    print "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if (($self->{'delete_toc'} || 0) == 1) {
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            $body_text =~ s/$self->{'toc_exp'}//i;
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/(<body[^>]*>)/$1\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    # @_ no longer contains $self (it was shifted off above)
    $self->SUPER::process(@_);

    return 1;
}
580
581
# Copy the values of <o:NAME>...</o:NAME> elements into document metadata.
#
# Arguments (after $self):
#   $textref  - despite its name this is used as a plain string: the text
#               that is searched for <o:NAME> elements
#   $metadata - not used in this sub
#   $doc_obj  - document object; each value found is added to its top
#               section with add_utf8_metadata
#
# $self->{'metadata_fields'} is a comma separated list. Each entry is either
# "name" (stored under the same name) or "name<gsname>" (the element
# <o:name> is stored as metadata "gsname").
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    # metadata fields to extract/save. 'key' is the (lowercase) name of the
    # html meta, 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        my $orig_field;
        # support tag<tagname>; the quantifiers let both names be longer
        # than a single character
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $find_fields{lc($1)} = $2; # lc = lowercase
            $orig_field = $1;
        } else { # no <tagname> for mapping
            # "$field" is the user's preferred gs metadata name
            $find_fields{lc($field)} = $field; # lc = lowercase
            $orig_field = $field;
        }
        next if ($orig_field eq "");

        # the field name is literal text, not a regular expression
        if ($textref =~ m/<o:\Q$orig_field\E>(.*)<\/o:\Q$orig_field\E>/i) {
            my $value = $1;
            # clean up and add
            chomp($value); # remove trailing \n, if any
            my $tag = $find_fields{lc($orig_field)};
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
        }
    }
    return;
}
625
# Make a path delimiter safe for use as a regular expression: every
# backslash and every forward slash in the argument gets a backslash put in
# front of it. Returns the escaped copy.
sub safe_escape_regexp
{
    my ($delimiter) = @_;

    # one pass handles both characters; the backslashes that are added are
    # never escaped again
    $delimiter =~ s{([\\/])}{\\$1}g;

    return $delimiter;
}
637
# Read $GSDLHOME/macros/about.dm and return its "_content_ { ... }" macro,
# i.e. the text from "_content_ {" up to and including the closing
# "</div>\n\n</div>\n}".
#
# Takes no arguments. Returns "" when the file cannot be opened or does not
# contain such a macro; a message is printed to STDERR when the file cannot
# be opened (this sub has no $self and therefore no plugin output handle).
sub read_content_from_about_dm
{
    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";

    if (open(my $about_in, '<', $about_macro_file)) {
        local $/;                       # slurp the whole file
        my $text = <$about_in>;
        $about_page_content = $text if defined $text;
        close($about_in);
    } else {
        print STDERR "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; only use the capture when the match
    # succeeded, and keep the literal braces escaped
    if ($about_page_content =~ m/(_content_ \{(?:.|\n)*<\/div>\n\n<\/div>\n\})/i) {
        return $1;
    }
    return "";
}
657
658	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: