Context Navigation

source: gsdl/trunk/perllib/plugins/MediaWikiPlug.pm@ 14337

Last change on this file since 14337 was 14337, checked in by anna, 17 years ago
Fixed a bug in extracting search box.
Property svn:keywords set to `Author Date Id Revision`
File size: 26.0 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27	# the MediaWikiDownload plug. It trims MediaWiki functional sections such as
28	# login, discussion, history, etc.; only the navigation and search sections can be preserved.
29	# The search box is modified to search the Greenstone collection instead of the website.
30	# The plugin can also automatically add the table of contents on the website's Main_Page to the
31	# collection's Home page.
32
33	package MediaWikiPlug;
34
35	use HTMLPlug;
36	# use ImagePlug;
37	# use File::Copy;
38	use unicode;
39
40
41	#use strict; # every perl program should have this!
42	#no strict 'refs'; # make an exception so we can use variables as filehandles
43
# Establish inheritance at compile time: MediaWikiPlug is a subclass of HTMLPlug.
BEGIN {
    @MediaWikiPlug::ISA = qw(HTMLPlug);
}
47
# Plugin-specific options. new() appends this list to the shared "ArgList".
# Each entry has a name, a description key, a type ("flag" or "regexp"),
# a "reqd" marker and, for some entries, a default value ("deft").
#
# The three default patterns below are double-quoted strings that are later
# used as regular expressions, so backslashes are doubled where the regex
# itself needs one.  "(.|\n)*" lets a pattern span several lines; the
# non-greedy "*?" form stops at the first closing </div>.
my $arguments =
    [
     # show the table of contents on collection's home page
     { 'name' => "show_toc",
       'desc' => "{MediaWikiPlug.show_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # set to delete the table of contents section on each MediaWiki page
     { 'name' => "delete_toc",
       'desc' => "{MediaWikiPlug.delete_toc}",
       'type' => "flag",
       'reqd' => "no"},
     # regexp to match the table of contents
     { 'name' => "toc_exp",
       'desc' => "{MediaWikiPlug.toc_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*</table>\\n" },
     # set to delete the navigation section
     { 'name' => "delete_nav",
       'desc' => "{MediaWikiPlug.delete_nav}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the navigation section
     { 'name' => "nav_div_exp",
       'desc' => "{MediaWikiPlug.nav_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>" },
     # set to delete the searchbox section
     { 'name' => "delete_searchbox",
       'desc' => "{MediaWikiPlug.delete_searchbox}",
       'type' => "flag",
       'reqd' => "no",
       'deft' => ""},
     # regexp to match the searchbox section
     { 'name' => "searchbox_div_exp",
       'desc' => "{MediaWikiPlug.searchbox_div_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"},
     # regexp to match the title suffix to remove;
     # the title_sub option in HTMLPlug cannot be used instead
     # because title_sub always matches from the beginning
     { 'name' => "remove_title_suffix_exp",
       'desc' => "{MediaWikiPlug.remove_title_suffix_exp}",
       'type' => "regexp",
       'reqd' => "no",
       'deft' => ""}
    ];
99
# Description record for this plugin; new() appends it to the shared "OptList".
my $options = {
    'name'     => 'MediaWikiPlug',
    'desc'     => '{MediaWikiPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
105
# new -- constructor.
#   $class           : class name to bless the result into
#   $pluginlist      : array ref; $class is appended to it
#   $inputargs       : passed through unchanged to HTMLPlug->new
#   $hashArgOptLists : hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
# Returns the object built by HTMLPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if defined $arguments;
    push(@{$hashArgOptLists->{"OptList"}}, $options)      if defined $options;

    # Explicit class-method call: the indirect-object form "new HTMLPlug(...)"
    # is ambiguous inside a package that defines its own new().
    my $self = HTMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
120	sub process {
	# process -- clean up one downloaded MediaWiki HTML page and then hand it to
	# the parent class through SUPER::process.
	# Arguments after $self: $textref (reference to the page text; it is rewritten
	# in place near the end), $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli.
	# Returns 1.
	# Side effects visible in this sub: may run wget to fetch stylesheets, rewrites
	# stylesheet files under $base_dir and writes copies into the sibling "images"
	# directory, and may rewrite macros/extra.dm when show_toc is set.
	# NOTE(review): in this listing many patterns contain "\|" and appear to have
	# lost "*" quantifiers (compare listing lines 170 and 171).  This looks like
	# text-extraction damage -- verify every regex below against the repository
	# copy before relying on it.
121	my $self = shift (@_);
122	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
123	my $outhandle = $self->{'outhandle'};
124
125	print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
126
	# everything before the first "<body" is treated as the head; the remaining
	# pieces are glued back together so later "<body" occurrences are kept
127	my @head_and_body = split(/<body/i,$$textref);
128	my $head = shift(@head_and_body);
129	my $body_text = join("<body", @head_and_body);
130
	# NOTE(review): $1 is read without checking that the match succeeded, and
	# "my ... if" is a conditional declaration -- confirm this behaves as intended.
131	$head =~ m/<title>(.+)<\/title>/i;
132	my $doctitle = $1 if defined $1;
133
	# when metadata_fields is set, the text between <xml> and </xml> in the head
	# is passed to extract_metadata
134	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
135	my @doc_properties = split(/<xml>/i,$head);
136	my $doc_heading = shift(@doc_properties);
137	my $rest_doc_properties = join(" ", @doc_properties);
138
139	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
140	my $extracted_metadata = shift (@extracted_metadata);
141	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
142	}
143
144	# set the title here if we haven't found it yet
145	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
146	if (defined $doctitle && $doctitle =~ /\S/) {
147	# remove suffix in title if required
148	my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
149	if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
150	$doctitle =~ s/$remove_suffix_exp//i;
151	}
152	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
153	} else {
154	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
155	}
156	}
157
158	# we are only interested in the column-contents div <div id="column-content">
159	# remove header section, it may contain header images or additional search boxes
160	my $header_exp = "<div([^>])id=(\"\|')container(\"\|')([^>])>(.\|\\n)<div([^>])id=(\"\|')column-content";
161	$body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
162
163	# remove timeline
164	$body_text =~ s/<div([^>])class=("\|')smwtimeline("\|')[\s\S]?<\/div>//mg;
165
166	# remove extra bits
167	my $extra_bits = "Retrieved from(.+)</a>\"";
168	$body_text =~ s/$extra_bits//isg;
169
	# drop empty <o:p> paragraphs, flip the "!vml" conditional marker, and
	# collapse runs of spaces into one
170	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
171	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
172	$body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
173	$body_text =~ s/( )+/ /sg;
174
175	# get rid of the [edit] buttons
176	$body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
177	# get rid of the last time edit information at the bottom
	# (the pattern only matches timestamps followed by the literal "(PST)")
178	$body_text =~ s/<a href="([^>])edit([^>])"([^>]?)>(\w+)<\/a> \d\d:\d\d,([\s\|\w]?)\(PST\)//g;
179	# get rid of the (Redirected from ...)
180	$body_text =~ s/\(Redirected from <a ([^>])>(\w\|\s)?<\/a>\)//isg;
181
182	# escape texts macros
183	$body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
184	# may change the links, like Greenstone_Documentation_All.html, then change back
185	$body_text =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
186
187	# define file delimiter for different platforms
188	my $file_delimiter;
189	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
190	$file_delimiter = "\\";
191	} else {
192	$file_delimiter = "/";
193	}
194
195	# IMPORTANT: different delimiter for $base_dir and $file
196	# $base_dir use forward slash for both windows and linux
197	# print "\nbase_dir : $base_dir\n\n"; # windows: C:/Program Files/Greenstone2.73/collect/wiki/import
198	# linux: /research/lh92/greenstone/greenstone2.73/collect/wiki/import
199	# $file use different delimiters : forward slash for linux; backward slash for windows
200	# print "\nfile : $file\n\n"; # windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlug.html
201	# linux: greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html
202
203	# get the base url for the MediaWiki website
	# (the first path component of $file is taken as the site's host name)
204	my $safe_delimiter = &safe_escape_regexp($file_delimiter);
205	my @url_dirs=split($safe_delimiter, $file);
206	my $url_base = $url_dirs[0];
207
208	# Re-check css files associated with MediaWiki pages
209	if(defined $base_dir && $base_dir ne ""){
210	my @css_files;
211	my $css_file_count = 0;
212
213	# find all the stylesheets imported with @import statement
214	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
215	$css_files[$css_file_count++] = $2 if defined $2;
216	}
217
218	# download the stylesheets if we haven't downloaded them yet
219	# add prefix to each style elmement, comment out the body element
220	# and copy the files to collection's images folder
221	for ($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++) {
222
223	my $css_file = $css_files[$css_file_count];
224
225	# remove prefix gli/cache directory
226	$css_file =~ s/^(.+)gli(\\\|\/)cache(\\\|\/)//i;
227
228	# change the \ delimiter in $css_file to / for consistency
229	$css_file =~ s/\\/\//isg;
	# NOTE(review): $url_base is interpolated into the pattern without escaping
230	if($css_file !~ /$url_base/) {
231	$css_file = $url_base . $css_file;
232	}
233
234	# trim the ? mark append to the end of a stylesheet
235	$css_file =~ s/\?(.+)$//isg;
236
237	my $css_file_path = &util::filename_cat($base_dir, $css_file);
238
239	# do nothing if we have already downloaded the css files
240	if (! -e $css_file_path) {
241
242	# check the stylesheet's directory in the import folder
243	# if the directory doesn't exist, create one
244	my @dirs = split(/\//i,$css_file);
245	my $path_check = "$base_dir/";
246	for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
247	$path_check .= $dirs[$i] . "/";
248	mkdir($path_check) if (! -d $path_check );
249	}
250
251	# NOTE: wget needs configuration to directly access Internet
252	# These files should already downloaded if we used the MediaWikiDownload
253	# downloading
254	$css_file = "http://$css_file";
255	print "\ndownloading : " . $css_file . "\n\n";
	# list-form system(): wget is started without going through a shell;
	# a non-zero $? is treated as a failed download and the partial file is removed
256	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
257	if ($? != 0) {
258	print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
259	print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
260	unlink("$css_file_path");
261	}
262	} # done with download
263
264	# add a prefix "#wikispecificstyle" to each element
265	# because we want to preserve this website's formats and don't want to mess up with Greenstone formats
266	# so we will wrap the web page with a div with id = wikispecificstyle
267	my $css_content;
268	if(open(INPUT, "<$css_file_path")){
269	while(my $line = <INPUT>){
270	# comment out the body element because we change the body to div
271	$line =~ s/^(\s)body(\s){(\s)$/$1\/body$2*\/{$3/isg;
272
	# NOTE(review): "\|\|" is not valid Perl in code position; presumably "||"
	# in the repository copy -- verify.
273	if($line =~ m/^(.+)\{/i \|\| $line =~ m/^(\s)*#/i){
274	$line = "#wikispecificstyle " . $line;
275	}
276	$css_content .= $line;
277	}
278	close(INPUT);
	# NOTE(review): this open-for-write is not checked; the stylesheet is
	# overwritten in place with the prefixed content
279	open(OUTPUT, ">$css_file_path");
280	print OUTPUT $css_content;
281	close(OUTPUT);
282	}
283
284	# Copy the modified stylesheets to collection's images folder
285	# for future customization
286	my $images_dir = $base_dir;
287	$images_dir =~ s/import$/images/;
	# $2 of this match is used as the file name inside the images folder.
	# NOTE(review): as written each group captures a single character and the
	# match result is not checked -- verify against the repository copy.
288	$css_file =~ m/(.)\/(.)$/;
289	$images_dir = &util::filename_cat($images_dir, $2);
290
291	if(open(OUTPUT, ">$images_dir")){
292	print OUTPUT $css_content;
293	close(OUTPUT);
294	}
295	}
296	}
297
298
299	# by default, only preserve navigation box and search box
300	# others like toolbox, interaction, languages box, will be removed
301
302	# extract the larger part -- footer section
303	my $print_footer = "<div class=\"printfooter\">(.\|\n)+</body>";
	# NOTE(review): $& is read without checking the match result; a failed
	# match does not reset it, so $footer could hold text from an earlier match
304	$body_text =~ /$print_footer/;
305	my $footer = "";
306	$footer = $& if defined $&;
307	$footer =~ s/<\/body>//isg;
308
309	# trim the comments first
310	$footer =~ s/<!--[\s\S]?--[ \t\n\r]>//isg;
311
312	# contain sections that are to be preserved
313	my $preserve_sections = "";
314
315	# process the navigation section
	# (the hard-coded pattern is only a fallback; a non-blank nav_div_exp
	# option replaces it)
316	my $nav_match_exp = "<div([^>])id=(\"\|')p-navigation(\"\|')(.\|\n)?<\/div>";
317	if (defined $self->{'nav_div_exp'}) {
318	$nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
319	}
320
321	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
322	# do nothing
323	} else {
324	if ($footer =~ m/$nav_match_exp/ig) {
325	$preserve_sections = $& ;
326	} else {
327	print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
328	}
329	# if($preserve_sections =~/\S/){
330	# $preserve_sections .= "</div>";
331	# }
332	}
333
334	# process the searchbox section
335	my $searchbox_exp = "<div([^>])id=(\"\|')p-search(\"\|')(.\|\\n)?<\/div>";
336	if(defined $self->{'searchbox_div_exp'}) {
337	$searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
338	}
339
340	my $searchbox_section = "";
	# NOTE(review): same unchecked-$& pattern as for the footer above
341	$footer =~ m/$searchbox_exp/ig;
342	$searchbox_section = $& if defined $&;
343
344	# make the searchbox form work in Greenstone
345	if($searchbox_section =~ /\S/){
346	# replace action
347	$searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;
348
349	# remove buttons
	# (the text field is renamed from "search" to "q"; the name attributes
	# of the "go" and "fulltext" controls are stripped)
350	$searchbox_section =~ s/name="search"/name="q"/isg;
351	$searchbox_section =~ s/name="go"//isg;
352	$searchbox_section =~ s/name="fulltext"//isg;
353
354	# get collection name from $base_dir for c param
355	$base_dir =~ m/\/collect\/(.+)\//i;
356	my $collection_name = "";
357	$collection_name = $1 if defined $1;
358
359	# add Greenstone search params
360	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
361	."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";
362	# ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>\n"
363	# ."<input type=\"hidden\" name=\"r\" value=\"1\">\n";
364
365	$searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
366
367	# $searchbox_section .= "</div>";
368	} else {
	# NOTE(review): this message interpolates $searchbox_section, which holds
	# no non-whitespace text in this branch; $searchbox_exp may have been intended
369	print $outhandle "Can't find the searchbox section with : $searchbox_section\n";
370	}
371
372	# either delete or replace the searchbox
373	if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
374	# do nothing
375	} else {
376	$preserve_sections .= "\n$searchbox_section\n";
377	}
378
379	if($preserve_sections ne ""){
380	$preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
381	}
	# the footer pattern also consumes "</body>", so the replacement text
	# re-appends it after three closing </div> tags and the preserved sections
382	$preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";
383
384	$body_text =~ s/$print_footer/$preserve_sections/isg;
385
386
387	# delete other forms in the page
	# first pass collects the opening tag of every named form except
	# "searchform"; second pass removes each one up to the next </form>
388	my @forms;
389	my $form_count = 0;
390	while($body_text =~ m/<form([^>])name=("\|')([^>"'])?("\|')/isg){
391	next if($3 eq "searchform");
392	$forms[$form_count++] = $&;
393	}
394	foreach my $form (@forms) {
	# NOTE(review): $form is raw HTML interpolated as a pattern without escaping
395	$body_text =~ s/$form[\s\S]*?<\/form>//m;
396	}
397
398	# process links.
399	# because current WGET 1.10 the -k and -E option doesn't work together
400	# need to 'manually' convert the links to relative links
401	# Dealing with 3 types of links:
402	# -- outgoing links
403	# -- if we have downloaded the target files, link to the internal version (relative link)
404	# -- otherwise, link to the external version (absolute links)
405	# -- in-page links (relative link)
406
407	# NOTE: (important)
408	# must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
409	# otherwise, the internal links may have problems
410
411	# remove the title attribute of <a> tag
412	$body_text =~ s/<a([^>])title="(.?)"/<a$1/isg;
413
414	# extract all the links
	# (only href/src values that contain "$url_base/" are collected)
415	my @links;
416	my $link_count = 0;
417	while($body_text =~ m/(href\|src)="([^>\s])$url_base\/([^>\s])"/ig){
418	$links[$link_count++] = "$1=\"$2$url_base/$3\"";
419	}
420
	# each collected link is rewritten to an absolute "http://$url_base/..." URL;
	# whatever preceded $url_base inside the attribute value is dropped
421	foreach my $cur_link (@links) {
422	# escape greedy match + character
423	$cur_link =~ s/\+/\\+/isg;
424
425	$cur_link =~ m/(.+)"([^>])$url_base\/([^>\s])"/;
426	my $external_file_path = "$1\"http://$url_base/$3\"";
427
428	$body_text =~ s/$cur_link/$external_file_path/i;
429	}
430
431	# tag links to new wiki pages as red
	# NOTE(review): the replacement text contains a ")" before ">" that is
	# emitted literally into the tag -- looks like a stray character; verify
432	$body_text =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
433
434	# tag links to pages external of the MediaWiki website as blue
435	$body_text =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
436
437
438	# process the table-of-contents section
439	# if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
440	# 1. read _content_ macro from about.dm
441	# 2. append the toc, change all links to the Greenstone internal format for relative links
442	# 3. write to the extra.dm
443	# TODO: we assume the _about:content_ hasn't been specified before
444	# so needs to add function to handle when the macro is already in the extra.dm
445	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
446
447	# extract toc of the Main_Page
448	my $mainpage_toc = "";
449	my $toc_exp = "<table([^>])id=(\"\|')toc(\"\|')(.\|\\n)</table>\\n";
450	if($self->{'toc_exp'} =~ /\S/){
451	$toc_exp = $self->{'toc_exp'};
452	}
453	if($body_text =~ /$toc_exp/){
454	$mainpage_toc = $&;
455	}
456
457	if($mainpage_toc =~ /\S/) {
458
459	# change the in-page links to relative links, for example, change <a href="#section1"> to
460	# <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
461	my $file_url_format = $file;
462	$file_url_format =~ s/\\/\//isg;
463	$file_url_format = "http://" . $file_url_format;
464
465	# encode as URL, otherwise doesn't work on Windows
	# (every character outside A-Z, a-z, 0-9 becomes %XX)
466	$file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
467	$mainpage_toc =~ s/<a href="([^>"#])#([^>"])"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;
468
469
470	# read the collection's extra.dm
471	my $macro_path = $base_dir;
472	$macro_path =~ s/import$/macros/;
473	my $extradm_file = &util::filename_cat($macro_path, "extra.dm");
474
475	my $extra_dm = "";
476	if(open(INPUT, "<$extradm_file")){
477	while(my $line = <INPUT>){
478	$extra_dm .= $line;
479	}
480	} else {
481	print $outhandle "can't open file $extradm_file\n";
482	}
	# NOTE(review): close runs even when the open above failed
483	close(INPUT);
484
485	# check whether we have changed the macros
	# (extra.dm is split on "package "; the last chunk starting with "about" wins)
486	my @packages = split("package ", $extra_dm);
487	my $about_package = "";
488	foreach my $package (@packages) {
489	$about_package = "package " . $package if($package =~ /^about/);
490	}
491
492	my $update_extra_dm = 0;
493
	# NOTE(review): $mainpage_toc (raw HTML) is used as a pattern here without escaping
494	if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*){/ && $about_package =~ m/$mainpage_toc/){
495	print $outhandle "_content_ macro already changed!!!!\n";
496	}
497	# if extra.dm doesn't have an "about package"
498	elsif ($about_package !~ /\S/) {
499	# read _content_ macro from $GSDLHOME/macros/about.dm file
500	my $global_about_package = &read_content_from_about_dm();
501
502	# create the extra _content_ macro for this collection
503	# add the original content of the _content_ macro
	# (the matched text is picked up through $& two lines below)
504	$global_about_package =~ m/{(.\|\n)*<\/div>\n\n/;
505
506	# append the new about package to extra.dm
507	$extra_dm .= "\n\npackage about\n_content_$&\n\n";
508	$extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
509
510	$update_extra_dm = 1;
511	}
512	# the about package exists, but either doesn't have the _content_ macro or
513	# the _content_ macro doesn't contain the toc
514	else {
515	# check if there is a content macro
516	my $content_macro_existed = 0;
517	$content_macro_existed = ($about_package =~ /(\s\|\n)_content_(\s){/);
518
519	# if there is one
520	# append a new section div for toc to the end of the document section
521	if($content_macro_existed ==1) {
522	$about_package =~ /(\s\|\n)_content_(\s){(.\|\n)*?}/;
523	my $content_macro = $&;
524	my $new_content_macro = $content_macro;
525	$new_content_macro =~ s/<div[^>]class="document">(.\|\n)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
526	$extra_dm =~ s/$content_macro/$new_content_macro/mg;
527	}
528	# otherwise, append _content_ macro to the about package
529	else {
530	my $new_about_package = $about_package;
	# NOTE(review): $content_macro has no "my" in this branch; the lexical
	# declared at listing line 523 is scoped to the sibling if-block
531	$content_macro = &read_content_from_about_dm();
532	$content_macro =~ m/{(.\|\n)*<\/div>\n\n/;
533
534	$new_about_package .= "\n\n_content_$&\n\n";
535	$new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
536	$extra_dm =~ s/$about_package/$new_about_package/mg;
537	}
538
539	# either the case, we need to update the extra.dm
540	$update_extra_dm = 1;
541	}
542
543	if($update_extra_dm==1){
544	# write to the extra.dm file of the collection
545	if (open(OUTPUT, ">$extradm_file")) {
546	print OUTPUT $extra_dm;
547	} else {
	# NOTE(review): this message goes to the default output handle, not $outhandle
548	print "can't open $extradm_file\n";
549	}
550	close(OUTPUT);
551	}
552	} else {
553	print $outhandle "Main_Page doesn't have a table-of-contents section\n";
554	}
555	}
556
557	# If delete_toc is set, remove toc and tof contents.
	# (only the first match of toc_exp is removed; nothing happens when
	# toc_exp is blank)
558	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
559	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
560	# print "\nit matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
561	if ($body_text =~ /$self->{'toc_exp'}/) {
562	$body_text =~ s/$self->{'toc_exp'}//i;
563	}
564	}
565	}
566
	# put back the "<body" that the split at the top removed; the head is not
	# included in the rewritten text
567	$$textref = "<body" . $body_text;
568
569	# Wrap the whole page with <div id="wikispecificstyle"></div>
570	# keep the style of this website and don't mess up with the Greenstone styles
571	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
572	$$textref =~ s/<\/body>/<\/div><\/body>/is;
573
	# @_ no longer contains $self (shifted off at the top), so the parent gets
	# the original argument list with $textref now pointing at the rewritten text
574	$self->SUPER::process(@_);
575
576	return 1;
577	}
578
579
# extract_metadata -- copy selected <o:NAME>...</o:NAME> values from a block of
# text into the document's top section.
#   $text     : the text to search (a plain string, not a reference);
#               nothing is done when it is undefined
#   $metadata : accepted for interface compatibility; not used here
#   $doc_obj  : document object; must provide get_top_section() and
#               add_utf8_metadata()
# The fields to look for come from $self->{'metadata_fields'}, a comma
# separated list.  An entry is either "name" (stored under the same name) or
# "name<gsname>" (looked up as "name", stored as "gsname").
# Returns nothing.
sub extract_metadata
{
    my $self = shift (@_);
    my ($text, $metadata, $doc_obj) = @_;

    return if (!defined $text);

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # by default the tag name doubles as the Greenstone metadata name
        my ($source_name, $gs_name) = ($field, $field);

        # support tag<tagname>: the part in angle brackets is the user's
        # preferred Greenstone metadata name
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }

        # \Q...\E keeps characters such as "." in a field name from being
        # treated as regex operators
        if ($text =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
623
# safe_escape_regexp -- return a copy of the given string in which every
# backslash and every forward slash is preceded by a backslash.  Both
# characters are always escaped, regardless of platform.
sub safe_escape_regexp
{
    my ($escaped) = @_;

    # one pass: "\" becomes "\\" and "/" becomes "\/"
    $escaped =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
635
# read_content_from_about_dm -- return the text of the _content_ macro found
# in $GSDLHOME/macros/about.dm.
# The macro is taken to run from "_content_ {" through the last
# "</div>\n\n</div>\n}" in the file (case-insensitive).
# Returns the matched text, or the empty string when the file cannot be
# opened or contains no such macro.  A failure to open the file is reported
# on STDERR, because this function has no plugin output handle available.
sub read_content_from_about_dm
{
    my $about_macro_file = util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");

    my $about_page_content = "";
    if (open(my $input, "<", $about_macro_file)) {
        local $/;    # read the whole file in one go
        $about_page_content = <$input>;
        $about_page_content = "" if !defined $about_page_content;
        close($input);
    } else {
        print STDERR "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; /s lets "." cross line boundaries
    if ($about_page_content =~ m/(_content_ \{.*<\/div>\n\n<\/div>\n\})/is) {
        return $1;
    }

    return "";
}
655
656	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: