source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 26.4 KB
Line 
1###########################################################################
2#
3# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin processes an HTML file from a MediaWiki website that was downloaded
27# with MediaWikiDownload. It trims MediaWiki functional sections such as login,
28# discussion and history; only the navigation and search sections can be preserved.
29# The searchbox is modified to search the Greenstone collection instead of the website.
30# It can also automatically add the table of contents from the website's Main_Page to the
31# collection's Home page.
32
33package MediaWikiPlugin;
34
35use HTMLPlugin;
36# use ImagePlugin;
37# use File::Copy;
38use unicode;
39
40
41#use strict; # every perl program should have this!
42#no strict 'refs'; # make an exception so we can use variables as filehandles
43
# Set up inheritance at compile time: MediaWikiPlugin derives from HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
47
# Plugin-specific arguments. Each entry describes one option: its name, the
# key of its description string, its type, whether it is required and, for
# most entries, a default value.
my $arguments =
    [
        # Flag: show the table of contents on the collection's home page.
        {
            'name' => "show_toc",
            'desc' => "{MediaWikiPlugin.show_toc}",
            'type' => "flag",
            'reqd' => "no"
        },

        # Flag: delete the table-of-contents section on each MediaWiki page.
        {
            'name' => "delete_toc",
            'desc' => "{MediaWikiPlugin.delete_toc}",
            'type' => "flag",
            'reqd' => "no"
        },

        # Regular expression matching the table of contents.
        {
            'name' => "toc_exp",
            'desc' => "{MediaWikiPlugin.toc_exp}",
            'type' => "regexp",
            'reqd' => "no",
            'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n"
        },

        # Flag: delete the navigation section.
        {
            'name' => "delete_nav",
            'desc' => "{MediaWikiPlugin.delete_nav}",
            'type' => "flag",
            'reqd' => "no",
            'deft' => ""
        },

        # Regular expression matching the navigation section.
        {
            'name' => "nav_div_exp",
            'desc' => "{MediaWikiPlugin.nav_div_exp}",
            'type' => "regexp",
            'reqd' => "no",
            'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>"
        },

        # Flag: delete the searchbox section.
        {
            'name' => "delete_searchbox",
            'desc' => "{MediaWikiPlugin.delete_searchbox}",
            'type' => "flag",
            'reqd' => "no",
            'deft' => ""
        },

        # Regular expression matching the searchbox section.
        {
            'name' => "searchbox_div_exp",
            'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
            'type' => "regexp",
            'reqd' => "no",
            'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"
        },

        # Regular expression matching a suffix to strip from page titles.
        # HTMLPlugin's title_sub option cannot be used for this because
        # title_sub always matches from the beginning of the title.
        {
            'name' => "remove_title_suffix_exp",
            'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
            'type' => "regexp",
            'reqd' => "no",
            'deft' => ""
        }
    ];

# Option block describing this plugin as a whole; 'args' points at the
# argument table above.
my $options =
    {
        'name'     => "MediaWikiPlugin",
        'desc'     => "{MediaWikiPlugin.desc}",
        'abstract' => "no",
        'inherits' => "yes",
        'args'     => $arguments
    };
105
# Constructor.
#
# Appends this class name to @$pluginlist and this plugin's argument table
# and option block to the "ArgList"/"OptList" entries of $hashArgOptLists,
# then builds the object through HTMLPlugin's constructor and blesses the
# result into $class.
#
#   $pluginlist      - array ref collecting plugin class names
#   $inputargs       - passed through unchanged to HTMLPlugin->new
#   $hashArgOptLists - hash ref holding the "ArgList" and "OptList" arrays
#
# Returns the new object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit method-call syntax instead of the ambiguous indirect object
    # form "new HTMLPlugin(...)".
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# Rewrite one downloaded MediaWiki HTML page and hand it to the parent
# class for the actual import.
#
# Arguments (after $self):
#   $textref  - reference to the page's HTML; the text is rewritten in place
#   $pluginfo, $gli
#             - not used here; forwarded to SUPER::process via @_
#   $base_dir - collection import directory (forward slashes on all platforms)
#   $file     - file name relative to $base_dir (platform-specific delimiter)
#   $metadata - forwarded to extract_metadata()
#   $doc_obj  - document object receiving the Title and extracted metadata
#
# Steps, in order:
#   1. split head from body, pick up the <title>, extract metadata fields
#   2. drop the page header, timeline, [edit] links and similar extras
#   3. fetch (if missing) and prefix the stylesheets @import-ed by the page,
#      and copy them to the collection's images folder
#   4. replace the footer with the preserved navigation/searchbox sections
#   5. remove other forms and turn links into absolute links
#   6. optionally copy Main_Page's table of contents into extra.dm
#   7. optionally delete the table of contents from the page
#   8. wrap the page in <div id="wikispecificstyle"> and call SUPER::process
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "MediaWikiPlugin: processing $file\n" if $self->{'verbosity'} > 1;

    my @head_and_body = split(/<body/i,$$textref);
    my $head = shift(@head_and_body);
    my $body_text = join("<body", @head_and_body);

    # Only read $1 when the match succeeded; otherwise it would still hold
    # the capture of some earlier, unrelated match.
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
        # the metadata lives between <xml> and </xml> in the head
        my @doc_properties = split(/<xml>/i,$head);
        shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if($body_text =~ /$header_exp/){
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if($body_text =~ /$header_exp/){
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    } else {
        $file_delimiter = "/";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir uses forward slashes on both windows and linux, e.g.
    #    windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #    linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file uses the platform delimiter, e.g.
    #    windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #    linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website: the first component of $file
    my $safe_delimiter = &safe_escape_regexp($file_delimiter);
    my @url_dirs=split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    # regex-safe form, so that e.g. the dots of a host name match literally
    my $url_base_re = quotemeta($url_base);

    # Re-check css files associated with MediaWiki pages
    if(defined $base_dir && $base_dir ne ""){
        my @css_files;

        # find all the stylesheets imported with @import statement
        while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
            push(@css_files, $2) if defined $2;
        }

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's images folder
        foreach my $imported_css (@css_files) {

            my $css_file = $imported_css;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;
            # literal substring test; the base url is not a pattern
            if(index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = &util::filename_cat($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//i,$css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check );
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already be downloaded if MediaWikiDownload
                # was used for downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            #
            # $css_content stays undefined when the stylesheet cannot be read,
            # so that nothing is copied to the images folder in that case.
            my $css_content;
            if (open(my $css_in, '<', $css_file_path)) {
                $css_content = "";
                while (my $line = <$css_in>) {
                    # comment out the body element because we change the body to div
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                    if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){
                        if($line !~ m/wikispecificstyle/i){
                            $line = "#wikispecificstyle " . $line;
                        }
                    }

                    $css_content .= $line;
                }
                close($css_in);

                if (open(my $css_out, '>', $css_file_path)) {
                    print $css_out $css_content;
                    close($css_out);
                } else {
                    print $outhandle "can't open file $css_file_path\n";
                }
            }

            # Copy the modified stylesheets to collection's images folder
            # for future customization
            my $images_dir = $base_dir;
            $images_dir =~ s/import$/images/;
            if (defined $css_content && $css_file =~ m/(.*)\/(.*)$/) {
                my $images_css_path = &util::filename_cat($images_dir, $2);
                if (open(my $images_out, '>', $images_css_path)) {
                    print $images_out $css_content;
                    close($images_out);
                }
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'}) {
        $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
    }

    unless (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # No /g here: a scalar-context /g match would leave pos($footer)
        # behind the navigation section and make the searchbox search below
        # start from there instead of from the beginning of the footer.
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $& ;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if(defined $self->{'searchbox_div_exp'}) {
        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
    }

    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if($searchbox_section =~ /\S/){
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the expression that failed to match
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    unless (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if($preserve_sections ne ""){
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg){
        next if($3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # $form is literal page text, so quote it before using it as a pattern
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    # -- outgoing links
    #    -- if we have downloaded the target files, link to the internal version (relative link)
    #    -- otherwise, link to the external version (absolute links)
    # -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # extract all the links that point below the base url, together with
    # their absolute replacement
    my @links;
    while($body_text =~ m/(href|src)="([^>\s]*)$url_base_re\/([^>\s]*)"/ig){
        push(@links, [ $&, "$1=\"http://$url_base/$3\"" ]);
    }

    foreach my $link (@links) {
        my ($cur_link, $external_file_path) = @$link;
        # the link is matched as literal text, so characters such as + ? .
        # need no hand-written escaping and never leak into the new URL
        $body_text =~ s/\Q$cur_link\E/$external_file_path/i;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    # TODO: we assume the _about:content_ hasn't been specified before
    # so needs to add function to handle when the macro is already in the extra.dm
    if($self->{'show_toc'}==1 && $file =~ m/Main_Page\.(html|htm)$/){

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
        if($self->{'toc_exp'} =~ /\S/){
            $toc_exp = $self->{'toc_exp'};
        }
        if($body_text =~ /$toc_exp/){
            $mainpage_toc = $&;
        }

        if($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=${file_url_format}#$2"/isg;


            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = &util::filename_cat($macro_path, "extra.dm");

            my $extra_dm = "";
            if (open(my $extra_in, '<', $extradm_file)) {
                while (my $line = <$extra_in>) {
                    $extra_dm .= $line;
                }
                close($extra_in);
            } else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my @packages = split("package ", $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if($package =~ /^about/);
            }

            my $update_extra_dm = 0;

            # The toc is HTML, not a pattern: look for it as a literal substring.
            if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/ && index($about_package, $mainpage_toc) >= 0){
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # read _content_ macro from $GSDLHOME/macros/about.dm file
                my $global_about_package = &read_content_from_about_dm();

                # create the extra _content_ macro for this collection
                # starting from the original content of the _content_ macro
                my $original_content = "";
                if (defined $global_about_package && $global_about_package =~ m/\{(.|\n)*<\/div>\n\n/) {
                    $original_content = $&;
                }

                # append the new about package to extra.dm
                $extra_dm .= "\n\npackage about\n_content_$original_content\n\n";
                $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

                $update_extra_dm = 1;
            }
            # the about package exists, but either doesn't have the _content_ macro or
            # the _content_ macro doesn't contain the toc
            else {
                # if there is a _content_ macro,
                # append a new section div for toc to the end of the document section
                if ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?\}/) {
                    my $content_macro = $&;
                    my $new_content_macro = $content_macro;
                    # $1 = attributes of the document div, $2 = everything
                    # inside it up to its last </div>
                    $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                    $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
                }
                # otherwise, append _content_ macro to the about package
                else {
                    my $new_about_package = $about_package;
                    my $content_macro = &read_content_from_about_dm();
                    my $original_content = "";
                    if (defined $content_macro && $content_macro =~ m/\{(.|\n)*<\/div>\n\n/) {
                        $original_content = $&;
                    }

                    $new_about_package .= "\n\n_content_$original_content\n\n";
                    $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
                }

                # either the case, we need to update the extra.dm
                $update_extra_dm = 1;
            }

            if($update_extra_dm==1){
                # write to the extra.dm file of the collection
                if (open(my $extra_out, '>', $extradm_file)) {
                    print $extra_out $extra_dm;
                    close($extra_out);
                } else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    # @_ still holds the original argument list (minus $self)
    $self->SUPER::process(@_);

    return 1;
}
588
589
# Pull "<o:NAME>value</o:NAME>" properties out of a piece of text and store
# them as metadata on the top section of $doc_obj.
#
#   $textref  - despite its name this is the text itself (a plain string),
#               not a reference; nothing happens when it is undefined
#   $metadata - accepted for interface compatibility; not used here
#   $doc_obj  - object on which get_top_section() and add_utf8_metadata()
#               are called
#
# $self->{'metadata_fields'} is a comma-separated list. Each entry is either
# NAME (stored under NAME) or NAME<GSNAME> (the value found in <o:NAME> is
# stored under GSNAME). Tag names are matched case-insensitively.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # $source_name is the tag to look for, $gs_name the name to store under
        my ($source_name, $gs_name) = ($field, $field);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }

        # The field name is literal text, so quote it before building the
        # pattern; the non-greedy capture stops at the first closing tag
        # instead of running on to a later one on the same line.
        my $name_re = quotemeta($source_name);
        if ($textref =~ m/<o:$name_re>(.*?)<\/o:$name_re>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
633
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a file delimiter can be
# used inside a pattern. No other characters are touched.
sub safe_escape_regexp
{
    my ($escaped) = @_;

    # One pass over both delimiter characters; the backslashes inserted here
    # are never revisited, so the result matches escaping "\" then "/".
    $escaped =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
645
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro.
#
# Plain function (takes no arguments). The macro is recognised as the text
# from "_content_ {" up to the last "</div>\n\n</div>\n}" in the file.
#
# Returns the matched macro text, or the empty string when the file cannot
# be opened or holds no such macro. A failure to open the file is reported
# on STDERR; this function has no plugin object and therefore no output
# handle of its own.
sub read_content_from_about_dm
{
    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, '<', $about_macro_file)) {
        while (my $line = <$about_in>) {
            $about_page_content .= $line;
        }
        close($about_in);
    } else {
        print STDERR "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; only trust $& after a successful match
    if ($about_page_content =~ m/_content_ \{(.|\n)*<\/div>\n\n<\/div>\n\}/i) {
        return $&;
    }

    return "";
}
665
6661;
Note: See TracBrowser for help on using the repository browser.