source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 15918

Last change on this file since 15918 was 15887, checked in by mdewsnip, 16 years ago

Added "use strict" to the few files that were missing it, and fixing resulting problems in MediaWikiPlug.pm.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.4 KB
Line 
1###########################################################################
2#
3# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin processes an HTML file from a MediaWiki website that was downloaded with
27# MediaWikiDownload. It trims MediaWiki functional sections such as
28# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29# The search box is modified to search the Greenstone collection instead of the website.
30# The plugin can also automatically add the table of contents on the website's Main_Page to the
31# collection's Home page.
32
33package MediaWikiPlugin;
34
35use HTMLPlugin;
36use unicode;
37
38use strict; # every perl program should have this!
39no strict 'refs'; # make an exception so we can use variables as filehandles
40
41
# Declare HTMLPlugin as the parent class at compile time.
BEGIN {
    @MediaWikiPlugin::ISA = ('HTMLPlugin');
}
45
# Plugin-specific arguments. new() appends this list to the "ArgList"
# it passes on to HTMLPlugin.
my $arguments = [

    # show the table of contents on collection's home page
    {
        'name' => "show_toc",
        'desc' => "{MediaWikiPlugin.show_toc}",
        'type' => "flag",
        'reqd' => "no",
    },

    # set to delete the table of contents section on each MediaWiki page
    {
        'name' => "delete_toc",
        'desc' => "{MediaWikiPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no",
    },

    # regexp to match the table of contents
    {
        'name' => "toc_exp",
        'desc' => "{MediaWikiPlugin.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n",
    },

    # set to delete the navigation section
    {
        'name' => "delete_nav",
        'desc' => "{MediaWikiPlugin.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => "",
    },

    # regexp to match the navigation section
    {
        'name' => "nav_div_exp",
        'desc' => "{MediaWikiPlugin.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>",
    },

    # set to delete the searchbox section
    {
        'name' => "delete_searchbox",
        'desc' => "{MediaWikiPlugin.delete_searchbox}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => "",
    },

    # regexp to match the searchbox section
    {
        'name' => "searchbox_div_exp",
        'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>",
    },

    # regexp to match title suffix
    # can't use the title_sub option in HTMLPlugin instead
    # because title_sub always matches from the beginning
    {
        'name' => "remove_title_suffix_exp",
        'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "",
    },
];
97
# Plugin description record. new() appends it to the "OptList" it
# passes on to HTMLPlugin.
my $options = {
    'name'     => "MediaWikiPlugin",
    'desc'     => "{MediaWikiPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
103
# Constructor.
#
# Arguments: $pluginlist (array ref; this class name is appended to it),
# $inputargs, and $hashArgOptLists (hash ref; this plugin's arguments and
# options are appended to its "ArgList" and "OptList" entries). All three
# are then passed on to the HTMLPlugin constructor.
#
# Returns the object built by HTMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow call instead of the indirect-object form "new HTMLPlugin(...)",
    # which perl can mis-parse depending on what names are in scope.
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
115
116
117
# Clean up one downloaded MediaWiki page and hand it to HTMLPlugin::process.
#
# Arguments (after $self):
#   $textref  - reference to the page's HTML; rewritten in place
#   $pluginfo, $metadata, $doc_obj, $gli - passed through to the parent class
#               ($metadata and $doc_obj are also used for metadata extraction)
#   $base_dir - the directory the page lives under (forward slashes); the code
#               derives the "images" and "macros" directories from a trailing
#               "import" component
#   $file     - page path relative to $base_dir; its first component is used
#               as the base URL of the wiki
#
# Steps: set the Title metadata, strip the page header and wiki-only
# furniture, fix up the stylesheets, keep (or drop) the navigation and search
# boxes, remove other forms, make links to the wiki absolute, optionally copy
# the Main_Page table of contents to the collection's about page, and wrap the
# body in <div id="wikispecificstyle">. Everything before the first "<body"
# is dropped from $$textref.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print $outhandle "MediaWikiPlugin: processing $file\n" if $self->{'verbosity'} > 1;

    # everything before the first "<body" is the head, the rest is the body
    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # only trust $1 when the match succeeded, otherwise it is stale
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        # the document properties sit between <xml> and </xml> in the head
        my @doc_properties = split(/<xml>/i, $head);
        shift(@doc_properties);    # discard what precedes the first <xml>
        my $rest_doc_properties = join(" ", @doc_properties);

        my ($extracted_metadata) = split(/<\/xml>/i, $rest_doc_properties);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if ($body_text =~ /$header_exp/) {
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if ($body_text =~ /$header_exp/) {
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    # ($base_dir always uses forward slashes; $file uses the platform's delimiter)
    my $file_delimiter = "/";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    }

    # get the base url for the MediaWiki website: the first component of $file
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        $self->_localise_stylesheets($head, $base_dir, $url_base);
    }

    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
        $nav_match_exp = $self->{'nav_div_exp'};
    }

    unless (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # no /g here: a scalar-context /g match would leave pos($footer) set
        # and make the searchbox match below start after the navigation box
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $&;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
        $searchbox_exp = $self->{'searchbox_div_exp'};
    }

    # read $& only after a successful match, otherwise it would still hold
    # the navigation section matched above
    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    unless (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;

    # delete other forms in the page
    my @forms;
    while ($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg) {
        next if (defined $3 && $3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # the opening tag is literal text, not a pattern
        $body_text =~ s/\Q$form\E.*?<\/form>//s;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    # -- outgoing links
    #    -- if we have downloaded the target files, link to the internal version (relative link)
    #    -- otherwise, link to the external version (absolute links)
    # -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # collect every link into the wiki together with its absolute form;
    # the base url and the link text are matched literally
    my @links;
    while ($body_text =~ m/(href|src)="([^>\s]*)\Q$url_base\E\/([^>\s]*)"/ig) {
        push(@links, [ "$1=\"$2$url_base/$3\"", "$1=\"http://$url_base/$3\"" ]);
    }
    foreach my $link (@links) {
        my ($found_link, $external_link) = @$link;
        $body_text =~ s/\Q$found_link\E/$external_link/i;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;

    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page
    if (defined $self->{'show_toc'} && $self->{'show_toc'} == 1
        && $file =~ m/Main_Page\.(html|htm)$/) {
        $self->_add_toc_to_about_page($body_text, $base_dir, $file);
    }

    # If delete_toc is set, remove toc and tof contents.
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)) {
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    $self->SUPER::process(@_);

    return 1;
}

# Make sure every stylesheet pulled in with @import in $head exists under
# $base_dir (fetching it with wget when missing), prefix its rules with
# "#wikispecificstyle" so they only apply inside the wrapper div that
# process() adds, and copy the result to the collection's images folder.
sub _localise_stylesheets {
    my ($self, $head, $base_dir, $url_base) = @_;
    my $outhandle = $self->{'outhandle'};

    # find all the stylesheets imported with @import statement
    my @css_files;
    while ($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig) {
        push(@css_files, $2) if defined $2;
    }

    foreach my $imported (@css_files) {
        my $css_file = $imported;

        # remove prefix gli/cache directory
        $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

        # change the \ delimiter in $css_file to / for consistency
        $css_file =~ s/\\/\//isg;
        if (index($css_file, $url_base) < 0) {
            $css_file = $url_base . $css_file;
        }

        # trim the ? mark append to the end of a stylesheet
        $css_file =~ s/\?(.+)$//isg;

        my $css_file_path = &util::filename_cat($base_dir, $css_file);

        # do nothing if we have already downloaded the css files
        if (! -e $css_file_path) {

            # create the stylesheet's directory in the import folder if needed
            my @dirs = split(/\//i, $css_file);
            pop(@dirs);    # the last component is the file itself
            my $path_check = "$base_dir/";
            foreach my $dir (@dirs) {
                $path_check .= $dir . "/";
                mkdir($path_check) if (! -d $path_check );
            }

            # NOTE: wget needs configuration to directly access Internet
            # These files should already downloaded if we used the MediaWikiDownload
            # downloading
            my $css_url = "http://$css_file";
            print "\ndownloading : " . $css_url . "\n\n";
            system("wget", "--non-verbose", $css_url, "--output-document=$css_file_path");
            if ($? != 0) {
                print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                unlink($css_file_path);
            }
        }

        # add a prefix "#wikispecificstyle" to each element
        # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
        my $css_in;
        if (!open($css_in, "<", $css_file_path)) {
            # nothing was read, so do not overwrite the copy in images
            print $outhandle "can't open file $css_file_path\n";
            next;
        }
        my $css_content = "";
        while (my $line = <$css_in>) {
            # comment out the body element because we change the body to div
            $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

            if ($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i) {
                if ($line !~ m/wikispecificstyle/i) {
                    $line = "#wikispecificstyle " . $line;
                }
            }

            $css_content .= $line;
        }
        close($css_in);

        if (open(my $css_out, ">", $css_file_path)) {
            print $css_out $css_content;
            close($css_out);
        } else {
            print $outhandle "can't open file $css_file_path\n";
        }

        # Copy the modified stylesheets to collection's images folder
        # for future customization
        my $images_dir = $base_dir;
        $images_dir =~ s/import$/images/;
        my ($css_name) = $css_file =~ m/([^\/]*)$/;
        next if (!defined $css_name || $css_name eq "");
        my $images_copy = &util::filename_cat($images_dir, $css_name);

        if (open(my $copy_out, ">", $images_copy)) {
            print $copy_out $css_content;
            close($copy_out);
        }
    }
    return;
}

# Copy the table of contents found in $body_text (the Main_Page) into the
# _content_ macro of the "about" package in the collection's macros/extra.dm,
# creating the package or the macro when they are missing.
# TODO: we assume the _about:content_ hasn't been specified before
sub _add_toc_to_about_page {
    my ($self, $body_text, $base_dir, $file) = @_;
    my $outhandle = $self->{'outhandle'};

    # extract toc of the Main_Page
    my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
    if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
        $toc_exp = $self->{'toc_exp'};
    }
    my $mainpage_toc = "";
    if ($body_text =~ /$toc_exp/) {
        $mainpage_toc = $&;
    }

    if ($mainpage_toc !~ /\S/) {
        print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        return;
    }

    # change the in-page links to relative links, for example, change <a href="#section1"> to
    # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
    my $file_url_format = $file;
    $file_url_format =~ s/\\/\//isg;
    $file_url_format = "http://" . $file_url_format;

    # encode as URL, otherwise doesn't work on Windows
    $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
    $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;

    # read the collection's extra.dm
    my $macro_path = $base_dir;
    $macro_path =~ s/import$/macros/;
    my $extradm_file = &util::filename_cat($macro_path, "extra.dm");

    my $extra_dm = "";
    if (open(my $dm_in, "<", $extradm_file)) {
        while (my $line = <$dm_in>) {
            $extra_dm .= $line;
        }
        close($dm_in);
    } else {
        print $outhandle "can't open file $extradm_file\n";
    }

    # find the "about" package, if extra.dm has one
    my @packages = split("package ", $extra_dm);
    my $about_package = "";
    foreach my $package (@packages) {
        $about_package = "package " . $package if ($package =~ /^about/);
    }

    my $update_extra_dm = 0;

    # the toc is HTML, so look for it as literal text rather than as a pattern
    if ($about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/
        && index($about_package, $mainpage_toc) >= 0) {
        print $outhandle "_content_ macro already changed!!!!\n";
    }
    # if extra.dm doesn't have an "about package"
    elsif ($about_package !~ /\S/) {
        my $toc_macro = $self->_toc_content_macro($mainpage_toc);
        if (defined $toc_macro) {
            # append the new about package to extra.dm
            $extra_dm .= "\n\npackage about\n" . $toc_macro;
            $update_extra_dm = 1;
        } else {
            print $outhandle "can't read the _content_ macro from about.dm\n";
        }
    }
    # the about package exists, but either doesn't have the _content_ macro or
    # the _content_ macro doesn't contain the toc
    elsif ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?\}/) {
        # append a new section div for toc to the end of the document section
        my $content_macro = $&;
        my $new_content_macro = $content_macro;
        $new_content_macro =~ s/<div([^>]*)class="document">([\s\S]*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
        $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
        $update_extra_dm = 1;
    }
    # otherwise, append _content_ macro to the about package
    else {
        my $toc_macro = $self->_toc_content_macro($mainpage_toc);
        if (defined $toc_macro) {
            my $new_about_package = $about_package . "\n\n" . $toc_macro;
            $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
            $update_extra_dm = 1;
        } else {
            print $outhandle "can't read the _content_ macro from about.dm\n";
        }
    }

    if ($update_extra_dm == 1) {
        # write to the extra.dm file of the collection
        if (open(my $dm_out, ">", $extradm_file)) {
            print $dm_out $extra_dm;
            close($dm_out);
        } else {
            print $outhandle "can't open $extradm_file\n";
        }
    }
    return;
}

# Build the text of a _content_ macro: the body of the _content_ macro
# returned by read_content_from_about_dm(), with its final closing </div>
# replaced by a section div holding $mainpage_toc. Returns undef when that
# macro body cannot be found.
sub _toc_content_macro {
    my ($self, $mainpage_toc) = @_;

    my $global_content = $self->read_content_from_about_dm();
    return unless defined $global_content;
    return unless $global_content =~ m/\{(.|\n)*<\/div>\n\n/;
    my $content_body = $&;

    return "_content_" . $content_body . "\n\n"
        . "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
}
586
587
# Pull document properties written as <o:NAME>value</o:NAME> out of a chunk
# of text and add them as metadata to the top section of $doc_obj.
#
# Arguments (after $self):
#   $textref  - the text to search (a plain string despite the name);
#               nothing is done when it is undefined
#   $metadata - not used here
#   $doc_obj  - receives the values through add_utf8_metadata()
#
# $self->{'metadata_fields'} is a comma-separated list. Each entry is either
# NAME, stored under NAME, or NAME<GSNAME>, stored under GSNAME.
# Only the first occurrence of each field is used.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # by default the element name is also the Greenstone metadata name
        my ($element_name, $gs_name) = ($field, $field);

        # support tag<tagname>: "tagname" is the user's preferred gs metadata name
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($element_name, $gs_name) = ($1, $2);
        }

        # the field name is literal text, and the value stops at the first
        # closing tag rather than the last one on the line
        if ($textref =~ m/<o:\Q$element_name\E>(.*?)<\/o:\Q$element_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }
}
631
# Return a copy of the given string with a backslash placed in front of
# every backslash and every forward slash, so that a file delimiter can be
# used as a pattern. No other characters are touched.
sub safe_escape_regexp
{
    my ($pattern) = @_;

    # both delimiters are handled in one pass
    $pattern =~ s{([\\/])}{\\$1}g;

    return $pattern;
}
643
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# from "_content_ {" up to the closing "}" that follows "</div>\n\n</div>\n".
#
# Returns the empty string when the file cannot be opened or holds no such
# macro. A message is printed to $self->{'outhandle'} when the file cannot be
# opened, or to STDERR when no such handle is available (for example when
# the sub is called as a plain function).
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, "<", $about_macro_file)) {
        while (my $line = <$about_in>) {
            $about_page_content .= $line;
        }
        close($about_in);
    } else {
        my $outhandle = \*STDERR;
        if (ref($self) && defined $self->{'outhandle'}) {
            $outhandle = $self->{'outhandle'};
        }
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; $& is only meaningful after a successful match
    if ($about_page_content =~ m/_content_ \{(.|\n)*<\/div>\n\n<\/div>\n\}/i) {
        return $&;
    }

    return "";
}
666
6671;
Note: See TracBrowser for help on using the repository browser.