source: gsdl/trunk/perllib/plugins/MediaWikiPlugin.pm@ 17739

Last change on this file since 17739 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 26.3 KB
Line 
1###########################################################################
2#
3# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27# the MediaWikiDownload plugin. It trims MediaWiki functional sections such as
28# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29# The search box is modified to search the Greenstone collection instead of the website.
30# The plugin can also automatically add the table of contents on the website's Main_Page to the
31# collection's Home page.
32
33package MediaWikiPlugin;
34
35use HTMLPlugin;
36use unicode;
37
38use strict; # every perl program should have this!
39no strict 'refs'; # make an exception so we can use variables as filehandles
40
41
# Set up inheritance at compile time: MediaWikiPlugin is a subclass of HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
45
# Argument descriptions specific to this plugin. new() appends this list to
# the shared "ArgList". Each entry is a hash with a name, a description key,
# a type, a required flag and (for some entries) a default value.
my $arguments = [

    # Flag: show the table of contents on the collection's home page.
    {
        'name' => "show_toc",
        'desc' => "{MediaWikiPlugin.show_toc}",
        'type' => "flag",
        'reqd' => "no"
    },

    # Flag: delete the table of contents section on each MediaWiki page.
    {
        'name' => "delete_toc",
        'desc' => "{MediaWikiPlugin.delete_toc}",
        'type' => "flag",
        'reqd' => "no"
    },

    # Regular expression that matches the table of contents.
    {
        'name' => "toc_exp",
        'desc' => "{MediaWikiPlugin.toc_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n"
    },

    # Flag: delete the navigation section.
    {
        'name' => "delete_nav",
        'desc' => "{MediaWikiPlugin.delete_nav}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""
    },

    # Regular expression that matches the navigation section.
    {
        'name' => "nav_div_exp",
        'desc' => "{MediaWikiPlugin.nav_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>"
    },

    # Flag: delete the searchbox section.
    {
        'name' => "delete_searchbox",
        'desc' => "{MediaWikiPlugin.delete_searchbox}",
        'type' => "flag",
        'reqd' => "no",
        'deft' => ""
    },

    # Regular expression that matches the searchbox section.
    {
        'name' => "searchbox_div_exp",
        'desc' => "{MediaWikiPlugin.searchbox_div_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>"
    },

    # Regular expression that matches a suffix to strip from the page title.
    # HTMLPlugin's title_sub option cannot be used for this because
    # title_sub always matches from the beginning of the title.
    {
        'name' => "remove_title_suffix_exp",
        'desc' => "{MediaWikiPlugin.remove_title_suffix_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => ""
    },
];
97
# Option record for this plugin; new() appends it to the shared "OptList".
# It ties the plugin name and description key to the argument list above.
my $options = {
    'name'     => "MediaWikiPlugin",
    'desc'     => "{MediaWikiPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
103
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the HTMLPlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by HTMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call rather than the indirect-object form
    # "new HTMLPlugin(...)", whose parsing depends on which subs are in scope.
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
115
116
117
# Clean up one downloaded MediaWiki HTML page, then pass it on to the parent
# class's process().
#
# Arguments (after $self):
#   $textref  - reference to the page's HTML; it is rewritten in place
#   $pluginfo - not used here; passed on to the parent's process()
#   $base_dir - the collection's import directory (forward slashes)
#   $file     - page file name relative to $base_dir; its first path
#               component is taken as the wiki's host name
#   $metadata - passed to extract_metadata() and the parent's process()
#   $doc_obj  - document object that receives the Title metadata
#   $gli      - not used here; passed on to the parent's process()
#
# Steps, in order:
#   1. split head from body, set Title metadata from <title>
#   2. strip the page header, timeline, [edit] links and similar clutter
#   3. prefix rules in @import-ed stylesheets with "#wikispecificstyle"
#      (downloading them with wget if they are missing) and copy them to the
#      collection's images folder
#   4. replace the footer with only the navigation and search boxes, and
#      point the search form at Greenstone
#   5. delete all other named forms, make links to the wiki host absolute
#   6. optionally add the Main_Page table of contents to the collection's
#      extra.dm (show_toc) and/or remove it from the page (delete_toc)
#   7. wrap the body in <div id="wikispecificstyle">
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # List assignment leaves $doctitle undefined when there is no <title>,
    # instead of picking up $1 from an earlier, unrelated match.
    my ($doctitle) = $head =~ m/<title>(.+)<\/title>/i;

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        my @doc_properties = split(/<xml>/i, $head);
        my $doc_heading = shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if ($body_text =~ /$header_exp/) {
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if ($body_text =~ /$header_exp/) {
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter = "/";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir uses forward slash for both windows and linux, e.g.
    #   windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #   linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file uses the platform delimiter, e.g.
    #   windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #   linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        my @css_files;

        # find all the stylesheets imported with @import statement
        while ($head =~ m{<style type="text/css"(.+)import "(.+)"}ig) {
            push(@css_files, $2) if defined $2;
        }

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's images folder
        foreach my $css_import (@css_files) {
            my $css_file = $css_import;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;
            # \Q..\E: the host name is literal text, its dots are not wildcards
            if ($css_file !~ /\Q$url_base\E/) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = &util::filename_cat($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//, $css_file);
                my $path_check = "$base_dir/";
                foreach my $dir (@dirs[0 .. $#dirs - 1]) {
                    $path_check .= $dir . "/";
                    mkdir($path_check) if (! -d $path_check);
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already downloaded if we used the MediaWikiDownload
                # downloading
                my $css_url = "http://$css_file";
                print "\ndownloading : " . $css_url . "\n\n";
                # list form of system(): no shell is involved
                system("wget", "--non-verbose", $css_url, "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink($css_file_path);
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            my $css_content;
            if (open(my $css_in, '<', $css_file_path)) {
                $css_content = "";
                while (my $line = <$css_in>) {
                    # comment out the body element because we change the body to div
                    # (the literal "{" is escaped: newer Perls reject a bare one here)
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                    if ($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i) {
                        if ($line !~ m/wikispecificstyle/i) {
                            $line = "#wikispecificstyle " . $line;
                        }
                    }

                    $css_content .= $line;
                }
                close($css_in);

                if (open(my $css_out, '>', $css_file_path)) {
                    print $css_out $css_content;
                    close($css_out);
                } else {
                    print $outhandle "can't write file $css_file_path\n";
                }
            }

            # Nothing to copy when the stylesheet could not be read (for
            # example after a failed download).
            next unless defined $css_content;

            # Copy the modified stylesheets to collection's images folder
            # for future customization
            my $images_dir = $base_dir;
            $images_dir =~ s/import$/images/;
            my ($css_basename) = $css_file =~ m/([^\/]*)$/;
            my $images_css_path = &util::filename_cat($images_dir, $css_basename);

            if (open(my $images_out, '>', $images_css_path)) {
                print $images_out $css_content;
                close($images_out);
            } else {
                print $outhandle "can't write file $images_css_path\n";
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    # An explicit capture is used instead of $&, which would still hold the
    # text of an earlier match if this one failed.
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    if ($body_text =~ /($print_footer)/) {
        $footer = $1;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'} && $self->{'nav_div_exp'} =~ /\S/) {
        $nav_match_exp = $self->{'nav_div_exp'};
    }

    unless (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # No /g here: a scalar-context //g match would leave pos() set on
        # $footer and make the search-box match below start after this one.
        if ($footer =~ m/($nav_match_exp)/i) {
            $preserve_sections = $1;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'} && $self->{'searchbox_div_exp'} =~ /\S/) {
        $searchbox_exp = $self->{'searchbox_div_exp'};
    }

    my $searchbox_section = "";
    if ($footer =~ m/($searchbox_exp)/i) {
        $searchbox_section = $1;
    }

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name;
        if (defined $base_dir) {
            ($collection_name) = $base_dir =~ m/\/collect\/(.+)\//i;
        }
        $collection_name = "" unless defined $collection_name;

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the pattern that failed, not the (empty) section
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    unless (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    # Group 1 is the whole opening-tag prefix, group 4 the form's name (it is
    # undefined for name="").
    my @forms;
    while ($body_text =~ m/(<form([^>]*)name=("|')([^>"']*)?("|'))/isg) {
        my ($form_start, $form_name) = ($1, $4);
        next if (defined $form_name && $form_name eq "searchform");
        push(@forms, $form_start);
    }
    foreach my $form (@forms) {
        # \Q..\E: the tag text is literal (it may contain ?, ., ( ...), and
        # this also removes the "$form[" array-subscript ambiguity.
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    # -- outgoing links
    #    -- if we have downloaded the target files, link to the internal version (relative link)
    #    -- otherwise, link to the external version (absolute links)
    # -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # extract all the links that point at the wiki host, remembering for each
    # one the text as it appears now and the absolute form to replace it with
    my @links;
    while ($body_text =~ m/(href|src)="([^>\s]*)\Q$url_base\E\/([^>\s]*)"/ig) {
        push(@links, [ "$1=\"$2$url_base/$3\"", "$1=\"http://$url_base/$3\"" ]);
    }

    foreach my $link (@links) {
        my ($local_link, $external_link) = @$link;
        # The link is matched literally, so URLs containing +, ? or . are
        # handled and no escaping backslashes end up in the output.
        $body_text =~ s/\Q$local_link\E/$external_link/i;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    if ($self->{'show_toc'} && $file =~ m/Main_Page\.(html|htm)$/) {

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            $toc_exp = $self->{'toc_exp'};
        }
        if ($body_text =~ /($toc_exp)/) {
            $mainpage_toc = $1;
        }

        if ($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;


            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = &util::filename_cat($macro_path, "extra.dm");

            my $extra_dm = "";
            if (open(my $extradm_in, '<', $extradm_file)) {
                while (my $line = <$extradm_in>) {
                    $extra_dm .= $line;
                }
                close($extradm_in);
            } else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my @packages = split("package ", $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if ($package =~ /^about/);
            }

            # Opening part of the global _content_ macro (from its "{" up to
            # the last "</div>" followed by a blank line), or undef when
            # about.dm cannot be read or does not have the expected shape.
            my $default_content_start = sub {
                my $about_dm = $self->read_content_from_about_dm();
                return undef unless defined $about_dm;
                return undef unless $about_dm =~ m/(\{(?:.|\n)*<\/div>\n\n)/;
                return $1;
            };

            my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
            my $update_extra_dm = 0;

            # The toc is looked for as literal text: it is HTML, not a pattern.
            if ($about_package =~ /\S/ && $about_package =~ m/_content_\s*\{/
                && index($about_package, $mainpage_toc) >= 0) {
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # create the extra _content_ macro for this collection from
                # the original content of the global _content_ macro
                my $content_start = $default_content_start->();
                if (defined $content_start) {
                    # append the new about package to extra.dm
                    $extra_dm .= "\n\npackage about\n_content_$content_start\n\n";
                    $extra_dm .= $toc_section;
                    $update_extra_dm = 1;
                } else {
                    print $outhandle "can't find the _content_ macro in about.dm, extra.dm not changed\n";
                }
            }
            # the about package has a complete _content_ macro without the toc:
            # append a new section div for toc to the end of the document section
            elsif ($about_package =~ /((?:\s*|\n)_content_\s*\{(?:.|\n)*?\})/) {
                my $content_macro = $1;
                my $new_content_macro = $content_macro;
                # group 1: attributes before class=, group 2: the div's content
                $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
                $update_extra_dm = 1;
            }
            # a _content_ macro is started but never closed: leave it alone
            elsif ($about_package =~ /_content_\s*\{/) {
                print $outhandle "_content_ macro in $extradm_file is not terminated, extra.dm not changed\n";
            }
            # otherwise, append _content_ macro to the about package
            else {
                my $content_start = $default_content_start->();
                if (defined $content_start) {
                    my $new_about_package = $about_package;
                    $new_about_package .= "\n\n_content_$content_start\n\n";
                    $new_about_package .= $toc_section;
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
                    $update_extra_dm = 1;
                } else {
                    print $outhandle "can't find the _content_ macro in about.dm, extra.dm not changed\n";
                }
            }

            if ($update_extra_dm) {
                # write to the extra.dm file of the collection
                if (open(my $extradm_out, '>', $extradm_file)) {
                    print $extradm_out $extra_dm;
                    close($extradm_out);
                } else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if ($self->{'delete_toc'}) {
        my $delete_toc_exp = $self->{'toc_exp'};
        if (defined $delete_toc_exp && $delete_toc_exp =~ /\S/) {
            $body_text =~ s/$delete_toc_exp//i;
        }
    }

    # Only the body is passed on; the head has already been used above.
    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/(<body[^>]*>)/$1\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    $self->SUPER::process(@_);

    return 1;
}
584
585
# Copy selected <o:NAME>value</o:NAME> elements into the document's metadata.
#
# Arguments (after $self):
#   $textref  - despite the name, a plain string: the text that is searched
#               for <o:...> elements. Nothing is done when it is undefined.
#   $metadata - not used here
#   $doc_obj  - document object; each value found is added to its top section
#               with add_utf8_metadata()
#
# $self->{'metadata_fields'} is a comma-separated list. An entry is either
# "name" (the element <o:name> is stored as metadata "name") or
# "name<gsname>" (the element <o:name> is stored as metadata "gsname").
# Element names are matched case-insensitively.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        my ($source_tag, $gs_name);
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # "name<gsname>": the part in angle brackets is the user's
            # preferred Greenstone metadata name
            ($source_tag, $gs_name) = ($1, $2);
        } else {
            # no mapping given: the field name is used for both
            ($source_tag, $gs_name) = ($field, $field);
        }

        # \Q..\E: the field name is literal text, so a name such as
        # "dc.Title" must not match "<o:dcXTitle>".
        if ($textref =~ m/<o:\Q$source_tag\E>(.*)<\/o:\Q$source_tag\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
629
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a file delimiter can be
# used as a regular expression. All other characters are left untouched.
sub safe_escape_regexp
{
    my ($delimiter) = @_;

    # one pass: "\" becomes "\\" and "/" becomes "\/"
    $delimiter =~ s{([\\/])}{\\$1}g;

    return $delimiter;
}
641
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro:
# everything from "_content_ {" up to the last "</div>", blank line,
# "</div>", "}" sequence in the file.
#
# Returns the empty string when the file cannot be opened or when it holds no
# macro of that shape. A message is printed when the file cannot be opened;
# it goes to $self->{'outhandle'} if there is one and to STDERR otherwise, so
# the routine also works when called as a plain function.
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &util::filename_cat($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_fh, '<', $about_macro_file)) {
        while (my $line = <$about_fh>) {
            $about_page_content .= $line;
        }
        close($about_fh);
    } else {
        my $outhandle = \*STDERR;
        if (ref($self) && defined $self->{'outhandle'}) {
            $outhandle = $self->{'outhandle'};
        }
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro
    # An explicit capture is used instead of $&, which would still hold the
    # text of an earlier match if this one failed. The literal braces are
    # escaped because newer Perls reject unescaped ones.
    if ($about_page_content =~ m/(_content_ \{(?:.|\n)*<\/div>\n\n<\/div>\n\})/i) {
        return $1;
    }

    return "";
}
664
6651;
Note: See TracBrowser for help on using the repository browser.