source: main/trunk/greenstone2/perllib/plugins/MediaWikiPlugin.pm@ 28836

Last change on this file since 28836 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
Line 
1###########################################################################
2#
3# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin processes an HTML file from a MediaWiki website that was downloaded
27# with MediaWikiDownload. It trims MediaWiki functional sections such as
28# login, discussion, history, etc. Only the navigation and search sections can be preserved.
29# The search box is modified to search the Greenstone collection instead of the website.
30# It can also automatically add the table of contents from the website's Main_Page to the
31# collection's Home page.
32
33package MediaWikiPlugin;
34
35use HTMLPlugin;
36use unicode;
37use util;
38use FileUtils;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
# Set up inheritance at compile time: MediaWikiPlugin is a subclass of HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
47
# Plugin-specific options.  Each entry describes one option: its name, the
# key of its description string, its type, whether it is required and
# (optionally) its default value.
my $arguments = [

    # show the table of contents on the collection's home page
    {   name => 'show_toc',
        desc => '{MediaWikiPlugin.show_toc}',
        type => 'flag',
        reqd => 'no',
    },

    # set to delete the table of contents section on each MediaWiki page
    {   name => 'delete_toc',
        desc => '{MediaWikiPlugin.delete_toc}',
        type => 'flag',
        reqd => 'no',
    },

    # regexp to match the table of contents
    {   name => 'toc_exp',
        desc => '{MediaWikiPlugin.toc_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<table([^>]*)id=(\"|')toc(\"|')(.|\n)*?</table>\n},
    },

    # set to delete the navigation section
    {   name => 'delete_nav',
        desc => '{MediaWikiPlugin.delete_nav}',
        type => 'flag',
        reqd => 'no',
        deft => '',
    },

    # regexp to match the navigation section
    {   name => 'nav_div_exp',
        desc => '{MediaWikiPlugin.nav_div_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?</div>},
    },

    # set to delete the searchbox section
    {   name => 'delete_searchbox',
        desc => '{MediaWikiPlugin.delete_searchbox}',
        type => 'flag',
        reqd => 'no',
        deft => '',
    },

    # regexp to match the searchbox section
    {   name => 'searchbox_div_exp',
        desc => '{MediaWikiPlugin.searchbox_div_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => q{<div([^>]*)id=(\"|')p-search(\"|')(.|\n)*?</div>},
    },

    # regexp to match a title suffix; the title_sub option of HTMLPlugin
    # cannot be used instead because title_sub always matches from the
    # beginning of the title
    {   name => 'remove_title_suffix_exp',
        desc => '{MediaWikiPlugin.remove_title_suffix_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => '',
    },
];

# Description of the plugin itself; 'args' points at the option list above.
my $options = {
    name     => 'MediaWikiPlugin',
    desc     => '{MediaWikiPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
105
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to HTMLPlugin's constructor
#   $hashArgOptLists - hash ref; this plugin's option descriptions are
#                      appended to its "ArgList" and "OptList" entries
#
# Returns the object built by HTMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call: the indirect-object form
    # "new HTMLPlugin(...)" is ambiguous to the parser, especially inside a
    # package that defines its own sub named "new".
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# Clean up one downloaded MediaWiki HTML page and pass the result on to
# HTMLPlugin's process().
#
# Arguments (after $self):
#   $textref  - reference to the page text; it is rewritten in place
#   $pluginfo - not used here, handed on to the parent class
#   $base_dir - the collection's import directory (forward slashes)
#   $file     - path of the page relative to $base_dir (platform delimiter)
#   $metadata - handed to extract_metadata() and the parent class
#   $doc_obj  - document object that receives the Title metadata
#   $gli      - not used here, handed on to the parent class
#
# Side effects visible in this sub: may download and rewrite stylesheets
# under $base_dir, copy them to the sibling "style" folder, and (with
# show_toc on Main_Page) rewrite the collection's macros/extra.dm.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # Everything before the first "<body" is treated as the head.
    my @head_and_body = split(/<body/i, $$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # Only trust $1 when the match actually succeeded.
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        # the metadata lives between <xml> and </xml> in the head
        my @doc_properties = split(/<xml>/i, $head);
        shift(@doc_properties);    # drop the text in front of the first <xml>
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/) {
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        }
        else {
            $self->title_fallback($doc_obj, $doc_obj->get_top_section(), $file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if ($body_text =~ /$header_exp/) {
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    }
    else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if ($body_text =~ /$header_exp/) {
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    }
    else {
        $file_delimiter = "/";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir uses forward slash for both windows and linux
    #    windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #    linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file uses the platform delimiter
    #    windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #    linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website (first path component of $file)
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs = split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if (defined $base_dir && $base_dir ne "") {
        my @css_files;

        # find all the stylesheets imported with @import statement
        while ($head =~ m{<style type="text/css"(.+)import "(.+)"}ig) {
            push(@css_files, $2) if defined $2;
        }

        # Set the env for wget once, outside the for loop
        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib)
        # being set, particularly on Mac Lions (android too?)
        &util::set_gnomelib_env();

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's style folder
        foreach my $css_entry (@css_files) {
            my $css_file = $css_entry;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;

            # Plain substring test: the host name must not be treated as a
            # regular expression (its dots would match any character).
            if ($url_base ne "" && index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = &FileUtils::filenameConcatenate($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//i, $css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs) - 1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check);
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already be downloaded if we used the
                # MediaWikiDownload downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up
            # with Greenstone formats, so we will wrap the web page with a div with
            # id = wikispecificstyle
            my $css_content;    # stays undef when the stylesheet cannot be read
            if (open(my $css_in, "<", $css_file_path)) {
                $css_content = "";
                while (my $line = <$css_in>) {
                    # comment out the body element because we change the body to div
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                    if ($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i) {
                        if ($line !~ m/wikispecificstyle/i) {
                            $line = "#wikispecificstyle " . $line;
                        }
                    }

                    $css_content .= $line;
                }
                close($css_in);

                if (open(my $css_out, ">", $css_file_path)) {
                    print $css_out $css_content;
                    close($css_out);
                }
                else {
                    print $outhandle "can't open file $css_file_path\n";
                }
            }

            # Copy the modified stylesheets to collection's style folder
            # for future customization.  Skipped when nothing was read, so
            # no empty stylesheet is left behind.
            if (defined $css_content) {
                my $style_dir = $base_dir;
                $style_dir =~ s/import$/style/;

                # file name = everything after the last slash
                my ($css_basename) = $css_file =~ m/([^\/]*)$/;
                my $style_file = &FileUtils::filenameConcatenate($style_dir, $css_basename);

                if (open(my $style_out, ">", $style_file)) {
                    print $style_out $css_content;
                    close($style_out);
                }
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = "";
    if ($body_text =~ /$print_footer/) {
        $footer = $&;
    }
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'}) {
        $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/);
    }

    if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1")) {
        # do nothing
    }
    else {
        # No /g here: a scalar-context /g match would leave pos($footer)
        # set and make the searchbox match below start after this section.
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $&;
        }
        else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if (defined $self->{'searchbox_div_exp'}) {
        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
    }

    my $searchbox_section = "";
    if ($footer =~ m/$searchbox_exp/i) {
        $searchbox_section = $&;
    }

    # make the searchbox form work in Greenstone
    if ($searchbox_section =~ /\S/) {
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            . "<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    }
    else {
        # report the expression that failed, not the (empty) section
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if (defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    }
    else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if ($preserve_sections ne "") {
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while ($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg) {
        next if (defined $3 && $3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # \Q..\E: the opening tag is literal text, not a pattern
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//m;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    #   -- outgoing links
    #      -- if we have downloaded the target files, link to the internal version (relative link)
    #      -- otherwise, link to the external version (absolute links)
    #   -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # Rewrite every href/src that contains the site's base url into an
    # absolute http:// link.  The replacement is computed while the match
    # variables are still valid, and the matched text is quoted when it is
    # searched for again, so characters such as + ? . ( ) stay literal.
    if ($url_base ne "") {
        my @links;
        while ($body_text =~ m/(href|src)="([^>\s]*)\Q$url_base\E\/([^>\s]*)"/ig) {
            push(@links, [ $&, "$1=\"http://$url_base/$3\"" ]);
        }

        foreach my $link (@links) {
            my ($found_text, $external_file_path) = @$link;
            $body_text =~ s/\Q$found_text\E/$external_file_path/i;
        }
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    # TODO: we assume the _about:content_ hasn't been specified before
    # so needs to add function to handle when the macro is already in the extra.dm
    if (defined $self->{'show_toc'} && $self->{'show_toc'} eq "1"
        && $file =~ m/Main_Page\.(html|htm)$/) {

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            $toc_exp = $self->{'toc_exp'};
        }
        if ($body_text =~ /$toc_exp/) {
            $mainpage_toc = $&;
        }

        if ($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;


            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = &FileUtils::filenameConcatenate($macro_path, "extra.dm");

            my $extra_dm = "";
            if (open(my $dm_in, "<", $extradm_file)) {
                while (my $line = <$dm_in>) {
                    $extra_dm .= $line;
                }
                close($dm_in);
            }
            else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my $about_package = "";
            foreach my $package (split(/package /, $extra_dm)) {
                $about_package = "package " . $package if ($package =~ /^about/);
            }

            my $update_extra_dm = 0;

            # The toc is HTML, so look for it as a plain substring rather
            # than interpreting it as a regular expression.
            if ($about_package =~ /\S/
                && $about_package =~ m/_content_(\s*)\{/
                && index($about_package, $mainpage_toc) >= 0) {
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # read _content_ macro from $GSDLHOME/macros/about.dm file
                my $global_about_package = $self->read_content_from_about_dm();
                $global_about_package = "" unless defined $global_about_package;

                # create the extra _content_ macro for this collection
                # add the original content of the _content_ macro
                my $content_body = "";
                if ($global_about_package =~ m/\{(.|\n)*<\/div>\n\n/) {
                    $content_body = $&;
                }

                # append the new about package to extra.dm
                $extra_dm .= "\n\npackage about\n_content_$content_body\n\n";
                $extra_dm .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

                $update_extra_dm = 1;
            }
            # the about package exists, but either doesn't have the _content_ macro or
            # the _content_ macro doesn't contain the toc
            else {
                # if there is a content macro, append a new section div for
                # the toc to the end of the document section
                if ($about_package =~ /(\s*|\n)_content_(\s*)\{/) {
                    if ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?\}/) {
                        my $content_macro = $&;
                        my $new_content_macro = $content_macro;
                        # $1 = attributes in front of class, $2 = the div's content
                        $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                        $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/mg;
                    }
                }
                # otherwise, append _content_ macro to the about package
                else {
                    my $new_about_package = $about_package;
                    my $content_macro = $self->read_content_from_about_dm();
                    $content_macro = "" unless defined $content_macro;

                    my $content_body = "";
                    if ($content_macro =~ m/\{(.|\n)*<\/div>\n\n/) {
                        $content_body = $&;
                    }

                    $new_about_package .= "\n\n_content_$content_body\n\n";
                    $new_about_package .= "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";
                    $extra_dm =~ s/\Q$about_package\E/$new_about_package/mg;
                }

                # either the case, we need to update the extra.dm
                $update_extra_dm = 1;
            }

            if ($update_extra_dm == 1) {
                # write to the extra.dm file of the collection
                if (open(my $dm_out, ">", $extradm_file)) {
                    print $dm_out $extra_dm;
                    close($dm_out);
                }
                else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        }
        else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and toc contents.
    if (defined $self->{'delete_toc'} && $self->{'delete_toc'} eq "1") {
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/) {
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    # @_ still holds the original arguments (minus $self)
    $self->SUPER::process(@_);

    return 1;
}
590
591
# Copy selected <o:NAME>value</o:NAME> properties from a piece of text into
# the document's top section as metadata.
#
# Arguments (after $self):
#   $textref  - the text to search; despite the name it is used as a plain
#               string here, and nothing is done when it is undefined
#   $metadata - not used by this sub
#   $doc_obj  - document object; must provide get_top_section() and
#               add_utf8_metadata()
#
# $self->{'metadata_fields'} is a comma-separated list.  Each entry is either
# "name" or "name<gsname>"; in the second form the value found under "name"
# is stored under the metadata name "gsname".  Property names are matched
# case-insensitively.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    # 'key' is the (lowercase) name of the property to look for,
    # 'value' is the metadata name for greenstone to use
    my %find_fields = ();

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        my $orig_field;

        # support tag<tagname>
        if ($field =~ /^(.*?)<(.*?)>$/) {
            # "$2" is the user's preferred gs metadata name
            $orig_field = $1;
            $find_fields{lc($orig_field)} = $2;
        }
        else {
            # no <tagname> for mapping: "$field" is also the gs metadata name
            $orig_field = $field;
            $find_fields{lc($field)} = $field;
        }

        # \Q..\E keeps characters such as "." in the field name literal;
        # the non-greedy capture stops at the first closing tag.
        if ($textref =~ m/<o:\Q$orig_field\E>(.*?)<\/o:\Q$orig_field\E>/i) {
            my $value = $1;
            chomp($value);    # remove trailing \n, if any
            my $tag = $find_fields{lc($orig_field)};
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
        }
    }

    return;
}
635
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a path delimiter can be
# used as a pattern.
sub safe_escape_regexp
{
    my ($raw) = @_;

    # both delimiters are handled in one pass
    (my $escaped = $raw) =~ s{([\\/])}{\\$1}g;

    return $escaped;
}
647
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# from "_content_ {" up to the closing "}" that follows
# "</div>\n\n</div>\n".
#
# Returns the empty string when the file cannot be read or the macro is not
# found.  A "can't open file" message is printed to $self->{'outhandle'},
# or to STDERR when the sub is called without an object.
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, "<", $about_macro_file)) {
        while (my $line = <$about_in>) {
            $about_page_content .= $line;
        }
        close($about_in);
    }
    else {
        my $outhandle = (ref($self) && defined $self->{'outhandle'})
            ? $self->{'outhandle'}
            : \*STDERR;
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; $& is only meaningful after a successful
    # match, otherwise it still holds the result of some earlier match
    if ($about_page_content =~ m/_content_ \{(.|\n)*<\/div>\n\n<\/div>\n\}/i) {
        return $&;
    }

    return "";
}
670
6711;
Note: See TracBrowser for help on using the repository browser.