source: main/trunk/greenstone2/perllib/plugins/MediaWikiPlugin.pm@ 31780

Last change on this file since 31780 was 31780, checked in by ak19, 4 years ago

When testing GS3.08's GLI on Ubuntu v 16.04, found its perl v 5.22.1 issued warnings about unescaped left brace in regex. As per https://unix.stackexchange.com/questions/238539/automake-error-unescaped-left-brace-in-regex-is-deprecated, in even later versions of perl, this counts as a syntax error rather than resulting in a warning. So making the minimum number of changes that allowed perl to parse the plugins and open GLI successfully, without issuing warnings on perl 5.22.1 on this Ubuntu 16.04.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
Line 
1###########################################################################
2#
3# MediaWikiPlugin.pm -- html plugin with extra facilities for wiki page
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26# This plugin processes an HTML file from a MediaWiki website that was downloaded by
27# MediaWikiDownload. It trims MediaWiki functional sections such as
28# login, discussion, history, etc.; only the navigation and search sections can be preserved.
29# The search box is modified to search the Greenstone collection instead of the website.
30# The plugin can also automatically add the table of contents on the website's Main_Page to the
31# collection's Home page.
32
33package MediaWikiPlugin;
34
35use HTMLPlugin;
36use unicode;
37use util;
38use FileUtils;
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
# Set up inheritance at compile time: MediaWikiPlugin derives from HTMLPlugin.
BEGIN {
    @MediaWikiPlugin::ISA = qw(HTMLPlugin);
}
47
# Plugin-specific options, in the argument-spec format that is pushed onto
# the shared "ArgList" in new().  Each 'desc' is a key in braces, not the
# literal description text.
my $arguments = [

    # show the table of contents on the collection's home page
    {
        name => "show_toc",
        desc => "{MediaWikiPlugin.show_toc}",
        type => "flag",
        reqd => "no",
    },

    # delete the table of contents section on each MediaWiki page
    {
        name => "delete_toc",
        desc => "{MediaWikiPlugin.delete_toc}",
        type => "flag",
        reqd => "no",
    },

    # regular expression that matches the table of contents
    {
        name => "toc_exp",
        desc => "{MediaWikiPlugin.toc_exp}",
        type => "regexp",
        reqd => "no",
        deft => "<table([^>]*)id=(\\\"|')toc(\\\"|')(.|\\n)*?</table>\\n",
    },

    # delete the navigation section
    {
        name => "delete_nav",
        desc => "{MediaWikiPlugin.delete_nav}",
        type => "flag",
        reqd => "no",
        deft => "",
    },

    # regular expression that matches the navigation section
    {
        name => "nav_div_exp",
        desc => "{MediaWikiPlugin.nav_div_exp}",
        type => "regexp",
        reqd => "no",
        deft => "<div([^>]*)id=(\\\"|')p-navigation(\\\"|')(.|\\n)*?<\/div>",
    },

    # delete the searchbox section
    {
        name => "delete_searchbox",
        desc => "{MediaWikiPlugin.delete_searchbox}",
        type => "flag",
        reqd => "no",
        deft => "",
    },

    # regular expression that matches the searchbox section
    {
        name => "searchbox_div_exp",
        desc => "{MediaWikiPlugin.searchbox_div_exp}",
        type => "regexp",
        reqd => "no",
        deft => "<div([^>]*)id=(\\\"|')p-search(\\\"|')(.|\\n)*?<\/div>",
    },

    # regular expression that matches a title suffix to strip.
    # The title_sub option of HTMLPlugin cannot be used for this because
    # title_sub always matches from the beginning of the title.
    {
        name => "remove_title_suffix_exp",
        desc => "{MediaWikiPlugin.remove_title_suffix_exp}",
        type => "regexp",
        reqd => "no",
        deft => "",
    },
];
99
# Option record for this plugin; new() pushes it onto the shared "OptList".
# 'args' points at the argument specs declared in $arguments.
my $options = {
    name     => "MediaWikiPlugin",
    desc     => "{MediaWikiPlugin.desc}",
    abstract => "no",
    inherits => "yes",
    args     => $arguments,
};
105
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to HTMLPlugin->new
#   $hashArgOptLists - hash ref; this plugin's argument specs are appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by HTMLPlugin->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Use an explicit class-method call.  The indirect-object form
    # ("new HTMLPlugin(...)") is ambiguous in a package that defines its own
    # sub new, because perl has to guess between a method call and a call to
    # the local function.
    my $self = HTMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
117
118
119
# Clean up one downloaded MediaWiki HTML page and pass it on to the parent
# class's process().
#
# Parameters (after $self):
#   $textref  - reference to the page's HTML; rewritten in place
#   $pluginfo - not used here
#   $base_dir - import directory of the collection (forward slashes)
#   $file     - page path relative to $base_dir (platform delimiter); its
#               first component is taken as the wiki's base URL
#   $metadata - passed to extract_metadata()
#   $doc_obj  - document object that receives the Title metadata
#   $gli      - not used here
#
# What it does, in order:
#   * splits the page at <body and sets the Title metadata from <title>
#   * strips the header, timeline, edit links and similar MediaWiki chrome
#   * re-checks / downloads @import-ed stylesheets, prefixes their rules with
#     "#wikispecificstyle" and copies them to the collection's style folder
#   * keeps only the navigation and search boxes from the footer, rewiring
#     the search form to Greenstone
#   * removes other forms, rewrites links containing the base URL to absolute
#     http:// links, and colours "new" and external links
#   * optionally adds the Main_Page table of contents to the collection's
#     macros/extra.dm (show_toc) and/or deletes the toc (delete_toc)
#   * wraps the body in <div id="wikispecificstyle"> and calls SUPER::process
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my @head_and_body = split(/<body/i,$$textref);
    my $head = shift(@head_and_body);
    $head = "" unless defined $head;
    my $body_text = join("<body", @head_and_body);

    # Only read $1 when this match succeeded; after a failed match it still
    # holds the capture of some earlier, unrelated match.
    my $doctitle;
    if ($head =~ m/<title>(.+)<\/title>/i) {
        $doctitle = $1;
    }

    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
        my @doc_properties = split(/<xml>/i,$head);
        my $doc_heading = shift(@doc_properties);
        my $rest_doc_properties = join(" ", @doc_properties);

        my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
        my $extracted_metadata = shift (@extracted_metadata);
        $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
    }

    # set the title here if we haven't found it yet
    if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
        if (defined $doctitle && $doctitle =~ /\S/) {
            # remove suffix in title if required
            my $remove_suffix_exp = $self->{'remove_title_suffix_exp'};
            if (defined $remove_suffix_exp && $remove_suffix_exp =~ /\S/){
                $doctitle =~ s/$remove_suffix_exp//i;
            }
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
        } else {
            $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
        }
    }

    # we are only interested in the column-contents div <div id="column-content">
    # remove header section, it may contain header images or additional search boxes
    my $header_exp = "<div([^>]*)id=(\"|')container(\"|')([^>]*)>(.|\\n)*<div([^>]*)id=(\"|')column-content";
    if($body_text =~ /$header_exp/){
        $body_text =~ s/$header_exp/<div$1id='container'$4><div$6id='column-content/isg;
    } else {
        $header_exp = "(.|\\n)*?<div([^>]*)?id=(\"|')column-content";
        if($body_text =~ /$header_exp/){
            $body_text =~ s/$header_exp/<div$2id='column-content/i;
        }
    }

    # remove timeline
    $body_text =~ s/<div([^>]*)class=("|')smwtimeline("|')[\s\S]*?<\/div>//mg;

    # remove extra bits
    my $extra_bits = "Retrieved from(.+)</a>\"";
    $body_text =~ s/$extra_bits//isg;

    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    $body_text =~ s/<!\[if !vml\]>/<![if vml]>/g;
    $body_text =~ s/(&nbsp;)+/&nbsp;/sg;

    # get rid of the [edit] buttons
    $body_text =~ s/\[<a([^>]*)>edit<\/a>]//g;
    # get rid of the last time edit information at the bottom
    $body_text =~ s/<a href="([^>]*)edit([^>]*)"([^>]*?)>(\w+)<\/a> \d\d:\d\d,([\s|\w]*?)\(PST\)//g;
    # get rid of the (Redirected from ...)
    $body_text =~ s/\(Redirected from <a ([^>]*)>(\w|\s)*?<\/a>\)//isg;

    # escape texts macros
    $body_text =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
    # may change the links, like Greenstone_Documentation_All.html, then change back
    $body_text =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;

    # define file delimiter for different platforms
    my $file_delimiter;
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $file_delimiter = "\\";
    } else {
        $file_delimiter = "/";
    }

    # IMPORTANT: different delimiter for $base_dir and $file
    # $base_dir uses forward slashes on both windows and linux, e.g.
    #    windows: C:/Program Files/Greenstone2.73/collect/wiki/import
    #    linux:   /research/lh92/greenstone/greenstone2.73/collect/wiki/import
    # $file uses the platform delimiter, e.g.
    #    windows: greenstone.sourceforge.net\wiki\index.php\Access_Processing_using_DBPlugin.html
    #    linux:   greenstone.sourceforge.net/wiki/index.php/Using_GreenstoneWiki.html

    # get the base url for the MediaWiki website
    my $safe_delimiter = safe_escape_regexp($file_delimiter);
    my @url_dirs=split($safe_delimiter, $file);
    my $url_base = $url_dirs[0];
    $url_base = "" unless defined $url_base;

    # Re-check css files associated with MediaWiki pages
    if(defined $base_dir && $base_dir ne ""){
        my @css_files;

        # find all the stylesheets imported with @import statement
        while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
            push(@css_files, $2) if defined $2;
        }

        # Set the env for wget once, outside the for loop
        # the wget binary is dependent on the gnomelib_env (particularly lib/libiconv2.dylib) being set, particularly on Mac Lions (android too?)
        util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set

        # download the stylesheets if we haven't downloaded them yet
        # add prefix to each style element, comment out the body element
        # and copy the files to collection's style folder
        foreach my $css_import (@css_files) {

            my $css_file = $css_import;

            # remove prefix gli/cache directory
            $css_file =~ s/^(.+)gli(\\|\/)cache(\\|\/)//i;

            # change the \ delimiter in $css_file to / for consistency
            $css_file =~ s/\\/\//isg;
            # literal substring test: the base url contains dots, which would
            # otherwise act as regex wildcards
            if(index($css_file, $url_base) < 0) {
                $css_file = $url_base . $css_file;
            }

            # trim the ? mark append to the end of a stylesheet
            $css_file =~ s/\?(.+)$//isg;

            my $css_file_path = FileUtils::filenameConcatenate($base_dir, $css_file);

            # do nothing if we have already downloaded the css files
            if (! -e $css_file_path) {

                # check the stylesheet's directory in the import folder
                # if the directory doesn't exist, create one
                my @dirs = split(/\//i,$css_file);
                my $path_check = "$base_dir/";
                for (my $i = 0; $i < (scalar(@dirs)-1); $i++) {
                    $path_check .= $dirs[$i] . "/";
                    mkdir($path_check) if (! -d $path_check );
                }

                # NOTE: wget needs configuration to directly access Internet
                # These files should already be downloaded if we used the MediaWikiDownload
                # downloading
                $css_file = "http://$css_file";
                print "\ndownloading : " . $css_file . "\n\n";
                system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
                if ($? != 0) {
                    print "[ERROR] Download Failed! Make sure WGet connects to Internet directly \n";
                    print "[ERROR] OR ues the MediaWikiDownload in the GLI DownloadPanel to download from a MediaWiki website\n";
                    unlink("$css_file_path");
                }
            } # done with download

            # add a prefix "#wikispecificstyle" to each element
            # because we want to preserve this website's formats and don't want to mess up with Greenstone formats
            # so we will wrap the web page with a div with id = wikispecificstyle
            my $css_content;
            if (open(my $css_in, "<", $css_file_path)) {
                $css_content = "";
                while (my $line = <$css_in>) {
                    # comment out the body element because we change the body to div
                    $line =~ s/^(\s*)body(\s*)\{(\s*)$/$1\/*body$2*\/{$3/isg;

                    if($line =~ m/^(.+)\{/i || $line =~ m/^(\s)*#/i){
                        if($line !~ m/wikispecificstyle/i){
                            $line = "#wikispecificstyle " . $line;
                        }
                    }

                    $css_content .= $line;
                }
                close($css_in);

                if (open(my $css_out, ">", $css_file_path)) {
                    print {$css_out} $css_content;
                    close($css_out);
                } else {
                    print $outhandle "can't open file $css_file_path\n";
                }
            }

            # Copy the modified stylesheets to collection's style folder
            # for future customization.  Skipped when the stylesheet could not
            # be read, so a failed download does not leave an empty file there.
            if (defined $css_content) {
                my $style_dir = $base_dir;
                $style_dir =~ s/import$/style/;
                my $css_name = ($css_file =~ m/(.*)\/(.*)$/) ? $2 : $css_file;
                $style_dir = FileUtils::filenameConcatenate($style_dir, $css_name);

                if (open(my $style_out, ">", $style_dir)) {
                    print {$style_out} $css_content;
                    close($style_out);
                }
            }
        }
    }


    # by default, only preserve navigation box and search box
    # others like toolbox, interaction, languages box, will be removed

    # extract the larger part -- footer section
    my $print_footer = "<div class=\"printfooter\">(.|\n)+</body>";
    my $footer = ($body_text =~ /$print_footer/) ? $& : "";
    $footer =~ s/<\/body>//isg;

    # trim the comments first
    $footer =~ s/<!--[\s\S]*?--[ \t\n\r]*>//isg;

    # contain sections that are to be preserved
    my $preserve_sections = "";

    # process the navigation section
    my $nav_match_exp = "<div([^>]*)id=(\"|')p-navigation(\"|')(.|\n)*?<\/div>";
    if (defined $self->{'nav_div_exp'}) {
        $nav_match_exp = $self->{'nav_div_exp'} if ($self->{'nav_div_exp'} =~ /\S/) ;
    }

    my $delete_nav = (defined $self->{'delete_nav'} && ($self->{'delete_nav'} eq "1"));
    if (!$delete_nav) {
        # no /g here: a scalar-context /g match would leave pos($footer) set
        # and make the searchbox match below start after the navigation box
        if ($footer =~ m/$nav_match_exp/i) {
            $preserve_sections = $& ;
        } else {
            print $outhandle "Can't find the navigation section with : $nav_match_exp\n";
        }
    }

    # process the searchbox section
    my $searchbox_exp = "<div([^>]*)id=(\"|')p-search(\"|')(.|\\n)*?<\/div>";
    if(defined $self->{'searchbox_div_exp'}) {
        $searchbox_exp = $self->{'searchbox_div_exp'} if ($self->{'searchbox_div_exp'} =~ /\S/);
    }

    # take $& only from a successful match, otherwise the text of an earlier
    # match would be mistaken for the searchbox
    my $searchbox_section = ($footer =~ m/$searchbox_exp/i) ? $& : "";

    # make the searchbox form work in Greenstone
    if($searchbox_section =~ /\S/){
        # replace action
        $searchbox_section =~ s/action="([^>]*)"/action="_gwcgi_"/isg;

        # remove buttons
        $searchbox_section =~ s/name="search"/name="q"/isg;
        $searchbox_section =~ s/name="go"//isg;
        $searchbox_section =~ s/name="fulltext"//isg;

        # get collection name from $base_dir for c param
        my $collection_name = "";
        if (defined $base_dir && $base_dir =~ m/\/collect\/(.+)\//i) {
            $collection_name = $1;
        }

        # add Greenstone search params
        my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
            ."<input type=\"hidden\" name=\"c\" value=\"$collection_name\"/>\n";

        $searchbox_section =~ s/<form([^>]*)>/<form$1>\n$hidden_params/isg;
    } else {
        # report the pattern that failed (the section itself is empty here)
        print $outhandle "Can't find the searchbox section with : $searchbox_exp\n";
    }

    # either delete or replace the searchbox
    if(defined $self->{'delete_searchbox'} && $self->{'delete_searchbox'} eq "1") {
        # do nothing
    } else {
        $preserve_sections .= "\n$searchbox_section\n";
    }

    if($preserve_sections ne ""){
        $preserve_sections = "<div id=\"column-one\">\n" . $preserve_sections . "\n</div>\n";
    }
    $preserve_sections = "</div></div></div>\n" . $preserve_sections . "\n</body>";

    $body_text =~ s/$print_footer/$preserve_sections/isg;


    # delete other forms in the page
    my @forms;
    while($body_text =~ m/<form([^>]*)name=("|')([^>"']*)?("|')/isg){
        next if($3 eq "searchform");
        push(@forms, $&);
    }
    foreach my $form (@forms) {
        # \Q...\E: the opening tag is literal HTML, not a pattern (and it keeps
        # "$form[" from being read as an array subscript)
        $body_text =~ s/\Q$form\E[\s\S]*?<\/form>//;
    }

    # process links.
    # because current WGET 1.10 the -k and -E option doesn't work together
    # need to 'manually' convert the links to relative links
    # Dealing with 3 types of links:
    # -- outgoing links
    #    -- if we have downloaded the target files, link to the internal version (relative link)
    #    -- otherwise, link to the external version (absolute links)
    # -- in-page links (relative link)

    # NOTE: (important)
    # must use the MediaWikiDownload in GLI Download Panel to download files from a MediaWiki website
    # otherwise, the internal links may have problems

    # remove the title attribute of <a> tag
    $body_text =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;

    # extract all the links
    my @links;
    while($body_text =~ m/(href|src)="([^>\s]*)\Q$url_base\E\/([^>\s]*)"/ig){
        push(@links, "$1=\"$2$url_base/$3\"");
    }

    foreach my $cur_link (@links) {
        next unless $cur_link =~ m/(.+)"([^>]*)\Q$url_base\E\/([^>\s]*)"/;
        my $external_file_path = "$1\"http://$url_base/$3\"";

        # match the link literally; quoting here (rather than hand-escaping
        # "+" in the link text) keeps backslashes out of the replacement
        $body_text =~ s/\Q$cur_link\E/$external_file_path/i;
    }

    # tag links to new wiki pages as red
    $body_text =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2>/gi;

    # tag links to pages external of the MediaWiki website as blue
    $body_text =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2>/gi;


    # process the table-of-contents section
    # if 'show_toc' is set, add Main_Page's toc to the collection's About page, change extra.dm file
    # 1. read _content_ macro from about.dm
    # 2. append the toc, change all links to the Greenstone internal format for relative links
    # 3. write to the extra.dm
    # TODO: we assume the _about:content_ hasn't been specified before
    # so needs to add function to handle when the macro is already in the extra.dm
    if(defined $self->{'show_toc'} && $self->{'show_toc'}==1 && $file =~ m/Main_Page\.(html|htm)$/){

        # extract toc of the Main_Page
        my $mainpage_toc = "";
        my $toc_exp = "<table([^>]*)id=(\"|')toc(\"|')(.|\\n)*</table>\\n";
        if(defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
            $toc_exp = $self->{'toc_exp'};
        }
        if($body_text =~ /$toc_exp/){
            $mainpage_toc = $&;
        }

        if($mainpage_toc =~ /\S/) {

            # change the in-page links to relative links, for example, change <a href="#section1"> to
            # <a href="_httpquery_&a=extlink&rl=1&href=http://www.mediawikisite.com/Main_Page.html#section1">
            my $file_url_format = $file;
            $file_url_format =~ s/\\/\//isg;
            $file_url_format = "http://" . $file_url_format;

            # encode as URL, otherwise doesn't work on Windows
            $file_url_format =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
            $mainpage_toc =~ s/<a href="([^>"#]*)#([^>"]*)"/<a href="_httpquery_&a=extlink&rl=1&href=$file_url_format#$2"/isg;


            # read the collection's extra.dm
            my $macro_path = $base_dir;
            $macro_path =~ s/import$/macros/;
            my $extradm_file = FileUtils::filenameConcatenate($macro_path, "extra.dm");

            my $extra_dm = "";
            if (open(my $extra_in, "<", $extradm_file)) {
                while (my $line = <$extra_in>) {
                    $extra_dm .= $line;
                }
                close($extra_in);
            } else {
                print $outhandle "can't open file $extradm_file\n";
            }

            # check whether we have changed the macros
            my @packages = split("package ", $extra_dm);
            my $about_package = "";
            foreach my $package (@packages) {
                $about_package = "package " . $package if($package =~ /^about/);
            }

            my $update_extra_dm = 0;

            # closing part appended after the original _content_ text
            my $toc_section = "<div class=\"section\">\n$mainpage_toc\n</div>\n</div>\n}";

            # the toc is HTML, so look for it as a literal substring
            if( $about_package =~ /\S/ && $about_package =~ m/_content_(\s*)\{/ && index($about_package, $mainpage_toc) >= 0){
                print $outhandle "_content_ macro already changed!!!!\n";
            }
            # if extra.dm doesn't have an "about package"
            elsif ($about_package !~ /\S/) {
                # read _content_ macro from $GSDLHOME/macros/about.dm file
                my $global_about_package = $self->read_content_from_about_dm();

                # create the extra _content_ macro for this collection,
                # starting from the original content of the _content_ macro
                if (defined $global_about_package && $global_about_package =~ m/\{(.|\n)*<\/div>\n\n/) {
                    # append the new about package to extra.dm
                    $extra_dm .= "\n\npackage about\n_content_$&\n\n";
                    $extra_dm .= $toc_section;

                    $update_extra_dm = 1;
                } else {
                    print $outhandle "can't find the _content_ macro in about.dm\n";
                }
            }
            # the about package exists, but either doesn't have the _content_ macro or
            # the _content_ macro doesn't contain the toc
            else {
                # if there is a content macro,
                # append a new section div for toc to the end of the document section
                if ($about_package =~ /(\s*|\n)_content_(\s*)\{(.|\n)*?}/) {
                    my $content_macro = $&;
                    my $new_content_macro = $content_macro;
                    # $1 = attributes of the document div, $2 = its content
                    $new_content_macro =~ s/<div([^>]*)class="document">((?:.|\n)*)<\/div>/<div$1class="document">$2\n\n<div class="section">\n$mainpage_toc\n<\/div>\n<\/div>/;
                    $extra_dm =~ s/\Q$content_macro\E/$new_content_macro/g;
                    $update_extra_dm = 1;
                }
                # otherwise, append _content_ macro to the about package
                else {
                    my $content_macro = $self->read_content_from_about_dm();
                    if (defined $content_macro && $content_macro =~ m/\{(.|\n)*<\/div>\n\n/) {
                        my $new_about_package = $about_package . "\n\n_content_$&\n\n" . $toc_section;
                        $extra_dm =~ s/\Q$about_package\E/$new_about_package/g;
                        $update_extra_dm = 1;
                    } else {
                        print $outhandle "can't find the _content_ macro in about.dm\n";
                    }
                }
            }

            if($update_extra_dm==1){
                # write to the extra.dm file of the collection
                if (open(my $extra_out, ">", $extradm_file)) {
                    print {$extra_out} $extra_dm;
                    close($extra_out);
                } else {
                    print $outhandle "can't open $extradm_file\n";
                }
            }
        } else {
            print $outhandle "Main_Page doesn't have a table-of-contents section\n";
        }
    }

    # If delete_toc is set, remove toc and tof contents.
    if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
        if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
            if ($body_text =~ /$self->{'toc_exp'}/) {
                $body_text =~ s/$self->{'toc_exp'}//i;
            }
        }
    }

    $$textref = "<body" . $body_text;

    # Wrap the whole page with <div id="wikispecificstyle"></div>
    # keep the style of this website and don't mess up with the Greenstone styles
    $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
    $$textref =~ s/<\/body>/<\/div><\/body>/is;

    # @_ no longer contains $self (it was shifted off above)
    $self->SUPER::process(@_);

    return 1;
}
590
591
# Pull <o:FIELD>value</o:FIELD> entries out of a block of text and add them
# as metadata to the top section of the document.
#
# Parameters (after $self):
#   $textref  - the text to search (despite the name it is used as a plain
#               string, not dereferenced); nothing is done when undefined
#   $metadata - not used here
#   $doc_obj  - document object; receives add_utf8_metadata() calls
#
# $self->{'metadata_fields'} is a comma-separated list.  Each entry is either
# "name" or "name<gsname>"; "name" is the <o:name> element to look for
# (case-insensitively) and "gsname" (or "name" itself when no mapping is
# given) is the metadata name the value is stored under.
sub extract_metadata
{
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj) = @_;

    return if (!defined $textref);
    return if (!defined $self->{'metadata_fields'});

    foreach my $field (split /,/, $self->{'metadata_fields'}) {
        # by default the field is stored under its own name
        my ($source_name, $gs_name) = ($field, $field);

        # support tag<tagname>: "tagname" is the user's preferred gs metadata name
        if ($field =~ /^(.*?)<(.*?)>$/) {
            ($source_name, $gs_name) = ($1, $2);
        }

        # \Q...\E: the field name is literal text, so that e.g. the dot in
        # "dc.Title" does not match arbitrary characters
        if ($textref =~ m/<o:\Q$source_name\E>(.*)<\/o:\Q$source_name\E>/i) {
            my $value = $1;
            chomp($value); # remove trailing \n, if any
            $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
        }
    }

    return;
}
635
# Return a copy of the given string in which every backslash and every
# forward slash is preceded by a backslash, so that a file delimiter can be
# used as a literal inside a regular expression.
sub safe_escape_regexp
{
    my ($pattern) = @_;

    # one pass handles both characters: "\" becomes "\\" and "/" becomes "\/"
    $pattern =~ s{([\\/])}{\\$1}g;

    return $pattern;
}
647
# Read $GSDLHOME/macros/about.dm and return the text of its _content_ macro,
# i.e. everything from "_content_ {" up to the closing "</div>\n\n</div>\n}".
#
# Returns the empty string when the file cannot be opened or does not contain
# such a macro.  A failure to open the file is reported on
# $self->{'outhandle'}, or on STDERR when no outhandle is available (for
# example when called as a plain function).
sub read_content_from_about_dm
{
    my $self = shift(@_);

    my $about_macro_file = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "macros", "about.dm");
    my $about_page_content = "";
    if (open(my $about_in, "<", $about_macro_file)) {
        while (my $line = <$about_in>) {
            $about_page_content .= $line;
        }
        close($about_in);
    } else {
        my $outhandle = (ref($self) && defined $self->{'outhandle'}) ? $self->{'outhandle'} : \*STDERR;
        print $outhandle "can't open file $about_macro_file\n";
    }

    # extract the _content_ macro; $& is only meaningful after a successful
    # match, so never return it otherwise
    if ($about_page_content =~ m/_content_ \{(.|\n)*<\/div>\n\n<\/div>\n\}/i) {
        return $&;
    }

    return "";
}
670
6711;
Note: See TracBrowser for help on using the repository browser.