1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 | # This plugin is to process an HTML file where sections are divided by
|
---|
27 | # user-defined headings tags. As it is difficult to predict what user's definition
|
---|
28 | # this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
|
---|
29 | # as well as allows to get rid of user-defined Table of Content (TOC)...
|
---|
30 | # format:e.g. level1 (Abstract_title|ChapterTitle|Referencing Heading) level2(SectionHeading)...
|
---|
31 |
|
---|
32 | package MediaWikiPlug;
|
---|
33 |
|
---|
34 | use HTMLPlug;
|
---|
35 | use ImagePlug;
|
---|
36 | use File::Copy;
|
---|
37 |
|
---|
38 | #use strict; # every perl program should have this!
|
---|
39 | #no strict 'refs'; # make an exception so we can use variables as filehandles
|
---|
40 |
|
---|
41 | sub BEGIN {
|
---|
42 | @MediaWikiPlug::ISA = ('HTMLPlug');
|
---|
43 | }
|
---|
44 |
|
---|
45 | my $arguments =
|
---|
46 | [
|
---|
47 | { 'name' => "show_toc",
|
---|
48 | 'desc' => "{MediaWikiPlug.show_toc}",
|
---|
49 | 'type' => "flag",
|
---|
50 | 'reqd' => "no"},
|
---|
51 | { 'name' => "toc_exp",
|
---|
52 | 'desc' => "{MediaWikiPlug.toc_exp}",
|
---|
53 | 'type' => "regexp",
|
---|
54 | 'reqd' => "no",
|
---|
55 | 'deft' => "" },
|
---|
56 | { 'name' => "delete_toc",
|
---|
57 | 'desc' => "{MediaWikiPlug.delete_toc}",
|
---|
58 | 'type' => "flag",
|
---|
59 | 'reqd' => "no"},
|
---|
60 | { 'name' => "delete_nav",
|
---|
61 | 'desc' => "{MediaWikiPlug.delete_nav}",
|
---|
62 | 'type' => "flag",
|
---|
63 | 'reqd' => "no",
|
---|
64 | 'deft' => ""},
|
---|
65 | { 'name' => "nav_exp",
|
---|
66 | 'desc' => "{MediaWikiPlug.nav_exp}",
|
---|
67 | 'type' => "regexp",
|
---|
68 | 'reqd' => "no",
|
---|
69 | 'deft' => "" },
|
---|
70 | { 'name' => "tag_sections",
|
---|
71 | 'desc' => "{MediaWikiPlug.tag_sections}",
|
---|
72 | 'type' => "flag",
|
---|
73 | 'reqd' => "no"},
|
---|
74 | { 'name' => "description_tags",
|
---|
75 | 'desc' => "{HTMLPlug.description_tags}",
|
---|
76 | 'type' => "flag",
|
---|
77 | 'reqd' => "no"}
|
---|
78 | ];
|
---|
79 |
|
---|
80 |
|
---|
81 | my $options = { 'name' => "MediaWikiPlug",
|
---|
82 | 'desc' => "{MediaWikiPlug.desc}",
|
---|
83 | 'abstract' => "no",
|
---|
84 | 'inherits' => "yes",
|
---|
85 | 'args' => $arguments };
|
---|
86 |
|
---|
87 |
|
---|
88 | sub new {
|
---|
89 | my ($class) = shift (@_);
|
---|
90 | my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
|
---|
91 | push(@$pluginlist, $class);
|
---|
92 |
|
---|
93 | if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
|
---|
94 | if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
|
---|
95 |
|
---|
96 | my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
|
---|
97 | return bless $self, $class;
|
---|
98 | }
|
---|
99 |
|
---|
100 |
|
---|
101 |
|
---|
102 | sub process {
|
---|
103 | my $self = shift (@_);
|
---|
104 | my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
|
---|
105 | my $outhandle = $self->{'outhandle'};
|
---|
106 |
|
---|
107 | print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
|
---|
108 |
|
---|
109 | my @head_and_body = split(/<body/i,$$textref);
|
---|
110 | my $head = shift(@head_and_body);
|
---|
111 | my $body_text = join("<body", @head_and_body);
|
---|
112 |
|
---|
113 | $head =~ m/<title>(.+)<\/title>/i;
|
---|
114 | my $doctitle = $1 if defined $1;
|
---|
115 |
|
---|
116 | if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
|
---|
117 | my @doc_properties = split(/<xml>/i,$head);
|
---|
118 | my $doc_heading = shift(@doc_properties);
|
---|
119 | my $rest_doc_properties = join(" ", @doc_properties);
|
---|
120 |
|
---|
121 | my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
|
---|
122 | my $extracted_metadata = shift (@extracted_metadata);
|
---|
123 | $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
|
---|
124 | }
|
---|
125 |
|
---|
126 | # set the title here if we haven't found it yet
|
---|
127 | if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
|
---|
128 | if (defined $doctitle && $doctitle =~ /\S/) {
|
---|
129 | $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
|
---|
130 | } else {
|
---|
131 | $self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
|
---|
132 | }
|
---|
133 | }
|
---|
134 |
|
---|
135 | if(defined $base_dir && $base_dir ne ""){
|
---|
136 | # find and download stylesheet
|
---|
137 | my @css_files;
|
---|
138 | my $css_file_count = 0;
|
---|
139 | # find all the style sheets imported with import statement
|
---|
140 | while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
|
---|
141 | $css_files[$css_file_count++] = $2 if defined $2;
|
---|
142 | }
|
---|
143 |
|
---|
144 | # check whether the stylesheet exists
|
---|
145 | # if not, download it and copy to the collection's images folder
|
---|
146 | for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
|
---|
147 | my $css_file = $css_files[$css_file_count];
|
---|
148 | $css_file =~ s/^(.+)gli\/cache\///i;
|
---|
149 |
|
---|
150 | my $css_file_path = "$base_dir/$css_file";
|
---|
151 |
|
---|
152 | if (-e $css_file_path){ # the file already exists
|
---|
153 | next;
|
---|
154 | }
|
---|
155 |
|
---|
156 | # check the css directory and create one if it's not there
|
---|
157 | my @dirs = split(/\//i,$css_file);
|
---|
158 | my $path_check = "$base_dir/";
|
---|
159 | for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
|
---|
160 | $path_check .= $dirs[$i] . "/";
|
---|
161 | if(! -d $path_check ){
|
---|
162 | mkdir($path_check);
|
---|
163 | }
|
---|
164 | }
|
---|
165 |
|
---|
166 | # download
|
---|
167 | $css_file = "http://$css_file";
|
---|
168 | system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
|
---|
169 | if ($? != 0) {unlink("$css_file_path");}
|
---|
170 |
|
---|
171 | # change every style element to #wikispecificstyle ...
|
---|
172 | if(open(INPUT, "<$css_file_path")){
|
---|
173 | my $css_content;
|
---|
174 | while(my $line = <INPUT>){
|
---|
175 | if($line =~ m/^(.+)\{/i){
|
---|
176 | $line = "#wikispecificstyle " . $line;
|
---|
177 | }
|
---|
178 | $css_content .= $line;
|
---|
179 | }
|
---|
180 | close(INPUT);
|
---|
181 | open(OUTPUT, ">$css_file_path");
|
---|
182 | print OUTPUT $css_content;
|
---|
183 | close(OUTPUT);
|
---|
184 | }
|
---|
185 |
|
---|
186 | # copy to images folder
|
---|
187 | # do not copy, because collection can only have one specific stylesheet
|
---|
188 | # better to add and modify the style sheets manually
|
---|
189 | # @dirs = split(/\//i,$base_dir);
|
---|
190 | # my $collection_base_dir;
|
---|
191 | # for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
|
---|
192 | # $collection_base_dir .= $dirs[$i] . "/";
|
---|
193 | # }
|
---|
194 | # my $images_folder = $collection_base_dir . "images/";
|
---|
195 | # copy($css_file_path, $images_folder) || die "File cannot be copied.";
|
---|
196 | }
|
---|
197 | }
|
---|
198 |
|
---|
199 | # add sections around h2 tag
|
---|
200 | # wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
|
---|
201 | # add search box with each section
|
---|
202 | if ($self->{'tag_sections'}) {
|
---|
203 | my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
|
---|
204 | for(my $i=1; $i < scalar(@sections); $i++){
|
---|
205 | my $section_title = $sections[$i];
|
---|
206 | $section_title =~ s/<([^>]*)>//g;
|
---|
207 | $section_title =~ s/(^\s|\s$)//g;
|
---|
208 | my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
|
---|
209 | if($i !=1){
|
---|
210 | $section_metadata = "</Section>\n" . $section_metadata;
|
---|
211 | }
|
---|
212 | $section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
|
---|
213 |
|
---|
214 | $section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
|
---|
215 | $section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
|
---|
216 |
|
---|
217 | $body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
|
---|
218 |
|
---|
219 | if($i==scalar(@sections)-1) {
|
---|
220 | # $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
|
---|
221 | $body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
|
---|
222 | }
|
---|
223 | }
|
---|
224 | }
|
---|
225 |
|
---|
226 | # If delete_nav is enabled, it means to get rid of navigation contents.
|
---|
227 | # if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
|
---|
228 | # if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
|
---|
229 | # print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
|
---|
230 | # $body_text =~ s/$self->{'nav_exp'}//isg;
|
---|
231 | # }
|
---|
232 | #}
|
---|
233 | my $searchbox = "";
|
---|
234 | if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
|
---|
235 | my $nav_match_express;
|
---|
236 | if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
|
---|
237 | $nav_match_express = $self->{'nav_exp'} ;
|
---|
238 | } else { # default setting for mediawiki
|
---|
239 | $nav_match_express = "<div class=\"printfooter\">(.|\n)*secs. -->";
|
---|
240 | }
|
---|
241 |
|
---|
242 | print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
|
---|
243 |
|
---|
244 | # $body_text =~ m/<div class=\"printfooter\">(.|\n)*secs. -->/isg;
|
---|
245 | $body_text =~ m/$nav_match_express/isg;
|
---|
246 | my $navigate = $& if defined $&;
|
---|
247 |
|
---|
248 | # find the search box and add it to the document page
|
---|
249 | if(defined $navigate && $navigate =~ /\S/){
|
---|
250 | $navigate =~ m/<div id="p-search" class="portlet">(.|\n)*<\/form>/;
|
---|
251 | $searchbox = $& . "\n<\/div>\n<\/div>";
|
---|
252 | $searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
|
---|
253 | $searchbox =~ s/name="search"/name="q"/isg;
|
---|
254 | $searchbox =~ s/name="go"//isg;
|
---|
255 | $searchbox =~ s/name="fulltext"//isg;
|
---|
256 | my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
|
---|
257 | ."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
|
---|
258 | ."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
|
---|
259 | ."<input type=\"hidden\" name=\"t\" value=\"1\">";
|
---|
260 | $searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
|
---|
261 | $searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
|
---|
262 | }
|
---|
263 |
|
---|
264 | # $body_text =~ s/<div class=\"printfooter\">(.|\n)*secs. -->/$searchbox/isg;
|
---|
265 | $body_text =~ s/$nav_match_express/$searchbox/isg;
|
---|
266 | }
|
---|
267 |
|
---|
268 | if ($self->{'tag_sections'}) {
|
---|
269 | $body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
|
---|
270 | }
|
---|
271 |
|
---|
272 | # Tidy up extra new lines
|
---|
273 | $body_text =~ s/(<p[^>]*><span[^>]*><o:p> <\/o:p><\/span><\/p>)//isg;
|
---|
274 | $body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
|
---|
275 |
|
---|
276 | $section_text .= "<!--\n<Section>\n-->\n";
|
---|
277 | my $body = "<body".$body_text;
|
---|
278 |
|
---|
279 | $$textref = $body;
|
---|
280 |
|
---|
281 | # get the base dir for convert absolute links to relative links
|
---|
282 | $$textref =~ m"href=\"(.*?)/cache/(.*?)/"i;
|
---|
283 | my $basedir = $2;
|
---|
284 |
|
---|
285 | $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
|
---|
286 | $$textref =~ s/( )+/ /sg;
|
---|
287 |
|
---|
288 | # get rid of the [edit] button
|
---|
289 | $$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
|
---|
290 |
|
---|
291 | # get rid of the last time edit information at the bottom
|
---|
292 | $$textref =~ s/<a href="(.+)edit(.*?)"(.*?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
|
---|
293 |
|
---|
294 | # get rid of the (Redirected from ...)
|
---|
295 | $$textref =~ s/(Redirected from <a ([^>]*)>(\w|\s)*<\/a>)//isg;
|
---|
296 |
|
---|
297 | # escape macros
|
---|
298 | $$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
|
---|
299 | # may change the links, like Greenstone_Documentation_All.html, then change back
|
---|
300 | $$textref =~ s/<a([^>]*)_<span>([^>]*)<\/span>_/<a$1_$2_/isg;
|
---|
301 |
|
---|
302 | # convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
|
---|
303 | # get rid of the title attribute of a tag
|
---|
304 | $$textref =~ s/<a([^>]*)title="(.*?)"/<a$1/isg;
|
---|
305 | # find the relative path of current directory
|
---|
306 | if($basedir ne ""){
|
---|
307 | my @dirs=split("\/", $file);
|
---|
308 | my $dirnum = scalar(@dirs);
|
---|
309 | my $replace = "";
|
---|
310 | for(my $i=0; $i<$dirnum-2; $i++){
|
---|
311 | $replace .= "../";
|
---|
312 | }
|
---|
313 | # test if the linked relative file exists, if not, link to the internet version
|
---|
314 | $$textref =~ s/(href|src)="([^>]*)$basedir\/([^>]*)"/$1="$replace$3"/gi;
|
---|
315 | # my @total_links = ($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi);
|
---|
316 | # print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
|
---|
317 | # for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
|
---|
318 |
|
---|
319 | #while($$textref =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/gi){
|
---|
320 | #$total_links[$cur_link_no] =~ m/(href|src)="([^>]*)$basedir\/([^>]*)"/i;
|
---|
321 | # my $prefix = $1;
|
---|
322 | # my $link = $&;
|
---|
323 | # my $rel_file_name = $3;
|
---|
324 | # my $rel_link = "$replace$rel_file_name";
|
---|
325 | # print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
|
---|
326 | # if(-e $rel_link){
|
---|
327 | # $rel_link = "$prefix=\"$rel_link\"";
|
---|
328 | # $$textref =~ s/$link/$rel_link/i;
|
---|
329 | # }else{
|
---|
330 | # my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
|
---|
331 | # print $outhandle "external link==> $ext_link\n";
|
---|
332 | # $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
|
---|
333 | # }
|
---|
334 | #}
|
---|
335 |
|
---|
336 |
|
---|
337 | # tag the link to new wiki pages as red
|
---|
338 | $$textref =~ s/(href|src)="$replace([^>]*)&action=edit([^>]*)"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi;
|
---|
339 | $$textref =~ s/<a([^>]*)class="new"([^>]*)>/<a$1style="color:red"$2)>/gi;
|
---|
340 |
|
---|
341 | # tag the link to external pages as blue
|
---|
342 | $$textref =~ s/<a([^>]*)class='external text'([^>]*)>/<a$1style="color:blue"$2)>/gi;
|
---|
343 |
|
---|
344 | #print $outhandle $$textref;
|
---|
345 | }
|
---|
346 |
|
---|
347 | # if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
|
---|
348 | # 1. read _content_ macro from about.dm
|
---|
349 | # 2. append the toc, change all links to the Greenstone internal format for relative links
|
---|
350 | # 3. write to the extra.dm
|
---|
351 | # TODO: currently we suppose the _about:content_ hasn't been specified before
|
---|
352 | # so needs to add function to handle when the macro is already in the extra.dm
|
---|
353 | if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html|htm)$/){
|
---|
354 | my $macro_path = $base_dir;
|
---|
355 | $macro_path =~ s/import$/macros/;
|
---|
356 | my $extra_dm;
|
---|
357 | my $extradm_file = "$macro_path/extra.dm";
|
---|
358 | if(open(INPUT, "<$extradm_file")){
|
---|
359 | while(my $line = <INPUT>){
|
---|
360 | $extra_dm .= $line;
|
---|
361 | }
|
---|
362 | close(INPUT);
|
---|
363 |
|
---|
364 | if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
|
---|
365 | print $outhandle "already changed!!!!\n";
|
---|
366 | } else {
|
---|
367 | # read _content_ macro from about.dm file
|
---|
368 | my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
|
---|
369 | my $about_page_content = "";
|
---|
370 | if(open(INPUT, "<$about_macro")){
|
---|
371 | while(my $line=<INPUT>){
|
---|
372 | $about_page_content .= $line;
|
---|
373 | }
|
---|
374 | }else{
|
---|
375 | print $outhandle "can't open file $about_macro\n";
|
---|
376 | }
|
---|
377 | close(INPUT);
|
---|
378 |
|
---|
379 | # extract the _content_ macro
|
---|
380 | $about_page_content =~ m/_content_ {(.|\n)*<\/div>\n\n<\/div>\n}/i;
|
---|
381 | $about_page_content = $&;
|
---|
382 |
|
---|
383 | # extract toc of the Main_Page
|
---|
384 | my $mainpage_content = "";
|
---|
385 | if($self->{'toc_exp'} =~ /\S/){
|
---|
386 | $$textref =~ /$self->{'toc_exp'}/;
|
---|
387 | $mainpage_content = $&;
|
---|
388 | } else {
|
---|
389 | # $mainpage_content =~ s/<!-- start content -->(.|\n)*<!-- end content -->/$1/igs;
|
---|
390 | }
|
---|
391 | # print $outhandle "---------\n$$textref\n--------\n\n";
|
---|
392 | # print $outhandle "==========\n$mainpage_content\n==========\n\n";
|
---|
393 |
|
---|
394 | # add toc to the _content_ macro
|
---|
395 | $about_page_content =~ m/{(.|\n)*<\/div>\n\n/;
|
---|
396 | $extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
|
---|
397 |
|
---|
398 | # change all links to the internal Greenstone relative link format
|
---|
399 | $extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
|
---|
400 | $extra_dm =~ s/(\.\.\/)+/\//isg;
|
---|
401 | # print $outhandle "to add---------\n$extra_dm\n--------\n";
|
---|
402 |
|
---|
403 | # write to the extra.dm file of the collection
|
---|
404 | open(OUTPUT, ">$extradm_file");
|
---|
405 | print OUTPUT $extra_dm;
|
---|
406 | close(OUTPUT);
|
---|
407 | }
|
---|
408 | } else {
|
---|
409 | print $outhandle "can't open file $extradm_file\n";
|
---|
410 | }
|
---|
411 | }
|
---|
412 |
|
---|
413 | # If delete_toc is enabled, it means to get rid of toc and tof contents.
|
---|
414 | # get rid of TOC and TOF sections and their title
|
---|
415 | if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
|
---|
416 | if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
|
---|
417 | # $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
|
---|
418 | # print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
|
---|
419 | # $body_text =~ s/$self->{'toc_exp'}//i;
|
---|
420 | print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
|
---|
421 | $$textref =~ s/$self->{'toc_exp'}//i;
|
---|
422 | }
|
---|
423 | }
|
---|
424 |
|
---|
425 | # To add a layer on top of the wiki page
|
---|
426 | # so as to keep the wiki style inside the wiki page
|
---|
427 | # and keep the Greenstone style at the same time
|
---|
428 | $$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
|
---|
429 | $$textref =~ s/<\/body>/<\/div><\/body>/is;
|
---|
430 |
|
---|
431 | # tag with sections
|
---|
432 | $$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
|
---|
433 | $$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
|
---|
434 |
|
---|
435 | #print $outhandle "\n\n$$textref\n\n";
|
---|
436 |
|
---|
437 | # use description tags
|
---|
438 | if ($self->{'description_tags'}) {
|
---|
439 | my $cursection = $doc_obj->get_top_section();
|
---|
440 | # remove the html header - note that doing this here means any
|
---|
441 | # sections defined within the header will be lost (so all <Section>
|
---|
442 | # tags must appear within the body of the HTML)
|
---|
443 | my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is);
|
---|
444 |
|
---|
445 | $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
446 | $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
447 |
|
---|
448 | my $opencom = '(?:<!--|<!(?:—|—|--))';
|
---|
449 | my $closecom = '(?:-->|(?:—|—|--)>)';
|
---|
450 |
|
---|
451 | my $lt = '(?:<|<)';
|
---|
452 | my $gt = '(?:>|>)';
|
---|
453 | my $quot = '(?:"|"|”|“)';
|
---|
454 |
|
---|
455 | # my $dont_strip = '';
|
---|
456 | # if ($self->{'no_strip_metadata_html'}) {
|
---|
457 | # ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
|
---|
458 | # }
|
---|
459 |
|
---|
460 | my $found_something = 0;
|
---|
461 | my $top = 1;
|
---|
462 | while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
|
---|
463 | my $text = $1;
|
---|
464 | my $comment = $2;
|
---|
465 | if (defined $text) {
|
---|
466 | # text before a comment - note that getting to here
|
---|
467 | # doesn't necessarily mean there are Section tags in
|
---|
468 | # the document
|
---|
469 | # print $outhandle "section text:\n$text\n";
|
---|
470 | $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
|
---|
471 | }
|
---|
472 | while ($comment =~ s/$lt(.*?)$gt//s) {
|
---|
473 | my $tag = $1;
|
---|
474 | if ($tag eq "Section") {
|
---|
475 | $found_something = 1;
|
---|
476 | $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
|
---|
477 | $top = 0;
|
---|
478 | } elsif ($tag eq "/Section") {
|
---|
479 | $found_something = 1;
|
---|
480 | $cursection = $doc_obj->get_parent_section ($cursection);
|
---|
481 | } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
|
---|
482 | my $metaname = $1;
|
---|
483 | my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
|
---|
484 | $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
|
---|
485 | my $metavalue = $1;
|
---|
486 | $metavalue =~ s/^\s+//;
|
---|
487 | $metavalue =~ s/\s+$//;
|
---|
488 | # assume that no metadata value intentionally includes
|
---|
489 | # carriage returns or HTML tags (if they're there they
|
---|
490 | # were probably introduced when converting to HTML from
|
---|
491 | # some other format).
|
---|
492 | # actually some people want to have html tags in their
|
---|
493 | # metadata.
|
---|
494 | $metavalue =~ s/[\cJ\cM]/ /sg;
|
---|
495 | # $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
|
---|
496 | $metavalue =~ s/\s+/ /sg;
|
---|
497 | # print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
|
---|
498 | if ($accumulate) {
|
---|
499 | $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
|
---|
500 | } else {
|
---|
501 | $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
|
---|
502 | }
|
---|
503 | } elsif ($tag eq "Description" || $tag eq "/Description") {
|
---|
504 | # do nothing with containing Description tags
|
---|
505 | } else {
|
---|
506 | # simple HTML tag (probably created by the conversion
|
---|
507 | # to HTML from some other format) - we'll ignore it and
|
---|
508 | # hope for the best ;-)
|
---|
509 | }
|
---|
510 | }
|
---|
511 | }# end while
|
---|
512 |
|
---|
513 | if ($cursection ne "") {
|
---|
514 | print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
|
---|
515 | }
|
---|
516 |
|
---|
517 | $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
518 | $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
519 | if ($$textref =~ /\S/) {
|
---|
520 | if (!$found_something) {
|
---|
521 | if ($self->{'verbosity'} > 2) {
|
---|
522 | print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
|
---|
523 | print $outhandle " will be processed as a single section document\n";
|
---|
524 | }
|
---|
525 |
|
---|
526 | # go ahead and process single-section document
|
---|
527 | $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
|
---|
528 |
|
---|
529 | } else {
|
---|
530 | print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
|
---|
531 | print $outhandle " of the final closing </Section> tag. This text will\n";
|
---|
532 | print $outhandle " be ignored.";
|
---|
533 |
|
---|
534 | my ($text);
|
---|
535 | if (length($$textref) > 30) {
|
---|
536 | $text = substr($$textref, 0, 30) . "...";
|
---|
537 | } else {
|
---|
538 | $text = $$textref;
|
---|
539 | }
|
---|
540 | $text =~ s/\n/ /isg;
|
---|
541 | print $outhandle " ($text)\n";
|
---|
542 | }
|
---|
543 | } elsif (!$found_something) {
|
---|
544 | if ($self->{'verbosity'} > 2) {
|
---|
545 | # may get to here if document contained no valid Section
|
---|
546 | # tags but did contain some comments. The text will have
|
---|
547 | # been processed already but we should print the warning
|
---|
548 | # as above and extract metadata
|
---|
549 | print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
|
---|
550 | print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
|
---|
551 | }
|
---|
552 | }
|
---|
553 | } # if $self->{'description_tags'}
|
---|
554 | else {
|
---|
555 | # remove header and footer
|
---|
556 | # if (!$self->{'keep_head'}) {
|
---|
557 | # $$textref =~ s/^.*?<body[^>]*>//is;
|
---|
558 | # $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
|
---|
559 | # }
|
---|
560 |
|
---|
561 | # single section document
|
---|
562 | # $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
|
---|
563 |
|
---|
564 | # Important: to get the relative links to work,
|
---|
565 | # 1: use the below statement instead of the above one
|
---|
566 | # 2. cannot have process_section method.
|
---|
567 | # why?????
|
---|
568 | $self->SUPER::process(@_);
|
---|
569 | }
|
---|
570 | return 1;
|
---|
571 |
|
---|
572 | #$self->SUPER::process(@_);
|
---|
573 | }
|
---|
574 |
|
---|
575 |
|
---|
576 |
|
---|
577 | # note that process_section may be called multiple times for a single
|
---|
578 | # section (relying on the fact that add_utf8_text appends the text to any
|
---|
579 | # that may exist already).
|
---|
580 | # sub process_section {
|
---|
581 | # my $self = shift (@_);
|
---|
582 | # my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
|
---|
583 |
|
---|
584 | # trap links
|
---|
585 | # if (!$self->{'nolinks'}) {
|
---|
586 | # usemap="./#index" not handled correctly => change to "#index"
|
---|
587 | # $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
|
---|
588 | #$self->replace_usemap_links($1, $2, $3)/isge;
|
---|
589 |
|
---|
590 | #$$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
|
---|
591 | #$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
|
---|
592 | #}
|
---|
593 |
|
---|
594 | # trap images
|
---|
595 |
|
---|
596 | # allow spaces if inside quotes - jrm21
|
---|
597 | #$$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
|
---|
598 | #$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
|
---|
599 |
|
---|
600 | # add text to document object
|
---|
601 | # turn \ into \\ so that the rest of greenstone doesn't think there
|
---|
602 | # is an escape code following. (Macro parsing loses them...)
|
---|
603 | # $$textref =~ s/\\/\\\\/go;
|
---|
604 |
|
---|
605 | # $doc_obj->add_utf8_text($cursection, $$textref);
|
---|
606 | #}
|
---|
607 |
|
---|
608 |
|
---|
609 | sub extract_metadata
|
---|
610 | {
|
---|
611 | my $self = shift (@_);
|
---|
612 | my ($textref, $metadata, $doc_obj) = @_;
|
---|
613 | my $outhandle = $self->{'outhandle'};
|
---|
614 |
|
---|
615 | return if (!defined $textref);
|
---|
616 |
|
---|
617 | # metadata fields to extract/save. 'key' is the (lowercase) name of the
|
---|
618 | # html meta, 'value' is the metadata name for greenstone to use
|
---|
619 | my %find_fields = ();
|
---|
620 | my ($tag,$value);
|
---|
621 |
|
---|
622 | my $orig_field = "";
|
---|
623 | foreach my $field (split /,/, $self->{'metadata_fields'}) {
|
---|
624 | # support tag<tagname>
|
---|
625 | if ($field =~ /^(.*?)<(.*?)>$/) {
|
---|
626 | # "$2" is the user's preferred gs metadata name
|
---|
627 | $find_fields{lc($1)}=$2; # lc = lowercase
|
---|
628 | $orig_field = $1;
|
---|
629 | } else { # no <tagname> for mapping
|
---|
630 | # "$field" is the user's preferred gs metadata name
|
---|
631 | $find_fields{lc($field)}=$field; # lc = lowercase
|
---|
632 | $orig_field = $field;
|
---|
633 | }
|
---|
634 |
|
---|
635 | if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
|
---|
636 | $tag = $orig_field;
|
---|
637 | $value = $1;
|
---|
638 | if (!defined $value || !defined $tag){
|
---|
639 | #print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
|
---|
640 | next;
|
---|
641 | } else {
|
---|
642 | # clean up and add
|
---|
643 | chomp($value); # remove trailing \n, if any
|
---|
644 | $tag = $find_fields{lc($tag)};
|
---|
645 | #print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
|
---|
646 | # if ($self->{'verbosity'} > 2);
|
---|
647 | $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
|
---|
648 | }
|
---|
649 | }
|
---|
650 | }
|
---|
651 | }
|
---|
652 |
|
---|
653 | 1;
|
---|