Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/plugins/MediaWikiPlug.pm@ 14108

Last change on this file since 14108 was 14108, checked in by lh92, 17 years ago
Plugin for processing MediaWiki pages
Property svn:keywords set to `Author Date Id Revision`
File size: 24.3 KB

Line
1	###########################################################################
2	#
3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26	# This plugin is to process an HTML file where sections are divided by
27	# user-defined headings tags. As it is difficult to predict what user's definition
28	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
29	# as well as allows to get rid of user-defined Table of Content (TOC)...
30	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
31
32	package MediaWikiPlug;
33
34	use HTMLPlug;
35	use ImagePlug;
36	use File::Copy;
37
38	#use strict; # every perl program should have this!
39	#no strict 'refs'; # make an exception so we can use variables as filehandles
40
41	sub BEGIN {
42	@MediaWikiPlug::ISA = ('HTMLPlug');
43	}
44
45	my $arguments =
46	[
47	{ 'name' => "show_toc",
48	'desc' => "{MediaWikiPlug.show_toc}",
49	'type' => "flag",
50	'reqd' => "no"},
51	{ 'name' => "toc_exp",
52	'desc' => "{MediaWikiPlug.toc_exp}",
53	'type' => "regexp",
54	'reqd' => "no",
55	'deft' => "" },
56	{ 'name' => "delete_toc",
57	'desc' => "{MediaWikiPlug.delete_toc}",
58	'type' => "flag",
59	'reqd' => "no"},
60	{ 'name' => "delete_nav",
61	'desc' => "{MediaWikiPlug.delete_nav}",
62	'type' => "flag",
63	'reqd' => "no",
64	'deft' => ""},
65	{ 'name' => "nav_exp",
66	'desc' => "{MediaWikiPlug.nav_exp}",
67	'type' => "regexp",
68	'reqd' => "no",
69	'deft' => "" },
70	{ 'name' => "tag_sections",
71	'desc' => "{MediaWikiPlug.tag_sections}",
72	'type' => "flag",
73	'reqd' => "no"},
74	{ 'name' => "description_tags",
75	'desc' => "{HTMLPlug.description_tags}",
76	'type' => "flag",
77	'reqd' => "no"}
78	];
79
80
81	my $options = { 'name' => "MediaWikiPlug",
82	'desc' => "{MediaWikiPlug.desc}",
83	'abstract' => "no",
84	'inherits' => "yes",
85	'args' => $arguments };
86
87
88	sub new {
89	my ($class) = shift (@_);
90	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
91	push(@$pluginlist, $class);
92
93	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
94	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
95
96	my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
97	return bless $self, $class;
98	}
99
100
101
102	sub process {
103	my $self = shift (@_);
104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
105	my $outhandle = $self->{'outhandle'};
106
107	print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
108
109	my @head_and_body = split(/<body/i,$$textref);
110	my $head = shift(@head_and_body);
111	my $body_text = join("<body", @head_and_body);
112
113	$head =~ m/<title>(.+)<\/title>/i;
114	my $doctitle = $1 if defined $1;
115
116	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
117	my @doc_properties = split(/<xml>/i,$head);
118	my $doc_heading = shift(@doc_properties);
119	my $rest_doc_properties = join(" ", @doc_properties);
120
121	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
122	my $extracted_metadata = shift (@extracted_metadata);
123	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
124	}
125
126	# set the title here if we haven't found it yet
127	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
128	if (defined $doctitle && $doctitle =~ /\S/) {
129	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
130	} else {
131	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
132	}
133	}
134
135	if(defined $base_dir && $base_dir ne ""){
136	# find and download stylesheet
137	my @css_files;
138	my $css_file_count = 0;
139	# find all the style sheets imported with import statement
140	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
141	$css_files[$css_file_count++] = $2 if defined $2;
142	}
143
144	# check whether the stylesheet exists
145	# if not, download it and copy to the collection's images folder
146	for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
147	my $css_file = $css_files[$css_file_count];
148	$css_file =~ s/^(.+)gli\/cache\///i;
149
150	my $css_file_path = "$base_dir/$css_file";
151
152	if (-e $css_file_path){ # the file already exists
153	next;
154	}
155
156	# check the css directory and create one if it's not there
157	my @dirs = split(/\//i,$css_file);
158	my $path_check = "$base_dir/";
159	for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
160	$path_check .= $dirs[$i] . "/";
161	if(! -d $path_check ){
162	mkdir($path_check);
163	}
164	}
165
166	# download
167	$css_file = "http://$css_file";
168	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
169	if ($? != 0) {unlink("$css_file_path");}
170
171	# change every style element to #wikispecificstyle ...
172	if(open(INPUT, "<$css_file_path")){
173	my $css_content;
174	while(my $line = <INPUT>){
175	if($line =~ m/^(.+)\{/i){
176	$line = "#wikispecificstyle " . $line;
177	}
178	$css_content .= $line;
179	}
180	close(INPUT);
181	open(OUTPUT, ">$css_file_path");
182	print OUTPUT $css_content;
183	close(OUTPUT);
184	}
185
186	# copy to images folder
187	# do not copy, because collection can only have one specific stylesheet
188	# better to add and modify the style sheets manually
189	# @dirs = split(/\//i,$base_dir);
190	# my $collection_base_dir;
191	# for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
192	# $collection_base_dir .= $dirs[$i] . "/";
193	# }
194	# my $images_folder = $collection_base_dir . "images/";
195	# copy($css_file_path, $images_folder) \|\| die "File cannot be copied.";
196	}
197	}
198
199	# add sections around h2 tag
200	# wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
201	# add search box with each section
202	if ($self->{'tag_sections'}) {
203	my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
204	for(my $i=1; $i < scalar(@sections); $i++){
205	my $section_title = $sections[$i];
206	$section_title =~ s/<([^>]*)>//g;
207	$section_title =~ s/(^\s\|\s$)//g;
208	my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
209	if($i !=1){
210	$section_metadata = "</Section>\n" . $section_metadata;
211	}
212	$section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
213
214	$section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
215	$section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
216
217	$body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
218
219	if($i==scalar(@sections)-1) {
220	# $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
221	$body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
222	}
223	}
224	}
225
226	# If delete_nav is enabled, it means to get rid of navigation contents.
227	# if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
228	# if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
229	# print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
230	# $body_text =~ s/$self->{'nav_exp'}//isg;
231	# }
232	#}
233	my $searchbox = "";
234	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
235	my $nav_match_express;
236	if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
237	$nav_match_express = $self->{'nav_exp'} ;
238	} else { # default setting for mediawiki
239	$nav_match_express = "<div class=\"printfooter\">(.\|\n)*secs. -->";
240	}
241
242	print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
243
244	# $body_text =~ m/<div class=\"printfooter\">(.\|\n)*secs. -->/isg;
245	$body_text =~ m/$nav_match_express/isg;
246	my $navigate = $& if defined $&;
247
248	# find the search box and add it to the document page
249	if(defined $navigate && $navigate =~ /\S/){
250	$navigate =~ m/<div id="p-search" class="portlet">(.\|\n)*<\/form>/;
251	$searchbox = $& . "\n<\/div>\n<\/div>";
252	$searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
253	$searchbox =~ s/name="search"/name="q"/isg;
254	$searchbox =~ s/name="go"//isg;
255	$searchbox =~ s/name="fulltext"//isg;
256	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
257	."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
258	."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
259	."<input type=\"hidden\" name=\"t\" value=\"1\">";
260	$searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
261	$searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
262	}
263
264	# $body_text =~ s/<div class=\"printfooter\">(.\|\n)*secs. -->/$searchbox/isg;
265	$body_text =~ s/$nav_match_express/$searchbox/isg;
266	}
267
268	if ($self->{'tag_sections'}) {
269	$body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
270	}
271
272	# Tidy up extra new lines
273	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
274	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
275
276	$section_text .= "<!--\n<Section>\n-->\n";
277	my $body = "<body".$body_text;
278
279	$$textref = $body;
280
281	# get the base dir for convert absolute links to relative links
282	$$textref =~ m"href=\"(.?)/cache/(.?)/"i;
283	my $basedir = $2;
284
285	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
286	$$textref =~ s/( )+/ /sg;
287
288	# get rid of the [edit] button
289	$$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
290
291	# get rid of the last time edit information at the bottom
292	$$textref =~ s/<a href="(.+)edit(.?)"(.?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
293
294	# get rid of the (Redirected from ...)
295	$$textref =~ s/(Redirected from <a ([^>])>(\w\|\s)<\/a>)//isg;
296
297	# escape macros
298	$$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
299	# may change the links, like Greenstone_Documentation_All.html, then change back
300	$$textref =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
301
302	# convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
303	# get rid of the title attribute of a tag
304	$$textref =~ s/<a([^>])title="(.?)"/<a$1/isg;
305	# find the relative path of current directory
306	if($basedir ne ""){
307	my @dirs=split("\/", $file);
308	my $dirnum = scalar(@dirs);
309	my $replace = "";
310	for(my $i=0; $i<$dirnum-2; $i++){
311	$replace .= "../";
312	}
313	# test if the linked relative file exists, if not, link to the internet version
314	$$textref =~ s/(href\|src)="([^>])$basedir\/([^>])"/$1="$replace$3"/gi;
315	# my @total_links = ($$textref =~ m/(href\|src)="([^>])$basedir\/([^>])"/gi);
316	# print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
317	# for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
318
319	#while($$textref =~ m/(href\|src)="([^>])$basedir\/([^>])"/gi){
320	#$total_links[$cur_link_no] =~ m/(href\|src)="([^>])$basedir\/([^>])"/i;
321	# my $prefix = $1;
322	# my $link = $&;
323	# my $rel_file_name = $3;
324	# my $rel_link = "$replace$rel_file_name";
325	# print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
326	# if(-e $rel_link){
327	# $rel_link = "$prefix=\"$rel_link\"";
328	# $$textref =~ s/$link/$rel_link/i;
329	# }else{
330	# my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
331	# print $outhandle "external link==> $ext_link\n";
332	# $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
333	# }
334	#}
335
336
337	# tag the link to new wiki pages as red
338	$$textref =~ s/(href\|src)="$replace([^>])&action=edit([^>])"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi;
339	$$textref =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
340
341	# tag the link to external pages as blue
342	$$textref =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
343
344	#print $outhandle $$textref;
345	}
346
347	# if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
348	# 1. read _content_ macro from about.dm
349	# 2. append the toc, change all links to the Greenstone internal format for relative links
350	# 3. write to the extra.dm
351	# TODO: currently we suppose the _about:content_ hasn't been specified before
352	# so needs to add function to handle when the macro is already in the extra.dm
353	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
354	my $macro_path = $base_dir;
355	$macro_path =~ s/import$/macros/;
356	my $extra_dm;
357	my $extradm_file = "$macro_path/extra.dm";
358	if(open(INPUT, "<$extradm_file")){
359	while(my $line = <INPUT>){
360	$extra_dm .= $line;
361	}
362	close(INPUT);
363
364	if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
365	print $outhandle "already changed!!!!\n";
366	} else {
367	# read _content_ macro from about.dm file
368	my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
369	my $about_page_content = "";
370	if(open(INPUT, "<$about_macro")){
371	while(my $line=<INPUT>){
372	$about_page_content .= $line;
373	}
374	}else{
375	print $outhandle "can't open file $about_macro\n";
376	}
377	close(INPUT);
378
379	# extract the _content_ macro
380	$about_page_content =~ m/_content_ {(.\|\n)*<\/div>\n\n<\/div>\n}/i;
381	$about_page_content = $&;
382
383	# extract toc of the Main_Page
384	my $mainpage_content = "";
385	if($self->{'toc_exp'} =~ /\S/){
386	$$textref =~ /$self->{'toc_exp'}/;
387	$mainpage_content = $&;
388	} else {
389	# $mainpage_content =~ s/<!-- start content -->(.\|\n)*<!-- end content -->/$1/igs;
390	}
391	# print $outhandle "---------\n$$textref\n--------\n\n";
392	# print $outhandle "==========\n$mainpage_content\n==========\n\n";
393
394	# add toc to the _content_ macro
395	$about_page_content =~ m/{(.\|\n)*<\/div>\n\n/;
396	$extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
397
398	# change all links to the internal Greenstone relative link format
399	$extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
400	$extra_dm =~ s/(\.\.\/)+/\//isg;
401	# print $outhandle "to add---------\n$extra_dm\n--------\n";
402
403	# write to the extra.dm file of the collection
404	open(OUTPUT, ">$extradm_file");
405	print OUTPUT $extra_dm;
406	close(OUTPUT);
407	}
408	} else {
409	print $outhandle "can't open file $extradm_file\n";
410	}
411	}
412
413	# If delete_toc is enabled, it means to get rid of toc and tof contents.
414	# get rid of TOC and TOF sections and their title
415	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
416	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
417	# $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
418	# print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
419	# $body_text =~ s/$self->{'toc_exp'}//i;
420	print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
421	$$textref =~ s/$self->{'toc_exp'}//i;
422	}
423	}
424
425	# To add a layer on top of the wiki page
426	# so as to keep the wiki style inside the wiki page
427	# and keep the Greenstone style at the same time
428	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
429	$$textref =~ s/<\/body>/<\/div><\/body>/is;
430
431	# tag with sections
432	$$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
433	$$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
434
435	#print $outhandle "\n\n$$textref\n\n";
436
437	# use description tags
438	if ($self->{'description_tags'}) {
439	my $cursection = $doc_obj->get_top_section();
440	# remove the html header - note that doing this here means any
441	# sections defined within the header will be lost (so all <Section>
442	# tags must appear within the body of the HTML)
443	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
444
445	$$textref =~ s/^.?<body[^>]>//is;
446	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
447
448	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
449	my $closecom = '(?:-->\|(?:—\|\|--)>)';
450
451	my $lt = '(?:<\|<)';
452	my $gt = '(?:>\|>)';
453	my $quot = '(?:"\|"\|”\|“)';
454
455	# my $dont_strip = '';
456	# if ($self->{'no_strip_metadata_html'}) {
457	# ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
458	# }
459
460	my $found_something = 0;
461	my $top = 1;
462	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
463	my $text = $1;
464	my $comment = $2;
465	if (defined $text) {
466	# text before a comment - note that getting to here
467	# doesn't necessarily mean there are Section tags in
468	# the document
469	# print $outhandle "section text:\n$text\n";
470	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
471	}
472	while ($comment =~ s/$lt(.*?)$gt//s) {
473	my $tag = $1;
474	if ($tag eq "Section") {
475	$found_something = 1;
476	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
477	$top = 0;
478	} elsif ($tag eq "/Section") {
479	$found_something = 1;
480	$cursection = $doc_obj->get_parent_section ($cursection);
481	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
482	my $metaname = $1;
483	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
484	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
485	my $metavalue = $1;
486	$metavalue =~ s/^\s+//;
487	$metavalue =~ s/\s+$//;
488	# assume that no metadata value intentionally includes
489	# carriage returns or HTML tags (if they're there they
490	# were probably introduced when converting to HTML from
491	# some other format).
492	# actually some people want to have html tags in their
493	# metadata.
494	$metavalue =~ s/[\cJ\cM]/ /sg;
495	# $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
496	$metavalue =~ s/\s+/ /sg;
497	# print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
498	if ($accumulate) {
499	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
500	} else {
501	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
502	}
503	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
504	# do nothing with containing Description tags
505	} else {
506	# simple HTML tag (probably created by the conversion
507	# to HTML from some other format) - we'll ignore it and
508	# hope for the best ;-)
509	}
510	}
511	}# end while
512
513	if ($cursection ne "") {
514	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
515	}
516
517	$$textref =~ s/^.?<body[^>]>//is;
518	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
519	if ($$textref =~ /\S/) {
520	if (!$found_something) {
521	if ($self->{'verbosity'} > 2) {
522	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
523	print $outhandle " will be processed as a single section document\n";
524	}
525
526	# go ahead and process single-section document
527	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
528
529	} else {
530	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
531	print $outhandle " of the final closing </Section> tag. This text will\n";
532	print $outhandle " be ignored.";
533
534	my ($text);
535	if (length($$textref) > 30) {
536	$text = substr($$textref, 0, 30) . "...";
537	} else {
538	$text = $$textref;
539	}
540	$text =~ s/\n/ /isg;
541	print $outhandle " ($text)\n";
542	}
543	} elsif (!$found_something) {
544	if ($self->{'verbosity'} > 2) {
545	# may get to here if document contained no valid Section
546	# tags but did contain some comments. The text will have
547	# been processed already but we should print the warning
548	# as above and extract metadata
549	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
550	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
551	}
552	}
553	} # if $self->{'description_tags'}
554	else {
555	# remove header and footer
556	# if (!$self->{'keep_head'}) {
557	# $$textref =~ s/^.?<body[^>]>//is;
558	# $$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
559	# }
560
561	# single section document
562	# $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
563
564	# Important: to get the relative links to work,
565	# 1: use the below statement instead of the above one
566	# 2. cannot have process_section method.
567	# why?????
568	$self->SUPER::process(@_);
569	}
570	return 1;
571
572	#$self->SUPER::process(@_);
573	}
574
575
576
577	# note that process_section may be called multiple times for a single
578	# section (relying on the fact that add_utf8_text appends the text to any
579	# that may exist already).
580	# sub process_section {
581	# my $self = shift (@_);
582	# my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
583
584	# trap links
585	# if (!$self->{'nolinks'}) {
586	# usemap="./#index" not handled correctly => change to "#index"
587	# $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
588	#$self->replace_usemap_links($1, $2, $3)/isge;
589
590	#$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
591	#$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
592	#}
593
594	# trap images
595
596	# allow spaces if inside quotes - jrm21
597	#$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)([\"\'][^\"\']+[\"\']\|[^\s>]+)([^>]>)/
598	#$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
599
600	# add text to document object
601	# turn \ into \\ so that the rest of greenstone doesn't think there
602	# is an escape code following. (Macro parsing loses them...)
603	# $$textref =~ s/\\/\\\\/go;
604
605	# $doc_obj->add_utf8_text($cursection, $$textref);
606	#}
607
608
609	sub extract_metadata
610	{
611	my $self = shift (@_);
612	my ($textref, $metadata, $doc_obj) = @_;
613	my $outhandle = $self->{'outhandle'};
614
615	return if (!defined $textref);
616
617	# metadata fields to extract/save. 'key' is the (lowercase) name of the
618	# html meta, 'value' is the metadata name for greenstone to use
619	my %find_fields = ();
620	my ($tag,$value);
621
622	my $orig_field = "";
623	foreach my $field (split /,/, $self->{'metadata_fields'}) {
624	# support tag<tagname>
625	if ($field =~ /^(.?)<(.?)>$/) {
626	# "$2" is the user's preferred gs metadata name
627	$find_fields{lc($1)}=$2; # lc = lowercase
628	$orig_field = $1;
629	} else { # no <tagname> for mapping
630	# "$field" is the user's preferred gs metadata name
631	$find_fields{lc($field)}=$field; # lc = lowercase
632	$orig_field = $field;
633	}
634
635	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
636	$tag = $orig_field;
637	$value = $1;
638	if (!defined $value \|\| !defined $tag){
639	#print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
640	next;
641	} else {
642	# clean up and add
643	chomp($value); # remove trailing \n, if any
644	$tag = $find_fields{lc($tag)};
645	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
646	# if ($self->{'verbosity'} > 2);
647	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
648	}
649	}
650	}
651	}
652
653	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: