Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/perllib/plugins/MediaWikiPlug.pm@ 14108

Last change on this file since 14108 was 14108, checked in by lh92, 17 years ago
Plugin for processing MediaWiki pages
Property svn:keywords set to `Author Date Id Revision`
File size: 24.3 KB

Rev	Line
[14108]	1	###########################################################################
	2	#
	3	# MediaWikiPlug.pm -- html plugin with extra facilities for wiki page
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26	# This plugin is to process an HTML file where sections are divided by
	27	# user-defined headings tags. As it is difficult to predict what user's definition
	28	# this plugin allows to detect the user-defined titles up to three levels (level1, level2, level3...)
	29	# as well as allows to get rid of user-defined Table of Content (TOC)...
	30	# format:e.g. level1 (Abstract_title\|ChapterTitle\|Referencing Heading) level2(SectionHeading)...
	31
	32	package MediaWikiPlug;
	33
	34	use HTMLPlug;
	35	use ImagePlug;
	36	use File::Copy;
	37
	38	#use strict; # every perl program should have this!
	39	#no strict 'refs'; # make an exception so we can use variables as filehandles
	40
	41	sub BEGIN {
	42	@MediaWikiPlug::ISA = ('HTMLPlug');
	43	}
	44
	45	my $arguments =
	46	[
	47	{ 'name' => "show_toc",
	48	'desc' => "{MediaWikiPlug.show_toc}",
	49	'type' => "flag",
	50	'reqd' => "no"},
	51	{ 'name' => "toc_exp",
	52	'desc' => "{MediaWikiPlug.toc_exp}",
	53	'type' => "regexp",
	54	'reqd' => "no",
	55	'deft' => "" },
	56	{ 'name' => "delete_toc",
	57	'desc' => "{MediaWikiPlug.delete_toc}",
	58	'type' => "flag",
	59	'reqd' => "no"},
	60	{ 'name' => "delete_nav",
	61	'desc' => "{MediaWikiPlug.delete_nav}",
	62	'type' => "flag",
	63	'reqd' => "no",
	64	'deft' => ""},
	65	{ 'name' => "nav_exp",
	66	'desc' => "{MediaWikiPlug.nav_exp}",
	67	'type' => "regexp",
	68	'reqd' => "no",
	69	'deft' => "" },
	70	{ 'name' => "tag_sections",
	71	'desc' => "{MediaWikiPlug.tag_sections}",
	72	'type' => "flag",
	73	'reqd' => "no"},
	74	{ 'name' => "description_tags",
	75	'desc' => "{HTMLPlug.description_tags}",
	76	'type' => "flag",
	77	'reqd' => "no"}
	78	];
	79
	80
	81	my $options = { 'name' => "MediaWikiPlug",
	82	'desc' => "{MediaWikiPlug.desc}",
	83	'abstract' => "no",
	84	'inherits' => "yes",
	85	'args' => $arguments };
	86
	87
	88	sub new {
	89	my ($class) = shift (@_);
	90	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	91	push(@$pluginlist, $class);
	92
	93	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	94	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
	95
	96	my $self = new HTMLPlug($pluginlist, $inputargs, $hashArgOptLists);
	97	return bless $self, $class;
	98	}
	99
	100
	101
	102	sub process {
	103	my $self = shift (@_);
	104	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
	105	my $outhandle = $self->{'outhandle'};
	106
	107	print $outhandle "MediaWikiPlug: processing $file\n" if $self->{'verbosity'} > 1;
	108
	109	my @head_and_body = split(/<body/i,$$textref);
	110	my $head = shift(@head_and_body);
	111	my $body_text = join("<body", @head_and_body);
	112
	113	$head =~ m/<title>(.+)<\/title>/i;
	114	my $doctitle = $1 if defined $1;
	115
	116	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'}=~ /\S/) {
	117	my @doc_properties = split(/<xml>/i,$head);
	118	my $doc_heading = shift(@doc_properties);
	119	my $rest_doc_properties = join(" ", @doc_properties);
	120
	121	my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
	122	my $extracted_metadata = shift (@extracted_metadata);
	123	$self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
	124	}
	125
	126	# set the title here if we haven't found it yet
	127	if (!defined $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "Title")) {
	128	if (defined $doctitle && $doctitle =~ /\S/) {
	129	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Title", $doctitle);
	130	} else {
	131	$self->title_fallback($doc_obj,$doc_obj->get_top_section(),$file);
	132	}
	133	}
	134
	135	if(defined $base_dir && $base_dir ne ""){
	136	# find and download stylesheet
	137	my @css_files;
	138	my $css_file_count = 0;
	139	# find all the style sheets imported with import statement
	140	while($head =~ m"<style type=\"text/css\"(.+)import \"(.+)\""ig){
	141	$css_files[$css_file_count++] = $2 if defined $2;
	142	}
	143
	144	# check whether the stylesheet exists
	145	# if not, download it and copy to the collection's images folder
	146	for($css_file_count = 0; $css_file_count < scalar(@css_files); $css_file_count++){
	147	my $css_file = $css_files[$css_file_count];
	148	$css_file =~ s/^(.+)gli\/cache\///i;
	149
	150	my $css_file_path = "$base_dir/$css_file";
	151
	152	if (-e $css_file_path){ # the file already exists
	153	next;
	154	}
	155
	156	# check the css directory and create one if it's not there
	157	my @dirs = split(/\//i,$css_file);
	158	my $path_check = "$base_dir/";
	159	for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
	160	$path_check .= $dirs[$i] . "/";
	161	if(! -d $path_check ){
	162	mkdir($path_check);
	163	}
	164	}
	165
	166	# download
	167	$css_file = "http://$css_file";
	168	system("wget", "--non-verbose", "$css_file", "--output-document=$css_file_path");
	169	if ($? != 0) {unlink("$css_file_path");}
	170
	171	# change every style element to #wikispecificstyle ...
	172	if(open(INPUT, "<$css_file_path")){
	173	my $css_content;
	174	while(my $line = <INPUT>){
	175	if($line =~ m/^(.+)\{/i){
	176	$line = "#wikispecificstyle " . $line;
	177	}
	178	$css_content .= $line;
	179	}
	180	close(INPUT);
	181	open(OUTPUT, ">$css_file_path");
	182	print OUTPUT $css_content;
	183	close(OUTPUT);
	184	}
	185
	186	# copy to images folder
	187	# do not copy, because collection can only have one specific stylesheet
	188	# better to add and modify the style sheets manually
	189	# @dirs = split(/\//i,$base_dir);
	190	# my $collection_base_dir;
	191	# for(my $i = 0; $i < (scalar(@dirs)-1); $i++){
	192	# $collection_base_dir .= $dirs[$i] . "/";
	193	# }
	194	# my $images_folder = $collection_base_dir . "images/";
	195	# copy($css_file_path, $images_folder) \|\| die "File cannot be copied.";
	196	}
	197	}
	198
	199	# add sections around h2 tag
	200	# wrap each section with <div id=\"wikispecificstyle\"></div> to get the wiki styles
	201	# add search box with each section
	202	if ($self->{'tag_sections'}) {
	203	my @sections = ($body_text =~ /<h2>(.+)<\/h2>/gi);
	204	for(my $i=1; $i < scalar(@sections); $i++){
	205	my $section_title = $sections[$i];
	206	$section_title =~ s/<([^>]*)>//g;
	207	$section_title =~ s/(^\s\|\s$)//g;
	208	my $section_metadata = "<Section>\n<Description>\n<Metadata name=\"Title\">$section_title</Metadata>\n</Description>\n";
	209	if($i !=1){
	210	$section_metadata = "</Section>\n" . $section_metadata;
	211	}
	212	$section_metadata = "\n<!--\n" . $section_metadata . "-->\n";
	213
	214	$section_metadata .= "<div id=\"wikispecificstyle\">\n<div id=\"content\">\n";
	215	$section_metadata = "</div></div>\n" . $section_metadata if $i !=1;
	216
	217	$body_text =~ s/<h2>$sections[$i]<\/h2>/$section_metadata<h2>$sections[$i]<\/h2>/i;
	218
	219	if($i==scalar(@sections)-1) {
	220	# $body_text =~ s/<div class=\"printfooter\">/<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
	221	$body_text =~ s/<div class=\"printfooter\">/<\/div>\n<\/div>\n<!--\n<\/Section>\n-->\n<div class=\"printfooter\">/i;
	222	}
	223	}
	224	}
	225
	226	# If delete_nav is enabled, it means to get rid of navigation contents.
	227	# if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
	228	# if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/){
	229	# print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
	230	# $body_text =~ s/$self->{'nav_exp'}//isg;
	231	# }
	232	#}
	233	my $searchbox = "";
	234	if (defined $self->{'delete_nav'} && ($self->{'delete_nav'} == 1)){
	235	my $nav_match_express;
	236	if (defined $self->{'nav_exp'}&& $self->{'nav_exp'} =~ /\S/) {
	237	$nav_match_express = $self->{'nav_exp'} ;
	238	} else { # default setting for mediawiki
	239	$nav_match_express = "<div class=\"printfooter\">(.\|\n)*secs. -->";
	240	}
	241
	242	print "it matches nav_exp!!\n" if $body_text =~ /$self->{'nav_exp'}/;
	243
	244	# $body_text =~ m/<div class=\"printfooter\">(.\|\n)*secs. -->/isg;
	245	$body_text =~ m/$nav_match_express/isg;
	246	my $navigate = $& if defined $&;
	247
	248	# find the search box and add it to the document page
	249	if(defined $navigate && $navigate =~ /\S/){
	250	$navigate =~ m/<div id="p-search" class="portlet">(.\|\n)*<\/form>/;
	251	$searchbox = $& . "\n<\/div>\n<\/div>";
	252	$searchbox =~ s/action="([^>]*)"/action="\/gsdl\/cgi-bin\/library"/isg;
	253	$searchbox =~ s/name="search"/name="q"/isg;
	254	$searchbox =~ s/name="go"//isg;
	255	$searchbox =~ s/name="fulltext"//isg;
	256	my $hidden_params = "<input type=\"hidden\" name=\"a\" value=\"q\"/>\n"
	257	."<input type=\"hidden\" name=\"c\" value=\"wikitest\"/>\n"
	258	."<input type=\"hidden\" name=\"fqf\" value=\"TX\"/>"
	259	."<input type=\"hidden\" name=\"t\" value=\"1\">";
	260	$searchbox =~ s/<\/form>/$hidden_params<\/form>/isg;
	261	$searchbox = "\n</div>\n</div><div id=\"wikispecificstyle\"><div id=\"column-one\">$searchbox</div></div>";
	262	}
	263
	264	# $body_text =~ s/<div class=\"printfooter\">(.\|\n)*secs. -->/$searchbox/isg;
	265	$body_text =~ s/$nav_match_express/$searchbox/isg;
	266	}
	267
	268	if ($self->{'tag_sections'}) {
	269	$body_text =~ s/<!--\n<\/Section>/$searchbox\n<!--\n<\/Section>/ig;
	270	}
	271
	272	# Tidy up extra new lines
	273	$body_text =~ s/(<p[^>]><span[^>]><o:p> <\/o:p><\/span><\/p>)//isg;
	274	$body_text =~ s/(<p[^>]*><o:p> <\/o:p><\/p>)//isg;
	275
	276	$section_text .= "<!--\n<Section>\n-->\n";
	277	my $body = "<body".$body_text;
	278
	279	$$textref = $body;
	280
	281	# get the base dir for convert absolute links to relative links
	282	$$textref =~ m"href=\"(.?)/cache/(.?)/"i;
	283	my $basedir = $2;
	284
	285	$$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
	286	$$textref =~ s/( )+/ /sg;
	287
	288	# get rid of the [edit] button
	289	$$textref =~ s/\[<a([^>]*)>edit<\/a>]//g;
	290
	291	# get rid of the last time edit information at the bottom
	292	$$textref =~ s/<a href="(.+)edit(.?)"(.?)>(\w+)<\/a> \d\d:\d\d,(.*?)(PST)//g;
	293
	294	# get rid of the (Redirected from ...)
	295	$$textref =~ s/(Redirected from <a ([^>])>(\w\|\s)<\/a>)//isg;
	296
	297	# escape macros
	298	$$textref =~ s/_([^\s]*)_/_<span>$1<\/span>_/isg;
	299	# may change the links, like Greenstone_Documentation_All.html, then change back
	300	$$textref =~ s/<a([^>])_<span>([^>])<\/span>_/<a$1_$2_/isg;
	301
	302	# convert all the urls to relative url, because current wget 1.10 -k and -E option doesn't work together
	303	# get rid of the title attribute of a tag
	304	$$textref =~ s/<a([^>])title="(.?)"/<a$1/isg;
	305	# find the relative path of current directory
	306	if($basedir ne ""){
	307	my @dirs=split("\/", $file);
	308	my $dirnum = scalar(@dirs);
	309	my $replace = "";
	310	for(my $i=0; $i<$dirnum-2; $i++){
	311	$replace .= "../";
	312	}
	313	# test if the linked relative file exists, if not, link to the internet version
	314	$$textref =~ s/(href\|src)="([^>])$basedir\/([^>])"/$1="$replace$3"/gi;
	315	# my @total_links = ($$textref =~ m/(href\|src)="([^>])$basedir\/([^>])"/gi);
	316	# print $outhandle "\nnumber of total links: " . scalar(@total_links)."\n";
	317	# for(my $cur_link_no = 0; $cur_link_no < scalar(@total_links); $cur_link_no++){
	318
	319	#while($$textref =~ m/(href\|src)="([^>])$basedir\/([^>])"/gi){
	320	#$total_links[$cur_link_no] =~ m/(href\|src)="([^>])$basedir\/([^>])"/i;
	321	# my $prefix = $1;
	322	# my $link = $&;
	323	# my $rel_file_name = $3;
	324	# my $rel_link = "$replace$rel_file_name";
	325	# print $outhandle "catched link==> $link\nrelative link==> $rel_link\n";
	326	# if(-e $rel_link){
	327	# $rel_link = "$prefix=\"$rel_link\"";
	328	# $$textref =~ s/$link/$rel_link/i;
	329	# }else{
	330	# my $ext_link = "$prefix=\"http:\/\/$basedir\/$rel_file_name\"";
	331	# print $outhandle "external link==> $ext_link\n";
	332	# $$textref =~ s/$link/$ext_link/i; #s/$link/$prefix="http:\/\/$rel_file_name"/i;
	333	# }
	334	#}
	335
	336
	337	# tag the link to new wiki pages as red
	338	$$textref =~ s/(href\|src)="$replace([^>])&action=edit([^>])"/$1="http:\/\/$basedir\/$2&action=edit$3"/gi;
	339	$$textref =~ s/<a([^>])class="new"([^>])>/<a$1style="color:red"$2)>/gi;
	340
	341	# tag the link to external pages as blue
	342	$$textref =~ s/<a([^>])class='external text'([^>])>/<a$1style="color:blue"$2)>/gi;
	343
	344	#print $outhandle $$textref;
	345	}
	346
	347	# if 'show_toc' is set, put the table of content on the Wiki Main_Page to the about page of the collection
	348	# 1. read _content_ macro from about.dm
	349	# 2. append the toc, change all links to the Greenstone internal format for relative links
	350	# 3. write to the extra.dm
	351	# TODO: currently we suppose the _about:content_ hasn't been specified before
	352	# so needs to add function to handle when the macro is already in the extra.dm
	353	if($self->{'show_toc'}==1 && $file =~ m/Main_Page.(html\|htm)$/){
	354	my $macro_path = $base_dir;
	355	$macro_path =~ s/import$/macros/;
	356	my $extra_dm;
	357	my $extradm_file = "$macro_path/extra.dm";
	358	if(open(INPUT, "<$extradm_file")){
	359	while(my $line = <INPUT>){
	360	$extra_dm .= $line;
	361	}
	362	close(INPUT);
	363
	364	if($extra_dm =~ m/package about/ && $extra_dm =~ m/_content_(\s)*{/){
	365	print $outhandle "already changed!!!!\n";
	366	} else {
	367	# read _content_ macro from about.dm file
	368	my $about_macro = $ENV{'GSDLHOME'} . "/macros/about.dm";
	369	my $about_page_content = "";
	370	if(open(INPUT, "<$about_macro")){
	371	while(my $line=<INPUT>){
	372	$about_page_content .= $line;
	373	}
	374	}else{
	375	print $outhandle "can't open file $about_macro\n";
	376	}
	377	close(INPUT);
	378
	379	# extract the _content_ macro
	380	$about_page_content =~ m/_content_ {(.\|\n)*<\/div>\n\n<\/div>\n}/i;
	381	$about_page_content = $&;
	382
	383	# extract toc of the Main_Page
	384	my $mainpage_content = "";
	385	if($self->{'toc_exp'} =~ /\S/){
	386	$$textref =~ /$self->{'toc_exp'}/;
	387	$mainpage_content = $&;
	388	} else {
	389	# $mainpage_content =~ s/<!-- start content -->(.\|\n)*<!-- end content -->/$1/igs;
	390	}
	391	# print $outhandle "---------\n$$textref\n--------\n\n";
	392	# print $outhandle "==========\n$mainpage_content\n==========\n\n";
	393
	394	# add toc to the _content_ macro
	395	$about_page_content =~ m/{(.\|\n)*<\/div>\n\n/;
	396	$extra_dm .= "package about\n_content_$&\n\n<div class=\"section\">\n$mainpage_content\n</div>\n</div>\n}";
	397
	398	# change all links to the internal Greenstone relative link format
	399	$extra_dm =~ s/<a href="([^>]*)"/<a href="_httpquery_&a=extlink&rl=1&href=http:\/\/$basedir$1"/isg;
	400	$extra_dm =~ s/(\.\.\/)+/\//isg;
	401	# print $outhandle "to add---------\n$extra_dm\n--------\n";
	402
	403	# write to the extra.dm file of the collection
	404	open(OUTPUT, ">$extradm_file");
	405	print OUTPUT $extra_dm;
	406	close(OUTPUT);
	407	}
	408	} else {
	409	print $outhandle "can't open file $extradm_file\n";
	410	}
	411	}
	412
	413	# If delete_toc is enabled, it means to get rid of toc and tof contents.
	414	# get rid of TOC and TOF sections and their title
	415	if (defined $self->{'delete_toc'} && ($self->{'delete_toc'} == 1)){
	416	if (defined $self->{'toc_exp'} && $self->{'toc_exp'} =~ /\S/){
	417	# $body_text =~ s/<p class=(($self->{'toc_exp'})[^>]*)>(.+?)<\/p>//isg;
	418	# print "it matches toc_exp!!\n" if $body_text =~ /$self->{'toc_exp'}/;
	419	# $body_text =~ s/$self->{'toc_exp'}//i;
	420	print "it matches toc_exp!!\n" if $$textref =~ /$self->{'toc_exp'}/;
	421	$$textref =~ s/$self->{'toc_exp'}//i;
	422	}
	423	}
	424
	425	# To add a layer on top of the wiki page
	426	# so as to keep the wiki style inside the wiki page
	427	# and keep the Greenstone style at the same time
	428	$$textref =~ s/<body([^>]*)>/$&\n<div id="wikispecificstyle">\n/is;
	429	$$textref =~ s/<\/body>/<\/div><\/body>/is;
	430
	431	# tag with sections
	432	$$textref =~ s/<body([^>]*)>/$&\n<!--\n<Section>\n<Description>\n<Metadata name=\"Title\">$doctitle<\/Metadata>\n<\/Description>\n-->\n/is;
	433	$$textref =~ s/<\/body>/\n<!--\n<\/Section>\n-->\n/is;
	434
	435	#print $outhandle "\n\n$$textref\n\n";
	436
	437	# use description tags
	438	if ($self->{'description_tags'}) {
	439	my $cursection = $doc_obj->get_top_section();
	440	# remove the html header - note that doing this here means any
	441	# sections defined within the header will be lost (so all <Section>
	442	# tags must appear within the body of the HTML)
	443	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
	444
	445	$$textref =~ s/^.?<body[^>]>//is;
	446	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	447
	448	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
	449	my $closecom = '(?:-->\|(?:—\|\|--)>)';
	450
	451	my $lt = '(?:<\|<)';
	452	my $gt = '(?:>\|>)';
	453	my $quot = '(?:"\|"\|”\|“)';
	454
	455	# my $dont_strip = '';
	456	# if ($self->{'no_strip_metadata_html'}) {
	457	# ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
	458	# }
	459
	460	my $found_something = 0;
	461	my $top = 1;
	462	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
	463	my $text = $1;
	464	my $comment = $2;
	465	if (defined $text) {
	466	# text before a comment - note that getting to here
	467	# doesn't necessarily mean there are Section tags in
	468	# the document
	469	# print $outhandle "section text:\n$text\n";
	470	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
	471	}
	472	while ($comment =~ s/$lt(.*?)$gt//s) {
	473	my $tag = $1;
	474	if ($tag eq "Section") {
	475	$found_something = 1;
	476	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
	477	$top = 0;
	478	} elsif ($tag eq "/Section") {
	479	$found_something = 1;
	480	$cursection = $doc_obj->get_parent_section ($cursection);
	481	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
	482	my $metaname = $1;
	483	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
	484	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
	485	my $metavalue = $1;
	486	$metavalue =~ s/^\s+//;
	487	$metavalue =~ s/\s+$//;
	488	# assume that no metadata value intentionally includes
	489	# carriage returns or HTML tags (if they're there they
	490	# were probably introduced when converting to HTML from
	491	# some other format).
	492	# actually some people want to have html tags in their
	493	# metadata.
	494	$metavalue =~ s/[\cJ\cM]/ /sg;
	495	# $metavalue =~ s/<[^>]+>//sg unless $dont_strip && ($dont_strip eq 'all' \|\| $metaname =~ /^($dont_strip)$/);
	496	$metavalue =~ s/\s+/ /sg;
	497	# print $outhandle "metaname = $metaname\nmetavalue = $metavalue\n";
	498	if ($accumulate) {
	499	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
	500	} else {
	501	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
	502	}
	503	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
	504	# do nothing with containing Description tags
	505	} else {
	506	# simple HTML tag (probably created by the conversion
	507	# to HTML from some other format) - we'll ignore it and
	508	# hope for the best ;-)
	509	}
	510	}
	511	}# end while
	512
	513	if ($cursection ne "") {
	514	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
	515	}
	516
	517	$$textref =~ s/^.?<body[^>]>//is;
	518	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	519	if ($$textref =~ /\S/) {
	520	if (!$found_something) {
	521	if ($self->{'verbosity'} > 2) {
	522	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
	523	print $outhandle " will be processed as a single section document\n";
	524	}
	525
	526	# go ahead and process single-section document
	527	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
	528
	529	} else {
	530	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
	531	print $outhandle " of the final closing </Section> tag. This text will\n";
	532	print $outhandle " be ignored.";
	533
	534	my ($text);
	535	if (length($$textref) > 30) {
	536	$text = substr($$textref, 0, 30) . "...";
	537	} else {
	538	$text = $$textref;
	539	}
	540	$text =~ s/\n/ /isg;
	541	print $outhandle " ($text)\n";
	542	}
	543	} elsif (!$found_something) {
	544	if ($self->{'verbosity'} > 2) {
	545	# may get to here if document contained no valid Section
	546	# tags but did contain some comments. The text will have
	547	# been processed already but we should print the warning
	548	# as above and extract metadata
	549	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
	550	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
	551	}
	552	}
	553	} # if $self->{'description_tags'}
	554	else {
	555	# remove header and footer
	556	# if (!$self->{'keep_head'}) {
	557	# $$textref =~ s/^.?<body[^>]>//is;
	558	# $$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	559	# }
	560
	561	# single section document
	562	# $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
	563
	564	# Important: to get the relative links to work,
	565	# 1: use the below statement instead of the above one
	566	# 2. cannot have process_section method.
	567	# why?????
	568	$self->SUPER::process(@_);
	569	}
	570	return 1;
	571
	572	#$self->SUPER::process(@_);
	573	}
	574
	575
	576
	577	# note that process_section may be called multiple times for a single
	578	# section (relying on the fact that add_utf8_text appends the text to any
	579	# that may exist already).
	580	# sub process_section {
	581	# my $self = shift (@_);
	582	# my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
	583
	584	# trap links
	585	# if (!$self->{'nolinks'}) {
	586	# usemap="./#index" not handled correctly => change to "#index"
	587	# $$textref =~ s/(<img[^>]?usemap\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]>)/
	588	#$self->replace_usemap_links($1, $2, $3)/isge;
	589
	590	#$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
	591	#$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
	592	#}
	593
	594	# trap images
	595
	596	# allow spaces if inside quotes - jrm21
	597	#$$textref =~ s/(<(?:img\|embed\|table\|tr\|td)[^>]?(?:src\|background)\s=\s)([\"\'][^\"\']+[\"\']\|[^\s>]+)([^>]>)/
	598	#$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
	599
	600	# add text to document object
	601	# turn \ into \\ so that the rest of greenstone doesn't think there
	602	# is an escape code following. (Macro parsing loses them...)
	603	# $$textref =~ s/\\/\\\\/go;
	604
	605	# $doc_obj->add_utf8_text($cursection, $$textref);
	606	#}
	607
	608
	609	sub extract_metadata
	610	{
	611	my $self = shift (@_);
	612	my ($textref, $metadata, $doc_obj) = @_;
	613	my $outhandle = $self->{'outhandle'};
	614
	615	return if (!defined $textref);
	616
	617	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	618	# html meta, 'value' is the metadata name for greenstone to use
	619	my %find_fields = ();
	620	my ($tag,$value);
	621
	622	my $orig_field = "";
	623	foreach my $field (split /,/, $self->{'metadata_fields'}) {
	624	# support tag<tagname>
	625	if ($field =~ /^(.?)<(.?)>$/) {
	626	# "$2" is the user's preferred gs metadata name
	627	$find_fields{lc($1)}=$2; # lc = lowercase
	628	$orig_field = $1;
	629	} else { # no <tagname> for mapping
	630	# "$field" is the user's preferred gs metadata name
	631	$find_fields{lc($field)}=$field; # lc = lowercase
	632	$orig_field = $field;
	633	}
	634
	635	if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){
	636	$tag = $orig_field;
	637	$value = $1;
	638	if (!defined $value \|\| !defined $tag){
	639	#print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
	640	next;
	641	} else {
	642	# clean up and add
	643	chomp($value); # remove trailing \n, if any
	644	$tag = $find_fields{lc($tag)};
	645	#print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	646	# if ($self->{'verbosity'} > 2);
	647	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
	648	}
	649	}
	650	}
	651	}
	652
	653	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: