Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 12947

Last change on this file since 12947 was 12947, checked in by kjdon, 18 years ago
added new -extract_style option to HTMLPlug. looks for style, script and link tags in the html head tag, and saves them as ex.DocumentHeader metadata. -metadata_fields can now be used with -description_tags - why shouldn't we have metadata in the header as well as in the description tags?? can always turn head metadata off using -no_metadata. -hunt_creator_metadata no longer needs -metadata_fields option to be set.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 33.1 KB

Rev	Line
[585]	1	###########################################################################
	2	#
	3	# HTMLPlug.pm -- basic html plugin
[808]	4	#
[585]	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[808]	27	#
	28	# Note that this plugin handles frames only in a very simple way
	29	# i.e. each frame is treated as a separate document. This means
	30	# search results will contain links to individual frames rather
	31	# than linking to the top level frameset.
	32	# There may also be some problems caused by the _parent target
	33	# (it's removed by this plugin)
	34	#
[585]	35
	36	package HTMLPlug;
	37
[1435]	38	use BasPlug;
[1010]	39	use ghtml;
[1891]	40	use unicode;
[585]	41	use util;
[8509]	42	use XMLParser;

# Establish the inheritance chain at compile time: HTMLPlug derives from
# BasPlug.
BEGIN {
    @HTMLPlug::ISA = qw(BasPlug);
}
	47
[7202]	48	use strict; # every perl program should have this!
	49	no strict 'refs'; # make an exception so we can use variables as filehandles
	50
# Descriptions of every option this plugin accepts. Each entry is a hash
# with a 'name', a 'desc' (a "{Class.key}" string), a 'type' and, where
# applicable, a default value ('deft'). The list is appended to the shared
# "ArgList" by new().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "nolinks",
        'desc' => "{HTMLPlug.nolinks}",
        'type' => "flag",
    },
    {
        'name' => "keep_head",
        'desc' => "{HTMLPlug.keep_head}",
        'type' => "flag",
    },
    {
        'name' => "extract_style",
        'desc' => "{HTMLPlug.extract_style}",
        'type' => "flag",
    },
    {
        'name' => "no_metadata",
        'desc' => "{HTMLPlug.no_metadata}",
        'type' => "flag",
    },
    {
        'name' => "metadata_fields",
        'desc' => "{HTMLPlug.metadata_fields}",
        'type' => "string",
        'deft' => "Title",
    },
    {
        'name' => "hunt_creator_metadata",
        'desc' => "{HTMLPlug.hunt_creator_metadata}",
        'type' => "flag",
    },
    {
        'name' => "file_is_url",
        'desc' => "{HTMLPlug.file_is_url}",
        'type' => "flag",
    },
    {
        'name' => "assoc_files",
        'desc' => "{HTMLPlug.assoc_files}",
        'type' => "regexp",
        'deft' => get_default_block_exp(),
    },
    {
        'name' => "rename_assoc_files",
        'desc' => "{HTMLPlug.rename_assoc_files}",
        'type' => "flag",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlug.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "description_tags",
        'desc' => "{HTMLPlug.description_tags}",
        'type' => "flag",
    },
    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url)
    {
        'name' => "w3mir",
        # 'desc' => "{HTMLPlug.w3mir}",
        'type' => "flag",
        'hiddengli' => "yes",
    },
    {
        'name' => "no_strip_metadata_html",
        'desc' => "{HTMLPlug.no_strip_metadata_html}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name' => "sectionalise_using_h_tags",
        'desc' => "{HTMLPlug.sectionalise_using_h_tags}",
        'type' => "flag",
    },
];

# Plugin-level description record; new() appends it to the shared "OptList".
my $options = {
    'name'     => "HTMLPlug",
    'desc'     => "{HTMLPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
	117
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object built by BasPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit method-call syntax: the indirect-object form "new BasPlug(...)"
    # is ambiguous inside a package that itself defines a sub called new.
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # w3mir is the legacy spelling of file_is_url
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }

    # per-document bookkeeping used when renaming associated files
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    return bless $self, $class;
}
	137
# may want to use (?i)\.(gif|jpe?g|jpe|png|css|js(?:@.*)?)$
# if have eg <script language="javascript" src="img/lib.js@123">
#
# Returns, as a string, the default regular expression for filenames this
# plugin blocks (case-insensitive image and stylesheet extensions). The same
# value is used as the default for the assoc_files option.
sub get_default_block_exp {
    my $self = shift (@_);

    # The alternation bars must be bare: "\|" would match a literal pipe
    # character and the expression would never match a real filename.
    return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^;
}
	145
# Returns, as a string, the default regular expression for filenames this
# plugin will process: common HTML / server-page extensions, plus anything
# that looks like a query URL.
sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    # (alternation bars must be bare; "\|" would only match a literal pipe)
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+[\?\@].+=.*)$^;
}
[721]	152
# Reads the HTML file $filename, finds the files it references through
# <img src/usemap>, <link href>, <embed src>, <table|tr|td background> and
# <script src> attributes, and records each one (as a full path) as a key of
# $self->{'file_blocks'}.
#
# References inside HTML comments are ignored because comments are removed
# before scanning.
sub store_block_files
{
    my $self = shift (@_);
    my ($filename) = @_;

    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);
    my $textref = \$text;

    # comment delimiters, either literal or entity-escaped
    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    $$textref =~ s/$opencom(.*?)$closecom//gs;

    # an attribute value: double-quoted, or a run of non-space characters
    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
    my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs);
    my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs);
    my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs);
    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);

    # directory of the HTML file, used to resolve relative references
    my $dirname = File::Basename::dirname($filename);

    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {

        # remove quotes from link at start and end if necessary
        if ($link=~/^\"/) {
            $link=~s/^\"//;
            $link=~s/\"$//;
        }

        $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html

        if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
            # Turn relative file path into full path
            $link = util::filename_cat($dirname, $link);
        }
        $link = $self->eval_dir_dots($link);

        $self->{'file_blocks'}->{$link} = 1;
    }
}
[9067]	197
[8509]	198
# do plugin specific processing of doc_obj
#
# Arguments:
#   $textref  - reference to the HTML text (modified in place)
#   $pluginfo - not used in this sub
#   $base_dir - base directory, passed on to process_section/extract_style
#   $file     - file name, used in messages and to build the URL metadata
#   $metadata - passed on to extract_metadata
#   $doc_obj  - document object that receives sections, text and metadata
#   $gli      - when true, a <Processing> line is printed to STDERR
#
# Returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);

    print $outhandle "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    # process an HTML file where sections are divided by headings tags (H1, H2 ...)
    # you can also include metadata in the format (X can be any number)
    # <hX>Title<!--gsdl-metadata
    # <Metadata name="name1">value1</Metadata>
    # ...
    # <Metadata name="nameN">valueN</Metadata>
    #--></hX>
    if ($self->{'sectionalise_using_h_tags'}) {
        # description_tags should always be activated because we convert headings to description tags
        $self->{'description_tags'} = 1;

        my $arrSections = [];
        $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;

        if (scalar(@$arrSections)) {
            # close whatever sections are still open, just before </body>
            my $strMetadata = $self->update_section_data($arrSections, -1);
            if (length($strMetadata)) {
                $strMetadata = '<!--' . $strMetadata . "\n-->\n</body>";
                $$textref =~ s/<\/body>/$strMetadata/ig;
            }
        }
    }

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'};

    # extract style info as DocumentHeader metadata
    $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file)
        if ($self->{'extract_style'} == 1);

    # Store URL for page as metadata - this can be used for an
    # altavista style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # like when w3mir is used to download a website).

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info

    my $web_url = "http://$file";
    $doc_obj->add_metadata($cursection, "URL", $web_url);

    if ($self->{'file_is_url'}) {
        $doc_obj->add_metadata($cursection, "weblink", "<a href=\"$web_url\">");
        $doc_obj->add_metadata($cursection, "webicon", "_iconworld_");
        $doc_obj->add_metadata($cursection, "/weblink", "</a>");
    }

    if ($self->{'description_tags'}) {
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        # comment delimiters and tag punctuation, literal or entity-escaped
        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';

        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        # comma separated list of metadata names -> regex alternation
        my $dont_strip = '';
        if ($self->{'no_strip_metadata_html'}) {
            ($dont_strip = $self->{'no_strip_metadata_html'}) =~ s{,}{|}g;
        }

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {
                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
                    my $metavalue = $1;
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    # actually some people want to have html tags in their
                    # metadata.
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg
                        unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
                    $metavalue =~ s/\s+/ /sg;
                    if ($accumulate) {
                        $doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
                    } else {
                        $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                    }
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                if ($self->{'verbosity'} > 2) {
                    print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
                    print $outhandle " will be processed as a single section document\n";
                }

                # go ahead and process single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

            } else {
                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
                print $outhandle " of the final closing </Section> tag. This text will\n";
                print $outhandle " be ignored.";

                # show at most the first 30 characters of the stray text
                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            if ($self->{'verbosity'} > 2) {
                # may get to here if document contained no valid Section
                # tags but did contain some comments. The text will have
                # been processed already but we should print the warning
                # as above and extract metadata
                print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
                print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
            }
        }
    } # if $self->{'description_tags'}
    else {
        # remove header and footer
        if (!$self->{'keep_head'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}
	395

# Substitution callback for one <hN>...</hN> element (see process()).
#
#   $nHeadNo        - heading level captured from the tag
#   $strHeadingText - text between the heading tags (may be undef)
#   $arrSections    - array ref of currently open heading levels
#   $file           - not used in this sub
#
# Returns an HTML comment that first closes any sections reported by
# update_section_data, then opens a new <Section> whose <Description> holds
# the heading text as Title metadata followed by the contents of any
# <!--gsdl-metadata ... --> comments found inside the heading.
sub process_heading
{
    my ($self, $nHeadNo, $strHeadingText, $arrSections, $file) = @_;
    $strHeadingText = '' unless defined($strHeadingText);

    # closing tags for sections that end at this heading
    my $strClosers = $self->update_section_data($arrSections, int($nHeadNo));

    # pull every embedded metadata comment out of the heading text
    my $strEmbedded = '';
    $strEmbedded .= $1 while ($strHeadingText =~ s/<!--gsdl-metadata(.*?)-->//is);

    # trim both strings
    foreach my $str ($strHeadingText, $strEmbedded) {
        $str =~ s/^\s+//g;
        $str =~ s/\s+$//g;
    }

    my @pieces = (
        $strClosers,
        "\n<Section>\n\t<Description>\n\t\t<Metadata name=\"Title\">",
        $strHeadingText,
        "</Metadata>\n",
    );
    push(@pieces, "\t\t", $strEmbedded, "\n") if (length($strEmbedded));
    push(@pieces, "\t</Description>\n");

    return "<!--" . join('', @pieces) . "-->";
}
	425
	426
	# Maintains the stack of open heading levels in @$arrSections.
	#
	# For every open level that is greater than or equal to $nCurTocNo one
	# "\n</Section>" is added to the result and the innermost level is popped;
	# $nCurTocNo is then pushed. Returns the accumulated closing tags (an
	# empty string when nothing had to be closed).
	sub update_section_data
	{
	    my ($self, $arrSections, $nCurTocNo) = @_;
	    my $strClosers = '';

	    # Only when something is open and the new level is not deeper than
	    # the innermost one is there anything to close.
	    if (scalar(@$arrSections) && !($nCurTocNo > $arrSections->[-1])) {
	        # iterate over a snapshot, innermost level first
	        foreach my $nLevel (reverse @$arrSections) {
	            next if ($nCurTocNo > $nLevel);
	            $strClosers .= "\n</Section>";
	            pop @$arrSections;
	        }
	    }

	    push @$arrSections, $nCurTocNo;
	    return $strClosers;
	}
	450
	451
# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
#
# Rewrites link and image references in $$textref through the
# replace_usemap_links / replace_href_links / replace_images callbacks
# (links are left alone when the nolinks option is set), doubles every
# backslash, and adds the resulting text to $cursection of $doc_obj.
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # allow spaces if inside quotes - jrm21
    $$textref =~ s/(<(?:img|embed|table|tr|td)[^>]*?(?:src|background)\s*=\s*)([\"\'][^\"\']+[\"\']|[^\s>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;

    $doc_obj->add_utf8_text($cursection, $$textref);
}
	482
# Substitution callback for image-like references (see process_section).
#
# $front and $back are the tag text before and after the attribute value
# $link. The value is resolved with format_link and add_file, and the tag is
# returned with the rewritten value, followed by an <a name="..."> anchor
# named after the last path component of the rewritten value.
sub replace_images {
    my ($self, $front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary; the tag is
    # always re-quoted with double quotes
    if ($link =~ /^[\"\']/) {
        $link =~ s/^[\"\']//;
        $link =~ s/[\"\']$//;
        $front = $front . '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;

    # Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
    # If the Word file path has spaces in it, wv messes up and you end up with
    # absolute paths for the images, and without the "file://" prefix
    # So check for this special case and massage the data to be correct
    my $is_wv_absolute_path = ($ENV{'GSDLOS'} =~ /^windows/i
                               && $self->{'plugin_type'} eq "WordPlug"
                               && $link =~ /^[A-Za-z]\:\\/);
    if ($is_wv_absolute_path) {
        $link =~ s/^.*\\([^\\]+)$/$1/;
    }

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);

    # anchor is named after whatever follows the last '/'
    (my $anchor_name = $img_file) =~ s/^.*\///;

    return join('', $front, $img_file, $back, "<a name=\"$anchor_name\">");
}
	515
# Substitution callback for href/src links (see process_section).
#
# $front and $back are the tag text before and after the attribute value
# $link. Returns the rebuilt tag:
#   - anchors within the page (#...) and mailto/news/gopher/nntp/telnet/
#     javascript links keep their original value;
#   - links that are not local ($rl == 0), that match process_exp, or that
#     end in '/' become "_httpextlink_" links;
#   - anything else is passed to add_file.
sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    # everything after the scheme; undef when $href has no ':' at all
    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    # (the scheme test above has already returned, so it is not repeated here)
    if (($rl == 0)
        || (defined $filename && $filename =~ /$self->{'process_exp'}/)
        || ($href =~ /\/$/)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    } else {
        # link is to some other type of file (eg image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}
[721]	561
# Works out the replacement text for a reference to the file behind $href.
#
#   $href      - URL-style reference as returned by format_link
#   $rl        - 0 when format_link could not find the target locally
#   $hash_part - "#..." suffix to re-append to external links
#   $base_dir  - directory the file name is resolved against
#   $doc_obj   - document object that receives associated files
#   $section   - section the file is associated with
#
# Returns either an "_httpextlink_..." link (target not local, or its
# extension does not match the assoc_files expression) or
# "_httpdocimg_/<name>" after associating the file with the document.
sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    if ($base_dir eq "") {
        # remove http:/ thereby leaving one slash at the start
        $filename =~ s/^[^:]*:\///;
    }
    else {
        # remove http://
        $filename =~ s/^[^:]*:\/\///;
    }

    $filename = util::filename_cat($base_dir, $filename);

    # Replace %20's in URL with a space if required. Note that the filename
    # may include the %20 in some situations
    if ($filename =~ /\%20/) {
        if (!-e $filename) {
            $filename =~ s/\%20/ /g;
        }
    }

    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    # true when the file has an extension that assoc_files accepts
    my $is_assoc_file = (defined $ext) && ($ext =~ /$self->{'assoc_files'}/);

    if ($rl == 0) {
        if (!$is_assoc_file) {
            return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
        }
        else {
            return "_httpextlink_&rl=0&el=direct&href=" . $href . $hash_part;
        }
    }

    if (!$is_assoc_file) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }
    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            # seen before in this document: reuse the name handed out then
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    } else {
        # keep the original file name (last path component)
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}
[585]	618
[721]	619
# Normalises a link found in the document.
#
#   $link     - the raw attribute value
#   $base_dir - base directory used for file-existence checks
#   $file     - name of the document the link was found in
#
# Returns a three element list ($href, $hash_part, $rl):
#   - badly formatted links:              ($link, "", 0)
#   - http/ftp/file links:                (scheme . path, "#...", 1 if the
#                                          path exists under $base_dir else 0)
#   - other links not starting with '/':  ("http://" . path, "#...", 1)
#   - mailto/news/... and '/' links:      (text before '#', "", 0)
# A trailing '/' is added to the path when it names a directory.
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    # split into the part before the first '#' and the rest
    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn url (using /) into file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {
        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # to generate archives, otherwise we'll assume all files are
            # from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else
            {
                # see if link shares directory with source document
                # => turn into relative link if this is so!

                # $base_dir is matched literally (\Q...\E): it is a path,
                # not a pattern, and it may differ between calls.
                if ($ENV{'GSDLOS'} =~ /^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash=$before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^\Q$base_dir\E/@@) {
                        # if this is true, we removed a prefix
                        $before_hash=$win_before_hash;
                    }
                }
                else {
                    # before_hash has lost leading slash by this point,
                    # -> add back in prior to substitution with $base_dir
                    $before_hash = "/$before_hash";

                    $before_hash = util::filename_cat("",$before_hash);
                    $before_hash =~ s@^\Q$base_dir\E/@@;
                }
            }
        } else {
            # Turn relative file path into full path
            my $dirname = File::Basename::dirname($file);
            $before_hash = util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = util::filename_cat ($base_dir, $before_hash);
        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }
        return ("http://" . $before_hash, $hash_part, 1);
    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}
[1605]	712
# For every size in the comma separated list $self->{'first'}, stores the
# first <size> characters of the document's plain text (truncated at a word
# boundary, with an ellipsis entity appended) as "First<size>" metadata on
# $thissection of $doc_obj. Does nothing when $self->{'first'} is undefined.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    return unless defined $self->{'first'};

    # The clean-up does not depend on the requested size, so do it once.
    my $plaintext = $$textref;
    # skip to the body (/s so that a header spanning several lines is
    # skipped as well)
    $plaintext =~ s/.*<body[^>]*>//is;
    # remove javascript
    $plaintext =~ s@<script.*?</script>@ @sig;
    $plaintext =~ s/<[^>]*>/ /g;
    $plaintext =~ s/&nbsp;/ /g;
    $plaintext =~ s/^\s+//;
    $plaintext =~ s/\s+$//;
    $plaintext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = &unicode::substr ($plaintext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/; # adds an ellipse (...)
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
[721]	733
[7202]	734
[808]	735	sub extract_metadata {
	736	my $self = shift (@_);
	737	my ($textref, $metadata, $doc_obj, $section) = @_;
[1605]	738	my $outhandle = $self->{'outhandle'};
[1602]	739	# if we don't want metadata, we may as well not be here ...
[12947]	740	return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);
[1400]	741
[8843]	742	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	743	# html meta, 'value' is the metadata name for greenstone to use
	744	my %find_fields = ();
[7202]	745
	746	my %creator_fields = (); # short-cut for lookups
	747
	748
	749	foreach my $field (split /,/, $self->{'metadata_fields'}) {
[8225]	750	# support tag<tagname>
	751	if ($field =~ /^(.?)<(.?)>$/) {
	752	# "$2" is the user's preferred gs metadata name
	753	$find_fields{lc($1)}=$2; # lc = lowercase
[8843]	754	} else { # no <tagname> for mapping
	755	# "$field" is the user's preferred gs metadata name
	756	$find_fields{lc($field)}=$field; # lc = lowercase
[8225]	757	}
[7202]	758	}
	759
[12947]	760	if ($self->{'hunt_creator_metadata'} == 1 ) {
[7202]	761	my @extra_fields =
	762	(
	763	'author',
	764	'author.email',
	765	'creator',
	766	'dc.creator',
	767	'dc.creator.corporatename',
	768	);
	769
	770	# add the creator_metadata fields to search for
	771	foreach my $field (@extra_fields) {
	772	$creator_fields{$field}=0; # add to lookup hash
[1602]	773	}
	774	}
[721]	775
[8509]	776
[7202]	777	# find the header in the html file, which has the meta tags
	778	$$textref =~ m@<head>(.*?)</head>@si;
	779
	780	my $html_header=$1;
[10725]	781
[7202]	782	# go through every <meta... tag defined in the html and see if it is
	783	# one of the tags we want to match.
[10725]	784
[7235]	785	# special case for title - we want to remember if its been found
	786	my $found_title = 0;
[7202]	787	# this assumes that ">" won't appear. (I don't think it's allowed to...)
	788	$html_header =~ /^/; # match the start of the string, for \G assertion
[8509]	789
[7202]	790	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
	791	my $metatag=$1;
	792	my ($tag, $value);
	793
	794	# find the tag name
	795	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
	796	$tag=$2;
	797	# in case they're not using " or ', but they should...
	798	if (! $tag) {
[8843]	799	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[7202]	800	$tag=$1;
[808]	801	}
[1605]	802
[7202]	803	if (!defined $tag) {
	804	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
[1230]	805	next;
[1190]	806	}
[10725]	807
[7202]	808	# don't need to assign this field if it was passed in from a previous
	809	# (recursive) plugin
	810	if (defined $metadata->{$tag}) {next}
	811
	812	# find the tag content
	813	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
	814	$value=$2;
[10725]	815
[7202]	816	if (! $value) {
[8843]	817	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[7202]	818	$value=$1;
	819	}
	820	if (!defined $value) {
	821	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
	822	next;
	823	}
	824
	825	# clean up and add
	826	$value =~ s/\s+/ /gs;
[8794]	827	chomp($value); # remove trailing \n, if any
[7202]	828	if (exists $creator_fields{lc($tag)}) {
	829	# map this value onto greenstone's "Creator" metadata
	830	$tag='Creator';
	831	} elsif (!exists $find_fields{lc($tag)}) {
[10725]	832	next; # don't want this tag
[7202]	833	} else {
	834	# get the user's preferred capitalisation
	835	$tag = $find_fields{lc($tag)};
	836	}
[7235]	837	if (lc($tag) eq "title") {
	838	$found_title = 1;
	839	}
[7202]	840	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	841	if ($self->{'verbosity'} > 2);
[10725]	842	if ($tag =~ /date.*/i){
	843	$tag = lc($tag);
	844	}
[7202]	845	$doc_obj->add_utf8_metadata($section, $tag, $value);
	846
[808]	847	}
[7202]	848
	849	# TITLE: extract the document title
[7235]	850	if (exists $find_fields{'title'} && !$found_title) {
[7202]	851	# we want a title, and didn't find one in the meta tags
	852	# see if there's a <title> tag
	853	my $title;
[8843]	854	my $from = ""; # for debugging output only
[7235]	855	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
[7202]	856	$title = $1;
[7235]	857	$from = "<title> tags";
[7202]	858	}
[8509]	859
[7202]	860	if (!defined $title) {
[7235]	861	$from = "first 100 chars";
[7202]	862	# if no title use first 100 or so characters
	863	$title = $$textref;
[8071]	864	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
[7202]	865	$title =~ s/^.*?<body>//si;
	866	# ignore javascript!
	867	$title =~ s@<script.*?</script>@ @sig;
	868	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	869	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
	870	$title = substr ($title, 0, 100);
	871	$title =~ s/\s\S*$/.../;
	872	}
	873	$title =~ s/<[^>]*>/ /g; # remove html tags
	874	$title =~ s/ / /g;
	875	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
	876	$title =~ s/\s+/ /gs; # collapse multiple spaces
	877	$title =~ s/^\s*//; # remove leading spaces
	878	$title =~ s/\s*$//; # remove trailing spaces
[8071]	879
[7202]	880	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	881	$title =~ s/^\s+//s; # in case title_sub introduced any...
	882	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
[7235]	883	print $outhandle " extracted Title metadata \"$title\" from $from\n"
[7202]	884	if ($self->{'verbosity'} > 2);
[7235]	885	}
[8121]	886
	887	# add FileFormat metadata
	888	$doc_obj->add_metadata($section,"FileFormat", "HTML");
[7202]	889
	890	# Special, for metadata names such as tagH1 - extracts
	891	# the text between the first <H1> and </H1> tags into "H1" metadata.
	892
	893	foreach my $field (keys %find_fields) {
	894	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
	895	my $tag = $1;
	896	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
	897	my $content = $1;
	898	$content =~ s/ / /g;
	899	$content =~ s/<[^>]*>/ /g;
	900	$content =~ s/^\s+//;
	901	$content =~ s/\s+$//;
	902	$content =~ s/\s+/ /gs;
	903	if ($content) {
	904	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
	905	$tag =~ s/^tag//i;
	906	$doc_obj->add_utf8_metadata ($section, $tag, $content);
	907	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
	908	if ($self->{'verbosity'} > 2);
	909	}
	910	}
	911	}
[585]	912	}
	913
[1190]	914
# Collect the <style>, <script> and <link> elements found in the HTML
# <head> and store them, concatenated, as "DocumentHeader" metadata.
#
# Arguments:
#   $textref  - reference to the HTML text of the document
#   $doc_obj  - document object; must provide add_utf8_metadata()
#   $section  - section of $doc_obj the metadata is attached to
#   $base_dir - handed unchanged to replace_href_links()
#   $file     - handed unchanged to replace_href_links()
#
# The metadata is always added; when nothing suitable is found (or the
# document has no <head>) its value is the empty string.
sub extract_style {
    my $self = shift (@_);
    my ($textref, $doc_obj, $section, $base_dir, $file) = @_;

    # find the header in the html file, which has the style info.
    # The capture is taken in list context so that a document without a
    # <head> gives an empty header instead of whatever $1 happened to hold
    # from an earlier, unrelated match.
    my ($html_header) = $$textref =~ m@<head>(.*?)</head>@si;
    $html_header = "" unless defined $html_header;

    my $style_contents = "";

    # look for style tags. $html_header is a fresh variable, so the first
    # \G below anchors at the start of the string.
    while ($html_header =~ m/\G.*?<(style|script|link)/sig) {
        # the loop match is case-insensitive, so normalise before comparing
        my $tag_name = lc($1);

        # The inner matches have no /g: their \G anchors where the loop
        # match stopped (just after the tag name) and they leave that
        # position alone, so the loop carries on scanning from there.
        if ($tag_name eq "style") {
            if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {
                $style_contents .= "\n<style" . $1;
            }
        }
        elsif ($tag_name eq "link") {
            # only copy the tag if it is actually terminated
            if ($html_header =~ m/\G(.*?>)/is) {
                $style_contents .= "\n<link" . $1;
            }
        }
        elsif ($tag_name eq "script") {
            # bit more tricky cos it may or may not have content
            # NOTE(review): for a script with a src attribute only the
            # opening tag is copied, without its closing </script> -
            # confirm that users of DocumentHeader expect this.
            if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) {
                $style_contents .= "\n<script" . $1;
            }
            elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) {
                $style_contents .= "\n<script" . $1;
            }
        }
    }

    # now we need to do something with any links found in the style thing:
    # every href/src target of a collected <link> or <script> tag is passed
    # through replace_href_links()
    $style_contents =~ s{(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)}{
        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)}isge;

    $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);

}
	960
# Normalise the directory components of $filename:
#   ".." removes the component before it (next directory up)
#   "."  is dropped (it means "here")
# The remaining components are re-joined with util::filename_cat.
sub eval_dir_dots {
    my ($self, $filename) = @_;

    # the OS separator is used as the split pattern
    my $sep_pattern = &util::get_os_dirsep();

    my @resolved = ();
    for my $component (split(/$sep_pattern/, $filename)) {
        next if ($component eq ".");

        if ($component eq "..") {
            pop(@resolved);
            next;
        }

        push(@resolved, $component);
    }

    # util::filename_cat suppresses the leading '/' (or '\' on windows)
    # when its first argument is the empty string, which is meant to help
    # with relative paths. An absolute path splits into a list starting
    # with "", so a second empty string is put in front of it to defeat
    # that "smart" behaviour.
    if (scalar(@resolved) > 0 && $resolved[0] eq "") {
        unshift(@resolved, "");
    }

    return &util::filename_cat(@resolved);
}
	997
# Rebuild a usemap link from its three pieces, removing one leading "./"
# from the link part. Returns $front, the trimmed link and $back
# concatenated.
sub replace_usemap_links {
    my ($self, $front, $link, $back) = @_;

    # only a single "./" at the very start is stripped
    $link =~ s{^\./}{};

    return "$front$link$back";
}
	1005
# Step the file counter kept on $self. Once file_num has reached 1000,
# dir_num is incremented and file_num starts again from 0; in every other
# case file_num is simply incremented.
sub inc_filecount {
    my ($self) = @_;

    if ($self->{'file_num'} == 1000) {
        # roll over into the next directory
        $self->{'dir_num'}++;
        return $self->{'file_num'} = 0;
    }

    return $self->{'file_num'}++;
}
[721]	1016
[1891]	1017
	1018	# Extend the BasPlug read_file so that strings like é are
	1019	# converted to UTF8 internally.
	1020	#
	1021	# We don't convert < or > or & or " in case
	1022	# they interfere with the GML files
	1023
	1024	sub read_file {
[2735]	1025	my ($self, $filename, $encoding, $language, $textref) = @_;
[1891]	1026
[2735]	1027	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
[2364]	1028
[3181]	1029	# Convert entities to their UTF8 equivalents
[3196]	1030	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
[3181]	1031	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
[3196]	1032	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
[1891]	1033	}
	1034
[585]	1035	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: