Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 9056

Last change on this file since 9056 was 9056, checked in by kjdon, 19 years ago
added an option to not strip html tags from metadata in description tags. contributed by jens wille
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 29.6 KB

Rev	Line
[585]	1	###########################################################################
	2	#
	3	# HTMLPlug.pm -- basic html plugin
[808]	4	#
[585]	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[808]	27	#
	28	# Note that this plugin handles frames only in a very simple way
	29	# i.e. each frame is treated as a separate document. This means
	30	# search results will contain links to individual frames rather
	31	# than linking to the top level frameset.
	32	# There may also be some problems caused by the _parent target
	33	# (it's removed by this plugin)
	34	#
[585]	35
	36	package HTMLPlug;
	37
[1435]	38	use BasPlug;
[1010]	39	use ghtml;
[1891]	40	use unicode;
[585]	41	use util;
[808]	42	use parsargv;
[8509]	43	use XMLParser;
[585]	44
	45	sub BEGIN {
[8716]	46	@HTMLPlug::ISA = ('BasPlug');
[585]	47	}
	48
[7202]	49	use strict; # every perl program should have this!
	50	no strict 'refs'; # make an exception so we can use variables as filehandles
	51
[4744]	52	my $arguments =
	53	[ { 'name' => "process_exp",
[4873]	54	'desc' => "{BasPlug.process_exp}",
[6408]	55	'type' => "regexp",
[4744]	56	'deft' => &get_default_process_exp() },
	57	{ 'name' => "block_exp",
[4873]	58	'desc' => "{BasPlug.block_exp}",
[6408]	59	'type' => 'regexp',
[4744]	60	'deft' => &get_default_block_exp() },
	61	{ 'name' => "nolinks",
[4873]	62	'desc' => "{HTMLPlug.nolinks}",
[4744]	63	'type' => "flag" },
	64	{ 'name' => "keep_head",
[4873]	65	'desc' => "{HTMLPlug.keep_head}",
[4744]	66	'type' => "flag" },
	67	{ 'name' => "no_metadata",
[4873]	68	'desc' => "{HTMLPlug.no_metadata}",
[4744]	69	'type' => "flag" },
	70	{ 'name' => "metadata_fields",
[4873]	71	'desc' => "{HTMLPlug.metadata_fields}",
[5096]	72	'type' => "string",
[4744]	73	'deft' => "Title" },
	74	{ 'name' => "hunt_creator_metadata",
[4873]	75	'desc' => "{HTMLPlug.hunt_creator_metadata}",
[4744]	76	'type' => "flag" },
	77	{ 'name' => "file_is_url",
[4873]	78	'desc' => "{HTMLPlug.file_is_url}",
[4744]	79	'type' => "flag" },
	80	{ 'name' => "assoc_files",
[4873]	81	'desc' => "{HTMLPlug.assoc_files}",
[6408]	82	'type' => "regexp",
	83	'deft' => &get_default_block_exp() },
[4744]	84	{ 'name' => "rename_assoc_files",
[4873]	85	'desc' => "{HTMLPlug.rename_assoc_files}",
[4744]	86	'type' => "flag" },
	87	{ 'name' => "title_sub",
[4873]	88	'desc' => "{HTMLPlug.title_sub}",
[4744]	89	'type' => "string",
	90	'deft' => "" },
	91	{ 'name' => "description_tags",
[4873]	92	'desc' => "{HTMLPlug.description_tags}",
[9056]	93	'type' => "flag" } ,
	94	{ 'name' => "no_strip_metadata_html",
	95	'desc' => "{HTMLPlug.no_strip_metadata_html}",
	96	'type' => "string",
	97	'deft' => "",
	98	'reqd' => "no"}
	99	];
[3540]	100
	101	my $options = { 'name' => "HTMLPlug",
[5680]	102	'desc' => "{HTMLPlug.desc}",
[6408]	103	'abstract' => "no",
[3540]	104	'inherits' => "yes",
	105	'args' => $arguments };
	106
[585]	107	sub new {
[808]	108	my $class = shift (@_);
[1435]	109	my $self = new BasPlug ($class, @_);
[5924]	110	$self->{'plugin_type'} = "HTMLPlug";
[3540]	111	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
	112	my $option_list = $self->{'option_list'};
	113	push( @{$option_list}, $options );
	114
[808]	115	if (!parsargv::parse(\@_,
	116	q^nolinks^, \$self->{'nolinks'},
	117	q^keep_head^, \$self->{'keep_head'},
	118	q^no_metadata^, \$self->{'no_metadata'},
	119	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
[1602]	120	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
[900]	121	q^w3mir^, \$self->{'w3mir'},
[2342]	122	q^file_is_url^, \$self->{'file_is_url'},
[3539]	123	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
[1220]	124	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
[1410]	125	q^title_sub/.*/^, \$self->{'title_sub'},
[2817]	126	q^description_tags^, \$self->{'description_tags'},
[9056]	127	q^no_strip_metadata_html/.*/^, \$self->{'no_strip_metadata_html'},
[1220]	128	"allow_extra_options")) {
[1244]	129
	130	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
[4873]	131	$self->print_txt_usage(""); # Use default resource bundle
[808]	132	die "\n";
	133	}
[2342]	134
	135	# retain this for backward compatibility (w3mir option was replaced by
	136	# file_is_url)
	137	if ($self->{'w3mir'}) {
	138	$self->{'file_is_url'} = 1;
	139	}
[1243]	140
[808]	141	$self->{'aux_files'} = {};
	142	$self->{'dir_num'} = 0;
	143	$self->{'file_num'} = 0;
[8509]	144
[585]	145	return bless $self, $class;
	146	}
	147
[8366]	148	# may want to use (?i)\.(gif\|jpe?g\|jpe\|png\|css\|js(?:@.*)?)$
	149	# if have eg <script language="javascript" src="img/lib.js@123">
[1243]	150	sub get_default_block_exp {
[585]	151	my $self = shift (@_);
[8914]	152
[8509]	153	return q^(?i)\.(gif\|jpe?g\|jpe\|jpg\|png\|css)$^;
[585]	154	}
	155
[1243]	156	sub get_default_process_exp {
[808]	157	my $self = shift (@_);
[8914]	158
[1403]	159	# the last option is an attempt to encode the concept of an html query ...
[3135]	160	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
[1243]	161	}
[721]	162
[8914]	163
[8509]	164	sub metadata_read {
	165	my $self = shift (@_);
	166	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[1243]	167
[8509]	168	my $outhandle = $self->{'outhandle'};
	169
	170	my $filename = $file;
	171	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
	172
	173	my ($dir) = $filename =~ /^(.?)[^\/\\]$/;
	174
	175	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	176	return undef; # can't recognise
	177	}
	178
[8914]	179	if ($self->{'smart_block'}) {
	180	# Do encoding stuff
	181	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
[8509]	182
[8914]	183	# read in file ($text will be in utf8)
	184	my $text = "";
	185	$self->read_file ($filename, $encoding, $language, \$text);
	186	$self->store_block_files (\$text, $filename);
	187	}
[8509]	188	return 1;
	189	}
	190
	191	sub store_block_files
	192	{
	193	my $self =shift (@_);
	194	my ($textref, $filename) = @_;
	195
	196	my $html_fname = $filename;
	197	my @file_blocks;
	198
	199	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
	200	my $closecom = '(?:-->\|(?:—\|\|--)>)';
	201	$$textref =~ s/$opencom(.*?)$closecom//gs;
	202
	203	my $attval = "\\\"[^\\\"]+\\\"\|[^\\s>]+";
	204	my @img_matches = ($$textref =~ m/<img[^>]?src\s=\s($attval)[^>]>/igs);
	205	my @usemap_matches = ($$textref =~ m/<img[^>]?usemap\s=\s($attval)[^>]>/igs);
	206	my @link_matches = ($$textref =~ m/<link[^>]?href\s=\s($attval)[^>]>/igs);
	207
	208
	209	foreach my $link (@img_matches, @usemap_matches, @link_matches) {
	210
	211	# remove quotes from link at start and end if necessary
	212	if ($link=~/^\"/) {
	213	$link=~s/^\"//;
	214	$link=~s/\"$//;
	215	}
	216
	217	$link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
	218
	219	if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) {
	220	# Turn relative file path into full path
	221	my $dirname = &File::Basename::dirname($filename);
	222	$link = &util::filename_cat($dirname, $link);
	223	}
	224	$link = $self->eval_dir_dots($link);
	225	$self->{'file_blocks'}->{$link} = 1;
	226	}
	227	}
	228
[1243]	229	# do plugin specific processing of doc_obj
	230	sub process {
	231	my $self = shift (@_);
[6332]	232	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[1431]	233	my $outhandle = $self->{'outhandle'};
[6332]	234
	235	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
	236
[1431]	237	print $outhandle "HTMLPlug: processing $file\n"
[808]	238	if $self->{'verbosity'} > 1;
[721]	239
[3019]	240	if ($ENV{'GSDLOS'} =~ /^windows/i) {
	241	# this makes life so much easier... perl can cope with unix-style '/'s.
	242	$base_dir =~ s@(\\)+@/@g;
	243	$file =~ s@(\\)+@/@g;
	244	}
[3148]	245
	246	# reset per-doc stuff...
	247	$self->{'aux_files'} = {};
	248	$self->{'dir_num'} = 0;
	249	$self->{'file_num'} = 0;
	250
[808]	251	my $cursection = $doc_obj->get_top_section();
[1220]	252
[8509]	253	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
[2817]	254	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
[721]	255
[808]	256	# Store URL for page as metadata - this can be used for an
	257	# altavista style search interface. The URL won't be valid
	258	# unless the file structure contains the domain name (i.e.
	259	# like when w3mir is used to download a website).
	260	my $web_url = "http://$file";
[4845]	261	$doc_obj->add_metadata($cursection, "URL", $web_url);
[721]	262
[2817]	263	if ($self->{'description_tags'}) {
[2995]	264	# remove the html header - note that doing this here means any
	265	# sections defined within the header will be lost (so all <Section>
	266	# tags must appear within the body of the HTML)
[8509]	267	my ($head_keep) = ($$textref =~ m/^(.?)<body[^>]>/is);
	268
[2995]	269	$$textref =~ s/^.?<body[^>]>//is;
	270	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	271
[2819]	272	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
	273	my $closecom = '(?:-->\|(?:—\|\|--)>)';
[8509]	274
[2819]	275	my $lt = '(?:<\|<)';
	276	my $gt = '(?:>\|>)';
	277	my $quot = '(?:"\|"\|”\|“)';
	278
[2817]	279	my $found_something = 0; my $top = 1;
[2819]	280	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
[2817]	281	my $text = $1;
	282	my $comment = $2;
	283	if (defined $text) {
[3369]	284	# text before a comment - note that getting to here
	285	# doesn't necessarily mean there are Section tags in
	286	# the document
[2817]	287	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
	288	}
[2819]	289	while ($comment =~ s/$lt(.*?)$gt//s) {
[2817]	290	my $tag = $1;
	291	if ($tag eq "Section") {
	292	$found_something = 1;
	293	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
	294	$top = 0;
	295	} elsif ($tag eq "/Section") {
	296	$found_something = 1;
	297	$cursection = $doc_obj->get_parent_section ($cursection);
[2819]	298	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
[2817]	299	my $metaname = $1;
[9053]	300	my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
[2819]	301	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
[2817]	302	my $metavalue = $1;
	303	$metavalue =~ s/^\s+//;
	304	$metavalue =~ s/\s+$//;
[2819]	305	# assume that no metadata value intentionally includes
	306	# carriage returns or HTML tags (if they're there they
	307	# were probably introduced when converting to HTML from
	308	# some other format).
[9056]	309	# actually some people want to have html tags in their
	310	# metadata.
[2819]	311	$metavalue =~ s/[\cJ\cM]/ /sg;
[9056]	312	my $dont = '';
	313	if ($self->{'no_strip_metadata_html'}) {
	314	($dont = $self->{'no_strip_metadata_html'}) =~ s{,}{\|}g;
	315	}
	316	$metavalue =~ s/<[^>]+>//sg
	317	unless $dont && ($dont eq 'all' \|\| $metaname =~ /^($dont)$/);
	318	if ($dont && ($dont eq 'all' \|\| $metaname =~ /^($dont)$/)) {
	319	}
[2819]	320	$metavalue =~ s/\s+/ /sg;
[9053]	321	if ($accumulate) {
	322	$doc_obj->add_utf8_metadata($cursection, $metaname, $metavalue);
	323	} else {
	324	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
	325	}
[2819]	326	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
	327	# do nothing with containing Description tags
	328	} else {
	329	# simple HTML tag (probably created by the conversion
	330	# to HTML from some other format) - we'll ignore it and
	331	# hope for the best ;-)
[2817]	332	}
	333	}
	334	}
	335	if ($cursection ne "") {
	336	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
	337	}
	338
	339	$$textref =~ s/^.?<body[^>]>//is;
	340	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	341	if ($$textref =~ /\S/) {
	342	if (!$found_something) {
[8509]	343	if ($self->{'verbosity'} > 2) {
	344	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
	345	print $outhandle " will be processed as a single section document\n";
	346	}
	347
[3369]	348	# go ahead and process single-section document
[3349]	349	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
[3369]	350
	351	# if document contains no Section tags we'll go ahead
	352	# and extract metadata (this won't have been done
	353	# above as the -description_tags option prevents it)
[8509]	354	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
	355	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
[3369]	356	unless $self->{'no_metadata'};
	357
[2817]	358	} else {
	359	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
	360	print $outhandle " of the final closing </Section> tag. This text will\n";
	361	print $outhandle " be ignored.";
[8509]	362
[2819]	363	my ($text);
[2817]	364	if (length($$textref) > 30) {
	365	$text = substr($$textref, 0, 30) . "...";
[2819]	366	} else {
	367	$text = $$textref;
[2817]	368	}
	369	$text =~ s/\n/ /isg;
	370	print $outhandle " ($text)\n";
	371	}
[3369]	372	} elsif (!$found_something) {
	373
[8509]	374	if ($self->{'verbosity'} > 2) {
	375	# may get to here if document contained no valid Section
	376	# tags but did contain some comments. The text will have
	377	# been processed already but we should print the warning
	378	# as above and extract metadata
	379	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n";
	380	print $outhandle " is blank or empty. Metadata will be assigned if present.\n";
	381	}
[3369]	382
[8509]	383	my $complete_text = $head_keep.$doc_obj->get_text($cursection);
	384	$self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection)
[3369]	385	unless $self->{'no_metadata'};
[2817]	386	}
	387
	388	} else {
[2995]	389
	390	# remove header and footer
	391	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
	392	$$textref =~ s/^.?<body[^>]>//is;
	393	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
	394	}
	395
[2817]	396	# single section document
	397	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
	398	}
	399	return 1;
	400	}
	401
	402	# note that process_section may be called multiple times for a single
	403	# section (relying on the fact that add_utf8_text appends the text to any
	404	# that may exist already).
	405	sub process_section {
	406	my $self = shift (@_);
	407	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
[808]	408	# trap links
	409	if (!$self->{'nolinks'}) {
[721]	410
[808]	411	# usemap="./#index" not handled correctly => change to "#index"
[1243]	412	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
[808]	413	$self->replace_usemap_links($1, $2, $3)/isge;
[721]	414
[8366]	415	$$textref =~ s/(<(?:a\|area\|frame\|link\|script)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
[897]	416	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[721]	417	}
	418
[808]	419	# trap images
[1929]	420
[2695]	421	# allow spaces if inside quotes - jrm21
	422	$$textref =~ s/(<img[^>]?src\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
[897]	423	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
[1244]	424
	425	# add text to document object
[3019]	426	# turn \ into \\ so that the rest of greenstone doesn't think there
	427	# is an escape code following. (Macro parsing loses them...)
	428	$$textref =~ s/\\/\\\\/go;
[1358]	429	$doc_obj->add_utf8_text($cursection, $$textref);
[721]	430	}
	431
[808]	432	sub replace_images {
	433	my $self = shift (@_);
[897]	434	my ($front, $link, $back, $base_dir,
	435	$file, $doc_obj, $section) = @_;
[8509]	436
[2695]	437	# remove quotes from link at start and end if necessary
	438	if ($link=~/^\"/) {
	439	$link=~s/^\"//;$link=~s/\"$//;
	440	$front.='"';
	441	$back="\"$back";
	442	}
	443
[808]	444	$link =~ s/\n/ /g;
[7949]	445
	446	# Hack to overcome Windows wv 0.7.1 bug that causes embedded images to be broken
	447	# If the Word file path has spaces in it, wv messes up and you end up with
	448	# absolute paths for the images, and without the "file://" prefix
	449	# So check for this special case and massage the data to be correct
[7966]	450	if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
	451	$link =~ s/^.*\\([^\\]+)$/$1/;
[7949]	452	}
	453
[808]	454	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[8509]	455
[6812]	456	my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
	457	my $anchor_name = $img_file;
	458	$anchor_name =~ s/^.*\///;
	459	$anchor_name = "<a name=\"$anchor_name\">";
	460
[7595]	461	return $front . $img_file . $back . $anchor_name;
[721]	462	}
	463
[808]	464	sub replace_href_links {
[585]	465	my $self = shift (@_);
[897]	466	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
[585]	467
[808]	468	# attempt to sort out targets - frames are not handled
	469	# well in this plugin and some cases will screw things
	470	# up - e.g. the _parent target (so we'll just remove
	471	# them all ;-)
	472	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
	473	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
	474	$front =~ s/target=\"?_parent\"?//is;
	475	$back =~ s/target=\"?_parent\"?//is;
[721]	476
[808]	477	return $front . $link . $back if $link =~ /^\#/s;
	478	$link =~ s/\n/ /g;
[721]	479
[808]	480	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
[1312]	481	# href may use '\'s where '/'s should be on Windows
	482	$href =~ s/\\/\//g;
[585]	483
[850]	484	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
[8509]	485
[8914]	486
[897]	487	##### leave all these links alone (they won't be picked up by intermediate
	488	##### pages). I think that's safest when dealing with frames, targets etc.
	489	##### (at least until I think of a better way to do it). Problems occur with
	490	##### mailto links from within small frames, the intermediate page is displayed
	491	##### within that frame and can't be seen. There is still potential for this to
	492	##### happen even with html pages - the solution seems to be to somehow tell
	493	##### the browser from the server side to display the page being sent (i.e.
	494	##### the intermediate page) in the top level window - I'm not sure if that's
	495	##### possible - the following line should probably be deleted if that can be done
	496	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
	497
[1605]	498
[850]	499	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
[808]	500	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
[1010]	501	&ghtml::urlsafe ($href);
[897]	502	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
[808]	503	} else {
[1686]	504	# link is to some other type of file (eg image) so we'll
[808]	505	# need to associate that file
[965]	506	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
[721]	507	}
[808]	508	}
[721]	509
[808]	510	sub add_file {
	511	my $self = shift (@_);
[965]	512	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
[808]	513	my ($newname);
[585]	514
[808]	515	my $filename = $href;
	516	$filename =~ s/^[^:]*:\/\///;
[1410]	517	$filename = &util::filename_cat($base_dir, $filename);
[3708]	518
	519	# Replace %20's in URL with a space if required. Note that the filename
	520	# may include the %20 in some situations
	521	if ($filename =~ /\%20/) {
	522	if (!-e $filename) {
	523	$filename =~ s/\%20/ /g;
	524	}
	525	}
	526
[808]	527	my ($ext) = $filename =~ /(\.[^\.]*)$/;
[965]	528
	529	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
	530	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
	531	}
[900]	532	if ($self->{'rename_assoc_files'}) {
	533	if (defined $self->{'aux_files'}->{$href}) {
	534	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
	535	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
	536	} else {
	537	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
[965]	538	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
[900]	539	$self->inc_filecount ();
	540	}
	541	$doc_obj->associate_file($filename, $newname, undef, $section);
[3148]	542	return "_httpdocimg_/$newname";
[585]	543	} else {
[900]	544	($newname) = $filename =~ /([^\/\\]*)$/;
	545	$doc_obj->associate_file($filename, $newname, undef, $section);
[1020]	546	return "_httpdocimg_/$newname";
[585]	547	}
[808]	548	}
[585]	549
[721]	550
[808]	551	sub format_link {
	552	my $self = shift (@_);
	553	my ($link, $base_dir, $file) = @_;
[585]	554
[808]	555	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
[8509]	556
[808]	557	$hash_part = "" if !defined $hash_part;
	558	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
[1424]	559	my $outhandle = $self->{'outhandle'};
	560	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
[808]	561	if $self->{'verbosity'};
	562	return ($link, "", 0);
[732]	563	}
[8509]	564
[3019]	565	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
[808]	566	my $type = $1;
[1929]	567
[808]	568	if ($link =~ /^(http\|ftp):/i) {
	569	# Turn url (using /) into file name (possibly using \ on windows)
	570	my @http_dir_split = split('/', $before_hash);
	571	$before_hash = &util::filename_cat(@http_dir_split);
[585]	572	}
	573
[808]	574	$before_hash = $self->eval_dir_dots($before_hash);
	575
	576	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
[8509]	577
[808]	578	my $rl = 0;
	579	$rl = 1 if (-e $linkfilename);
[585]	580
[808]	581	# make sure there's a slash on the end if it's a directory
	582	if ($before_hash !~ /\/$/) {
	583	$before_hash .= "/" if (-d $linkfilename);
	584	}
[585]	585
[808]	586	return ($type . $before_hash, $hash_part, $rl);
[721]	587
[808]	588	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
[3019]	589	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
[721]	590
[2342]	591	# the first directory will be the domain name if file_is_url
[808]	592	# to generate archives, otherwise we'll assume all files are
	593	# from the same site and base_dir is the root
[3019]	594
[2342]	595	if ($self->{'file_is_url'}) {
[808]	596	my @dirs = split /[\/\\]/, $file;
	597	my $domname = shift (@dirs);
	598	$before_hash = &util::filename_cat($domname, $before_hash);
[3019]	599	$before_hash =~ s@\\@/@g; # for windows
[808]	600	}
[1410]	601	else
	602	{
	603	# see if link shares directory with source document
	604	# => turn into relative link if this is so!
[1929]	605
	606	if ($ENV{'GSDLOS'} =~ /^windows/i) {
[3019]	607	# too difficult doing a pattern match with embedded '\'s...
	608	my $win_before_hash=$before_hash;
	609	$win_before_hash =~ s@(\\)+@/@g;
	610	# $base_dir is already similarly "converted" on windows.
	611	if ($win_before_hash =~ s@^$base_dir/@@o) {
	612	# if this is true, we removed a prefix
	613	$before_hash=$win_before_hash;
	614	}
[1929]	615	}
	616	else {
[3019]	617	$before_hash = &util::filename_cat("",$before_hash);
	618	$before_hash =~ s@^$base_dir/@@;
[1929]	619	}
[1410]	620	}
[808]	621	} else {
	622	# Turn relative file path into full path
	623	my $dirname = &File::Basename::dirname($file);
	624	$before_hash = &util::filename_cat($dirname, $before_hash);
	625	$before_hash = $self->eval_dir_dots($before_hash);
	626	}
[721]	627
[1410]	628	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
[808]	629	# make sure there's a slash on the end if it's a directory
	630	if ($before_hash !~ /\/$/) {
	631	$before_hash .= "/" if (-d $linkfilename);
	632	}
	633	return ("http://" . $before_hash, $hash_part, 1);
	634	} else {
	635	# mailto, news, nntp, telnet, javascript or gopher link
	636	return ($before_hash, "", 0);
	637	}
	638	}
[1605]	639
[1602]	640	sub extract_first_NNNN_characters {
	641	my $self = shift (@_);
	642	my ($textref, $doc_obj, $thissection) = @_;
	643
	644	foreach my $size (split /,/, $self->{'first'}) {
	645	my $tmptext = $$textref;
[4821]	646	# skip to the body
[1602]	647	$tmptext =~ s/.<body[^>]>//i;
[4821]	648	# remove javascript
	649	$tmptext =~ s@<script.*?</script>@ @sig;
[1602]	650	$tmptext =~ s/<[^>]*>/ /g;
	651	$tmptext =~ s/ / /g;
	652	$tmptext =~ s/^\s+//;
	653	$tmptext =~ s/\s+$//;
	654	$tmptext =~ s/\s+/ /gs;
[8767]	655	# with perl 5.6 at least, substr might segment at a multi-byte char...
	656	use utf8;
[1602]	657	$tmptext = substr ($tmptext, 0, $size);
[4821]	658	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
[1602]	659	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
	660	}
	661	}
[721]	662
[7202]	663
[808]	664	sub extract_metadata {
	665	my $self = shift (@_);
	666	my ($textref, $metadata, $doc_obj, $section) = @_;
[1605]	667	my $outhandle = $self->{'outhandle'};
[1602]	668	# if we don't want metadata, we may as well not be here ...
[1400]	669	return if (!defined $self->{'metadata_fields'});
	670
[8843]	671	# metadata fields to extract/save. 'key' is the (lowercase) name of the
	672	# html meta, 'value' is the metadata name for greenstone to use
	673	my %find_fields = ();
[7202]	674
	675	my %creator_fields = (); # short-cut for lookups
	676
	677
	678	foreach my $field (split /,/, $self->{'metadata_fields'}) {
[8225]	679	# support tag<tagname>
	680	if ($field =~ /^(.?)<(.?)>$/) {
	681	# "$2" is the user's preferred gs metadata name
	682	$find_fields{lc($1)}=$2; # lc = lowercase
[8843]	683	} else { # no <tagname> for mapping
	684	# "$field" is the user's preferred gs metadata name
	685	$find_fields{lc($field)}=$field; # lc = lowercase
[8225]	686	}
[7202]	687	}
	688
	689	if (defined $self->{'hunt_creator_metadata'} &&
	690	$self->{'hunt_creator_metadata'} == 1 ) {
	691	my @extra_fields =
	692	(
	693	'author',
	694	'author.email',
	695	'creator',
	696	'dc.creator',
	697	'dc.creator.corporatename',
	698	);
	699
	700	# add the creator_metadata fields to search for
	701	foreach my $field (@extra_fields) {
	702	$creator_fields{$field}=0; # add to lookup hash
[1602]	703	}
	704	}
[721]	705
[8509]	706
[7202]	707	# find the header in the html file, which has the meta tags
	708	$$textref =~ m@<head>(.*?)</head>@si;
	709
	710	my $html_header=$1;
	711	# go through every <meta... tag defined in the html and see if it is
	712	# one of the tags we want to match.
	713
[7235]	714	# special case for title - we want to remember if its been found
	715	my $found_title = 0;
[7202]	716	# this assumes that ">" won't appear. (I don't think it's allowed to...)
	717	$html_header =~ /^/; # match the start of the string, for \G assertion
[8509]	718
[7202]	719	while ($html_header =~ m/\G.?<meta(.?)>/sig) {
	720	my $metatag=$1;
	721	my ($tag, $value);
	722
	723	# find the tag name
	724	$metatag =~ /(?:name\|http-equiv)\s=\s([\"\'])?(.*?)\1/is;
	725	$tag=$2;
	726	# in case they're not using " or ', but they should...
	727	if (! $tag) {
[8843]	728	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[7202]	729	$tag=$1;
[808]	730	}
[1605]	731
[7202]	732	if (!defined $tag) {
	733	print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n";
[1230]	734	next;
[1190]	735	}
[1230]	736
[7202]	737	# don't need to assign this field if it was passed in from a previous
	738	# (recursive) plugin
	739	if (defined $metadata->{$tag}) {next}
	740
	741	# find the tag content
	742	$metatag =~ /content\s=\s([\"\'])?(.*?)\1/is;
	743	$value=$2;
	744	if (! $value) {
[8843]	745	$metatag =~ /(?:name\|http-equiv)\s=\s([^\s\>]+)/is;
[7202]	746	$value=$1;
	747	}
	748	if (!defined $value) {
	749	print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n";
	750	next;
	751	}
	752
	753	# clean up and add
	754	$value =~ s/\s+/ /gs;
[8794]	755	chomp($value); # remove trailing \n, if any
[7202]	756	if (exists $creator_fields{lc($tag)}) {
	757	# map this value onto greenstone's "Creator" metadata
	758	$tag='Creator';
	759	} elsif (!exists $find_fields{lc($tag)}) {
	760	next; # don't want this tag
	761	} else {
	762	# get the user's preferred capitalisation
	763	$tag = $find_fields{lc($tag)};
	764	}
[7235]	765	if (lc($tag) eq "title") {
	766	$found_title = 1;
	767	}
[7202]	768	print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
	769	if ($self->{'verbosity'} > 2);
	770	$doc_obj->add_utf8_metadata($section, $tag, $value);
	771
[808]	772	}
[7202]	773
	774	# TITLE: extract the document title
[7235]	775	if (exists $find_fields{'title'} && !$found_title) {
[7202]	776	# we want a title, and didn't find one in the meta tags
	777	# see if there's a <title> tag
	778	my $title;
[8843]	779	my $from = ""; # for debugging output only
[7235]	780	if ($html_header =~ /<title[^>]>([^<]+)<\/title[^>]>/is) {
[7202]	781	$title = $1;
[7235]	782	$from = "<title> tags";
[7202]	783	}
[8509]	784
[7202]	785	if (!defined $title) {
[7235]	786	$from = "first 100 chars";
[7202]	787	# if no title use first 100 or so characters
	788	$title = $$textref;
[8071]	789	$title =~ s/^\xFE\xFF//; # Remove unicode byte order mark
[7202]	790	$title =~ s/^.*?<body>//si;
	791	# ignore javascript!
	792	$title =~ s@<script.*?</script>@ @sig;
	793	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	794	$title =~ s/<[^>]*>/ /g; # remove all HTML tags
	795	$title = substr ($title, 0, 100);
	796	$title =~ s/\s\S*$/.../;
	797	}
	798	$title =~ s/<[^>]*>/ /g; # remove html tags
	799	$title =~ s/ / /g;
	800	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
	801	$title =~ s/\s+/ /gs; # collapse multiple spaces
	802	$title =~ s/^\s*//; # remove leading spaces
	803	$title =~ s/\s*$//; # remove trailing spaces
[8071]	804
[7202]	805	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	806	$title =~ s/^\s+//s; # in case title_sub introduced any...
	807	$doc_obj->add_utf8_metadata ($section, 'Title', $title);
[7235]	808	print $outhandle " extracted Title metadata \"$title\" from $from\n"
[7202]	809	if ($self->{'verbosity'} > 2);
[7235]	810	}
[8121]	811
	812	# add FileFormat metadata
	813	$doc_obj->add_metadata($section,"FileFormat", "HTML");
[7202]	814
	815	# Special, for metadata names such as tagH1 - extracts
	816	# the text between the first <H1> and </H1> tags into "H1" metadata.
	817
	818	foreach my $field (keys %find_fields) {
	819	if ($field !~ /^tag([a-z0-9]+)$/i) {next}
	820	my $tag = $1;
	821	if ($$textref =~ m@<$tag[^>]>(.?)</$tag[^>]*>@g) {
	822	my $content = $1;
	823	$content =~ s/ / /g;
	824	$content =~ s/<[^>]*>/ /g;
	825	$content =~ s/^\s+//;
	826	$content =~ s/\s+$//;
	827	$content =~ s/\s+/ /gs;
	828	if ($content) {
	829	$tag=$find_fields{"tag$tag"}; # get the user's capitalisation
	830	$tag =~ s/^tag//i;
	831	$doc_obj->add_utf8_metadata ($section, $tag, $content);
	832	print $outhandle " extracted \"$tag\" metadata \"$content\"\n"
	833	if ($self->{'verbosity'} > 2);
	834	}
	835	}
	836	}
[585]	837	}
	838
[1190]	839
[808]	840	# evaluate any "../" to next directory up
	841	# evaluate any "./" as here
	842	sub eval_dir_dots {
	843	my $self = shift (@_);
	844	my ($filename) = @_;
[721]	845	my $dirsep_os = &util::get_os_dirsep();
	846	my @dirsep = split(/$dirsep_os/,$filename);
	847
	848	my @eval_dirs = ();
[850]	849	foreach my $d (@dirsep) {
[808]	850	if ($d eq "..") {
[721]	851	pop(@eval_dirs);
[8509]	852
[808]	853	} elsif ($d eq ".") {
[721]	854	# do nothing!
[808]	855
	856	} else {
[721]	857	push(@eval_dirs,$d);
	858	}
[585]	859	}
	860
[8509]	861	# Need to fiddle with number of elements in @eval_dirs if the
	862	# first one is the empty string. This is because of a
	863	# modification to util::filename_cat that supresses the addition
	864	# of a leading '/' character (or \ if windows) (intended to help
	865	# filename cat with relative paths) if the first entry in the
	866	# array is the empty string. Making the array start with two
	867	# empty strings is a way to defeat this "smart" option.
	868	#
	869	if (scalar(@eval_dirs) > 0) {
	870	if ($eval_dirs[0] eq ""){
	871	unshift(@eval_dirs,"");
	872	}
	873	}
[721]	874	return &util::filename_cat(@eval_dirs);
	875	}
	876
[808]	877	sub replace_usemap_links {
	878	my $self = shift (@_);
[721]	879	my ($front, $link, $back) = @_;
	880
	881	$link =~ s/^\.\///;
[808]	882	return $front . $link . $back;
[721]	883	}
	884
[808]	885	sub inc_filecount {
	886	my $self = shift (@_);
[721]	887
[808]	888	if ($self->{'file_num'} == 1000) {
	889	$self->{'dir_num'} ++;
	890	$self->{'file_num'} = 0;
	891	} else {
	892	$self->{'file_num'} ++;
	893	}
	894	}
[721]	895
[1891]	896
	897	# Extend the BasPlug read_file so that strings like é are
	898	# converted to UTF8 internally.
	899	#
	900	# We don't convert < or > or & or " in case
	901	# they interfere with the GML files
	902
	903	sub read_file {
[2735]	904	my ($self, $filename, $encoding, $language, $textref) = @_;
[1891]	905
[2735]	906	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
[2364]	907
[3181]	908	# Convert entities to their UTF8 equivalents
[3196]	909	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
[3181]	910	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
[3196]	911	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
[1891]	912	}
	913
[585]	914	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: