source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@3540

Last change on this file since 3540 was 3540, checked in by kjdon, 21 years ago:

Added John T's changes into CVS - added info to enable retrieval of usage info in XML.

###########################################################################
#
# HTMLPlug.pm -- basic html plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

#
# Note that this plugin handles frames only in a very simple way,
# i.e. each frame is treated as a separate document. This means
# search results will contain links to individual frames rather
# than linking to the top-level frameset.
# There may also be some problems caused by the _parent target
# (it's removed by this plugin).
# To use frames properly you'll need to use the WebPlug plugin.
#


package HTMLPlug;

use BasPlug;
use ghtml;
use unicode;
use util;
use parsargv;

sub BEGIN {
    @ISA = ('BasPlug');
}

my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp, e.g. HTMLPlug defaults to '(?i)\.html?\$', i.e. all documents ending in .htm or .html (case-insensitive).",
        'type' => "string",
        'deft' => q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ },
      { 'name' => "block_exp",
        'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp, e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
        'type' => "string",
        'deft' => q^(?i)\.(gif|jpe?g|png|css)$^ },
      { 'name' => "nolinks",
        'desc' => "Don't make any attempt to trap links (setting this flag may improve the speed of building/importing but any relative links within documents will be broken).",
        'type' => "flag" },
      { 'name' => "keep_head",
        'desc' => "Don't remove headers from html files.",
        'type' => "flag" },
      { 'name' => "no_metadata",
        'desc' => "Don't attempt to extract any metadata from files.",
        'type' => "flag" },
      { 'name' => "metadata_fields",
        'desc' => "Comma-separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case-insensitive.",
        'type' => "metadatum",
        'deft' => "" },
      { 'name' => "hunt_creator_metadata",
        'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
        'type' => "flag" },
      { 'name' => "file_is_url",
        'desc' => "Set if input filenames make up the URL of the original source documents, e.g. if a web mirroring tool was used to create the import directory structure.",
        'type' => "flag" },
      { 'name' => "assoc_files",
        'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'.",
        'type' => "string",
        'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ },
      { 'name' => "rename_assoc_files",
        'desc' => "Renames files associated with documents (e.g. images). Also creates a much shallower directory structure (useful when creating collections to go on CD-ROM).",
        'type' => "flag" },
      { 'name' => "title_sub",
        'desc' => "Substitution expression to modify the string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc. from text used as the title.",
        'type' => "string" },
      { 'name' => "description_tags",
        'desc' => "Split the document into sub-sections wherever <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.",
        'type' => "flag" } ];

my $options = { 'name' => "HTMLPlug",
                'desc' => "This plugin processes HTML files.",
                'inherits' => "yes",
                'args' => $arguments };
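
# For reference, a typical way to enable this plugin is a "plugin" line in a
# collection's collect.cfg, e.g. (the option values below are purely
# illustrative, not defaults):
#
#   plugin HTMLPlug -description_tags -metadata_fields Title,Author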

sub print_usage {
    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    print STDERR "  options:\n";
    print STDERR "   -nolinks                Don't make any attempt to trap links (setting this\n";
    print STDERR "                           flag may improve speed of building/importing but\n";
    print STDERR "                           any relative links within documents will be broken).\n";
    print STDERR "   -keep_head              Don't remove headers from html files.\n";
    print STDERR "   -no_metadata            Don't attempt to extract any metadata from files.\n";
    print STDERR "   -metadata_fields        Comma separated list of metadata fields to attempt to
                           extract. Defaults to 'Title'.
                           Use 'tag<tagname>' to have the contents of the first
                           <tagname> pair put in a metadata element called
                           'tagname'. Capitalise this as you want the metadata
                           capitalised in Greenstone, since the tag extraction
                           is case insensitive.\n";
    print STDERR "   -hunt_creator_metadata  Find as much metadata as possible on authorship and
                           place it in the 'Creator' field. Requires the
                           -metadata_fields flag.\n";
    print STDERR "   -file_is_url            Set if input filenames make up url of original source
                           documents e.g. if a web mirroring tool was used to
                           create the import directory structure\n";
    print STDERR "   -assoc_files            Perl regular expression of file extensions to
                           associate with html documents.
                           Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
    print STDERR "   -rename_assoc_files     Renames files associated with documents (e.g. images).
                           Also creates much shallower directory structure
                           (useful when creating collections to go on cd-rom).\n";
    print STDERR "   -title_sub              Substitution expression to modify string stored as
                           Title. Used by, for example, PDFPlug to remove
                           \"Page 1\", etc from text used as the title.\n";
    print STDERR "   -description_tags       Split document into sub-sections where <Section> tags
                           occur. Note that by setting this option you
                           implicitly set -no_metadata, as all metadata should
                           be included within the <Section> tags (this is only
                           true for documents that actually contain <Section> tags
                           however). Also, '-keep_head' will have no effect when
                           this option is set, regardless of whether a document
                           contains Section tags.\n";
}

sub new {
    my $class = shift (@_);
    my $self = new BasPlug ($class, @_);

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         q^nolinks^, \$self->{'nolinks'},
                         q^keep_head^, \$self->{'keep_head'},
                         q^no_metadata^, \$self->{'no_metadata'},
                         q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
                         q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
                         q^w3mir^, \$self->{'w3mir'},
                         q^file_is_url^, \$self->{'file_is_url'},
                         q^assoc_files/.*/(?i)\.(jpe?g|jpe|gif|png|css)$^, \$self->{'assoc_files'},
                         q^rename_assoc_files^, \$self->{'rename_assoc_files'},
                         q^title_sub/.*/^, \$self->{'title_sub'},
                         q^description_tags^, \$self->{'description_tags'},
                         "allow_extra_options")) {

        print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
        &print_usage();
        die "\n";
    }

    # retain this for backward compatibility (w3mir option was replaced by
    # file_is_url)
    if ($self->{'w3mir'}) {
        $self->{'file_is_url'} = 1;
    }

    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;
    return bless $self, $class;
}


sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)\.(gif|jpe?g|jpe|png|css)$^;
}

sub get_default_process_exp {
    my $self = shift (@_);

    # the last option is an attempt to encode the concept of an html query ...
    return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^;
}


# do plugin-specific processing of doc_obj
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};
    print $outhandle "HTMLPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    if ($ENV{'GSDLOS'} =~ /^windows/i) {
        # this makes life so much easier... perl can cope with unix-style '/'s.
        $base_dir =~ s@(\\)+@/@g;
        $file =~ s@(\\)+@/@g;
    }

    # reset per-doc stuff...
    $self->{'aux_files'} = {};
    $self->{'dir_num'} = 0;
    $self->{'file_num'} = 0;

    my $cursection = $doc_obj->get_top_section();

    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
        unless $self->{'no_metadata'} || $self->{'description_tags'};

    # Store the URL for the page as metadata - this can be used for an
    # altavista-style search interface. The URL won't be valid
    # unless the file structure contains the domain name (i.e.
    # as when w3mir is used to download a website).
    my $web_url = "http://$file";
    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);

    if ($self->{'description_tags'}) {

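        # For reference, the markup this branch expects looks roughly like
        # the sketch below (inferred from the regular expressions that
        # follow, not copied from any particular collection). The tags sit
        # inside HTML comments so that browsers ignore them:
        #
        #   <!--
        #   <Section>
        #     <Description>
        #       <Metadata name="Title">Chapter 1</Metadata>
        #     </Description>
        #   -->
        #   ...section text, possibly containing further <Section> blocks...
        #   <!--
        #   </Section>
        #   -->
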
        # remove the html header - note that doing this here means any
        # sections defined within the header will be lost (so all <Section>
        # tags must appear within the body of the HTML)
        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;

        my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
        my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
        my $lt = '(?:<|&lt;)';
        my $gt = '(?:>|&gt;)';
        my $quot = '(?:"|&quot;|&rdquo;|&ldquo;)';

        my $found_something = 0; my $top = 1;
        while ($$textref =~ s/^(.*?)$opencom(.*?)$closecom//s) {
            my $text = $1;
            my $comment = $2;
            if (defined $text) {
                # text before a comment - note that getting to here
                # doesn't necessarily mean there are Section tags in
                # the document
                $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
            }
            while ($comment =~ s/$lt(.*?)$gt//s) {

                my $tag = $1;
                if ($tag eq "Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
                    $top = 0;
                } elsif ($tag eq "/Section") {
                    $found_something = 1;
                    $cursection = $doc_obj->get_parent_section ($cursection);
                } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
                    my $metaname = $1;
                    $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
                    my $metavalue = $1;
                    $metavalue =~ s/^\s+//;
                    $metavalue =~ s/\s+$//;
                    # assume that no metadata value intentionally includes
                    # carriage returns or HTML tags (if they're there they
                    # were probably introduced when converting to HTML from
                    # some other format).
                    $metavalue =~ s/[\cJ\cM]/ /sg;
                    $metavalue =~ s/<[^>]+>//sg;
                    $metavalue =~ s/\s+/ /sg;
                    $doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
                } elsif ($tag eq "Description" || $tag eq "/Description") {
                    # do nothing with containing Description tags
                } else {
                    # simple HTML tag (probably created by the conversion
                    # to HTML from some other format) - we'll ignore it and
                    # hope for the best ;-)
                }
            }
        }
        if ($cursection ne "") {
            print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
        }

        $$textref =~ s/^.*?<body[^>]*>//is;
        $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        if ($$textref =~ /\S/) {
            if (!$found_something) {
                print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
                print $outhandle "          will be processed as a single section document\n";

                # go ahead and process the single-section document
                $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);

                # if the document contains no Section tags we'll go ahead
                # and extract metadata (this won't have been done
                # above as the -description_tags option prevents it)
                $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
                    unless $self->{'no_metadata'};

            } else {
                print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
                print $outhandle "          of the final closing </Section> tag. This text will\n";
                print $outhandle "          be ignored.";
                my ($text);
                if (length($$textref) > 30) {
                    $text = substr($$textref, 0, 30) . "...";
                } else {
                    $text = $$textref;
                }
                $text =~ s/\n/ /isg;
                print $outhandle " ($text)\n";
            }
        } elsif (!$found_something) {

            # we may get to here if the document contained no valid Section
            # tags but did contain some comments. The text will have
            # been processed already but we should print the warning
            # as above and extract metadata
            print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
            print $outhandle "          will be processed as a single section document\n";

            $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
                unless $self->{'no_metadata'};
        }

    } else {

        # remove the header and footer
        if (!$self->{'keep_head'} || $self->{'description_tags'}) {
            $$textref =~ s/^.*?<body[^>]*>//is;
            $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
        }

        # single section document
        $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    }
    return 1;
}

# note that process_section may be called multiple times for a single
# section (relying on the fact that add_utf8_text appends the text to any
# that may exist already).
sub process_section {
    my $self = shift (@_);
    my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
    # trap links
    if (!$self->{'nolinks'}) {

        # usemap="./#index" not handled correctly => change to "#index"
        $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_usemap_links($1, $2, $3)/isge;

        $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?\s*(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
            $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    }

    # trap images

    # allow spaces if inside quotes - jrm21
    $$textref =~ s/(<img[^>]*?src\s*=\s*)(\"[^\"]+\"|[^\s>]+)([^>]*>)/
        $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;

    # add text to document object
    # turn \ into \\ so that the rest of greenstone doesn't think there
    # is an escape code following. (Macro parsing loses them...)
    $$textref =~ s/\\/\\\\/go;
    $doc_obj->add_utf8_text($cursection, $$textref);
}

sub replace_images {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir,
        $file, $doc_obj, $section) = @_;

    # remove quotes from link at start and end if necessary
    if ($link =~ /^\"/) {
        $link =~ s/^\"//;
        $link =~ s/\"$//;
        $front .= '"';
        $back = "\"$back";
    }

    $link =~ s/\n/ /g;
    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
}

sub replace_href_links {
    my $self = shift (@_);
    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;

    # attempt to sort out targets - frames are not handled
    # well in this plugin and some cases will screw things
    # up - e.g. the _parent target (so we'll just remove
    # them all ;-)
    $front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
    $front =~ s/target=\"?_parent\"?//is;
    $back =~ s/target=\"?_parent\"?//is;

    return $front . $link . $back if $link =~ /^\#/s;
    $link =~ s/\n/ /g;

    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    # href may use '\'s where '/'s should be on Windows
    $href =~ s/\\/\//g;

    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;

    ##### leave all these links alone (they won't be picked up by intermediate
    ##### pages). I think that's safest when dealing with frames, targets etc.
    ##### (at least until I think of a better way to do it). Problems occur with
    ##### mailto links from within small frames, the intermediate page is displayed
    ##### within that frame and can't be seen. There is still potential for this to
    ##### happen even with html pages - the solution seems to be to somehow tell
    ##### the browser from the server side to display the page being sent (i.e.
    ##### the intermediate page) in the top level window - I'm not sure if that's
    ##### possible - the following line should probably be deleted if that can be done
    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;

    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
        ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
        &ghtml::urlsafe ($href);
        return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;

    } else {
        # the link is to some other type of file (e.g. an image) so we'll
        # need to associate that file
        return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
    }
}

sub add_file {
    my $self = shift (@_);
    my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
    my ($newname);

    my $filename = $href;
    $filename =~ s/^[^:]*:\/\///;
    $filename = &util::filename_cat($base_dir, $filename);

    my ($ext) = $filename =~ /(\.[^\.]*)$/;

    if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
        return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    }

    if ($self->{'rename_assoc_files'}) {
        if (defined $self->{'aux_files'}->{$href}) {
            $newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
                $self->{'aux_files'}->{$href}->{'file_num'} . $ext;
        } else {
            $newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
            $self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
            $self->inc_filecount ();
        }
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";

    } else {
        ($newname) = $filename =~ /([^\/\\]*)$/;
        $doc_obj->associate_file($filename, $newname, undef, $section);
        return "_httpdocimg_/$newname";
    }
}

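# format_link returns ($href, $hash_part, $rl). Reading the code below (this
# summary is not from any official documentation): $rl ends up as 1 for links
# that look local to the mirrored site - relative links, or absolute
# http/ftp/file URLs whose target exists under $base_dir - and as 0 for
# mailto/news/etc. links or badly formatted ones.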
sub format_link {
    my $self = shift (@_);
    my ($link, $base_dir, $file) = @_;

    my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;

    $hash_part = "" if !defined $hash_part;
    if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
            if $self->{'verbosity'};
        return ($link, "", 0);
    }

    if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) {
        my $type = $1;

        if ($link =~ /^(http|ftp):/i) {
            # Turn the url (using /) into a file name (possibly using \ on windows)
            my @http_dir_split = split('/', $before_hash);
            $before_hash = &util::filename_cat(@http_dir_split);
        }

        $before_hash = $self->eval_dir_dots($before_hash);

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        my $rl = 0;
        $rl = 1 if (-e $linkfilename);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ($type . $before_hash, $hash_part, $rl);

    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i) {
        if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {

            # the first directory will be the domain name if file_is_url
            # was used to generate the archives, otherwise we'll assume all
            # files are from the same site and base_dir is the root

            if ($self->{'file_is_url'}) {
                my @dirs = split /[\/\\]/, $file;
                my $domname = shift (@dirs);
                $before_hash = &util::filename_cat($domname, $before_hash);
                $before_hash =~ s@\\@/@g; # for windows
            }
            else {
                # see if the link shares a directory with the source document
                # => turn it into a relative link if this is so!

                if ($ENV{'GSDLOS'} =~ /^windows/i) {
                    # too difficult doing a pattern match with embedded '\'s...
                    my $win_before_hash = $before_hash;
                    $win_before_hash =~ s@(\\)+@/@g;
                    # $base_dir is already similarly "converted" on windows.
                    if ($win_before_hash =~ s@^$base_dir/@@o) {
                        # if this is true, we removed a prefix
                        $before_hash = $win_before_hash;
                    }
                }
                else {
                    $before_hash = &util::filename_cat("", $before_hash);
                    $before_hash =~ s@^$base_dir/@@;
                }
            }

        } else {
            # Turn the relative file path into a full path
            my $dirname = &File::Basename::dirname($file);
            $before_hash = &util::filename_cat($dirname, $before_hash);
            $before_hash = $self->eval_dir_dots($before_hash);
        }

        my $linkfilename = &util::filename_cat ($base_dir, $before_hash);

        # make sure there's a slash on the end if it's a directory
        if ($before_hash !~ /\/$/) {
            $before_hash .= "/" if (-d $linkfilename);
        }

        return ("http://" . $before_hash, $hash_part, 1);

    } else {
        # mailto, news, nntp, telnet, javascript or gopher link
        return ($before_hash, "", 0);
    }
}

sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = $$textref;
        $tmptext =~ s/.*<body[^>]*>//i;
        $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'});
        $tmptext =~ s/<[^>]*>/ /g;
        $tmptext =~ s/&nbsp;/ /g;
        $tmptext =~ s/^\s+//;
        $tmptext =~ s/\s+$//;
        $tmptext =~ s/\s+/ /gs;
        $tmptext = substr ($tmptext, 0, $size);
        $tmptext =~ s/\s\S*$/&#8230;/;
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}

sub extract_metadata {
    my $self = shift (@_);
    my ($textref, $metadata, $doc_obj, $section) = @_;
    my $outhandle = $self->{'outhandle'};

    # if we don't want metadata, we may as well not be here ...
    return if (!defined $self->{'metadata_fields'});

    # hunt for an author: look in the metadata elements
    if (defined $self->{'hunt_creator_metadata'}) {
        for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
            if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
                my $content = $1 . $2;
                if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                    if (defined $1) {
                        my $value = $1;
                        $value =~ s/\"$//;
                        $value =~ s/\s+/ /gs;
                        $doc_obj->add_utf8_metadata($section, "Creator", $value);
                        print $outhandle " extracted Creator metadata \"$value\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }
        }
    }

    foreach my $field (split /,/, $self->{'metadata_fields'}) {

        # don't need to extract the field if it was passed in from a previous
        # (recursive) plugin
        next if defined $metadata->{$field};

        # see if there's a <meta> tag for this field
        if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/is) {
            my $content = $1 . $2;
            if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
                if (defined $1) {
                    my $value = $1;
                    $value =~ s/\"$//;
                    $value =~ s/\s+/ /gs;
                    $value =~ s/\".*//gs;
                    $doc_obj->add_utf8_metadata($section, $field, $value);
                    print $outhandle " extracted \"$field\" metadata \"$value\"\n"
                        if ($self->{'verbosity'} > 2);
                    next;
                }
            }
        }

        # TITLE: extract the document title

        if ($field =~ /^title$/i) {

            # see if there's a <title> tag
            if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
                if (defined $1) {
                    my $title = $1;
                    if ($title =~ /\w/) {
                        $title =~ s/<[^>]*>/ /g;
                        $title =~ s/&nbsp;/ /g;
                        $title =~ s/\s+/ /gs;
                        $title =~ s/^\s+//;
                        $title =~ s/\s+$//;
                        $doc_obj->add_utf8_metadata ($section, $field, $title);
                        print $outhandle " extracted \"$field\" metadata \"$title\"\n"
                            if ($self->{'verbosity'} > 2);
                        next;
                    }
                }
            }

            # if there's no title use the first 100 characters
            my $tmptext = $$textref;
            $tmptext =~ s/<\/([^>]+)><\1>//g; # (e.g.) </b><b> - no space
            $tmptext =~ s/<[^>]*>/ /g;
            $tmptext =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for &nbsp;...
            $tmptext =~ s/^\s+//s;
            $tmptext =~ s/\s+$//;
            $tmptext =~ s/\s+/ /gs;
            $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
            $tmptext =~ s/^\s+//s; # in case title_sub introduced any...
            $tmptext = substr ($tmptext, 0, 100);
            $tmptext =~ s/\s\S*$/.../;
            $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
            print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
                if ($self->{'verbosity'} > 2);
            next;
        }

        # tag<tagname>: extract the text between each <tagname> and </tagname>
        # pair (e.g. between <H1> and </H1>)
        if ($field =~ /^tag[a-z0-9]+$/i) {

            my $tag = $field;
            $tag =~ s/^tag//i;
            my $tmptext = $$textref;
            $tmptext =~ s/\s+/ /gs;
            if ($tmptext =~ /<$tag[^>]*>/i) {
                foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) {
                    $word =~ s/&nbsp;/ /g;
                    $word =~ s/<[^>]*>/ /g;
                    $word =~ s/^\s+//;
                    $word =~ s/\s+$//;
                    $word =~ s/\s+/ /gs;
                    if ($word ne "") {
                        $doc_obj->add_utf8_metadata ($section, $tag, $word);
                        print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
                            if ($self->{'verbosity'} > 2);
                    }
                }
            }
            next;
        }
    }
}


# evaluate any "../" to the next directory up
# evaluate any "./" as here
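#
# e.g. (assuming "/" as the directory separator):
#   "tmp/dir1/dir2/../../file.html"  =>  "tmp/file.html"
#   "./a/./b/file.html"              =>  "a/b/file.html"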
sub eval_dir_dots {
    my $self = shift (@_);
    my ($filename) = @_;

    my $dirsep_os = &util::get_os_dirsep();
    my @dirsep = split(/$dirsep_os/, $filename);

    my @eval_dirs = ();
    foreach my $d (@dirsep) {
        if ($d eq "..") {
            pop(@eval_dirs);

        } elsif ($d eq ".") {
            # do nothing!

        } else {
            push(@eval_dirs, $d);
        }
    }

    return &util::filename_cat(@eval_dirs);
}

sub replace_usemap_links {
    my $self = shift (@_);
    my ($front, $link, $back) = @_;

    $link =~ s/^\.\///;
    return $front . $link . $back;
}

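# Counter used to name associated files when -rename_assoc_files is set:
# each file becomes <dir_num>/<file_num><ext> (e.g. 0/0.gif, 0/1.jpg, ...),
# with dir_num bumped after roughly a thousand files so that no single
# directory grows too large.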
sub inc_filecount {
    my $self = shift (@_);

    if ($self->{'file_num'} == 1000) {
        $self->{'dir_num'}++;
        $self->{'file_num'} = 0;
    } else {
        $self->{'file_num'}++;
    }
}


# Extend the BasPlug read_file so that strings like &eacute; are
# converted to UTF8 internally.
#
# We don't convert &lt; or &gt; or &amp; or &quot; in case
# they interfere with the GML files
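#
# e.g. "caf&eacute;" ends up as "café" (the UTF-8 bytes 0xC3 0xA9) in the
# text, while "&lt;b&gt;" is left untouched.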

sub read_file {
    my ($self, $filename, $encoding, $language, $textref) = @_;

    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);

    # Convert entities to their UTF8 equivalents
    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
}

1;