Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: tags/gsdl-2_40-distribution/gsdl/perllib/plugins/HTMLPlug.pm@ 4846

Last change on this file since 4846 was 4846, checked in by (none), 21 years ago
This commit was manufactured by cvs2svn to create tag 'gsdl-2_40-distribution'.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 27.9 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	# To use frames properly you'll need to use the WebPlug plugin.
35	#
36
37
38	package HTMLPlug;
39
40	use BasPlug;
41	use ghtml;
42	use unicode;
43	use util;
44	use parsargv;
45
46	sub BEGIN {
47	@ISA = ('BasPlug');
48	}
49
50	my $arguments =
51	[ { 'name' => "process_exp",
52	'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
53	'type' => "string",
54	'deft' => &get_default_process_exp() },
55	{ 'name' => "block_exp",
56	'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
57	'type' => 'string',
58	'deft' => &get_default_block_exp() },
59	{ 'name' => "nolinks",
60	'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
61	'type' => "flag" },
62	{ 'name' => "keep_head",
63	'desc' => "Don't remove headers from html files.",
64	'type' => "flag" },
65	{ 'name' => "no_metadata",
66	'desc' => "Don't attempt to extract any metadata from files.",
67	'type' => "flag" },
68	{ 'name' => "metadata_fields",
69	'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
70	'type' => "metadata",
71	'deft' => "Title" },
72	{ 'name' => "hunt_creator_metadata",
73	'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
74	'type' => "flag" },
75	{ 'name' => "file_is_url",
76	'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
77	'type' => "flag" },
78	{ 'name' => "assoc_files",
79	'desc' => "Perl regular expression of file extensions to associate with html documents.",
80	'type' => "string",
81	'deft' => q^(?i)\.(jpe?g\|gif\|png\|css)$^ },
82	{ 'name' => "rename_assoc_files",
83	'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
84	'type' => "flag" },
85	{ 'name' => "title_sub",
86	'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
87	'type' => "string",
88	'deft' => "" },
89	{ 'name' => "description_tags",
90	'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.",
91	'type' => "flag" } ];
92
93	my $options = { 'name' => "HTMLPlug",
94	'desc' => "This plugin processes HTML files",
95	'inherits' => "yes",
96	'args' => $arguments };
97
98
99	# sub print_usage {
100	# print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
101	# print STDERR " options:\n";
102	# print STDERR " -nolinks Don't make any attempt to trap links (setting this\n";
103	# print STDERR " flag may improve speed of building/importing but\n";
104	# print STDERR " any relative links within documents will be broken).\n";
105	# print STDERR " -keep_head Don't remove headers from html files.\n";
106	# print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
107	# print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to
108	# extract. Defaults to 'Title'.
109	# Use 'tag<tagname>' to have the contents of the first
110	# <tagname> pair put in a metadata element called
111	# 'tagname'. Capitalise this as you want the metadata
112	# capitalised in Greenstone, since the tag extraction
113	# is case insensitive.\n";
114	# print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and
115	# place it in the 'Creator' field. Requires the
116	# -metadata_fields flag.\n";
117	# print STDERR " -file_is_url Set if input filenames make up url of original source
118	# documents e.g. if a web mirroring tool was used to
119	# create the import directory structure\n";
120	# print STDERR " -assoc_files Perl regular expression of file extensions to
121	# associate with html documents.
122	# Defaults to '(?i)\.(jpe?g\|gif\|png\|css)\$'\n";
123	# print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images).
124	# Also creates much shallower directory structure
125	# (useful when creating collections to go on cd-rom).\n";
126	# print STDERR " -title_sub Substitution expression to modify string stored as
127	# Title. Used by, for example, PDFPlug to remove
128	# \"Page 1\", etc from text used as the title.\n";
129	# print STDERR " -description_tags Split document into sub-sections where <Section> tags
130	# occur. Note that by setting this option you
131	# implicitly set -no_metadata, as all metadata should
132	# be included within the <Section> tags (this is only
133	# true for documents that actually contain <Section> tags
134	# however). Also, '-keep_head' will have no effect when
135	# this option is set, regardless of whether a document
136	# contains Section tags.\n";
137	# }
138
139	sub new {
140	my $class = shift (@_);
141	my $self = new BasPlug ($class, @_);
142
143	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
144	my $option_list = $self->{'option_list'};
145	push( @{$option_list}, $options );
146
147	if (!parsargv::parse(\@_,
148	q^nolinks^, \$self->{'nolinks'},
149	q^keep_head^, \$self->{'keep_head'},
150	q^no_metadata^, \$self->{'no_metadata'},
151	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
152	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
153	q^w3mir^, \$self->{'w3mir'},
154	q^file_is_url^, \$self->{'file_is_url'},
155	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
156	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
157	q^title_sub/.*/^, \$self->{'title_sub'},
158	q^description_tags^, \$self->{'description_tags'},
159	"allow_extra_options")) {
160
161	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
162	$self->print_txt_usage();
163	die "\n";
164	}
165
166	# retain this for backward compatibility (w3mir option was replaced by
167	# file_is_url)
168	if ($self->{'w3mir'}) {
169	$self->{'file_is_url'} = 1;
170	}
171
172	$self->{'aux_files'} = {};
173	$self->{'dir_num'} = 0;
174	$self->{'file_num'} = 0;
175	return bless $self, $class;
176	}
177
178
179	sub get_default_block_exp {
180	my $self = shift (@_);
181
182	return q^(?i)\.(gif\|jpe?g\|jpe\|png\|css)$^;
183	}
184
185	sub get_default_process_exp {
186	my $self = shift (@_);
187
188	# the last option is an attempt to encode the concept of an html query ...
189	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
190	}
191
192
193	# do plugin specific processing of doc_obj
194	sub process {
195	my $self = shift (@_);
196	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
197	my $outhandle = $self->{'outhandle'};
198	print $outhandle "HTMLPlug: processing $file\n"
199	if $self->{'verbosity'} > 1;
200
201	if ($ENV{'GSDLOS'} =~ /^windows/i) {
202	# this makes life so much easier... perl can cope with unix-style '/'s.
203	$base_dir =~ s@(\\)+@/@g;
204	$file =~ s@(\\)+@/@g;
205	}
206
207	# reset per-doc stuff...
208	$self->{'aux_files'} = {};
209	$self->{'dir_num'} = 0;
210	$self->{'file_num'} = 0;
211
212	my $cursection = $doc_obj->get_top_section();
213
214	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
215	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
216
217	# Store URL for page as metadata - this can be used for an
218	# altavista style search interface. The URL won't be valid
219	# unless the file structure contains the domain name (i.e.
220	# like when w3mir is used to download a website).
221	my $web_url = "http://$file";
222	$doc_obj->add_metadata($cursection, "URL", $web_url);
223
224	if ($self->{'description_tags'}) {
225
226	# remove the html header - note that doing this here means any
227	# sections defined within the header will be lost (so all <Section>
228	# tags must appear within the body of the HTML)
229	$$textref =~ s/^.?<body[^>]>//is;
230	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
231
232	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
233	my $closecom = '(?:-->\|(?:—\|\|--)>)';
234	my $lt = '(?:<\|<)';
235	my $gt = '(?:>\|>)';
236	my $quot = '(?:"\|"\|”\|“)';
237
238	my $found_something = 0; my $top = 1;
239	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
240	my $text = $1;
241	my $comment = $2;
242	if (defined $text) {
243	# text before a comment - note that getting to here
244	# doesn't necessarily mean there are Section tags in
245	# the document
246	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
247	}
248	while ($comment =~ s/$lt(.*?)$gt//s) {
249
250	my $tag = $1;
251	if ($tag eq "Section") {
252	$found_something = 1;
253	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
254	$top = 0;
255	} elsif ($tag eq "/Section") {
256	$found_something = 1;
257	$cursection = $doc_obj->get_parent_section ($cursection);
258	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
259	my $metaname = $1;
260	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
261	my $metavalue = $1;
262	$metavalue =~ s/^\s+//;
263	$metavalue =~ s/\s+$//;
264	# assume that no metadata value intentionally includes
265	# carriage returns or HTML tags (if they're there they
266	# were probably introduced when converting to HTML from
267	# some other format).
268	$metavalue =~ s/[\cJ\cM]/ /sg;
269	$metavalue =~ s/<[^>]+>//sg;
270	$metavalue =~ s/\s+/ /sg;
271	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
272	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
273	# do nothing with containing Description tags
274	} else {
275	# simple HTML tag (probably created by the conversion
276	# to HTML from some other format) - we'll ignore it and
277	# hope for the best ;-)
278	}
279	}
280	}
281	if ($cursection ne "") {
282	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
283	}
284
285	$$textref =~ s/^.?<body[^>]>//is;
286	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
287	if ($$textref =~ /\S/) {
288	if (!$found_something) {
289	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
290	print $outhandle " will be processed as a single section document\n";
291
292	# go ahead and process single-section document
293	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
294
295	# if document contains no Section tags we'll go ahead
296	# and extract metadata (this won't have been done
297	# above as the -description_tags option prevents it)
298	$self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
299	unless $self->{'no_metadata'};
300
301	} else {
302	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
303	print $outhandle " of the final closing </Section> tag. This text will\n";
304	print $outhandle " be ignored.";
305	my ($text);
306	if (length($$textref) > 30) {
307	$text = substr($$textref, 0, 30) . "...";
308	} else {
309	$text = $$textref;
310	}
311	$text =~ s/\n/ /isg;
312	print $outhandle " ($text)\n";
313	}
314	} elsif (!$found_something) {
315
316	# may get to here if document contained no valid Section
317	# tags but did contain some comments. The text will have
318	# been processed already but we should print the warning
319	# as above and extract metadata
320	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
321	print $outhandle " will be processed as a single section document\n";
322
323	$self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
324	unless $self->{'no_metadata'};
325	}
326
327	} else {
328
329	# remove header and footer
330	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
331	$$textref =~ s/^.?<body[^>]>//is;
332	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
333	}
334
335	# single section document
336	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
337	}
338	return 1;
339	}
340
341	# note that process_section may be called multiple times for a single
342	# section (relying on the fact that add_utf8_text appends the text to any
343	# that may exist already).
344	sub process_section {
345	my $self = shift (@_);
346	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
347	# trap links
348	if (!$self->{'nolinks'}) {
349
350	# usemap="./#index" not handled correctly => change to "#index"
351	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
352	$self->replace_usemap_links($1, $2, $3)/isge;
353
354	$$textref =~ s/(<(?:a\|area\|frame\|link)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
355	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
356	}
357
358	# trap images
359
360	# allow spaces if inside quotes - jrm21
361	$$textref =~ s/(<img[^>]?src\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
362	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
363
364	# add text to document object
365	# turn \ into \\ so that the rest of greenstone doesn't think there
366	# is an escape code following. (Macro parsing loses them...)
367	$$textref =~ s/\\/\\\\/go;
368	$doc_obj->add_utf8_text($cursection, $$textref);
369	}
370
371	sub replace_images {
372	my $self = shift (@_);
373	my ($front, $link, $back, $base_dir,
374	$file, $doc_obj, $section) = @_;
375	# remove quotes from link at start and end if necessary
376	if ($link=~/^\"/) {
377	$link=~s/^\"//;$link=~s/\"$//;
378	$front.='"';
379	$back="\"$back";
380	}
381
382	$link =~ s/\n/ /g;
383	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
384	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
385	}
386
387	sub replace_href_links {
388	my $self = shift (@_);
389	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
390
391	# attempt to sort out targets - frames are not handled
392	# well in this plugin and some cases will screw things
393	# up - e.g. the _parent target (so we'll just remove
394	# them all ;-)
395	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
396	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
397	$front =~ s/target=\"?_parent\"?//is;
398	$back =~ s/target=\"?_parent\"?//is;
399
400	return $front . $link . $back if $link =~ /^\#/s;
401	$link =~ s/\n/ /g;
402
403	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
404	# href may use '\'s where '/'s should be on Windows
405	$href =~ s/\\/\//g;
406
407	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
408
409	##### leave all these links alone (they won't be picked up by intermediate
410	##### pages). I think that's safest when dealing with frames, targets etc.
411	##### (at least until I think of a better way to do it). Problems occur with
412	##### mailto links from within small frames, the intermediate page is displayed
413	##### within that frame and can't be seen. There is still potential for this to
414	##### happen even with html pages - the solution seems to be to somehow tell
415	##### the browser from the server side to display the page being sent (i.e.
416	##### the intermediate page) in the top level window - I'm not sure if that's
417	##### possible - the following line should probably be deleted if that can be done
418	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
419
420
421	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
422	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
423	&ghtml::urlsafe ($href);
424	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
425
426	} else {
427	# link is to some other type of file (eg image) so we'll
428	# need to associate that file
429	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
430	}
431	}
432
433	sub add_file {
434	my $self = shift (@_);
435	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
436	my ($newname);
437
438	my $filename = $href;
439	$filename =~ s/^[^:]*:\/\///;
440	$filename = &util::filename_cat($base_dir, $filename);
441
442	# Replace %20's in URL with a space if required. Note that the filename
443	# may include the %20 in some situations
444	if ($filename =~ /\%20/) {
445	if (!-e $filename) {
446	$filename =~ s/\%20/ /g;
447	}
448	}
449
450	my ($ext) = $filename =~ /(\.[^\.]*)$/;
451
452	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
453	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
454	}
455
456	if ($self->{'rename_assoc_files'}) {
457	if (defined $self->{'aux_files'}->{$href}) {
458	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
459	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
460	} else {
461	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
462	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
463	$self->inc_filecount ();
464	}
465	$doc_obj->associate_file($filename, $newname, undef, $section);
466	return "_httpdocimg_/$newname";
467
468	} else {
469	($newname) = $filename =~ /([^\/\\]*)$/;
470	$doc_obj->associate_file($filename, $newname, undef, $section);
471	return "_httpdocimg_/$newname";
472	}
473	}
474
475
476	sub format_link {
477	my $self = shift (@_);
478	my ($link, $base_dir, $file) = @_;
479
480	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
481
482	$hash_part = "" if !defined $hash_part;
483	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
484	my $outhandle = $self->{'outhandle'};
485	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
486	if $self->{'verbosity'};
487	return ($link, "", 0);
488	}
489
490	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
491	my $type = $1;
492
493	if ($link =~ /^(http\|ftp):/i) {
494	# Turn url (using /) into file name (possibly using \ on windows)
495	my @http_dir_split = split('/', $before_hash);
496	$before_hash = &util::filename_cat(@http_dir_split);
497	}
498
499	$before_hash = $self->eval_dir_dots($before_hash);
500
501	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
502
503	my $rl = 0;
504	$rl = 1 if (-e $linkfilename);
505
506	# make sure there's a slash on the end if it's a directory
507	if ($before_hash !~ /\/$/) {
508	$before_hash .= "/" if (-d $linkfilename);
509	}
510
511	return ($type . $before_hash, $hash_part, $rl);
512
513	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
514	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
515
516	# the first directory will be the domain name if file_is_url
517	# to generate archives, otherwise we'll assume all files are
518	# from the same site and base_dir is the root
519
520	if ($self->{'file_is_url'}) {
521	my @dirs = split /[\/\\]/, $file;
522	my $domname = shift (@dirs);
523	$before_hash = &util::filename_cat($domname, $before_hash);
524	$before_hash =~ s@\\@/@g; # for windows
525	}
526	else
527	{
528	# see if link shares directory with source document
529	# => turn into relative link if this is so!
530
531	if ($ENV{'GSDLOS'} =~ /^windows/i) {
532	# too difficult doing a pattern match with embedded '\'s...
533	my $win_before_hash=$before_hash;
534	$win_before_hash =~ s@(\\)+@/@g;
535	# $base_dir is already similarly "converted" on windows.
536	if ($win_before_hash =~ s@^$base_dir/@@o) {
537	# if this is true, we removed a prefix
538	$before_hash=$win_before_hash;
539	}
540
541	}
542	else {
543	$before_hash = &util::filename_cat("",$before_hash);
544	$before_hash =~ s@^$base_dir/@@;
545	}
546
547	}
548
549	} else {
550	# Turn relative file path into full path
551	my $dirname = &File::Basename::dirname($file);
552	$before_hash = &util::filename_cat($dirname, $before_hash);
553	$before_hash = $self->eval_dir_dots($before_hash);
554	}
555
556	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
557
558	# make sure there's a slash on the end if it's a directory
559	if ($before_hash !~ /\/$/) {
560	$before_hash .= "/" if (-d $linkfilename);
561	}
562
563	return ("http://" . $before_hash, $hash_part, 1);
564
565	} else {
566	# mailto, news, nntp, telnet, javascript or gopher link
567	return ($before_hash, "", 0);
568	}
569	}
570
571	sub extract_first_NNNN_characters {
572	my $self = shift (@_);
573	my ($textref, $doc_obj, $thissection) = @_;
574
575	foreach my $size (split /,/, $self->{'first'}) {
576	my $tmptext = $$textref;
577	# skip to the body
578	$tmptext =~ s/.<body[^>]>//i;
579	# remove javascript
580	$tmptext =~ s@<script.*?</script>@ @sig;
581	$tmptext =~ s/<[^>]*>/ /g;
582	$tmptext =~ s/ / /g;
583	$tmptext =~ s/^\s+//;
584	$tmptext =~ s/\s+$//;
585	$tmptext =~ s/\s+/ /gs;
586	$tmptext = substr ($tmptext, 0, $size);
587	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
588	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
589	}
590	}
591
592	sub extract_metadata {
593	my $self = shift (@_);
594	my ($textref, $metadata, $doc_obj, $section) = @_;
595	my $outhandle = $self->{'outhandle'};
596
597	# if we don't want metadata, we may as well not be here ...
598	return if (!defined $self->{'metadata_fields'});
599
600	# hunt for an author look in the metadata elements:
601	if (defined $self->{'hunt_creator_metadata'}) {
602	for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
603	if ($$textref =~ /<meta(\s?)(?:name\|http-equiv)\s=\s\"?$name\"?([^>])/is) {
604	my $content = $1 . $2;
605	if ($content =~ /content\s=\s\"?(.*)\"?/is) {
606	if (defined $1) {
607	my $value = $1;
608	$value =~ s/\"$//;
609	$value =~ s/\s+/ /gs;
610	$doc_obj->add_utf8_metadata($section, "Creator", $value);
611	print $outhandle " extracted Creator metadata \"$value\"\n"
612	if ($self->{'verbosity'} > 2);
613	next;
614	}
615	}
616	}
617	}
618	}
619
620	foreach my $field (split /,/, $self->{'metadata_fields'}) {
621
622	# don't need to extract field if it was passed in from a previous
623	# (recursive) plugin
624	next if defined $metadata->{$field};
625
626	# see if there's a <meta> tag for this field
627	if ($$textref =~ /<meta(\s?)(?:name\|http-equiv)\s=\s\"?$field\"?([^>])/is) {
628	my $content = $1 . $2;
629	if ($content =~ /content\s=\s\"?(.*)\"?/is) {
630	if (defined $1) {
631	my $value = $1;
632	$value =~ s/\"$//;
633	$value =~ s/\s+/ /gs;
634	$value =~ s/\".*//gs;
635	$doc_obj->add_utf8_metadata($section, $field, $value);
636	print $outhandle " extracted \"$field\" metadata \"$value\"\n"
637	if ($self->{'verbosity'} > 2);
638	next;
639	}
640	}
641	}
642
643	# TITLE: extract the document title
644
645	if ($field =~ /^title$/i) {
646
647	# see if there's a <title> tag
648	if ($$textref =~ /<title[^>]>([^<])<\/title[^>]*>/is) {
649	if (defined $1) {
650	my $title = $1;
651	if ($title =~ /\w/) {
652	$title =~ s/<[^>]*>/ /g;
653	$title =~ s/ / /g;
654	$title =~ s/\s+/ /gs;
655	$title =~ s/^\s+//;
656	$title =~ s/\s+$//;
657	$doc_obj->add_utf8_metadata ($section, $field, $title);
658	print $outhandle " extracted \"$field\" metadata \"$title\"\n"
659	if ($self->{'verbosity'} > 2);
660	next;
661	}
662	}
663	}
664
665	# if no title use first 100 characters
666	my $tmptext = $$textref;
667	$tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
668	$tmptext =~ s/<[^>]*>/ /g;
669	$tmptext =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
670	$tmptext =~ s/^\s+//s;
671	$tmptext =~ s/\s+$//;
672	$tmptext =~ s/\s+/ /gs;
673	$tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
674	$tmptext =~ s/^\s+//s; # in case title_sub introduced any...
675	$tmptext = substr ($tmptext, 0, 100);
676	$tmptext =~ s/\s\S*$/.../;
677	$doc_obj->add_utf8_metadata ($section, $field, $tmptext);
678	print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
679	if ($self->{'verbosity'} > 2);
680	next;
681	}
682
683
684	# tag: extract the text between the first <H1> and </H1> tags
685	if ($field =~ /^tag[a-z0-9]+$/i) {
686
687	my $tag = $field;
688	$tag =~ s/^tag//i;
689	my $tmptext = $$textref;
690	$tmptext =~ s/\s+/ /gs;
691	if ($tmptext =~ /<$tag[^>]*>/i) {
692	foreach my $word ($tmptext =~ m/<$tag[^>]>(.?)<\/$tag[^>]*>/g) {
693	$word =~ s/ / /g;
694	$word =~ s/<[^>]*>/ /g;
695	$word =~ s/^\s+//;
696	$word =~ s/\s+$//;
697	$word =~ s/\s+/ /gs;
698	if ($word ne "") {
699	$doc_obj->add_utf8_metadata ($section, $tag, $word);
700	print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
701	if ($self->{'verbosity'} > 2);
702	}
703	}
704	}
705	next;
706	}
707	}
708	}
709
710
711	# evaluate any "../" to next directory up
712	# evaluate any "./" as here
713	sub eval_dir_dots {
714	my $self = shift (@_);
715	my ($filename) = @_;
716
717	my $dirsep_os = &util::get_os_dirsep();
718	my @dirsep = split(/$dirsep_os/,$filename);
719
720	my @eval_dirs = ();
721	foreach my $d (@dirsep) {
722	if ($d eq "..") {
723	pop(@eval_dirs);
724
725	} elsif ($d eq ".") {
726	# do nothing!
727
728	} else {
729	push(@eval_dirs,$d);
730	}
731	}
732
733	return &util::filename_cat(@eval_dirs);
734	}
735
736	sub replace_usemap_links {
737	my $self = shift (@_);
738	my ($front, $link, $back) = @_;
739
740	$link =~ s/^\.\///;
741	return $front . $link . $back;
742	}
743
744	sub inc_filecount {
745	my $self = shift (@_);
746
747	if ($self->{'file_num'} == 1000) {
748	$self->{'dir_num'} ++;
749	$self->{'file_num'} = 0;
750	} else {
751	$self->{'file_num'} ++;
752	}
753	}
754
755
756	# Extend the BasPlug read_file so that strings like é are
757	# converted to UTF8 internally.
758	#
759	# We don't convert < or > or & or " in case
760	# they interfere with the GML files
761
762	sub read_file {
763	my ($self, $filename, $encoding, $language, $textref) = @_;
764
765	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
766
767	# Convert entities to their UTF8 equivalents
768	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
769	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
770	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
771	}
772
773	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: