Context Navigation

source: trunk/gsdl/perllib/plugins/HTMLPlug.pm@ 6649

Last change on this file since 6649 was 6649, checked in by kjdon, 20 years ago
changed the regex for getting info out of meta tags so it now works if the content attribute comes before the name attribute
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 26.7 KB

Line
1	###########################################################################
2	#
3	# HTMLPlug.pm -- basic html plugin
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	#
28	# Note that this plugin handles frames only in a very simple way
29	# i.e. each frame is treated as a separate document. This means
30	# search results will contain links to individual frames rather
31	# than linking to the top level frameset.
32	# There may also be some problems caused by the _parent target
33	# (it's removed by this plugin)
34	# To use frames properly you'll need to use the WebPlug plugin.
35	#
36
37
38	package HTMLPlug;
39
40	use BasPlug;
41	use ghtml;
42	use unicode;
43	use util;
44	use parsargv;
45
46	sub BEGIN {
47	@ISA = ('BasPlug');
48	}
49
50	my $arguments =
51	[ { 'name' => "process_exp",
52	'desc' => "{BasPlug.process_exp}",
53	'type' => "regexp",
54	'deft' => &get_default_process_exp() },
55	{ 'name' => "block_exp",
56	'desc' => "{BasPlug.block_exp}",
57	'type' => 'regexp',
58	'deft' => &get_default_block_exp() },
59	{ 'name' => "nolinks",
60	'desc' => "{HTMLPlug.nolinks}",
61	'type' => "flag" },
62	{ 'name' => "keep_head",
63	'desc' => "{HTMLPlug.keep_head}",
64	'type' => "flag" },
65	{ 'name' => "no_metadata",
66	'desc' => "{HTMLPlug.no_metadata}",
67	'type' => "flag" },
68	{ 'name' => "metadata_fields",
69	'desc' => "{HTMLPlug.metadata_fields}",
70	'type' => "string",
71	'deft' => "Title" },
72	{ 'name' => "hunt_creator_metadata",
73	'desc' => "{HTMLPlug.hunt_creator_metadata}",
74	'type' => "flag" },
75	{ 'name' => "file_is_url",
76	'desc' => "{HTMLPlug.file_is_url}",
77	'type' => "flag" },
78	{ 'name' => "assoc_files",
79	'desc' => "{HTMLPlug.assoc_files}",
80	'type' => "regexp",
81	'deft' => &get_default_block_exp() },
82	{ 'name' => "rename_assoc_files",
83	'desc' => "{HTMLPlug.rename_assoc_files}",
84	'type' => "flag" },
85	{ 'name' => "title_sub",
86	'desc' => "{HTMLPlug.title_sub}",
87	'type' => "string",
88	'deft' => "" },
89	{ 'name' => "description_tags",
90	'desc' => "{HTMLPlug.description_tags}",
91	'type' => "flag" } ];
92
93	my $options = { 'name' => "HTMLPlug",
94	'desc' => "{HTMLPlug.desc}",
95	'abstract' => "no",
96	'inherits' => "yes",
97	'args' => $arguments };
98
99
100	# sub print_usage {
101	# print STDERR "\n usage: plugin HTMLPlug [options]\n\n";
102	# print STDERR " options:\n";
103	# print STDERR " -nolinks Don't make any attempt to trap links (setting this\n";
104	# print STDERR " flag may improve speed of building/importing but\n";
105	# print STDERR " any relative links within documents will be broken).\n";
106	# print STDERR " -keep_head Don't remove headers from html files.\n";
107	# print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";
108	# print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to
109	# extract. Defaults to 'Title'.
110	# Use 'tag<tagname>' to have the contents of the first
111	# <tagname> pair put in a metadata element called
112	# 'tagname'. Capitalise this as you want the metadata
113	# capitalised in Greenstone, since the tag extraction
114	# is case insensitive.\n";
115	# print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and
116	# place it in the 'Creator' field. Requires the
117	# -metadata_fields flag.\n";
118	# print STDERR " -file_is_url Set if input filenames make up url of original source
119	# documents e.g. if a web mirroring tool was used to
120	# create the import directory structure\n";
121	# print STDERR " -assoc_files Perl regular expression of file extensions to
122	# associate with html documents.
123	# Defaults to '(?i)\.(jpe?g\|gif\|png\|css)\$'\n";
124	# print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images).
125	# Also creates much shallower directory structure
126	# (useful when creating collections to go on cd-rom).\n";
127	# print STDERR " -title_sub Substitution expression to modify string stored as
128	# Title. Used by, for example, PDFPlug to remove
129	# \"Page 1\", etc from text used as the title.\n";
130	# print STDERR " -description_tags Split document into sub-sections where <Section> tags
131	# occur. Note that by setting this option you
132	# implicitly set -no_metadata, as all metadata should
133	# be included within the <Section> tags (this is only
134	# true for documents that actually contain <Section> tags
135	# however). Also, '-keep_head' will have no effect when
136	# this option is set, regardless of whether a document
137	# contains Section tags.\n";
138	# }
139
140	sub new {
141	my $class = shift (@_);
142	my $self = new BasPlug ($class, @_);
143	$self->{'plugin_type'} = "HTMLPlug";
144	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
145	my $option_list = $self->{'option_list'};
146	push( @{$option_list}, $options );
147
148	if (!parsargv::parse(\@_,
149	q^nolinks^, \$self->{'nolinks'},
150	q^keep_head^, \$self->{'keep_head'},
151	q^no_metadata^, \$self->{'no_metadata'},
152	q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
153	q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
154	q^w3mir^, \$self->{'w3mir'},
155	q^file_is_url^, \$self->{'file_is_url'},
156	q^assoc_files/.*/(?i)\.(jpe?g\|jpe\|gif\|png\|css)$^, \$self->{'assoc_files'},
157	q^rename_assoc_files^, \$self->{'rename_assoc_files'},
158	q^title_sub/.*/^, \$self->{'title_sub'},
159	q^description_tags^, \$self->{'description_tags'},
160	"allow_extra_options")) {
161
162	print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
163	$self->print_txt_usage(""); # Use default resource bundle
164	die "\n";
165	}
166
167	# retain this for backward compatibility (w3mir option was replaced by
168	# file_is_url)
169	if ($self->{'w3mir'}) {
170	$self->{'file_is_url'} = 1;
171	}
172
173	$self->{'aux_files'} = {};
174	$self->{'dir_num'} = 0;
175	$self->{'file_num'} = 0;
176	return bless $self, $class;
177	}
178
179
180	sub get_default_block_exp {
181	my $self = shift (@_);
182
183	return q^(?i)\.(gif\|jpe?g\|jpe\|png\|css)$^;
184	}
185
186	sub get_default_process_exp {
187	my $self = shift (@_);
188
189	# the last option is an attempt to encode the concept of an html query ...
190	return q^(?i)(\.html?\|\.shtml\|\.shm\|\.asp\|\.php\d?\|\.cgi\|.+\?.+=.*)$^;
191	}
192
193
194	# do plugin specific processing of doc_obj
195	sub process {
196	my $self = shift (@_);
197	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
198	my $outhandle = $self->{'outhandle'};
199
200	print STDERR "<Processing n='$file' p='HTMLPlug'>\n" if ($gli);
201
202	print $outhandle "HTMLPlug: processing $file\n"
203	if $self->{'verbosity'} > 1;
204
205	if ($ENV{'GSDLOS'} =~ /^windows/i) {
206	# this makes life so much easier... perl can cope with unix-style '/'s.
207	$base_dir =~ s@(\\)+@/@g;
208	$file =~ s@(\\)+@/@g;
209	}
210
211	# reset per-doc stuff...
212	$self->{'aux_files'} = {};
213	$self->{'dir_num'} = 0;
214	$self->{'file_num'} = 0;
215
216	my $cursection = $doc_obj->get_top_section();
217
218	$self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
219	unless $self->{'no_metadata'} \|\| $self->{'description_tags'};
220
221	# Store URL for page as metadata - this can be used for an
222	# altavista style search interface. The URL won't be valid
223	# unless the file structure contains the domain name (i.e.
224	# like when w3mir is used to download a website).
225	my $web_url = "http://$file";
226	$doc_obj->add_metadata($cursection, "URL", $web_url);
227
228	if ($self->{'description_tags'}) {
229
230	# remove the html header - note that doing this here means any
231	# sections defined within the header will be lost (so all <Section>
232	# tags must appear within the body of the HTML)
233	$$textref =~ s/^.?<body[^>]>//is;
234	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
235
236	my $opencom = '(?:<!--\|<!(?:—\|\|--))';
237	my $closecom = '(?:-->\|(?:—\|\|--)>)';
238	my $lt = '(?:<\|<)';
239	my $gt = '(?:>\|>)';
240	my $quot = '(?:"\|"\|”\|“)';
241
242	my $found_something = 0; my $top = 1;
243	while ($$textref =~ s/^(.?)$opencom(.?)$closecom//s) {
244	my $text = $1;
245	my $comment = $2;
246	if (defined $text) {
247	# text before a comment - note that getting to here
248	# doesn't necessarily mean there are Section tags in
249	# the document
250	$self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
251	}
252	while ($comment =~ s/$lt(.*?)$gt//s) {
253
254	my $tag = $1;
255	if ($tag eq "Section") {
256	$found_something = 1;
257	$cursection = $doc_obj->insert_section($doc_obj->get_end_child($cursection)) unless $top;
258	$top = 0;
259	} elsif ($tag eq "/Section") {
260	$found_something = 1;
261	$cursection = $doc_obj->get_parent_section ($cursection);
262	} elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
263	my $metaname = $1;
264	$comment =~ s/^(.*?)$lt\/Metadata$gt//s;
265	my $metavalue = $1;
266	$metavalue =~ s/^\s+//;
267	$metavalue =~ s/\s+$//;
268	# assume that no metadata value intentionally includes
269	# carriage returns or HTML tags (if they're there they
270	# were probably introduced when converting to HTML from
271	# some other format).
272	$metavalue =~ s/[\cJ\cM]/ /sg;
273	$metavalue =~ s/<[^>]+>//sg;
274	$metavalue =~ s/\s+/ /sg;
275	$doc_obj->set_utf8_metadata_element($cursection, $metaname, $metavalue);
276	} elsif ($tag eq "Description" \|\| $tag eq "/Description") {
277	# do nothing with containing Description tags
278	} else {
279	# simple HTML tag (probably created by the conversion
280	# to HTML from some other format) - we'll ignore it and
281	# hope for the best ;-)
282	}
283	}
284	}
285	if ($cursection ne "") {
286	print $outhandle "HTMLPlug: WARNING: $file contains unmatched <Section></Section> tags\n";
287	}
288
289	$$textref =~ s/^.?<body[^>]>//is;
290	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
291	if ($$textref =~ /\S/) {
292	if (!$found_something) {
293	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
294	print $outhandle " will be processed as a single section document\n";
295
296	# go ahead and process single-section document
297	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
298
299	# if document contains no Section tags we'll go ahead
300	# and extract metadata (this won't have been done
301	# above as the -description_tags option prevents it)
302	$self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
303	unless $self->{'no_metadata'};
304
305	} else {
306	print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n";
307	print $outhandle " of the final closing </Section> tag. This text will\n";
308	print $outhandle " be ignored.";
309	my ($text);
310	if (length($$textref) > 30) {
311	$text = substr($$textref, 0, 30) . "...";
312	} else {
313	$text = $$textref;
314	}
315	$text =~ s/\n/ /isg;
316	print $outhandle " ($text)\n";
317	}
318	} elsif (!$found_something) {
319
320	# may get to here if document contained no valid Section
321	# tags but did contain some comments. The text will have
322	# been processed already but we should print the warning
323	# as above and extract metadata
324	print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n";
325	print $outhandle " will be processed as a single section document\n";
326
327	$self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection)
328	unless $self->{'no_metadata'};
329	}
330
331	} else {
332
333	# remove header and footer
334	if (!$self->{'keep_head'} \|\| $self->{'description_tags'}) {
335	$$textref =~ s/^.?<body[^>]>//is;
336	$$textref =~ s/(<\/body[^>]>\|<\/html[^>]>)//isg;
337	}
338
339	# single section document
340	$self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
341	}
342	return 1;
343	}
344
345	# note that process_section may be called multiple times for a single
346	# section (relying on the fact that add_utf8_text appends the text to any
347	# that may exist already).
348	sub process_section {
349	my $self = shift (@_);
350	my ($textref, $base_dir, $file, $doc_obj, $cursection) = @_;
351	# trap links
352	if (!$self->{'nolinks'}) {
353
354	# usemap="./#index" not handled correctly => change to "#index"
355	$$textref =~ s/(<img[^>]?usemap\s=\s\"?)([^\">\s]+)(\"?[^>]>)/
356	$self->replace_usemap_links($1, $2, $3)/isge;
357
358	$$textref =~ s/(<(?:a\|area\|frame\|link)\s+[^>]?\s(?:href\|src)\s=\s\"?)([^\">\s]+)(\"?[^>]*>)/
359	$self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
360	}
361
362	# trap images
363
364	# allow spaces if inside quotes - jrm21
365	$$textref =~ s/(<img[^>]?src\s=\s)(\"[^\"]+\"\|[^\s>]+)([^>]>)/
366	$self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
367
368	# add text to document object
369	# turn \ into \\ so that the rest of greenstone doesn't think there
370	# is an escape code following. (Macro parsing loses them...)
371	$$textref =~ s/\\/\\\\/go;
372	$doc_obj->add_utf8_text($cursection, $$textref);
373	}
374
375	sub replace_images {
376	my $self = shift (@_);
377	my ($front, $link, $back, $base_dir,
378	$file, $doc_obj, $section) = @_;
379	# remove quotes from link at start and end if necessary
380	if ($link=~/^\"/) {
381	$link=~s/^\"//;$link=~s/\"$//;
382	$front.='"';
383	$back="\"$back";
384	}
385
386	$link =~ s/\n/ /g;
387	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
388	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
389	}
390
391	sub replace_href_links {
392	my $self = shift (@_);
393	my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
394
395	# attempt to sort out targets - frames are not handled
396	# well in this plugin and some cases will screw things
397	# up - e.g. the _parent target (so we'll just remove
398	# them all ;-)
399	$front =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
400	$back =~ s/(target=\"?)_top(\"?)/$1_gsdltop_$2/is;
401	$front =~ s/target=\"?_parent\"?//is;
402	$back =~ s/target=\"?_parent\"?//is;
403
404	return $front . $link . $back if $link =~ /^\#/s;
405	$link =~ s/\n/ /g;
406
407	my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
408	# href may use '\'s where '/'s should be on Windows
409	$href =~ s/\\/\//g;
410
411	my ($filename) = $href =~ /^(?:.?):(?:\/\/)?(.)/;
412
413	##### leave all these links alone (they won't be picked up by intermediate
414	##### pages). I think that's safest when dealing with frames, targets etc.
415	##### (at least until I think of a better way to do it). Problems occur with
416	##### mailto links from within small frames, the intermediate page is displayed
417	##### within that frame and can't be seen. There is still potential for this to
418	##### happen even with html pages - the solution seems to be to somehow tell
419	##### the browser from the server side to display the page being sent (i.e.
420	##### the intermediate page) in the top level window - I'm not sure if that's
421	##### possible - the following line should probably be deleted if that can be done
422	return $front . $link . $back if $href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/is;
423
424
425	if (($rl == 0) \|\| ($filename =~ /$self->{'process_exp'}/) \|\|
426	($href =~ /\/$/) \|\| ($href =~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i)) {
427	&ghtml::urlsafe ($href);
428	return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
429
430	} else {
431	# link is to some other type of file (eg image) so we'll
432	# need to associate that file
433	return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
434	}
435	}
436
437	sub add_file {
438	my $self = shift (@_);
439	my ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) = @_;
440	my ($newname);
441
442	my $filename = $href;
443	$filename =~ s/^[^:]*:\/\///;
444	$filename = &util::filename_cat($base_dir, $filename);
445
446	# Replace %20's in URL with a space if required. Note that the filename
447	# may include the %20 in some situations
448	if ($filename =~ /\%20/) {
449	if (!-e $filename) {
450	$filename =~ s/\%20/ /g;
451	}
452	}
453
454	my ($ext) = $filename =~ /(\.[^\.]*)$/;
455
456	if ((!defined $ext) \|\| ($ext !~ /$self->{'assoc_files'}/)) {
457	return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
458	}
459
460	if ($self->{'rename_assoc_files'}) {
461	if (defined $self->{'aux_files'}->{$href}) {
462	$newname = $self->{'aux_files'}->{$href}->{'dir_num'} . "/" .
463	$self->{'aux_files'}->{$href}->{'file_num'} . $ext;
464	} else {
465	$newname = $self->{'dir_num'} . "/" . $self->{'file_num'} . $ext;
466	$self->{'aux_files'}->{$href} = {'dir_num' => $self->{'dir_num'}, 'file_num' => $self->{'file_num'}};
467	$self->inc_filecount ();
468	}
469	$doc_obj->associate_file($filename, $newname, undef, $section);
470	return "_httpdocimg_/$newname";
471
472	} else {
473	($newname) = $filename =~ /([^\/\\]*)$/;
474	$doc_obj->associate_file($filename, $newname, undef, $section);
475	return "_httpdocimg_/$newname";
476	}
477	}
478
479
480	sub format_link {
481	my $self = shift (@_);
482	my ($link, $base_dir, $file) = @_;
483
484	my ($before_hash, $hash_part) = $link =~ /^([^\#])(\#?.)$/;
485
486	$hash_part = "" if !defined $hash_part;
487	if (!defined $before_hash \|\| $before_hash !~ /[\w\.\/]/) {
488	my $outhandle = $self->{'outhandle'};
489	print $outhandle "HTMLPlug: ERROR - badly formatted tag ignored ($link)\n"
490	if $self->{'verbosity'};
491	return ($link, "", 0);
492	}
493
494	if ($before_hash =~ s@^((?:http\|ftp\|file)://)@@i) {
495	my $type = $1;
496
497	if ($link =~ /^(http\|ftp):/i) {
498	# Turn url (using /) into file name (possibly using \ on windows)
499	my @http_dir_split = split('/', $before_hash);
500	$before_hash = &util::filename_cat(@http_dir_split);
501	}
502
503	$before_hash = $self->eval_dir_dots($before_hash);
504
505	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
506
507	my $rl = 0;
508	$rl = 1 if (-e $linkfilename);
509
510	# make sure there's a slash on the end if it's a directory
511	if ($before_hash !~ /\/$/) {
512	$before_hash .= "/" if (-d $linkfilename);
513	}
514
515	return ($type . $before_hash, $hash_part, $rl);
516
517	} elsif ($link !~ /^(mailto\|news\|gopher\|nntp\|telnet\|javascript):/i) {
518	if ($before_hash =~ s@^/@@ \|\| $before_hash =~ /\\/) {
519
520	# the first directory will be the domain name if file_is_url
521	# to generate archives, otherwise we'll assume all files are
522	# from the same site and base_dir is the root
523
524	if ($self->{'file_is_url'}) {
525	my @dirs = split /[\/\\]/, $file;
526	my $domname = shift (@dirs);
527	$before_hash = &util::filename_cat($domname, $before_hash);
528	$before_hash =~ s@\\@/@g; # for windows
529	}
530	else
531	{
532	# see if link shares directory with source document
533	# => turn into relative link if this is so!
534
535	if ($ENV{'GSDLOS'} =~ /^windows/i) {
536	# too difficult doing a pattern match with embedded '\'s...
537	my $win_before_hash=$before_hash;
538	$win_before_hash =~ s@(\\)+@/@g;
539	# $base_dir is already similarly "converted" on windows.
540	if ($win_before_hash =~ s@^$base_dir/@@o) {
541	# if this is true, we removed a prefix
542	$before_hash=$win_before_hash;
543	}
544
545	}
546	else {
547	$before_hash = &util::filename_cat("",$before_hash);
548	$before_hash =~ s@^$base_dir/@@;
549	}
550
551	}
552
553	} else {
554	# Turn relative file path into full path
555	my $dirname = &File::Basename::dirname($file);
556	$before_hash = &util::filename_cat($dirname, $before_hash);
557	$before_hash = $self->eval_dir_dots($before_hash);
558	}
559
560	my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
561
562	# make sure there's a slash on the end if it's a directory
563	if ($before_hash !~ /\/$/) {
564	$before_hash .= "/" if (-d $linkfilename);
565	}
566
567	return ("http://" . $before_hash, $hash_part, 1);
568
569	} else {
570	# mailto, news, nntp, telnet, javascript or gopher link
571	return ($before_hash, "", 0);
572	}
573	}
574
575	sub extract_first_NNNN_characters {
576	my $self = shift (@_);
577	my ($textref, $doc_obj, $thissection) = @_;
578
579	foreach my $size (split /,/, $self->{'first'}) {
580	my $tmptext = $$textref;
581	# skip to the body
582	$tmptext =~ s/.<body[^>]>//i;
583	# remove javascript
584	$tmptext =~ s@<script.*?</script>@ @sig;
585	$tmptext =~ s/<[^>]*>/ /g;
586	$tmptext =~ s/ / /g;
587	$tmptext =~ s/^\s+//;
588	$tmptext =~ s/\s+$//;
589	$tmptext =~ s/\s+/ /gs;
590	$tmptext = substr ($tmptext, 0, $size);
591	$tmptext =~ s/\s\S*$/…/; # adds an ellipse (...)
592	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
593	}
594	}
595
596	sub extract_metadata {
597	my $self = shift (@_);
598	my ($textref, $metadata, $doc_obj, $section) = @_;
599	my $outhandle = $self->{'outhandle'};
600	# if we don't want metadata, we may as well not be here ...
601	return if (!defined $self->{'metadata_fields'});
602
603	# hunt for an author look in the metadata elements:
604	if (defined $self->{'hunt_creator_metadata'}) {
605	for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
606	#if ($$textref =~ /<meta(\s?)(?:name\|http-equiv)\s=\s\"?$name\"?([^>])/is) {
607	if ($$textref =~ /<meta(.?)(?:name\|http-equiv)\s=\s\"?$name\"?([^>])/is) {
608	my $content = $1 . $2;
609	if ($content =~ /content\s=\s\"?(.*)\"?/is) {
610	if (defined $1) {
611	my $value = $1;
612	$value =~ s/\"$//;
613	$value =~ s/\s+/ /gs;
614	$doc_obj->add_utf8_metadata($section, "Creator", $value);
615	print $outhandle " extracted Creator metadata \"$value\"\n"
616	if ($self->{'verbosity'} > 2);
617	next;
618	}
619	}
620	}
621	}
622	}
623
624	foreach my $field (split /,/, $self->{'metadata_fields'}) {
625	my $found = 0;
626	# don't need to extract field if it was passed in from a previous
627	# (recursive) plugin
628	next if defined $metadata->{$field};
629
630	# see if there's a <meta> tag for this field
631	#while ($$textref =~ /<meta(\s?)(?:name\|http-equiv)\s=\s\"?$field\"?([^>])/isg) {
632	while ($$textref =~ /<meta(.?)(?:name\|http-equiv)\s=\s\"?$field\"?([^>])/isg) {
633	my $content = $1 . $2;
634	if ($content =~ /content\s=\s\"?(.*)\"?/is) {
635	if (defined $1) {
636	my $value = $1;
637	$value =~ s/\"$//;
638	$value =~ s/\s+/ /gs;
639	$value =~ s/\".*//gs;
640	$doc_obj->add_utf8_metadata($section, $field, $value);
641	print $outhandle " extracted \"$field\" metadata \"$value\"\n"
642	if ($self->{'verbosity'} > 2);
643	$found = 1;
644	}
645	}
646	}
647	next if $found;
648	# TITLE: extract the document title
649
650	if ($field =~ /^title$/i) {
651
652	# see if there's a <title> tag
653	if ($$textref =~ /<title[^>]>([^<])<\/title[^>]*>/is) {
654	if (defined $1) {
655	my $title = $1;
656	# Arg. This allows only ascii value characters in titles
657	if ($title =~ /\w/) {
658	$title =~ s/<[^>]*>/ /g;
659	$title =~ s/ / /g;
660	$title =~ s/\s+/ /gs;
661	$title =~ s/^\s+//;
662	$title =~ s/\s+$//;
663	$doc_obj->add_utf8_metadata ($section, $field, $title);
664	print $outhandle " extracted \"$field\" metadata \"$title\"\n"
665	if ($self->{'verbosity'} > 2);
666	next;
667	}
668	}
669	}
670
671	# if no title use first 100 characters
672	my $tmptext = $$textref;
673	$tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
674	$tmptext =~ s/<[^>]*>/ /g;
675	$tmptext =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
676	$tmptext =~ s/^\s+//s;
677	$tmptext =~ s/\s+$//;
678	$tmptext =~ s/\s+/ /gs;
679	$tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
680	$tmptext =~ s/^\s+//s; # in case title_sub introduced any...
681	$tmptext = substr ($tmptext, 0, 100);
682	$tmptext =~ s/\s\S*$/.../;
683	$doc_obj->add_utf8_metadata ($section, $field, $tmptext);
684	print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n"
685	if ($self->{'verbosity'} > 2);
686	next;
687	}
688
689	# tag: extract the text between the first <H1> and </H1> tags
690	if ($field =~ /^tag[a-z0-9]+$/i) {
691
692	my $tag = $field;
693	$tag =~ s/^tag//i;
694	my $tmptext = $$textref;
695	$tmptext =~ s/\s+/ /gs;
696	if ($tmptext =~ /<$tag[^>]*>/i) {
697	foreach my $word ($tmptext =~ m/<$tag[^>]>(.?)<\/$tag[^>]*>/g) {
698	$word =~ s/ / /g;
699	$word =~ s/<[^>]*>/ /g;
700	$word =~ s/^\s+//;
701	$word =~ s/\s+$//;
702	$word =~ s/\s+/ /gs;
703	if ($word ne "") {
704	$doc_obj->add_utf8_metadata ($section, $tag, $word);
705	print $outhandle " extracted \"$tag\" metadata \"$word\"\n"
706	if ($self->{'verbosity'} > 2);
707	}
708	}
709	}
710	next;
711	}
712	}
713	}
714
715
716	# evaluate any "../" to next directory up
717	# evaluate any "./" as here
718	sub eval_dir_dots {
719	my $self = shift (@_);
720	my ($filename) = @_;
721
722	my $dirsep_os = &util::get_os_dirsep();
723	my @dirsep = split(/$dirsep_os/,$filename);
724
725	my @eval_dirs = ();
726	foreach my $d (@dirsep) {
727	if ($d eq "..") {
728	pop(@eval_dirs);
729
730	} elsif ($d eq ".") {
731	# do nothing!
732
733	} else {
734	push(@eval_dirs,$d);
735	}
736	}
737
738	return &util::filename_cat(@eval_dirs);
739	}
740
741	sub replace_usemap_links {
742	my $self = shift (@_);
743	my ($front, $link, $back) = @_;
744
745	$link =~ s/^\.\///;
746	return $front . $link . $back;
747	}
748
749	sub inc_filecount {
750	my $self = shift (@_);
751
752	if ($self->{'file_num'} == 1000) {
753	$self->{'dir_num'} ++;
754	$self->{'file_num'} = 0;
755	} else {
756	$self->{'file_num'} ++;
757	}
758	}
759
760
761	# Extend the BasPlug read_file so that strings like é are
762	# converted to UTF8 internally.
763	#
764	# We don't convert < or > or & or " in case
765	# they interfere with the GML files
766
767	sub read_file {
768	my ($self, $filename, $encoding, $language, $textref) = @_;
769
770	&BasPlug::read_file($self, $filename, $encoding, $language, $textref);
771
772	# Convert entities to their UTF8 equivalents
773	$$textref =~ s/&(lt\|gt\|amp\|quot\|nbsp);/&z$1;/go;
774	$$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
775	$$textref =~ s/&z(lt\|gt\|amp\|quot\|nbsp);/&$1;/go;
776	}
777
778	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: