- Timestamp:
- 2004-04-15T10:57:04+12:00 (20 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r6812 r7202 47 47 @ISA = ('BasPlug'); 48 48 } 49 50 use strict; # every perl program should have this! 51 no strict 'refs'; # make an exception so we can use variables as filehandles 49 52 50 53 my $arguments = … … 97 100 'args' => $arguments }; 98 101 99 100 # sub print_usage {101 # print STDERR "\n usage: plugin HTMLPlug [options]\n\n";102 # print STDERR " options:\n";103 # print STDERR " -nolinks Don't make any attempt to trap links (setting this\n";104 # print STDERR " flag may improve speed of building/importing but\n";105 # print STDERR " any relative links within documents will be broken).\n";106 # print STDERR " -keep_head Don't remove headers from html files.\n";107 # print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n";108 # print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to109 # extract. Defaults to 'Title'.110 # Use 'tag<tagname>' to have the contents of the first111 # <tagname> pair put in a metadata element called112 # 'tagname'. Capitalise this as you want the metadata113 # capitalised in Greenstone, since the tag extraction114 # is case insensitive.\n";115 # print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and116 # place it in the 'Creator' field. Requires the117 # -metadata_fields flag.\n";118 # print STDERR " -file_is_url Set if input filenames make up url of original source119 # documents e.g. if a web mirroring tool was used to120 # create the import directory structure\n";121 # print STDERR " -assoc_files Perl regular expression of file extensions to122 # associate with html documents.123 # Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";124 # print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images).125 # Also creates much shallower directory structure126 # (useful when creating collections to go on cd-rom).\n";127 # print STDERR " -title_sub Substitution expression to modify string stored as128 # Title. Used by, for example, PDFPlug to remove129 # \"Page 1\", etc from text used as the title.\n";130 # print STDERR " -description_tags Split document into sub-sections where <Section> tags131 # occur. Note that by setting this option you132 # implicitly set -no_metadata, as all metadata should133 # be included within the <Section> tags (this is only134 # true for documents that actually contain <Section> tags135 # however). Also, '-keep_head' will have no effect when136 # this option is set, regardless of whether a document137 # contains Section tags.\n";138 # }139 140 102 sub new { 141 103 my $class = shift (@_); … … 600 562 } 601 563 564 602 565 sub extract_metadata { 603 566 my $self = shift (@_); … … 607 570 return if (!defined $self->{'metadata_fields'}); 608 571 609 # hunt for an author look in the metadata elements: 610 if (defined $self->{'hunt_creator_metadata'}) { 611 for my $name (split /,/, "AUTHOR,AUTHOR.EMAIL,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") { 612 #if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) { 613 if ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) { 614 my $content = $1 . $2; 615 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) { 616 if (defined $1) { 617 my $value = $1; 618 $value =~ s/\"$//; 619 $value =~ s/\s+/ /gs; 620 $doc_obj->add_utf8_metadata($section, "Creator", $value); 621 print $outhandle " extracted Creator metadata \"$value\"\n" 622 if ($self->{'verbosity'} > 2); 623 next; 624 } 625 } 572 my %find_fields = (); # metadata fields to extract/save 573 574 my %creator_fields = (); # short-cut for lookups 575 576 577 foreach my $field (split /,/, $self->{'metadata_fields'}) { 578 $find_fields{lc($field)}=$field; # lc = lowercase 579 } 580 581 if (defined $self->{'hunt_creator_metadata'} && 582 $self->{'hunt_creator_metadata'} == 1 ) { 583 my @extra_fields = 584 ( 585 'author', 586 'author.email', 587 'creator', 588 'dc.creator', 589 'dc.creator.corporatename', 590 ); 591 592 # add the creator_metadata fields to search for 593 foreach my $field (@extra_fields) { 594 $creator_fields{$field}=0; # add to lookup hash 595 } 596 } 597 598 # find the header in the html file, which has the meta tags 599 $$textref =~ m@<head>(.*?)</head>@si; 600 601 my $html_header=$1; 602 603 # go through every <meta... tag defined in the html and see if it is 604 # one of the tags we want to match. 605 606 # this assumes that ">" won't appear. (I don't think it's allowed to...) 607 $html_header =~ /^/; # match the start of the string, for \G assertion 608 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) { 609 my $metatag=$1; 610 my ($tag, $value); 611 612 # find the tag name 613 $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; 614 $tag=$2; 615 # in case they're not using " or ', but they should... 616 if (! $tag) { 617 $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is; 618 $tag=$1; 619 } 620 621 if (!defined $tag) { 622 print $outhandle "HTMLPlug: can't find NAME in \"$metatag\"\n"; 623 next; 624 } 625 626 # don't need to assign this field if it was passed in from a previous 627 # (recursive) plugin 628 if (defined $metadata->{$tag}) {next} 629 630 # find the tag content 631 $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is; 632 $value=$2; 633 if (! $value) { 634 $metatag =~ /(?:name|http-equiv)\s*=\s*(.*?)(?!\w)/is; 635 $value=$1; 636 } 637 if (!defined $value) { 638 print $outhandle "HTMLPlug: can't find VALUE in \"$metatag\"\n"; 639 next; 640 } 641 642 # clean up and add 643 $value =~ s/\s+/ /gs; 644 if (exists $creator_fields{lc($tag)}) { 645 # map this value onto greenstone's "Creator" metadata 646 $tag='Creator'; 647 } elsif (!exists $find_fields{lc($tag)}) { 648 next; # don't want this tag 649 } else { 650 # get the user's preferred capitalisation 651 $tag = $find_fields{lc($tag)}; 652 } 653 print $outhandle " extracted \"$tag\" metadata \"$value\"\n" 654 if ($self->{'verbosity'} > 2); 655 $doc_obj->add_utf8_metadata($section, $tag, $value); 656 657 } 658 659 # TITLE: extract the document title 660 if (exists $find_fields{'title'} && $find_fields{'title'} == 0) { 661 # we want a title, and didn't find one in the meta tags 662 # see if there's a <title> tag 663 my $title; 664 if ($html_header =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) { 665 $title = $1; 666 } 667 if (!defined $title) { 668 # if no title use first 100 or so characters 669 $title = $$textref; 670 $title =~ s/^.*?<body>//si; 671 # ignore javascript! 672 $title =~ s@<script.*?</script>@ @sig; 673 $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space 674 $title =~ s/<[^>]*>/ /g; # remove all HTML tags 675 $title = substr ($title, 0, 100); 676 $title =~ s/\s\S*$/.../; 677 } 678 $title =~ s/<[^>]*>/ /g; # remove html tags 679 $title =~ s/ / /g; 680 $title =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp... 681 $title =~ s/\s+/ /gs; # collapse multiple spaces 682 $title =~ s/^\s*//; # remove leading spaces 683 $title =~ s/\s*$//; # remove trailing spaces 684 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 685 $title =~ s/^\s+//s; # in case title_sub introduced any... 686 $doc_obj->add_utf8_metadata ($section, 'Title', $title); 687 print $outhandle " extracted Title metadata \"$title\"\n" 688 if ($self->{'verbosity'} > 2); 689 } 690 691 # Special, for metadata names such as tagH1 - extracts 692 # the text between the first <H1> and </H1> tags into "H1" metadata. 693 694 foreach my $field (keys %find_fields) { 695 if ($field !~ /^tag([a-z0-9]+)$/i) {next} 696 my $tag = $1; 697 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { 698 my $content = $1; 699 $content =~ s/ / /g; 700 $content =~ s/<[^>]*>/ /g; 701 $content =~ s/^\s+//; 702 $content =~ s/\s+$//; 703 $content =~ s/\s+/ /gs; 704 if ($content) { 705 $tag=$find_fields{"tag$tag"}; # get the user's capitalisation 706 $tag =~ s/^tag//i; 707 $doc_obj->add_utf8_metadata ($section, $tag, $content); 708 print $outhandle " extracted \"$tag\" metadata \"$content\"\n" 709 if ($self->{'verbosity'} > 2); 626 710 } 627 711 } 628 } 629 630 foreach my $field (split /,/, $self->{'metadata_fields'}) { 631 my $found = 0; 632 # don't need to extract field if it was passed in from a previous 633 # (recursive) plugin 634 next if defined $metadata->{$field}; 635 636 # see if there's a <meta> tag for this field 637 #while ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) { 638 while ($$textref =~ /<meta(\s*?[^<>]*?\s*?)(?:name|http-equiv)\s*=\s*\"?$field\"?([^>]*)/isg) { 639 my $content = $1 . $2; 640 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) { 641 if (defined $1) { 642 my $value = $1; 643 $value =~ s/\"$//; 644 $value =~ s/\s+/ /gs; 645 $value =~ s/\".*//gs; 646 $doc_obj->add_utf8_metadata($section, $field, $value); 647 print $outhandle " extracted \"$field\" metadata \"$value\"\n" 648 if ($self->{'verbosity'} > 2); 649 $found = 1; 650 } 651 } 652 } 653 next if $found; 654 # TITLE: extract the document title 655 656 if ($field =~ /^title$/i) { 657 658 # see if there's a <title> tag 659 if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) { 660 if (defined $1) { 661 my $title = $1; 662 # Arg. This allows only ascii value characters in titles 663 if ($title =~ /\w/) { 664 $title =~ s/<[^>]*>/ /g; 665 $title =~ s/ / /g; 666 $title =~ s/\s+/ /gs; 667 $title =~ s/^\s+//; 668 $title =~ s/\s+$//; 669 $doc_obj->add_utf8_metadata ($section, $field, $title); 670 print $outhandle " extracted \"$field\" metadata \"$title\"\n" 671 if ($self->{'verbosity'} > 2); 672 next; 673 } 674 } 675 } 676 677 # if no title use first 100 characters 678 my $tmptext = $$textref; 679 $tmptext =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space 680 $tmptext =~ s/<[^>]*>/ /g; 681 $tmptext =~ s/(?: |\xc2\xa0)/ /g; # utf-8 for nbsp... 682 $tmptext =~ s/^\s+//s; 683 $tmptext =~ s/\s+$//; 684 $tmptext =~ s/\s+/ /gs; 685 $tmptext =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 686 $tmptext =~ s/^\s+//s; # in case title_sub introduced any... 687 $tmptext = substr ($tmptext, 0, 100); 688 $tmptext =~ s/\s\S*$/.../; 689 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 690 print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n" 691 if ($self->{'verbosity'} > 2); 692 next; 693 } 694 695 # tag: extract the text between the first <H1> and </H1> tags 696 if ($field =~ /^tag[a-z0-9]+$/i) { 697 698 my $tag = $field; 699 $tag =~ s/^tag//i; 700 my $tmptext = $$textref; 701 $tmptext =~ s/\s+/ /gs; 702 if ($tmptext =~ /<$tag[^>]*>/i) { 703 foreach my $word ($tmptext =~ m/<$tag[^>]*>(.*?)<\/$tag[^>]*>/g) { 704 $word =~ s/ / /g; 705 $word =~ s/<[^>]*>/ /g; 706 $word =~ s/^\s+//; 707 $word =~ s/\s+$//; 708 $word =~ s/\s+/ /gs; 709 if ($word ne "") { 710 $doc_obj->add_utf8_metadata ($section, $tag, $word); 711 print $outhandle " extracted \"$tag\" metadata \"$word\"\n" 712 if ($self->{'verbosity'} > 2); 713 } 714 } 715 } 716 next; 717 } 718 } 712 } 719 713 } 720 714
Note:
See TracChangeset
for help on using the changeset viewer.