Changeset 2975
- Timestamp: 2002-02-20T16:23:35+13:00 (22 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r2819 r2975 51 51 print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 52 52 print STDERR " options:\n"; 53 print STDERR " -nolinks Don't make any attempt to trap links (setting this flag may\n";54 print STDERR " improve speed of building/importing but any relative links within\n";55 print STDERR " documents will be broken).\n";53 print STDERR " -nolinks Don't make any attempt to trap links (setting this\n"; 54 print STDERR " flag may improve speed of building/importing but\n"; 55 print STDERR " any relative links within documents will be broken).\n"; 56 56 print STDERR " -keep_head Don't remove headers from html files.\n"; 57 57 print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n"; 58 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n"; 59 print STDERR " Defaults to 'Title'.\n"; 60 print STDERR " Use 'tag<tagname>' to have the contents of the first <tagname>\n"; 61 print STDERR " pair put in a metadata element called 'tagname' Capitalise \n"; 62 print STDERR " 'tagname' as you want the metadata capitalised in the GML \n"; 63 print STDERR " file, since the tag extraction is case insensitive.\n"; 64 print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and place it in the\n "; 65 print STDERR " 'Creator' field. Requires the -metadata_fields flag.\n "; 66 print STDERR " -file_is_url Set if input filenames make up url of original source documents\n"; 67 print STDERR " e.g. if a web mirroring tool was used to create the import\n"; 68 print STDERR " directory structure\n"; 69 print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n"; 70 print STDERR " html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n"; 71 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). 
Also\n"; 72 print STDERR " creates much shallower directory structure (useful when creating\n"; 73 print STDERR " collections to go on cd-rom).\n\n"; 74 print STDERR " -title_sub Substitution expression to modify string stored as Title.\n"; 75 print STDERR " Used by, for example, PDFHtml to remove Page 1 etc from text\n"; 76 print STDERR " chosen to be used as the title.\n"; 77 print STDERR " -description_tags Split document into sub-sections where <Section> tags occur.\n"; 78 print STDERR " Note that by setting this option you implicitly set -no_metadata\n"; 79 print STDERR " as all metadata should be included within the <Section> tags.\n"; 80 print STDERR " Also, -keep_head will have no effect when this option is set.\n"; 58 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to 59 extract. Defaults to 'Title'. 60 Use 'tag<tagname>' to have the contents of the first 61 <tagname> pair put in a metadata element called 62 'tagname'. Capitalise this as you want the metadata 63 capitalised in Greenstone, since the tag extraction 64 is case insensitive.\n"; 65 print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and 66 place it in the 'Creator' field. Requires the 67 -metadata_fields flag.\n"; 68 print STDERR " -file_is_url Set if input filenames make up url of original source 69 documents e.g. if a web mirroring tool was used to 70 create the import directory structure\n"; 71 print STDERR " -assoc_files Perl regular expression of file extensions to 72 associate with html documents. 73 Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n"; 74 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). 75 Also creates much shallower directory structure 76 (useful when creating collections to go on cd-rom).\n"; 77 print STDERR " -title_sub Substitution expression to modify string stored as 78 Title. 
Used by, for example, PDFPlug to remove 79 \"Page 1\", etc from text used as the title.\n"; 80 print STDERR " -description_tags Split document into sub-sections where <Section> tags 81 occur. Note that by setting this option you 82 implicitly set -no_metadata, as all metadata should 83 be included within the <Section> tags. Also, 84 '-keep_head' will have no effect when this option 85 is set.\n"; 81 86 } 82 87 … … 113 118 $self->{'dir_num'} = 0; 114 119 $self->{'file_num'} = 0; 115 116 120 return bless $self, $class; 117 121 } … … 468 472 my $tmptext = $$textref; 469 473 $tmptext =~ s/.*<body[^>]*>//i; 470 $tmptext =~ s/$self->{'title_sub'}// if ( defined$self->{'title_sub'});474 $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'}); 471 475 $tmptext =~ s/<[^>]*>/ /g; 472 476 $tmptext =~ s/ / /g; … … 561 565 $tmptext =~ s/\s+$//; 562 566 $tmptext =~ s/\s+/ /gs; 563 $tmptext =~ s/$self->{'title_sub'}// if ( defined$self->{'title_sub'});567 $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'}); 564 568 $tmptext = substr ($tmptext, 0, 100); 565 569 $tmptext =~ s/\s\S*$/.../;
Note: See TracChangeset for help on using the changeset viewer.