Changeset 2975


Ignore:
Timestamp:
2002-02-20T16:23:35+13:00 (22 years ago)
Author:
jrm21
Message:

Tidied up the usage info to fit in 80 columns. Fixed the title_sub handling so
that the substitution is only applied when a non-empty title_sub expression has
been supplied.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r2819 r2975  
    51 51    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    52 52    print STDERR "  options:\n";
    53     print STDERR "   -nolinks               Don't make any attempt to trap links (setting this flag may\n";
    54     print STDERR "                          improve speed of building/importing but any relative links within\n";
    55     print STDERR "                          documents will be broken).\n";
     53    print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
     54    print STDERR "                          flag may improve speed of building/importing but\n";
     55    print STDERR "                          any relative links within documents will be broken).\n";
    56 56    print STDERR "   -keep_head             Don't remove headers from html files.\n";
    57 57    print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
    58     print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
    59     print STDERR "                          Defaults to 'Title'.\n";
    60     print STDERR "                          Use 'tag<tagname>' to have the contents of the first <tagname>\n";
    61     print STDERR "                          pair put in a metadata element called 'tagname' Capitalise \n";
    62     print STDERR "                          'tagname' as you want the metadata capitalised in the GML \n";
    63     print STDERR "                          file, since the tag extraction is case insensitive.\n";
    64     print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and place it in the\n ";
    65     print STDERR "                          'Creator' field. Requires the -metadata_fields flag.\n ";
    66     print STDERR "   -file_is_url           Set if input filenames make up url of original source documents\n";
    67     print STDERR "                          e.g. if a web mirroring tool was used to create the import\n";
    68     print STDERR "                          directory structure\n";
    69     print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
    70     print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
    71     print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images). Also\n";
    72     print STDERR "                          creates much shallower directory structure (useful when creating\n";
    73     print STDERR "                          collections to go on cd-rom).\n\n";
    74     print STDERR "   -title_sub             Substitution expression to modify string stored as Title.\n";
    75     print STDERR "                          Used by, for example, PDFHtml to remove Page 1 etc from text\n";
    76     print STDERR "                          chosen to be used as the title.\n";
    77     print STDERR "   -description_tags      Split document into sub-sections where <Section> tags occur.\n";
    78     print STDERR "                          Note that by setting this option you implicitly set -no_metadata\n";
    79     print STDERR "                          as all metadata should be included within the <Section> tags.\n";
    80     print STDERR "                          Also, -keep_head will have no effect when this option is set.\n";
     58    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
     59                          extract. Defaults to 'Title'.
     60                          Use 'tag<tagname>' to have the contents of the first
     61                          <tagname> pair put in a metadata element called
     62                          'tagname'. Capitalise this as you want the metadata
     63                          capitalised in Greenstone, since the tag extraction
     64                          is case insensitive.\n";
     65    print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
     66                          place it in the 'Creator' field. Requires the
     67                          -metadata_fields flag.\n";
     68    print STDERR "   -file_is_url           Set if input filenames make up url of original source
     69                          documents e.g. if a web mirroring tool was used to
     70                          create the import directory structure\n";
     71    print STDERR "   -assoc_files           Perl regular expression of file extensions to
     72                          associate with html documents.
     73                          Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
     74    print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
     75                          Also creates much shallower directory structure
     76                          (useful when creating collections to go on cd-rom).\n";
     77    print STDERR "   -title_sub             Substitution expression to modify string stored as
     78                          Title. Used by, for example, PDFPlug to remove
     79                          \"Page 1\", etc from text used as the title.\n";
     80    print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
     81                          occur. Note that by setting this option you
     82                          implicitly set -no_metadata, as all metadata should
     83                          be included within the <Section> tags. Also,
     84                          '-keep_head' will have no effect when this option
     85                          is set.\n";
    81 86}
    82 87
     
    113 118    $self->{'dir_num'} = 0;
    114 119    $self->{'file_num'} = 0;
    115 
    116 120    return bless $self, $class;
    117 121}
     
    468 472    my $tmptext =  $$textref;
    469 473    $tmptext =~ s/.*<body[^>]*>//i;
    470     $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
     474    $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'});
    471 475    $tmptext =~ s/<[^>]*>/ /g;
    472 476    $tmptext =~ s/&nbsp;/ /g;
     
    561 565        $tmptext =~ s/\s+$//;
    562 566        $tmptext =~ s/\s+/ /gs;
    563         $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
     567        $tmptext =~ s/$self->{'title_sub'}// if ($self->{'title_sub'});
    564 568        $tmptext = substr ($tmptext, 0, 100);
    565 569        $tmptext =~ s/\s\S*$/.../;
Note: See TracChangeset for help on using the changeset viewer.