Changeset 1244


Ignore:
Timestamp:
2000-06-27T17:10:07+12:00 (24 years ago)
Author:
sjboddie
Message:

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

Location:
trunk/gsdl
Files:
1 added
12 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1242 r1244  
    2727
    2828use strict;
    29 use diagnostics;
     29#use diagnostics;
    3030
    3131package acronym;
  • trunk/gsdl/perllib/plugin.pm

    r1243 r1244  
    5151    map { $_ = "\"$_\""; } @$pluginoptions;
    5252    my $options = join (",", @$pluginoptions);
     53    $options =~ s/\$/\\\$/g;
    5354    eval ("\$plugobj = new \$pluginname($options)");
    5455    die "$@" if $@;
  • trunk/gsdl/perllib/plugins/ArcPlug.pm

    r809 r1244  
    3939}
    4040
     41use strict;
     42
    4143sub new {
    4244    my ($class) = @_;
    43     my $self = new BasPlug ();
     45    my $self = new BasPlug ("ArcPlug", @_);
    4446
    4547    return bless $self, $class;
     
    5860sub read {
    5961    my $self = shift (@_);
    60     ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     62    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    6163
    6264    my $count = 0;
    6365
    6466    # see if this has a archives information file within it
    65     $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf");
     67    my $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf");
    6668
    6769    if (-e $archive_info_filename) {
     
    7779
    7880    # process each file
    79     foreach $subfile (@$file_list) {
     81    foreach my $subfile (@$file_list) {
    8082        last if ($maxdocs != -1 && $count >= $maxdocs);
    8183
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1242 r1244  
    3333use doc;
    3434
    35 sub print_usage {
     35sub print_general_usage {
    3636    my ($plugin_name) = @_;
    3737
    38     print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
    39     print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
    40    
    4138    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
    42     print STDERR "  currently supported general options are:\n";
    4339    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
    4440    print STDERR "                     converted from these encodings and stored internally as\n";
     
    7167}
    7268
     69# print_usage should be overridden for any sub-classes having
     70# their own plugin specific options
     71sub print_usage {
     72    print STDERR "\nThis plugin has no plugin specific options\n\n";
     73
     74}
     75
    7376sub new {
    7477    my $class = shift (@_);
     
    8588             q^extract_acronyms^, \$self->{'extract_acronyms'},
    8689             "allow_extra_options")) {
    87     &print_usage($plugin_name);
     90
     91    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
     92    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     93        &print_general_usage($plugin_name);
    8894    die "\n";
    8995    }
     
    103109    # set process_exp and block_exp to defaults unless they were
    104110    # explicitly set
    105     if ((!$self->is_recursive()) &&
     111
     112    if ((!$self->is_recursive()) and
    106113    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
    107    
     114
    108115    $self->{'process_exp'} = $self->get_default_process_exp ();
    109116    if ($self->{'process_exp'} eq "") {
    110         warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";
     117        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
    111118    }
    112119    }
     
    115122    $self->{'block_exp'} = $self->get_default_block_exp ();
    116123    }
     124   
     125    # handle input_encoding aliases
     126    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
     127    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
    117128}
    118129
     
    152163# process() function and let this read() function keep control.
    153164#
     165# recursive plugins (e.g. RecPlug) and specialized plugins like those
     166# capable of processing many documents within a single file (e.g.
     167# GMLPlug) should normally implement their own version of read()
     168#
    154169# Return number of files processed, undef if can't process
    155170# Note that $base_dir might be "" and that $file might
     
    165180
    166181    my $filename = &util::filename_cat($base_dir, $file);
    167     return 0 if $filename =~ /$self->{'block_exp'}/;
     182    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    168183    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
    169184    return undef;
     
    174189    # create a new document
    175190    my $doc_obj = new doc ($file, "indexed_doc");
    176     my $cursection =
    177191   
    178192    # read in file ($text will be in utf8)
     
    190204
    191205    # do plugin specific processing of doc_obj
    192     $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    193 
    194     # add text
    195     $doc_obj->add_utf8_text ($cursection, $text);
     206    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
    196207
    197208    # do any automatic metadata extraction
     
    207218}
    208219
     220# returns undef if file is rejected by the plugin
    209221sub process {
    210222    my $self = shift (@_);
     
    212224
    213225    die "Basplug::process function must be implemented in sub-class\n";
     226
     227    return undef; # never gets here
    214228}
    215229
     
    223237
    224238    $$textref = "";
    225     my $encoding = "";
    226     if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
    227     $encoding = "iso_8859_1";
    228     } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {
    229     $encoding = "windows_1256";
    230     } else {
    231     $encoding = $self->{'input_encoding'};
    232     }
    233239
    234240    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
    235241
    236     if ($encoding eq "ascii") {
     242    if ($self->{'input_encoding'} eq "ascii") {
    237243    undef $/;
    238244    $$textref = <FILE>;
     
    241247    my $reader = new multiread();
    242248    $reader->set_handle ('BasPlug::FILE');
    243     $reader->set_encoding ($encoding);
     249    $reader->set_encoding ($self->{'input_encoding'});
    244250    $reader->read_file ($textref);
    245251
    246     if ($encoding eq "gb") {
     252    if ($self->{'input_encoding'} eq "gb") {
    247253        # segment the Chinese words
    248254        $$textref = &cnseg::segment($$textref);
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r1206 r1244  
    7070}
    7171
     72use strict;
    7273
    7374# Create a new EMAILPlug object with which to parse a file.
     
    7778sub new {
    7879    my ($class) = @_;
    79     $self = new BasPlug ();
     80    my $self = new BasPlug ("EMAILPlug", @_);
     81
    8082    return bless $self, $class;
    8183}
    8284
    83 
    84 # Is EMAILPlug recursive?  No.
    85 
    86 sub is_recursive {
    87     return 0;
    88 }
    89 
    90 
    91 # Read a file and store its contents in a new document object.
    92 # First, we check to see if it is an email message we're dealing
    93 # with, then we extract the text and metadata, then we store
    94 # all this information.
    95 #
    96 # Returns: number of files processed or undef if it can't process
    97 # a file.  This plugin only processes one file at a time.
    98 
    99 sub read {
     85sub get_default_process_exp {
    10086    my $self = shift (@_);
    101     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    102 
    103     #
    104     # Check that we're dealig with a valid mail file
    105     #
    106 
    107     # Make sure file exists
    108     my $filename = &util::filename_cat($base_dir, $file);
    109     return undef unless (-e $filename);
    110     return undef unless ($filename =~ /\d+(\.email)?$/);
    111 
    112     # Read the text and make sure it is an email message
    113     open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
    114     my @text = <FILE>;
    115     my $text = join("", @text);
    116     return undef unless (($text =~ /From:/) || ($text =~ /To:/));
    117 
    118     print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
     87
     88    return q^\d+(\.email)?$^;
     89}
     90
     91# do plugin specific processing of doc_obj
     92sub process {
     93    my $self = shift (@_);
     94    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     95   
     96    # Check that we're dealing with a valid mail file
     97    return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
     98
     99    print STDERR "EMAILPlug: processing $file\n"
     100    if $self->{'verbosity'} > 1;
     101
     102    my $cursection = $doc_obj->get_top_section();
    119103
    120104    #
     
    123107
    124108    # Separate header from body of message
    125     my $Headers = $text;
     109    my $Headers = $$textref;
    126110    $Headers =~ s/\n\n.*//s;
    127     $text = substr $text, (length $Headers);
     111    $$textref = substr $$textref, (length $Headers);
    128112
    129113    # Extract basic metadata from header
     
    158142
    159143
    160     #
    161     # Create a new document object
    162     #
    163 
    164     my $doc_obj = new doc ($file, "indexed_doc");
    165     my $cursection = $doc_obj->get_top_section();
    166 
    167     # Add specilised metadata
     144    # Add extracted metadata to document object
    168145    foreach my $name (keys %raw) {
    169146    $value = $raw{$name};
     
    173150        $value = "No $name field";
    174151    }
    175     $doc_obj->add_metadata ($cursection, $name, $value);
     152    $doc_obj->add_utf8_metadata ($cursection, $name, $value);
    176153    }
    177154
     
    179156    $Headers = &text_into_html($Headers);
    180157    $Headers = "No headers" unless ($Headers =~ /\w/);
    181     $doc_obj->add_metadata ($cursection, "Headers", $Headers);
    182 
    183     # Add document text
    184     $text = &text_into_html($text);
    185     $text = "No message" unless ($text =~ /\w/);
    186     $doc_obj->add_text ($cursection, $text);
    187    
    188     # Add the OID - that is, the big HASH value used as a unique ID
    189     $doc_obj->set_OID ();
    190 
    191     # Process the document
    192     $processor->process($doc_obj);
    193 
    194     # Return the number of documents processed
    195     return 1;
    196 
     158    $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
     159
     160    # Add text to document object
     161    $$textref = &text_into_html($$textref);
     162    $$textref = "No message" unless ($$textref =~ /\w/);
     163    $doc_obj->add_utf8_text($cursection, $$textref);
     164
     165    return 1;
    197166}
    198167
     
    213182    my ($text) = @_;
    214183
    215     # Convert problem charaters into HTML symbols
     184    # Convert problem characters into HTML symbols
    216185    $text =~ s/&/&amp;/go;
    217186    $text =~ s/</&lt;/go;
     
    236205# Perl packages have to return true if they are run.
    2372061;
    238    
    239 
    240 
    241 
    242 
    243 
    244 
  • trunk/gsdl/perllib/plugins/GMLPlug.pm

    r1010 r1244  
    3737}
    3838
     39use strict;
     40
    3941sub new {
    4042    my ($class) = @_;
    41     $self = new BasPlug ();
     43    my $self = new BasPlug ("GMLPlug", @_);
    4244
    4345    return bless $self, $class;
    4446}
    4547
    46 
    47 sub is_recursive {
     48sub get_default_process_exp {
    4849    my $self = shift (@_);
    4950
    50     return 0; # this is not a recursive plugin
    51 }
    52 
    53 sub _unescape_text {
    54     my ($text) = @_;
    55 
    56     # special characters in the gml encoding
    57     $text =~ s/&lt;/</g;
    58     $text =~ s/&gt;/>/g;
    59     $text =~ s/&quot;/\"/g;
    60     $text =~ s/&amp;/&/g; # this has to be last...
    61 
    62     return $text;
     51    return q^(?i)\.gml(\.gz)?$^;
    6352}
    6453
     
    6958    my $self = shift (@_);
    7059    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    71     my $fullname = &util::filename_cat ($base_dir, $file);
    72 
    73     # see if this is a gml book
    74     return undef unless (-f $fullname && $fullname =~ /\.gml(\.gz)?$/io);
    75 
    76     my ($parent_dir, $gz) = $fullname =~ /^(.*?)[\/\\][^\/\\]+.gml(\.gz)?$/io;
    77 
    78     if (defined $gz && $gz =~ /\.gz/io) {
     60
     61    my $filename = &util::filename_cat($base_dir, $file);
     62    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     63    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     64    return undef;
     65    }
     66    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     67
     68    print STDERR "GMLPlug: processing $file\n";
     69
     70    my $parent_dir = $file;
     71    $parent_dir =~ s/[^\\\/]*$//;
     72    $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
     73
     74    # all this gzip stuff should one day be replaced by a gzip/bzip/zip/tar
     75    # handling plugin
     76    my $gz = 0;
     77    if ($file =~ /\.gz$/i) {
    7978    $gz = 1;
    80     } else {
    81     $gz = 0;
    8279    }
    8380
    84     print STDERR "GMLPlug: processing $file\n";
    85 
    86     # read in the document
     81    # read in the document - input is assumed throughout this plugin to already be utf8
    8782    if ($gz) {
    88     if (!open (INFILE, "zcat $fullname |")) {
    89         print STDERR "GMLPlug::read - zcat couldn't read $fullname\n";
    90         return undef;
     83    if (!open (INFILE, "zcat $filename |")) {
     84        print STDERR "GMLPlug::read - zcat couldn't read $filename\n";
     85        return 0;
    9186    }
    9287    } else {
    93     if (!open (INFILE, $fullname)) {
    94         print STDERR "GMLPlug::read - couldn't read $fullname\n";
    95         return undef;
     88    if (!open (INFILE, $filename)) {
     89        print STDERR "GMLPlug::read - couldn't read $filename\n";
     90        return 0;
    9691    }
    9792    }
     
    106101
    107102    my $no_docs = 0;
    108 #    my $src_filename = ""; #### don't appear to use this anymore - not sure if that's right
    109103
    110104    while (1) {
     
    128122
    129123        } else {
    130             print STDERR "GMLPlug::read - error in file $fullname\n";
     124            print STDERR "GMLPlug::read - error in file $filename\n";
    131125            print STDERR "text: \"$gml\"\n";
    132126            last;
     
    166160        last if $section eq ""; # back to top level again (more than one document in gml file)
    167161        $section = $doc_obj->get_parent_section ($section);
    168     } #while (1) section level
     162    } # while (1) section level
    169163
    170164    # add the associated files
    171     $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
     165    my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
    172166    my ($assoc_file_info, $afile);
    173167    foreach $assoc_file_info (@$assoc_files) {
     
    186180    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
    187181   
    188     # assume the document has an OID
     182    # do any automatic metadata extraction
     183    $self->auto_extract_metadata ($doc_obj);
     184
     185    # assume the document has an OID already
    189186   
    190187    # process the document
     
    194191    last if ($maxdocs > -1 && $no_docs >= $maxdocs);
    195192    last unless defined $gml && $gml =~ /\w/;
    196     } #while(1) document level
     193    } # while(1) document level
    197194
    198195    return $no_docs; # no of docs processed
    199196}
    200197
     198sub _unescape_text {
     199    my ($text) = @_;
     200
     201    # special characters in the gml encoding
     202    $text =~ s/&lt;/</g;
     203    $text =~ s/&gt;/>/g;
     204    $text =~ s/&quot;/\"/g;
     205    $text =~ s/&amp;/&/g; # this has to be last...
     206
     207    return $text;
     208}
    201209
    2022101;
  • trunk/gsdl/perllib/plugins/HBPlug.pm

    r1020 r1244  
    2424###########################################################################
    2525
    26 # plugin which process an HTML book directory
     26# plugin which processes an HTML book directory
     27
     28# This plugin is used by the Humanity Library collections and does not handle
     29# input encodings other than ascii or extended ascii
     30
     31# this code is kind of ugly and could no doubt be made to run faster, by leaving
     32# it in this state I hope to encourage people to make their collections use
     33# HBSPlug instead ;-)
     34
     35# Use HBSPlug if creating a new collection and marking up files like the
     36# Humanity Library collections. HBSPlug accepts all input encodings but
     37# expects the marked up files to be cleaner than those used by the
     38# Humanity Library collections
    2739
    2840package HBPlug;
    2941
    30 use plugin;
    3142use ghtml;
    3243use BasPlug;
    3344use util;
    34 use lang;
    3545use doc;
    36 use cfgread;
    3746
    3847
     
    4352sub new {
    4453    my ($class) = @_;
    45     $self = new BasPlug ();
     54    my $self = new BasPlug ("HBPlug", @_);
    4655
    4756    return bless $self, $class;
    4857}
    4958
    50 sub is_recursive {
    51     my $self = shift (@_);
    52 
    53     return 0; # this is not a recursive plugin
    54 }
     59sub init {
     60    my $self = shift (@_);
     61    my ($verbosity) = @_;
     62
     63    $self->BasPlug::init();
     64
     65    # this plugin only handles ascii encodings
     66    if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
     67    die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
     68        $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
     69    }
     70}
     71
     72# this is included only to prevent warnings being printed out
     73# from BasPlug::init. The process_exp is not used by this plugin
     74sub get_default_process_exp {
     75    my $self = shift (@_);
     76
     77    return "This plugin does not use a process_exp\n";
     78}
     79
    5580
    5681sub HB_read_html_file {
     
    6590
    6691    my $foundbody = 0;
    67     $self->HB_gettext (\$foundbody, $text, FILE);
     92    $self->HB_gettext (\$foundbody, $text, "FILE");
    6893    close FILE;
    6994
     
    7297    $foundbody = 1;
    7398    open (FILE, $htmlfile) || return;
    74     $self->HB_gettext (\$foundbody, $text, FILE);   
     99    $self->HB_gettext (\$foundbody, $text, "FILE");
    75100    close FILE;
    76101    }
     
    159184}
    160185
     186# if input_encoding is ascii we can call add_utf8_metadata
     187# directly but if it's iso_8859_1 (the default) we need to call
     188# add_metadata so that the ascii2utf8 conversion is done first
     189# this should speed things up a little if processing an ascii only
     190# document with input_encoding set to ascii
     191sub HB_add_metadata {
     192    my $self = shift (@_);
     193    my ($doc_obj, $cursection, $field, $value) = @_;
     194
     195    if ($self->{'input_encoding'} eq "ascii") {
     196    $doc_obj->add_utf8_metadata ($cursection, $field, $value);
     197    } else {
     198    $doc_obj->add_metadata ($cursection, $field, $value);
     199    }
     200}
    161201
    162202# return number of files processed, undef if can't process
     
    192232
    193233    # add metadata for top level of document
    194     foreach $field (keys(%$metadata)) {
     234    foreach my $field (keys(%$metadata)) {
    195235    # $metadata->{$field} may be an array reference
    196236    if (ref ($metadata->{$field}) eq "ARRAY") {
    197237        map {
    198         $doc_obj->add_metadata ($cursection, $field, $_);
     238        $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
    199239        } @{$metadata->{$field}};
    200240    } else {
    201         $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
     241        $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
    202242    }
    203243    }
     
    240280
    241281        # add the metadata to this section
    242         $doc_obj->add_metadata ($cursection, "Title", $title);
     282        $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
    243283
    244284        # clean up the section html
     
    251291
    252292        # add the text for this section
    253         $doc_obj->add_text ($cursection, $sectiontext);
    254        
     293        if ($self->{'input_encoding'} eq "ascii") {
     294        $doc_obj->add_utf8_text ($cursection, $sectiontext);
     295        } else {
     296        $doc_obj->add_text ($cursection, $sectiontext);
     297        }
    255298    } else {
    256299        print STDERR "WARNING - leftover text\n" , $self->shorten($html),
  • trunk/gsdl/perllib/plugins/HBSPlug.pm

    r1235 r1244  
    3232# processing of html links or any other HTMLPlug type stuff is done).
    3333
    34 # expects input files to have a .hb file extension
     34# expects input files to have a .hb file extension by default (this can be
     35# changed by adding a -process_exp option
    3536
    3637# a file with the same name as the hb file but a .jpg extension is
    37 # taken as the cover image
     38# taken as the cover image (jpg files are blocked by this plugin)
    3839
    3940# HBSPlug is a simplification (and extension of) the HBPlug used
     
    5960sub new {
    6061    my ($class) = @_;
    61     my $self = new BasPlug (@_);
     62    my $self = new BasPlug ("HBSPlug", @_);
    6263
    6364    return bless $self, $class;
    6465}
    6566
    66 sub is_recursive {
     67sub get_default_block_exp {
    6768    my $self = shift (@_);
    6869
    69     return 0; # this is not a recursive plugin
    70 }
    71 
    72 
    73 # return number of files processed, undef if can't process
    74 # Note that $base_dir might be "" and that $file might
    75 # include directories
    76 sub read {
     70    return q^\.jpg$^;
     71}
     72
     73sub get_default_process_exp {
    7774    my $self = shift (@_);
    78     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
     75
     76    return q^(?i)\.hb$^;
     77}
     78
     79# do plugin specific processing of doc_obj
     80sub process {
     81    my $self = shift (@_);
     82    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     83   
     84    print STDERR "HBSPlug: processing $file\n"
     85    if $self->{'verbosity'} > 1;
     86   
     87    my $cursection = $doc_obj->get_top_section();
    7988
    8089    my $filename = &util::filename_cat($base_dir, $file);
     
    8291    $absdir =~ s/[^\/\\]*$//;
    8392
    84     return 0 if ($filename =~ /\.jpg$/i);
    85     return undef unless ($filename =~ /\.hb$/i && (-e $filename));
    86 
    87     print STDERR "HBSPlug: processing $filename\n" if $processor->{'verbosity'};
    88 
    89     # create a new document
    90     my $doc_obj = new doc ($file, "indexed_doc");
    91     my $cursection = $doc_obj->get_top_section();
    92 
    9393    # add the cover image
    9494    my $coverimage = $filename;
    95     $coverimage =~ s/\.hb/\.jpg/i;
     95    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
    9696    $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
    9797
    98     # add metadata for top level of document
    99     $self->extra_metadata ($doc_obj, $cursection, $metadata);
    100 
    101     # read in HTML file ($text will be in utf8)
    102     my $text = "";
    103     $self->read_file ($filename, \$text);
    104 
    10598    my $title = "";
    10699
    107100    # remove any leading rubbish
    108     $text =~ s/^.*?(<<TOC)/$1/ios;
     101    $$textref =~ s/^.*?(<<TOC)/$1/ios;
    109102   
    110103    my $curtoclevel = 1;
    111104    my $firstsection = 1;
    112105    my $toccount = 0;
    113     while ($text =~ /\w/) {
    114     $text =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
     106    while ($$textref =~ /\w/) {
     107    $$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
    115108    my $toclevel = $1;
    116109    my $metadata = $2;
     
    166159    $firstsection = 0;
    167160
    168     $text =~ s/^\s+//s;
    169     }
    170 
    171     # add OID
    172     $doc_obj->set_OID ();
    173 
    174     # process the document
    175     $processor->process($doc_obj);
    176 
    177     return 1; # processed the file
     161    $$textref =~ s/^\s+//s;
     162    }
     163
     164    return 1;
    178165}
    179166
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1243 r1244  
    5050
    5151sub print_usage {
    52     print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
    53 
    5452    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    5553    print STDERR "  options:\n";
     
    6462    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
    6563    print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
    66     print STDERR "                          w3mir \n";
    6764    print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
    68     print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)$'\n";
     65    print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
    6966    print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images). Also\n";
    7067    print STDERR "                          creates much shallower directory structure (useful when creating\n";
     
    8582             q^rename_assoc_files^, \$self->{'rename_assoc_files'},
    8683             "allow_extra_options")) {
    87    
     84
     85    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
    8886    &print_usage();
    8987    die "\n";
     
    152150    $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
    153151    $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
     152
     153    # add text to document object
     154    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
     155
     156    return 1;
    154157}
    155158
  • trunk/gsdl/perllib/plugins/IndexPlug.pm

    r809 r1244  
    5454use plugin;
    5555use BasPlug;
    56 use lang;
    5756use doc;
    5857use util;
     
    6362}
    6463
     64use strict;
     65
    6566sub new {
    6667    my ($class) = @_;
    67     $self = new BasPlug ();
     68    my $self = new BasPlug ("IndexPlug", @_);
    6869
    6970    return bless $self, $class;
     
    7677    return 1;
    7778}
    78 
    7979
    8080# return number of files processed, undef if can't process
     
    104104    # process each document
    105105    my $count = 0;
    106     foreach $docfile (keys (%$list)) {
     106    foreach my $docfile (keys (%$list)) {
    107107    last if ($maxdocs != -1 && $count >= $maxdocs);
    108108    $metadata = {}; # at present we can do this as metadata
     
    113113    # note that $list->{$docfile} is an array reference
    114114    if ($docfile !~ /key:/i) {
     115        my $i = 0;
    115116        for ($i = 0; $i < scalar (@{$list->{$docfile}}); $i ++) {
    116117        if ($list->{$docfile}->[$i] =~ /^<([^>]+)>(.+)$/) {
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r809 r1244  
    3838}
    3939
     40use strict;
     41
    4042sub new {
    4143    my ($class) = @_;
    42     my $self = new BasPlug ();
     44    my $self = new BasPlug ("RecPlug", @_);
    4345
    4446    $self->{'exclude_tail_dirs'} = []; # empty by default
     
    6264    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    6365
    64     foreach $etd ( @{$self->{'exclude_tail_dirs'}} )
     66    foreach my $etd ( @{$self->{'exclude_tail_dirs'}} )
    6567    {
    6668    return 0 if ($file =~ m/$etd/);
     
    7274
    7375    # see if this is a directory
    74     $dirname = &util::filename_cat ($base_dir, $file);
     76    my $dirname = &util::filename_cat ($base_dir, $file);
    7577    if (-d $dirname) {
    7678
  • trunk/gsdl/perllib/plugins/TEXTPlug.pm

    r732 r1244  
    2424###########################################################################
    2525
    26 # creates simple single-level document from .txt or .text files
    27 # (case-insensitive match on filenames). Adds Title metadata
    28 # of first 100 characters found.
     26# creates simple single-level document. Adds Title metadata
     27# of first line of text (up to 100 characters long).
    2928
    3029package TEXTPlug;
    3130
    3231use BasPlug;
    33 use sorttools;
    3432
    3533sub BEGIN {
     
    3735}
    3836
     37use strict;
     38
    3939sub new {
    4040    my ($class) = @_;
    41     $self = new BasPlug ();
     41    my $self = new BasPlug ("TEXTPlug", @_);
    4242
    4343    return bless $self, $class;
    4444}
    4545
    46 sub is_recursive {
     46sub get_default_process_exp {
    4747    my $self = shift (@_);
    4848
    49     return 0; # this is not a recursive plugin
     49    return q^(?i)\.te?xt$^;
    5050}
    5151
    52 
    53 # return number of files processed, undef if can't process
    54 # Note that $base_dir might be "" and that $file might
    55 # include directories
    56 sub read {
     52# do plugin specific processing of doc_obj
     53sub process {
    5754    my $self = shift (@_);
    58     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    59 
    60     my $filename = &util::filename_cat($base_dir, $file);
    61 
    62     return undef unless ($filename =~ /\.(te?xt(\.gz)?)$/i && (-e $filename));
    63 
    64     my $gz = 0;
    65     if (defined $2) {
    66     $gz = $2;
    67     $gz = 1 if ($gz =~ /\.gz/i);
    68     }
    69 
    70     print STDERR "TEXTPlug: processing $filename\n" if $processor->{'verbosity'};
    71 
    72     # create a new document
    73     my $doc_obj = new doc ($file, "indexed_doc");
    74 
    75     if ($gz) {
    76     open (FILE, "zcat $filename |") || die "TEXTPlug::read - zcat can't open $filename\n";
    77     } else {
    78     open (FILE, $filename) || die "TEXTPlug::read - can't open $filename\n";
    79     }
     55    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     56   
     57    print STDERR "TEXTPlug: processing $file\n"
     58    if $self->{'verbosity'} > 1;
     59   
    8060    my $cursection = $doc_obj->get_top_section();
    81 
    82     my $text = "";
    83     my $line = "";
    84     my $foundtitle = 0;
    85     # don't need to get title if it has been passed
    86     # in from another plugin
    87     if (defined $metadata->{'Title'}) {
    88     $foundtitle = 1;
    89     }
    90     while (defined ($line = <FILE>)) {
    91     # use first line as title (or first 100 characters if it's long)
    92     if (!$foundtitle && length($line) > 5) {
    93         my $title = "";
    94         if (length($line) > 100) {
    95         $title = substr ($line, 0, 100);
    96         } else {
    97         $title = $line;
    98         }
    99         $doc_obj->add_metadata ($cursection, "Title", $title);
    100         $foundtitle = 1;
     61   
     62    # get title metadata
     63    # (don't need to get title if it has been passed
     64    # in from another plugin)
     65    if (!defined $metadata->{'Title'}) {
     66    my ($title) = $$textref =~ /^([^\n]*)/;
     67    if (length($title) > 100) {
     68        $title = substr ($title, 0, 100);
    10169    }
    102     $text .= $line;
     70    $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
    10371    }
    10472   
    105     $doc_obj->add_text ($cursection, "<pre>\n$text\n</pre>");
     73    # insert preformat tags and add text to document object
     74    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
    10675
    107 
    108     foreach $field (keys(%$metadata)) {
    109     # $metadata->{$field} may be an array reference
    110     if (ref ($metadata->{$field}) eq "ARRAY") {
    111         map {
    112         $doc_obj->add_metadata ($cursection, $field, $_);
    113         } @{$metadata->{$field}};
    114     } else {
    115         $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
    116     }
    117     }
    118 
    119     # add OID
    120     $doc_obj->set_OID ();
    121 
    122     # process the document
    123     $processor->process($doc_obj);
    124 
    125     return 1; # processed the file
     76    return 1;
    12677}
    12778
Note: See TracChangeset for help on using the changeset viewer.