Changeset 317


Ignore:
Timestamp:
1999-06-30T15:36:35+12:00 (25 years ago)
Author:
sjboddie
Message:

Added maxdocs option

Location:
trunk/gsdl/perllib/plugins
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ArcPlug.pm

    r285 r317  
    1 # plugin which recurses through directories processing
    2 # each file it finds
     1# plugin which recurses through an archives.inf file
     2# (i.e. the file generated in the archives directory
     3# when an import is done), processing each file it finds
    34
    45package ArcPlug;
     
    2728}
    2829
    29 # return 1 if processed, 0 if not processed
     30# return number of files processed, undef if can't process
    3031# Note that $base_dir might be "" and that $file might
    3132# include directories
    3233sub read {
    3334    my $self = shift (@_);
    34     ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    35 #    my $count = 0;
     35    ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     36
     37    my $count = 0;
    3638
    3739    # see if this has an archives information file within it
     
    3941
    4042    if (-e $archive_info_filename) {
     43
     44    # found an archives.inf file
     45    print STDERR "ArcPlug: processing $archive_info_filename\n";
    4146
    4247    # read in the archives information file
     
    4853    # process each file
    4954    foreach $subfile (@$file_list) {
    50 ####        # note: metadata is not carried on to the next level - why ??? - I changed this
    51         # so I could pass the classifytype from mgbuilder - Stefan.
     55        last if (defined $maxdocs && $maxdocs =~ /\d/ && $count >= $maxdocs);
     56
    5257        my $tmp = &util::filename_cat ($file, $subfile->[0]);
    5358        next if $tmp eq $file;
    54         &plugin::read ($pluginfo, $base_dir, $tmp, $metadata, $processor);
    55 #       $count ++;
    56 #       last if $count > 200;
    57     }
    58    
    59     # all books have been processed so need to output classifications
    60     # to infodb - note that at present you have to import before building
    61     if (defined $processor->{'mode'} && $processor->{'mode'} eq 'infodb') {
    62         print STDERR "ArcPlug: Adding classifications to infodb\n";
    63         $processor->process('classifications');
     59        # note: metadata is not carried on to the next level
     60        $count += &plugin::read ($pluginfo, $base_dir, $tmp, {}, $processor, $maxdocs);
    6461    }
    6562
    66     return 1;
     63    return $count;
    6764    }
    6865
    6966    # wasn't an archives directory, someone else will have to process it
    70     return 0;
     67    return undef;
    7168}
    7269
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r4 r317  
    1717}
    1818
    19 # return 1 if processed, 0 if not processed
     19# return number of files processed, undef if can't process
    2020# Note that $base_dir might be "" and that $file might
    2121# include directories
    2222sub read {
    2323    my $self = shift (@_);
    24     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
     24    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    2525
    2626    die "BasPlug::read function must be implemented in sub classes\n";
    2727
    28     return 0; # will never get here
     28    return undef; # will never get here
    2929}
    3030
  • trunk/gsdl/perllib/plugins/FOXPlug.pm

    r168 r317  
    3232
    3333
    34 # return 1 if processed, 0 if not processed
     34# return number of files processed, undef if can't process
    3535# Note that $base_dir might be "" and that $file might
    3636# include directories
     
    4141
    4242    # dbt files are processed at the same time as dbf files
    43     return 1 if ($fullname =~ /\.dbt$/i);
     43    return 0 if ($fullname =~ /\.dbt$/i);
    4444
    4545    # see if this is a foxbase database
    46     return 0 unless (-f $fullname && $fullname =~ /\.dbf$/i);
     46    return undef unless (-f $fullname && $fullname =~ /\.dbf$/i);
    4747
    4848    my ($parent_dir) = $fullname =~ /^(.*)\/[^\/]+\.dbf$/i;
     
    5151    if (!open (FOXBASEIN, $fullname)) {
    5252    print STDERR "FOXPlug::read - couldn't read $fullname\n";
    53     return 0;
     53    return undef;
    5454    }
    5555   
     
    6363    print STDERR "FOXPlug::read - eof while reading database header";
    6464    close (FOXBASEIN);
    65     return 0;
     65    return undef;
    6666    }
    6767   
     
    7979    } else {
    8080    print STDERR "FOXPlug:read $fullname doesn't seem to be a Foxbase file\n";
    81     return 0;
     81    return undef;
    8282    }
    8383
     
    109109    print STDERR "FOXPlug::read - couldn't read $dbtfullname\n";
    110110    close (FOXBASEIN);
    111     return 0;
     111    return undef;
    112112    }
    113113
  • trunk/gsdl/perllib/plugins/GMLPlug.pm

    r245 r317  
    3838}
    3939
    40 
    41 # return 1 if processed, 0 if not processed
     40# return number of files processed, undef if can't process
    4241# Note that $base_dir might be "" and that $file might
    4342# include directories
     
    4847
    4948    # see if this is a gml book
    50     return 0 unless (-f $fullname && $fullname =~ /\.gml$/i);
     49    return undef unless (-f $fullname && $fullname =~ /\.gml$/i);
    5150
    5251    my ($parent_dir) = $fullname =~ /^(.*)\/[^\/]+.gml$/;
     
    6362    if (!open (INFILE, $fullname)) {
    6463    print STDERR "GMLPlug::read - couldn't read $fullname\n";
    65     return 0;
     64    return undef;
    6665    }
    6766    while (defined ($line = <INFILE>)) {
  • trunk/gsdl/perllib/plugins/HBPlug.pm

    r288 r317  
    104104    $section =~ s/(.{1,80})\s/$1\n/g;
    105105   
    106     # fix up the image links -- not sure what Rodger intended here - Stefan.
    107 #    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
    108 #    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
    109 
    110     $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
    111     $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
     106    # fix up the image links
     107    $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
     108    <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
     109    $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
     110    <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
    112111
    113112    return $section;
     
    126125
    127126
    128 # return 1 if processed, 0 if not processed
     127# return number of files processed, undef if can't process
    129128# Note that $base_dir might be "" and that $file might
    130129# include directories
     
    138137    ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
    139138    }
    140     return 0 unless defined $jobnumber;
     139    return undef unless defined $jobnumber;
    141140    my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
    142     return 0 unless -e $htmlfile;
     141    return undef unless -e $htmlfile;
    143142
    144143    print STDERR "HBPlug: processing $file\n";
     
    155154    if -e $bookcover;
    156155
     156    my $cursection = $doc_obj->get_top_section();
     157
     158    # add metadata for top level of document
     159    foreach $field (keys(%$metadata)) {
     160    # $metadata->{$field} may be an array reference
     161    if (ref ($metadata->{$field}) eq "ARRAY") {
     162        map {
     163        $doc_obj->add_metadata ($cursection, $field, $_);
     164        } @{$metadata->{$field}};
     165    } else {
     166        $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
     167    }
     168    }
     169    # need to add <classifytype> to each book as we'll be using 'Hierarchy'
     170    # for HB collections rather than the default ('Book')
     171    $doc_obj->add_metadata ($cursection, 'classifytype', 'Hierarchy');
     172
     173
    157174    # process the file one section at a time
    158175    my $curtoclevel = 1;
    159     my $cursection = $doc_obj->get_top_section();
    160176    my $firstsection = 1;
    161177    while (length ($html) > 0) {
     
    195211        $doc_obj->add_metadata ($cursection, "Title", $title);
    196212
    197         if ($firstsection) {
    198         foreach $field (keys(%$metadata)) {
    199             # $metadata->{$field} may be an array reference
    200             if (ref ($metadata->{$field}) eq "ARRAY") {
    201             map {
    202                 $doc_obj->add_metadata ($cursection, $field, $_);
    203             } @{$metadata->{$field}};
    204             } else {
    205             $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
    206             }
    207         }
    208         $firstsection = 0;
    209         }
    210 
    211213        # clean up the section html
    212214        $sectiontext = $self->HB_clean_section($sectiontext);
    213215
    214216        # associate any files
    215 #       map { $doc_obj->associate_file("$base_dir$file/$1", $1)
    216 #             if /_linkOID_\(_thisOID_\/([^\)]+)\)/; 0; }
    217 #        split (/(_linkOID_\(_thisOID_\/[^\)]+\))/, $sectiontext);
    218 
    219217        map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
    220218              if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
     
    229227        last;
    230228    }
     229    $firstsection = 0;
    231230    }
    232231
  • trunk/gsdl/perllib/plugins/IndexPlug.pm

    r286 r317  
    5252}
    5353
    54 # return 1 if processed, 0 if not processed
     54
     55# return number of files processed, undef if can't process
    5556# Note that $base_dir might be "" and that $file might
    5657# include directories
    5758sub read {
    5859    my $self = shift (@_);
    59     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
     60    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    6061
    6162    my $indexfile = &util::filename_cat($base_dir, $file, "index.txt");
    6263    if (!-f $indexfile) {
    6364    # not a directory containing an index file
    64     return 0;
     65    return undef;
    6566    }
    6667
     
    7273    my @fields = ();
    7374    # see if there's a 'key:' line
    74     foreach $line (keys %$list) {
    75     if ($line =~ /key:/i) {
    76         @fields = @{$list->{$line}};
    77         last;
    78     }
     75    if (defined $list->{'key:'}) {
     76    @fields = @{$list->{'key:'}};
    7977    }
    8078
    8179    # process each document
    82 
    8380    my $count = 0;
    8481    foreach $docfile (keys (%$list)) {
    85     last if $count > 10;
    86     $count ++;
     82    last if (defined $maxdocs && $maxdocs =~ /\d/ && $count >= $maxdocs);
    8783    $metadata = {}; # at present we can do this as metadata
    8884                    # will always be empty when it arrives
    89                     # at any plugin - this might cause
     85                    # at this plugin - this might cause
    9086                    # problems if things change though
    9187
     
    108104        }
    109105    }
    110     &plugin::read ($pluginfo, $base_dir, $docfile, $metadata, $processor);
     106    $count += &plugin::read ($pluginfo, $base_dir, $docfile, $metadata, $processor, $maxdocs);
    111107    }
    112108
    113     return 1; # was processed
     109    return $count; # was processed
    114110}
    115111
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r136 r317  
    2727}
    2828
    29 # return 1 if processed, 0 if not processed
     29
     30# return number of files processed, undef if can't process
    3031# Note that $base_dir might be "" and that $file might
    3132# include directories
    3233sub read {
    3334    my $self = shift (@_);
    34     my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
    35 
     35    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    3636
    3737    my (@dir, $subfile);
     38
     39    my $count = 0;
    3840
    3941    # see if this is a directory
     
    4749    closedir (DIR);
    4850
     51    print STDERR "RecPlug: getting directory $dirname\n";
     52
    4953    # process each file
    5054    foreach $subfile (@dir) {
     55        last if (defined $maxdocs && $maxdocs =~ /\d/ && $count >= $maxdocs);
     56
    5157        if ($subfile !~ /^\.\.?$/) {
    5258        # note: metadata is not carried on to the next level
    53         &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile),
    54                    {}, $processor);
     59        $count += &plugin::read ($pluginfo, $base_dir, &util::filename_cat($file, $subfile),
     60                     {}, $processor, $maxdocs);
    5561        }
    5662    }
    57 
    58     return 1;
     63    return $count;
    5964    }
    6065
    6166    # wasn't a directory, someone else will have to process it
    62     return 0;
     67    return undef;
    6368}
    6469
Note: See TracChangeset for help on using the changeset viewer.