Changeset 1709


Ignore:
Timestamp:
2000-11-28T15:59:09+13:00 (23 years ago)
Author:
sjboddie
Message:

build script edited to use wget to download source files via http
and ftp

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/build

    r1678 r1709  
    144144        $download_dir =~ s/\s+$//;
    145145       
    146         if ($download_dir =~ /^http:\/\//) {
    147         # http download
    148 
    149         } elsif ($download_dir =~ /^ftp:\/\//) {
    150         # ftp download
    151 
     146        if ($download_dir =~ /^(http|ftp):\/\//) {
     147        # use wget to mirror http or ftp urls
     148        # options used are:
     149        #  -P = the directory to download documents to
     150        #  -np = don't ascend to parent directories. this means that only documents
     151        #        that live in the same directory or below on the same server as
     152        #        the given url will be downloaded
     153        #  -nv = not too verbose
     154        #  -r = recursively mirror
     155        #  -N = use time-stamping to see if an up-to-date local copy of each
     156        #       file already exists. this may be useful if wget fails and
     157        #       is restarted
     158        #  -l inf = infinite recursion depth
     159        #  -R "*\?*" = don't download cgi based urls
     160        #  -o = the output file to write download status to (only used if the -out
     161        #       option was given to build)
     162        my $download_cmd = "perl -S gsWget.pl -P \"$importdir\" -np -nv";
     163        $download_cmd .= " -r -N -l inf -R \"*\?*\"";
     164        $download_cmd .= " -o \"$outfile.download\"" if $use_out;
     165        $download_cmd .= " \"$download_dir\"";
     166        system ($download_cmd);
     167
     168        # note that wget obeys the robot rules. this means that it will have
     169        # downloaded a robots.txt file if one was present. since it's unlikely
     170        # anyone really wants to include it in a collection we'll delete it.
     171        # robots.txt shouldn't be more than two directories deep (I think it will
     172        # always be exactly two deep but will look for it in the top directory too)
     173        # so that's as far as we'll go looking for it.
     174        if (opendir (DIR, $importdir)) {
     175            my @files = readdir DIR;
     176            closedir DIR;
     177            foreach my $file (@files) {
     178            next if $file =~ /^\.\.?$/;
     179            if ($file =~ /^robots.txt$/i) {
     180                &util::rm (&util::filename_cat ($importdir, $file));
     181                last;
     182            } else {
     183                $file = &util::filename_cat ($importdir, $file);
     184                if (-d $file) {
     185                if (opendir (DIR, $file)) {
     186                    my @files2 = readdir DIR;
     187                    closedir DIR;
     188                    foreach my $file2 (@files2) {
     189                    if ($file2 =~ /^robots.txt$/i) {
     190                        &util::rm (&util::filename_cat ($file, $file2));
     191                        last;
     192                    }
     193                    }
     194                }
     195                }
     196            }
     197            }
     198        }
     199
     200        # if using output directory append the file download output to it
     201        &append_file ($out, "$outfile.download");
     202       
    152 203        } else {
    153 204        # we assume anything not beginning with http:// or ftp://
Note: See TracChangeset for help on using the changeset viewer.