Ignore:
Timestamp:
2015-07-14T21:09:32+12:00 (9 years ago)
Author:
ak19
Message:

Finally committing Dr Bainbridge's suggested fix (tested) to handle the updated Tudor collection when downloaded from the web. The HTML files link to themselves with a <link rel> tag (e.g. <link rel="canonical" href="index.html" />); previously this caused each such HTML file to be added to the block list, so the files were blocked from being processed and 0 docs were processed. Now HTMLPlugin tests whether a linked associated file is the document itself and, if so, does not block it.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r29818 r30022  
    555555        next;
    556556    }
    557     print STDERR "** Dir Plugin processing $raw_full_filename\n";
     557    print STDERR "** DirectoryPlugin processing $raw_full_filename\n";
    558558    # Follow Windows shortcuts
    559559    if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) {
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r28319 r30022  
    283283        print $outhandle " ->$unicode_url_original_filename\n";
    284284
    285         # Allow for possibility of raw byte version and Unicode versions of file
    286         &util::block_filename($block_hash,$unicode_url_original_filename);
     285        # make sure not to block the file itself, as happens when an html file links to itself
     286        # e.g. if the current file is mary-boleyn/index.html and contains <link rel="canonical" href="index.html" />
     287        my $unicode_html_fname = "";
     288        $self->decode_text($html_fname,$content_encoding,$language,\$unicode_html_fname);       
     289        if($unicode_url_original_filename ne $unicode_html_fname) {
     290            # Allow for possibility of raw byte version and Unicode versions of file
     291            &util::block_filename($block_hash,$unicode_url_original_filename);
     292        }
    287293    }
    288294
    289295    # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename);
    290     &util::block_filename($block_hash,$url_original_filename);
     296    &util::block_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname;
     297
     298            # but only add the linked file to the blocklist if the current html file does not link to itself
    291299       
    292300    }
Note: See TracChangeset for help on using the changeset viewer.