Changeset 30022

Show
Ignore:
Timestamp:
14.07.2015 21:09:32 (4 years ago)
Author:
ak19
Message:

Finally committing Dr Bainbridge's suggested fix (tested) to handle the updated Tudor collection when downloaded from the web. The HTML files link to themselves with the <link rel> tag, which caused the HTML files themselves to be blocked from processing, so that 0 docs were processed. Now HTMLPlugin tests whether any of the linked associated files it finds refer to the document itself, and in such cases they are not blocked.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r29818 r30022  
    555555        next; 
    556556    } 
    557     print STDERR "** Dir Plugin processing $raw_full_filename\n"; 
     557    print STDERR "** DirectoryPlugin processing $raw_full_filename\n"; 
    558558    # Follow Windows shortcuts 
    559559    if ($raw_subfile =~ m/(?i)\.lnk$/ && (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin"))) { 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r28319 r30022  
    283283        print $outhandle " ->$unicode_url_original_filename\n"; 
    284284 
    285         # Allow for possibility of raw byte version and Unicode versions of file 
    286         &util::block_filename($block_hash,$unicode_url_original_filename); 
     285        # make sure not to block the file itself, as happens when an html file links to itself 
     286        # e.g. if the current file is mary-boleyn/index.html and contains <link rel="canonical" href="index.html" /> 
     287        my $unicode_html_fname = ""; 
     288        $self->decode_text($html_fname,$content_encoding,$language,\$unicode_html_fname);        
     289        if($unicode_url_original_filename ne $unicode_html_fname) { 
     290            # Allow for possibility of raw byte version and Unicode versions of file 
     291            &util::block_filename($block_hash,$unicode_url_original_filename); 
     292        } 
    287293    } 
    288294 
    289295    # $url_original_filename = &util::upgrade_if_dos_filename($url_original_filename); 
    290     &util::block_filename($block_hash,$url_original_filename); 
     296    &util::block_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname; 
     297 
     298            # but only add the linked file to the blocklist if the current html file does not link to itself 
    291299         
    292300    }