Changeset 16767

Show
Ignore:
Timestamp:
13.08.2008 16:25:49 (11 years ago)
Author:
ak19
Message:

In progress: filename-encoding handling, revised after testing it on Windows. Interlinking of HTML files (with multilingual filenames) still needs to be made to work.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16698 r16767  
    411411    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting 
    412412 
    413 ##  print STDERR "**** User chose filename encoding setting: $filename_encoding\n"; 
    414      
    415413    # Whenever filename-encoding is set to any of the auto settings, we 
    416414    # check if the filename is already in UTF8. If it is, then we're done. 
     
    418416    if(&unicode::check_is_utf8($filemeta))  
    419417    { 
    420 ##      print STDERR "**** It is already UTF8\n"; 
    421418        $filename_encoding = "utf8"; 
    422419        return $filemeta; 
     
    484481    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 
    485482    } 
    486  
    487 ##    print STDERR "**** filename_encoding selected: $filename_encoding \n"; 
    488483         
    489484    # if still undefined, use utf8 as fallback 
     
    492487    } 
    493488 
     489    print STDERR "**** UTF8 encoding the filename $filemeta "; 
     490     
    494491    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when 
    495492    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to 
     
    498495    # cases attempt to make the filename utf8 to match. 
    499496    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) { 
    500 ##  print STDERR "**** BEFORE utf8 conversion: $filemeta\n"; 
    501497    &unicode::ensure_utf8(\$filemeta); 
    502 ##  print STDERR "**** AFTER utf8 conversion: $filemeta\n"; 
    503     } 
    504  
     498    } 
    505499 
    506500    # convert non-unicode encodings to utf8 
     
    511505    } 
    512506 
    513     print "*** filename encoding found: $filename_encoding\n"; 
    514     print "*** utf8 encoded filename: $filemeta\n"; 
    515  
     507    print STDERR " from encoding $filename_encoding -> $filemeta\n"; 
    516508    return $filemeta; 
    517509} 
     
    528520    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 
    529521    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding); 
    530  
     522     
    531523    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 
    532524 
     
    543535    } 
    544536 
    545     print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 
     537    print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 
    546538    return $self->{'filesystem_encoding'}; # can be the string "undefined" 
    547539} 
     
    559551    $strictfilemeta =~ s/\s*$//g; 
    560552     
    561 ##    print STDERR "**** strict filename is |$strictfilemeta|\n"; 
    562553    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta); 
    563554    if(!defined $filename_encoding) { 
     
    565556    } 
    566557 
    567 ##    print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n"; 
    568558    return $filename_encoding; # can be the string "undefined" 
    569559} 
     
    580570    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'}); 
    581571    #my $results = $self->{'textcat'}->classify(\$text); 
    582     my $results = $self->{'textcat'}->classify_cached(\$text); 
     572    my $results = $self->{'textcat'}->classify_cached_filename(\$text); 
    583573 
    584574 
    585575    if (scalar @$results < 0) {  
    586     print STDERR "**** Textcat returned 0 results\n"; 
    587576    return undef; 
    588577    } 
    589578     
    590     print STDERR "**** TEXTCAT RESULTS for $text: "; 
    591     print STDERR join(",", @$results); 
    592     print STDERR "\n"; 
     579##    print STDERR "**** TEXTCAT RESULTS for $text: "; 
     580##    print STDERR join(",", @$results); 
     581##    print STDERR "\n"; 
    593582 
    594583    # We have some results, we choose the first 
     
    597586    $best_encoding = $encoding; 
    598587    if (!defined $best_encoding) { 
    599 ##  print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n"; 
    600588    return undef; 
    601589    }  
     
    603591    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 
    604592    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) 
    605 ##  print STDERR "*** Filename turns out to be UTF8\n"; 
    606593    $best_encoding = 'utf8'; 
    607594    } 
     
    611598    # eg MS versions of standard encodings 
    612599    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) { 
    613 ##  print STDERR "**** best_encoding is ISO_8859: $best_encoding\n"; 
    614  
    615600    my $iso = $1; # which variant of the iso standard? 
    616601    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 
    617602    if ($text =~ /[\x80-\x9f]/) { 
    618 ##      print STDERR "**** best_encoding is some windows value: $best_encoding\n"; 
    619603        # Western Europe 
    620604        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 
     
    625609        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 
    626610        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 
    627 ##      print STDERR "**** best_encoding windows value: $best_encoding\n"; 
    628611    } 
    629612    } 
     
    635618        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef"); 
    636619    } 
    637 ##  print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n";   
    638620    $best_encoding = undef; 
    639621    } 
    640 ##    print STDERR "**** language: $language\n" if defined $language; 
    641 ##    print STDERR "**** encoding: $best_encoding\n" if defined $encoding; 
    642622     
    643623    return $best_encoding; 
     
    706686    my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding); 
    707687    $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta); 
    708  
    709688} 
    710689      
     
    724703    my $top_section = $doc_obj->get_top_section(); 
    725704    my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id); 
    726 ##  print STDERR "**** oid = $oid\n"; 
    727705        $doc_obj->set_OID($oid); 
    728706    }