Changeset 16767


Ignore:
Timestamp:
08/13/08 16:25:49 (13 years ago)
Author:
ak19
Message:

In progress: filename encoding changes, after working on them on Windows. Still need to get interlinking HTML files (with multilingual filenames) working.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16698 r16767  
    411411    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
    412412
    413 ##  print STDERR "**** User chose filename encoding setting: $filename_encoding\n";
    414    
    415413    # Whenever filename-encoding is set to any of the auto settings, we
    416414    # check if the filename is already in UTF8. If it is, then we're done.
     
    418416    if(&unicode::check_is_utf8($filemeta))
    419417    {
    420 ##      print STDERR "**** It is already UTF8\n";
    421418        $filename_encoding = "utf8";
    422419        return $filemeta;
     
    484481    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
    485482    }
    486 
    487 ##    print STDERR "**** filename_encoding selected: $filename_encoding \n";
    488483       
    489484    # if still undefined, use utf8 as fallback
     
    492487    }
    493488
     489    print STDERR "**** UTF8 encoding the filename $filemeta ";
     490   
    494491    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
    495492    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
     
    498495    # cases attempt to make the filename utf8 to match.
    499496    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
    500 ##  print STDERR "**** BEFORE utf8 conversion: $filemeta\n";
    501497    &unicode::ensure_utf8(\$filemeta);
    502 ##  print STDERR "**** AFTER utf8 conversion: $filemeta\n";
    503     }
    504 
     498    }
    505499
    506500    # convert non-unicode encodings to utf8
     
    511505    }
    512506
    513     print "*** filename encoding found: $filename_encoding\n";
    514     print "*** utf8 encoded filename: $filemeta\n";
    515 
     507    print STDERR " from encoding $filename_encoding -> $filemeta\n";
    516508    return $filemeta;
    517509}
     
    528520    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
    529521    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
    530 
     522   
    531523    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
    532524
     
    543535    }
    544536
    545     print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
     537    print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
    546538    return $self->{'filesystem_encoding'}; # can be the string "undefined"
    547539}
     
    559551    $strictfilemeta =~ s/\s*$//g;
    560552   
    561 ##    print STDERR "**** strict filename is |$strictfilemeta|\n";
    562553    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
    563554    if(!defined $filename_encoding) {
     
    565556    }
    566557
    567 ##    print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n";
    568558    return $filename_encoding; # can be the string "undefined"
    569559}
     
    580570    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
    581571    #my $results = $self->{'textcat'}->classify(\$text);
    582     my $results = $self->{'textcat'}->classify_cached(\$text);
     572    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
    583573
    584574
    585575    if (scalar @$results < 0) {
    586     print STDERR "**** Textcat returned 0 results\n";
    587576    return undef;
    588577    }
    589578   
    590     print STDERR "**** TEXTCAT RESULTS for $text: ";
    591     print STDERR join(",", @$results);
    592     print STDERR "\n";
     579##    print STDERR "**** TEXTCAT RESULTS for $text: ";
     580##    print STDERR join(",", @$results);
     581##    print STDERR "\n";
    593582
    594583    # We have some results, we choose the first
     
    597586    $best_encoding = $encoding;
    598587    if (!defined $best_encoding) {
    599 ##  print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";
    600588    return undef;
    601589    }
     
    603591    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
    604592    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
    605 ##  print STDERR "*** Filename turns out to be UTF8\n";
    606593    $best_encoding = 'utf8';
    607594    }
     
    611598    # eg MS versions of standard encodings
    612599    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
    613 ##  print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";
    614 
    615600    my $iso = $1; # which variant of the iso standard?
    616601    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
    617602    if ($text =~ /[\x80-\x9f]/) {
    618 ##      print STDERR "**** best_encoding is some windows value: $best_encoding\n";
    619603        # Western Europe
    620604        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
     
    625609        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
    626610        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
    627 ##      print STDERR "**** best_encoding windows value: $best_encoding\n";
    628611    }
    629612    }
     
    635618        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
    636619    }
    637 ##  print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n"; 
    638620    $best_encoding = undef;
    639621    }
    640 ##    print STDERR "**** language: $language\n" if defined $language;
    641 ##    print STDERR "**** encoding: $best_encoding\n" if defined $encoding;
    642622   
    643623    return $best_encoding;
     
    706686    my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
    707687    $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
    708 
    709688}
    710689     
     
    724703    my $top_section = $doc_obj->get_top_section();
    725704    my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
    726 ##  print STDERR "**** oid = $oid\n";
    727705        $doc_obj->set_OID($oid);
    728706    }
Note: See TracChangeset for help on using the changeset viewer.