Changeset 1393


Ignore:
Timestamp:
08/11/00 11:12:15 (20 years ago)
Author:
say1
Message:

acronym markup functionality

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1361 r1393  
    4747my $min_acro_length = 3;
    4848#minimum acronym length saving
    49 my $min_length_saving = 3;
     49my $min_length_saving = 4;
    5050#allow recusive acronyms
    5151my $allow_recursive = "";
    5252
    53 my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND";
    54 #my @stop_words = split / /, "OF AT THE IN TO AND";
     53my @stop_words = split / /, "OF AT THE IN TO AND";
    5554
    5655#the text split into an array, one word per element
     
    5857my @acronym_list = ();
    5958
     59#the file to collate acronyms into
     60my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm";
     61my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm";
     62
     63my %acronyms_found_in_collection = ();
     64my %acronyms_banned_from_collection = ();
     65
     66my $writing_acronyms = 1;
     67my $accumulate_acronyms = 1;
     68my $markup_accumulate_acronyms = 1;
     69my $markup_local_acronyms = 1;
     70
     71
     72
     73###########################################################################
     74#   file saving / loading stuff
     75###########################################################################
     76
     77sub init_acronyms {
     78   
     79    my $file_text = "";
     80    if (open ACRONYM_HANDLE, "<$acronym_options_file")
     81    {
     82    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
     83    }
     84    if ($file_text eq "")
     85    {
     86    print STDERR "failed to open $acronym_options_file\n";
     87    open ACRONYM_HANDLE, ">$acronym_options_file\n";
     88    print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n";
     89    print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n";
     90    print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n";
     91    print ACRONYM_HANDLE "#valid here.\n\n";
     92    print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n";
     93    print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n";
     94    print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n";
     95    print ACRONYM_HANDLE "#explain them fully \n\n";
     96    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";
     97    print ACRONYM_HANDLE "\$max_offset = 30;\n\n";
     98    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n";
     99    print ACRONYM_HANDLE "\$upper_case = 1;\n\n";
     100    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n";
     101    print ACRONYM_HANDLE "\$case_match = 1;\n\n";
     102    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
     103    print ACRONYM_HANDLE "\$min_def_length = 3;\n\n";
     104    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n";
     105    print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n";
     106    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n";
     107    print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n";
     108    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n";
     109    print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n";
     110    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n";
     111    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n";
     112    print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n";
     113    print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";
     114    print ACRONYM_HANDLE "\n";
     115    print ACRONYM_HANDLE "#the file to collate acronyms into\n";
     116    print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n";
     117    print ACRONYM_HANDLE "\n";
     118    print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n";
     119    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n";
     120    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n";
     121    print ACRONYM_HANDLE "# \n";
     122    print ACRONYM_HANDLE "# \n";
     123    print ACRONYM_HANDLE "# \n";
     124    }
     125    eval $file_text;
     126    #promotes warnings/errors from evaluated file to current context
     127    warn $@ if $@;
     128
     129
     130
     131    &read_all_acronyms_from_file();
     132#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).
     133#   int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7));
     134    if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file")
     135    {
     136    print ACRONYM_HANDLE "#This is an automatically generated file.\n";
     137    print ACRONYM_HANDLE "#\n";
     138    print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n";
     139    print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n";
     140    print ACRONYM_HANDLE "#read-only. \n";
     141    print ACRONYM_HANDLE "#\n";
     142    print ACRONYM_HANDLE "#start of acronyms...\n";
     143    $writing_acronyms = 1;
     144    } else {
     145    warn "failed to open $acronym_accumulate_file for writing\n";
     146    $writing_acronyms = 0;
     147    }
     148}
     149
     150#close the list of accumulated acronyms
     151END {
     152    if ($writing_acronyms)
     153    {
     154    print ACRONYM_HANDLE "#end of acronyms.\n"; 
     155    close ACRONYM_HANDLE;
     156    }
     157}
     158
     159#eval a file of accumulated acronyms
     160sub read_all_acronyms_from_file {
     161   
     162    my $file_text = "";
     163    if (open ACRONYM_HANDLE, "<$acronym_accumulate_file")
     164    {
     165    $file_text = do { local $/; <ACRONYM_HANDLE> }; 
     166    } else {
     167    print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n";
     168    }
     169    eval $file_text;
     170    #promotes warnings/errors from evaluated file to current context
     171    warn $@ if $@;
     172}
     173
     174#called from within the file of accumulated acronyms to indicate a good acronym
     175sub add {
     176    my $self = shift (@_);
     177    if (defined ($acronyms_found_in_collection{$self->[0]}))
     178    {
     179    my $def = $self->to_def_string();
     180    if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/)
     181    {
     182        return;
     183    }
     184    $acronyms_found_in_collection{$self->[0]} =
     185        $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string();
     186    } else {
     187    $acronyms_found_in_collection{$self->[0]} = $self->to_def_string();
     188    }
     189}
     190
     191#called from within the file of accumulated acronyms to indicate a bad acronym
     192sub ban {
     193    my $self = shift (@_);
     194   
     195    if (!defined $acronyms_banned_from_collection{$self->[0]})
     196    {
     197    $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string();
     198    } else {
     199    $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string();
     200    }
     201}
     202
     203
     204#write a good acronym to the accumulated acronyms file
     205sub write_to_file {
     206    my $self = shift (@_);
     207    if ($writing_acronyms)
     208    {
     209    print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .
     210        $self->to_def_string() .
     211        "\")->add();\n";
     212    }
     213}
     214
     215
     216###########################################################################
     217# mark functionality   
     218###########################################################################
     219
     220#small routine to sort by length
     221sub sort_by_length {
     222    length($b) <=> length($a) or $a cmp $b
     223}
     224
     225sub markup_acronyms {
     226    my  $text = shift (@_);
     227    my  $verbosity_obj = shift (@_);
     228    if (defined $text)
     229    {
     230    for my $acro (sort sort_by_length keys %acronyms_found_in_collection)
     231    {
     232        $text  =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm;
     233        printf STDERR " " .  $acro . ","
     234        if ($verbosity_obj->{'verbosity'} >= 2);
     235    }
     236    }
     237    return $text;
     238}
     239
     240
    60241
    61242###########################################################################
     
    65246
    66247sub new {
     248    my $trash = shift (@_);
     249    my $acro = shift (@_);
     250    my $def  = shift (@_);
     251   
    67252    my $self = [
    68     "", # 0 acronym
    69     [], # 1 definition
    70     ];
     253        "", # 0 acronym
     254        [], # 1 definition
     255               ];
     256   
     257    $self->[0] = $acro                    if defined $acro;
     258    push @{$self->[1]},  split / /, $def  if defined $def;
     259   
    71260    bless $self;
    72261}
    73 
    74 
    75262
    76263sub clone  {
     
    226413#   print "acronym " . $self->to_string() . " rejected (too short III)\n";
    227414#   print "" . $min_length_saving .
    228         "|" . $self->letters_in_acronym() .
    229         "|" . $self->letters_in_acronym_definition() . "\n";
     415#       "|" . $self->letters_in_acronym() .
     416#       "|" . $self->letters_in_acronym_definition() . "\n";
    230417    return 0;
    231418    }
     
    446633#&test();
    447634
     635&init_acronyms();
     636
    4486371;
    449638
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1384 r1393  
    6767    print STDERR "                     file extensions.\n";
    6868    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
     69    print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
    6970    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
    7071}
     
    9091             q^block_exp/.*/^, \$self->{'block_exp'},
    9192             q^extract_acronyms^, \$self->{'extract_acronyms'},
     93             q^markup_acronyms^, \$self->{'markup_acronyms'},
    9294             q^extract_language^, \$self->{'extract_language'},
    9395             "allow_extra_options")) {
     
    297299    }
    298300
     301    if ($self->{'markup_acronyms'}) {
     302    my $thissection = $doc_obj->get_top_section();
     303    while (defined $thissection) {
     304        my $text = $doc_obj->get_text($thissection);
     305        $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
     306        $doc_obj->delete_text($thissection);
     307        $doc_obj->add_text($thissection, $text);
     308        $thissection = $doc_obj->get_next_section ($thissection);
     309    }
     310    }
     311
    299312    if ($self->{'extract_language'}) {
    300313    my $thissection = $doc_obj->get_top_section();
     
    341354    my ($textref, $doc_obj, $thissection) = @_;
    342355
    343     print STDERR " checking for acronyms ...\n"
     356    print STDERR " extracting acronyms ...\n"
    344357    if ($self->{'verbosity'} >= 2);
    345358
     
    362375    if ($seen_before eq "false")
    363376    {
     377        #write it to the file ...
     378        $acro->write_to_file();
     379
    364380        #do the normal acronym
    365381        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
     
    367383            if ($self->{'verbosity'} >= 1);
    368384       
    369         # do the KWIC (Key Word In Context) acronym
    370         my @kwic = $acro->to_string_kwic();
    371         foreach my $kwic (@kwic) {
    372         $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
    373         print STDERR "   adding ".  $kwic . "\n"
    374             if ($self->{'verbosity'} >= 2);
    375         }
    376     }
    377     }
    378     print STDERR " done with acronyms. \n"
     385#       # do the KWIC (Key Word In Context) acronym
     386#       my @kwic = $acro->to_string_kwic();
     387#       foreach my $kwic (@kwic) {
     388#       $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
     389#       print STDERR "   adding ".  $kwic . "\n"
     390#           if ($self->{'verbosity'} >= 2);
     391#       }
     392    }
     393    }
     394    print STDERR " done extracting acronyms. \n"
    379395    if ($self->{'verbosity'} >= 2);
    380396}
    381397
     398sub markup_acronyms {
     399    my $self = shift (@_);
     400    my ($text, $doc_obj, $thissection) = @_;
     401
     402    print STDERR " marking up acronyms ...\n"
     403    if ($self->{'verbosity'} >= 2);
     404
     405    #self is passed in to check for verbosity ...
     406    $text = &acronym::markup_acronyms($text, $self);
     407
     408    print STDERR " done marking up acronyms. \n"
     409    if ($self->{'verbosity'} >= 2);
     410
     411    return $text;
     412}
     413
    3824141;
Note: See TracChangeset for help on using the changeset viewer.