Changeset 1393

Show
Ignore:
Timestamp:
11.08.2000 11:12:15 (19 years ago)
Author:
say1
Message:

acronym markup functionality

Location:
trunk/gsdl/perllib
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/acronym.pm

    r1361 r1393  
    4747my $min_acro_length = 3; 
    4848#minimum acronym length saving 
    49 my $min_length_saving = 3; 
     49my $min_length_saving = 4; 
    5050#allow recursive acronyms 
    5151my $allow_recursive = ""; 
    5252 
    53 my @stop_words = split / /, "A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND"; 
    54 #my @stop_words = split / /, "OF AT THE IN TO AND"; 
     53my @stop_words = split / /, "OF AT THE IN TO AND"; 
    5554 
    5655#the text split into an array, one word per element 
     
    5857my @acronym_list = (); 
    5958 
     59#the file to collate acronyms into 
     60my $acronym_accumulate_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_definitions.pm"; 
     61my $acronym_options_file = $ENV{'GSDLCOLLECTDIR'} . "/etc/acronym_options.pm"; 
     62 
     63my %acronyms_found_in_collection = (); 
     64my %acronyms_banned_from_collection = (); 
     65 
     66my $writing_acronyms = 1; 
     67my $accumulate_acronyms = 1; 
     68my $markup_accumulate_acronyms = 1; 
     69my $markup_local_acronyms = 1; 
     70 
     71 
     72 
     73########################################################################### 
     74#   file saving / loading stuff 
     75########################################################################### 
     76 
     77sub init_acronyms { 
     78     
     79    my $file_text = ""; 
     80    if (open ACRONYM_HANDLE, "<$acronym_options_file") 
     81    { 
     82    $file_text = do { local $/; <ACRONYM_HANDLE> };   
     83    } 
     84    if ($file_text eq "") 
     85    { 
     86    print STDERR "failed to open $acronym_options_file\n"; 
     87    open ACRONYM_HANDLE, ">$acronym_options_file\n"; 
     88    print ACRONYM_HANDLE "#Config file for acronym extraction. EDIT THIS FILE, it should\n"; 
     89    print ACRONYM_HANDLE "#not be overridden by the software. It's read by GSDL using perl's\n"; 
     90    print ACRONYM_HANDLE "#'eval' function, so pretty much anything that's valid in perl is \n"; 
     91    print ACRONYM_HANDLE "#valid here.\n\n"; 
     92    print ACRONYM_HANDLE "#Quite a few things here are defined in terms of recall and precision\n"; 
     93    print ACRONYM_HANDLE "#which are the key measures from Information Retreval (IR). If you\n"; 
     94    print ACRONYM_HANDLE "#don't understand recall and precision, any good IR textbook should\n"; 
     95    print ACRONYM_HANDLE "#explain them fully \n\n"; 
     96    print ACRONYM_HANDLE "#the maximum range to look for acronyms (raise to raise precision)\n";  
     97    print ACRONYM_HANDLE "\$max_offset = 30;\n\n";  
     98    print ACRONYM_HANDLE "#acronyms must be upper case (0 = false, 1 = true)\n"; 
     99    print ACRONYM_HANDLE "\$upper_case = 1;\n\n"; 
     100    print ACRONYM_HANDLE "#acronym case must match (0 = false, 1 = true)\n"; 
     101    print ACRONYM_HANDLE "\$case_match = 1;\n\n"; 
     102    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n"; 
     103    print ACRONYM_HANDLE "\$min_def_length = 3;\n\n"; 
     104    print ACRONYM_HANDLE "#minimum acronym length (raise to raise precision)\n"; 
     105    print ACRONYM_HANDLE "\$min_acro_length = 3;\n\n"; 
     106    print ACRONYM_HANDLE "#minimum acronym length saving (raise to raise precision)\n"; 
     107    print ACRONYM_HANDLE "\$min_length_saving = 4;\n\n"; 
     108    print ACRONYM_HANDLE "#allow recusive acronyms (0 = false, 1 = true)\n"; 
     109    print ACRONYM_HANDLE "\$allow_recursive = 0;\n\n"; 
     110    print ACRONYM_HANDLE "#stop words-words allowed in acronyms (the multi-lingual version\n"; 
     111    print ACRONYM_HANDLE "#slows down acronym extraction slightly so is not the default)\n"; 
     112    print ACRONYM_HANDLE "#\@stop_words = split / /, \"A OF AT THE IN TO AND VON BEI DER DIE DAS DEM DEN DES UND DE DU A LA LE LES L DANS ET S\";\n"; 
     113    print ACRONYM_HANDLE "\@stop_words = split / /, \"OF AT THE IN TO AND\";\n";  
     114    print ACRONYM_HANDLE "\n";  
     115    print ACRONYM_HANDLE "#the file to collate acronyms into\n"; 
     116    print ACRONYM_HANDLE "\$acronym_accumulate_file = \"$ENV{'GSDLCOLLECTDIR'}\" . \"/etc/acronym_definitions.pm\";\n"; 
     117    print ACRONYM_HANDLE "\n"; 
     118    print ACRONYM_HANDLE "\$accumulate_acronyms = 1;\n\n"; 
     119    print ACRONYM_HANDLE "# any acronym definitions which should always be marked up can be copied here\n"; 
     120    print ACRONYM_HANDLE "# from the acronym_accumulate_file file ...\n"; 
     121    print ACRONYM_HANDLE "# \n"; 
     122    print ACRONYM_HANDLE "# \n"; 
     123    print ACRONYM_HANDLE "# \n"; 
     124    } 
     125    eval $file_text; 
     126    #promotes warnings/errors from evaluated file to current context  
     127    warn $@ if $@; 
     128 
     129 
     130 
     131    &read_all_acronyms_from_file(); 
     132#    rename $acronym_file, $acronym_file . "." . int(rand (2<<7)).  
     133#   int(rand (2<<7)). int(rand (2<<7)). int(rand (2<<7)); 
     134    if ($writing_acronyms && open ACRONYM_HANDLE, ">$acronym_accumulate_file") 
     135    { 
     136    print ACRONYM_HANDLE "#This is an automatically generated file.\n"; 
     137    print ACRONYM_HANDLE "#\n"; 
     138    print ACRONYM_HANDLE "#If you edit this file and it will be overwritten the next\n"; 
     139    print ACRONYM_HANDLE "#time the acronym code runs unless you set the file to \n"; 
     140    print ACRONYM_HANDLE "#read-only. \n"; 
     141    print ACRONYM_HANDLE "#\n"; 
     142    print ACRONYM_HANDLE "#start of acronyms...\n"; 
     143    $writing_acronyms = 1; 
     144    } else { 
     145    warn "failed to open $acronym_accumulate_file for writing\n"; 
     146    $writing_acronyms = 0; 
     147    } 
     148} 
     149 
     150#close the list of accumulated acronyms 
     151END { 
     152    if ($writing_acronyms) 
     153    { 
     154    print ACRONYM_HANDLE "#end of acronyms.\n";   
     155    close ACRONYM_HANDLE; 
     156    } 
     157} 
     158 
     159#eval a file of accumulated acronyms 
     160sub read_all_acronyms_from_file { 
     161     
     162    my $file_text = ""; 
     163    if (open ACRONYM_HANDLE, "<$acronym_accumulate_file") 
     164    { 
     165    $file_text = do { local $/; <ACRONYM_HANDLE> };   
     166    } else { 
     167    print STDERR "failed to open $acronym_accumulate_file for reading (this is the first pass?).\n"; 
     168    } 
     169    eval $file_text; 
     170    #promotes warnings/errors from evaluated file to current context  
     171    warn $@ if $@; 
     172} 
     173 
     174#called from within the file of accumulated acronyms to indicate a good acronym 
     175sub add { 
     176    my $self = shift (@_); 
     177    if (defined ($acronyms_found_in_collection{$self->[0]})) 
     178    { 
     179    my $def = $self->to_def_string(); 
     180    if ($acronyms_found_in_collection{$self->[0]} =~ m/(^|\|)$def(\||$)/) 
     181    { 
     182        return; 
     183    } 
     184    $acronyms_found_in_collection{$self->[0]} =  
     185        $acronyms_found_in_collection{$self->[0]} . "|" . $self->to_def_string(); 
     186    } else { 
     187    $acronyms_found_in_collection{$self->[0]} = $self->to_def_string(); 
     188    } 
     189} 
     190 
     191#called from within the file of accumulated acronyms to indicate a bad acronym 
     192sub ban { 
     193    my $self = shift (@_); 
     194     
     195    if (!defined $acronyms_banned_from_collection{$self->[0]}) 
     196    { 
     197    $acronyms_banned_from_collection{$self->[0]} = $self->to_def_string(); 
     198    } else { 
     199    $acronyms_banned_from_collection{$self->[0]} = $acronyms_banned_from_collection{$self->[0]} . "|" . $self->to_def_string(); 
     200    } 
     201} 
     202 
     203 
     204#write a good acronym to the accumulated acronyms file 
     205sub write_to_file { 
     206    my $self = shift (@_); 
     207    if ($writing_acronyms) 
     208    { 
     209    print ACRONYM_HANDLE "new acronym(\"$self->[0]\",\"" .  
     210        $self->to_def_string() .  
     211        "\")->add();\n"; 
     212    } 
     213} 
     214 
     215 
     216########################################################################### 
     217# mark functionality     
     218########################################################################### 
     219 
     220#small routine to sort by length  
     221sub sort_by_length { 
     222    length($b) <=> length($a) or $a cmp $b 
     223} 
     224 
     225sub markup_acronyms { 
     226    my  $text = shift (@_); 
     227    my  $verbosity_obj = shift (@_); 
     228    if (defined $text) 
     229    { 
     230    for my $acro (sort sort_by_length keys %acronyms_found_in_collection) 
     231    { 
     232        $text  =~ s/^((?:[^\<\n]|(?:\<[^\>\n]*\>))*)$acro([^\<A-Z])/$1$acro\<img src=\"\" width=8 height=8 alt=\"$acronyms_found_in_collection{$acro}\"\>$2/gm; 
     233        printf STDERR " " .  $acro . "," 
     234        if ($verbosity_obj->{'verbosity'} >= 2); 
     235    } 
     236    } 
     237    return $text; 
     238} 
     239 
     240 
    60241 
    61242########################################################################### 
     
    65246 
    # Constructor.
    #
    # Arguments (after the invocant, which is ignored):
    #   $acro - optional acronym string
    #   $def  - optional definition; split on single spaces into words
    #
    # Returns a blessed array ref: element 0 is the acronym ("" when none was
    # given), element 1 is an array ref of the definition words.
    sub new {
        my ($trash, $acro, $def) = @_;

        my @definition = defined $def ? split(/ /, $def) : ();

        my $self = [
            (defined $acro ? $acro : ""),   # 0 acronym
            \@definition,                   # 1 definition
        ];

        bless $self;
    }
    73  
    74  
    75262 
    76263sub clone  { 
     
    226413#   print "acronym " . $self->to_string() . " rejected (too short III)\n"; 
    227414#   print "" . $min_length_saving . 
    228         "|" . $self->letters_in_acronym() . 
    229         "|" . $self->letters_in_acronym_definition() . "\n"; 
     415#       "|" . $self->letters_in_acronym() . 
     416#       "|" . $self->letters_in_acronym_definition() . "\n"; 
    230417    return 0; 
    231418    } 
     
    446633#&test(); 
    447634 
     635&init_acronyms(); 
     636 
    4486371; 
    449638 
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1384 r1393  
    6767    print STDERR "                     file extensions.\n"; 
    6868    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 
     69    print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n"; 
    6970    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n"; 
    7071} 
     
    9091             q^block_exp/.*/^, \$self->{'block_exp'}, 
    9192             q^extract_acronyms^, \$self->{'extract_acronyms'}, 
     93             q^markup_acronyms^, \$self->{'markup_acronyms'}, 
    9294             q^extract_language^, \$self->{'extract_language'}, 
    9395             "allow_extra_options")) { 
     
    297299    } 
    298300 
     301    if ($self->{'markup_acronyms'}) { 
     302    my $thissection = $doc_obj->get_top_section(); 
     303    while (defined $thissection) { 
     304        my $text = $doc_obj->get_text($thissection); 
     305        $text = $self->markup_acronyms ($text, $doc_obj, $thissection); 
     306        $doc_obj->delete_text($thissection); 
     307        $doc_obj->add_text($thissection, $text); 
     308        $thissection = $doc_obj->get_next_section ($thissection); 
     309    } 
     310    } 
     311 
    299312    if ($self->{'extract_language'}) { 
    300313    my $thissection = $doc_obj->get_top_section(); 
     
    341354    my ($textref, $doc_obj, $thissection) = @_; 
    342355 
    343     print STDERR " checking for acronyms ...\n"  
     356    print STDERR " extracting acronyms ...\n"  
    344357    if ($self->{'verbosity'} >= 2); 
    345358 
     
    362375    if ($seen_before eq "false") 
    363376    { 
     377        #write it to the file ... 
     378        $acro->write_to_file(); 
     379 
    364380        #do the normal acronym 
    365381        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string()); 
     
    367383            if ($self->{'verbosity'} >= 1); 
    368384         
    369         # do the KWIC (Key Word In Context) acronym 
    370         my @kwic = $acro->to_string_kwic(); 
    371         foreach my $kwic (@kwic) { 
    372         $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic); 
    373         print STDERR "   adding ".  $kwic . "\n"  
    374             if ($self->{'verbosity'} >= 2); 
    375         } 
    376     } 
    377     } 
    378     print STDERR " done with acronyms. \n"  
     385#       # do the KWIC (Key Word In Context) acronym 
     386#       my @kwic = $acro->to_string_kwic(); 
     387#       foreach my $kwic (@kwic) { 
     388#       $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic); 
     389#       print STDERR "   adding ".  $kwic . "\n"  
     390#           if ($self->{'verbosity'} >= 2); 
     391#       } 
     392    } 
     393    } 
     394    print STDERR " done extracting acronyms. \n"  
    379395    if ($self->{'verbosity'} >= 2); 
    380396} 
    381397 
     398sub markup_acronyms { 
     399    my $self = shift (@_); 
     400    my ($text, $doc_obj, $thissection) = @_; 
     401 
     402    print STDERR " marking up acronyms ...\n"  
     403    if ($self->{'verbosity'} >= 2); 
     404 
     405    #self is passed in to check for verbosity ... 
     406    $text = &acronym::markup_acronyms($text, $self); 
     407 
     408    print STDERR " done marking up acronyms. \n"  
     409    if ($self->{'verbosity'} >= 2); 
     410 
     411    return $text; 
     412} 
     413 
    3824141;