Changeset 1954
- Timestamp:
- 2001-02-13T10:58:26+13:00 (23 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 1 added
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/DateExtract.pm
r1467 r1954 7 7 #75% of the instances of the word century use full name ordinals 8 8 my %ordinals = ("first" => 1, "second" => 2, "third" => 3, "fourth" => 4, 9 10 11 12 13 14 15 16 9 "fifth" => 5, "sixth" => 6, "seventh" => 7, "eighth" => 8, 10 "ninth" => 9, "tenth" => 10, "eleventh" => 11, "twelfth" => 12, 11 "thirteenth" => 13, "fourteenth" => 14, "fifteenth" => 15, 12 "sixteenth" => 16, "seventeenth" => 17, "eighteenth" => 18, 13 "nineteenth" => 19, "twentieth" => 20); 14 15 16 17 17 18 18 #definitions for a date grammar. … … 20 20 21 21 my @months = ("january","february","march","april","may","june","july", 22 22 "august","september","october","november","december"); 23 23 24 24 my $shortmth = ""; … … 77 77 if($max_century =~ /B/) 78 78 { 79 80 81 82 79 $max_century = $`; 80 $max_century =~ /\d+/; 81 $max_century = $&; 82 $max_century *=-1 83 83 } 84 84 … … 87 87 $extr = &remove_tags($extr); 88 88 if(!$keep_bib){ 89 89 $extr = &remove_biblio($extr); 90 90 } 91 91 … … 94 94 while($extr =~ m!($range)|($millenium)|($qualified)|($centurydate)|($tri_digit)!i) 95 95 { 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 96 $extr = $'; 97 my $fulldate = $&; 98 if ($fulldate =~ /$centurydate/i) 99 { 100 if($max_century!=-1) 101 { 102 103 local $date = $fulldate; if($date =~ /\d+/) {$date = $&;} 104 else 105 { 106 $date=$fulldate; $date =~ m! ($Century)!i; $date = $`; 107 $date =~ tr/A-Z/a-z/; 108 $date = $ordinals{$date}; 109 } 110 if($max_century >= $date){ 111 $date = ($date-1)*100 +1; 112 #if it BC, make it negative 113 $date = &convert_bc($fulldate,$date); 114 $end = $date + 99; 115 @century = ($date..$end); 116 @datelist = (@datelist,@century); 117 } 118 } 119 } 120 121 elsif($fulldate =~ /$range/) 122 { 123 $fulldate =~ /$sep/; 124 my @addlist = (); 125 #print "Range: $fulldate\n"; 126 $fullfirst = $`; 127 $fullsecond = $'; 128 $fullfirst =~ /\d+/; $first = $&; 129 $fullsecond =~ /\d+/; $second = $&; 130 $len1 = length($first); 131 $len2 = length($second); 132 $second = (substr($first,0,($len1-$len2))).$second; 133 $first = &convert_bc($fullfirst,$first); 134 $second = &convert_bc($fullsecond,$second); 135 @addlist = ($first..$second); 136 @datelist = (@datelist,@addlist); 137 138 } 139 else { 140 141 my $date = $fulldate; $date =~ /\d+/; $date = $&; 142 $date = &convert_bc($fulldate,$date); 143 #add the date metadata 144 push(@datelist,$date); 145 #print "datelist @datelist\n" 146 } 147 148 148 } 149 149 150 150 if(@datelist){ 151 152 153 154 155 156 157 158 159 160 151 @datelist = sort { $a <=> $b } @datelist; 152 @datelist = &post_process($max_year, @datelist); 153 foreach $date (@datelist) 154 { 155 if($date>0){ 156 $doc->add_metadata($cursection,"Coverage",$date);} 157 else{ 158 $doc->add_metadata($cursection,"Coverage","bc".(-1*$date));} 159 160 } 161 161 } 162 162 } … … 172 172 $prev = 0; 173 173 foreach $e (@list) { 174 175 176 177 174 if ($e!=$prev && $e <= $max_year) { 175 push(@cleanlist, $e); 176 } 177 $prev = $e; 178 178 } 179 179 @cleanlist; … … 191 191 while($tmp=~ m!<([^>])*(>|$)! && $tmp ne "") 192 192 { 193 194 193 $parsed .= $`;#keep all that is not in a tag 194 $tmp = $'; #restart the search after then end of the tag 195 195 } 196 196 $parsed .= $tmp; #add anything after the last match … … 206 206 if(($tmp =~ m!($spurious)|($lookalikes)!i) == 0 ) 207 207 { 208 208 $parsed = $tmp; 209 209 } 210 210 else { 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 211 while ($tmp =~ m!($spurious)|($lookalikes)!i 212 && $tmp ne "") 213 { 214 $parsed .= $`; 215 $storage = $&; 216 $tmp = $'; 217 #match the pattern which indicates most recent alteration 218 if ($storage =~ m!$lastaltered!i) 219 { 220 #match a four digit year or up until the first / 221 #(as in last edited 3/97). 222 $tmp =~ m!($millenium)|(\/)!; 223 $tmp = $'; 224 } 225 226 } 227 228 $parsed .= $tmp; 229 230 230 } 231 231 #print "Parsed:\n $parsed\n\n"; … … 240 240 if($tmp =~ m!$bibheader!i) 241 241 { 242 242 $tmp=$`; 243 243 } 244 244 … … 246 246 if(($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|($seasonref) ($millenium)!i) == 0) 247 247 { 248 248 $parsed = $tmp; 249 249 } 250 250 else{ 251 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 252 #print "removing bib\n"; 253 while ($tmp =~ m!($ref_end)|($bracket)|($colonsp)|($reprint)|($comma)|($fullstop)|($semi)|(($seasonref) ($millenium))|($bibheader)!i && $tmp ne "") 254 { 255 256 $parsed .= $`; 257 $tmp = $'; 258 if($&=~m!($comma)|($fullstop)!) 259 { 260 261 local $date = $&; 262 if($parsed =~ m!((\d)($Ord)$)|(($shortmth)$)|(($longmth)$)!i) 263 { 264 $parsed .= $date; 265 } 266 } 267 268 } 269 $parsed .= $tmp; 270 270 } 271 271 $parsed; … … 292 292 293 293 294 295 296 297 298 299 300 301 -
trunk/gsdl/perllib/parsargv.pm
r1240 r1954 73 73 74 74 my @rest = @_; 75 75 76 76 77 # if the last argument is the string "allow_extra_options" then options … … 92 93 while (($spec, $var) = splice(@rest, 0, 2)) 93 94 { 95 94 96 die "Variable for $spec is not a valid type." 95 97 unless ref($var) eq 'SCALAR' || ref($var) eq 'ARRAY'; … … 107 109 my ($name, $regex, $default) = split(/$delimiter/, $spec, 3); 108 110 111 109 112 if ($name) 110 { 113 { 111 114 if ($default && $default !~ /$regex/) 112 115 { … … 225 228 226 229 1; 230 231 232 233 234 235 236 237 -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1903 r1954 25 25 26 26 package BasPlug; 27 27 use Kea; 28 28 use parsargv; 29 29 use multiread; … … 130 130 $self->{'outhandle'} = STDERR; 131 131 my $year = (localtime)[5]+1900; 132 132 133 133 134 # general options available to all plugins 134 135 if (!parsargv::parse(\@_, 135 136 q^process_exp/.*/^, \$self->{'process_exp'}, 136 137 q^block_exp/.*/^, \$self->{'block_exp'}, 138 q^extract_acronyms^, \$self->{'extract_acronyms'}, 139 q^extract_keyphrases^, \$self->{'kea'}, #with extra options 140 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options 137 141 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 138 142 qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'}, 139 q^extract_acronyms^, \$self->{'extract_acronyms'},140 143 q^extract_email^, \$self->{'extract_email'}, 141 144 q^markup_acronyms^, \$self->{'markup_acronyms'}, 142 q^extract_language^, \$self->{'extract_language'},143 145 q^default_language/.{2}/en^, \$self->{'default_language'}, 144 146 q^first/.*/^, \$self->{'first'}, … … 233 235 234 236 sub read { 235 my $self = shift (@_); 237 my $self = shift (@_); 238 236 239 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 237 240 … … 283 286 return 0; 284 287 } 285 288 286 289 # include any metadata passed in from previous plugins 287 290 # note that this metadata is associated with the top level section … … 290 293 # do plugin specific processing of doc_obj 291 294 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)); 292 295 293 296 # do any automatic metadata extraction 294 297 $self->auto_extract_metadata ($doc_obj); 295 298 296 299 # add an OID 297 300 $doc_obj->set_OID(); … … 501 504 502 505 # extract metadata 503 sub auto_extract_metadata { 506 sub auto_extract_metadata { 507 508 504 509 my $self = shift (@_); 505 510 my ($doc_obj) = @_; … … 512 517 $thissection = $doc_obj->get_next_section ($thissection); 513 518 } 514 } 519 } 520 521 522 #adding kea keyphrases 523 if ($self->{'kea'}) { 524 525 my $thissection = $doc_obj->get_top_section(); 526 my $text = ""; 527 my @list; 528 529 while (defined $thissection) { #loop through sections to gather whole doc 530 my $sectiontext = $doc_obj->get_text($thissection); 531 $text = $text.$sectiontext; 532 $thissection = $doc_obj->get_next_section ($thissection); 533 } 534 535 if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options 536 @list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'}); 537 } else { #otherwise call Kea with no options 538 @list = &Kea::extract_KeyPhrases ($text); 539 } 540 541 if(@list){ #if a list of kea keyphrases was returned (ie not empty) 542 my $keyphrases = $list[0]; #first arg is keyphrase list 543 my $stems = $list[1]; #second arg is stemmed keyphrase list 544 print STDERR "keyphrases: $keyphrases\n"; 545 print STDERR "stems: $stems\n"; 546 $thissection = $doc_obj->get_top_section(); #add metadata to top section 547 $doc_obj->add_metadata($thissection, "kea", $keyphrases); 548 $doc_obj->add_metadata($thissection, "stems", $stems); 549 } 550 } #end of kea 551 515 552 if ($self->{'first'}) { 516 553 my $thissection = $doc_obj->get_top_section(); -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r1929 r1954 73 73 74 74 my $generate_format; 75 if (!parsargv::parse($args, 75 my $kea_arg; 76 77 if (!parsargv::parse($args, 78 q^extract_keyphrases^, \$kea_arg->{'kea'}, #with extra options 79 q^extract_keyphrase_options/.*/^, \$kea_arg->{'kea_options'}, #no extra options 76 80 q^convert_to/(html|text)/html^, \$generate_format, 77 81 "allow_extra_options")) { … … 82 86 die "\n"; 83 87 } 84 85 return ($plugin_name,$generate_format );88 89 return ($plugin_name,$generate_format, $kea_arg); 86 90 } 87 91 88 92 sub new { 89 93 my $class = shift (@_); 90 my ($plugin_name,$generate_format ) = $class->parse_args(\@_);94 my ($plugin_name,$generate_format, $kea_arg) = $class->parse_args(\@_); 91 95 my $self; 92 96 … … 107 111 } 108 112 113 #if kea data to be extracted... 114 $self->{'kea'} = 1 if($kea_arg->{'kea'}); 115 $self->{'kea_options'} = 1 if($kea_arg->{'kea_options'}); 116 109 117 return bless $self, $class; 110 118 } … … 212 220 213 221 my $ret_val = BasPlug::read($self,@_); 214 222 215 223 $self->cleanup_tmp_area(); 216 224 -
trunk/gsdl/perllib/plugins/GMLPlug.pm
r1424 r1954 41 41 my $self = new BasPlug ("GMLPlug", @_); 42 42 43 return bless $self, $class; 44 } 43 return bless $self, $class;} 45 44 46 45 sub get_default_process_exp { -
trunk/gsdl/perllib/plugins/PDFPlug.pm
r1415 r1954 37 37 sub new { 38 38 my $class = shift (@_); 39 39 40 40 my $self = new ConvertToPlug ($class, @_, "--", "-title_sub", 'Page\s+\d+'); 41 41 … … 70 70 1; 71 71 72 73 74
Note:
See TracChangeset
for help on using the changeset viewer.