Changeset 15018 for gsdl/trunk
- Timestamp:
- 2008-02-27T17:07:17+13:00 (16 years ago)
- Location:
- gsdl/trunk
- Files:
-
- 1 added
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/BasPlug.pm
r15006 r15018 109 109 'type' => "string", 110 110 'reqd' => "no" }, 111 { 'name' => "use_as_doc_identifier", 112 'desc' => "{BasPlug.use_as_doc_identifier}", 113 'type' => "string", 114 'reqd' => "no" , 115 'deft' => "" } , 111 116 { 'name' => "input_encoding", 112 117 'desc' => "{BasPlug.input_encoding}", … … 867 872 } 868 873 874 875 sub add_OID 876 { 877 my $self = shift (@_); 878 my ($doc_obj) = @_; 879 880 # See if a metadata field is specified as the field 881 if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) { 882 my $metadata_doc_id = $self->{'use_as_doc_identifier'}; 883 884 # Consider "tidying" up metadata_doc_id to be something 885 # suitable in a URL 886 # Could even support a user specified plugin RE for this. 887 888 my $top_section = $doc_obj->get_top_section(); 889 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id); 890 ## print STDERR "**** oid = $oid\n"; 891 $doc_obj->set_OID($oid); 892 } 893 # See if there is a plugin-specific set_OID function... 894 elsif (defined ($self->can('set_OID'))) { 895 # it will need $doc_obj to set the Identifier metadata... 896 $self->set_OID(@_); # pass through any extra arguments supplied 897 } else { 898 # use the default set_OID() in doc.pm 899 $doc_obj->set_OID(); 900 } 901 } 869 902 870 903 # The BasPlug read_into_doc_obj() function. This function does all the … … 957 990 # do any automatic metadata extraction 958 991 $self->auto_extract_metadata ($doc_obj); 959 960 # add an OID 961 # see if there is a plugin-specific set_OID function... 962 if (defined ($self->can('set_OID'))) { 963 # it will need $doc_obj to set the Identifier metadata... 964 $self->set_OID($doc_obj); 965 } else { 966 # use the default set_OID() in doc.pm 967 $doc_obj->set_OID(); 968 } 992 993 $self->add_OID($doc_obj); 969 994 970 995 return (1,$doc_obj); -
gsdl/trunk/perllib/plugins/MARCPlug.pm
r14964 r15018 45 45 'type' => "string", 46 46 'deft' => "marctodc.txt", 47 'hiddengli' = "yes", # deprecated in favour or 'metadata_mapping_file' 48 'reqd' => "no" }, 49 { 'name' => "metadata_mapping_file", 50 'desc' => "{MARCXMLPlug.metadata_mapping_file}", 51 'type' => "string", 52 'deft' => "", 47 53 'reqd' => "no" }, 48 54 { 'name' => "process_exp", … … 80 86 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists); 81 87 88 # 'metadata_mapping' was used in two ways in the plugin: as a plugin 89 # option (filename) and as a datastructure to represent the mapping. 90 # In MARXXMLPlug (written later) the two are separated: filename is 91 # represented through 'metadata_mapping_file' and the data-structure 92 # mapping left as 'metadata_mapping' 93 # 'metadata_mapping' still present (but hidden in GLI) for 94 # backwards compatibility, but 'metadata_mapping_file' is used by 95 # preference 96 97 if ($self->{'metadata_mapping_file'} eq "") { 98 # If nothing set in the new version, use the old version 99 # that defaults to 'marctodc.txt' 100 $self->{'metadata_mapping_file'} = $self->{'metadata_mapping'}; 101 } 102 82 103 $self->{'type'} = ""; 83 104 … … 92 113 93 114 # read in the metadata mapping file 94 my $mm_file = 95 &util::filename_cat( $ENV{'GSDLHOME'}, "etc", $self->{'metadata_mapping'} ); 96 97 if (! -e$mm_file)115 116 my $mm_file = &util::locate_config_file($self->{'metadata_mapping_file'}); 117 118 if (!defined $mm_file) 98 119 { 99 120 100 121 my $msg = "MARCPlug ERROR: Can't locate mapping file \"" . 101 $self->{'metadata_mapping '} . "\".\n This file should be at $mm_file\n" .122 $self->{'metadata_mapping_file'} . "\".\n" . 102 123 " No marc files can be processed.\n"; 103 124 … … 131 152 } 132 153 close(MMIN); 154 155 $self->{'metadata_mapping'} = \@metadata_mapping; 133 156 } 134 157 else … … 137 160 } 138 161 139 $self->{'metadata_mapping'} = \@metadata_mapping; 162 140 163 141 164 $self->SUPER::init(@_); -
gsdl/trunk/perllib/plugins/MARCXMLPlug.pm
r13496 r15018 42 42 'desc' => "{MARCXMLPlug.metadata_mapping_file}", 43 43 'type' => "string", 44 'deft' => "marctodc.txt", 44 45 'reqd' => "no" }]; 45 46 … … 62 63 63 64 $self->{'content'} = ""; 65 $self->{'xmlcontent'} = ""; 64 66 $self->{'record_count'} = 1; 65 67 $self->{'language'} = ""; … … 83 85 84 86 87 sub _parse_marc_metadata_mapping 88 { 89 my $self = shift(@_); 90 my ($mm_file,$metadata_mapping) = @_; 91 92 my $outhandle = $self->{'outhandle'}; 93 94 if (open(MMIN, "<$mm_file")) 95 { 96 my $l=0; 97 my $line; 98 while (defined($line=<MMIN>)) 99 { 100 $l++; 101 chomp $line; 102 $line =~ s/#.*$//; # strip out any comments, including end of line 103 next if ($line =~ m/^\s*$/); 104 $line =~ s/\s+$//; # remove any white space at end of line 105 106 my $parse_error_count = 0; 107 if ($line =~ m/^-(\d+)\s*$/) { 108 # special "remove" rule syntax 109 my $marc_info = $1; 110 if (defined $metadata_mapping->{$marc_info}) { 111 delete $metadata_mapping->{$marc_info}; 112 } 113 else { 114 print $outhandle "Parse Warning: Did not file pre-existing rule $marc_info to remove"; 115 print $outhandle " on line $l of $mm_file:\n"; 116 print $outhandle " $line\n"; 117 } 118 } 119 elsif ($line =~ m/^(.*?)->\s*([\w\^]+)$/) 120 { 121 my $lhs = $1; 122 my $gsdl_info = $2; 123 124 my @fields = split(/,\s*/,$lhs); 125 my $f; 126 while ($f = shift (@fields)) { 127 $f =~ s/\s+$//; # remove any white space at end of line 128 129 if ($f =~ m/^(\d+)\-(\d+)$/) { 130 # number range => genrate number in range and 131 # push on to array 132 push(@fields,$1..$2); 133 next; 134 } 135 136 if ($f =~ m/^(\d+)((?:(?:\$|\^)\w)*)\s*$/) { 137 138 my $marc_info = $1; 139 my $opt_sub_fields = $2; 140 141 if ($opt_sub_fields ne "") { 142 my @sub_fields = split(/\$|\^/,$opt_sub_fields); 143 shift @sub_fields; # skip first entry, which is blank 144 145 foreach my $sub_field (@sub_fields) { 146 $metadata_mapping->{$marc_info."\$".$sub_field} = $gsdl_info; 147 } 148 } 149 else { 150 # no subfields to worry about 151 $marc_info =~ s/\^/\$/; 152 $metadata_mapping->{$marc_info} = $gsdl_info; 153 } 154 } 155 else { 156 $parse_error_count++; 157 } 158 } 159 } 160 else 161 { 162 $parse_error_count++; 163 } 164 165 if ($parse_error_count>0) { 166 167 print $outhandle "Parse Error: $parse_error_count syntax error(s) on line $l of $mm_file:\n"; 168 print $outhandle " $line\n"; 169 } 170 } 171 close(MMIN); 172 } 173 else 174 { 175 print STDERR "Unable to open $mm_file: $!\n"; 176 } 177 } 178 179 180 sub parse_marc_metadata_mapping 181 { 182 my $self = shift(@_); 183 my ($mm_file_or_files) = @_; 184 185 my $metadata_mapping = {}; 186 187 if (ref ($mm_file_or_files) eq 'SCALAR') { 188 my $mm_file = $mm_file_or_files; 189 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping); 190 } 191 else { 192 my $mm_files = $mm_file_or_files; 193 194 # Need to process files in reverse order. This is so in the 195 # case where we have both a "collect" and "main" version, 196 # the "collect" one tops up the main one 197 198 my $mm_file; 199 while ($mm_file = pop(@$mm_files)) { 200 $self->_parse_marc_metadata_mapping($mm_file,$metadata_mapping); 201 } 202 } 203 204 return $metadata_mapping; 205 } 206 207 85 208 sub init { 86 209 my $self = shift (@_); … … 93 216 } 94 217 95 my $metadata_mapping = {};96 97 218 # read in the metadata mapping file 98 my $mm_file = $self->{'metadata_mapping_file'}; 99 100 if (! defined $mm_file or $mm_file eq ""){ 101 $mm_file = &util::filename_cat( $ENV{'GSDLHOME'}, "etc","marctodc.txt" ); 102 $self->{'metadata_mapping_file'} = $mm_file; 103 } 104 105 if (!-e $mm_file) 219 my $mm_files = &util::locate_config_files($self->{'metadata_mapping_file'}); 220 221 222 if (scalar(@$mm_files)==0) 106 223 { 107 224 my $msg = "MARCXMLPlug ERROR: Can't locate mapping file \"" . 108 $self->{'metadata_mapping '} . "\".\n This file should be at $mm_file\n" .225 $self->{'metadata_mapping_file'} . "\".\n " . 109 226 " No marc files can be processed.\n"; 110 227 … … 115 232 # If we exit here, then pluginfo.pl will exit too! 116 233 } 117 elsif (open(MMIN, "<$mm_file")) 118 { 119 my $l=1; 120 my $line; 121 while (defined($line=<MMIN>)) 122 { 123 chomp $line; 124 if ($line =~ m/^(\d+\w?)\s*->\s*([\w\^]+)$/) 125 { 126 my $marc_info = $1; 127 my $gsdl_info = $2; 128 $metadata_mapping->{$marc_info} = $gsdl_info; 129 } 130 elsif ($line !~ m/^\#/ # allow comments (# in first column) 131 && $line !~ m/^\s*$/) # allow blank lines 132 { 133 print $outhandle "Parse error on line $l of $mm_file:\n"; 134 print $outhandle " \"$line\"\n"; 135 } 136 $l++ 137 } 138 close(MMIN); 139 } 140 else 141 { 142 print STDERR "Unable to open $mm_file: $!\n"; 143 } 144 145 $self->{'metadata_mapping'} = $metadata_mapping; 234 else { 235 $self->{'metadata_mapping'} = $self->parse_marc_metadata_mapping($mm_files); 236 } 237 146 238 147 239 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping; … … 191 283 my $element = shift; 192 284 193 my $text = $self->escape_text($_); 285 my $text = $_; 286 my $escaped_text = $self->escape_text($_); 194 287 195 288 $self->{'current_element'} = $element; … … 203 296 } 204 297 205 298 206 299 my $processor = $self->{'processor'}; 207 300 … … 225 318 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "XML"); 226 319 227 $doc_obj->set_OID();228 $self->set_OID($doc_obj, $doc_obj->get_OID() , $self->{'record_count'});229 230 320 my $outhandle = $self->{'outhandle'}; 231 321 print $outhandle "Record $self->{'record_count'} - MARCXMLPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; … … 254 344 if ($element eq "record"){ 255 345 $self->{'indent'} = 0; 346 $self->{'content'} = ""; 347 $self->{'xmlcontent'} = ""; 256 348 } 257 349 else { … … 264 356 } 265 357 266 267 if ($element ne "collection"){ 268 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$text; 358 359 if ($element eq "collection") { 360 # remember the full start tag for <collection ...> 361 # This is needed to wrap around each <record> when generating its associate MARCXML file 362 363 $self->{'xmlcollectiontag'} = $text; 364 } 365 else { 366 $self->{'content'} .= "<br/>" if ($element ne "record"); 367 $self->{'content'} .= $self->calculate_indent($self->{'indent'}).$escaped_text; 368 $self->{'xmlcontent'} .= $text; 269 369 } 370 270 371 } 271 372 … … 275 376 my $self = shift(@_); 276 377 my ($expat, $element) = @_; 277 my $text = $self->escape_text($_); 378 379 my $text = $_; 380 my $escaped_text = $self->escape_text($_); 278 381 279 382 if ($element eq "record" and defined $self->{'doc_obj'}) { … … 281 384 my $processor = $self->{'processor'}; 282 385 my $doc_obj = $self->{'doc_obj'}; 283 $self->{'content'} .= "<br/>".$text; 386 $self->{'content'} .= "<br/>".$escaped_text; 387 $self->{'xmlcontent'} .= $text; 284 388 389 390 my $top_section = $doc_obj->get_top_section(); 391 392 my $tmp_marcxml_filename = &util::get_tmp_filename().".xml"; 393 if (open (XMLOUT,">$tmp_marcxml_filename")) { 394 395 print XMLOUT "<?xml-stylesheet type=\"text/xsl\" href=\"MARC21slim2English.xsl\"?>\n"; 396 my $xml_content = $self->{'xmlcontent'}; 397 398 $xml_content = $self->{'xmlcollectiontag'}.$xml_content."</collection>"; 399 400 print XMLOUT $xml_content; 401 402 close(XMLOUT); 403 404 $doc_obj->associate_file($tmp_marcxml_filename,"marcxml.xml","text/xml", $top_section); 405 406 # assicate xsl style file for presentation as HTML 407 my $xsl_filename = &util::filename_cat($ENV{'GSDLHOME'},"etc","MARC21slim2English.xsl"); 408 $doc_obj->associate_file($xsl_filename,"MARC21slim2English.xsl","text/xml", $top_section); 409 410 } 411 else { 412 my $outhandle = $self->{'outhandle'}; 413 print $outhandle "Warning: Unable for write out associated MARCXML file $tmp_marcxml_filename\n"; 414 } 415 416 $self->add_OID($doc_obj, $self->{'record_count'}); 417 285 418 $doc_obj->add_utf8_text($doc_obj->get_top_section(),$self->{'content'}); 286 419 $processor->process($doc_obj); … … 288 421 ##clean up 289 422 $self->{'content'} = ""; 423 $self->{'xmlcontent'} = ""; 290 424 $self->{'doc_obj'} = undef; 291 425 return; … … 298 432 my $doc_obj = $self->{'doc_obj'}; 299 433 300 ##map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping; 301 ##map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping; 434 ## print STDERR "**** Marc Record\n"; 435 ## map { print STDERR $_."=>".$marc_mapping->{$_}."\n"; } keys %$marc_mapping; 436 ## print STDERR "**** Metadata Mapping\n"; 437 ## map { print STDERR $_."=>".$metadata_mapping->{$_}."\n"; } keys %$metadata_mapping; 438 302 439 303 440 foreach my $marc_field (keys %$metadata_mapping){ 304 my ($meta_name,$meta_value); 441 442 ## test whether this field has subfield 443 my $subfield = undef; 444 if ($marc_field =~ /(\d\d\d)(?:\$|\^)?(\w)/){ 445 $marc_field = $1; 446 $subfield = $2; 447 } 448 305 449 my $matched_field = $marc_mapping->{$marc_field}; 306 my $subfield = undef; 307 if (defined $matched_field){ 308 ## test whether this field has subfield 309 if ($marc_field =~ /\d\d\d(\w)/){ 310 $subfield = $1; 450 451 if (defined $matched_field) { 452 453 my $meta_name = undef; 454 my $meta_value = undef; 455 456 if (defined $subfield){ 457 $meta_name = $metadata_mapping->{$marc_field."\$".$subfield}; 458 459 $meta_value = $matched_field->{$subfield}; 460 461 if (!defined $meta_value) { 462 # record read in does not have the specified subfield 463 next; 464 } 311 465 } 312 $meta_name = $metadata_mapping->{$marc_field}; 313 314 if (defined $subfield){ 315 my %mapped_subfield = {@$matched_field}; 316 $meta_value = $mapped_subfield{$subfield}; 317 } 318 else{ ## get all values 319 my $i =0; 320 foreach my $value (@$matched_field){ 321 if ($i%2 != 0){ 322 $meta_value .= $value." "; 323 } 324 $i++; 466 else { 467 $meta_name = $metadata_mapping->{$marc_field}; 468 469 # no subfield => get all the values 470 foreach my $value (sort keys %{$matched_field}) { 471 $meta_value .= $matched_field->{$value} ." "; 325 472 } 473 326 474 } 327 475 … … 343 491 if ($element eq "datafield"){ 344 492 $self->{'indent'} = 1; 345 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$text; 493 $self->{'content'} .= "<br/>".$self->calculate_indent($self->{'indent'}).$escaped_text; 494 $self->{'xmlcontent'} .= $text; 346 495 } 347 496 else{ 348 $self->{'content'} .= $text; 497 $self->{'content'} .= $escaped_text; 498 $self->{'xmlcontent'} .= $text; 349 499 } 350 500 … … 354 504 sub set_OID { 355 505 my $self = shift (@_); 356 my ($doc_obj, $id, $record_number) = @_; 357 506 my ($doc_obj, $record_number) = @_; 507 508 # first set it to generate hash value 509 $doc_obj->set_OID(); 510 511 # then top it up with an "r" + record-number suffix 512 my $id = $doc_obj->get_OID(); 358 513 $doc_obj->set_OID($id . "r" . $record_number); 359 514 } … … 363 518 my ($expat) = @_; 364 519 520 my $text = $_; 521 my $escaped_text = $self->escape_text($_); 522 523 # protect against & in raw text file 524 $text =~ s/&/&/g; # can't have & in raw form, even in 'raw' xml text 365 525 366 526 ## store the text of a marc code, for exapmle 520a=>A poem about.... 367 527 if ($self->{'current_element'} eq "subfield" and $self->{'current_code'} ne "" and $_ ne "" ){ 368 528 ##stored it in the marc_mapping 369 push(@{$self->{'marc_mapping'}->{$self->{'current_tag'}}},$self->{'current_code'}); 370 push(@{$self->{'marc_mapping'}->{$self->{'current_tag'}}},$_); 529 530 my $current_tag = $self->{'current_tag'}; 531 my $current_code = $self->{'current_code'}; 532 533 $self->{'marc_mapping'}->{$current_tag}->{$current_code} .= $_; 534 371 535 $self->{'current_code'} = ""; 372 536 } 373 537 374 $self->{'content'} .=$self->escape_text($_); 538 $self->{'content'} .= $escaped_text; 539 $self->{'xmlcontent'} .= $text; 375 540 376 541 } … … 401 566 402 567 568 sub unescape_text { 569 my ($self,$text) = @_; 570 # special characters in the xml encoding 571 $text =~ s/</</g; 572 $text =~ s/>/>/g; 573 $text =~ s/"/\"/g; 574 575 $text =~ s/&/&/g; # can't have & in raw form, even in unescaped xml! 576 577 return $text; 578 } 579 580 403 581 1; 404 582 -
gsdl/trunk/perllib/plugins/PagedImgPlug.pm
r14661 r15018 71 71 #<PagedDocument> 72 72 #<Metadata name="Title">The Title of the entire document</Metadata> 73 #<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy. jpg">73 #<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt"> 74 74 #<Metadata name="Title">The Title of this page</Metadata> 75 75 #</Page> … … 257 257 my $self = shift (@_); 258 258 259 return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt| ~)$^259 return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|\.html?|~)$^ 260 260 } 261 261 … … 937 937 $text =~ s/\\/\\\\/g; # macro language 938 938 $text =~ s/_/\\_/g; # macro language 939 $text =~ s/</</g; 940 $text =~ s/>/>/g; 941 942 # insert preformat tags and add text to document object 943 $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>"); 939 940 941 if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/s) { 942 # looks like HTML input 943 # no need to escape < and > or put in <pre> tags 944 945 $text = $1; 946 947 # insert preformat tags and add text to document object 948 $doc_obj->add_utf8_text($cursection, "$text"); 949 } 950 else { 951 $text =~ s/</</g; 952 $text =~ s/>/>/g; 953 954 # insert preformat tags and add text to document object 955 $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>"); 956 } 957 944 958 945 959 return 1; -
gsdl/trunk/perllib/util.pm
r15003 r15018 797 797 798 798 799 sub locate_config_file 800 { 801 my ($file) = @_; 802 803 my $locations = locate_config_files($file); 804 805 return shift @$locations; # returns undef if 'locations' is empty 806 } 807 808 809 sub locate_config_files 810 { 811 my ($file) = @_; 812 813 my @locations = (); 814 815 if (-e $file) { 816 # Clearly specified (most likely full filename) 817 # No need to hunt in 'etc' directories, return value unchanged 818 push(@locations,$file); 819 } 820 else { 821 # Check for collection specific one before looking in global GSDL 'etc' 822 823 my $test_collect_etc_filename 824 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file); 825 826 if (-e $test_collect_etc_filename) { 827 push(@locations,$test_collect_etc_filename); 828 } 829 830 my $test_main_etc_filename 831 = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file); 832 if (-e $test_main_etc_filename) { 833 push(@locations,$test_main_etc_filename); 834 } 835 } 836 837 return \@locations; 838 } 839 840 799 841 sub hyperlink_text 800 842 {
Note:
See TracChangeset
for help on using the changeset viewer.