Changeset 10280
- Timestamp:
- 2005-07-25T14:19:14+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r10254 r10280 213 213 214 214 215 216 217 215 sub get_arguments 218 216 { … … 386 384 die "\n"; 387 385 } 386 388 387 389 388 # else parsing was successful. … … 431 430 $self->{'places'} = $places_ref; 432 431 } 433 } 432 } 434 433 return bless $self, $class; 435 434 … … 673 672 674 673 675 # The BasPlug read() function. This function does all the right things 676 # to make general options work for a given plugin. It calls the process() 677 # function which does all the work specific to a plugin (like the old 678 # read functions used to do). Most plugins should define their own 679 # process() function and let this read() function keep control. 680 # 681 # recursive plugins (e.g. RecPlug) and specialized plugins like those 682 # capable of processing many documents within a single file (e.g. 683 # GMLPlug) should normally implement their own version of read() 684 # 685 # Return number of files processed, undef if can't recognise, -1 if can't 686 # process 687 # Note that $base_dir might be "" and that $file might 688 # include directories 689 690 sub read { 674 sub read_block { 691 675 my $self = shift (@_); 692 676 693 677 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 694 678 695 if ($self->is_recursive()) {696 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";697 }698 699 my $outhandle = $self->{'outhandle'};700 my $smart_block = $self->{'smart_block'};701 my $smart_block_BN = $self->{'smart_block_BN'};702 679 703 680 my $filename = $file; … … 707 684 # a form of smart block 708 685 $self->{'num_blocked'} ++; 709 return 0; # blocked 710 } 686 return (0,undef); # blocked 687 } 688 689 my $smart_block = $self->{'smart_block'}; 690 my $smart_block_BN = $self->{'smart_block_BN'}; 711 691 712 692 if ($smart_block || $smart_block_BN) { … … 714 694 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){ 715 695 $self->{'num_blocked'} ++; 716 return 0; # blocked696 return (0,undef); # blocked 717 697 } 718 698 } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) { 719 699 $self->{'num_blocked'} ++; 720 return 0; # blocked700 return (0,undef); # blocked 721 701 } 722 702 723 703 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 724 return undef; # can't recognise 725 } 704 return (undef,undef); # can't recognise 705 } 706 707 return (1,$filename); 708 } 709 710 sub read_tidy_file { 711 712 my $self = shift (@_); 713 714 my ($file) = @_; 715 726 716 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 717 718 return $file; 719 } 720 721 722 723 # The BasPlug read_into_doc_obj() function. This function does all the 724 # right things to make general options work for a given plugin. It reads in 725 # a file and sets up a slew of metadata all saved in doc_obj, which 726 # it then returns as part of a tuple (process_status,doc_obj) 727 # 728 # Much of this functionality used to reside in read, but it was broken 729 # down into a supporting routine to make the code more flexible. 730 # 731 # recursive plugins (e.g. RecPlug) and specialized plugins like those 732 # capable of processing many documents within a single file (e.g. 733 # GMLPlug) will normally want to implement their own version of 734 # read_into_doc_obj() 735 # 736 # Note that $base_dir might be "" and that $file might 737 # include directories 738 sub read_into_doc_obj { 739 my $self = shift (@_); 740 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 741 742 if ($self->is_recursive()) { 743 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n"; 744 } 745 746 my $outhandle = $self->{'outhandle'}; 747 748 my ($block_status,$filename) = $self->read_block(@_); 749 return $block_status if ((!defined $block_status) || ($block_status==0)); 750 $file = $self->read_tidy_file($file); 727 751 728 752 # Do encoding stuff … … 760 784 $self->{'num_not_processed'} ++; 761 785 762 return 0; # what should we return here?? error but don't want to pass it on786 return (0,undef); # what should we return here?? error but don't want to pass it on 763 787 } 764 788 … … 773 797 undef $text; 774 798 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 775 return -1;799 return (-1,undef); 776 800 } 777 801 $text=''; … … 790 814 $doc_obj->set_OID(); 791 815 } 792 793 # process the document 794 $processor->process($doc_obj); 795 796 if(defined($self->{'places_filename'})){ 797 &util::rm($self->{'places_filename'}); 798 $self->{'places_filename'} = undef; 799 } 800 801 $self->{'num_processed'} ++; 802 undef $doc_obj; 803 return 1; # processed the file 816 817 return (1,$doc_obj); 818 } 819 820 821 # The BasPlug read() function. This function calls read_into_doc_obj() 822 # to ensure all the right things to make general options work for a 823 # given plugin are done. It then calls the process() function which 824 # does all the work specific to a plugin (like the old read functions 825 # used to do). Most plugins should define their own process() function 826 # and let this read() function keep control. 827 # 828 # recursive plugins (e.g. RecPlug) and specialized plugins like those 829 # capable of processing many documents within a single file (e.g. 830 # GMLPlug) might want to implement their own version of read(), but 831 # more likely need to implement their own version of read_into_doc_obj() 832 # 833 # Return number of files processed, undef if can't recognise, -1 if can't 834 # process 835 836 sub read { 837 my $self = shift (@_); 838 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 839 840 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_); 841 842 if ((defined $process_status) && ($process_status == 1)) { 843 # process the document 844 $processor->process($doc_obj); 845 846 if(defined($self->{'places_filename'})){ 847 &util::rm($self->{'places_filename'}); 848 $self->{'places_filename'} = undef; 849 } 850 851 $self->{'num_processed'} ++; 852 undef $doc_obj; 853 } 854 855 # if process_status == 1, then the file has been processed. 856 return $process_status; 857 804 858 } 805 859 … … 830 884 return; 831 885 } 832 833 886 $$textref = ""; 834 835 887 if (!open (FILE, $filename)) { 836 888 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename); 837 838 839 889 die "\n"; 890 } 891 840 892 if ($encoding eq "ascii") { 841 893 undef $/; … … 847 899 $reader->set_encoding ($encoding); 848 900 $reader->read_file ($textref); 849 850 #Now segments chinese if the separate_cjk option is set 901 #Now segments chinese if the separate_cjk option is set 851 902 if ($self->{'separate_cjk'}) { 852 903 # segment the Chinese words … … 854 905 } 855 906 } 856 857 907 close FILE; 858 908 } 909 910 # write_file -- used by ConvertToPlug, for example in post processing 911 # 912 sub utf8_write_file { 913 my $self = shift (@_); 914 my ($textref, $filename) = @_; 915 916 if (!open (FILE, ">$filename")) { 917 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename); 918 die "\n"; 919 } 920 print FILE $$textref; 921 922 close FILE; 923 } 924 859 925 860 926 sub filename_based_title … … 887 953 my ($filename) = @_; 888 954 955 889 956 my ($language, $encoding, $extracted_encoding); 890 957 if ($self->{'input_encoding'} eq "auto") { … … 910 977 $encoding = $self->{'input_encoding'}; 911 978 } 979 912 980 return ($language, $encoding); 913 981 } … … 942 1010 } 943 1011 } 944 1012 945 1013 946 1014 # remove <title>stuff</title> -- as titles tend often to be in English
Note:
See TracChangeset
for help on using the changeset viewer.