Changeset 10468
- Timestamp:
- 2005-08-10T16:19:17+12:00 (19 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuilder.pm
r10158 r10468 44 44 $self = bless $self, $class; 45 45 46 # load up the document processor for building47 # if a buildproc class has been created for this collection, use it48 # otherwise, use the lucene buildproc49 my ($buildprocdir, $buildproctype);50 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {51 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";52 $buildproctype = "${collection}buildproc";53 } else {54 $buildprocdir = "$ENV{'GSDLHOME'}/perllib";55 $buildproctype = "lucenebuildproc";56 }57 require "$buildprocdir/$buildproctype.pm";58 59 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .60 "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)");61 die "$@" if $@;62 63 46 $self->{'buildtype'} = "lucene"; 64 47 65 48 return $self; 49 } 50 51 sub default_buildproc { 52 my $self = shift (@_); 53 54 return "lucenebuildproc"; 66 55 } 67 56 … … 188 177 } 189 178 190 191 192 193 194 179 sub build_index { 195 180 my $self = shift (@_); -
trunk/gsdl/perllib/mgbuilder.pm
r10158 r10468 26 26 package mgbuilder; 27 27 28 use basebuilder; 28 29 use classify; 29 30 use cfgread; … … 34 35 35 36 BEGIN { 36 # set autoflush on for STDERR and STDOUT so that mg 37 # doesn't get out of sync with plugins 38 STDOUT->autoflush(1); 39 STDERR->autoflush(1); 40 } 41 42 END { 43 STDOUT->autoflush(0); 44 STDERR->autoflush(0); 45 } 46 47 my $maxdocsize = 12000; 37 @mgbuilder::ISA = ('basebuilder'); 38 } 39 48 40 49 41 my %wanted_index_files = ('td'=>1, … … 58 50 'wa'=>1); 59 51 52 my $maxdocsize = $basebuilder::maxdocsize; 53 60 54 61 55 sub new { 62 my ($class, $collection, $source_dir, $build_dir, $verbosity, 56 my $class = shift(@_); 57 58 my ($collection, $source_dir, $build_dir, $verbosity, 63 59 $maxdocs, $debug, $keepold, $remove_empty_classifications, 64 60 $outhandle, $no_text, $failhandle, $gli) = @_; 65 61 66 $outhandle = STDERR unless defined $outhandle; 67 $no_text = 0 unless defined $no_text; 68 $failhandle = STDERR unless defined $failhandle; 69 70 # create an mgbuilder object 71 my $self = bless {'collection'=>$collection, 72 'source_dir'=>$source_dir, 73 'build_dir'=>$build_dir, 74 'verbosity'=>$verbosity, 75 'maxdocs'=>$maxdocs, 76 'debug'=>$debug, 77 'keepold'=>$keepold, 78 'remove_empty_classifications'=>$remove_empty_classifications, 79 'outhandle'=>$outhandle, 80 'no_text'=>$no_text, 81 'failhandle'=>$failhandle, 82 'notbuilt'=>{}, # indexes not built 83 'gli'=>$gli 84 }, $class; 85 86 $self->{'gli'} = 0 unless defined $self->{'gli'}; 87 88 # read in the collection configuration file 89 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg"; 90 if (!-e $colcfgname) { 91 die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"; 92 } 93 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 62 my $self = new basebuilder (@_); 63 $self = bless $self, $class; 64 65 $self->{'buildtype'} = "mg"; 66 return $self; 67 } 68 69 sub default_buildproc { 70 my $self = shift (@_); 71 72 return "mgbuildproc"; 73 } 74 75 sub generate_index_list { 76 my $self = shift (@_); 94 77 95 78 if (!defined($self->{'collect_cfg'}->{'indexes'})) { 96 79 $self->{'collect_cfg'}->{'indexes'} = []; 97 80 } 98 99 # sort out subcollection indexes100 if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {101 my $indexes = $self->{'collect_cfg'}->{'indexes'};102 $self->{'collect_cfg'}->{'indexes'} = [];103 foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {104 foreach my $index (@$indexes) {105 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");106 }107 }108 }109 110 # sort out language subindexes111 if (defined $self->{'collect_cfg'}->{'languages'}) {112 my $indexes = $self->{'collect_cfg'}->{'indexes'};113 $self->{'collect_cfg'}->{'indexes'} = [];114 foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {115 foreach my $index (@$indexes) {116 if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {117 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");118 }119 else { # add in an empty subcollection field120 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");121 }122 }123 }124 }125 126 if (defined($self->{'collect_cfg'}->{'indexes'})) {127 # make sure that the same index isn't specified more than once128 my %tmphash = ();129 my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};130 $self->{'collect_cfg'}->{'indexes'} = [];131 foreach my $i (@tmparray) {132 if (!defined ($tmphash{$i})) {133 push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);134 $tmphash{$i} = 1;135 }136 }137 } else {138 $self->{'collect_cfg'}->{'indexes'} = [];139 }140 141 81 if (scalar(@{$self->{'collect_cfg'}->{'indexes'}}) == 0) { 142 82 # no indexes have been specified so we'll build a "dummy:text" index … … 144 84 } 145 85 146 # get the list of plugins for this collection 147 my $plugins = []; 148 if (defined $self->{'collect_cfg'}->{'plugin'}) { 149 $plugins = $self->{'collect_cfg'}->{'plugin'}; 150 } 151 152 # load all the plugins 153 154 #build up the extra global options for the plugins 155 my @global_opts = (); 156 if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) { 157 push @global_opts, "-separate_cjk"; 158 } 159 $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts); 160 161 if (scalar(@{$self->{'pluginfo'}}) == 0) { 162 print $outhandle "No plugins were loaded.\n"; 163 die "\n"; 164 } 165 166 # get the list of classifiers for this collection 167 my $classifiers = []; 168 if (defined $self->{'collect_cfg'}->{'classify'}) { 169 $classifiers = $self->{'collect_cfg'}->{'classify'}; 170 } 171 172 # load all the classifiers 173 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle); 174 175 # load up any dontgdbm fields 176 $self->{'dontgdbm'} = {}; 177 if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) { 178 foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) { 179 $self->{'dontgdbm'}->{$dg} = 1; 180 } 181 } 182 183 # load up the document processor for building 184 # if a buildproc class has been created for this collection, use it 185 # otherwise, use the mg buildproc 186 my ($buildprocdir, $buildproctype); 187 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") { 188 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; 189 $buildproctype = "${collection}buildproc"; 190 } else { 191 $buildprocdir = "$ENV{'GSDLHOME'}/perllib"; 192 $buildproctype = "mgbuildproc"; 193 } 194 195 require "$buildprocdir/$buildproctype.pm"; 196 197 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " . 198 "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)"); 199 die "$@" if $@; 200 201 return $self; 202 } 203 204 sub init { 205 my $self = shift (@_); 206 207 if (!$self->{'debug'} && !$self->{'keepold'}) { 208 # remove any old builds 209 &util::rm_r($self->{'build_dir'}); 210 &util::mk_all_dir($self->{'build_dir'}); 211 212 # make the text directory 213 my $textdir = "$self->{'build_dir'}/text"; 214 &util::mk_all_dir($textdir); 215 } 216 } 86 } 87 217 88 218 89 sub compress_text { … … 324 195 } 325 196 326 sub want_built {327 my $self = shift (@_);328 my ($index) = @_;329 330 if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {331 foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {332 if ($index =~ /^$checkstr$/) {333 #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});334 $self->{'notbuilt'}->{$index} = 1;335 return 0;336 }337 }338 }339 340 return 1;341 }342 343 sub build_indexes {344 my $self = shift (@_);345 my ($indexname) = @_;346 my $outhandle = $self->{'outhandle'};347 my $indexes = [];348 if (defined $indexname && $indexname =~ /\w/) {349 push @$indexes, $indexname;350 } else {351 $indexes = $self->{'collect_cfg'}->{'indexes'};352 }353 354 # create the mapping between the index descriptions355 # and their directory names356 $self->{'index_mapping'} = $self->create_index_mapping ($indexes);357 358 # build each of the indexes359 foreach my $index (@$indexes) {360 if ($self->want_built($index)) {361 print $outhandle "\n*** building index $index in subdirectory " .362 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);363 print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};364 $self->build_index($index);365 } else {366 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);367 }368 }369 }370 197 371 198 # creates directory names for each of the index descriptions … … 440 267 } 441 268 442 # returns a processed version of a field.443 # if the field has only one component the processed444 # version will contain the first character and next consonant445 # of that componant - otherwise it will contain the first446 # character of the first two components447 sub process_field {448 my $self = shift (@_);449 my ($field) = @_;450 451 return "" unless (defined ($field) && $field =~ /\w/);452 453 my @components = split /,/, $field;454 if (scalar @components >= 2) {455 splice (@components, 2);456 map {s/^(.).*$/$1/;} @components;457 return join("", @components);458 } else {459 my ($a, $b) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;460 ($a, $b) = $field =~ /^(.)(.)/ unless defined $a && defined $b;461 return "$a$b";462 }463 }464 269 465 270 sub make_unique { … … 477 282 return "$$indexref$$subref$$langref"; 478 283 } 479 480 sub get_next_version {481 my $self = shift (@_);482 my ($nameref) = @_;483 if ($$nameref =~ /(\d\d)$/) {484 my $num = $1; $num ++;485 $$nameref =~ s/\d\d$/$num/;486 } elsif ($$nameref =~ /(\d)$/) {487 my $num = $1;488 if ($num == 9) {$$nameref =~ s/\d$/10/;}489 else {$num ++; $$nameref =~ s/\d$/$num/;}490 } else {491 $$nameref =~ s/.$/0/;492 }493 }494 284 495 285 sub build_index { … … 696 486 } 697 487 698 sub make_infodatabase { 699 my $self = shift (@_); 700 my $outhandle = $self->{'outhandle'}; 701 702 my $textdir = &util::filename_cat($self->{'build_dir'}, "text"); 703 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); 704 &util::mk_all_dir ($textdir); 705 &util::mk_all_dir ($assocdir); 706 707 # get db name 708 my $dbext = ".bdb"; 709 $dbext = ".ldb" if &util::is_little_endian(); 710 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext"); 711 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i); 712 713 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; 714 my $exe = &util::get_os_exe (); 715 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe"); 716 717 print $outhandle "\n*** creating the info database and processing associated files\n" 718 if ($self->{'verbosity'} >= 1); 719 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 720 721 # init all the classifiers 722 &classify::init_classifiers ($self->{'classifiers'}); 723 724 # set up the document processor 725 my ($handle); 726 if ($self->{'debug'}) { 727 $handle = STDOUT; 728 } else { 729 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) { 730 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'}; 731 die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n"; 732 } 733 $handle = mgbuilder::PIPEOUT; 734 } 735 736 $self->{'buildproc'}->set_output_handle ($handle); 737 $self->{'buildproc'}->set_mode ('infodb'); 738 $self->{'buildproc'}->set_assocdir ($assocdir); 739 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'}); 740 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 741 $self->{'buildproc'}->set_indexing_text (0); 742 $self->{'buildproc'}->set_store_text(1); 743 $self->{'buildproc'}->reset(); 488 sub build_cfg_extra { 489 my $self = shift(@_); 490 my ($build_cfg) = @_; 744 491 745 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {746 747 if (!defined $self->{'index_mapping'}) {748 $self->{'index_mapping'} =749 $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});750 }751 752 print $handle "[collection]\n";753 754 foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {755 my $defaultfound=0;756 my $first=1;757 my $metadata_entry = "";758 my $default="";759 my $cmetamap = "";760 if ($cmeta =~ s/^\.//) {761 if (defined $self->{'index_mapping'}->{$cmeta}) {762 $cmetamap = $self->{'index_mapping'}->{$cmeta};763 $cmeta = ".$cmeta";764 }765 else {766 print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";767 next; #ignore this one768 }769 }770 else {771 $cmetamap = $cmeta; # just using the same name772 }773 #iterate through the languages774 foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {775 if ($first) {776 $first=0;777 #set the default default to the first entry778 $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};779 }780 if ($lang =~ /default/) {781 $defaultfound=1;782 #the default entry goes first783 $metadata_entry = "<$cmetamap>" .784 $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;785 }786 else {787 my ($l) = $lang =~ /^\[l=(\w*)\]$/;788 if ($l) {789 $metadata_entry .= "<$cmetamap:$l>" .790 $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";791 792 # Use the English value as the default if no default is specified793 if ($l =~ /en/i) {794 $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};795 }796 }797 }798 }799 #if we haven't found a default, put one in800 if (!$defaultfound) {801 $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;802 }803 #write the entry to the file804 print $handle $metadata_entry;805 806 }807 808 print $handle "\n" . ('-' x 70) . "\n";809 }810 811 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},812 "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});813 814 # output classification information815 &classify::output_classify_info ($self->{'classifiers'}, $handle,816 $self->{'remove_empty_classifications'},817 $self->{'gli'});818 819 820 #output doclist821 my @doclist = $self->{'buildproc'}->get_doc_list();822 my $docs = join (";",@doclist);823 print $handle "[browselist]\n";824 print $handle "<hastxt>0\n";825 print $handle "<childtype>VList\n";826 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";827 print $handle "<thistype>Invisible\n";828 print $handle "<contains>$docs";829 print $handle "\n" . ('-' x 70) . "\n";830 831 close ($handle) if !$self->{'debug'};832 833 print STDERR "</Stage>\n" if $self->{'gli'};834 }835 836 sub collect_specific {837 my $self = shift (@_);838 }839 840 sub make_auxiliary_files {841 my $self = shift (@_);842 my ($index);843 my $build_cfg = {};844 my $outhandle = $self->{'outhandle'};845 846 print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);847 print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};848 849 # get the text directory850 &util::mk_all_dir ($self->{'build_dir'});851 852 # store the build date853 $build_cfg->{'builddate'} = time;854 $build_cfg->{'indexstem'} = $self->{'collection'};855 # store the number of documents and number of bytes856 $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();857 $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();858 859 492 # get additional stats from mg 860 493 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; … … 875 508 close PIPEIN; 876 509 } 877 878 # store the mapping between the index names and the directory names879 # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.880 my @indexmap = ();881 foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {882 if (not defined ($self->{'notbuilt'}->{$index})) {883 push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");884 }885 }886 $build_cfg->{'indexmap'} = \@indexmap;887 888 my @subcollectionmap = ();889 foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {890 push (@subcollectionmap, "$subcollection\-\>" .891 $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});892 }893 $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);894 895 my @languagemap = ();896 foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {897 push (@languagemap, "$language\-\>" .898 $self->{'index_mapping'}->{'languagemap'}->{$language});899 }900 $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);901 902 #$build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};903 my @notbuilt = ();904 foreach my $nb (keys %{$self->{'notbuilt'}}) {905 push (@notbuilt, $nb);906 }907 $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);908 $build_cfg->{'maxnumeric'} = 4;909 if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&910 $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {911 $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};912 }913 914 # write out the build information915 &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,916 '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$',917 '^(indexmap|subcollectionmap|languagemap|notbuilt)$');918 919 print STDERR "</Stage>\n" if $self->{'gli'};920 }921 922 sub deinit {923 my $self = shift (@_);924 925 &plugin::deinit($self->{'pluginfo'},$self->{'buildproc'});926 }927 928 sub print_stats {929 my $self = shift (@_);930 931 my $outhandle = $self->{'outhandle'};932 my $indexing_text = $self->{'buildproc'}->get_indexing_text();933 my $index = $self->{'buildproc'}->get_index();934 my $num_bytes = $self->{'buildproc'}->get_num_bytes();935 my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();936 937 if ($indexing_text) {938 print $outhandle "Stats (Creating index $index)\n";939 } else {940 print $outhandle "Stats (Compressing text from $index)\n";941 }942 print $outhandle "Total bytes in collection: $num_bytes\n";943 print $outhandle "Total bytes in $index: $num_processed_bytes\n";944 945 if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {946 print $outhandle "***************\n";947 if ($indexing_text) {948 print $outhandle "WARNING: There is very little or no text to process for $index\n";949 } elsif (!$self->{'no_text'}) {950 print $outhandle "WARNING: There is very little or no text to compress\n";951 }952 print $outhandle " Was this your intention?\n";953 print $outhandle "***************\n";954 print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};955 }956 510 } 957 511 -
trunk/gsdl/perllib/mgppbuilder.pm
r10158 r10468 26 26 package mgppbuilder; 27 27 28 use basebuilder; 28 29 use classify; 29 30 use cfgread; … … 33 34 use FileHandle; 34 35 35 36 BEGIN { 37 # set autoflush on for STDERR and STDOUT so that mgpp 38 # doesn't get out of sync with plugins 39 STDOUT->autoflush(1); 40 STDERR->autoflush(1); 41 } 42 43 END { 44 STDOUT->autoflush(0); 45 STDERR->autoflush(0); 46 } 47 48 our $maxdocsize = 12000; 36 sub BEGIN { 37 @mgppbuilder::ISA = ('basebuilder'); 38 } 39 40 49 41 50 42 our %level_map = ('document'=>'Doc', … … 107 99 'Para'=>1); 108 100 101 my $maxdocsize = $basebuilder::maxdocsize; 102 109 103 sub new { 110 104 my $class = shift(@_); … … 114 108 $outhandle, $no_text, $failhandle, $gli) = @_; 115 109 116 $outhandle = STDERR unless defined $outhandle; 117 $no_text = 0 unless defined $no_text; 118 119 # create an mgppbuilder object 120 my $self = bless {'collection'=>$collection, 121 'source_dir'=>$source_dir, 122 'build_dir'=>$build_dir, 123 'verbosity'=>$verbosity, 124 'maxdocs'=>$maxdocs, 125 'debug'=>$debug, 126 'keepold'=>$keepold, 127 'remove_empty_classifications'=>$remove_empty_classifications, 128 'outhandle'=>$outhandle, 129 'no_text'=>$no_text, 130 'notbuilt'=>{}, # indexes not built 131 'indexfieldmap'=>\%static_indexfield_map, 132 'gli'=>$gli 133 }, $class; 134 135 $self->{'gli'} = 0 unless defined $self->{'gli'}; 136 137 # read in the collection configuration file 138 my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg"; 139 if (!-e $colcfgname) { 140 die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n"; 141 } 142 $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname); 143 144 # sort out the indexes 145 #indexes are specified with spaces, but we put them into one index 146 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 147 $self->{'collect_cfg'}->{'indexes'} = []; 148 push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes)); 149 150 151 # sort out subcollection indexes 152 if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) { 153 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 154 $self->{'collect_cfg'}->{'indexes'} = []; 155 foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) { 156 foreach my $index (@$indexes) { 157 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection"); 158 } 159 } 160 } 161 162 # sort out language subindexes 163 if (defined $self->{'collect_cfg'}->{'languages'}) { 164 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 165 $self->{'collect_cfg'}->{'indexes'} = []; 166 foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) { 167 foreach my $index (@$indexes) { 168 if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) { 169 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language"); 170 } 171 else { # add in an empty subcollection field 172 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language"); 173 174 } 175 } 176 } 177 } 178 179 # make sure that the same index isn't specified more than once 180 my %tmphash = (); 181 my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}}; 182 $self->{'collect_cfg'}->{'indexes'} = []; 183 foreach my $i (@tmparray) { 184 if (!defined ($tmphash{$i})) { 185 push (@{$self->{'collect_cfg'}->{'indexes'}}, $i); 186 $tmphash{$i} = 1; 187 } 188 } 189 110 my $self = new basebuilder (@_); 111 $self = bless $self, $class; 112 113 $self->{'indexfieldmap'} = \%static_indexfield_map; 190 114 191 115 # get the levels (Section, Paragraph) for indexing and compression … … 212 136 } 213 137 214 print $outhandle "doclevel = ". $self->{'doc_level'}."\n";215 # get the list of plugins for this collection216 217 #build up the extra global options for the plugins218 my @global_opts = ();219 if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {220 push @global_opts, "-separate_cjk";221 }222 223 my $plugins = [];224 if (defined $self->{'collect_cfg'}->{'plugin'}) {225 $plugins = $self->{'collect_cfg'}->{'plugin'};226 }227 228 # load all the plugins229 $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);230 if (scalar(@{$self->{'pluginfo'}}) == 0) {231 print $outhandle "No plugins were loaded.\n";232 die "\n";233 }234 235 # get the list of classifiers for this collection236 my $classifiers = [];237 if (defined $self->{'collect_cfg'}->{'classify'}) {238 $classifiers = $self->{'collect_cfg'}->{'classify'};239 }240 241 # load all the classifiers242 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);243 244 # load up any dontgdbm fields245 $self->{'dontgdbm'} = {};246 if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {247 foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {248 $self->{'dontgdbm'}->{$dg} = 1;249 }250 }251 252 # load up the document processor for building253 # if a buildproc class has been created for this collection, use it254 # otherwise, use the mgpp buildproc255 my ($buildprocdir, $buildproctype);256 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {257 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";258 $buildproctype = "${collection}buildproc";259 } else {260 $buildprocdir = "$ENV{'GSDLHOME'}/perllib";261 $buildproctype = "mgppbuildproc";262 }263 require "$buildprocdir/$buildproctype.pm";264 265 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .266 "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)");267 die "$@" if $@;268 269 138 $self->{'buildtype'} = "mgpp"; 270 139 … … 272 141 } 273 142 274 sub init { 275 my $self = shift (@_); 276 277 if (!$self->{'debug'} && !$self->{'keepold'}) { 278 # remove any old builds 279 &util::rm_r($self->{'build_dir'}); 280 &util::mk_all_dir($self->{'build_dir'}); 281 282 # make the text directory 283 my $textdir = "$self->{'build_dir'}/text"; 284 &util::mk_all_dir($textdir); 285 } 286 } 287 288 sub set_strip_html { 289 my $self = shift (@_); 290 my ($strip) = @_; 291 292 $self->{'strip_html'} = $strip; 293 $self->{'buildproc'}->set_strip_html($strip); 143 sub generate_index_list { 144 my $self = shift (@_); 145 146 # sort out the indexes 147 #indexes are specified with spaces, but we put them into one index 148 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 149 $self->{'collect_cfg'}->{'indexes'} = []; 150 push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes)); 151 } 152 153 sub default_buildproc { 154 my $self = shift (@_); 155 156 return "mgppbuildproc"; 294 157 } 295 158 … … 423 286 } 424 287 425 sub want_built { 426 my $self = shift (@_); 427 my ($index) = @_; 428 429 if (defined ($self->{'collect_cfg'}->{'dontbuild'})) { 430 foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) { 431 if ($index =~ /^$checkstr$/) { 432 #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index}); 433 $self->{'notbuilt'}->{$index} = 1; 434 return 0; 435 } 436 } 437 } 438 439 return 1; 440 } 441 442 sub build_indexes { 443 my $self = shift (@_); 444 my ($indexname) = @_; 445 my $outhandle = $self->{'outhandle'}; 446 447 my $indexes = []; 448 if (defined $indexname && $indexname =~ /\w/) { 449 push @$indexes, $indexname; 450 } else { 451 $indexes = $self->{'collect_cfg'}->{'indexes'}; 452 } 453 454 # create the mapping between the index descriptions 455 # and their directory names (includes subcolls and langs) 456 $self->{'index_mapping'} = $self->create_index_mapping ($indexes); 457 458 # build each of the indexes 459 foreach my $index (@$indexes) { 460 if ($self->want_built($index)) { 461 print $outhandle "\n*** building index $index in subdirectory " . 462 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1); 463 print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'}; 464 $self->build_index($index); 465 } else { 466 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1); 467 } 468 } 469 288 289 sub build_indexes_extra { 290 my $self = shift(@_); 470 291 #define the final field lists 471 292 $self->make_final_field_list(); 472 473 } 293 } 474 294 475 295 # creates directory names for each of the index descriptions … … 545 365 } 546 366 547 # returns a processed version of a field.548 # if the field has only one component the processed549 # version will contain the first character and next consonant550 # of that componant - otherwise it will contain the first551 # character of the first two components552 sub process_field {553 my $self = shift (@_);554 my ($field) = @_;555 556 return "" unless (defined ($field) && $field =~ /\w/);557 558 my @components = split /,/, $field;559 if (scalar @components >= 2) {560 splice (@components, 2);561 map {s/^(.).*$/$1/;} @components;562 return join("", @components);563 } else {564 my ($a, $b) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;565 ($a, $b) = $field =~ /^(.)(.)/ unless defined $a && defined $b;566 return "$a$b";567 }568 }569 570 367 sub make_unique { 571 368 my $self = shift (@_); … … 583 380 } 584 381 585 sub get_next_version {586 my $self = shift (@_);587 my ($nameref) = @_;588 my $num=0;589 if ($$nameref =~ /(\d\d)$/) {590 $num = $1; $num ++;591 $$nameref =~ s/\d\d$/$num/;592 } elsif ($$nameref =~ /(\d)$/) {593 $num = $1;594 if ($num == 9) {$$nameref =~ s/\d$/10/;}595 else {$num ++; $$nameref =~ s/\d$/$num/;}596 } else {597 $$nameref =~ s/.$/0/;598 }599 }600 382 601 383 sub build_index { … … 812 594 } 813 595 814 sub make_infodatabase { 815 my $self = shift (@_); 816 my $outhandle = $self->{'outhandle'}; 817 818 819 my $textdir = &util::filename_cat($self->{'build_dir'}, "text"); 820 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); 821 &util::mk_all_dir ($textdir); 822 &util::mk_all_dir ($assocdir); 823 824 # get db name 825 my $dbext = ".bdb"; 826 $dbext = ".ldb" if &util::is_little_endian(); 827 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext"); 828 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i); 829 830 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; 831 my $exe = &util::get_os_exe (); 832 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe"); 833 834 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index) 835 if (!defined $self->{'build_cfg'}) { 836 $self->read_final_field_list(); 837 } 838 print $outhandle "\n*** creating the info database and processing associated files\n" 839 if ($self->{'verbosity'} >= 1); 840 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 841 842 # init all the classifiers 843 &classify::init_classifiers ($self->{'classifiers'}); 844 845 # set up the document processor 846 my ($handle); 847 if ($self->{'debug'}) { 848 $handle = STDOUT; 849 } else { 850 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) { 851 print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'}; 852 die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n"; 853 } 854 $handle = mgppbuilder::PIPEOUT; 855 } 856 857 $self->{'buildproc'}->set_output_handle ($handle); 858 $self->{'buildproc'}->set_mode ('infodb'); 859 $self->{'buildproc'}->set_assocdir ($assocdir); 860 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'}); 861 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 862 $self->{'buildproc'}->set_indexing_text (0); 863 $self->{'buildproc'}->set_store_text(1); 864 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 865 866 # make_infodatabase does not support incremental build 867 # => full reset needed 868 $self->{'buildproc'}->zero_reset(); 596 597 sub output_collection_meta { 598 my $self = shift(@_); 599 my ($handle) = @_; 869 600 870 601 # do the collection info … … 873 604 # first do the collection meta stuff - everything without a dot 874 605 my $collmetadefined = 0; 606 my $metadata_entry; 875 607 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) { 876 608 $collmetadefined = 1; 877 609 foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) { 878 610 next if ($cmeta =~ /^\./); # for now, ignore ones with dots 879 my ($metadata_entry)= $self->create_language_db_map($cmeta, $cmeta);611 $metadata_entry = $self->create_language_db_map($cmeta, $cmeta); 880 612 #write the entry to the file 881 613 print $handle $metadata_entry; … … 896 628 $collmeta = ".$longfield"; 897 629 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) { 898 my$metadata_entry = $self->create_language_db_map($collmeta, $shortfield);630 $metadata_entry = $self->create_language_db_map($collmeta, $shortfield); 899 631 $field_entry .= $metadata_entry; 900 632 } else { #use the metadata names, or the text macros for allfields and textonly … … 917 649 my $levelid = $level_map{$level}; # find the actual value we used in the index 918 650 if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) { 919 my$metadata_entry = $self->create_language_db_map($collmeta, $levelid);651 $metadata_entry = $self->create_language_db_map($collmeta, $levelid); 920 652 $level_entry .= $metadata_entry; 921 653 } else { … … 955 687 # end the collection entry 956 688 print $handle "\n" . ('-' x 70) . "\n"; 957 958 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 959 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 960 961 # output classification information 962 &classify::output_classify_info ($self->{'classifiers'}, $handle, 963 $self->{'remove_empty_classifications'}, 964 $self->{'gli'}); 965 966 #output doclist 967 my @doclist = $self->{'buildproc'}->get_doc_list(); 968 my $docs = join (";",@doclist); 969 print $handle "[browselist]\n"; 970 print $handle "<hastxt>0\n"; 971 print $handle "<childtype>VList\n"; 972 print $handle "<numleafdocs>" . ($#doclist+1) . "\n"; 973 print $handle "<thistype>Invisible\n"; 974 print $handle "<contains>$docs"; 975 print $handle "\n" . ('-' x 70) . "\n"; 976 close ($handle) if !$self->{'debug'}; 977 978 print STDERR "</Stage>\n" if $self->{'gli'}; 979 } 980 689 690 691 } 981 692 sub create_language_db_map { 982 693 my $self = shift (@_); … … 1020 731 1021 732 } 1022 sub collect_specific {1023 my $self = shift (@_);1024 }1025 733 1026 734 # at the end of building, we have an indexfieldmap with all the mappings, … … 1047 755 # those again. 1048 756 1049 foreach my $field (@{$self->{'collect_cfg'}->{'indexes'}}) { 757 my $field; 758 foreach $field (@{$self->{'collect_cfg'}->{'indexes'}}) { 1050 759 # remove subcoll stuff 1051 760 my $parts = $field; … … 1061 770 1062 771 #add all fields bit 1063 foreach my$field (@specifiedfieldorder) {772 foreach $field (@specifiedfieldorder) { 1064 773 if ($field eq "metadata") { 1065 774 foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) { … … 1114 823 1115 824 my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile); 1116 825 my $field; 1117 826 if (defined $buildcfg->{'indexfields'}) { 1118 foreach my$field (@{$buildcfg->{'indexfields'}}) {827 foreach $field (@{$buildcfg->{'indexfields'}}) { 1119 828 push (@indexfields, "$field"); 1120 829 } … … 1122 831 1123 832 if (defined $buildcfg->{'indexfieldmap'}) { 1124 foreach my$field (@{$buildcfg->{'indexfieldmap'}}) {833 foreach $field (@{$buildcfg->{'indexfieldmap'}}) { 1125 834 push (@indexfieldmap, "$field"); 1126 835 my ($f, $v) = $field =~ /^(.*)\-\>(.*)$/; … … 1133 842 } 1134 843 1135 sub make_auxiliary_files { 844 845 sub write_cfg_file { 846 my $self = shift(@_); 847 my ($build_cfg) = @_; 848 849 # write out the build information 850 &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg, 851 '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem|maxnumeric)$', 852 '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$'); 853 854 } 855 856 sub build_cfg_extra { 1136 857 my $self = shift (@_); 1137 my ($index); 1138 1139 my $build_cfg = {}; 1140 # this already includes indexfieldmap and indexfields 1141 if (defined $self->{'build_cfg'}) { 1142 $build_cfg = $self->{'build_cfg'}; 1143 } 1144 1145 my $outhandle = $self->{'outhandle'}; 1146 print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1); 1147 print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'}; 1148 1149 # get the text directory 1150 &util::mk_all_dir ($self->{'build_dir'}); 1151 1152 # store the build date 1153 $build_cfg->{'builddate'} = time; 1154 $build_cfg->{'buildtype'} = $self->{'buildtype'}; 1155 $build_cfg->{'indexstem'} = $self->{'collection'}; 858 my ($build_cfg) = @_; 859 860 $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections(); 861 1156 862 # store the level info 1157 863 my @indexlevels = (); … … 1169 875 $build_cfg->{'textlevel'} = $level_map{'document'}; 1170 876 } 1171 # store the number of documents and number of bytes 1172 $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs(); 1173 $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections(); 1174 $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes(); 1175 1176 # store the mapping between the index names and the directory names 1177 my @indexmap = (); 1178 foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) { 1179 if (not defined ($self->{'notbuilt'}->{$index})) { 1180 push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}"); 1181 } 1182 } 1183 $build_cfg->{'indexmap'} = \@indexmap; 1184 1185 my @subcollectionmap = (); 1186 foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) { 1187 push (@subcollectionmap, "$subcollection\-\>" . 1188 $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection}); 1189 } 1190 $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap); 1191 1192 my @languagemap = (); 1193 foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) { 1194 push (@languagemap, "$language\-\>" . 1195 $self->{'index_mapping'}->{'languagemap'}->{$language}); 1196 } 1197 $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap); 1198 1199 my @notbuilt = (); 1200 foreach my $nb (keys %{$self->{'notbuilt'}}) { 1201 push (@notbuilt, $nb); 1202 } 1203 $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt); 1204 1205 # write out the build information 1206 &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg, 1207 '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem)$', 1208 '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$'); 1209 1210 print STDERR "</Stage>\n" if $self->{'gli'}; 1211 } 1212 1213 sub deinit { 1214 my $self = shift (@_); 1215 1216 &plugin::deinit($self->{'pluginfo'},$self->{'buildproc'}); 1217 } 1218 1219 sub print_stats { 1220 my $self = shift (@_); 1221 1222 my $outhandle = $self->{'outhandle'}; 1223 my $indexing_text = $self->{'buildproc'}->get_indexing_text(); 1224 my $index = $self->{'buildproc'}->get_index(); 1225 my $num_bytes = $self->{'buildproc'}->get_num_bytes(); 1226 my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes(); 1227 1228 if ($indexing_text) { 1229 print $outhandle "Stats (Creating index $index)\n"; 1230 } else { 1231 print $outhandle "Stats (Compressing text from $index)\n"; 1232 } 1233 print $outhandle "Total bytes in collection: $num_bytes\n"; 1234 print $outhandle "Total bytes in $index: $num_processed_bytes\n"; 1235 1236 if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) { 1237 1238 if ($self->{'keepold'}) { 1239 if ($num_processed_bytes == 0) { 1240 if ($indexing_text) { 1241 print $outhandle "No additional text was added to $index\n"; 1242 } elsif (!$self->{'no_text'}) { 1243 print $outhandle "No additional text was compressed\n"; 1244 } 1245 } 1246 } 1247 else { 1248 print $outhandle "***************\n"; 1249 if ($indexing_text) { 1250 print $outhandle "WARNING: There is very little or no text to process for $index\n"; 1251 } elsif (!$self->{'no_text'}) { 1252 print $outhandle "WARNING: There is very little or no text to compress\n"; 1253 } 1254 print $outhandle " Was this your intention?\n"; 1255 print $outhandle "***************\n"; 1256 } 1257 1258 } 1259 877 1260 878 } 1261 879
Note:
See TracChangeset
for help on using the changeset viewer.