Changeset 215 for trunk/gsdl/perllib/mgbuildproc.pm
- Timestamp:
- 1999-03-30T17:08:18+12:00 (25 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgbuildproc.pm
# trunk/gsdl/perllib/mgbuildproc.pm -- code shown in changeset r139 -> r215.
# The changeset viewer elides unchanged regions of the file.  Each elision is
# marked "[...]" below, and code that is only partially visible in this view
# is kept verbatim inside comments rather than guessed at.

package mgbuildproc;

use classify;
use doc;
use docproc;
use util;

# [...]
# Context from an enclosing sub whose header is not visible in this view
# (presumably the constructor -- confirm against the full file):
#     $self->{'build_dir'} = $build_dir;
#     $self->{'verbosity'} = $verbosity;
#     $self->{'classifiers'} = [];
#     $self->{'mode'} = "text";
#     $self->{'index'} = "section:text";
# [...]
#     }

# Replace the classifier list kept in $self->{'classifiers'}.
#   $classifiers - stored as given; newinfodb() later passes it as the first
#                  argument to classify::classify_doc().
# Returns the stored value (the value of the final assignment).
sub set_classifiers {
    my $self = shift (@_);
    my ($classifiers) = @_;

    $self->{'classifiers'} = $classifiers;
}

# [...] sub process is only partially visible in this view:
#     sub process {
#         my $self = shift (@_);
#         [...]
#         $self->$method(@_);
#     }

# Write the info-database records for one document to the output handle and
# then pass the document to the classifiers.
#
#   $doc_obj  - document object; the methods used here are get_doc_type,
#               get_top_section, get_next_section, get_OID, get_text_length,
#               get_all_metadata and get_children.  The literal string
#               'classifications' is also accepted and ignored.
#   $filename - accepted for interface compatibility; not used in this sub.
#
# For every section two records are printed, each terminated by a line of
# 70 dashes:
#   [<section OID>]      <doctype>, <hastxt>, metadata, <contains>, <docnum>
#   [<section number>]   <section><section OID>
#
# Side effects: updates $self->{'num_docs'}, $self->{'num_bytes'} and
# $self->{'num_sections'}.
# Returns nothing for skipped documents, otherwise whatever
# classify::classify_doc() returns.
sub newinfodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;

    # NOTE(review): the disabled override in the original ("main::STDOUT")
    # suggests this may be a handle *name* used symbolically -- confirm
    # before enabling "use strict" for this file.
    my $handle = $self->{'output_handle'};
    # $handle = "main::STDOUT";

    # this was used in the old version
    return if ($doc_obj eq 'classifications');

    # only output this document if it is one to be indexed
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc");

    # From here on $doctype is always "indexed_doc", so the original
    # 'unless ($doctype eq "classification")' guards on the counters could
    # never suppress an increment and have been dropped.
    $self->{'num_docs'} += 1;

    my $doc_OID = $doc_obj->get_OID();
    my $section = $doc_obj->get_top_section ();
    while (defined $section) {
        # the top section is stored as "" and is addressed by the bare OID
        my $section_OID = ($section eq "") ? "$doc_OID" : "$doc_OID.$section";
        my $text_length = $doc_obj->get_text_length ($section);

        # update a few statistics
        $self->{'num_bytes'} += $text_length;
        $self->{'num_sections'} += 1;

        # output the section name
        print $handle "[$section_OID]\n";

        # output the fact that this document is a document
        print $handle "<doctype>doc\n";

        # output whether this node contains text
        print $handle ($text_length > 0 ? "<hastxt>1\n" : "<hastxt>0\n");

        # output all the section metadata, skipping the Identifier field,
        # fields starting with "gsdl", and undefined or empty values
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff: every record line is terminated by a
            # newline, so backslashes and line breaks in values are escaped
            # (backslash first, so the escapes added below are not doubled)
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            print $handle "<$field>$value\n";
        }

        # output a list of children, separated by ";"
        # NOTE(review): get_children() is called without $section even though
        # it sits inside the per-section loop -- confirm against doc.pm
        # whether get_children($section) was intended.
        my $children = $doc_obj->get_children ();
        if (scalar(@$children) > 0) {
            my @child_OIDs = map { ($_ eq "") ? "$doc_OID" : "$doc_OID.$_" }
                                 @$children;
            print $handle "<contains>" . join (";", @child_OIDs) . "\n";
        }

        # output the matching document number (the running section count)
        print $handle "<docnum>$self->{'num_sections'}\n";

        print $handle '-' x 70, "\n";


        # output a database entry for the document number, pointing back
        # at the section it belongs to
        print $handle "[$self->{'num_sections'}]\n";
        print $handle "<section>$section_OID\n";
        print $handle '-' x 70, "\n";


        $section = $doc_obj->get_next_section($section);
    }

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

}

# [...] sub infodb starts here and runs past the end of this view:
#     sub infodb {
Note:
See TracChangeset
for help on using the changeset viewer.