source: trunk/gsdl/perllib/mgbuildproc.pm@ 315

Last change on this file since 315 was 315, checked in by sjboddie, 25 years ago
  • removed old infodb stuff
  • changed the way classifiers work
  • added maxdocs and allclassifications options
  • no longer get doctype from collect.cfg but instead set it directly in plugins that don't use the default
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use classify;
8use doc;
9use docproc;
10use util;
11
12
# Make this package a subclass of docproc.  The assignment lives in a
# BEGIN block so it takes effect as soon as the block has been parsed.
BEGIN {
    our @ISA = qw(docproc);
}
16
17
# Constructor.
#
# Arguments (after the class name):
#   $collection, $source_dir, $build_dir, $verbosity
#       stored unchanged under the hash keys of the same names.
#
# The object starts out as whatever docproc's constructor returns and is
# then reblessed into $class.  Defaults set here: mode "text", index
# "section:text", no subcollection expressions, no classifiers, output to
# the handle named "STDOUT", all counters zero and indexing_text off.
#
# Returns the new object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # Explicit arrow call: the indirect-object form "new docproc ()" relies
    # on parser heuristics and is best avoided.
    my $self = docproc->new();

    my %settings = (
        'collection'    => $collection,
        'source_dir'    => $source_dir,
        'build_dir'     => $build_dir,
        'verbosity'     => $verbosity,
        'classifiers'   => [],
        'mode'          => "text",
        'index'         => "section:text",
        'indexexparr'   => [],
        'output_handle' => "STDOUT",
        'num_docs'      => 0,
        'num_sections'  => 0,
        'num_bytes'     => 0,
        'indexing_text' => 0,
    );
    @{$self}{keys %settings} = values %settings;

    return bless $self, $class;
}
39
# Zero the three running statistics (documents, sections, bytes).
# Nothing else on the object is touched.
sub reset {
    my ($self) = @_;

    $self->{'num_docs'} = $self->{'num_sections'} = $self->{'num_bytes'} = 0;
}
47
# Returns the current value of the num_docs counter.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
53
# Returns the current value of the num_sections counter.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
59
# Returns the current value of the num_bytes counter.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
65
# Sets the handle that infodb() and text() print to.
#   $handle - stored as given under 'output_handle'
sub set_output_handle {
    my ($self, $handle) = @_;
    $self->{'output_handle'} = $handle;
}
72
# Sets the processing mode.  process() uses this value as the name of the
# method to dispatch to.
#   $mode - stored as given under 'mode'
sub set_mode {
    my ($self, $mode) = @_;
    $self->{'mode'} = $mode;
}
79
# Sets the index specification and, optionally, the list of subcollection
# expressions.
#   $index       - stored under 'index'; text() splits it on ":" into a
#                  level and a field list
#   $indexexparr - optional array reference; when it is undefined the
#                  existing 'indexexparr' value is left untouched
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    defined $indexexparr and $self->{'indexexparr'} = $indexexparr;
}
87
# Sets the classifiers that infodb() hands to classify::classify_doc.
#   $classifiers - stored as given under 'classifiers'
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
94
# Sets the 'indexing_text' flag.
#   $indexing_text - stored as given
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
101
# Dispatches to the method named by the current mode (see set_mode),
# forwarding every remaining argument, and returns whatever that method
# returns.
sub process {
    my $self = shift;
    my $handler = $self->{'mode'};

    # @_ is passed straight through so the handler sees the caller's
    # arguments exactly as process() received them.
    return $self->$handler(@_);
}
108
# Writes the info-database records for one document to the output handle.
#
# Arguments:
#   $doc_obj  - document object.  Only documents whose get_doc_type() is
#               "indexed_doc" are written; for anything else the method
#               returns immediately without output.
#   $filename - path of the document.  Everything before its last "/" or
#               "\" (with leading and trailing separators stripped) is
#               written as <archivedir> on the top section.  May be
#               undefined, in which case an empty <archivedir> is written.
#
# For every section, starting at get_top_section() and following
# get_next_section(), two records are written, each ended by a line of 70
# dashes:
#   [OID] or [OID.section]  with <doctype>, <hastxt>, the section metadata,
#                           <archivedir> (top section only), <contains>
#                           (only if the section has children) and <docnum>
#   [docnum]                with <section> pointing back at the OID
#
# num_docs is incremented once; num_sections and num_bytes are updated per
# section.  Finally the document is handed to classify::classify_doc and
# that call's result is returned.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # this is another document
    $self->{'num_docs'} += 1;

    my $doc_OID     = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section();
    my $rule        = ('-' x 70) . "\n";

    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        my $text_length = $doc_obj->get_text_length ($section);
        $self->{'num_bytes'}    += $text_length;
        $self->{'num_sections'} += 1;

        # the top section is written as the bare OID, the rest as OID.section
        my $section_OID = ($section eq "") ? $doc_OID : "$doc_OID.$section";

        print {$handle} "[$section_OID]\n";

        # output the fact that this document is a document
        print {$handle} "<doctype>doc\n";

        # output whether this node contains text
        print {$handle} "<hastxt>" . ($text_length > 0 ? 1 : 0) . "\n";

        # output all the section metadata, except Identifier, gsdl* fields
        # and empty values
        foreach my $pair (@{$doc_obj->get_all_metadata ($section)}) {
            my ($field, $value) = @$pair;

            next if $field eq "Identifier" || $field =~ /^gsdl/;
            next unless defined $value && $value ne "";

            # escape problematic stuff so each value stays on one line
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            print {$handle} "<$field>$value\n";
        }

        # output archivedir if at top level
        if ($section eq $top_section) {
            my $archivedir = "";
            if (defined $filename &&
                $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/) {
                $archivedir = $1;
            }
            $archivedir =~ s/^(\/|\\)*//;
            $archivedir =~ s/(\/|\\)*$//;
            print {$handle} "<archivedir>$archivedir\n";
        }

        # output a list of children, each written as ".N" after a double
        # quote (N is the part after the last dot when the child name ends
        # in ".digits", otherwise the whole child name)
        # NOTE(review): the leading double quote is presumably shorthand
        # for the parent's OID -- confirm against the database reader.
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            my @child_refs = map {
                /^.*?\.(\d+)$/ ? "\".$1" : "\".$_"
            } @$children;
            print {$handle} "<contains>" . join (";", @child_refs) . "\n";
        }

        # output the matching document number
        print {$handle} "<docnum>$self->{'num_sections'}\n";
        print {$handle} $rule;

        # output a database entry for the document number
        print {$handle} "[$self->{'num_sections'}]\n";
        print {$handle} "<section>$section_OID\n";
        print {$handle} $rule;

        $section = $doc_obj->get_next_section ($section);
    }

    # classify this document
    return classify::classify_doc ($self->{'classifiers'}, $doc_obj);
}
209
# Marks paragraph starts in the text passed as the second argument.
# A \cC character is inserted in front of every "<p" (any letter case)
# that is followed by a word boundary, so "<p>" and "<P class=..>" are
# marked but "<pre>" is not.  The text is edited in place through @_, so
# the caller's own variable is changed.
sub find_paragraphs {
    $_[1] =~ s/(?=<p\b)/\cC/gi;
}
213
# Hook called by text() as $self->filter_text ($field, $new_text) for
# every piece of text before it is appended to the output.
#
# This version deliberately does nothing.  A particular collection can
# override it to post-process certain fields, depending on the field or
# on whether the text is being output for indexing.
sub filter_text {
    return;
}
221
# Writes the text of one document to the output handle.
#
# Arguments:
#   $doc_obj - document object.  Only documents whose get_doc_type() is
#              "indexed_doc" are processed; for anything else the method
#              returns immediately.
#
# Subcollection filter: each entry of 'indexexparr' has the form
# "field/expression/options".  A leading "!" on the field negates the
# test, the field name "filename" tests get_source_filename() instead of
# top-section metadata, and the option "i" makes the match case
# insensitive.  The document counts as a member as soon as one entry is
# satisfied; with an empty list every document is a member.
#
# Non-members are still counted in num_docs and num_sections and still
# produce their \cB markers, but contribute no text and no bytes.
#
# Output: for each section, every selected field's text followed by \cC,
# then a \cB.  When the index level is "document" the \cB markers are
# held back and written after all of the document's text.  Fields come
# from the part of 'index' after the ":"; "all" and "topall" expand to
# Title, Creator and text (with a "top" prefix for topall), and a field
# with a "top" prefix is only taken from the first section.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless defined ($field) && defined ($exp);

        my $negate = ($field =~ s/^!//) ? 1 : 0;
        my $value = ($field eq "filename")
            ? $doc_obj->get_source_filename()
            : $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $field);
        next unless defined $value;

        # the options part is optional, so it may be undefined here
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $matched = $ignore_case ? ($value =~ /$exp/i) : ($value =~ /$exp/);

        if ($negate ? !$matched : $matched) {
            $indexed_doc = 1;
            last;
        }
    }

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    $level  = "" unless defined $level;
    $fields = "" unless defined $fields;
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($indexed_doc) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top
                # or this is the first section
                my $real_field = $field;
                my $top_only = ($real_field =~ s/^top//) ? 1 : 0;
                next if $top_only && $doc_section != 1;

                my $new_text = "";
                if ($real_field eq "text") {
                    $new_text = $doc_obj->get_text ($section);
                    # drop any marker characters already in the text
                    # before adding our own
                    $new_text =~ s/[\cB\cC]//g;
                    $self->find_paragraphs ($new_text);
                } else {
                    $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
                }

                # filter the text (the hook gets the unstripped field name)
                $self->filter_text ($field, $new_text);

                $text .= "$new_text\cC";
            }
        }

        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section ($section);
    }

    print {$handle} "$text$text_extra";
}
316
# Converts the leading number of a classification back to the character
# it represents, i.e. 67.2.4 becomes C.2.4
#
#   $classification - classification string; one optional leading "." is
#                     tolerated and kept (.67.2.4 becomes .C.2.4)
#
# Returns the converted string.  An empty or undefined value, or a string
# that does not start with digits, is returned unchanged.
sub char_classification {
    my $self = shift (@_);
    my ($classification) = @_;

    return $classification
        unless defined $classification && $classification ne "";

    # One substitution does both the lookup and the replacement, so the
    # optional leading dot is handled the same way in both steps.
    $classification =~ s/^(\.?)(\d+)/$1 . chr($2)/e;

    return $classification;
}
331
# Converts the leading character of a classification into its character
# code, i.e. C.2.4 becomes 67.2.4
#
#   $classification - classification string
#
# Returns the converted string; an empty string is returned unchanged.
sub int_classification {
    my ($self, $classification) = @_;

    # replace the first character by its ord() value
    $classification =~ s/^(.)/ord($1)/e;

    return $classification;
}
3421;
343
Note: See TracBrowser for help on using the repository browser.