Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 315

Last change on this file since 315 was 315, checked in by sjboddie, 25 years ago
Removed old infodb stuff; changed the way classifiers work; added maxdocs and allclassifications options; no longer get doctype from collect.cfg but instead set it directly in plugins that don't use the default.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Line
1	# This document processor outputs a document
2	# for mg to process
3
4
5	package mgbuildproc;
6
7	use classify;
8	use doc;
9	use docproc;
10	use util;
11
12
# Declare docproc as the parent class.  The assignment sits in a BEGIN
# block so that it is executed while the file is still being compiled.
BEGIN {
    @ISA = qw(docproc);
}
16
17
# Constructor.
#
# Arguments (all stored on the object unchanged):
#   $class      - class to bless the new object into
#   $collection - stored under 'collection'
#   $source_dir - stored under 'source_dir'
#   $build_dir  - stored under 'build_dir'
#   $verbosity  - stored under 'verbosity'
#
# Defaults set here: mode "text", index "section:text", no subcollection
# expressions, no classifiers, output handle "STDOUT", all counters zero
# and indexing_text switched off.
#
# Returns the new object, built on top of a docproc object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # Call the parent constructor with explicit class-method syntax.  The
    # indirect form "new docproc ()" is resolved heuristically by the
    # parser and is easy to confuse with this package's own new().
    my $self = docproc->new();

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'verbosity'} = $verbosity;
    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";             # method name used by process()
    $self->{'index'} = "section:text";    # "<level>:<comma separated fields>"
    $self->{'indexexparr'} = [];
    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = 0;
    $self->{'num_sections'} = 0;
    $self->{'num_bytes'} = 0;

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
39
# Zero the document, section and byte counters.
sub reset {
    my ($self) = @_;

    @$self{'num_docs', 'num_sections'} = (0, 0);
    $self->{'num_bytes'} = 0;
}
47
# Accessor for the 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
53
# Accessor for the 'num_sections' counter.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
59
# Accessor for the 'num_bytes' counter.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
65
# Choose the handle that infodb() and text() print to.
#   $handle - stored as given under 'output_handle'
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
}
72
# Select the processing mode.
#   $mode - name of the method that process() will invoke on this object
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
}
79
# Set the index specification used by text().
#   $index       - always stored under 'index'
#   $indexexparr - optional; replaces 'indexexparr' only when defined, so
#                  omitting it keeps the current expressions
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
87
# Install the classifiers that infodb() hands each document to.
#   $classifiers - stored as given under 'classifiers'
sub set_classifiers {
    my ($self, $classifiers) = @_;

    $self->{'classifiers'} = $classifiers;
}
94
# Record the 'indexing_text' flag.
#   $indexing_text - stored as given under 'indexing_text'
sub set_indexing_text {
    my ($self, $indexing_text) = @_;

    $self->{'indexing_text'} = $indexing_text;
}
101
# Dispatch to the method named by the current mode, forwarding every
# remaining argument untouched and handing back whatever it returns.
sub process {
    my $self = shift;
    my $handler = $self->{'mode'};

    return $self->$handler(@_);
}
108
# Write the info-database records for one document to the output handle
# and then pass the document to the classifiers.
#
# Arguments:
#   $doc_obj  - document object; nothing is written unless its
#               get_doc_type() is "indexed_doc"
#   $filename - path of the document file; everything before the last
#               "/" or "\" is written as <archivedir> for the top section
#
# Two records are written per section, each terminated by a line of 70
# dashes:
#   [OID] or [OID.section]  - doctype, hastxt, metadata, archivedir (top
#                             section only), contains, docnum
#   [section number]        - maps the running section number back to
#                             the section's OID
#
# Side effects: increments 'num_docs' once, and 'num_sections' and
# 'num_bytes' for every section.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;

    # may be a handle or just the name of one (the default is "STDOUT")
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # this is another document
    $self->{'num_docs'} += 1;

    my $doc_OID = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section ();
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1;

        # the top section is keyed by the bare OID
        my $section_OID = ($section eq "") ? $doc_OID : "$doc_OID.$section";

        # output the section name
        print $handle "[$section_OID]\n";

        # output the fact that this document is a document
        print $handle "<doctype>doc\n";

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0) {
            print $handle "<hastxt>1\n";
        } else {
            print $handle "<hastxt>0\n";
        }

        # output all the section metadata, except the identifier, the
        # gsdl* fields and anything without a value
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff: every record line must stay on
            # a single physical line
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            print $handle "<$field>$value\n";
        }

        # output archivedir if at top level
        if ($section eq $top_section) {
            # directory part of the filename: everything up to the last
            # forward or backward slash
            my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
            $archivedir = "" unless defined $archivedir;

            # trim any leading and trailing slashes
            $archivedir =~ s/^(\/|\\)*//;
            $archivedir =~ s/(\/|\\)*$//;
            print $handle "<archivedir>$archivedir\n";
        }

        # output a list of children; each one is written relative to
        # this section as ".N", separated by semicolons
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            print $handle "<contains>";
            my $firstchild = 1;
            foreach my $child (@$children) {
                print $handle ";" unless $firstchild;
                $firstchild = 0;
                if ($child =~ /^.*?\.(\d+)$/) {
                    # only the last numeric component is kept
                    print $handle "\".$1";
                } else {
                    print $handle "\".$child";
                }
            }
            print $handle "\n";
        }

        # output the matching document number
        print $handle "<docnum>$self->{'num_sections'}\n";

        print $handle '-' x 70, "\n";

        # output a database entry for the document number
        print $handle "[$self->{'num_sections'}]\n";
        print $handle "<section>$section_OID\n";
        print $handle '-' x 70, "\n";

        $section = $doc_obj->get_next_section($section);
    }

    # classify this document
    classify::classify_doc ($self->{'classifiers'}, $doc_obj);
}
209
# Mark paragraph starts: put a control-C in front of every "<p" tag
# (any case) in the second argument.  The argument is modified in place
# through @_, so the caller's own variable is changed.
sub find_paragraphs {
    $_[1] =~ s/(?=<p\b)/\cC/gi;
}
213
# Hook for post-processing a field's text; called as
#     $self->filter_text ($field, $new_text);
# This version deliberately does nothing.  A particular collection can
# override the method to post-process certain fields, depending on the
# field or on whether the text is being output for indexing.
sub filter_text {
    return;
}
221
# Write the text of one document to the output handle, in the form the
# indexer reads: the selected fields of each section, every field followed
# by a control-C, with control-B as the terminator.
#
# Arguments:
#   $doc_obj - document object; nothing is written unless its
#              get_doc_type() is "indexed_doc"
#
# Uses from the object:
#   'index'       - "<level>:<field>,<field>,...".  "all" and "topall"
#                   are shorthands expanded below; a field starting with
#                   "top" is only taken from the first section.
#   'indexexparr' - subcollection expressions "<field>/<regexp>/<options>";
#                   a leading "!" on the field negates the test and the
#                   option "i" makes it case insensitive.  With no
#                   expressions every document belongs; otherwise the
#                   first expression that holds admits the document.
#
# A document outside the subcollection is still counted and still gets
# its control-B terminators, only its text is left out (presumably so
# that numbering stays in step between indexes - confirm).
#
# Side effects: increments 'num_docs' once, 'num_sections' per section
# and, for documents in the subcollection, 'num_bytes' per section.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};
    my $indexed_doc = 1;

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # a leading "!" asks for documents that do NOT match
        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my $value;
        if ($field eq "filename") {
            $value = $doc_obj->get_source_filename();
        } else {
            $value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
        }
        next unless defined $value;

        # the options part is optional, so it may be undefined
        my $ignore_case = (defined $options && $options =~ /^i$/i) ? 1 : 0;
        my $matched = $ignore_case ? ($value =~ /$exp/i) : ($value =~ /$exp/);

        if ($negate ? !$matched : $matched) {
            $indexed_doc = 1;
            last;
        }
    }

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    $level = "" unless defined $level;
    $fields = "" unless defined $fields;   # index given without a field list
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
    my @field_list = split (/,/, $fields);

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($indexed_doc) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (@field_list) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                my $top_only = ($real_field =~ s/^top//) ? 1 : 0;
                next if ($top_only && $doc_section != 1);

                my $new_text = "";
                if ($real_field eq "text") {
                    $new_text = $doc_obj->get_text ($section);
                    # control-B and control-C are reserved as separators
                    $new_text =~ s/[\cB\cC]//g;
                    $self->find_paragraphs($new_text);
                } else {
                    $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
                }

                # filter the text; the return value is not used, so an
                # override has to edit $new_text in place through @_
                $self->filter_text ($field, $new_text);

                $text .= "$new_text\cC";
            }
        }

        # at document level the terminators are collected and written
        # after all the text; otherwise each section is closed at once
        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section($section);
    }

    print $handle "$text$text_extra";
}
316
# Converts the leading number of a classification back to the character
# it represents, i.e. 67.2.4 becomes C.2.4.  A single leading "." is
# allowed and kept, so .67.2.4 becomes .C.2.4.
#
# Arguments:
#   $classification - classification string
#
# Returns the converted string.  The empty string, and any string that
# does not start with a number, is returned unchanged.
sub char_classification {
    my $self = shift (@_);
    my ($classification) = @_;

    return $classification if $classification eq "";

    # nothing to convert, and no code to hand to chr(), without a
    # leading number
    return $classification unless $classification =~ /^(\.?)(\d+)/;

    my ($dot, $c) = ($1, chr($2));
    $classification =~ s/^\.?\d+/$dot$c/;

    return $classification;
}
331
# Replaces the first character of a classification with its character
# code as returned by ord(), i.e. C.2.4 becomes 67.2.4.
sub int_classification {
    my ($self, $classification) = @_;

    $classification =~ s/^(.)/ord($1)/e;

    return $classification;
}
342	1;
343

Note: See TracBrowser for help on using the repository browser.

Download in other formats: