Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 69

Last change on this file since 69 was 69, checked in by sjboddie, 25 years ago

Sub-collection indexes may now be defined within the collect.cfg file as
subcollection blah1 Title/blah/i
subcollection blah2 !Title/blah/i
indexsubcollections blah1 blah2 blah1,blah2
indexes section:text document:text
This example would create section:text and document:text indexes for:

the blah1 subcollection (i.e those documents whose Title field contains 'blah')
the blah2 subcollection (i.e. those documents whose Title field doesn't contain 'blah')
both subcollections (i.e. all documents)

The field to match the regular expression against (Title in this example) may be
any valid metadata tag or 'filename'.
The regular expression (blah in this example) may be any valid perl regular expression.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 15.0 KB

Line
1	# This document processor outputs a document
2	# for mg to process
3
4
5	package mgbuildproc;
6
7	use docproc;
8	use util;
9
10
11	BEGIN {
12	@ISA = ('docproc');
13	}
14
15
16	sub new {
17	my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18	my $self = new docproc ();
19
20	$self->{'collection'} = $collection;
21	$self->{'source_dir'} = $source_dir;
22	$self->{'build_dir'} = $build_dir;
23	$self->{'verbosity'} = $verbosity;
24	$self->{'mode'} = "text";
25	$self->{'index'} = "section:text";
26	$self->{'indexexparr'} = [];
27	$self->{'output_handle'} = "STDOUT";
28	$self->{'num_docs'} = 0;
29	$self->{'num_sections'} = 0;
30	$self->{'num_bytes'} = 0;
31
32	return bless $self, $class;
33	}
34
35	sub reset {
36	my $self = shift (@_);
37
38	$self->{'num_docs'} = 0;
39	$self->{'num_sections'} = 0;
40	$self->{'num_bytes'} = 0;
41	}
42
43	sub get_num_docs {
44	my $self = shift (@_);
45
46	return $self->{'num_docs'};
47	}
48
49	sub get_num_sections {
50	my $self = shift (@_);
51
52	return $self->{'num_sections'};
53	}
54
55	sub get_num_bytes {
56	my $self = shift (@_);
57
58	return $self->{'num_bytes'};
59	}
60
61	sub set_output_handle {
62	my $self = shift (@_);
63	my ($handle) = @_;
64
65	$self->{'output_handle'} = $handle;
66	}
67
68	sub set_mode {
69	my $self = shift (@_);
70	my ($mode) = @_;
71
72	$self->{'mode'} = $mode;
73	}
74
75	sub set_index {
76	my $self = shift (@_);
77	my ($index, $indexexparr) = @_;
78
79	$self->{'index'} = $index;
80	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
81	}
82
83	sub process {
84	my $self = shift (@_);
85	my ($doc_obj) = @_;
86	my $method = $self->{'mode'};
87
88	$self->$method($doc_obj);
89	}
90
91	sub infodb {
92	my $self = shift (@_);
93	my ($doc_obj) = @_;
94	my $handle = $self->{'output_handle'};
95
96	if ($doc_obj eq 'classifications') {
97	# output classifications if all books have been processed
98	foreach $key (keys %$saved_classifications) {
99	$saved_classifications->{$key}->{'contains'} = undef if
100	$saved_classifications->{$key}->{'contains'} eq "";
101	$saved_classifications->{$key}->{'parent'} = undef if
102	(!defined $saved_classifications->{$key}->{'parent'}) \|\|
103	($saved_classifications->{$key}->{'parent'} eq "");
104	$self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
105	undef, undef, undef, undef,
106	$saved_classifications->{$key}->{'contains'}, undef,
107	$saved_classifications->{$key}->{'parent'}, undef, undef);
108	}
109	return;
110	}
111
112	my $doctype = $doc_obj->get_doc_type();
113
114	# only output this document if it is one to be indexed
115	return if ($doctype ne "indexed_doc" &&
116	$doctype ne "classification");
117
118	# found classification document
119	$saved_classifications = {} if ($doctype eq "classification");
120	$sectionmap = {};
121
122	# this is another document
123	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
124
125	my $section = $doc_obj->get_top_section();
126	while (defined $section) {
127	# update a few statistics
128	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
129	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
130
131	my $title = $doc_obj->get_metadata_element($section, "Title");
132	my $source = $doc_obj->get_metadata_element($section, "Source");
133	my $date = $doc_obj->get_metadata_element($section, "Date");
134	my $jobnumber = $doc_obj->get_source_filename();
135
136	my $mapped_section = $self->map_section($doctype, $section);
137	if ($doctype eq "classification") {
138	$mapped_section = $self->char_classification($mapped_section);
139	} else {
140	$mapped_section = "B.$self->{'num_docs'}.$mapped_section";
141	}
142	$mapped_section =~ s/\.+$//;
143
144	my ($parent, $classification, $creator, $OID);
145
146	if ($section ne $doc_obj->get_top_section()) {
147	$parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
148	if ($doctype eq "classification") {
149	$parent = $self->char_classification($parent);
150	} else {
151	$parent = "B.$self->{'num_docs'}.$parent";
152	}
153	$parent =~ s/\.+$//;
154	} else {
155	$classification = $self->get_classifications($doc_obj, $section, $mapped_section)
156	unless $doctype eq "classification";
157	$creator = $doc_obj->get_metadata_element($section, "Creator");
158	$OID = $doc_obj->get_OID();
159	# this is a hack at getting OID to look like the directory path - I'm sure there's
160	# a better way to do it but it's late... Stefan
161	my @OIDchars = split //, $OID;
162	$OID = "";
163	my $count = 0;
164	foreach $i (@OIDchars) {
165	if ($count == 7) {
166	$OID .= "$i/";
167	$count = 0;
168	} else {
169	$OID .= $i;
170	$count ++;
171	}
172	}
173	$OID =~ s/\/$//;
174	$OID .= ".dir";
175	}
176
177	if ($doc_obj->is_leaf_section($section)) {
178	if ($doctype eq "classification") {
179	if (defined $saved_classifications->{$mapped_section}) {
180	print STDERR "mgbuildproc:warning: classification $mapped_section " .
181	"declared multiple times\n";
182	} else {
183	$saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
184	'parent' => $parent};
185	}
186	} else {
187	$self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
188	$self->{'num_sections'}, $parent, $classification, $OID);
189	}
190	} else {
191
192	# add the introductory section if it exists
193	my $contains = "";
194	if ($doc_obj->get_text_length($section) > 0) {
195	$contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
196	}
197
198	# add the rest of the children
199	my @children = @{$doc_obj->get_children($section)};
200	if ($doctype eq "classification") {
201	map {$_ = $self->char_classification($_);} @children;
202	if (defined $saved_classifications->{$mapped_section}) {
203	print STDERR "mgbuildproc:warning: classification $mapped_section " .
204	"declared multiple times\n";
205	} else {
206	$saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
207	$saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
208	$saved_classifications->{$mapped_section}->{'parent'} = $parent;
209	}
210	} else {
211	map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
212	$contains .= ":" if $contains ne "";
213	$contains .= join ":", @children;
214	$self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
215	$date, $jobnumber, $contains, $self->{'num_sections'},
216	$parent, $classification, $OID);
217
218	if ($doc_obj->get_text_length($section) > 0) {
219	my $intromapsection = "B.$self->{'num_docs'}." .
220	$self->map_section($doctype, "$section.0");
221	$self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
222	$source, $date, $jobnumber, undef, $self->{'num_sections'},
223	$mapped_section, undef, $OID);
224	}
225	}
226	}
227	$section = $doc_obj->get_next_section($section);
228	}
229
230	# update classification list with those books that
231	# were processed before classification list
232	if ($doctype eq "classification" && defined $temp_classifications) {
233	foreach $key (keys(%$temp_classifications)) {
234	if (!defined $saved_classifications->{$key}) {
235	print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
236	"undefined classification $key\n";
237	}
238	$saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
239	}
240	$temp_classifications = undef;
241	}
242	}
243
244
245	sub write_to_gdbm {
246	my $self = shift (@_);
247	my ($handle, $section, $title, $creator, $source, $date,
248	$jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
249
250	print $handle "[$section]\n";
251	print $handle "<t>$title\n" if (defined $title && $title ne "");
252	print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
253	print $handle "<s>$source\n" if (defined $source && $source ne "");
254	print $handle "<i>$date\n" if (defined $date && $date ne "");
255	print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
256	print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
257	print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
258	print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
259	print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
260	print $handle "<o>$OID\n" if defined $OID;
261	print $handle '-' x 70, "\n";
262
263	if (defined $docnum) {
264	print $handle "[$docnum]\n";
265	print $handle "<x>$section\n";
266	print $handle '-' x 70, "\n";
267	}
268	}
269
270	sub get_classifications {
271	my $self = shift (@_);
272	my ($doc_obj, $section, $mapped_section) = @_;
273
274	my ($classificationsref);
275	if (defined $saved_classifications) {
276	# classification list has been processed
277	$classificationsref = $saved_classifications;
278	} else {
279	# classification list has yet to be processed, save
280	# books in temp_classifications until list is processed
281	# and they can be moved into saved_classifications
282	$temp_classifications = {} unless defined $temp_classifications;
283	$classificationsref = $temp_classifications;
284	}
285
286	my $classifications = $doc_obj->get_metadata($section, "Subject");
287
288	# need to save which books belong in each classification
289	# to output later
290	foreach $classification (@$classifications) {
291
292	if (!defined $classificationsref->{$classification}) {
293	$classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
294	$classificationsref->{$classification}->{'contains'} = "";
295	}
296	$classificationsref->{$classification}->{'contains'} .= ":" unless
297	$classificationsref->{$classification}->{'contains'} eq "";
298	$classificationsref->{$classification}->{'contains'} .= $mapped_section;
299	}
300
301	return (join ":", @$classifications);
302	}
303
304	sub map_section {
305	my $self = shift (@_);
306	my ($doctype, $section) = @_;
307
308	# classifications should never need to be mapped
309	return $section if $doctype eq "classification";
310
311	return "" unless (defined $section) && ($section ne "");
312
313	$sectionmap = {} unless defined $sectionmap;
314
315
316
317	# get the section into a standard format
318	$section =~ s/^\.+\|\.+$//g;
319
320	# return the mapped section if it has been seen before
321	if (defined $sectionmap->{$section}) {
322	return $sectionmap->{$section};
323	}
324
325	# find out the parent section
326	my ($parentsection, $num);
327	if ($section =~ /^(.+)\.(\d+)$/) {
328	$parentsection = $1;
329	$num = $2;
330	} elsif ($section =~ /^(\d+)$/) {
331	$parentsection = "";
332	$num = $1;
333	} else {
334	print STDERR "mgbuildproc:map_section - misformed section $section\n";
335	}
336	$parentsection = "" unless defined $parentsection;
337
338	if ($parentsection eq "") {
339	$num ++ if $num == 1 and defined $sectionmap->{'0'};
340	} else {
341	$num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
342	}
343
344	# find out the mapped parent section
345	my $mappedparentsection = $self->map_section ($doctype, $parentsection);
346
347	# find the next unused child section
348	my $previousnum = $num - 1;
349	my $previoussection = "";
350	if ($parentsection eq "") {
351	$previoussection = $previousnum;
352	$previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
353	} else{
354	$previoussection = "$parentsection.$previousnum";
355	$previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
356	}
357	while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
358	$previousnum--;
359	$previoussection = "$parentsection.$previousnum";
360	$previoussection = $previousnum if $parentsection eq "";
361	}
362
363	# there has been no children under this parent, this section will be number 1
364	if ($previousnum <= 0) {
365	if ($mappedparentsection eq "") {
366	$sectionmap->{$section} = "1";
367	} else {
368	$sectionmap->{$section} = "$mappedparentsection.1";
369	}
370
371	} else {
372	# get the previous mapped number
373	my $previousmapnum = 0;
374	if ($sectionmap->{$previoussection} =~ /(^\|\.)?(\d+)$/) {
375	$previousmapnum = $2;
376	}
377
378	# increment it to get this mapped child
379	my $mappednum = $previousmapnum+1;
380
381	if ($mappedparentsection eq "") {
382	$sectionmap->{$section} = $mappednum;
383	} else {
384	$sectionmap->{$section} = "$mappedparentsection.$mappednum";
385	}
386	}
387
388	return $sectionmap->{$section};
389	}
390
391	sub text {
392	my $self = shift (@_);
393	my ($doc_obj) = @_;
394	my $handle = $self->{'output_handle'};
395	my $indexed_doc = 1;
396
397	# only output this document if it is one to be indexed
398	return if ($doc_obj->get_doc_type() ne "indexed_doc");
399
400	# see if this document belongs to this subcollection
401	foreach $indexexp (@{$self->{'indexexparr'}}) {
402	$indexed_doc = 0;
403	my ($field, $exp, $options) = split /\//, $indexexp;
404	if (defined ($field) && defined ($exp)) {
405	my ($bool) = $field =~ /^(.)/;
406	$field =~ s/^.// if $bool eq '!';
407	if ($field eq "filename") {
408	$field = $doc_obj->get_source_filename();
409	} else {
410	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
411	}
412	next unless defined $field;
413	if ($bool eq '!') {
414	if ($options =~ /^i$/i) {
415	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
416	} else {
417	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
418	}
419	} else {
420	if ($options =~ /^i$/i) {
421	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
422	} else {
423	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
424	}
425	}
426	}
427	}
428
429	# this is another document
430	$self->{'num_docs'} += 1;
431
432	# get the parameters for the output
433	my ($level, $fields) = split (/:/, $self->{'index'});
434	$fields =~ s/\ball\b/Title,Creator,text/;
435	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
436
437	my $doc_section = 0; # just for this document
438	my $text = "";
439	my $text_extra = "";
440
441	# get the text for this document
442	my $section = $doc_obj->get_top_section();
443	while (defined $section) {
444	# update a few statistics
445	$doc_section++;
446	$self->{'num_sections'} += 1;
447
448	if ($indexed_doc) {
449	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
450	foreach $field (split (/,/, $fields)) {
451	# only deal with this field if it doesn't start with top or
452	# this is the first section
453	my $real_field = $field;
454	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
455	my $new_text = "";
456	if ($real_field eq "text") {
457	$new_text = $doc_obj->get_text ($section);
458	$new_text =~ s/[\cB\cC]//g;
459	$new_text =~ s/(<p\b)/\cC$1/gi;
460
461	} else {
462	$new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
463	}
464
465	$text .= "$new_text\cC";
466	}
467	}
468	}
469
470	if ($level eq "document") { $text_extra .= "\cB"; }
471	else { $text .= "\cB"; }
472
473	$section = $doc_obj->get_next_section($section);
474	}
475
476	print $handle "$text$text_extra";
477	}
478
479	# converts leading number in classification back
480	# to letter it represents
481	# i.e 67.2.4 becomes C.2.4
482	sub char_classification {
483	my $self = shift (@_);
484	my ($classification) = @_;
485
486	return $classification if $classification eq "";
487	my ($c) = $classification =~ /^\.?(\d+)/;
488	$c = chr($c);
489	$classification =~ s/^\d+/$c/;
490
491	return $classification;
492	}
493
494	# converts leading letter of a classification into its ascii equivalent
495	# i.e C.2.4 becomes 67.2.4
496	sub int_classification {
497	my $self = shift (@_);
498	my ($classification) = @_;
499	my $c = ord($classification);
500	$classification =~ s/^./$c/;
501
502	return $classification;
503	}
504	1;
505

Note: See TracBrowser for help on using the repository browser.

Download in other formats: