Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 292

Last change on this file since 292 was 292, checked in by rjmcnab, 25 years ago
Added ability to post-process text before it is fed to be indexed or to be compressed.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.5 KB

Line
1	# This document processor outputs a document
2	# for mg to process
3
4
5	package mgbuildproc;
6
7	use classify;
8	use doc;
9	use docproc;
10	use util;
11
12
13	BEGIN {
14	@ISA = ('docproc');
15	}
16
17
18	sub new {
19	my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
20	my $self = new docproc ();
21
22	$self->{'collection'} = $collection;
23	$self->{'source_dir'} = $source_dir;
24	$self->{'build_dir'} = $build_dir;
25	$self->{'verbosity'} = $verbosity;
26	$self->{'classifiers'} = [];
27	$self->{'mode'} = "text";
28	$self->{'index'} = "section:text";
29	$self->{'indexexparr'} = [];
30	$self->{'output_handle'} = "STDOUT";
31	$self->{'num_docs'} = 0;
32	$self->{'num_sections'} = 0;
33	$self->{'num_bytes'} = 0;
34
35	$self->{'indexing_text'} = 0;
36
37	return bless $self, $class;
38	}
39
40	sub reset {
41	my $self = shift (@_);
42
43	$self->{'num_docs'} = 0;
44	$self->{'num_sections'} = 0;
45	$self->{'num_bytes'} = 0;
46	}
47
48	sub get_num_docs {
49	my $self = shift (@_);
50
51	return $self->{'num_docs'};
52	}
53
54	sub get_num_sections {
55	my $self = shift (@_);
56
57	return $self->{'num_sections'};
58	}
59
60	sub get_num_bytes {
61	my $self = shift (@_);
62
63	return $self->{'num_bytes'};
64	}
65
66	sub set_output_handle {
67	my $self = shift (@_);
68	my ($handle) = @_;
69
70	$self->{'output_handle'} = $handle;
71	}
72
73	sub set_mode {
74	my $self = shift (@_);
75	my ($mode) = @_;
76
77	$self->{'mode'} = $mode;
78	}
79
80	sub set_index {
81	my $self = shift (@_);
82	my ($index, $indexexparr) = @_;
83
84	$self->{'index'} = $index;
85	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
86	}
87
88	sub set_classifiers {
89	my $self = shift (@_);
90	my ($classifiers) = @_;
91
92	$self->{'classifiers'} = $classifiers;
93	}
94
95	sub set_indexing_text {
96	my $self = shift (@_);
97	my ($indexing_text) = @_;
98
99	$self->{'indexing_text'} = $indexing_text;
100	}
101
102	sub process {
103	my $self = shift (@_);
104	my $method = $self->{'mode'};
105
106	$self->$method(@_);
107	}
108
109	sub newinfodb {
110	my $self = shift (@_);
111	my ($doc_obj, $filename) = @_;
112	my $handle = $self->{'output_handle'};
113	# $handle = "main::STDOUT";
114
115	# this was used in the old version
116	return if ($doc_obj eq 'classifications');
117
118	my $doctype = $doc_obj->get_doc_type();
119
120	# only output this document if it is one to be indexed
121	return if ($doctype ne "indexed_doc");
122
123	# this is another document
124	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
125
126	my $section = $doc_obj->get_top_section ();
127	my $doc_OID = $doc_obj->get_OID();
128	while (defined $section) {
129	# update a few statistics
130	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
131	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
132
133	# output the section name
134	if ($section eq "") { print $handle "[$doc_OID]\n"; }
135	else { print $handle "[$doc_OID.$section]\n"; }
136
137	# output the fact that this document is a document
138	print $handle "<doctype>doc\n";
139
140	# output whether this node contains text
141	if ($doc_obj->get_text_length($section) > 0) {
142	print $handle "<hastxt>1\n";
143	} else {
144	print $handle "<hastxt>0\n";
145	}
146
147	# output all the section metadata
148	my $metadata = $doc_obj->get_all_metadata ($section);
149	foreach $pair (@$metadata) {
150	my ($field, $value) = (@$pair);
151
152	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
153	defined $value && $value ne "") {
154	# escape problematic stuff
155	$value =~ s/\\/\\\\/g;
156	$value =~ s/\n/\\n/g;
157	$value =~ s/\r/\\r/g;
158
159	print $handle "<$field>$value\n";
160	}
161	}
162
163	# output archivedir if at top level
164	if ($section eq $doc_obj->get_top_section()) {
165	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
166	$archivedir = "" unless defined $archivedir;
167	$archivedir =~ s/^(\/\|\\)*//;
168	$archivedir =~ s/(\/\|\\)*$//;
169	print $handle "<archivedir>$archivedir\n";
170	}
171
172	# output a list of children
173	my $children = $doc_obj->get_children ($section);
174	if (scalar(@$children) > 0) {
175	print $handle "<contains>";
176	my $firstchild = 1;
177	foreach $child (@$children) {
178	print $handle ";" unless $firstchild;
179	$firstchild = 0;
180	if ($child =~ /^.*?\.(\d+)$/) {
181	print $handle "\".$1";
182	} else {
183	print $handle "\".$child";
184	}
185	# if ($child eq "") { print $handle "$doc_OID"; }
186	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
187	# else { print $handle "$doc_OID.$section.$child"; }
188	}
189	print $handle "\n";
190	}
191
192	# output the matching document number
193	print $handle "<docnum>$self->{'num_sections'}\n";
194
195	print $handle '-' x 70, "\n";
196
197
198	# output a database entry for the document number
199	print $handle "[$self->{'num_sections'}]\n";
200	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
201	else { print $handle "<section>$doc_OID.$section\n"; }
202	print $handle '-' x 70, "\n";
203
204
205	$section = $doc_obj->get_next_section($section);
206	}
207
208	# classify this document
209	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
210
211	}
212
213
214
215	sub infodb {
216	my $self = shift (@_);
217	my ($doc_obj, $filename) = @_;
218	my $handle = $self->{'output_handle'};
219
220	if ($doc_obj eq 'classifications') {
221	# output classifications if all books have been processed
222	foreach $key (keys %$saved_classifications) {
223	$saved_classifications->{$key}->{'contains'} = undef if
224	$saved_classifications->{$key}->{'contains'} eq "";
225	$saved_classifications->{$key}->{'parent'} = undef if
226	(!defined $saved_classifications->{$key}->{'parent'}) \|\|
227	($saved_classifications->{$key}->{'parent'} eq "");
228	$self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
229	undef, undef, undef, undef,
230	$saved_classifications->{$key}->{'contains'}, undef,
231	$saved_classifications->{$key}->{'parent'}, undef, undef);
232	}
233	return;
234	}
235
236	my $doctype = $doc_obj->get_doc_type();
237
238	# only output this document if it is one to be indexed
239	return if ($doctype ne "indexed_doc" &&
240	$doctype ne "classification");
241
242	# found classification document
243	$saved_classifications = {} if ($doctype eq "classification");
244	$sectionmap = {};
245
246	# this is another document
247	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
248
249	my $section = $doc_obj->get_top_section();
250	while (defined $section) {
251	# update a few statistics
252	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
253	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
254
255	my $title = $doc_obj->get_metadata_element($section, "Title");
256	my $source = $doc_obj->get_metadata_element($section, "Source");
257	my $date = $doc_obj->get_metadata_element($section, "Date");
258	my $jobnumber = $doc_obj->get_source_filename();
259
260	my $mapped_section = $self->map_section($doctype, $section);
261	if ($doctype eq "classification") {
262	$mapped_section = $self->char_classification($mapped_section);
263	} else {
264	$mapped_section = "B.$self->{'num_docs'}.$mapped_section";
265	}
266	$mapped_section =~ s/\.+$//;
267
268	my ($parent, $classification, $creator);
269
270	$classification = $self->get_classifications($doc_obj, $section, $mapped_section)
271	unless $doctype eq "classification";
272
273	if ($section ne $doc_obj->get_top_section()) {
274	$parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
275	if ($doctype eq "classification") {
276	$parent = $self->char_classification($parent);
277	} else {
278	$parent = "B.$self->{'num_docs'}.$parent";
279	}
280	$parent =~ s/\.+$//;
281	} else {
282	$creator = $doc_obj->get_metadata_element($section, "Creator");
283
284	# need filename so we know what directory to look in for associated files
285	$filename =~ s/^\/?(.?\.dir).$/$1/ if (defined $filename);
286	}
287
288	if ($doc_obj->is_leaf_section($section)) {
289	if ($doctype eq "classification") {
290	if (defined $saved_classifications->{$mapped_section}) {
291	print STDERR "mgbuildproc:warning: classification $mapped_section " .
292	"declared multiple times\n";
293	} else {
294	$saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
295	'parent' => $parent};
296	}
297	} else {
298	$self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
299	$self->{'num_sections'}, $parent, $classification, $filename);
300	}
301	} else {
302
303	# add the introductory section if it exists
304	my $contains = "";
305	if ($doc_obj->get_text_length($section) > 0) {
306	$contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
307	}
308
309	# add the rest of the children
310	my @children = @{$doc_obj->get_children($section)};
311	if ($doctype eq "classification") {
312	map {$_ = $self->char_classification($_);} @children;
313	if (defined $saved_classifications->{$mapped_section}) {
314	print STDERR "mgbuildproc:warning: classification $mapped_section " .
315	"declared multiple times\n";
316	} else {
317	$saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
318	$saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
319	$saved_classifications->{$mapped_section}->{'parent'} = $parent;
320	}
321	} else {
322	map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
323	$contains .= ":" if $contains ne "";
324	$contains .= join ":", @children;
325	$self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
326	$date, $jobnumber, $contains, $self->{'num_sections'},
327	$parent, $classification, $filename);
328
329	if ($doc_obj->get_text_length($section) > 0) {
330	my $intromapsection = "B.$self->{'num_docs'}." .
331	$self->map_section($doctype, "$section.0");
332	$self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
333	$source, $date, $jobnumber, undef, $self->{'num_sections'},
334	$mapped_section, $classification, $filename);
335	}
336	}
337	}
338	$section = $doc_obj->get_next_section($section);
339	}
340
341	# update classification list with those books that
342	# were processed before classification list
343	if ($doctype eq "classification" && defined $temp_classifications) {
344	foreach $key (keys(%$temp_classifications)) {
345	if (!defined $saved_classifications->{$key}) {
346	print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
347	"undefined classification $key\n";
348	}
349	$saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
350	}
351	$temp_classifications = undef;
352	}
353	}
354
355
356	sub write_to_gdbm {
357	my $self = shift (@_);
358	my ($handle, $section, $title, $creator, $source, $date,
359	$jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
360
361	print $handle "[$section]\n";
362	print $handle "<t>$title\n" if (defined $title && $title ne "");
363	print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
364	print $handle "<s>$source\n" if (defined $source && $source ne "");
365	print $handle "<i>$date\n" if (defined $date && $date ne "");
366	print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
367	print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
368	print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
369	print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
370	print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
371	print $handle "<o>$OID\n" if defined $OID;
372	print $handle '-' x 70, "\n";
373
374	if (defined $docnum) {
375	print $handle "[$docnum]\n";
376	print $handle "<x>$section\n";
377	print $handle '-' x 70, "\n";
378	}
379	}
380
381	sub get_classifications {
382	my $self = shift (@_);
383	my ($doc_obj, $section, $mapped_section) = @_;
384
385	my ($classificationsref);
386	if (defined $saved_classifications) {
387	# classification list has been processed
388	$classificationsref = $saved_classifications;
389	} else {
390	# classification list has yet to be processed, save
391	# books in temp_classifications until list is processed
392	# and they can be moved into saved_classifications
393	$temp_classifications = {} unless defined $temp_classifications;
394	$classificationsref = $temp_classifications;
395	}
396
397	my $classifications = $doc_obj->get_metadata($section, "Subject");
398
399	# need to save which books belong in each classification
400	# to output later
401	foreach $classification (@$classifications) {
402
403	if (!defined $classificationsref->{$classification}) {
404	$classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
405	$classificationsref->{$classification}->{'contains'} = "";
406	}
407	$classificationsref->{$classification}->{'contains'} .= ":" unless
408	$classificationsref->{$classification}->{'contains'} eq "";
409	$classificationsref->{$classification}->{'contains'} .= $mapped_section;
410	}
411
412	return (join ":", @$classifications);
413	}
414
415	sub map_section {
416	my $self = shift (@_);
417	my ($doctype, $section) = @_;
418
419	# classifications should never need to be mapped
420	return $section if $doctype eq "classification";
421
422	return "" unless (defined $section) && ($section ne "");
423
424	$sectionmap = {} unless defined $sectionmap;
425
426
427
428	# get the section into a standard format
429	$section =~ s/^\.+\|\.+$//g;
430
431	# return the mapped section if it has been seen before
432	if (defined $sectionmap->{$section}) {
433	return $sectionmap->{$section};
434	}
435
436	# find out the parent section
437	my ($parentsection, $num);
438	if ($section =~ /^(.+)\.(\d+)$/) {
439	$parentsection = $1;
440	$num = $2;
441	} elsif ($section =~ /^(\d+)$/) {
442	$parentsection = "";
443	$num = $1;
444	} else {
445	print STDERR "mgbuildproc:map_section - misformed section $section\n";
446	}
447	$parentsection = "" unless defined $parentsection;
448
449	if ($parentsection eq "") {
450	$num ++ if $num == 1 and defined $sectionmap->{'0'};
451	} else {
452	$num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
453	}
454
455	# find out the mapped parent section
456	my $mappedparentsection = $self->map_section ($doctype, $parentsection);
457
458	# find the next unused child section
459	my $previousnum = $num - 1;
460	my $previoussection = "";
461	if ($parentsection eq "") {
462	$previoussection = $previousnum;
463	$previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
464	} else{
465	$previoussection = "$parentsection.$previousnum";
466	$previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
467	}
468	while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
469	$previousnum--;
470	$previoussection = "$parentsection.$previousnum";
471	$previoussection = $previousnum if $parentsection eq "";
472	}
473
474	# there has been no children under this parent, this section will be number 1
475	if ($previousnum <= 0) {
476	if ($mappedparentsection eq "") {
477	$sectionmap->{$section} = "1";
478	} else {
479	$sectionmap->{$section} = "$mappedparentsection.1";
480	}
481
482	} else {
483	# get the previous mapped number
484	my $previousmapnum = 0;
485	if ($sectionmap->{$previoussection} =~ /(^\|\.)?(\d+)$/) {
486	$previousmapnum = $2;
487	}
488
489	# increment it to get this mapped child
490	my $mappednum = $previousmapnum+1;
491
492	if ($mappedparentsection eq "") {
493	$sectionmap->{$section} = $mappednum;
494	} else {
495	$sectionmap->{$section} = "$mappedparentsection.$mappednum";
496	}
497	}
498
499	return $sectionmap->{$section};
500	}
501
502	sub find_paragraphs {
503	$_[1] =~ s/(<p\b)/\cC$1/gi;
504	}
505
506	sub filter_text {
507	# $self->filter_text ($field, $new_text);
508	# don't want to do anything for this version, however,
509	# in a particular collection you might want to override
510	# this method to post-process certain fields depending on
511	# the field, or whether we are outputting it for indexing
512	}
513
514	sub text {
515	my $self = shift (@_);
516	my ($doc_obj) = @_;
517	my $handle = $self->{'output_handle'};
518	my $indexed_doc = 1;
519
520	# only output this document if it is one to be indexed
521	return if ($doc_obj->get_doc_type() ne "indexed_doc");
522
523	# see if this document belongs to this subcollection
524	foreach $indexexp (@{$self->{'indexexparr'}}) {
525	$indexed_doc = 0;
526	my ($field, $exp, $options) = split /\//, $indexexp;
527	if (defined ($field) && defined ($exp)) {
528	my ($bool) = $field =~ /^(.)/;
529	$field =~ s/^.// if $bool eq '!';
530	if ($field eq "filename") {
531	$field = $doc_obj->get_source_filename();
532	} else {
533	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
534	}
535	next unless defined $field;
536	if ($bool eq '!') {
537	if ($options =~ /^i$/i) {
538	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
539	} else {
540	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
541	}
542	} else {
543	if ($options =~ /^i$/i) {
544	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
545	} else {
546	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
547	}
548	}
549	}
550	}
551
552	# this is another document
553	$self->{'num_docs'} += 1;
554
555	# get the parameters for the output
556	my ($level, $fields) = split (/:/, $self->{'index'});
557	$fields =~ s/\ball\b/Title,Creator,text/;
558	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
559
560	my $doc_section = 0; # just for this document
561	my $text = "";
562	my $text_extra = "";
563
564	# get the text for this document
565	my $section = $doc_obj->get_top_section();
566	while (defined $section) {
567	# update a few statistics
568	$doc_section++;
569	$self->{'num_sections'} += 1;
570
571	if ($indexed_doc) {
572	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
573	foreach $field (split (/,/, $fields)) {
574	# only deal with this field if it doesn't start with top or
575	# this is the first section
576	my $real_field = $field;
577	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
578	my $new_text = "";
579	if ($real_field eq "text") {
580	$new_text = $doc_obj->get_text ($section);
581	$new_text =~ s/[\cB\cC]//g;
582	$self->find_paragraphs($new_text);
583
584	} else {
585	$new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
586	}
587
588	# filter the text
589	$self->filter_text ($field, $new_text);
590
591	if ($self->{'indexing_text'} &&
592	$new_text =~ /[\(\)\{\}]/) {
593	print "arrgh: $new_text\n";
594	}
595
596	$text .= "$new_text\cC";
597	}
598	}
599	}
600
601	if ($level eq "document") { $text_extra .= "\cB"; }
602	else { $text .= "\cB"; }
603
604	$section = $doc_obj->get_next_section($section);
605	}
606
607	print $handle "$text$text_extra";
608	}
609
610	# converts leading number in classification back
611	# to letter it represents
612	# i.e 67.2.4 becomes C.2.4
613	sub char_classification {
614	my $self = shift (@_);
615	my ($classification) = @_;
616
617	return $classification if $classification eq "";
618	my ($c) = $classification =~ /^\.?(\d+)/;
619	$c = chr($c);
620	$classification =~ s/^\d+/$c/;
621
622	return $classification;
623	}
624
625	# converts leading letter of a classification into its ascii equivalent
626	# i.e C.2.4 becomes 67.2.4
627	sub int_classification {
628	my $self = shift (@_);
629	my ($classification) = @_;
630	my $c = ord($classification);
631	$classification =~ s/^./$c/;
632
633	return $classification;
634	}
635	1;
636

Note: See TracBrowser for help on using the repository browser.

Download in other formats: