source: trunk/gsdl/src/phind/generate/phproc.pm@ 1631

Last change on this file since 1631 was 1631, checked in by paynter, 24 years ago

Changed copyrights to include NZDLP.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1###########################################################################
2#
3# phproc.pm -- the Phind document processor
4#
5# Copyright (C) 2000 Gordon W. Paynter
6# Copyright 2000 The New Zealand Digital Library Project
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# This is the phind document processor object. It is used by the document
29# reader plugins to extract the clauses from each document.
30
31package phproc;
32
33use docproc;
34use util;
35
36sub BEGIN {
37 @ISA = ('docproc');
38}
39
40sub new {
41 my ($class, $archive_dir, $phindex_dir, $phindcfg,
42 $language, $delimiter, $verbosity, $outhandle) = @_;
43
44 my $self = new docproc ();
45
46 $self->{'archive_dir'} = $archive_dir;
47 $self->{'phindex_dir'} = $phindex_dir;
48 $self->{'indexes'} = $phindcfg;
49
50 $language =~ s/,/\|/g;
51 $self->{'language_exp'} = $language;
52 $self->{'delimiter'} = $delimiter;
53
54 $self->{'verbosity'} = $verbosity;
55 $self->{'outhandle'} = STDERR;
56 $self->{'outhandle'} = $outhandle if defined $outhandle;
57
58 &util::rm("$phindex_dir/clauses") if (-e "$phindex_dir/clauses");
59 open(TEXT, ">$phindex_dir/clauses")
60 || die "Cannot open $phindex_dir/clauses: $!";
61 $self->{'txthandle'} = TEXT;
62
63 my $docfile = &util::filename_cat("$phindex_dir", "docs.txt");
64 &util::rm($docfile) if (-e $docfile);
65 open(DOCS, ">$docfile")
66 || die "Cannot open $docfile: $!";
67 $self->{'dochandle'} = DOCS;
68
69 return bless $self, $class;
70
71}
72
73sub process {
74 my $self = shift (@_);
75 my ($doc_obj) = @_;
76
77 my $verbosity = $self->{'verbosity'};
78 my $top_section = $doc_obj->get_top_section();
79
80 my $title = $doc_obj->get_metadata_element ($top_section, "Title");
81 print "process: $title\n" if ($verbosity > 2);
82
83
84 # only consider english-language files
85 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language");
86 my $phrlanguage = $self->{'language_exp'};
87 return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));
88
89 # record this file
90 my $total++;
91 print "file $total: $file\n" if ($self->{'$verbosity'});
92
93 # store object ID & title in document index file
94 my $OID = $doc_obj->get_OID();
95 $OID = "NULL" unless defined $OID;
96
97 my $dochandle = $self->{'dochandle'};
98 # print "dochandle: =$dochandle=\n";
99 print $dochandle "<Document>\t$OID\t$title\n";
100
101 # Store the text of this object
102 my $indexlist = $self->{'indexes'};
103 my @parts;
104 my ($index, $part, $level, $field, $section, $data, $text);
105
106 # Output the document delimiter
107 my $txthandle = $self->{'txthandle'};
108 print $txthandle $self->{'delimiter'}, "\n";
109
110 # Iterarate over all the indexes specified in collect.cfg and
111 # add their text to the clauses file.
112 foreach $index (@$indexlist) {
113 $text = "";
114
115 # Iterate over all the feilds in each index
116 @parts = split(/,/, $index);
117 foreach $part (@parts) {
118
119 # Each field has a level and a data element ((e.g. document:Title)
120 ($level, $field) = split(/:/, $part);
121 die unless ($level && $field);
122
123 # Extract the text from every section
124 # (In phind, document:text and section:text are equivalent)
125 if ($field eq "text") {
126 $data = "";
127 $section = $doc_obj->get_top_section();
128 while (defined($section)) {
129 $data .= $doc_obj->get_text($section) . "\n";
130 $section = $doc_obj->get_next_section($section);
131 }
132 $text .= convert_gml_to_tokens($data) . "\n";
133 }
134
135 # Extract a metadata field from a document
136 elsif ($level eq "document") {
137 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
138 $text .= convert_gml_to_tokens($data) . "\n";
139 }
140
141 # Extract metadata from every section in a document
142 elsif ($level eq "section") {
143 $data = "";
144 $section = $doc_obj->get_top_section();
145 while (defined($section)) {
146 $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
147 $section = $doc_obj->get_next_section($section);
148 }
149 $text .= convert_gml_to_tokens($data) . "\n";
150 }
151
152 # Some sort of specification which I don't understand
153 else {
154 die "Unknown level ($level) in phind key ($part) in phind index ($index)\n";
155 }
156
157 }
158
159 # print the text
160 print $txthandle "$text";
161
162 }
163}
164
165
166
167sub convert_gml_to_tokens {
168
169 $_ = shift @_;
170
171 # FIRST, remove GML tags
172
173 # Replace all whitespace with a simple space
174 s/\s+/ /gs;
175
176 # Remove everything that is in a tag
177 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
178 s/\s*<br>\s*/ LINEBREAK /isg;
179 s/<[^>]*>/ /sg;
180
181 # Now we have the text, but it may contain HTML
182 # elements coded as &gt; etc. Remove these tags.
183 s/&lt;/</sg;
184 s/&gt;/>/sg;
185
186 s/\s+/ /sg;
187 s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
188 s/\s*<br>\s*/ LINEBREAK /isg;
189 s/<[^>]*>/ /sg;
190
191 # remove &amp; and other miscellaneous markup tags
192 s/&amp;/&/sg;
193 s/&lt;/</sg;
194 s/&gt;/>/sg;
195 s/&amp;/&/sg;
196
197 # replace<p> and <br> placeholders with carriage returns
198 s/PARAGRAPHBREAK/\n/sg;
199 s/LINEBREAK/\n/sg;
200
201
202 # Exceptional punctuation
203 #
204 # We make special cases of some punctuation
205
206 # remove any apostrophe that indicates omitted letters
207 s/(\w+)\'(\w*\s)/ $1$2 /g;
208
209 # remove period that appears in a person's initals
210 s/\s([A-Z])\./ $1 /g;
211
212 # replace hyphens in hypheanted words and names with a space
213 s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;
214
215
216 # Convert the remaining text to "clause format",
217 # This means removing all excess punctuation and garbage text,
218 # normalising valid punctuation to fullstops and commas,
219 # then putting one cluse on each line.
220
221 # Insert newline when the end of a sentence is detected
222 # (delimter is: "[\.\?\!]\s")
223 s/\s*[\.\?\!]\s+/\n/g;
224
225 # split numbers after four digits
226 s/(\d\d\d\d)/$1 /g;
227
228 # split words after 32 characters
229
230 # squash repeated punctuation
231 tr/A-Za-z0-9 //cs;
232
233 # save email addresses
234 # s/\w+@\w+\.[\w\.]+/EMAIL/g;
235
236 # normalise clause breaks (mostly punctuation symbols) to commas
237 s/[^A-Za-z0-9 \n]+/ , /g;
238
239 # Remove repeated commas, and replace with newline
240 s/\s*,[, ]+/\n/g;
241
242 # remove extra whitespace
243 s/ +/ /sg;
244 s/^\s+//mg;
245 s/\s*$/\n/mg;
246
247 # remove lines that contain one word or less
248 s/^\w*$//mg;
249 s/^\s*$//mg;
250 tr/\n//s;
251
252 return $_;
253}
254
255
256
2571;
258
Note: See TracBrowser for help on using the repository browser.