Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/phind/generate/phproc.pm@ 1562

Last change on this file since 1562 was 1562, checked in by paynter, 24 years ago
Phind phrase browser - code for generating the phrase indexes
Property svn:keywords set to `Author Date Id Revision`
File size: 5.3 KB

Line
1	###########################################################################
2	#
3	# phproc.pm -- the Phind document processor
4	#
5	# Copyright (C) 2000 Gordon Paynter
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# This is the phind document processor object. It is used by the document
28	# reader plugins to extract the clauses from each document.
29
30	package phproc;
31
32	use docproc;
33	use util;
34
# Declare docproc as the parent class. This runs in a BEGIN block so the
# inheritance chain is in place while the rest of the file is compiled.
BEGIN {
    our @ISA = qw(docproc);
}
38
# Construct a phind document processor.
#
# Positional arguments:
#   $class       - class to bless the new object into
#   $archive_dir - stored on the object under 'archive_dir'
#   $phindex_dir - directory in which the "clauses" and "mg-d.txt" output
#                  files are (re)created; also stored under 'phindex_dir'
#   $language    - comma-separated list of language patterns; stored under
#                  'language_exp' as a regex alternation ("en,fr" -> "en|fr")
#   $delimiter   - string written ahead of each document in the clauses file
#   $verbosity   - stored under 'verbosity'
#   $outhandle   - optional handle for diagnostics; defaults to STDERR
#
# Returns the new object. Dies if either output file cannot be opened.
sub new {
    my ($class, $archive_dir, $phindex_dir,
        $language, $delimiter, $verbosity, $outhandle) = @_;

    my $self = docproc->new();

    # $self->{'collection'} = $collection;
    $self->{'archive_dir'} = $archive_dir;
    $self->{'phindex_dir'} = $phindex_dir;

    # Turn the comma-separated list into a regex alternation. The separator
    # must be a bare "|": an escaped pipe would only match a literal "|".
    $language = '' unless defined $language;
    $language =~ tr/,/|/;
    $self->{'language_exp'} = $language;
    $self->{'delimiter'}    = $delimiter;

    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = defined $outhandle ? $outhandle : \*STDERR;

    # Lexical handles are used so that each object owns its own output files;
    # package-global barewords would be shared (and clobbered) by every
    # instance of this class.
    my $clauses_file = "$phindex_dir/clauses";
    util::rm($clauses_file) if (-e $clauses_file);
    open(my $txthandle, '>', $clauses_file)
        or die "Cannot open $clauses_file: $!";
    $self->{'txthandle'} = $txthandle;

    my $docs_file = "$phindex_dir/mg-d.txt";
    util::rm($docs_file) if (-e $docs_file);
    open(my $dochandle, '>', $docs_file)
        or die "Cannot open $docs_file: $!";
    $self->{'dochandle'} = $dochandle;

    return bless $self, $class;
}
70
# Process one document: record it in the document index and append its
# text, converted to clause format, to the clauses file.
#
# Arguments:
#   $self    - the phproc object (reads 'verbosity', 'outhandle',
#              'language_exp', 'dochandle', 'txthandle' and 'delimiter')
#   $doc_obj - the document; must provide get_top_section,
#              get_metadata_element, get_OID and get_text
#
# Documents whose "Language" metadata is set but does not match the
# configured language expression are skipped (nothing is written).
sub process {
    my ($self, $doc_obj) = @_;

    my $verbosity = $self->{'verbosity'} || 0;
    my $outhandle = $self->{'outhandle'};
    $outhandle = \*STDERR unless defined $outhandle;

    my $top_section = $doc_obj->get_top_section();

    # The title ends up on a single tab-separated index line, so make sure
    # it is defined and contains no embedded tabs or newlines.
    my $title = $doc_obj->get_metadata_element($top_section, "Title");
    $title = '' unless defined $title;
    $title =~ s/\s+/ /g;
    print {$outhandle} "process: $title\n" if ($verbosity > 1);

    # Only consider documents in one of the configured languages. An empty
    # expression means "accept everything": it must not be interpolated,
    # because an empty pattern makes Perl reuse the last successful regex.
    my $doclanguage = $doc_obj->get_metadata_element($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    if ($doclanguage && defined $phrlanguage && length $phrlanguage) {
        return if ($doclanguage !~ /(?:$phrlanguage)/i);
    }

    # Record this document; the running count is kept on the object so it
    # survives between calls.
    my $total = ++$self->{'num_processed'};
    print {$outhandle} "file $total: $title\n" if ($verbosity);

    # Store object ID & title in the document index file.
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;

    my $dochandle = $self->{'dochandle'};
    print {$dochandle} "$OID\t$title\n";

    # Store the text, one clause per line, preceded by the delimiter.
    my $text = convert_gml_to_tokens($doc_obj->get_text());

    my $txthandle = $self->{'txthandle'};
    print {$txthandle} $self->{'delimiter'}, "\n$text\n";
}
105
106
107
# Convert a document's GML/HTML text into "clause format": markup removed,
# punctuation normalised, and one clause per line.
#
# Argument: the document text (undef is treated as the empty string).
# Returns:  the converted text as a single string with clauses separated by
#           newlines. Lines holding one word or less are dropped.
#
# The text is processed in a lexical variable, so the caller's $_ is left
# untouched.
sub convert_gml_to_tokens {
    my ($text) = @_;
    $text = '' unless defined $text;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    $text =~ s/\s+/ /g;

    # Remove everything that is in a tag, remembering where paragraph and
    # line breaks were
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /ig;
    $text =~ s/\s<br>\s/ LINEBREAK /ig;
    $text =~ s/<[^>]*>/ /g;

    # Now we have the text, but it may contain HTML elements coded as
    # &lt;tag&gt;. Decode the brackets and strip those tags as well.
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;

    $text =~ s/\s+/ /g;
    $text =~ s/\s<p>\s/ PARAGRAPHBREAK /ig;
    $text =~ s/\s<br>\s/ LINEBREAK /ig;
    $text =~ s/<[^>]*>/ /g;

    # Decode &amp; and any entities it was hiding (e.g. "&amp;lt;"); the
    # second &amp; pass catches doubly-encoded ampersands.
    $text =~ s/&amp;/&/g;
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&amp;/&/g;

    # Replace <p> and <br> placeholders with newlines
    $text =~ s/PARAGRAPHBREAK/\n/g;
    $text =~ s/LINEBREAK/\n/g;


    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # Remove any apostrophe that indicates omitted letters
    $text =~ s/(\w+)\'(\w*\s)/ $1$2 /g;

    # Remove the period that appears in a person's initials
    $text =~ s/\s([A-Z])\./ $1 /g;

    # Replace hyphens in hyphenated words and names with a space
    $text =~ s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;


    # Convert the remaining text to "clause format".
    # This means removing all excess punctuation and garbage text,
    # normalising valid punctuation to clause breaks,
    # then putting one clause on each line.

    # Insert a newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    $text =~ s/\s*[\.\?\!]\s+/\n/g;

    # Split numbers after four digits
    $text =~ s/(\d\d\d\d)/$1 /g;

    # TODO: split words after 32 characters (not implemented)

    # Squash repeated punctuation: a run of the same non-alphanumeric,
    # non-space character collapses to a single occurrence
    $text =~ tr/A-Za-z0-9 //cs;

    # save email addresses
    # $text =~ s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # Normalise clause breaks (mostly punctuation symbols) to commas
    $text =~ s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    $text =~ s/\s*,[, ]+/\n/g;

    # Remove extra whitespace
    $text =~ s/ +/ /g;
    $text =~ s/^\s+//mg;
    $text =~ s/\s*$/\n/mg;

    # Remove lines that contain one word or less
    $text =~ s/^\w*$//mg;
    $text =~ s/^\s*$//mg;
    $text =~ tr/\n//s;

    return $text;
}
195
196
197
198	1;
199

Note: See TracBrowser for help on using the repository browser.

Download in other formats: