Context Navigation

summarise.cpp@ 22923

Last change on this file since 22923 was 22923, checked in by davidb, 14 years ago
Main change is to use reverse iterator (see comment in code), but also added a bit more white space, and { } on if-statements etc.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.1 KB

Line
1	/**********************************************************************
2	*
3	* summarise.cpp --
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	/* The function 'summarise' produces, given a document text and a query,
27	* a (query-biased) summary. In the future, several types of summaries will
28	* be supported.
29	*/
30
31	#include "summarise.h"
32	#include "unitool.h"
33	#include <string.h>
34
35	#include <iostream>
36	using namespace std;
37
38	/* ************** LOCAL PROTOTYPES ************** */
39
40	text_t summarise_startend(text_t &htmlstr, int summaryLength);
41	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
42
43	text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
44	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
45	bool paragraph_tag(text_t::iterator start);
46
47
48	/****************************************************
49	NAME: summarise
50	DESC: produce a summary for a document
51	*****************************************************/
52
53	text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
54	// return summarise_startend(htmlstr,summaryLength);
55	return summarise_keywords(htmlstr,query,summaryLength);
56	}
57
58
59	/****************************************************
60	NAME: summarise_startend
61	DESC: return first and last sentences of a document
62	*****************************************************/
63
64	text_t summarise_startend(text_t &htmlstr, int summaryLength) {
65	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
66	text_t answer;
67
68	// add first sentences up to half the summary length
69
70	text_t::iterator str_current = str_start;
71	while(str_current<str_end && answer.size()<(summaryLength/2)) {
72	text_t sentence = next_sentence(str_current,str_end);
73	answer.append(sentence);
74	}
75
76	summaryLength -= answer.size(); // summary length left for last sentences
77	if(summaryLength<0)
78	summaryLength = 0;
79
80	str_end = str_current;
81	str_current = htmlstr.end()-1;
82	text_t lastSentence;
83	while(str_current>str_end &&
84	lastSentence.size()<summaryLength) {
85	text_t sentence = previous_sentence(str_current,str_end);
86	lastSentence = sentence + lastSentence;
87	}
88
89	answer += " ... ";
90	answer += lastSentence;
91
92	return answer;
93	}
94
95
96	/****************************************************
97	NAME: summarise_keywords
98	DESC: build a summary with sentences containing keyword(s).
99	the sentences with most matching keywords are returned
100	first.
101	*****************************************************/
102
103	text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength)
104	{
105
106	if ((query.size()==0) \|\| (htmlstr.size()==0)) {
107	return "";
108	}
109
110	text_tarray allterms, terms;
111	splitchar(query.begin(),query.end(),' ',allterms);
112
113	// consider only non-empty terms
114	for (text_tarray::iterator term = allterms.begin();
115	term < allterms.end(); ++term) {
116	if (!(*term).empty())
117	terms.push_back(*term);
118	}
119
120	if (terms.size()==0) {
121	return "";
122	}
123
124	text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
125
126	//text_tarray::iterator terms_current = terms_start;
127	//strstr("merde",*terms_current);
128
129	text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
130
131	vector<text_tarray> answers(terms.size());
132	// an array of array of sentences for the summary:
133	// answers[0] contains sentences with 1 keyword
134	// answers[1] contains sentences with 2 keywords, etc.
135
136	vector<int> answersSize(terms.size());
137	// answersSize[0] is the combined size of sentences with 1 keyword, etc.
138	for(vector<int>::iterator size = answersSize.begin();
139	size<answersSize.end(); ++size) {
140	*size = 0; // initialise sentence size
141	}
142
143	int totfound = 0;
144	text_t::iterator str_current = str_start;
145	while (str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
146	// if the size of best sentences is greater than summary, that's enough!
147	text_t sentence = next_sentence(str_current,str_end);
148
149	text_tarray::iterator terms_current = terms_start;
150	int nFound = 0;
151	while (terms_current!=terms_end) {
152	text_t::iterator word = findword(sentence.begin(),sentence.end(),
153	*terms_current);
154	if (word!=sentence.end()) {
155	++nFound;
156	++totfound;
157	}
158	++terms_current;
159	}
160
161	if (nFound>0 && answersSize[nFound-1]<summaryLength) {
162	answers[nFound-1].push_back(sentence);
163	answersSize[nFound-1] += sentence.size();
164	}
165	}
166
167	text_t answer;
168
169	// Changed to using reverse iterator, as there is some concern as to
170	// whether the operations encoded with the usual iterator -- e.g.
171	// answers.end()-1 and so forth -- are safe. Certainly the code
172	// works out tidier using the reverse iterator and the segmentation
173	// fault that was occurring in this block went away
174
175	for (vector<text_tarray>::reverse_iterator sentarray = answers.rbegin();
176	sentarray<answers.rend(); ++sentarray) {
177	for (text_tarray::iterator sentence = (*sentarray).begin();
178	sentence < (*sentarray).end(); ++sentence) {
179	answer.append(*sentence);
180
181	if(answer.size()>=summaryLength) {
182	return answer;
183	}
184	}
185	}
186
187	if (!answer.empty()) {
188	return answer;
189	}
190
191	return summarise_startend(htmlstr,summaryLength);
192	}
193
194
195	/* ********************* LOCAL FUNCTIONS ***************** */
196
197	/* NAME: next_sentence
198	DESC: returns next sentence, text-only (ie. HTML markup is removed)
199	*/
200
201	text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
202	text_t sentence; // the sentence to be returned
203	bool foundPunctuation = false; // set to true by '.', '!' or '?'
204	while(start<end && !foundPunctuation) {
205	switch (*start) {
206	case '<': // skip over rest of html tag
207	if(paragraph_tag(start) && has_unicode_letdig(sentence))
208	foundPunctuation = true;
209	while ((start<end) && (*start!='>'))
210	++start;
211	if(start<end) ++start;
212	break;
213	case '.':
214	case '!':
215	case '?':
216	sentence.push_back(*start);
217	++start;
218	if(start>=end \|\|
219	(is_unicode_space(start) && ((start-2)<'A' \|\| *(start-2)>'Z'))) {
220	foundPunctuation = true;
221	}
222	break;
223	default:
224	sentence.push_back(*start);
225	++start;
226	break;
227	}
228	}
229	return sentence;
230	}
231
232
233	/* NAME: previous_sentence
234	DESC: returns previous sentence, text-only (ie. HTML markup is removed)
235	*/
236
237	text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
238	text_t sentence; // the sentence to be returned
239	bool found1stPunctuation = false, // set to true by '.', '!' or '?'
240	// first punct. is included in results
241	found2ndPunctuation = false; // second punct. is stop condition,
242	// and is not included in results
243	while(start>end && !found2ndPunctuation) {
244	switch (*start) {
245	case '>': // skip over rest of html tag
246	while ((start>end) && (*start!='<')) // backtrack to beginning of tag
247	--start;
248	if(start>end) {
249	if(paragraph_tag(start) && has_unicode_letdig(sentence))
250	found2ndPunctuation = true;
251	--start;
252	}
253	break;
254	case '.':
255	case '!':
256	case '?':
257	if(!is_unicode_space(*(start+1)) \|\|
258	(start-1>end && (start-1)>='A' && (start-1)<='Z')) {
259	// if next character is not a blank, or preceding character is
260	// a capital letter, we guess it's an acronym (e.g. "U.S.A.")
261	sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
262	start,start+1);
263	--start;
264	} else
265	if(has_unicode_letdig(sentence) \|\| found1stPunctuation)
266	found2ndPunctuation = true;
267	else {
268	sentence.text_as_usvector().insert(
269	sentence.text_as_usvector().begin(),start,start+1);
270	--start;
271	found1stPunctuation = true;
272	}
273	break;
274	default:
275	sentence.text_as_usvector().insert(
276	sentence.text_as_usvector().begin(),start,start+1);
277	--start;
278	break;
279	}
280	}
281	return sentence;
282	}
283
284
285	// start is positioned on the '<'
286	bool paragraph_tag(text_t::iterator start) {
287	if(*start=='<') {
288	++start;
289	if(start=='p' \|\| start=='P') {
290	++start;
291	if(is_unicode_space(start) \|\| start=='>')
292	return true;
293	}
294	}
295	return false;
296	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/runtime-src/src/recpt/summarise.cpp@ 22923

Download in other formats: