source: main/trunk/greenstone2/runtime-src/src/recpt/summarise.cpp@ 22923

Last change on this file since 22923 was 22923, checked in by davidb, 14 years ago

Main change is to use reverse iterator (see comment in code), but also added a bit more white space, and { } on if-statements etc.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
Line 
1/**********************************************************************
2 *
3 * summarise.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26/* The function 'summarise' produces, given a document text and a query,
27 * a (query-biased) summary. In the future, several types of summaries will
28 * be supported.
29 */
30
31#include "summarise.h"
32#include "unitool.h"
33#include <string.h>
34
35#include <iostream>
36using namespace std;
37
38/* **************** LOCAL PROTOTYPES **************** */
39
40text_t summarise_startend(text_t &htmlstr, int summaryLength);
41text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
42
43text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
44text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
45bool paragraph_tag(text_t::iterator start);
46
47
48/****************************************************
49 NAME: summarise
50 DESC: produce a summary for a document
51*****************************************************/
52
53text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
54 // return summarise_startend(htmlstr,summaryLength);
55 return summarise_keywords(htmlstr,query,summaryLength);
56}
57
58
59/****************************************************
60 NAME: summarise_startend
61 DESC: return first and last sentences of a document
62*****************************************************/
63
64text_t summarise_startend(text_t &htmlstr, int summaryLength) {
65 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
66 text_t answer;
67
68 // add first sentences up to half the summary length
69
70 text_t::iterator str_current = str_start;
71 while(str_current<str_end && answer.size()<(summaryLength/2)) {
72 text_t sentence = next_sentence(str_current,str_end);
73 answer.append(sentence);
74 }
75
76 summaryLength -= answer.size(); // summary length left for last sentences
77 if(summaryLength<0)
78 summaryLength = 0;
79
80 str_end = str_current;
81 str_current = htmlstr.end()-1;
82 text_t lastSentence;
83 while(str_current>str_end &&
84 lastSentence.size()<summaryLength) {
85 text_t sentence = previous_sentence(str_current,str_end);
86 lastSentence = sentence + lastSentence;
87 }
88
89 answer += " ... ";
90 answer += lastSentence;
91
92 return answer;
93}
94
95
96/****************************************************
97 NAME: summarise_keywords
98 DESC: build a summary with sentences containing keyword(s).
99 the sentences with most matching keywords are returned
100 first.
101*****************************************************/
102
103text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength)
104{
105
106 if ((query.size()==0) || (htmlstr.size()==0)) {
107 return "";
108 }
109
110 text_tarray allterms, terms;
111 splitchar(query.begin(),query.end(),' ',allterms);
112
113 // consider only non-empty terms
114 for (text_tarray::iterator term = allterms.begin();
115 term < allterms.end(); ++term) {
116 if (!(*term).empty())
117 terms.push_back(*term);
118 }
119
120 if (terms.size()==0) {
121 return "";
122 }
123
124 text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
125
126 //text_tarray::iterator terms_current = terms_start;
127 //strstr("merde",*terms_current);
128
129 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
130
131 vector<text_tarray> answers(terms.size());
132 // an array of array of sentences for the summary:
133 // answers[0] contains sentences with 1 keyword
134 // answers[1] contains sentences with 2 keywords, etc.
135
136 vector<int> answersSize(terms.size());
137 // answersSize[0] is the combined size of sentences with 1 keyword, etc.
138 for(vector<int>::iterator size = answersSize.begin();
139 size<answersSize.end(); ++size) {
140 *size = 0; // initialise sentence size
141 }
142
143 int totfound = 0;
144 text_t::iterator str_current = str_start;
145 while (str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
146 // if the size of best sentences is greater than summary, that's enough!
147 text_t sentence = next_sentence(str_current,str_end);
148
149 text_tarray::iterator terms_current = terms_start;
150 int nFound = 0;
151 while (terms_current!=terms_end) {
152 text_t::iterator word = findword(sentence.begin(),sentence.end(),
153 *terms_current);
154 if (word!=sentence.end()) {
155 ++nFound;
156 ++totfound;
157 }
158 ++terms_current;
159 }
160
161 if (nFound>0 && answersSize[nFound-1]<summaryLength) {
162 answers[nFound-1].push_back(sentence);
163 answersSize[nFound-1] += sentence.size();
164 }
165 }
166
167 text_t answer;
168
169 // Changed to using reverse iterator, as there is some concern as to
170 // whether the operations encoded with the usual iterator -- e.g.
171 // answers.end()-1 and so forth -- are safe. Certainly the code
172 // works out tidier using the reverse iterator and the segmentation
173 // fault that was occurring in this block went away
174
175 for (vector<text_tarray>::reverse_iterator sentarray = answers.rbegin();
176 sentarray<answers.rend(); ++sentarray) {
177 for (text_tarray::iterator sentence = (*sentarray).begin();
178 sentence < (*sentarray).end(); ++sentence) {
179 answer.append(*sentence);
180
181 if(answer.size()>=summaryLength) {
182 return answer;
183 }
184 }
185 }
186
187 if (!answer.empty()) {
188 return answer;
189 }
190
191 return summarise_startend(htmlstr,summaryLength);
192}
193
194
195/* *********************** LOCAL FUNCTIONS ******************* */
196
197/* NAME: next_sentence
198 DESC: returns next sentence, text-only (ie. HTML markup is removed)
199 */
200
201text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
202 text_t sentence; // the sentence to be returned
203 bool foundPunctuation = false; // set to true by '.', '!' or '?'
204 while(start<end && !foundPunctuation) {
205 switch (*start) {
206 case '<': // skip over rest of html tag
207 if(paragraph_tag(start) && has_unicode_letdig(sentence))
208 foundPunctuation = true;
209 while ((start<end) && (*start!='>'))
210 ++start;
211 if(start<end) ++start;
212 break;
213 case '.':
214 case '!':
215 case '?':
216 sentence.push_back(*start);
217 ++start;
218 if(start>=end ||
219 (is_unicode_space(*start) && (*(start-2)<'A' || *(start-2)>'Z'))) {
220 foundPunctuation = true;
221 }
222 break;
223 default:
224 sentence.push_back(*start);
225 ++start;
226 break;
227 }
228 }
229 return sentence;
230}
231
232
233/* NAME: previous_sentence
234 DESC: returns previous sentence, text-only (ie. HTML markup is removed)
235 */
236
237text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
238 text_t sentence; // the sentence to be returned
239 bool found1stPunctuation = false, // set to true by '.', '!' or '?'
240 // first punct. is included in results
241 found2ndPunctuation = false; // second punct. is stop condition,
242 // and is not included in results
243 while(start>end && !found2ndPunctuation) {
244 switch (*start) {
245 case '>': // skip over rest of html tag
246 while ((start>end) && (*start!='<')) // backtrack to beginning of tag
247 --start;
248 if(start>end) {
249 if(paragraph_tag(start) && has_unicode_letdig(sentence))
250 found2ndPunctuation = true;
251 --start;
252 }
253 break;
254 case '.':
255 case '!':
256 case '?':
257 if(!is_unicode_space(*(start+1)) ||
258 (start-1>end && *(start-1)>='A' && *(start-1)<='Z')) {
259 // if next character is not a blank, or preceding character is
260 // a capital letter, we guess it's an acronym (e.g. "U.S.A.")
261 sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
262 start,start+1);
263 --start;
264 } else
265 if(has_unicode_letdig(sentence) || found1stPunctuation)
266 found2ndPunctuation = true;
267 else {
268 sentence.text_as_usvector().insert(
269 sentence.text_as_usvector().begin(),start,start+1);
270 --start;
271 found1stPunctuation = true;
272 }
273 break;
274 default:
275 sentence.text_as_usvector().insert(
276 sentence.text_as_usvector().begin(),start,start+1);
277 --start;
278 break;
279 }
280 }
281 return sentence;
282}
283
284
285// start is positioned on the '<'
286bool paragraph_tag(text_t::iterator start) {
287 if(*start=='<') {
288 ++start;
289 if(*start=='p' || *start=='P') {
290 ++start;
291 if(is_unicode_space(*start) || *start=='>')
292 return true;
293 }
294 }
295 return false;
296}
Note: See TracBrowser for help on using the repository browser.