source: main/trunk/greenstone2/runtime-src/src/recpt/summarise.cpp@ 21752

Last change on this file since 21752 was 20805, checked in by davidb, 15 years ago

Extra checks added to summarise_keywords() to ensure that the htmlstr and query are non-trivial (i.e. not empty)

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1/**********************************************************************
2 *
3 * summarise.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26/* The function 'summarise' produces, given a document text and a query,
27 * a (query-biased) summary. In the future, several types of summaries will
28 * be supported.
29 */
30
31#include "summarise.h"
32#include "unitool.h"
33#include <string.h>
34
35
36/* **************** LOCAL PROTOTYPES **************** */
37
38text_t summarise_startend(text_t &htmlstr, int summaryLength);
39text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);
40
41text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
42text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
43bool paragraph_tag(text_t::iterator start);
44
45
46/****************************************************
47 NAME: summarise
48 DESC: produce a summary for a document
49*****************************************************/
50
51text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
52 // return summarise_startend(htmlstr,summaryLength);
53 return summarise_keywords(htmlstr,query,summaryLength);
54}
55
56
57/****************************************************
58 NAME: summarise_startend
59 DESC: return first and last sentences of a document
60*****************************************************/
61
62text_t summarise_startend(text_t &htmlstr, int summaryLength) {
63 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
64 text_t answer;
65
66 // add first sentences up to half the summary length
67
68 text_t::iterator str_current = str_start;
69 while(str_current<str_end && answer.size()<(summaryLength/2)) {
70 text_t sentence = next_sentence(str_current,str_end);
71 answer.append(sentence);
72 }
73
74 summaryLength -= answer.size(); // summary length left for last sentences
75 if(summaryLength<0)
76 summaryLength = 0;
77
78 str_end = str_current;
79 str_current = htmlstr.end()-1;
80 text_t lastSentence;
81 while(str_current>str_end &&
82 lastSentence.size()<summaryLength) {
83 text_t sentence = previous_sentence(str_current,str_end);
84 lastSentence = sentence + lastSentence;
85 }
86
87 answer += " ... ";
88 answer += lastSentence;
89
90 return answer;
91}
92
93
94/****************************************************
95 NAME: summarise_keywords
96 DESC: build a summary with sentences containing keyword(s).
97 the sentences with most matching keywords are returned
98 first.
99*****************************************************/
100
101text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) {
102
103 if ((query.size()==0) || (htmlstr.size()==0)) {
104 return "";
105 }
106
107 text_tarray allterms, terms;
108 splitchar(query.begin(),query.end(),' ',allterms);
109
110 // consider only non-empty terms
111 for(text_tarray::iterator term = allterms.begin();
112 term < allterms.end(); ++term) {
113 if(!(*term).empty())
114 terms.push_back(*term);
115 }
116
117 if (terms.size()==0) {
118 return "";
119 }
120
121 text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
122
123 //text_tarray::iterator terms_current = terms_start;
124 //strstr("merde",*terms_current);
125
126 text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
127
128 vector<text_tarray> answers(terms.size());
129 // an array of array of sentences for the summary:
130 // answers[0] contains sentences with 1 keyword
131 // answers[1] contains sentences with 2 keywords, etc.
132 vector<int> answersSize(terms.size());
133 // answersSize[0] is the combined size of sentences with 1 keyword, etc.
134 for(vector<int>::iterator size = answersSize.begin();
135 size<answersSize.end(); ++size)
136 *size = 0; // initialise sentence size
137
138 int totfound = 0;
139 text_t::iterator str_current = str_start;
140 while(str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
141 // if the size of best sentences is greater than summary, that's enough!
142 text_t sentence = next_sentence(str_current,str_end);
143
144 text_tarray::iterator terms_current = terms_start;
145 int nFound = 0;
146 while(terms_current!=terms_end) {
147 text_t::iterator word = findword(sentence.begin(),sentence.end(),
148 *terms_current);
149 if(word!=sentence.end())
150 { ++nFound; ++totfound; }
151 ++terms_current;
152 }
153
154 if(nFound>0 && answersSize[nFound-1]<summaryLength) {
155 answers[nFound-1].push_back(sentence);
156 answersSize[nFound-1] += sentence.size();
157 }
158 }
159
160 text_t answer;
161 for(vector<text_tarray>::iterator sentarray = answers.end()-1;
162 sentarray>=answers.begin(); --sentarray)
163 for(text_tarray::iterator sentence = (*sentarray).begin();
164 sentence < (*sentarray).end(); ++sentence) {
165 answer.append(*sentence);
166 if(answer.size()>=summaryLength)
167 return answer;
168 }
169
170 if(!answer.empty())
171 return answer;
172
173 return summarise_startend(htmlstr,summaryLength);
174}
175
176
177/* *********************** LOCAL FUNCTIONS ******************* */
178
179/* NAME: next_sentence
180 DESC: returns next sentence, text-only (ie. HTML markup is removed)
181 */
182
183text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
184 text_t sentence; // the sentence to be returned
185 bool foundPunctuation = false; // set to true by '.', '!' or '?'
186 while(start<end && !foundPunctuation) {
187 switch (*start) {
188 case '<': // skip over rest of html tag
189 if(paragraph_tag(start) && has_unicode_letdig(sentence))
190 foundPunctuation = true;
191 while ((start<end) && (*start!='>'))
192 ++start;
193 if(start<end) ++start;
194 break;
195 case '.':
196 case '!':
197 case '?':
198 sentence.push_back(*start);
199 ++start;
200 if(start>=end ||
201 (is_unicode_space(*start) && (*(start-2)<'A' || *(start-2)>'Z'))) {
202 foundPunctuation = true;
203 }
204 break;
205 default:
206 sentence.push_back(*start);
207 ++start;
208 break;
209 }
210 }
211 return sentence;
212}
213
214
215/* NAME: previous_sentence
216 DESC: returns previous sentence, text-only (ie. HTML markup is removed)
217 */
218
219text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
220 text_t sentence; // the sentence to be returned
221 bool found1stPunctuation = false, // set to true by '.', '!' or '?'
222 // first punct. is included in results
223 found2ndPunctuation = false; // second punct. is stop condition,
224 // and is not included in results
225 while(start>end && !found2ndPunctuation) {
226 switch (*start) {
227 case '>': // skip over rest of html tag
228 while ((start>end) && (*start!='<')) // backtrack to beginning of tag
229 --start;
230 if(start>end) {
231 if(paragraph_tag(start) && has_unicode_letdig(sentence))
232 found2ndPunctuation = true;
233 --start;
234 }
235 break;
236 case '.':
237 case '!':
238 case '?':
239 if(!is_unicode_space(*(start+1)) ||
240 (start-1>end && *(start-1)>='A' && *(start-1)<='Z')) {
241 // if next character is not a blank, or preceding character is
242 // a capital letter, we guess it's an acronym (e.g. "U.S.A.")
243 sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
244 start,start+1);
245 --start;
246 } else
247 if(has_unicode_letdig(sentence) || found1stPunctuation)
248 found2ndPunctuation = true;
249 else {
250 sentence.text_as_usvector().insert(
251 sentence.text_as_usvector().begin(),start,start+1);
252 --start;
253 found1stPunctuation = true;
254 }
255 break;
256 default:
257 sentence.text_as_usvector().insert(
258 sentence.text_as_usvector().begin(),start,start+1);
259 --start;
260 break;
261 }
262 }
263 return sentence;
264}
265
266
267// start is positioned on the '<'
268bool paragraph_tag(text_t::iterator start) {
269 if(*start=='<') {
270 ++start;
271 if(*start=='p' || *start=='P') {
272 ++start;
273 if(is_unicode_space(*start) || *start=='>')
274 return true;
275 }
276 }
277 return false;
278}
Note: See TracBrowser for help on using the repository browser.