root/greenstone3/trunk/src/java/org/greenstone/gsdl3/service/GoogleSearch.java.tmp @ 20294

Revision 20294, 7.0 KB (checked in by kjdon, 11 years ago)

changed System.err to logger.error/info

  • Property svn:keywords set to Author Date Id Revision
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6//Google Web Services API classes
7//import com.google.soap.search.GoogleSearch;
8import com.google.soap.search.GoogleSearchFault;
9import com.google.soap.search.GoogleSearchResult;
10import com.google.soap.search.GoogleSearchResultElement;
11import com.google.soap.search.GoogleSearchDirectoryCategory;
12
13// XML classes
14import org.w3c.dom.Element;
15import org.w3c.dom.Document;
16import org.w3c.dom.NodeList;
17
18//Java classes
19import java.util.ArrayList;
20import java.util.HashMap;
21import java.io.File;
22import java.io.InputStream;
23import java.io.BufferedReader;
24import java.io.InputStreamReader;
25import java.io.IOException;
26import java.net.HttpURLConnection;
27import java.net.URLConnection;
28import java.net.URL;
29import java.net.Authenticator;
30import java.net.PasswordAuthentication;
31import java.net.MalformedURLException;
32import java.lang.Object;
33
34import org.apache.log4j.*;
35
36/**
37 *
38 * @author <a href="mailto:chi@cs.waikato.ac.nz">Chi-Yu Huang</a>
39 *
40 */
41
42public class GoogleSearch
43    extends AbstractSearch {
44   
45     static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GoogleSearch.class.getName());
46     
47    //Parameters connect to Proxy Server
48    private boolean using_proxy = false;
49    private String proxy_host = null;
50    private int proxy_port;
51    private String proxy_user = null;
52    private char [] proxy_passwd = null;
53
54    // google key
55    private String client_key = null;
56   
57    public GoogleSearch()
58    {
59    }
60       
61    //Configure GoogleSearch Service
62    public boolean configure(Element info, Element extra_info)
63    {
64    if (!super.configure(info, extra_info)) {
65        return false;
66    }
67    logger.info("Configuring GoogleSearch");
68    Element server_elem = (Element)GSXML.getChildByTagName(info, "googleServer");
69    if (server_elem == null) {
70        logger.error("no googleServer element found");
71        return false;
72    }
73   
74    client_key = server_elem.getAttribute("key");
75    if (client_key.equals("")) {
76        logger.error("no client_key for the googleServer element");
77        return false;
78    }
79   
80    does_paging = true;
81    does_chunking = false;
82
83    // are we behind a proxy??
84    // all the details should have been set up by the Message Router
85    proxy_host = System.getProperty("http.proxyHost");
86    if (proxy_host != null && !proxy_host.equals("")) {
87        using_proxy = true;
88        try {
89        proxy_port = Integer.parseInt(System.getProperty("http.proxyPort").trim());
90        } catch (Exception e) {
91        logger.error("couldn't get proxy port, defaulting to 80");
92        proxy_port = 80;
93        }
94        PasswordAuthentication pa = Authenticator.requestPasswordAuthentication(proxy_host, null, proxy_port, "http", "", null);
95        proxy_user = pa.getUserName();
96        proxy_passwd = pa.getPassword();
97    }
98    return true;
99    }
100   
101    /** Process a text query - implemented by concrete subclasses */
102    protected Element processTextQuery(Element request) {
103   
104    //Connect to Google Web API service
105    com.google.soap.search.GoogleSearch search = new com.google.soap.search.GoogleSearch();
106
107    // Set mandatory attributes
108    search.setKey(client_key);
109
110    // proxy??
111    if (using_proxy) {
112        search.setProxyHost(proxy_host);
113        search.setProxyPort(proxy_port);
114        search.setProxyUserName(proxy_user);
115        search.setProxyPassword(new String(proxy_passwd));
116    }
117
118    //set optional attributes
119    search.setSafeSearch(true);
120   
121    // Create a new (empty) result message
122    Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
123    result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
124    result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
125    Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
126    result.appendChild(doc_node_list);
127    Element query_metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
128    result.appendChild(query_metadata_list);
129
130    Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
131    if (param_list == null) {
132        logger.error("TextQuery request had no paramList.");
133        return result;  // Return the empty result
134    }
135
136    // Process the request parameters
137    HashMap params = GSXML.extractParams(param_list, false);
138
139    // Make sure a query has been specified
140    String query = (String) params.get(QUERY_PARAM);
141    if (query == null || query.equals("")) {
142        logger.error("TextQuery request had no query string.");
143        return result;  // Return the empty result
144    }
145        // tidy whitespace
146    query = query.replaceAll("\\s+", "+");
147   
148    search.setQueryString(query);
149
150    //Check hits_per_page
151    int hits_per_page;
152    try {
153        hits_per_page = Integer.parseInt(((String)params.get(HITS_PER_PAGE_PARAM)).trim());
154    } catch (Exception e) {
155        logger.error("couldn't get hits per page param, defaulting to 10");
156        hits_per_page = 10;
157    }
158    //Check the start_page number
159    int start_page;
160    try {
161        start_page = Integer.parseInt(((String) params.get(START_PAGE_PARAM)).trim());
162    } catch (Exception e) {
163        logger.error("couldn't get start page param, defaulting to 1");
164        start_page = 1;
165    }
166   
167    //Invoke Actual Search
168   
169    // Google only allows 10 hits per request
170    int loop = hits_per_page/10;
171    int remainder = hits_per_page%10;
172    int google_start_page = (start_page-1)*hits_per_page;
173    int pages_per_loop;
174    for (int j=0; j < loop; j++){
175        if (j < (loop-1) || remainder == 0) {
176        pages_per_loop = 10;
177        } else {
178        pages_per_loop = remainder;
179        }
180        search.setMaxResults(pages_per_loop);
181        search.setStartResult(google_start_page);
182        google_start_page = google_start_page + pages_per_loop;
183        GoogleSearchResult google_result;
184        try{
185        google_result = search.doSearch();
186        } catch (GoogleSearchFault ex) {
187        logger.error("the call to the Google Web APIs failed:" + ex.toString());
188        // add the error to the result
189        return result;
190        }
191        if (j==0) {
192       
193        //Total amount of documents Google Search returned
194        // only need to do this on the first loop
195       
196        long numdocs_matched = google_result.getEstimatedTotalResultsCount();       
197        GSXML.addMetadata(this.doc, query_metadata_list, "numDocsMatched", ""+numdocs_matched);
198       
199        }
200        GoogleSearchResultElement[] details = google_result.getResultElements();
201        for (int i=0; i<details.length; i++){
202        Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
203        doc_node_list.appendChild(doc_node);
204       
205        Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
206        doc_node.appendChild(metadata_list);
207       
208        String google_url = details[i].getURL(); 
209        String google_title = details[i].getTitle();
210        String google_snippet = details[i].getSnippet();
211        if (google_url !=null) {
212            GSXML.addMetadata(this.doc, metadata_list, "URL", google_url);
213        }
214        if (google_title != null) {
215            GSXML.addMetadata(this.doc, metadata_list, "Title", google_title);
216        }
217        if (google_snippet != null) {
218            GSXML.addMetadata(this.doc, metadata_list, "Snippet", google_snippet);
219        }
220        }
221    } // for each loop
222    return result;
223    }
224   
225    protected void getIndexData(ArrayList index_ids, ArrayList index_names,String lang){
226        index_ids.add("idx");
227    index_names.add("Google main index");
228    }
229   
230}
Note: See TracBrowser for help on using the browser.