root/main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GoogleSearch.java.tmp @ 28966

Revision 28966, 7.0 KB (checked in by kjdon, 6 years ago)

Lots of changes, mainly to do with removing this.doc from everywhere, because Document is not thread safe. Now we tend to create a new Document every time we are starting a new page/message etc. In a service, this.desc_doc is available as the document to create service info stuff, but it should only be used for this and not for other messages. newDOM is now static for XMLConverter. Method param changes for some GSXML methods.

  • Property svn:keywords set to Author Date Id Revision
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6//Google Web Services API classes
7//import com.google.soap.search.GoogleSearch;
8import com.google.soap.search.GoogleSearchFault;
9import com.google.soap.search.GoogleSearchResult;
10import com.google.soap.search.GoogleSearchResultElement;
11import com.google.soap.search.GoogleSearchDirectoryCategory;
12
13// XML classes
14import org.w3c.dom.Element;
15import org.w3c.dom.Document;
16import org.w3c.dom.NodeList;
17
18//Java classes
19import java.util.ArrayList;
20import java.util.HashMap;
21import java.io.File;
22import java.io.InputStream;
23import java.io.BufferedReader;
24import java.io.InputStreamReader;
25import java.io.IOException;
26import java.net.HttpURLConnection;
27import java.net.URLConnection;
28import java.net.URL;
29import java.net.Authenticator;
30import java.net.PasswordAuthentication;
31import java.net.MalformedURLException;
32import java.lang.Object;
33
34import org.apache.log4j.*;
35
36/**
37 *
38 * @author <a href="mailto:chi@cs.waikato.ac.nz">Chi-Yu Huang</a>
39 *
40 */
41
42public class GoogleSearch
43    extends AbstractSearch {
44   
45     static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GoogleSearch.class.getName());
46     
47    //Parameters connect to Proxy Server
48    private boolean using_proxy = false;
49    private String proxy_host = null;
50    private int proxy_port;
51    private String proxy_user = null;
52    private char [] proxy_passwd = null;
53
54    // google key
55    private String client_key = null;
56   
57    public GoogleSearch()
58    {
59    }
60       
61    //Configure GoogleSearch Service
62    public boolean configure(Element info, Element extra_info)
63    {
64    if (!super.configure(info, extra_info)) {
65        return false;
66    }
67    logger.info("Configuring GoogleSearch");
68    Element server_elem = (Element)GSXML.getChildByTagName(info, "googleServer");
69    if (server_elem == null) {
70        logger.error("no googleServer element found");
71        return false;
72    }
73   
74    client_key = server_elem.getAttribute("key");
75    if (client_key.equals("")) {
76        logger.error("no client_key for the googleServer element");
77        return false;
78    }
79   
80    does_paging = true;
81    does_chunking = false;
82
83    // are we behind a proxy??
84    // all the details should have been set up by the Message Router
85    proxy_host = System.getProperty("http.proxyHost");
86    if (proxy_host != null && !proxy_host.equals("")) {
87        using_proxy = true;
88        try {
89        proxy_port = Integer.parseInt(System.getProperty("http.proxyPort").trim());
90        } catch (Exception e) {
91        logger.error("couldn't get proxy port, defaulting to 80");
92        proxy_port = 80;
93        }
94        PasswordAuthentication pa = Authenticator.requestPasswordAuthentication(proxy_host, null, proxy_port, "http", "", null);
95        proxy_user = pa.getUserName();
96        proxy_passwd = pa.getPassword();
97    }
98    return true;
99    }
100   
101    /** Process a text query - implemented by concrete subclasses */
102    protected Element processTextQuery(Element request) {
103   
104    //Connect to Google Web API service
105    com.google.soap.search.GoogleSearch search = new com.google.soap.search.GoogleSearch();
106
107    // Set mandatory attributes
108    search.setKey(client_key);
109
110    // proxy??
111    if (using_proxy) {
112        search.setProxyHost(proxy_host);
113        search.setProxyPort(proxy_port);
114        search.setProxyUserName(proxy_user);
115        search.setProxyPassword(new String(proxy_passwd));
116    }
117
118    //set optional attributes
119    search.setSafeSearch(true);
120   
121    // Create a new (empty) result message
122    Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
123    result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
124    result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
125    Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
126    result.appendChild(doc_node_list);
127    Element query_metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
128    result.appendChild(query_metadata_list);
129
130    Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
131    if (param_list == null) {
132        logger.error("TextQuery request had no paramList.");
133        return result;  // Return the empty result
134    }
135
136    // Process the request parameters
137    HashMap params = GSXML.extractParams(param_list, false);
138
139    // Make sure a query has been specified
140    String query = (String) params.get(QUERY_PARAM);
141    if (query == null || query.equals("")) {
142        logger.error("TextQuery request had no query string.");
143        return result;  // Return the empty result
144    }
145        // tidy whitespace
146    query = query.replaceAll("\\s+", "+");
147   
148    search.setQueryString(query);
149
150    //Check hits_per_page
151    int hits_per_page;
152    try {
153        hits_per_page = Integer.parseInt(((String)params.get(HITS_PER_PAGE_PARAM)).trim());
154    } catch (Exception e) {
155        logger.error("couldn't get hits per page param, defaulting to 10");
156        hits_per_page = 10;
157    }
158    //Check the start_page number
159    int start_page;
160    try {
161        start_page = Integer.parseInt(((String) params.get(START_PAGE_PARAM)).trim());
162    } catch (Exception e) {
163        logger.error("couldn't get start page param, defaulting to 1");
164        start_page = 1;
165    }
166   
167    //Invoke Actual Search
168   
169    // Google only allows 10 hits per request
170    int loop = hits_per_page/10;
171    int remainder = hits_per_page%10;
172    int google_start_page = (start_page-1)*hits_per_page;
173    int pages_per_loop;
174    for (int j=0; j < loop; j++){
175        if (j < (loop-1) || remainder == 0) {
176        pages_per_loop = 10;
177        } else {
178        pages_per_loop = remainder;
179        }
180        search.setMaxResults(pages_per_loop);
181        search.setStartResult(google_start_page);
182        google_start_page = google_start_page + pages_per_loop;
183        GoogleSearchResult google_result;
184        try{
185        google_result = search.doSearch();
186        } catch (GoogleSearchFault ex) {
187        logger.error("the call to the Google Web APIs failed:" + ex.toString());
188        // add the error to the result
189        return result;
190        }
191        if (j==0) {
192       
193        //Total amount of documents Google Search returned
194        // only need to do this on the first loop
195       
196        long numdocs_matched = google_result.getEstimatedTotalResultsCount();       
197        GSXML.addMetadata(query_metadata_list, "numDocsMatched", ""+numdocs_matched);
198       
199        }
200        GoogleSearchResultElement[] details = google_result.getResultElements();
201        for (int i=0; i<details.length; i++){
202        Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
203        doc_node_list.appendChild(doc_node);
204       
205        Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
206        doc_node.appendChild(metadata_list);
207       
208        String google_url = details[i].getURL(); 
209        String google_title = details[i].getTitle();
210        String google_snippet = details[i].getSnippet();
211        if (google_url !=null) {
212            GSXML.addMetadata(metadata_list, "URL", google_url);
213        }
214        if (google_title != null) {
215            GSXML.addMetadata(metadata_list, "Title", google_title);
216        }
217        if (google_snippet != null) {
218            GSXML.addMetadata(metadata_list, "Snippet", google_snippet);
219        }
220        }
221    } // for each loop
222    return result;
223    }
224   
225    protected void getIndexData(ArrayList index_ids, ArrayList index_names,String lang){
226        index_ids.add("idx");
227    index_names.add("Google main index");
228    }
229   
230}
Note: See TracBrowser for help on using the browser.