Context Navigation

← Previous Changeset
Next Changeset →

Changeset 13576

Timestamp:

2007-01-11T14:55:04+13:00 (17 years ago)

Author:

kjdon

Message:

implemented the core functionality for this.

Location:

trunk/gsdl3/src/java/org/greenstone/gsdl3/service

Files:

: 2 edited

GS2LuceneRetrieve.java (modified) (4 diffs)
GS2LuceneSearch.java (modified) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneRetrieve.java

-              r13270
+              r13576
 // Greenstone classes
-import org.greenstone.mgpp.*;
 import org.greenstone.gsdl3.core.GSException;
 import org.greenstone.gsdl3.util.GSFile;
 import org.greenstone.gsdl3.util.GSXML;
+import org.greenstone.gsdl3.util.GDBMWrapper;
+import org.greenstone.gsdl3.util.DBInfo;
+import org.greenstone.gsdl3.util.GSHTML;
+import org.greenstone.gsdl3.util.OID;
 // XML classes
+import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Text;
 …
 import org.apache.log4j.Logger;
+/** Retrieve documents from a gs2 lucene collection. Note that this doesn't
+    actually use lucene, as the documents are stored in XML files */
 public class GS2LuceneRetrieve
     extends AbstractGS2DocumentRetrieve
+{
+     static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName());
+    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName());
+    protected static final String DOC_LEVEL="Doc";
+    protected static final String SEC_LEVEL="Sec";
+    protected static final String ID_ATT = "gs2:id";
      // Parameters used
+    // Parameters used
     private static final String LEVEL_PARAM = "level";
     // Elements used in the config file that are specific to this class
     private static final String DEFAULT_LEVEL_ELEM = "defaultLevel";
-    private MGPPWrapper mgpp_src = null;
     private String default_level = null;
+    private String mgpp_textdir = null;
+    private String text_dir = null;
+    private boolean text_available = true;
     public GS2LuceneRetrieve() {
-    this.mgpp_src = new MGPPWrapper();
+    }
     public void cleanUp() {
     super.cleanUp();
-    this.mgpp_src.unloadIndexData();
+    }
     /** configure this service */
     public boolean configure(Element info, Element extra_info)
 …
         return false;
+    }
     // Do specific configuration
     logger.info("Configuring GS2LuceneRetrieve...");
+    // TODO - is there anything we need to do here?
+    // set up XML parser??
+    text_dir = GSFile.collectionIndexDir(this.site_home, this.cluster_name) + File.separatorChar+"text"+File.separatorChar;
+    if (!(new File(text_dir).isDirectory())) {
+        logger.error("Text directory "+text_dir+" does not exist, will be unable to retrieve text for "+cluster_name);
+        text_available = false;
+        return true; // return true so that we still get the other services for the collection
+    }
+    // Get the default level out of <defaultLevel> (buildConfig.xml)
+    Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_LEVEL_ELEM);
+    if (def != null) {
+        this.default_level = def.getAttribute(GSXML.NAME_ATT);
+    }
+    if (this.default_level == null || this.default_level.equals("")) {
+        logger.error("Default level not specified for "+this.cluster_name+", assuming "+DOC_LEVEL);
+        this.default_level = DOC_LEVEL;
+    }
     return true;
+    }
 …
      * <nodeContent>text content or other elements</nodeContent>
      */
+    protected Element getNodeContent(String doc_id) throws GSException {
+    protected Element getNodeContent(String doc_id, String lang) throws GSException {
+    String doc_content = getTextString("TextRetrievalError", lang);
+    try {
+        if (!text_available) {
+        throw new Exception("No text directory available");
+        }
+        long doc_num = this.gdbm_src.OID2Docnum(doc_id);
+        if (doc_num == -1) {
+        throw new Exception("OID "+doc_id +" couldn't be converted to lucene doc num");
+        }
+        DBInfo info=this.gdbm_src.getInfo(OID.getTop(doc_id));
+        if (info == null) {
+        throw new Exception("Couldn't get GDBM database entry for "+OID.getTop(doc_id));
+        }
+        String archivedir=info.getInfo("archivedir");
+        File doc_xml_file = new File(text_dir+archivedir+File.separatorChar+"doc.xml");
+        if (!doc_xml_file.isFile()) {
+        throw new Exception("Doc XML file "+doc_xml_file.getPath()+" does not exist");
+        }
+        Document doc_xml_doc = this.converter.getDOM(doc_xml_file);
+        if (doc_xml_doc == null) {
+        throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
+        }
+        Element full_document = doc_xml_doc.getDocumentElement();
+        if (full_document == null) {
+        throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
+        }
+        Element current_section = null;
+        if (default_level.equals(DOC_LEVEL)) {
+        current_section = full_document;
+        } else {
+        current_section = GSXML.getNamedElement(full_document, SEC_LEVEL, ID_ATT, String.valueOf(doc_num));
+        }
+        if (current_section == null) {
+        throw new Exception("Couldn't find section "+ doc_num+" in file "+doc_xml_file.getPath());
+        }
+        doc_content = GSXML.getNodeText(current_section);
+        if (doc_content == null) {
+        doc_content = "";
+        } else {
+        doc_content = resolveTextMacros(doc_content, doc_id, lang);
+        }
+    } catch (Exception e) {
+        logger.error("Error trying to get document text for "+doc_id+" in collection "+this.cluster_name+": "+e);
+    }
     Element content_node = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
-    String doc_content = "";
-    try {
-        // TODO get the doc content from the index text dir
-    } catch (Exception e) {
-        logger.info("exception happended getting doc content for " + doc_id);
-        doc_content = "dummy content for doc "+doc_id +"\n";
+    }
     Text t = this.doc.createTextNode(doc_content);
     content_node.appendChild(t);
     return content_node;
+    }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java

-              r13270
+              r13576
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
+import org.w3c.dom.Document;
 // java classes
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.io.File;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.Map;
+import java.util.Vector;
 // Logging
 import org.apache.log4j.Logger;
+import org.nzdl.gsdl.LuceneWrap.GS2LuceneQuery;
+import org.nzdl.gsdl.LuceneWrap.LuceneQueryResult;
 public class GS2LuceneSearch
     extends AbstractGS2FieldSearch
+{
+    // TODO: lucene query object
+    protected static final String RANK_PARAM_RANK_VALUE = "rank";
     static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());
+    private GS2LuceneQuery lucene_src=null;
     public GS2LuceneSearch()
+    {
+    // TODO: create new query object
+    this.lucene_src = new GS2LuceneQuery();
     // Lucene uses double operators, not single
     AND_OPERATOR = "&&";
     OR_OPERATOR = "||";
+    }
+    does_paging = true;
+    does_chunking = true;
+    }
     public void cleanUp() {
     super.cleanUp();
     // TODO: clean up query object
+    }
         /** configure this service */
+    this.lucene_src.cleanUp();
+    }
+    /** configure this service */
     public boolean configure(Element info, Element extra_info)
+    {
 …
     does_stem = false;
     does_accent = false;
-    // TODO: configure the query object based on info from config file
     return true;
+    }
     /** add in the lucene specific params to TextQuery */
     protected void addCustomQueryParams(Element param_list, String lang)
 …
     /** Lucene's rank param is based on fields, not ranked/not */
     createParameter(RANK_PARAM, param_list, lang);
+    }
+    createParameter(FIELD_ATT, param_list, lang);
+    }
     /** create a param and add to the list */
     /** we override this to do a special rank param */
 …
         // get the fields
         ArrayList fields = new ArrayList();
         fields.add("rank");
+        fields.add(RANK_PARAM_RANK_VALUE);
         ArrayList field_names = new ArrayList();
         field_names.add(getTextString("param.sortBy.rank", lang));
 …
+    }
+    }
     protected void getSortByIndexData(ArrayList index_ids, ArrayList index_names, String lang){
     // the field list -  read from config file
 …
+        }
         // TODO change field so that name is the id, and full metadata name is something else
         if (shortname.equals("")) {
         shortname = name;
+        }
         if (shortname.equals("ZZ")) {
         // ZZ is a fake index
+        if (shortname.equals("ZZ") || shortname.equals("TX")) {
+        // ZZ is a fake index, and we don't sort by TX
         continue;
+        }
 …
     /** methods to handle actually doing the query */
     /** do any initialisation of the query object */
     protected boolean setUpQueryer(HashMap params) {
+    // TODO
+        String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index"+File.separatorChar;
+    String index = "didx";
+    int maxdocs = 100;
+    int hits_per_page = 20;
+    int start_page = 1;
+    // set up the query params
+        Set entries = params.entrySet();
+        Iterator i = entries.iterator();
+        while (i.hasNext()) {
+            Map.Entry m = (Map.Entry)i.next();
+            String name = (String)m.getKey();
+            String value = (String)m.getValue();
+            if (name.equals(MAXDOCS_PARAM)&& !value.equals("")) {
+        maxdocs = Integer.parseInt(value);
+        } else if (name.equals(HITS_PER_PAGE_PARAM)) {
+        hits_per_page = Integer.parseInt(value);
+        } else if (name.equals(START_PAGE_PARAM)) {
+        start_page = Integer.parseInt(value);
+            } else if (name.equals(MATCH_PARAM)) {
+            if (value.equals(MATCH_PARAM_ALL)) {
+            this.lucene_src.setDefaultConjunctionOperator("AND");
+            } else{
+            this.lucene_src.setDefaultConjunctionOperator("OR");
+            }
+            } else if (name.equals(RANK_PARAM)) {
+        if (value.equals(RANK_PARAM_RANK_VALUE)) {
+            value = null;
+        }
+        this.lucene_src.setSortField(value);
+            } else if (name.equals(LEVEL_PARAM)) {
+        if (value.toUpperCase().equals("SEC")){
+            index = "sidx";
+        }
+        else {
+            index = "didx";
+        }
+        } // ignore any others
+        }
+    // set up start and end results if necessary
+    int start_results = 1;
+    if (start_page != 1) {
+        start_results = ((start_page-1) * hits_per_page) + 1;
+    }
+    int end_results = hits_per_page * start_page;
+    this.lucene_src.setStartResults(start_results);
+    this.lucene_src.setEndResults(end_results);
+    this.lucene_src.setIndexDir(indexdir+index);
+    this.lucene_src.initialise();
     return true;
+    }
     /** do the query */
     protected Object runQuery(String query) {
+    // TODO
+    try {
+        LuceneQueryResult lqr=this.lucene_src.runQuery(query);
+        return lqr;
+    } catch (Exception e) {
+        logger.error ("exception happened in run query: ", e);
+    }
     return null;
+    }
     /** get the total number of docs that match */
     protected long numDocsMatched(Object query_result) {
+    // TODO
+    return 0;
+        return ((LuceneQueryResult)query_result).getTotalDocs();
+    }
     /** get the list of doc ids */
     protected String [] getDocIDs(Object query_result) {
+    // TODO
+    return null;
+        Vector docs = ((LuceneQueryResult)query_result).getDocs();
+        String [] doc_nums = new String [docs.size()];
+        for (int d = 0; d < docs.size(); d++) {
+        String doc_num = Long.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).num_);
+            doc_nums[d] = doc_num;
+        }
+        return doc_nums;
+    }
     /** get the list of doc ranks */
     protected String [] getDocRanks(Object query_result) {
+    // TODO
+    return null;
+        Vector docs = ((LuceneQueryResult)query_result).getDocs();
+        String [] doc_ranks = new String [docs.size()];
+        for (int d = 0; d < docs.size(); d++) {
+            doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
+        }
+        return doc_ranks;
+    }
     /** add in term info if available */
     protected boolean addTermInfo(Element term_list, HashMap params,
                   Object query_result) {
+    // TODO
+    return true;
+    }
+        String query_level = (String)params.get(LEVEL_PARAM); // the current query level
+        Vector terms = ((LuceneQueryResult)query_result).getTerms();
+        for (int t = 0; t < terms.size(); t++) {
+            LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);
+            Element term_elem = this.doc.createElement(GSXML.TERM_ELEM);
+            term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
+            term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
+            term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
+        term_elem.setAttribute(FIELD_ATT, term_info.field_);
+            term_list.appendChild(term_elem);
+        }
+        return true;
+    }
     protected String addFieldInfo(String query, String field) {
     if (field.equals("") || field.equals("ZZ")) {
 …
     return field+":("+query+")";
+    }
     protected void addQueryElem(StringBuffer s, String q, String f, String c) {
     String combine="";
     if (s.length()>0) {
 …
     s.append(combine + addFieldInfo(q,f));
+    }
     /** Lucene doesn't use these options at the moment */
     protected String addStemOptions(String query, String stem,

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 13576

Legend:

trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneRetrieve.java

trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java

Download in other formats: