Ignore:
Timestamp:
2007-01-11T14:55:04+13:00 (17 years ago)
Author:
kjdon
Message:

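Implemented the core functionality for GS2LuceneRetrieve: document text is now read from the doc.xml files in the collection's index text directory.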

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneRetrieve.java

    r13270 r13576  
    2020
    2121// Greenstone classes
    22 import org.greenstone.mgpp.*;
    2322import org.greenstone.gsdl3.core.GSException;
    2423import org.greenstone.gsdl3.util.GSFile;
    2524import org.greenstone.gsdl3.util.GSXML;
    26 
     25import org.greenstone.gsdl3.util.GDBMWrapper;
     26import org.greenstone.gsdl3.util.DBInfo;
     27import org.greenstone.gsdl3.util.GSHTML;
     28import org.greenstone.gsdl3.util.OID;
    2729// XML classes
     30import org.w3c.dom.Document;
    2831import org.w3c.dom.Element;
    2932import org.w3c.dom.Text;
     
    3437import org.apache.log4j.Logger;
    3538
     39/** Retrieve documents from a gs2 lucene collection. Note that this doesn't
     40    actually use lucene, as the documents are stored in XML files */
    3641public class GS2LuceneRetrieve
    3742    extends AbstractGS2DocumentRetrieve
    3843{
    39      static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName());
     44   
     45    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName());
     46   
     47   
     48    protected static final String DOC_LEVEL="Doc";
     49    protected static final String SEC_LEVEL="Sec";
     50    protected static final String ID_ATT = "gs2:id";
    4051
    41      // Parameters used
     52    // Parameters used
    4253    private static final String LEVEL_PARAM = "level";
    43 
     54   
    4455    // Elements used in the config file that are specific to this class
    4556    private static final String DEFAULT_LEVEL_ELEM = "defaultLevel";
    46 
    47     private MGPPWrapper mgpp_src = null;
    4857   
    4958    private String default_level = null;
    50     private String mgpp_textdir = null;
    51    
     59    private String text_dir = null;
     60
     61    private boolean text_available = true;
     62   
    5263    public GS2LuceneRetrieve() {
    53     this.mgpp_src = new MGPPWrapper();
    5464    }
    55 
     65   
    5666    public void cleanUp() {
    5767    super.cleanUp();
    58     this.mgpp_src.unloadIndexData();
    5968    }
    60 
     69   
    6170    /** configure this service */
    6271    public boolean configure(Element info, Element extra_info)
     
    6574        return false;
    6675    }
    67 
     76   
    6877    // Do specific configuration
    6978    logger.info("Configuring GS2LuceneRetrieve...");
    70 
    71     // TODO - is there anything we need to do here?
    72     // set up XML parser??
    73 
     79   
     80    text_dir = GSFile.collectionIndexDir(this.site_home, this.cluster_name) + File.separatorChar+"text"+File.separatorChar;
     81    if (!(new File(text_dir).isDirectory())) {
     82        logger.error("Text directory "+text_dir+" does not exist, will be unable to retrieve text for "+cluster_name);
     83        text_available = false;
     84        return true; // return true so that we still get the other services for the collection
     85    }
     86    // Get the default level out of <defaultLevel> (buildConfig.xml)
     87    Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_LEVEL_ELEM);
     88    if (def != null) {
     89        this.default_level = def.getAttribute(GSXML.NAME_ATT);
     90    }
     91    if (this.default_level == null || this.default_level.equals("")) {
     92        logger.error("Default level not specified for "+this.cluster_name+", assuming "+DOC_LEVEL);
     93        this.default_level = DOC_LEVEL;
     94    }
     95   
    7496    return true;
    75  
     97   
    7698    }
    7799   
     
    80102     * <nodeContent>text content or other elements</nodeContent>
    81103     */
    82     protected Element getNodeContent(String doc_id) throws GSException {
    83 
     104    protected Element getNodeContent(String doc_id, String lang) throws GSException {   
     105    String doc_content = getTextString("TextRetrievalError", lang);
     106    try {
     107        if (!text_available) {
     108        throw new Exception("No text directory available");
     109        }
     110       
     111        long doc_num = this.gdbm_src.OID2Docnum(doc_id);
     112        if (doc_num == -1) {
     113        throw new Exception("OID "+doc_id +" couldn't be converted to lucene doc num");
     114        }
     115   
     116        DBInfo info=this.gdbm_src.getInfo(OID.getTop(doc_id));
     117        if (info == null) {
     118        throw new Exception("Couldn't get GDBM database entry for "+OID.getTop(doc_id));
     119        }
     120       
     121        String archivedir=info.getInfo("archivedir");
     122        File doc_xml_file = new File(text_dir+archivedir+File.separatorChar+"doc.xml");
     123        if (!doc_xml_file.isFile()) {
     124        throw new Exception("Doc XML file "+doc_xml_file.getPath()+" does not exist");
     125        }
     126        Document doc_xml_doc = this.converter.getDOM(doc_xml_file);
     127        if (doc_xml_doc == null) {
     128        throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
     129        }
     130        Element full_document = doc_xml_doc.getDocumentElement();
     131        if (full_document == null) {
     132        throw new Exception("Couldn't parse file "+doc_xml_file.getPath());
     133        }
     134        Element current_section = null;
     135        if (default_level.equals(DOC_LEVEL)) {
     136        current_section = full_document;
     137        } else {
     138        current_section = GSXML.getNamedElement(full_document, SEC_LEVEL, ID_ATT, String.valueOf(doc_num));
     139        }
     140        if (current_section == null) {
     141        throw new Exception("Couldn't find section "+ doc_num+" in file "+doc_xml_file.getPath());
     142        }
     143        doc_content = GSXML.getNodeText(current_section);
     144        if (doc_content == null) {
     145        doc_content = "";
     146        } else {
     147        doc_content = resolveTextMacros(doc_content, doc_id, lang);
     148        }
     149    } catch (Exception e) {
     150        logger.error("Error trying to get document text for "+doc_id+" in collection "+this.cluster_name+": "+e);
     151    }
    84152   
    85153    Element content_node = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
    86    
    87     String doc_content = "";
    88     try {
    89         // TODO get the doc content from the index text dir
    90     } catch (Exception e) {
    91         logger.info("exception happended getting doc content for " + doc_id);
    92         doc_content = "dummy content for doc "+doc_id +"\n";
    93 
    94     }
    95 
    96154    Text t = this.doc.createTextNode(doc_content);
    97155    content_node.appendChild(t);
    98156    return content_node;
    99157    }
    100 
    101 
    102158}
Note: See TracChangeset for help on using the changeset viewer.