Changeset 33634

Show
Ignore:
Timestamp:
08.11.2019 23:59:07 (6 days ago)
Author:
ak19
Message:

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess; MongoDBAccess now has insertWebpageInfo() and insertWebsiteInfo(). However, local testing has been unsuccessful, even though authentication should be working: I'm following the online examples that use the MongoCredential object. The client supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where no authentication is set up. But what if someone else one day wants to run this against a MongoDB where authentication is set up (the way TSG set it up for the MongoDB they gave me access to)? Then it still wouldn't work.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
4 added
4 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33633 r33634  
    44import com.mongodb.client.MongoCollection;  
    55import com.mongodb.client.MongoDatabase;  
     6//import com.mongodb.client.MongoIterable; 
     7import com.mongodb.BasicDBObject; 
    68import com.mongodb.MongoClient;  
    7 import com.mongodb.MongoCredential;   
     9import com.mongodb.MongoCredential; 
     10import com.mongodb.ServerAddress; 
     11import com.mongodb.MongoClientOptions; 
    812 
    913import org.bson.Document; 
     
    1216import java.io.File; 
    1317import java.io.FileReader; 
     18import java.util.ArrayList; 
     19import java.util.List; 
    1420import java.util.Properties; 
     21 
    1522 
    1623import org.apache.log4j.Logger; 
     
    4350 * 
    4451 */ 
    45 public class MongoDBAccess { 
     52public class MongoDBAccess implements AutoCloseable { 
    4653 
    4754    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 
    4855     
    4956    static final String PROPS_FILENAME = "config.properties"; 
    50     public static final String DB_NAME = "anupama"; //"ateacrawldata"; 
    5157    public static final String WEBPAGES_COLLECTION = "webpages"; 
    5258    public static final String WEBSITES_COLLECTION = "websites";     
    53      
     59 
     60    // configuration details, some with fallback values 
    5461    private String HOST = "localhost"; 
    5562    private int PORT = 27017; // mongodb port 
    5663    private String USERNAME; 
    5764    private String PASSWORD;     
    58  
     65    private String DB_NAME ="ateacrawldata"; 
     66     
    5967    private MongoClient mongo = null; 
    6068    private MongoDatabase database = null; 
     
    108116     */ 
    109117    public void connectToDB() throws Exception { 
     118     
    110119    // Creating a Mongo client  
    111120    mongo = new MongoClient( HOST, PORT );  
     
    117126     
    118127    // Accessing the database  
    119     database = mongo.getDatabase(DB_NAME);  
     128    this.database = mongo.getDatabase(DB_NAME);  
    120129    logger.info("Credentials: "+ credential); 
    121     } 
    122  
    123      
    124     public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite, 
    125            int numPages, int numPagesInMRI, int numPagesContainingMRI, 
    126                   /* TODO: String geoLocationCountryCode, boolean miURL */ 
    127            String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl) 
     130 
     131    /* 
     132    MongoCredential credential;  
     133    credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray()); 
     134    logger.info("Credentials: "+ credential); 
     135     
     136    // Create our Mongo client 
     137    mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build()); 
     138    System.out.println("Connected to the database successfully"); 
     139 
     140    this.database = mongo.getDatabase(DB_NAME);  
     141    */ 
     142     
     143    } 
     144 
     145    // TODO: which fields should be indexed? 
     146 
     147    public void showCollections() { 
     148    //MongoIterable<String> colls = this.database.listCollectionNames(); 
     149    for(String coll : this.database.listCollectionNames()) { 
     150        System.err.println("coll: " + coll); 
     151    } 
     152    } 
     153     
     154     
     155    public void insertWebsiteInfo(WebsiteInfo website) 
    128156    { 
    129157    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    130     Document document = new Document("id", SITE_COUNTER) 
    131         .append("siteFolderName", siteID)  
    132         .append("domain", domainOfSite)  
    133         .append("totalPages", numPages)  
    134         .append("numPagesInMRI", numPagesInMRI) 
    135         .append("numPagesContainingMRI", numPagesContainingMRI) 
    136         .append("siteCrawledTimestamp", siteCrawledTimestamp) 
    137         .append("siteCrawlUnfinished", siteCrawlUnfinished) 
    138         .append("redoCrawl", redoCrawl); 
     158    Document document = new Document("_id", website.id) 
     159        .append("siteFolderName", website.siteFolderName)  
     160        .append("domain", website.domain)  
     161        .append("totalPages", website.totalPages) 
     162        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) 
     163        .append("numPagesInMRI", website.numPagesInMRI) 
     164        .append("siteCrawledTimestamp", website.siteCrawledTimestamp) 
     165        .append("siteCrawlUnfinished", website.siteCrawlUnfinished) 
     166        .append("redoCrawl", website.redoCrawl); 
     167 
     168    document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath); 
     169    if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) { 
     170        document.put("countryCode", website.geoLocationCountryCode); 
     171    } 
     172 
    139173    collection.insertOne(document);  
    140     System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION);   
    141     } 
    142  
    143      
    144     public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/ 
    145                   String url, String charEncoding, String modTime, String fetchTime, 
    146                   boolean isMRI, int totalSentences, int numSentencesInMRI, 
    147                   ArrayList<SentenceInfo> singleSentences, 
    148                   ArrayList<SentenceInfo> overlappingSentences) 
     174    logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")" 
     175               + " inserted successfully into " + WEBSITES_COLLECTION);   
     176    } 
     177 
     178    /* TODO: 
     179    https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 
     180    */ 
     181    public void insertWebpageInfo(WebpageInfo webpage) 
    149182    { 
    150183    // load the webpages db 'table' 
     
    152185    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
    153186 
    154     Document document = new Document("id", WEBPAGE_COUNTER) 
    155         .append("siteid", site_id)  
    156         .append("url", url)  
    157         .append("charEncoding", charEncoding)  
    158         .append("modTime", modTime) 
    159         .append("fetchTime", fetchTime) 
    160         .append("isMRI", isMRI) 
    161         .append("totalSentences", totalSentences) 
    162         .append("numSentencesInMRI", numSentencesInMRI); 
    163  
    164     document.put("singleSentences", singleSentences); 
    165     document.put("overlappingSentences", overlappingSentences); 
     187    Document document = new Document("_id", webpage.webpageID) 
     188        .append("siteid", webpage.websiteID) 
     189        .append("url", webpage.URL) 
     190        .append("isMRI", webpage.isMRI) 
     191        .append("totalSentences", webpage.totalSentences) 
     192        .append("charEncoding", webpage.charEncoding)  
     193        .append("modTime", webpage.modifiedTime) 
     194        .append("fetchTime", webpage.fetchTime); 
     195 
     196    // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT: 
     197    //document.put("singleSentences", webpage.singleSentences); 
     198    //document.put("overlappingSentences", webpage.overlappingSentences); 
     199 
     200    // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: 
     201    // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java 
     202    List<BasicDBObject> sentencesList = new ArrayList<>(); 
     203    for(SentenceInfo sentence : webpage.singleSentences) { 
     204        sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 
     205        sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 
     206        sentencesList.add(new BasicDBObject("sentence", sentence)); 
     207    }    
     208    document.put("singleSentences", sentencesList); 
     209     
     210    List<BasicDBObject> overlappingSentencesList = new ArrayList<>(); 
     211    for(SentenceInfo sentence : webpage.overlappingSentences) { 
     212        sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 
     213        sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 
     214        sentencesList.add(new BasicDBObject("sentence", sentence)); 
     215    }    
     216    document.put("singleSentences", overlappingSentencesList); 
     217 
     218    // also put the full text in there 
     219    document.put("text", webpage.text); 
    166220     
    167221    collection.insertOne(document); 
    168     System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION);   
    169     } 
     222    logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);   
     223    } 
     224 
     225    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
     226    public void close() {} 
    170227     
    171228 
     
    184241        MongoDBAccess mongodbCon = new MongoDBAccess(); 
    185242        mongodbCon.connectToDB(); 
    186         //mongodbCon.insertDocument(); 
    187     }catch(Exception e) { 
     243        mongodbCon.showCollections(); 
     244 
     245    } catch(Exception e) { 
    188246        e.printStackTrace(); 
    189247    } 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java

    r33633 r33634  
    244244         
    245245        if(text.equals("")) { 
    246         page.addMRILanguageStatus(false); 
     246        //page.addMRILanguageStatus(false); 
    247247        continue; 
    248248        } 
     
    250250        boolean isMRI = maoriTxtDetector.isTextInMaori(text); 
    251251         
    252         page.addMRILanguageStatus(isMRI); 
     252        //page.addMRILanguageStatus(isMRI); 
    253253         
    254254     
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33623 r33634  
    22 
    33import java.io.*; 
     4import java.util.ArrayList; 
    45import java.util.HashMap; 
    56import java.util.Map; 
     
    1516     
    1617    private Map<String, String> tuples; 
     18 
     19    private boolean isMRI = false; 
     20 
     21    boolean DEBUG_MODE = false; 
    1722     
    1823    public TextDumpPage(String siteID, String unparsedPageDump) { 
     
    9095                tuples.put(k.trim(), v.trim()); 
    9196            } else { 
    92                 if(NutchTextDumpProcessor.DEBUG_MODE) { 
     97                if(DEBUG_MODE) { 
    9398                logger.error("No meta key for meta: " + line); 
    9499                logger.error(unparsedPageDump); 
     
    118123 
    119124    public void debugTuples() { 
    120     if(NutchTextDumpProcessor.DEBUG_MODE) { 
     125    if(DEBUG_MODE) { 
    121126        logger.debug("__________________________________________"); 
    122127        for(Map.Entry<String, String> entry : tuples.entrySet()) { 
     
    168173    } 
    169174 
     175    /** 
     176     * IMPORTANT: This method deletes the data stored in this TextDumpPage object 
     177     * after converting relevant fields and parameters to a WebpageInfo object 
     178     */ 
     179    public WebpageInfo convertStoredDataToWebpageInfo( 
     180      long webpageID, int websiteID, boolean isMRI, int totalSentences, 
     181      ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences) 
     182    { 
     183    // clear the map, after storing the important (meta)data 
     184    String pageText = getPageText(); 
     185    String pageURL = getPageURL(); 
     186    String charEncoding = getOriginalCharEncoding(); 
     187    String modifiedTime = getModifiedTime(); 
     188    String fetchTime = getFetchTime(); 
     189 
     190    WebpageInfo webpage = new WebpageInfo(webpageID, websiteID, 
     191                          pageText, pageURL, isMRI, totalSentences, 
     192                          charEncoding, modifiedTime, fetchTime, 
     193                          singleSentences, overlappingSentences); 
     194     
     195    tuples.clear(); 
     196 
     197    return webpage; 
     198    } 
     199     
     200 
     201 
     202    /* 
    170203    public void addMRILanguageStatus(boolean status) { 
    171204    if(status) { 
     
    189222 
    190223    } 
    191      
     224    */ 
    192225} 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33633 r33634  
    142142    } 
    143143 
    144     /** inner class */ 
    145     public class SentenceInfo { 
    146     public final double confidenceLevel; 
    147     /** 3 letter lang code */ 
    148     public final String langCode; 
    149     public final String sentence; 
    150  
    151     public SentenceInfo(double confidence, String langCode, String sentence) { 
    152         confidenceLevel = confidence; 
    153         this.langCode = langCode; 
    154         this.sentence = sentence; 
    155     } 
    156     } 
    157  
    158144    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text 
    159145     * into sentences? What if the text in any other language or a mix of languages? 
     
    183169        double confidence = bestLanguage.getConfidence(); 
    184170 
    185         sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence)); 
     171        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
    186172    } 
    187173 
     
    207193        separator = " "; 
    208194        } 
    209         sentence = sentence + separator + sentence[i];  
     195        sentence = sentence + separator + sentences[i];  
    210196         
    211197        //System.err.println(sentence); 
     
    214200        double confidence = bestLanguage.getConfidence(); 
    215201 
    216         sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence)); 
     202        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
    217203    } 
    218204