Changeset 33634 for gs3-extensions


Ignore:
Timestamp:
2019-11-08T23:59:07+13:00 (4 years ago)
Author:
ak19
Message:

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the examples online to use the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
4 added
4 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33633 r33634  
    44import com.mongodb.client.MongoCollection;
    55import com.mongodb.client.MongoDatabase;
     6//import com.mongodb.client.MongoIterable;
     7import com.mongodb.BasicDBObject;
    68import com.mongodb.MongoClient;
    7 import com.mongodb.MongoCredential; 
     9import com.mongodb.MongoCredential;
     10import com.mongodb.ServerAddress;
     11import com.mongodb.MongoClientOptions;
    812
    913import org.bson.Document;
     
    1216import java.io.File;
    1317import java.io.FileReader;
     18import java.util.ArrayList;
     19import java.util.List;
    1420import java.util.Properties;
     21
    1522
    1623import org.apache.log4j.Logger;
     
    4350 *
    4451 */
    45 public class MongoDBAccess {
     52public class MongoDBAccess implements AutoCloseable {
    4653
    4754    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
    4855   
    4956    static final String PROPS_FILENAME = "config.properties";
    50     public static final String DB_NAME = "anupama"; //"ateacrawldata";
    5157    public static final String WEBPAGES_COLLECTION = "webpages";
    5258    public static final String WEBSITES_COLLECTION = "websites";   
    53    
     59
     60    // configuration details, some with fallback values
    5461    private String HOST = "localhost";
    5562    private int PORT = 27017; // mongodb port
    5663    private String USERNAME;
    5764    private String PASSWORD;   
    58 
     65    private String DB_NAME ="ateacrawldata";
     66   
    5967    private MongoClient mongo = null;
    6068    private MongoDatabase database = null;
     
    108116     */
    109117    public void connectToDB() throws Exception {
     118   
    110119    // Creating a Mongo client
    111120    mongo = new MongoClient( HOST, PORT );
     
    117126   
    118127    // Accessing the database
    119     database = mongo.getDatabase(DB_NAME);
     128    this.database = mongo.getDatabase(DB_NAME);
    120129    logger.info("Credentials: "+ credential);
    121     }
    122 
    123    
    124     public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite,
    125            int numPages, int numPagesInMRI, int numPagesContainingMRI,
    126                   /* TODO: String geoLocationCountryCode, boolean miURL */
    127            String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl)
     130
     131    /*
     132    MongoCredential credential;
     133    credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
     134    logger.info("Credentials: "+ credential);
     135   
     136    // Create our Mongo client
     137    mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
     138    System.out.println("Connected to the database successfully");
     139
     140    this.database = mongo.getDatabase(DB_NAME);
     141    */
     142   
     143    }
     144
     145    // TODO: which fields should be indexed?
     146
     147    public void showCollections() {
     148    //MongoIterable<String> colls = this.database.listCollectionNames();
     149    for(String coll : this.database.listCollectionNames()) {
     150        System.err.println("coll: " + coll);
     151    }
     152    }
     153   
     154   
     155    public void insertWebsiteInfo(WebsiteInfo website)
    128156    {
    129157    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    130     Document document = new Document("id", SITE_COUNTER)
    131         .append("siteFolderName", siteID)
    132         .append("domain", domainOfSite)
    133         .append("totalPages", numPages)
    134         .append("numPagesInMRI", numPagesInMRI)
    135         .append("numPagesContainingMRI", numPagesContainingMRI)
    136         .append("siteCrawledTimestamp", siteCrawledTimestamp)
    137         .append("siteCrawlUnfinished", siteCrawlUnfinished)
    138         .append("redoCrawl", redoCrawl);
     158    Document document = new Document("_id", website.id)
     159        .append("siteFolderName", website.siteFolderName)
     160        .append("domain", website.domain)
     161        .append("totalPages", website.totalPages)
     162        .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
     163        .append("numPagesInMRI", website.numPagesInMRI)
     164        .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
     165        .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
     166        .append("redoCrawl", website.redoCrawl);
     167
     168    document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
     169    if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
     170        document.put("countryCode", website.geoLocationCountryCode);
     171    }
     172
    139173    collection.insertOne(document);
    140     System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 
    141     }
    142 
    143    
    144     public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/
    145                   String url, String charEncoding, String modTime, String fetchTime,
    146                   boolean isMRI, int totalSentences, int numSentencesInMRI,
    147                   ArrayList<SentenceInfo> singleSentences,
    148                   ArrayList<SentenceInfo> overlappingSentences)
     174    logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
     175               + " inserted successfully into " + WEBSITES_COLLECTION); 
     176    }
     177
     178    /* TODO:
     179    https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
     180    */
     181    public void insertWebpageInfo(WebpageInfo webpage)
    149182    {
    150183    // load the webpages db 'table'
     
    152185    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
    153186
    154     Document document = new Document("id", WEBPAGE_COUNTER)
    155         .append("siteid", site_id)
    156         .append("url", url)
    157         .append("charEncoding", charEncoding)
    158         .append("modTime", modTime)
    159         .append("fetchTime", fetchTime)
    160         .append("isMRI", isMRI)
    161         .append("totalSentences", totalSentences)
    162         .append("numSentencesInMRI", numSentencesInMRI);
    163 
    164     document.put("singleSentences", singleSentences);
    165     document.put("overlappingSentences", overlappingSentences);
     187    Document document = new Document("_id", webpage.webpageID)
     188        .append("siteid", webpage.websiteID)
     189        .append("url", webpage.URL)
     190        .append("isMRI", webpage.isMRI)
     191        .append("totalSentences", webpage.totalSentences)
     192        .append("charEncoding", webpage.charEncoding)
     193        .append("modTime", webpage.modifiedTime)
     194        .append("fetchTime", webpage.fetchTime);
     195
     196    // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT:
     197    //document.put("singleSentences", webpage.singleSentences);
     198    //document.put("overlappingSentences", webpage.overlappingSentences);
     199
     200    // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
     201    // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
     202    List<BasicDBObject> sentencesList = new ArrayList<>();
     203    for(SentenceInfo sentence : webpage.singleSentences) {
     204        sentencesList.add(new BasicDBObject("langCode", sentence.langCode));
     205        sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel));
     206        sentencesList.add(new BasicDBObject("sentence", sentence));
     207    }   
     208    document.put("singleSentences", sentencesList);
     209   
     210    List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
     211    for(SentenceInfo sentence : webpage.overlappingSentences) {
     212        sentencesList.add(new BasicDBObject("langCode", sentence.langCode));
     213        sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel));
     214        sentencesList.add(new BasicDBObject("sentence", sentence));
     215    }   
     216    document.put("singleSentences", overlappingSentencesList);
     217
     218    // also put the full text in there
     219    document.put("text", webpage.text);
    166220   
    167221    collection.insertOne(document);
    168     System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION); 
    169     }
     222    logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION); 
     223    }
     224
     225    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
     226    public void close() {}
    170227   
    171228
     
    184241        MongoDBAccess mongodbCon = new MongoDBAccess();
    185242        mongodbCon.connectToDB();
    186         //mongodbCon.insertDocument();
    187     }catch(Exception e) {
     243        mongodbCon.showCollections();
     244
     245    } catch(Exception e) {
    188246        e.printStackTrace();
    189247    }
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java

    r33633 r33634  
    244244       
    245245        if(text.equals("")) {
    246         page.addMRILanguageStatus(false);
     246        //page.addMRILanguageStatus(false);
    247247        continue;
    248248        }
     
    250250        boolean isMRI = maoriTxtDetector.isTextInMaori(text);
    251251       
    252         page.addMRILanguageStatus(isMRI);
     252        //page.addMRILanguageStatus(isMRI);
    253253       
    254254   
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33623 r33634  
    22
    33import java.io.*;
     4import java.util.ArrayList;
    45import java.util.HashMap;
    56import java.util.Map;
     
    1516   
    1617    private Map<String, String> tuples;
     18
     19    private boolean isMRI = false;
     20
     21    boolean DEBUG_MODE = false;
    1722   
    1823    public TextDumpPage(String siteID, String unparsedPageDump) {
     
    9095                tuples.put(k.trim(), v.trim());
    9196            } else {
    92                 if(NutchTextDumpProcessor.DEBUG_MODE) {
     97                if(DEBUG_MODE) {
    9398                logger.error("No meta key for meta: " + line);
    9499                logger.error(unparsedPageDump);
     
    118123
    119124    public void debugTuples() {
    120     if(NutchTextDumpProcessor.DEBUG_MODE) {
     125    if(DEBUG_MODE) {
    121126        logger.debug("__________________________________________");
    122127        for(Map.Entry<String, String> entry : tuples.entrySet()) {
     
    168173    }
    169174
     175    /**
     176     * IMPORTANT: This method deletes the data stored in this TextDumpPage object
     177     * after converting relevant fields and parameters to a WebpageInfo object
     178     */
     179    public WebpageInfo convertStoredDataToWebpageInfo(
     180      long webpageID, int websiteID, boolean isMRI, int totalSentences,
     181      ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
     182    {
     183    // clear the map, after storing the important (meta)data
     184    String pageText = getPageText();
     185    String pageURL = getPageURL();
     186    String charEncoding = getOriginalCharEncoding();
     187    String modifiedTime = getModifiedTime();
     188    String fetchTime = getFetchTime();
     189
     190    WebpageInfo webpage = new WebpageInfo(webpageID, websiteID,
     191                          pageText, pageURL, isMRI, totalSentences,
     192                          charEncoding, modifiedTime, fetchTime,
     193                          singleSentences, overlappingSentences);
     194   
     195    tuples.clear();
     196
     197    return webpage;
     198    }
     199   
     200
     201
     202    /*
    170203    public void addMRILanguageStatus(boolean status) {
    171204    if(status) {
     
    189222
    190223    }
    191    
     224    */
    192225}
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33633 r33634  
    142142    }
    143143
    144     /** inner class */
    145     public class SentenceInfo {
    146     public final double confidenceLevel;
    147     /** 3 letter lang code */
    148     public final String langCode;
    149     public final String sentence;
    150 
    151     public SentenceInfo(double confidence, String langCode, String sentence) {
    152         confidenceLevel = confidence;
    153         this.langCode = langCode;
    154         this.sentence = sentence;
    155     }
    156     }
    157 
    158144    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
    159145     * into sentences? What if the text in any other language or a mix of languages?
     
    183169        double confidence = bestLanguage.getConfidence();
    184170
    185         sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence));
     171        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
    186172    }
    187173
     
    207193        separator = " ";
    208194        }
    209         sentence = sentence + separator + sentence[i];
     195        sentence = sentence + separator + sentences[i];
    210196       
    211197        //System.err.println(sentence);
     
    214200        double confidence = bestLanguage.getConfidence();
    215201
    216         sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence));
     202        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
    217203    }
    218204
Note: See TracChangeset for help on using the changeset viewer.