Ignore:
Timestamp:
2019-11-08T19:43:39+13:00 (4 years ago)
Author:
ak19
Message:
  1. TextLanguageDetector now has methods for collecting all sentences and all overlapping sentences. 2. Renaming NutchTextDumpProcessor.java to NutchTextDumpToCSV.java, since there is a new class, NutchTextDumpToMongoDB.java, that needs slightly different data structures.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33626 r33633  
    4747    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
    4848   
    49     String HOST = "localhost";
    50     int PORT = 27017; // mongodb port
    51     String PROPS_FILENAME = "config.properties";
    52     String DB_NAME = "ateacrawldata";
     49    static final String PROPS_FILENAME = "config.properties";
     50    public static final String DB_NAME = "anupama"; //"ateacrawldata";
     51    public static final String WEBPAGES_COLLECTION = "webpages";
     52    public static final String WEBSITES_COLLECTION = "websites";   
    5353   
     54    private String HOST = "localhost";
     55    private int PORT = 27017; // mongodb port
    5456    private String USERNAME;
    55     private String PASSWORD;
    56 
     57    private String PASSWORD;   
    5758
    5859    private MongoClient mongo = null;
    5960    private MongoDatabase database = null;
     61   
    6062   
    6163    public MongoDBAccess() throws Exception {
     
    118120    logger.info("Credentials: "+ credential);
    119121    }
     122
    120123   
     124    public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite,
     125           int numPages, int numPagesInMRI, int numPagesContainingMRI,
     126                  /* TODO: String geoLocationCountryCode, boolean miURL */
     127           String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl)
     128    {
     129    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     130    Document document = new Document("id", SITE_COUNTER)
     131        .append("siteFolderName", siteID)
     132        .append("domain", domainOfSite)
     133        .append("totalPages", numPages)
     134        .append("numPagesInMRI", numPagesInMRI)
     135        .append("numPagesContainingMRI", numPagesContainingMRI)
     136        .append("siteCrawledTimestamp", siteCrawledTimestamp)
     137        .append("siteCrawlUnfinished", siteCrawlUnfinished)
     138        .append("redoCrawl", redoCrawl);
     139    collection.insertOne(document);
     140    System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 
     141    }
    121142
    122     /*
    123     public void insertDocument() {
    124     MongoCollection<Document> collection = this.database.getCollection("sampleCollection");
     143   
     144    public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/
     145                  String url, String charEncoding, String modTime, String fetchTime,
     146                  boolean isMRI, int totalSentences, int numSentencesInMRI,
     147                  ArrayList<SentenceInfo> singleSentences,
     148                  ArrayList<SentenceInfo> overlappingSentences)
     149    {
     150    // load the webpages db 'table'
     151    // in mongodb, the equivalent of db tables are called 'collections'
     152    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
     153
     154    Document document = new Document("id", WEBPAGE_COUNTER)
     155        .append("siteid", site_id)
     156        .append("url", url)
     157        .append("charEncoding", charEncoding)
     158        .append("modTime", modTime)
     159        .append("fetchTime", fetchTime)
     160        .append("isMRI", isMRI)
     161        .append("totalSentences", totalSentences)
     162        .append("numSentencesInMRI", numSentencesInMRI);
     163
     164    document.put("singleSentences", singleSentences);
     165    document.put("overlappingSentences", overlappingSentences);
     166   
     167    collection.insertOne(document);
     168    System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION); 
    125169    }
    126     */
     170   
    127171
    128172    // TODO:
Note: See TracChangeset for help on using the changeset viewer.