Ignore:
Timestamp:
2019-11-05T21:04:09+13:00 (4 years ago)
Author:
ak19
Message:
  1. Incorporated Dr Nichols' earlier suggestion of storing the page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as both are from 2019 and set around the period we've been crawling. 2. Moved the getDomainForURL() function from CCWETProcessor.java to Utility.java since it is now reused outside that class. 3. The MongoDBAccess class now connects successfully (at least, without exceptions), using the newly added properties in config.properties to make the connection.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33615 r33623  
    182182        TextDumpPage firstPage = pages.get(0);
    183183        String url = firstPage.getPageURL();
    184         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
     184        this.domainOfSite = Utility.getDomainForURL(url, true);
    185185    }
    186186    else {
     
    248248       
    249249        page.addMRILanguageStatus(isMRI);
    250 
     250       
     251   
    251252        // Even if the entire page is not found to be overall in Māori,
    252253        // let's still inspect the sentences of the page and count how many (if any)
     
    281282            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++,
    282283                          SITE_COUNTER, /* alternative: this.siteID */
    283                           url, isMRI, totalSentences, numSentencesInMRI);
     284                          url,
     285                          //"origCharEncoding", "modifiedTime", "fetchTime",
     286                          page.getOriginalCharEncoding(),
     287                          page.getModifiedTime(),
     288                          page.getFetchTime(),
     289                          isMRI, totalSentences, numSentencesInMRI);
    284290
    285291            // Write the sentences that are in te reo into the mri-sentences CSV file
     
    393399           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
    394400           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
    395         webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
    396                        "numSentences", "numSentencesInMRI");
     401        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
     402                       "origCharEncoding", "modifiedTime", "fetchTime",
     403                       "isMRI", "numSentences", "numSentencesInMRI");
    397404        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
    398405       
     
    435442       
    436443    } catch(Exception e) {
    437         // can get an exception when instantiating CCWETProcessor instance
     444        // can get an exception when instantiating NutchTextDumpProcessor instance
    438445        // or with CSV file
    439446        logger.error(e.getMessage(), e);
Note: See TracChangeset for help on using the changeset viewer.