source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java@ 33602

Last change on this file since 33602 was 33602, checked in by ak19, 5 years ago
  1. The final csv file, mri-sentences.csv, is now written out. 2. Only pages that are overall in MRI or contain any MRI sentences get entries in the webpages csv file now. 3. Corrections to ID columns written to the webpages and websites csv files. 4. Some cleanup of unused code.
File size: 1.7 KB
Line 
1package org.greenstone.atea;
2
3
4//import org.apache.log4j.Logger;
5
6
/**
 * Immutable per-webpage record of Māori (MRI) language statistics.
 *
 * Holds the identifiers of a crawled webpage (its site, URL and page index)
 * together with its sentence counts. All fields are public and final; they
 * are assigned once in the constructor and never validated or modified.
 */
public class MRIWebPageStats {
    //private static Logger logger = Logger.getLogger(org.greenstone.atea.MRIWebPageStats.class.getName());

    /** Crawled site's folder name, e.g. 00510. */
    public final String siteID;
    /** URL of the webpage. */
    public final String URL;
    /** Index into the NutchTextDumpProcessor::pages ArrayList. */
    public final int pageID;

    /**
     * MRI flag for the page as supplied by the caller.
     * NOTE(review): presumably true when the page as a whole was detected
     * as Māori -- confirm against the code that constructs these objects.
     */
    public final boolean isMRI;
    /** Count of all sentences in the webpage's body. */
    public final int numSentences;
    /** Count of sentences in the webpage's body that are in Māori (mri). */
    public final int numSentencesInMRI;

    /**
     * Stores the given values verbatim; no argument is checked or copied.
     *
     * @param siteID            crawled site's folder name
     * @param url               URL of the webpage (stored in {@link #URL})
     * @param pageID            index of the page
     * @param isMRI             MRI flag for the page
     * @param numSentences      count of all sentences in the page body
     * @param numSentencesInMRI count of the page body's sentences in MRI
     */
    public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI,
                           int numSentences, int numSentencesInMRI)
    {
        this.siteID = siteID;
        this.URL = url;
        this.pageID = pageID;

        this.isMRI = isMRI;
        this.numSentences = numSentences;
        this.numSentencesInMRI = numSentencesInMRI;
    }

    /**
     * Builds a three-line, human-readable summary: the URL, the site ID and
     * the "MRI sentences / all sentences" ratio. pageID and isMRI are not
     * printed, although isMRI controls the optional trailing note.
     *
     * @return the summary text, lines separated by '\n', no trailing newline
     */
    @Override
    public String toString() {
        StringBuilder str = new StringBuilder();
        str.append("URL: ").append(this.URL);
        str.append("\nsiteID: ").append(this.siteID);
        str.append("\nnum sentences in MRI: ")
           .append(this.numSentencesInMRI)
           .append("/")
           .append(this.numSentences);
        if (this.isMRI && this.numSentencesInMRI <= 0) {
            // This page is flagged as MRI, yet none of its sentences were
            // counted as MRI: point that mismatch out to the reader.
            str.append(" (no PROPER sentences in MRI)");
        }
        return str.toString();
    }

    /* For converting to csv. Unused, kept for reference only.
    public String[] toCSV() {
        String[] csvRecord = { Integer.toString(pageID),
                               siteID, // foreign key
                               URL,
                               Boolean.toString(isMRI),
                               Integer.toString(numSentences),
                               Integer.toString(numSentencesInMRI)
        };

        return csvRecord;
    }
    */
}
Note: See TracBrowser for help on using the repository browser.