package org.greenstone.atea;


//import org.apache.log4j.Logger;


7 | public class MRIWebPageStats {
|
---|
8 | //private static Logger logger = Logger.getLogger(org.greenstone.atea.MRIWebPageStats.class.getName());
|
---|
9 |
|
---|
10 | public final String siteID; // crawled site's folder name e.g. 00510
|
---|
11 | public final String URL; // URL of webpage
|
---|
12 | public final int pageID; // index into NutchTextDumpProcessor::pages ArrayList
|
---|
13 |
|
---|
14 | public final boolean isMRI;
|
---|
15 | public final int numSentences; // count of all sentences in the webpage's body
|
---|
16 | public final int numSentencesInMRI; // count of sentences in the webpage's body in MÄori (mri)
|
---|
17 |
|
---|
18 |
|
---|
19 | public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI,
|
---|
20 | int numSentences, int numSentencesInMRI)
|
---|
21 | {
|
---|
22 | this.siteID = siteID;
|
---|
23 | this.URL = url;
|
---|
24 | this.pageID = pageID;
|
---|
25 |
|
---|
26 | this.isMRI = isMRI;
|
---|
27 | this.numSentences = numSentences;
|
---|
28 | this.numSentencesInMRI = numSentencesInMRI;
|
---|
29 | }
|
---|
30 |
|
---|
31 | public String toString() {
|
---|
32 | StringBuilder str = new StringBuilder();
|
---|
33 | str.append("URL: " + this.URL);
|
---|
34 | str.append("\nsiteID: " + this.siteID);
|
---|
35 | str.append("\nnum sentences in MRI: " + this.numSentencesInMRI+"/"+this.numSentences);
|
---|
36 | if(this.isMRI && this.numSentencesInMRI <= 0) {
|
---|
37 | // one or more pages in the site were MRI, but they didn't contain proper sentences
|
---|
38 | str.append(" (no PROPER sentences in MRI)");
|
---|
39 | }
|
---|
40 | return str.toString();
|
---|
41 | }
|
---|
42 |
|
---|
43 | /** for converting to csv */
|
---|
44 | /*
|
---|
45 | Unused.
|
---|
46 | public String[] toCSV() {
|
---|
47 | String[] csvRecord = { Integer.toString(pageID),
|
---|
48 | siteID, // foreign key
|
---|
49 | URL,
|
---|
50 | Boolean.toString(isMRI),
|
---|
51 | Integer.toString(numSentences),
|
---|
52 | Integer.toString(numSentencesInMRI)
|
---|
53 | };
|
---|
54 |
|
---|
55 | return csvRecord;
|
---|
56 | }
|
---|
57 | */
|
---|
58 | }
|
---|