Changeset 33623

Show
Ignore:
Timestamp:
05.11.2019 21:04:09 (9 days ago)
Author:
ak19
Message:

1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as both are from 2019 and set around the period we've been crawling. 2. Moved getDomainForURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.

Location:
gs3-extensions/maori-lang-detection
Files:
7 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33621 r33623  
    492492 
    493493 
     494INSTALLING MONGO-DB AND CLIENT 
     495FROM: https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ 
     496    wget -qO - https://www.mongodb.org/static/pgp/server-4.2.asc | sudo apt-key add - 
     497    echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list 
     498    sudo apt-get update 
     499    sudo apt-get install -y mongodb-org 
     500 
     501UNINSTALLING 
     502    https://www.anintegratedworld.com/uninstall-mongodb-in-ubuntu-via-command-line-in-3-easy-steps/ 
    494503 
    495504 
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33615 r33623  
    2323 
    2424 
    25 mongodb.user=admin 
    26 mongodb.pwd=pinky 
    27 #CHANGEME 
    28  
     25mongodb.user=anupama 
     26mongodb.pwd=chang3m3 
     27# default mongodb port is 27017. Don't change the port unless you really have configured 
     28# your mongodb server to listen at some other port 
     29mongodb.port=27017 
     30mongodb.host=mongodb.cms.waikato.ac.nz 
     31mongodb.dbname=ateacrawldata 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33615 r33623  
    237237    } 
    238238 
    239     /** Work out the 'domain' for a given url. 
    240      * This retains any www. or subdomain prefix. 
    241      */ 
    242     public static String getDomainForURL(String url, boolean withProtocol) { 
    243     int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 
    244     startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
    245     // the keep the URL around in case param withProtocol=true 
    246     String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); 
    247      
    248     String domain = url.substring(startIndex); 
    249     int endIndex = domain.indexOf("/"); 
    250     if(endIndex == -1) endIndex = domain.length(); 
    251     domain = domain.substring(0, endIndex); 
    252  
    253     if(withProtocol) { 
    254         // now that we have the domain (everything to the first / when there is no protocol) 
    255         // can glue the protocol back on 
    256         domain = protocol + domain; 
    257     } 
    258      
    259     return domain; 
    260     } 
    261239 
    262240    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ 
     
    304282        // work out domain. This retains any www. or subdomain prefix 
    305283        // passing true to further also retain the http(s) protocol 
    306         domainWithProtocol = getDomainForURL(url, true); 
     284        domainWithProtocol = Utility.getDomainForURL(url, true); 
    307285 
    308286        Set<String> urlsSet; 
     
    316294        } 
    317295 
     296        /* 
    318297        // Dr Nichols said that a url that was located outside the country and 
    319298        // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
     
    322301        // then add that domain (if not already added) and that url into a file 
    323302        // for later manual inspection 
    324         if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    325             /* 
     303        if(!domainWithProtocol.endsWith(".nz") 
     304           && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     305             
    326306            if(!possibleProductDomains.contains(domainWithProtocol)) { 
    327307 
     
    345325            if(!isInNZ) { 
    346326                possibleProductDomains.add(domainWithProtocol); 
    347                 // write both domain and a sample URL on that site out to file 
     327                // write both domain and a sample seedURL on that site out to file 
    348328                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");               
    349329                possibleProductSitesWriter.write("\t" + url + "\n"); 
    350330            } 
    351             }*/ /*else { 
    352             // already wrote out domain to file at some point, write just the URL out to file 
    353             possibleProductSitesWriter.write("\t" + url + "\n"); 
    354             }*/ 
    355         } 
     331            } 
     332            //else { 
     333            // already wrote out domain to file at some point, write just the URL out to file 
     334            //possibleProductSitesWriter.write("\t" + url + "\n"); 
     335            //} 
     336        } 
     337        */ 
    356338        } 
    357339    } catch (IOException ioe) { 
     
    686668    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually, 
    687669    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted 
    688     String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol 
     670    String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol 
    689671    if(!isBlackListed && url.contains("jasmin")) { 
    690672        logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); 
     
    821803    public static void printUsage() { 
    822804    System.err.println("Run this program as:"); 
    823     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");  
     805    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");    
    824806    } 
    825807 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33622 r33623  
    22 
    33 
     4import com.mongodb.client.MongoCollection;  
    45import com.mongodb.client.MongoDatabase;  
    56import com.mongodb.MongoClient;  
    67import com.mongodb.MongoCredential;   
     8 
     9import org.bson.Document; 
    710 
    811import java.io.BufferedReader; 
     
    2326 * TO RUN: 
    2427 *       java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess 
     28 * 
     29 * Manually connecting to mongodb from client: 
     30 *    mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p 
     31 * Then after connecting with pwd, type: 
     32 *    use DBNAME 
     33 * 
     34 * Or connect to mongodb and specify db in one statement: 
     35 *    mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p 
     36 * 
     37 * Some links: 
     38 *   - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection 
     39 *   - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection") 
     40 *   - https://tecadmin.net/tutorial/mongodb/drop-collection/ 
     41 * IMPORTANT LINK: 
     42 *   - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1 
     43 * 
    2544 */ 
    2645public class MongoDBAccess { 
     
    2847    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 
    2948     
    30     final static String HOST = "localhost"; 
    31     final static int PORT = 27017; // mongodb port 
    32     final static String PROPS_FILENAME = "config.properties"; 
    33     final static String DB_NAME = "ateacrawldata"; 
     49    String HOST = "localhost"; 
     50    int PORT = 27017; // mongodb port 
     51    String PROPS_FILENAME = "config.properties"; 
     52    String DB_NAME = "ateacrawldata"; 
    3453     
    3554    private String USERNAME; 
     
    3756 
    3857 
     58    private MongoClient mongo = null; 
     59    private MongoDatabase database = null; 
     60     
    3961    public MongoDBAccess() throws Exception { 
    4062    boolean success = false; 
     
    5577    if(USERNAME.equals("")) { 
    5678        USERNAME = "root"; 
    57         logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + " defaulting to: " + USERNAME); 
     79        logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME); 
    5880            } 
    5981    PASSWORD = props.getProperty("mongodb.pwd"); 
     
    6688        throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);        
    6789    } 
     90 
     91    HOST = props.getProperty("mongodb.host", HOST); 
     92    String port = props.getProperty("mongodb.port", Integer.toString(PORT)); 
     93    PORT = Integer.parseInt(port); 
     94    DB_NAME = props.getProperty("mongodb.dbname", DB_NAME); 
     95 
     96    logger.info("Connecting to mongodb with:"); 
     97    logger.info(" - host:    " + HOST); 
     98    logger.info(" - port:    " + PORT); 
     99    logger.info(" - user:    " + USERNAME); 
     100    logger.info(" - db name: " + DB_NAME);   
    68101    } 
    69102 
    70  
     103    /**  
     104     * Since we have only a single MongoClient, don't need to call close/disconnect on it as per 
     105     * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection 
     106     */ 
    71107    public void connectToDB() throws Exception { 
    72108    // Creating a Mongo client  
    73     MongoClient mongo = new MongoClient( HOST, PORT );  
     109    mongo = new MongoClient( HOST, PORT );  
    74110     
    75111    // Creating Credentials  
     
    79115     
    80116    // Accessing the database  
    81     MongoDatabase database = mongo.getDatabase(DB_NAME);  
    82     //System.out.println("Credentials: "+ credential); 
     117    database = mongo.getDatabase(DB_NAME);  
     118    logger.info("Credentials: "+ credential); 
    83119    } 
    84120     
    85121 
     122    /* 
     123    public void insertDocument() { 
     124    MongoCollection<Document> collection = this.database.getCollection("sampleCollection"); 
     125    } 
     126    */ 
     127 
     128    // create collection (table in RDBMS) websites, create collection webpages 
     129    // webpages collection will have sentences embedded 
     130     
    86131    public static void main(String args[]) { 
    87132    try { 
    88133        MongoDBAccess mongodbCon = new MongoDBAccess(); 
    89         //mongodbCon.connectToDB(); 
     134        mongodbCon.connectToDB(); 
     135        //mongodbCon.insertDocument(); 
    90136    }catch(Exception e) { 
    91137        e.printStackTrace(); 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33615 r33623  
    182182        TextDumpPage firstPage = pages.get(0); 
    183183        String url = firstPage.getPageURL(); 
    184         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 
     184        this.domainOfSite = Utility.getDomainForURL(url, true); 
    185185    } 
    186186    else { 
     
    248248         
    249249        page.addMRILanguageStatus(isMRI); 
    250  
     250         
     251     
    251252        // Even if the entire page is not found to be overall in Māori, 
    252253        // let's still inspect the sentences of the page and count how many (if any) 
     
    281282            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++, 
    282283                          SITE_COUNTER, /* alternative: this.siteID */ 
    283                           url, isMRI, totalSentences, numSentencesInMRI); 
     284                          url, 
     285                          //"origCharEncoding", "modifiedTime", "fetchTime", 
     286                          page.getOriginalCharEncoding(), 
     287                          page.getModifiedTime(), 
     288                          page.getFetchTime(), 
     289                          isMRI, totalSentences, numSentencesInMRI); 
    284290 
    285291            // Write the sentences that are in te reo into the mri-sentences CSV file 
     
    393399           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 
    394400           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 
    395         webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI", 
    396                        "numSentences", "numSentencesInMRI"); 
     401        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", 
     402                       "origCharEncoding", "modifiedTime", "fetchTime", 
     403                       "isMRI", "numSentences", "numSentencesInMRI"); 
    397404        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence"); 
    398405         
     
    435442         
    436443    } catch(Exception e) { 
    437         // can get an exception when instantiating CCWETProcessor instance 
     444        // can get an exception when instantiating NutchTextDumpProcessor instance 
    438445        // or with CSV file 
    439446        logger.error(e.getMessage(), e); 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33615 r33623  
    8484                String k = line.substring(0, endIndex); 
    8585                String v = line.substring(endIndex+1); 
     86                if(k.startsWith("metadata")) { 
     87                k = k.substring("metadata".length()); 
     88                } 
     89                 
    8690                tuples.put(k.trim(), v.trim()); 
    8791            } else { 
     
    134138    } 
    135139 
     140    /* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp 
     141       or encoding he meant, but storing 2 of several timestamps and selecting 
     142       original character encoding (presumably the char encoding of the page) out of 2 
     143       pieces of char encoding metadata to store. */ 
     144    public String getModifiedTime() {    
     145    // is this the webpage's last mod time?  
     146    String time = tuples.get("modifiedTime"); 
     147    time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset 
     148    return time; 
     149    }     
     150    public String getFetchTime() { 
     151    // is this the nutch crawl time 
     152    String time = tuples.get("fetchTime"); 
     153    time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset 
     154    return time; 
     155     
     156    }  
     157    public String getOriginalCharEncoding() { 
     158    // is this the web page's char-encoding? 
     159    return tuples.get("OriginalCharEncoding"); 
     160    } 
     161     
    136162    public String get(String key) { 
    137163    return tuples.get(key); 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33604 r33623  
    8282    } 
    8383     
     84    /** Work out the 'domain' for a given url. 
     85     * This retains any www. or subdomain prefix. 
     86     */ 
     87    public static String getDomainForURL(String url, boolean withProtocol) { 
     88    int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 
     89    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     90    // then keep the protocol around in case param withProtocol=true 
     91    String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); 
     92     
     93    String domain = url.substring(startIndex); 
     94    int endIndex = domain.indexOf("/"); 
     95    if(endIndex == -1) endIndex = domain.length(); 
     96    domain = domain.substring(0, endIndex); 
     97 
     98    if(withProtocol) { 
     99        // now that we have the domain (everything to the first / when there is no protocol) 
     100        // can glue the protocol back on 
     101        domain = protocol + domain; 
     102    } 
     103     
     104    return domain; 
     105    } 
     106     
    84107    public static boolean isDomainInCountry(String domainWithProtocol, 
    85108                        String countryCode, File geoLiteCityDatFile)