Changeset 33623 for gs3-extensions


Ignore:
Timestamp:
2019-11-05T21:04:09+13:00 (4 years ago)
Author:
ak19
Message:
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved getDomainForURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
Location:
gs3-extensions/maori-lang-detection
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33621 r33623  
    492492
    493493
     494INSTALLATION MONGO-DB AND CLIENT
     495FROM: https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
     496    wget -qO - https://www.mongodb.org/static/pgp/server-4.2.asc | sudo apt-key add -
     497    echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list
     498    sudo apt-get update
     499    sudo apt-get install -y mongodb-org
     500
     501UNINSTALLING
     502    https://www.anintegratedworld.com/uninstall-mongodb-in-ubuntu-via-command-line-in-3-easy-steps/
    494503
    495504
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33615 r33623  
    2323
    2424
    25 mongodb.user=admin
    26 mongodb.pwd=pinky
    27 #CHANGEME
    28 
     25mongodb.user=anupama
     26mongodb.pwd=chang3m3
     27# default mongodb port is 27017. Don't change the port unless you really have configured
     28# your mongodb server to listen at some other port
     29mongodb.port=27017
     30mongodb.host=mongodb.cms.waikato.ac.nz
     31mongodb.dbname=ateacrawldata
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33615 r33623  
    237237    }
    238238
    239     /** Work out the 'domain' for a given url.
    240      * This retains any www. or subdomain prefix.
    241      */
    242     public static String getDomainForURL(String url, boolean withProtocol) {
    243     int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
    244     startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
    245     // the keep the URL around in case param withProtocol=true
    246     String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
    247    
    248     String domain = url.substring(startIndex);
    249     int endIndex = domain.indexOf("/");
    250     if(endIndex == -1) endIndex = domain.length();
    251     domain = domain.substring(0, endIndex);
    252 
    253     if(withProtocol) {
    254         // now that we have the domain (everything to the first / when there is no protocol)
    255         // can glue the protocol back on
    256         domain = protocol + domain;
    257     }
    258    
    259     return domain;
    260     }
    261239
    262240    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
     
    304282        // work out domain. This retains any www. or subdomain prefix
    305283        // passing true to further also retain the http(s) protocol
    306         domainWithProtocol = getDomainForURL(url, true);
     284        domainWithProtocol = Utility.getDomainForURL(url, true);
    307285
    308286        Set<String> urlsSet;
     
    316294        }
    317295
     296        /*
    318297        // Dr Nichols said that a url that was located outside the country and
    319298        // which had /mi/ URLs was more likely to be an autotranslated (product) site.
     
    322301        // then add that domain (if not already added) and that url into a file
    323302        // for later manual inspection
    324         if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
    325             /*
     303        if(!domainWithProtocol.endsWith(".nz")
     304           && (url.contains("/mi/") || url.endsWith("/mi"))) {
     305           
    326306            if(!possibleProductDomains.contains(domainWithProtocol)) {
    327307
     
    345325            if(!isInNZ) {
    346326                possibleProductDomains.add(domainWithProtocol);
    347                 // write both domain and a sample URL on that site out to file
     327                // write both domain and a sample seedURL on that site out to file
    348328                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
    349329                possibleProductSitesWriter.write("\t" + url + "\n");
    350330            }
    351             }*/ /*else {
    352             // already wrote out domain to file at some point, write just the URL out to file
    353             possibleProductSitesWriter.write("\t" + url + "\n");
    354             }*/
    355         }
     331            }
     332            //else {
     333            // already wrote out domain to file at some point, write just the URL out to file
     334            //possibleProductSitesWriter.write("\t" + url + "\n");
     335            //}
     336        }
     337        */
    356338        }
    357339    } catch (IOException ioe) {
     
    686668    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
    687669    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
    688     String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
     670    String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol
    689671    if(!isBlackListed && url.contains("jasmin")) {
    690672        logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
     
    821803    public static void printUsage() {
    822804    System.err.println("Run this program as:");
    823     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
     805    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");   
    824806    }
    825807
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33622 r33623  
    22
    33
     4import com.mongodb.client.MongoCollection;
    45import com.mongodb.client.MongoDatabase;
    56import com.mongodb.MongoClient;
    67import com.mongodb.MongoCredential; 
     8
     9import org.bson.Document;
    710
    811import java.io.BufferedReader;
     
    2326 * TO RUN:
    2427 *       java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
     28 *
     29 * Manually connecting to mongodb from client:
     30 *    mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
     31 * Then after connecting with pwd, type:
     32 *    use DBNAME
     33 *
     34 * Or connect to mongodb and specify db in one statement:
     35 *    mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
     36 *
     37 * Some links:
     38 *   - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
     39 *   - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
     40 *   - https://tecadmin.net/tutorial/mongodb/drop-collection/
     41 * IMPORTANT LINK:
     42 *   - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
     43 *
    2544 */
    2645public class MongoDBAccess {
     
    2847    private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
    2948   
    30     final static String HOST = "localhost";
    31     final static int PORT = 27017; // mongodb port
    32     final static String PROPS_FILENAME = "config.properties";
    33     final static String DB_NAME = "ateacrawldata";
     49    String HOST = "localhost";
     50    int PORT = 27017; // mongodb port
     51    String PROPS_FILENAME = "config.properties";
     52    String DB_NAME = "ateacrawldata";
    3453   
    3554    private String USERNAME;
     
    3756
    3857
     58    private MongoClient mongo = null;
     59    private MongoDatabase database = null;
     60   
    3961    public MongoDBAccess() throws Exception {
    4062    boolean success = false;
     
    5577    if(USERNAME.equals("")) {
    5678        USERNAME = "root";
    57         logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + " defaulting to: " + USERNAME);
     79        logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
    5880            }
    5981    PASSWORD = props.getProperty("mongodb.pwd");
     
    6688        throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);       
    6789    }
     90
     91    HOST = props.getProperty("mongodb.host", HOST);
     92    String port = props.getProperty("mongodb.port", Integer.toString(PORT));
     93    PORT = Integer.parseInt(port);
     94    DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
     95
     96    logger.info("Connecting to mongodb with:");
     97    logger.info(" - host:    " + HOST);
     98    logger.info(" - port:    " + PORT);
     99    logger.info(" - user:    " + USERNAME);
     100    logger.info(" - db name: " + DB_NAME); 
    68101    }
    69102
    70 
     103    /**
     104     * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
     105     * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
     106     */
    71107    public void connectToDB() throws Exception {
    72108    // Creating a Mongo client
    73     MongoClient mongo = new MongoClient( HOST, PORT );
     109    mongo = new MongoClient( HOST, PORT );
    74110   
    75111    // Creating Credentials
     
    79115   
    80116    // Accessing the database
    81     MongoDatabase database = mongo.getDatabase(DB_NAME);
    82     //System.out.println("Credentials: "+ credential);
     117    database = mongo.getDatabase(DB_NAME);
     118    logger.info("Credentials: "+ credential);
    83119    }
    84120   
    85121
     122    /*
     123    public void insertDocument() {
     124    MongoCollection<Document> collection = this.database.getCollection("sampleCollection");
     125    }
     126    */
     127
     128    // create collection (table in RDBMS) websites, create collection webpages
     129    // webpages collection will have sentences embedded
     130   
    86131    public static void main(String args[]) {
    87132    try {
    88133        MongoDBAccess mongodbCon = new MongoDBAccess();
    89         //mongodbCon.connectToDB();
     134        mongodbCon.connectToDB();
     135        //mongodbCon.insertDocument();
    90136    }catch(Exception e) {
    91137        e.printStackTrace();
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33615 r33623  
    182182        TextDumpPage firstPage = pages.get(0);
    183183        String url = firstPage.getPageURL();
    184         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
     184        this.domainOfSite = Utility.getDomainForURL(url, true);
    185185    }
    186186    else {
     
    248248       
    249249        page.addMRILanguageStatus(isMRI);
    250 
     250       
     251   
    251252        // Even if the entire page is not found to be overall in Māori,
    252253        // let's still inspect the sentences of the page and count how many (if any)
     
    281282            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++,
    282283                          SITE_COUNTER, /* alternative: this.siteID */
    283                           url, isMRI, totalSentences, numSentencesInMRI);
     284                          url,
     285                          //"origCharEncoding", "modifiedTime", "fetchTime",
     286                          page.getOriginalCharEncoding(),
     287                          page.getModifiedTime(),
     288                          page.getFetchTime(),
     289                          isMRI, totalSentences, numSentencesInMRI);
    284290
    285291            // Write the sentences that are in te reo into the mri-sentences CSV file
     
    393399           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
    394400           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
    395         webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
    396                        "numSentences", "numSentencesInMRI");
     401        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
     402                       "origCharEncoding", "modifiedTime", "fetchTime",
     403                       "isMRI", "numSentences", "numSentencesInMRI");
    397404        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
    398405       
     
    435442       
    436443    } catch(Exception e) {
    437         // can get an exception when instantiating CCWETProcessor instance
     444        // can get an exception when instantiating NutchTextDumpProcessor instance
    438445        // or with CSV file
    439446        logger.error(e.getMessage(), e);
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33615 r33623  
    8484                String k = line.substring(0, endIndex);
    8585                String v = line.substring(endIndex+1);
     86                if(k.startsWith("metadata")) {
     87                k = k.substring("metadata".length());
     88                }
     89               
    8690                tuples.put(k.trim(), v.trim());
    8791            } else {
     
    134138    }
    135139
     140    /* Dr Nichols suggested storing timestamp and char encoding. Not sure which timestamp
     141       or encoding he meant, but storing 2 of several timestamps and selecting
     142       original character encoding (presumably the char encoding of the page) out of 2
     143       pieces of char encoding metadata to store. */
     144    public String getModifiedTime() {   
     145    // is this the webpage's last mod time?
     146    String time = tuples.get("modifiedTime");
     147    time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
     148    return time;
     149    }   
     150    public String getFetchTime() {
     151    // is this the nutch crawl time
     152    String time = tuples.get("fetchTime");
     153    time = time.equals("0") ? "" : time; // zero will be assumed to be epoch, rather than unset
     154    return time;
     155   
     156    }
     157    public String getOriginalCharEncoding() {
     158    // is this the web page's char-encoding?
     159    return tuples.get("OriginalCharEncoding");
     160    }
     161   
    136162    public String get(String key) {
    137163    return tuples.get(key);
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33604 r33623  
    8282    }
    8383   
     84    /** Work out the 'domain' for a given url.
     85     * This retains any www. or subdomain prefix.
     86     */
     87    public static String getDomainForURL(String url, boolean withProtocol) {
     88    int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
     89    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
     90    // the keep the URL around in case param withProtocol=true
     91    String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
     92   
     93    String domain = url.substring(startIndex);
     94    int endIndex = domain.indexOf("/");
     95    if(endIndex == -1) endIndex = domain.length();
     96    domain = domain.substring(0, endIndex);
     97
     98    if(withProtocol) {
     99        // now that we have the domain (everything to the first / when there is no protocol)
     100        // can glue the protocol back on
     101        domain = protocol + domain;
     102    }
     103   
     104    return domain;
     105    }
     106   
    84107    public static boolean isDomainInCountry(String domainWithProtocol,
    85108                        String countryCode, File geoLiteCityDatFile)
Note: See TracChangeset for help on using the changeset viewer.