Changeset 33965


Ignore:
Timestamp:
2020-02-21T20:59:07+13:00 (4 years ago)
Author:
ak19
Message:
  1. Adding a basicDomain column (stripped of the http/https and www prefixes) for easier sorting in a LibreOffice Calc spreadsheet. 2. Adding a recordCount column to provide a way to restore the original order after re-sorting in LibreOffice Calc.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java

    r33963 r33965  
    641641         CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
    642642         ) {
    643 
     643       
     644       
    644645        int recordCount = 0;
     646
     647        csvWriter.printRecord("origSequence", "basicDomain","pageURL",
     648                  "countryCode","mostlyMRI","qualityLevel",
     649                  "numPagesInMRIForDomainSuffix","totalPagesInMRIForDomainSuffix");
     650
     651       
    645652        for (CSVRecord csvRecord : parser) {       
    646653       
     
    649656            continue;
    650657        }
    651        
    652658        recordCount++;
    653659       
    654660        String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
    655 
     661       
    656662        String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
    657663        String isReallyInMRI = "";
     
    677683
    678684        // Save the CSV record into the tmp file with the 2 counts columns
    679         csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel,
     685        csvWriter.printRecord(recordCount, basicURL,
     686                      url,  countryCode, isReallyInMRI, qualityLevel,
    680687                      countNumPagesInMRI, countTotalPages);     
    681688        }
     
    694701    System.err.println("Usage: ManualURLInspection webPageURLs.txt");
    695702    }
    696 
     703   
     704    //^https?://(www.)?
    697705    /**
    698706     * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
Note: See TracChangeset for help on using the changeset viewer.