Changeset 33573 for gs3-extensions
- Timestamp:
- 2019-10-16T21:39:56+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh
r33571 r33573
165 165   echo " $0 -all|<ids>"
166 166   echo " where an id is a folder name in to_crawl/sites"
167     - echo " and ids can be a comma separated list of"
    167 + echo " and ids can be a comma or space separated list of"
168 168   echo " individual ids and/or ranges"
169 169   echo " Examples:"
170     - echo " $0 00008-00022,00025,00026,00028-00034 "
    170 + echo " $0 00008-00022,00025,00026,00028-00034 00050"
171 171   echo " $0 -all"
172 172   echo " $0 00312"
… …
189 189   fi
190 190
191     - # works - split args on comma
    191 + # works - split args on comma or space
192 192   # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
193 193   IFS=', ' read -ra IDS <<< "$args"
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33569 r33573
761 761   // Finally, we can process this WETFile's records into the keep and discard pile
762 762   wetFileCount++;
763     - logger.debug("Off to process " + WETFile);
    763 + debug("Off to process " + WETFile);
764 764   String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
765 765   crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
… …
794 794   public static void error(String msg, Exception e) {
795 795   logger.error(msg, e);
796     - System.err.println( msg);
    796 + System.err.println("\n"+msg);
797 797   e.printStackTrace();
798 798   }
… …
810 810   public boolean accept(File dir, String name) {
811 811   if(name.endsWith(".warc.wet")) {
812     - logger.debug("Will include " + name + " for processing.");
    812 + debug("Will include " + name + " for processing.");
813 813   return true;
814 814   }
… …
818 818   File unzippedVersion = new File(dir, nameWithoutGZext);
819 819   if(unzippedVersion.exists()) {
820     - logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
821     - logger.debug("Skipping " + name);
    820 + debug("--- Unzipped version " + unzippedVersion + " exists.");
    821 + debug("Skipping " + name);
822 822   return false; // don't count gzipped version if unzipped version exists.
823 823   }
824 824   else {
825     - logger.debug("Only zipped version " + name + " exists.");
    825 + debug("Only zipped version " + name + " exists.");
826 826   return true; // No unzipped version, so have to work with gzipped version
827 827   }
… …
829 829
830 830   // we're not even interested in any other file extensions
831     - logger.debug("Not a WET file. Skipping " + name);
    831 + debug("Not a WET file. Skipping " + name);
832 832   return false;
833 833   }
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33552 r33573
 77  77
 78  78   String fileID = inFile.getName();
 79     - // System.err.println("*** Processing wetfile: " + fileID);
     79 + //debug("*** Processing wetfile: " + fileID);
 80  80   fileID = fileID.substring(fileID.lastIndexOf("0")+1);
 81  81   if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
… …
195 195   String recordURI, String record)
196 196   {
197     - System.err.println("CrawlID: CC-MAIN-" + this.crawlID
    197 + info("CrawlID: CC-MAIN-" + this.crawlID
198 198   + " WET #" + this.WETFileID
199 199   + " record #" + recordID
200 200   + " - contentLength: " + contentLength
201 201   + " - lineCount: " + lineCount);
202     - System.err.println("URI: " + recordURI);
203     - // System.err.println(record);
204     - // System.err.println("--------------------------");
    202 + info("URI: " + recordURI);
    203 + //debug(record);
    204 + //info("--------------------------");
205 205
206 206   File parentFolder = null;
… …
215 215   else if(batchProcessor.isGreylisted(recordURI)) {
216 216   parentFolder = batchProcessor.greyListedFolder;
217     - System.err.println("@@@GREYLISTED");
    217 + debug("@@@GREYLISTED");
218 218   }
219 219   else { // url was only blacklisted
220 220   parentFolder = batchProcessor.discardFolder;
221     - System.err.println("@@@DISCARDING - blacklisted");
    221 + debug("@@@DISCARDING - blacklisted");
222 222   }
223 223   }
… …
229 229   else {
230 230   parentFolder = batchProcessor.greyListedFolder;
231     - System.err.println("@@@GREYLISTED");
    231 + debug("@@@GREYLISTED");
232 232   }
233 233   }
… …
274 274   if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
275 275   parentFolder = batchProcessor.discardFolder;
276     - System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
    276 + debug("@@@DISCARDING - CAMELCASED CONTENTS");
277 277   }
278 278   else*/
… …
282 282   if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
283 283   parentFolder = batchProcessor.keepFolder;
284     - System.err.println("@@@KEEPING");
    284 + debug("@@@KEEPING");
285 285   }
286 286   }
… …
289 289   if(parentFolder == null) {
290 290   parentFolder = batchProcessor.discardFolder;
291     - System.err.println("@@@DISCARDING");
    291 + debug("@@@DISCARDING");
292 292   }
293 293
… …
301 301   }
302 302   } catch(Exception e) {
303     - System.err.println("Unable to write URL");
    303 + debug("Unable to write URL");
304 304   e.printStackTrace();
305 305   }
306 306
307     - System.err.println("--------------------------");
    307 + debug("--------------------------");
308 308
309 309   // outFilename will look something like YYYY-##-####
… …
319 319   } catch(IOException ioe) {
320 320   ioe.printStackTrace();
321     - System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
322     - }
    321 + error("@@@@@@@@@ Error writing to file " + outFile, ioe);
    322 + }
    323 + }
    324 +
    325 +
    326 + public void info(String msg) {
    327 + System.err.println(msg);
    328 + logger.info(msg);
    329 + }
    330 + public void debug(String msg) {
    331 + System.err.println(msg);
    332 + logger.debug(msg);
    333 + }
    334 + public void warn(String msg) {
    335 + System.err.println(msg);
    336 + logger.warn(msg);
    337 + }
    338 + public void error(String msg) {
    339 + System.err.println(msg);
    340 + logger.error(msg);
    341 + }
    342 + public void error(String msg, Exception e) {
    343 + logger.error(msg, e);
    344 + System.err.println("\n"+msg);
    345 + e.printStackTrace();
323 346   }
324 347   }
Note:
See TracChangeset
for help on using the changeset viewer.