source: other-projects/the-macronizer/trunk/src/java/monogram/restorer/TxtRestorer.java@ 30062

Last change on this file since 30062 was 30062, checked in by davidb, 9 years ago

Removal/Tidy-up of debug statements

File size: 2.0 KB
Line 
1
2package monogram.restorer;
3
4import java.io.BufferedReader;
5import java.io.BufferedWriter;
6import java.io.File;
7import java.io.FileInputStream;
8import java.io.FileOutputStream;
9import java.io.IOException;
10import java.io.InputStreamReader;
11import java.io.OutputStreamWriter;
12import java.util.regex.Matcher;
13import java.util.regex.Pattern;
14import util.IOUtil;
15
16/**
17 * @author University of Waikato - Te Whare Wānanga o Waikato
18 * @version 1.0
19 * @since 2014-11-20
20 */
21public class TxtRestorer {
22
23 private static final String OUTPUT_CHARSET_ENCODING = "utf-8";
24 private Pattern pattern;
25
26 public TxtRestorer() {
27 pattern = Pattern.compile("([a-zA-Zā-ūĀ-Ū0-9]+|\\p{Punct})|(.)");
28 }
29
30 public void restore(File inputFile, String inputCharsetEncoding, File outputFile, boolean preserveMacrons) {
31 MonogramRestorer restorer = new MonogramRestorer(preserveMacrons);
32 BufferedReader reader = null;
33 BufferedWriter writer = null;
34
35 try {
36 reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), inputCharsetEncoding));
37 writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), OUTPUT_CHARSET_ENCODING));
38 String line;
39 while ((line = reader.readLine()) != null) {
40 final Matcher matcher = pattern.matcher(line);
41 while (matcher.find()) {
42 final String token = matcher.group(1);
43 final String nonToken = matcher.group(2);
44 if (token != null) {
45 String restoredToken = restorer.restore(token);
46 writer.write(restoredToken);
47 } else {
48 writer.write(nonToken);
49 }
50 }
51 writer.newLine();
52 }
53 } catch (IOException e) {
54 e.printStackTrace();
55 } finally {
56 IOUtil.closeReader(reader);
57 IOUtil.closeWriter(writer);
58 }
59 }
60}
Note: See TracBrowser for help on using the repository browser.