1 |
|
---|
2 | package monogram.restorer;
|
---|
3 |
|
---|
4 | import java.io.BufferedReader;
|
---|
5 | import java.io.BufferedWriter;
|
---|
6 | import java.io.File;
|
---|
7 | import java.io.FileInputStream;
|
---|
8 | import java.io.FileOutputStream;
|
---|
9 | import java.io.IOException;
|
---|
10 | import java.io.InputStreamReader;
|
---|
11 | import java.io.OutputStreamWriter;
|
---|
12 | import java.util.regex.Matcher;
|
---|
13 | import java.util.regex.Pattern;
|
---|
14 | import util.IOUtil;
|
---|
15 |
|
---|
16 | /**
|
---|
17 | * @author University of Waikato - Te Whare WÄnanga o Waikato
|
---|
18 | * @version 1.0
|
---|
19 | * @since 2014-11-20
|
---|
20 | */
|
---|
21 | public class TxtRestorer {
|
---|
22 |
|
---|
23 | private static final String OUTPUT_CHARSET_ENCODING = "utf-8";
|
---|
24 | private Pattern pattern;
|
---|
25 |
|
---|
26 | public TxtRestorer() {
|
---|
27 | pattern = Pattern.compile("([a-zA-ZÄ-Å«Ä-Ū0-9]+|\\p{Punct})|(.)", Pattern.DOTALL);
|
---|
28 | }
|
---|
29 |
|
---|
30 | public void restore(File inputFile, String inputCharsetEncoding, File outputFile, boolean preserveMacrons, boolean markupChangedWords) {
|
---|
31 | MonogramRestorer restorer = new MonogramRestorer(preserveMacrons);
|
---|
32 | BufferedReader reader = null;
|
---|
33 | BufferedWriter writer = null;
|
---|
34 |
|
---|
35 | try {
|
---|
36 | reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), inputCharsetEncoding));
|
---|
37 | writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), OUTPUT_CHARSET_ENCODING));
|
---|
38 | String line;
|
---|
39 | while ((line = reader.readLine()) != null) {
|
---|
40 | final Matcher matcher = pattern.matcher(line);
|
---|
41 | while (matcher.find()) {
|
---|
42 | final String token = matcher.group(1);
|
---|
43 | final String nonToken = matcher.group(2);
|
---|
44 | if (token != null) {
|
---|
45 | String restoredToken = restorer.restore(token, markupChangedWords);
|
---|
46 | writer.write(restoredToken);
|
---|
47 | } else {
|
---|
48 | writer.write(nonToken);
|
---|
49 | }
|
---|
50 | }
|
---|
51 | writer.newLine();
|
---|
52 | }
|
---|
53 | } catch (IOException e) {
|
---|
54 | e.printStackTrace();
|
---|
55 | } finally {
|
---|
56 | IOUtil.closeReader(reader);
|
---|
57 | IOUtil.closeWriter(writer);
|
---|
58 | }
|
---|
59 | }
|
---|
60 | }
|
---|