source: gs3-extensions/atlas-src/trunk/src/org/greenstone/atlas/server/HTMLParser.java@ 23934

Last change on this file since 23934 was 23934, checked in by sjm84, 13 years ago

Extensive improvements to the ATLAS code

File size: 2.5 KB
Line 
1package org.greenstone.atlas.server;
2
3import java.io.BufferedWriter;
4import java.io.FileWriter;
5import java.util.ArrayList;
6
/**
 * Splits an HTML string into an alternating sequence of "words" (maximal runs
 * of characters for which {@link Character#isLetter(char)} is true) and the
 * text found between those words (markup, whitespace, digits, punctuation).
 *
 * Everything from a '<' up to its matching '>' is treated as between-word
 * text, so letters inside a tag are never reported as words.
 *
 * The whole input is tokenised eagerly by the constructor; the results are
 * available through {@link #getFullHTMLWordList()} and
 * {@link #getFullBetweenWordList()}.
 */
public class HTMLParser
{
    /** The characters of the HTML being parsed. */
    protected char[] _html = null;

    /** Position in {@link #_html} of the next character to be consumed. */
    protected int _index = 0;

    /** Every word found, in document order. */
    ArrayList<String> _fullWordList = new ArrayList<String>();

    /** Every non-empty run of between-word text found, in document order. */
    ArrayList<String> _fullBetweenList = new ArrayList<String>();

    /**
     * Tokenises the given HTML into the word list and the between-word list.
     *
     * Note that when the input starts with a letter there is no leading
     * between-word entry, so the two lists are not necessarily index-aligned.
     *
     * @param html the HTML text to tokenise; must not be null
     */
    public HTMLParser(String html)
    {
        _html = html.toCharArray();

        String other = getNextCharactersBetweenWords();
        String word = getNextWord();
        while (word != null || other != null)
        {
            if (word != null)
            {
                _fullWordList.add(word);
            }

            if (other != null)
            {
                _fullBetweenList.add(other);
            }

            other = getNextCharactersBetweenWords();
            word = getNextWord();
        }
    }

    /**
     * Consumes and returns everything from the current position up to (but not
     * including) the next letter that lies outside a tag.
     *
     * @return the consumed text, or null if the end of the input has already
     *         been reached or the current character starts a word
     */
    public String getNextCharactersBetweenWords()
    {
        if (_index >= _html.length)
        {
            return null;
        }

        StringBuilder chars = new StringBuilder();

        while (_index < _html.length)
        {
            char current = _html[_index];
            if (current == '<')
            {
                chars.append(current);
                // Track nesting so that a '<' inside a tag needs its own '>'.
                int bracketCount = 1;
                // Check the bound BEFORE advancing: an unterminated tag at the
                // end of the input must not read past the end of the array.
                while (bracketCount > 0 && _index + 1 < _html.length)
                {
                    _index++;
                    if (_html[_index] == '>')
                    {
                        bracketCount--;
                    }
                    else if (_html[_index] == '<')
                    {
                        bracketCount++;
                    }
                    chars.append(_html[_index]);
                }
            }
            else if (!Character.isLetter(current))
            {
                chars.append(current);
            }
            else
            {
                // A letter outside a tag marks the start of the next word.
                break;
            }
            // Step past the character (or the last tag character) just consumed.
            _index++;
        }

        if (chars.length() == 0)
        {
            return null;
        }
        return chars.toString();
    }

    /**
     * Consumes and returns the run of letters starting at the current position.
     *
     * @return the word, an empty string if the current character is not a
     *         letter, or null if the end of the input has already been reached
     */
    public String getNextWord()
    {
        if (_index >= _html.length)
        {
            return null;
        }

        StringBuilder newWord = new StringBuilder();

        // The bound check lets a word that runs to the very end of the input
        // terminate normally instead of indexing past the array.
        while (_index < _html.length && Character.isLetter(_html[_index]))
        {
            newWord.append(_html[_index]);
            _index++;
        }
        return newWord.toString();
    }

    /**
     * @return the live list of words found in the HTML, in document order
     */
    public ArrayList<String> getFullHTMLWordList()
    {
        return _fullWordList;
    }

    /**
     * @return the live list of between-word text runs found in the HTML, in
     *         document order
     */
    public ArrayList<String> getFullBetweenWordList()
    {
        return _fullBetweenList;
    }

    /**
     * Returns the given string with everything between '<' and its matching
     * '>' (inclusive) removed. A '>' that is not closing any open '<' is kept.
     *
     * @param htmlString the HTML text to strip; must not be null
     * @return the text that lies outside all tags
     */
    public static String removeTags(String htmlString)
    {
        StringBuilder cleanText = new StringBuilder();

        // Depth of currently open '<' brackets; 0 means we are outside any tag.
        int inTag = 0;
        for (char c : htmlString.toCharArray())
        {
            if (c == '<')
            {
                inTag++;
            }
            else if (c == '>' && inTag > 0)
            {
                inTag--;
            }
            else if (inTag == 0)
            {
                cleanText.append(c);
            }
        }

        return cleanText.toString();
    }
}
Note: See TracBrowser for help on using the repository browser.