source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wiki2xml/php/filter_named_entities.php@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 4.5 KB
Line 
1<?php
/**
 * This file compensates for a bug in PHP 4 and early PHP 5 versions,
 * which do not replace some named entities correctly.
 */

/**
 * Mapping from HTML/XHTML named entity (without the leading '&' and the
 * trailing ';') to its Unicode code point.
 *
 * The order of the entries is kept exactly as in the original table, since
 * any consumer that walks the table sequentially may be order-sensitive.
 */
$html_named_entities_mapping_mine = array (
    // Obtained with:
    // less /usr/share/xml/entities/xhtml/*.ent|grep '^<!ENTITY'|sed -e 's/^<\!ENTITY[ \t]*\([A-Za-z0-9]*\)[ \t]*"&#\([0-9]*\);".*$/"\1"=>\2,/' > /home/dom/data/2005/04/entities-table
    'nbsp'=>160,
    'iexcl'=>161,
    'cent'=>162,
    'pound'=>163,
    'curren'=>164,
    'yen'=>165,
    'brvbar'=>166,
    'sect'=>167,
    'uml'=>168,
    'copy'=>169,
    'ordf'=>170,
    'laquo'=>171,
    'not'=>172,
    'shy'=>173,
    'reg'=>174,
    'macr'=>175,
    'deg'=>176,
    'plusmn'=>177,
    'sup2'=>178,
    'sup3'=>179,
    'acute'=>180,
    'micro'=>181,
    'para'=>182,
    'middot'=>183,
    'cedil'=>184,
    'sup1'=>185,
    'ordm'=>186,
    'raquo'=>187,
    'frac14'=>188,
    'frac12'=>189,
    'frac34'=>190,
    'iquest'=>191,
    'Agrave'=>192,
    'Aacute'=>193,
    'Acirc'=>194,
    'Atilde'=>195,
    'Auml'=>196,
    'Aring'=>197,
    'AElig'=>198,
    'Ccedil'=>199,
    'Egrave'=>200,
    'Eacute'=>201,
    'Ecirc'=>202,
    'Euml'=>203,
    'Igrave'=>204,
    'Iacute'=>205,
    'Icirc'=>206,
    'Iuml'=>207,
    'ETH'=>208,
    'Ntilde'=>209,
    'Ograve'=>210,
    'Oacute'=>211,
    'Ocirc'=>212,
    'Otilde'=>213,
    'Ouml'=>214,
    'times'=>215,
    'Oslash'=>216,
    'Ugrave'=>217,
    'Uacute'=>218,
    'Ucirc'=>219,
    'Uuml'=>220,
    'Yacute'=>221,
    'THORN'=>222,
    'szlig'=>223,
    'agrave'=>224,
    'aacute'=>225,
    'acirc'=>226,
    'atilde'=>227,
    'auml'=>228,
    'aring'=>229,
    'aelig'=>230,
    'ccedil'=>231,
    'egrave'=>232,
    'eacute'=>233,
    'ecirc'=>234,
    'euml'=>235,
    'igrave'=>236,
    'iacute'=>237,
    'icirc'=>238,
    'iuml'=>239,
    'eth'=>240,
    'ntilde'=>241,
    'ograve'=>242,
    'oacute'=>243,
    'ocirc'=>244,
    'otilde'=>245,
    'ouml'=>246,
    'divide'=>247,
    'oslash'=>248,
    'ugrave'=>249,
    'uacute'=>250,
    'ucirc'=>251,
    'uuml'=>252,
    'yacute'=>253,
    'thorn'=>254,
    'yuml'=>255,
    'quot'=>34,
    'amp'=>38,
    'lt'=>60,
    'gt'=>62,
    'OElig'=>338,
    'oelig'=>339,
    'Scaron'=>352,
    'scaron'=>353,
    'Yuml'=>376,
    'circ'=>710,
    'tilde'=>732,
    'ensp'=>8194,
    'emsp'=>8195,
    'thinsp'=>8201,
    'zwnj'=>8204,
    'zwj'=>8205,
    'lrm'=>8206,
    'rlm'=>8207,
    'ndash'=>8211,
    'mdash'=>8212,
    'lsquo'=>8216,
    'rsquo'=>8217,
    'sbquo'=>8218,
    'ldquo'=>8220,
    'rdquo'=>8221,
    'bdquo'=>8222,
    'dagger'=>8224,
    'Dagger'=>8225,
    'permil'=>8240,
    'lsaquo'=>8249,
    'rsaquo'=>8250,
    'euro'=>8364,
    'fnof'=>402,
    'Alpha'=>913,
    'Beta'=>914,
    'Gamma'=>915,
    'Delta'=>916,
    'Epsilon'=>917,
    'Zeta'=>918,
    'Eta'=>919,
    'Theta'=>920,
    'Iota'=>921,
    'Kappa'=>922,
    'Lambda'=>923,
    'Mu'=>924,
    'Nu'=>925,
    'Xi'=>926,
    'Omicron'=>927,
    'Pi'=>928,
    'Rho'=>929,
    'Sigma'=>931,
    'Tau'=>932,
    'Upsilon'=>933,
    'Phi'=>934,
    'Chi'=>935,
    'Psi'=>936,
    'Omega'=>937,
    'alpha'=>945,
    'beta'=>946,
    'gamma'=>947,
    'delta'=>948,
    'epsilon'=>949,
    'zeta'=>950,
    'eta'=>951,
    'theta'=>952,
    'iota'=>953,
    'kappa'=>954,
    'lambda'=>955,
    'mu'=>956,
    'nu'=>957,
    'xi'=>958,
    'omicron'=>959,
    'pi'=>960,
    'rho'=>961,
    'sigmaf'=>962,
    'sigma'=>963,
    'tau'=>964,
    'upsilon'=>965,
    'phi'=>966,
    'chi'=>967,
    'psi'=>968,
    'omega'=>969,
    'thetasym'=>977,
    'upsih'=>978,
    'piv'=>982,
    'bull'=>8226,
    'hellip'=>8230,
    'prime'=>8242,
    'Prime'=>8243,
    'oline'=>8254,
    'frasl'=>8260,
    'weierp'=>8472,
    'image'=>8465,
    'real'=>8476,
    'trade'=>8482,
    'alefsym'=>8501,
    'larr'=>8592,
    'uarr'=>8593,
    'rarr'=>8594,
    'darr'=>8595,
    'harr'=>8596,
    'crarr'=>8629,
    'lArr'=>8656,
    'uArr'=>8657,
    'rArr'=>8658,
    'dArr'=>8659,
    'hArr'=>8660,
    'forall'=>8704,
    'part'=>8706,
    'exist'=>8707,
    'empty'=>8709,
    'nabla'=>8711,
    'isin'=>8712,
    'notin'=>8713,
    'ni'=>8715,
    'prod'=>8719,
    'sum'=>8721,
    'minus'=>8722,
    'lowast'=>8727,
    'radic'=>8730,
    'prop'=>8733,
    'infin'=>8734,
    'ang'=>8736,
    'and'=>8743,
    'or'=>8744,
    'cap'=>8745,
    'cup'=>8746,
    'int'=>8747,
    'there4'=>8756,
    'sim'=>8764,
    'cong'=>8773,
    'asymp'=>8776,
    'ne'=>8800,
    'equiv'=>8801,
    'le'=>8804,
    'ge'=>8805,
    'sub'=>8834,
    'sup'=>8835,
    'nsub'=>8836,
    'sube'=>8838,
    'supe'=>8839,
    'oplus'=>8853,
    'otimes'=>8855,
    'perp'=>8869,
    'sdot'=>8901,
    'lceil'=>8968,
    'rceil'=>8969,
    'lfloor'=>8970,
    'rfloor'=>8971,
    'lang'=>9001,
    'rang'=>9002,
    'loz'=>9674,
    'spades'=>9824,
    'clubs'=>9827,
    'hearts'=>9829,
    'diams'=>9830,
    // Not a named entity: the key '32' maps to code point 32 (space).
    '32'=>32,
);

/**
 * Encode a single Unicode code point as a UTF-8 byte sequence.
 *
 * The code point is not validated beyond the range checks below (for
 * example, surrogate values are encoded like any other 3-byte code point).
 *
 * @param int $code Code point to encode.
 * @return string|null The 1- to 4-byte UTF-8 sequence, or NULL when $code
 *                     is 0x200000 or greater (no branch handles it).
 */
function utf8_chr($code)
{
    if ($code < 0x80) {
        // Single byte (plain ASCII).
        return chr($code);
    }
    if ($code < 0x800) {
        // Two bytes: 110xxxxx 10xxxxxx
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x10000) {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x200000) {
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // Every shift/mask is parenthesised explicitly: '+' binds tighter
        // than '>>' and '&', so an unparenthesised "$code>>18+240" shifts
        // by 258 and "$code&63+128" masks with 191.
        return chr(0xF0 | ($code >> 18))
             . chr(0x80 | (($code >> 12) & 0x3F))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    // Out of range: same result as falling off the end of the function.
    return null;
}

/**
 * Replace HTML named entities in $content with their UTF-8 characters.
 *
 * The entity table is read from the global
 * $html_named_entities_mapping_mine (name => code point); each name is
 * matched as "&name;" and replaced by utf8_chr(code point).
 *
 * All replacements happen in one strtr() pass, so text produced by one
 * replacement is never scanned again. Looping over the table with
 * str_replace() instead would decode "&amp;lt;" first to "&lt;" and then
 * to "<", with the outcome depending on the order of the table.
 *
 * @param string $content Text to filter; modified in place.
 * @return void
 */
function filter_named_entities(&$content) {
    global $html_named_entities_mapping_mine;

    // Build the "&name;" => UTF-8 replacement table.
    $replacements = array();
    if (is_array($html_named_entities_mapping_mine)) {
        foreach ($html_named_entities_mapping_mine as $name => $value) {
            $replacements['&' . $name . ';'] = utf8_chr($value);
        }
    }

    // Skip strtr() for an empty table: older PHP versions return false
    // for an empty replacement array, which would clobber $content.
    if (count($replacements) > 0) {
        $content = strtr($content, $replacements);
    }

    // NOTE(review): the reason for this replacement is not documented;
    // presumably it normalises an accented i to a plain 'i' - confirm.
    $content = str_replace('í', 'i', $content); # Ugly hack
}

281?>
Note: See TracBrowser for help on using the repository browser.