1 | <?php
|
---|
2 |
|
---|
3 | require_once('common.php');
|
---|
// Debug flag; it is not read anywhere in the visible portion of this script.
$debug = 1;

// Some tags inside code blocks get turned back into real markup purely
// because they happen to be valid HTML: <i> is restored while something like
// <Metadata> stays escaped. In the original author's opinion that is wrong,
// but leaving this switched on reproduces it so that before and after
// versions of the XML can be compared. [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 * Converts the set of dokuwiki pages making up one manual (request argument
 * 'm') in one language (request argument 'l') into the XML format expected
 * by the rest of the Greenstone manual generation scripts.
 */

// 0. Initialization

// - next identifier to hand out to a text block that has no explicit id
$text_id_counter = 1;
// - all footnotes collected while exporting
$footnotes = array();

// Defaults: any of these request arguments that is missing or empty falls
// back to the value listed here.
$request_defaults = array(
  'l' => 'en',
  // alternatives for 'm': 'user', 'install', 'paper'
  'm' => 'develop',
  'v' => 'draft',
);
foreach ($request_defaults as $request_key => $default_value)
{
  if (empty($_REQUEST[$request_key]))
  {
    $_REQUEST[$request_key] = $default_value;
  }
}
// - the action has to match the download|store pattern, anything else is
//   replaced by 'store'
$action_is_known = isset($_REQUEST['a']) && preg_match('/^(download|store)$/', $_REQUEST['a']);
if (!$action_is_known)
{
  $_REQUEST['a'] = 'store'; // Try to store the file to disk
}

// Work out where the XML is going to be written
$var_path = filecat(array($base_path, 'var'));
$timestamp = time();
//$xml_source_path = '**PATH TO GSDL MANUALS**'
$xml_source_path = '/tmp';
if ($_REQUEST['a'] == 'download')
{
  // Clear out previous exports
  // NOTE(review): recursiveRemove() is defined elsewhere; the meaning of its
  // second argument is not visible here - confirm before changing the path.
  recursiveRemove($var_path, '/greenstone/greenstone-documentation/php/var');
  // New export goes into a directory named after the current timestamp
  $xml_source_path = fileCat(array($var_path, $timestamp));
}
|
---|
59 |
|
---|
// Start the HTML progress page that is shown while the export runs
echo '<html>' . "\n";
echo '<head>' . "\n";
echo '<title>GS Manual Export</title>' . "\n";
echo '</head>' . "\n";
echo '<body>' . "\n";

// - validate arguments before we use them (security). The D modifier makes
//   the dollar anchor match only at the very end of the subject; without it a
//   value such as "develop\n" passes the whitelist and the newline ends up in
//   file names built from these arguments.
// NOTE(review): printError() is defined elsewhere; this code relies on it
// stopping the script for invalid arguments - confirm.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
// - escaping is a no-op for whitelisted values, but keeps the page safe even
//   if the validation above did not halt the script
echo '<p><b>Manual:</b> ' . htmlspecialchars($_REQUEST['m']) . ' <b>Language:</b> ' . htmlspecialchars($_REQUEST['l']) . "</p>\n<hr/>\n";
|
---|
// 1. Create the XML output file handle
// - construct the path using the information we've been provided as arguments:
//   <source path>/<language>/<Manual>_<language>.xml
$xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
mkAllDir($xml_file_dir);
$xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
// - backup any existing file by renaming it with a .bak suffix
if (file_exists($xml_file_path))
{
  $xml_backup_file_path = $xml_file_path . '.bak';
  if (!rename($xml_file_path, $xml_backup_file_path))
  {
    printError('Failed to rename existing manual file for backup');
  }
}

// - and create a handle to the new file. Everything after this point writes
//   to the handle, so report a failure here instead of letting each later
//   fwrite() fail on an invalid handle.
$xml_out = fopen($xml_file_path, 'w');
if (!$xml_out)
{
  printError('Failed to open manual file for writing');
}
|
---|
96 |
|
---|
// 2. Read in the top level page - this will give configuration data for the
//    manual and cover page, as well as specifying the order for the other
//    pages in the manual
echo "<p><b>Frontmatter:</b><br/>\n";
// - by reading this page we hope to populate an array of metadata (field name
//   mapped to the list of values seen for it), and also extract the sequence
//   of other pages within this manual
$cover_metadata = array();
$pages_in_order = array();
// - we now need to consider if the user has asked for a draft version (i.e.
//   includes the latest version of pages regardless of approval) or if only
//   the approved versions of pages should be included
// - only necessary for english version of manual, as those are the only pages
//   editable
$top_page_path = '';
if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
{
  // - again, we can construct the path to the top level page given the
  //   arguments provided
  $top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
}
else
{
  $top_page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);
}

if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - we're going to open a handle to the file, then read it in line-by-line
//   watching for the lines we are interested in (using pattern matching)
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
// - bulletpoint links of the form "  * [[.:<manual>:<page>|<label>]]". The dot
//   of the relative link is matched literally (an unescaped dot would accept
//   any character there) and the manual name is quoted before being spliced
//   into the pattern.
$contents_link_pattern = '/^\s+\*\s+\[\[\.:' . preg_quote($_REQUEST['m'], '/') . ':(.+?)\|(.*?)\]\]\s*$/';
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - if we are capturing page order, and we encounter something that looks
  //   like a bulletpoint item pointing to a wiki page, then we append the
  //   name of that page to our pages in order array
  if ($in_contents && preg_match($contents_link_pattern, $line, $matches))
  {
    $pages_in_order[] = $matches[1];
  }
  // - metadata is all encoded within dokuwiki tables ("^ Field | value |");
  //   a field may occur on several rows, so values accumulate in a list
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $field = $matches[1];
    $value = $matches[2];
    if (!isset($cover_metadata[$field]))
    {
      $cover_metadata[$field] = array();
    }
    $cover_metadata[$field][] = $value;
  }
  // - watch for the heading 'Contents' to begin extracting page order
  //   information; any other heading means we aren't capturing page order
  //   (anymore)
  elseif (preg_match('/^=+\s(.+)\s=+$/', $line, $matches))
  {
    $in_contents = ($matches[1] == 'Contents');
  }
}
// - fgets() returns false both at end of file and on error; tell them apart
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (!isset($cover_metadata[$required_field]))
  {
    printError('Missing required metadata: ' . $required_field);
  }
}
|
---|
// - with the metadata in hand, emit the XML prologue and the cover page. The
//   recipe is fixed; only the elements that may repeat (SupplementaryText,
//   Text, Comment) need different handling from the rest.
$prologue_lines = array(
  '<?xml version="1.0" encoding="UTF-8"?>',
  '<!DOCTYPE Manual [',
);
if (isset($cover_metadata['ENTITY']))
{
  // - one entity declaration per ENTITY metadata value, tab indented
  foreach ($cover_metadata['ENTITY'] as $entity)
  {
    $prologue_lines[] = "\t" . '<!ENTITY ' . $entity . '>';
  }
}
$prologue_lines[] = ']>';
$prologue_lines[] = '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">';
fwrite($xml_out, implode("\n", $prologue_lines) . "\n");

// - cover page fields in output order; true marks a field that is written
//   with every one of its values, false a field written with its first value
$cover_fields = array(
  'Heading'           => false,
  'Title'             => false,
  'Author'            => false,
  'Affiliation'       => false,
  'SupplementaryText' => true,
  'Text'              => true,
  'Comment'           => true,
  'Version'           => false,
  'Date'              => false,
);
foreach ($cover_fields as $cover_field => $is_repeatable)
{
  if ($is_repeatable)
  {
    outputMetadataMultiple($xml_out, $cover_metadata, $cover_field);
  }
  else
  {
    outputMetadataSingle($xml_out, $cover_metadata, $cover_field);
  }
}

// 3. Process each page listed in the contents of the top level page in order
foreach ($pages_in_order as $page)
{
  processPage($xml_out, $page);
}

// 4. Output our list of footnotes (if any). The array maps footnote text to
//    its id, so the key is the content and the value becomes the id attribute.
if (!empty($footnotes))
{
  fwrite($xml_out, '<FootnoteList>'. "\n");
  foreach ($footnotes as $footnote_text => $footnote_number)
  {
    outputMetadataSingle($xml_out, $footnote_text, 'Footnote', $footnote_number);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Finalize and close the XML output, leaving the file group writable
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);
chmod($xml_file_path, 0664);
|
---|
234 |
|
---|
// 6. Complete!
echo '<p><b>Complete!</b></p>' . "\n<hr/>\n";
if ($_REQUEST['a'] == 'download')
{
  // Zip up the manual files. The archive paths are passed through
  // escapeshellarg() rather than being wrapped in hand-written quotes, so no
  // character in the path can be interpreted by the shell.
  $zip_file = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.zip';
  $zip_path = fileCat(array($xml_source_path, $zip_file));
  $zip_command = 'zip -r ' . escapeshellarg($zip_path) . ' . > /dev/null 2>&1';
  $tgz_file = ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.tgz';
  $tgz_path = fileCat(array($xml_source_path, $tgz_file));
  $tgz_command = 'tar -czf ' . escapeshellarg($tgz_path) . ' * > /dev/null 2>&1';
  // We need to move to the document folder so that archives have sensible
  // paths. Both commands archive the current directory, so they must only run
  // once the directory change has really happened.
  $original_cwd = getcwd();
  if (chdir($xml_file_dir))
  {
    system($zip_command);
    system($tgz_command);
    // Go back
    if ($original_cwd !== false)
    {
      chdir($original_cwd);
    }
    // Links are ready
    echo '<p>Download XML file plus images as: <a href="var/' . $timestamp . '/' . $zip_file . '">ZIP</a> or <a href="var/' . $timestamp . '/' . $tgz_file . '">TGZ</a></p>' . "\n";
  }
  else
  {
    printError('Failed to change to export directory for archiving');
  }
}
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to dokuwiki</p>' . "\n";
echo '</body>' . "\n";
echo '</html>';
exit(0);
|
---|
262 |
|
---|
/**
 * Write a single metadata element, wrapping at most one text block.
 *
 * A progress marker is echoed first. The element is then opened (with an id
 * attribute when $mid is truthy), its value is handed to outputTextBlock()
 * and the element is closed again. If no value is available a notice is
 * echoed and the element is left empty.
 *
 * @param resource     $xml_out  handle the XML is written to
 * @param array|string $metadata either an array mapping field names to lists
 *                               of values (only the first value of $field is
 *                               used), or the value itself
 * @param string       $field    element name, and lookup key for arrays
 * @param mixed        $mid      optional id attribute value
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";

  $id_attribute = $mid ? ' id="' . $mid . '"' : '';
  fwrite($xml_out, '<' . $field . $id_attribute . '>' . "\n");

  // - work out which value (if any) is going to be written
  if (is_array($metadata))
  {
    $has_value = isset($metadata[$field]) && isset($metadata[$field][0]);
    $value = $has_value ? $metadata[$field][0] : null;
  }
  else
  {
    $has_value = !empty($metadata);
    $value = $metadata;
  }

  if ($has_value)
  {
    outputTextBlock($xml_out, $value);
  }
  else
  {
    echo 'no such field or no metadata';
  }

  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/
|
---|
298 |
|
---|
/**
 * Write a metadata element whose values are joined into one text block.
 *
 * A lone value is written as is. Several values are glued together with
 * $separator; when $final_separator is truthy the last value is attached
 * with that string instead. A field that is not set produces an echoed
 * notice and an empty element.
 *
 * @param resource $xml_out         handle the XML is written to
 * @param array    $metadata        field name mapped to a list of values
 * @param string   $field           element name and lookup key
 * @param string   $separator       glue between values
 * @param mixed    $final_separator glue before the last value, or false
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");

  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    $entries = $metadata[$field];
    $entry_count = count($entries);
    if ($entry_count == 1)
    {
      outputTextBlock($xml_out, $entries[0]);
    }
    elseif ($entry_count > 1)
    {
      if ($final_separator)
      {
        // - hold the last value back so it can get its own separator
        $tail = array_pop($entries);
        $joined = implode($separator, $entries) . $final_separator . $tail;
      }
      else
      {
        $joined = implode($separator, $entries);
      }
      outputTextBlock($xml_out, $joined);
    }
  }

  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/
|
---|
331 |
|
---|
/**
 * Write every value of a metadata field as its own text block.
 *
 * The blocks are enclosed in an element named after the field, except for
 * the 'Text' field whose blocks are written without a wrapper. A field that
 * is not set produces an echoed notice.
 *
 * @param resource $xml_out  handle the XML is written to
 * @param array    $metadata field name mapped to a list of values
 * @param string   $field    element name and lookup key
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";

  // - Text blocks don't need to be wrapped in Text element
  $needs_wrapper = ($field != 'Text');

  if ($needs_wrapper)
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }

  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    foreach ($metadata[$field] as $entry)
    {
      outputTextBlock($xml_out, $entry);
    }
  }

  if ($needs_wrapper)
  {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}
|
---|
356 |
|
---|
/**
 * Translate one line of code that sits inside a table.
 *
 * Italic markers hidden in HTML comments (<!--i--> and <!--/i-->) are swapped
 * for placeholders, the line is run through translateText() in code block
 * mode, and the placeholders are finally turned into <i> and </i> tags.
 *
 * @param string $text the raw line
 * @return string the translated line
 */
function translateTableCodeline($text)
{
  // - hide the italic markers so the translation step leaves them alone
  $hidden = str_replace(
    array('<!--i-->', '<!--/i-->'),
    array('%!--i--%', '%!--/i--%'),
    $text
  );
  // - encode entities etc
  $translated = translateText($hidden, true);
  // - bring the italics back as real elements
  return str_replace(
    array('%!--i--%', '%!--/i--%'),
    array('<i>', '</i>'),
    $translated
  );
}
/** translateTableCodeline() **/
|
---|
370 |
|
---|
/**
 * Escape the characters that are special in XML.
 *
 * Ampersands and angle brackets are replaced by their entities (ampersand
 * first, so the entities produced for the brackets are not escaped again).
 * Inside a code block, and only while the global $allow_bad_codeblocks is
 * set, the escaped forms of <i>, </i> and <br/> are then turned back into
 * real tags.
 *
 * @param string $text          the text to escape
 * @param bool   $in_code_block whether the text comes from a code block
 * @return string the escaped text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - each search string must map to its entity; replacing a character with
  //   itself would leave the text unescaped
  $text = str_replace('&','&amp;',$text);
  $text = str_replace('<','&lt;',$text);
  $text = str_replace('>','&gt;',$text);
  if ($in_code_block && $allow_bad_codeblocks)
  {
    // - restore the few tags that are deliberately kept as markup
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
    $text = str_replace('&lt;br/&gt;','<br/>',$text);
  }
  return $text;
}
|
---|
386 |
|
---|
/**
 * Convert one block of dokuwiki text into a <Text> element and write it.
 *
 * Steps, in order: footnotes are replaced by references and recorded in the
 * global $footnotes (keyed by footnote text, so a repeated footnote reuses
 * its id); an explicit "<!-- id:... -->" prefix or the global
 * $text_id_counter supplies the element id; special characters are escaped;
 * links, cross references and URLs are rewritten; italic, bold and underline
 * markup becomes <i>, <b> and <u>; protected sequences are restored.
 *
 * @param resource $xml_out       handle the XML is written to
 * @param string   $text          the dokuwiki text
 * @param string   $type          optional value for the type attribute
 * @param bool     $in_code_block whether the text belongs to a code block
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches))
  {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote that has been seen before keeps its id; handing out a new
    //   one would overwrite the stored id, leaving earlier references
    //   dangling and letting a later footnote receive a duplicate id
    if (isset($footnotes[$footnote]))
    {
      $footnote_id = $footnotes[$footnote];
    }
    else
    {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches))
  {
    $text_id = $matches[1];
    $text = $matches[2];
    // - a numeric explicit id moves the automatic counter past it
    if (is_numeric($text_id))
    {
      $text_id_counter = $text_id + 1;
    }
  }
  else
  {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text
  $text = preg_replace('/<!--([\s\.]+?)-->/','##lt##!--\1--##gt##',$text);

  // - drop every remaining HTML comment
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block)
  {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY']))
    {
      foreach ($cover_metadata['ENTITY'] as $entity)
      {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
        {
          $entity_name = $matches[1];
          if ($entity_name != 'mdash')
          {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            // - undo the escaping of the ampersand for this known entity
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char into
            //   the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code')
  {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internals have to become the horrible <CrossRef> tags. We ignore any
  //   number prefix on the page name as that is just used for ordering within
  //   Dokuwiki
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know what
  //   the page order number for another manual's chapters might be, we instead
  //   use a search. The ampersand may already have been escaped above, so
  //   both forms are accepted.
  $text = preg_replace('/\[\[\?do\=search&(?:amp;)?id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables (angle brackets may be raw or escaped)
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);
  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect // in protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid HTML,
  //   derp).
  if ($in_code_block && $allow_bad_codeblocks)
  {
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
  }
  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '<!--', $text);
  $text = str_replace('--%', '-->', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);
  // output the text block
  $text = trim($text);
  // - compare against the empty string: empty() would also discard a block
  //   whose whole content is "0"
  if ($text === '')
  {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type))
  {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else
  {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/
|
---|
552 |
|
---|
553 | /**
|
---|
554 | */
|
---|
555 | function processPage($xml_out, $page_name)
|
---|
556 | {
|
---|
557 | global $dokuwiki_path;
|
---|
558 | global $seen_ids;
|
---|
559 | echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
|
---|
560 | // - locate the page in question (taking into account if the user asked for a
|
---|
561 | // draft version or an approved version of the manual)
|
---|
562 | $page_path = '';
|
---|
563 | if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
|
---|
564 | {
|
---|
565 | $page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
|
---|
566 | }
|
---|
567 | else
|
---|
568 | {
|
---|
569 | $page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
|
---|
570 | }
|
---|
571 | $page_in = @fopen($page_path, 'r');
|
---|
572 | if (!$page_in)
|
---|
573 | {
|
---|
574 | printError('Failed to open page for reading:' . $page_name, false);
|
---|
575 | return;
|
---|
576 | }
|
---|
577 | // - once again we read in line-by-line, but this time we are going to output
|
---|
578 | // each line as we go through. We expect to encounter certain lines in a
|
---|
579 | // predefined order, and should complain if we don't find what we expect.
|
---|
580 | $in_chapter = false;
|
---|
581 | $in_section = false;
|
---|
582 | $in_subsection = false;
|
---|
583 | $in_part = false;
|
---|
584 | $in_list = false;
|
---|
585 | $lists = array();
|
---|
586 | $previous_listitem_type = '';
|
---|
587 | $in_figure = false;
|
---|
588 | $in_table = false;
|
---|
589 | $column_widths = array();
|
---|
590 | $in_code_block = false;
|
---|
591 | while (($line = fgets($page_in)) !== false)
|
---|
592 | {
|
---|
593 | // remove newline character
|
---|
594 | $line = preg_replace('/\r?\n$/','',$line);
|
---|
595 | // - we need to know the 'depth' for the bulletpoint lists
|
---|
596 | $depth = 0;
|
---|
597 | while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == ' ')
|
---|
598 | {
|
---|
599 | $depth++;
|
---|
600 | $line = substr($line, 2);
|
---|
601 | }
|
---|
602 | $first_character = substr($line, 0, 1);
|
---|
603 | // - special case for the end of bullet lists
|
---|
604 | if ($in_list && ($first_character != "*" && $first_character != "-"))
|
---|
605 | {
|
---|
606 | while (count($lists) > 0)
|
---|
607 | {
|
---|
608 | $list_type = array_pop($lists);
|
---|
609 | if ($list_type == '*')
|
---|
610 | {
|
---|
611 | fwrite($xml_out, '</Bullet>' . "\n");
|
---|
612 | fwrite($xml_out, '</BulletList>' . "\n");
|
---|
613 | }
|
---|
614 | else
|
---|
615 | {
|
---|
616 | fwrite($xml_out, '</NumberedItem>' . "\n");
|
---|
617 | fwrite($xml_out, '</NumberedList>' . "\n");
|
---|
618 | }
|
---|
619 | }
|
---|
620 | $in_list = false;
|
---|
621 | }
|
---|
622 | // - special case for the end of tables
|
---|
623 | if ($in_table && $first_character != '^' && $first_character != '|')
|
---|
624 | {
|
---|
625 | fwrite($xml_out, '</TableContent>' . "\n");
|
---|
626 | fwrite($xml_out, '</Table>' . "\n");
|
---|
627 | $in_table = false;
|
---|
628 | }
|
---|
629 | // - special cases for premature closing of sections, subsections and parts
|
---|
630 | if (preg_match('/<!-- close:(section|subsection|part) -->/', $line, $matches))
|
---|
631 | {
|
---|
632 | // - we always try to do this (regardless of actual flag) as we must
|
---|
633 | // always close the smallest 'granularity' first
|
---|
634 | if ($in_part)
|
---|
635 | {
|
---|
636 | fwrite($xml_out, '</Content>' . "\n");
|
---|
637 | fwrite($xml_out, '</Part>' . "\n");
|
---|
638 | $in_part = false;
|
---|
639 | }
|
---|
640 | if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection'))
|
---|
641 | {
|
---|
642 | fwrite($xml_out, '</Content>' . "\n");
|
---|
643 | fwrite($xml_out, '</Subsection>' . "\n");
|
---|
644 | $in_subsection = false;
|
---|
645 | }
|
---|
646 | if ($in_section && $matches[1] == 'section')
|
---|
647 | {
|
---|
648 | fwrite($xml_out, '</Content>' . "\n");
|
---|
649 | fwrite($xml_out, '</Section>' . "\n");
|
---|
650 | $in_section = false;
|
---|
651 | }
|
---|
652 | }
|
---|
653 |
|
---|
654 | // - if this page is a chapter, then the first thing on the page should be
|
---|
655 | // the chapter title (six equals)
|
---|
656 | if (preg_match('/====== (.+) ======/', $line, $matches))
|
---|
657 | {
|
---|
658 | $chapter_title = $matches[1];
|
---|
659 | $chapter_id = $page_name;
|
---|
660 | if (empty($chapter_id))
|
---|
661 | {
|
---|
662 | $chapter_id = generateID($chapter_title);
|
---|
663 | }
|
---|
664 | // - are we already processing a part? if so end it, end it now
|
---|
665 | if ($in_part)
|
---|
666 | {
|
---|
667 | fwrite($xml_out, '</Content>' . "\n");
|
---|
668 | fwrite($xml_out, '</Part>' . "\n");
|
---|
669 | $in_part = false;
|
---|
670 | }
|
---|
671 | // - are we already processing a subsection? if so end it, end it now
|
---|
672 | if ($in_subsection)
|
---|
673 | {
|
---|
674 | fwrite($xml_out, '</Content>' . "\n");
|
---|
675 | fwrite($xml_out, '</Subsection>' . "\n");
|
---|
676 | $in_subsection = false;
|
---|
677 | }
|
---|
678 | // - are we already processing a section? if so end it, end it now
|
---|
679 | if ($in_section)
|
---|
680 | {
|
---|
681 | fwrite($xml_out, '</Content>' . "\n");
|
---|
682 | fwrite($xml_out, '</Section>' . "\n");
|
---|
683 | $in_section = false;
|
---|
684 | }
|
---|
685 | // - are we already processing a chapter? if so end it, end it now
|
---|
686 | if ($in_chapter)
|
---|
687 | {
|
---|
688 | fwrite($xml_out, '</Content>' . "\n");
|
---|
689 | fwrite($xml_out, '</Chapter>' . "\n");
|
---|
690 | $in_chapter = false;
|
---|
691 | }
|
---|
692 | // - write out this chapter's header
|
---|
693 | fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n");
|
---|
694 | outputMetadataSingle($xml_out, $chapter_title, 'Title');
|
---|
695 | fwrite($xml_out, '<Content>' . "\n");
|
---|
696 | $in_chapter = true;
|
---|
697 | }
|
---|
698 | // - the next likely thing to encounter is a section heading (five equals)
|
---|
699 | elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches))
|
---|
700 | {
|
---|
701 | $section_title = $matches[1];
|
---|
702 | // - check for explicit section id
|
---|
703 | $section_id = '';
|
---|
704 | if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches))
|
---|
705 | {
|
---|
706 | $section_id = $matches[1];
|
---|
707 | $section_title = $matches[2];
|
---|
708 | }
|
---|
709 | if (empty($section_id))
|
---|
710 | {
|
---|
711 | $section_id = generateID($section_title);
|
---|
712 | }
|
---|
713 | // - are we already processing a part? if so end it, end it now
|
---|
714 | if ($in_part)
|
---|
715 | {
|
---|
716 | fwrite($xml_out, '</Content>' . "\n");
|
---|
717 | fwrite($xml_out, '</Part>' . "\n");
|
---|
718 | $in_part = false;
|
---|
719 | }
|
---|
720 | // - are we already processing a subsection? if so end it, end it now
|
---|
721 | if ($in_subsection)
|
---|
722 | {
|
---|
723 | fwrite($xml_out, '</Content>' . "\n");
|
---|
724 | fwrite($xml_out, '</Subsection>' . "\n");
|
---|
725 | $in_subsection = false;
|
---|
726 | }
|
---|
727 | // - are we already processing a section? if so end it, end it now
|
---|
728 | if ($in_section)
|
---|
729 | {
|
---|
730 | fwrite($xml_out, '</Content>' . "\n");
|
---|
731 | fwrite($xml_out, '</Section>' . "\n");
|
---|
732 | $in_section = false;
|
---|
733 | }
|
---|
734 | // - write out this section's header
|
---|
735 | fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n");
|
---|
736 | outputMetadataSingle($xml_out, $section_title, 'Title');
|
---|
737 | fwrite($xml_out, '<Content>' . "\n");
|
---|
738 | $in_section = true;
|
---|
739 | }
|
---|
740 | // - similar for subsection heading (four equals)
|
---|
741 | elseif (preg_match('/==== (.+) ====/', $line, $matches))
|
---|
742 | {
|
---|
743 | $subsection_title = $matches[1];
|
---|
744 | // - check for explicit subsection id
|
---|
745 | $subsection_id = '';
|
---|
746 | if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches))
|
---|
747 | {
|
---|
748 | $subsection_id = $matches[1];
|
---|
749 | $subsection_title = $matches[2];
|
---|
750 | }
|
---|
751 | if (empty($subsection_id))
|
---|
752 | {
|
---|
753 | $subsection_id = generateID($subsection_title);
|
---|
754 | }
|
---|
755 | // - are we already processing a part? if so end it, end it now
|
---|
756 | if ($in_part)
|
---|
757 | {
|
---|
758 | fwrite($xml_out, '</Content>' . "\n");
|
---|
759 | fwrite($xml_out, '</Part>' . "\n");
|
---|
760 | $in_part = false;
|
---|
761 | }
|
---|
762 | // - are we already processing a subsection? if so end it, end it now
|
---|
763 | if ($in_subsection)
|
---|
764 | {
|
---|
765 | fwrite($xml_out, '</Content>' . "\n");
|
---|
766 | fwrite($xml_out, '</Subsection>' . "\n");
|
---|
767 | $in_subsection = false;
|
---|
768 | }
|
---|
769 | // - write out this subsection's header
|
---|
770 | fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n");
|
---|
771 | outputMetadataSingle($xml_out, $subsection_title, 'Title');
|
---|
772 | fwrite($xml_out, '<Content>' . "\n");
|
---|
773 | $in_subsection = true;
|
---|
774 | }
|
---|
775 | // - and part heading (three equals)
|
---|
776 | elseif (preg_match('/=== (.+) ===/', $line, $matches))
|
---|
777 | {
|
---|
778 | $part_title = $matches[1];
|
---|
779 | // - check for explicit part id
|
---|
780 | $part_id = '';
|
---|
781 | if (preg_match('/<!-- sid:(.+?) -->(.*)/', $part_title, $matches))
|
---|
782 | {
|
---|
783 | $part_id = $matches[1];
|
---|
784 | $part_title = $matches[2];
|
---|
785 | }
|
---|
786 | if (empty($part_id))
|
---|
787 | {
|
---|
788 | $part_id = generateID($part_title);
|
---|
789 | }
|
---|
790 | // - are we already processing a part? if so end it, end it now
|
---|
791 | if ($in_part)
|
---|
792 | {
|
---|
793 | fwrite($xml_out, '</Content>' . "\n");
|
---|
794 | fwrite($xml_out, '</Part>' . "\n");
|
---|
795 | $in_part = false;
|
---|
796 | }
|
---|
797 | // - write out this part's header
|
---|
798 | fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n");
|
---|
799 | outputMetadataSingle($xml_out, '**//' . $part_title . '//**', 'Title');
|
---|
800 | fwrite($xml_out, '<Content>' . "\n");
|
---|
801 | $in_part = true;
|
---|
802 | }
|
---|
803 | // - Ignore 5th level heading - they are only used to allow more convenient
|
---|
804 | // editing of figures and tables
|
---|
805 | elseif (preg_match('/== (.+) ==/', $line, $matches))
|
---|
806 | {
|
---|
807 | }
|
---|
808 | // - lists need special handling
|
---|
809 | elseif (preg_match('/^(\*|\-)\s+(.*)/', $line, $matches))
|
---|
810 | {
|
---|
811 | $list_type = $matches[1];
|
---|
812 | $list_text = $matches[2];
|
---|
813 | $list_depth = count($lists);
|
---|
814 | if (!$in_list)
|
---|
815 | {
|
---|
816 | if ($list_type == '*')
|
---|
817 | {
|
---|
818 | fwrite($xml_out, '<BulletList>' . "\n");
|
---|
819 | }
|
---|
820 | else
|
---|
821 | {
|
---|
822 | fwrite($xml_out, '<NumberedList>' . "\n");
|
---|
823 | }
|
---|
824 | $in_list = true;
|
---|
825 | array_push($lists, $list_type);
|
---|
826 | }
|
---|
827 | // - this bullet is at the same depth as previous - close the previous
|
---|
828 | // point
|
---|
829 | elseif ($depth == $list_depth)
|
---|
830 | {
|
---|
831 | $previous_list_type = end($lists);
|
---|
832 | if ($previous_list_type == '*')
|
---|
833 | {
|
---|
834 | fwrite($xml_out, '</Bullet>' . "\n");
|
---|
835 | }
|
---|
836 | else
|
---|
837 | {
|
---|
838 | fwrite($xml_out, '</NumberedItem>' . "\n");
|
---|
839 | }
|
---|
840 | // - we don't match in type anymore... close the previous list and open
|
---|
841 | // a new list of the appropriate type
|
---|
842 | if ($list_type != $previous_list_type)
|
---|
843 | {
|
---|
844 | if ($previous_list_type == '*')
|
---|
845 | {
|
---|
846 | fwrite($xml_out, '</BulletList>' . "\n");
|
---|
847 | fwrite($xml_out, '<NumberedList>' . "\n");
|
---|
848 | }
|
---|
849 | else
|
---|
850 | {
|
---|
851 | fwrite($xml_out, '</NumberedNumbered>' . "\n");
|
---|
852 | fwrite($xml_out, '<BulletList>' . "\n");
|
---|
853 | }
|
---|
854 | array_pop($lists);
|
---|
855 | array_push($lists, $list_type);
|
---|
856 | }
|
---|
857 | }
|
---|
858 | else
|
---|
859 | {
|
---|
860 | // - we have either got deeper...
|
---|
861 | if ($depth > $list_depth)
|
---|
862 | {
|
---|
863 | if ($list_type == '*')
|
---|
864 | {
|
---|
865 | fwrite($xml_out, '<BulletList>' . "\n");
|
---|
866 | }
|
---|
867 | else
|
---|
868 | {
|
---|
869 | fwrite($xml_out, '<NumberedList>' . "\n");
|
---|
870 | }
|
---|
871 | array_push($lists, $list_type);
|
---|
872 | }
|
---|
873 | // ... or shallower in the bullet listing
|
---|
874 | if ($depth < $list_depth)
|
---|
875 | {
|
---|
876 | $previous_list_type = array_pop($lists);
|
---|
877 | if ($previous_list_type == '*')
|
---|
878 | {
|
---|
879 | fwrite($xml_out, '</Bullet>' . "\n");
|
---|
880 | fwrite($xml_out, '</BulletList>' . "\n");
|
---|
881 | }
|
---|
882 | else
|
---|
883 | {
|
---|
884 | fwrite($xml_out, '</NumberedItem>' . "\n");
|
---|
885 | fwrite($xml_out, '</NumberedList>' . "\n");
|
---|
886 | }
|
---|
887 | // - we still have to close the last item too
|
---|
888 | $previous_listitem_type = end($lists);
|
---|
889 | if ($previous_listitem_type == '*')
|
---|
890 | {
|
---|
891 | fwrite($xml_out, '</Bullet>' . "\n");
|
---|
892 | }
|
---|
893 | else
|
---|
894 | {
|
---|
895 | fwrite($xml_out, '</NumberedItem>' . "\n");
|
---|
896 | }
|
---|
897 | }
|
---|
898 | }
|
---|
899 | if ($list_type == '*')
|
---|
900 | {
|
---|
901 | fwrite($xml_out, '<Bullet>' . "\n");
|
---|
902 | }
|
---|
903 | else
|
---|
904 | {
|
---|
905 | fwrite($xml_out, '<NumberedItem>' . "\n");
|
---|
906 | }
|
---|
907 | // Special Case: bullets that contain (start) a code block
|
---|
908 | if (preg_match('/^(.*)<code>\s*$/', $list_text, $matches))
|
---|
909 | {
|
---|
910 | $list_text = $matches[1];
|
---|
911 | $in_code_block = true;
|
---|
912 | }
|
---|
913 |
|
---|
914 | outputTextBlock($xml_out, $list_text);
|
---|
915 |
|
---|
916 | // - to make things clearer, we'll process any and all code blocks within
|
---|
917 | // bullets here - especially as there may be more text block *after*
|
---|
918 | // the code block finishes
|
---|
919 | if ($in_code_block)
|
---|
920 | {
|
---|
921 | $sub_line = '';
|
---|
922 | while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
|
---|
923 | {
|
---|
924 | $sub_line = trim($sub_line);
|
---|
925 | // - closing code
|
---|
926 | if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
|
---|
927 | {
|
---|
928 | $sub_line = $matches[1]; // may be empty string
|
---|
929 | $in_code_block = false;
|
---|
930 | }
|
---|
931 | // - output another plain codeline
|
---|
932 | else
|
---|
933 | {
|
---|
934 | fwrite($xml_out, '<CodeLine>' . $sub_line . "</CodeLine>\n");
|
---|
935 | $sub_line = '';
|
---|
936 | }
|
---|
937 | }
|
---|
938 | // - if sub_line still has anything in it, then add that content as a
|
---|
939 | // text block
|
---|
940 | if (!empty($sub_line))
|
---|
941 | {
|
---|
942 | outputTextBlock($xml_out, $sub_line);
|
---|
943 | }
|
---|
944 | }
|
---|
945 | }
|
---|
946 | // - images start with an image caption 'element'
|
---|
947 | elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\|(.+)>([^<]*?)<\/imgcaption>/', $line, $matches))
|
---|
948 | {
|
---|
949 | $figure_id = $matches[1];
|
---|
950 | $figure_title = $matches[2];
|
---|
951 | $image_content = $matches[3];
|
---|
952 | // - watch for the special withLineNumber flag
|
---|
953 | $class_attribute = '';
|
---|
954 | if (strpos($figure_title, '%!-- withLineNumber --%') != false)
|
---|
955 | {
|
---|
956 | $class_attribute = ' class="withLineNumber"';
|
---|
957 | $figure_title = str_replace('%!-- withLineNumber --%','',$figure_title);
|
---|
958 | }
|
---|
959 | fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n");
|
---|
960 | echo '[figure: ' . $figure_id . "] \n";
|
---|
961 | fwrite($xml_out, '<Title>' . "\n");
|
---|
962 | // - decode any comments in the title (used to store explicit id
|
---|
963 | // information)
|
---|
964 | $figure_title = str_replace('%!--', '<!--', $figure_title);
|
---|
965 | $figure_title = str_replace('--%', '-->', $figure_title);
|
---|
966 | // - special case: the title may have a subtitle (as a prefix)
|
---|
967 | $figure_subtitle_id = '';
|
---|
968 | $figure_subtitle = '';
|
---|
969 | // - subtitle with explicit id
|
---|
970 | if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s*(.*)$/', $figure_title, $matches))
|
---|
971 | {
|
---|
972 | $figure_subtitle = $matches[1];
|
---|
973 | $figure_title = $matches[2];
|
---|
974 | }
|
---|
975 | // - subtitle without explicit id
|
---|
976 | else if (preg_match('/^(\([a-z]\))\s*(.*)$/', $figure_title, $matches))
|
---|
977 | {
|
---|
978 | $figure_subtitle = $matches[1];
|
---|
979 | $figure_title = $matches[2];
|
---|
980 | }
|
---|
981 | outputTextBlock($xml_out, $figure_title);
|
---|
982 | if (!empty($figure_subtitle))
|
---|
983 | {
|
---|
984 | fwrite($xml_out, '<SubTitle>' . "\n");
|
---|
985 | outputTextBlock($xml_out, $figure_subtitle);
|
---|
986 | fwrite($xml_out, '</SubTitle>' . "\n");
|
---|
987 | }
|
---|
988 | fwrite($xml_out, '</Title>' . "\n");
|
---|
989 | // Try and find the image itself
|
---|
990 | if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $image_content))
|
---|
991 | {
|
---|
992 | processImage($xml_out, $line);
|
---|
993 | fwrite($xml_out, '</Figure>' . "\n");
|
---|
994 | }
|
---|
995 | // Didn't find an image? Weird, but mark the imgcaption as open, and
|
---|
996 | // we'll chomp up the next image found as the content.
|
---|
997 | else
|
---|
998 | {
|
---|
999 | $in_figure = true;
|
---|
1000 | }
|
---|
1001 | // - record the id to prevent repeating
|
---|
1002 | $seen_ids[$figure_id] = true;
|
---|
1003 | }
|
---|
1004 | // - tables start with a table caption 'element'
|
---|
1005 | elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\|([^>]+)>\s*<\/tblcaption>/', $line, $matches))
|
---|
1006 | {
|
---|
1007 | $table_id = $matches[1];
|
---|
1008 | $table_title = $matches[2];
|
---|
1009 | if ($table_title == '##NOCAPTION##')
|
---|
1010 | {
|
---|
1011 | echo '[non-captioned table: ' . $table_id . "] \n";
|
---|
1012 | // - watch for autogenerated ids... no point in outputting them
|
---|
1013 | if (preg_match('/^table(_\d+)?$/', $table_id))
|
---|
1014 | {
|
---|
1015 | fwrite($xml_out, "<Table>\n");
|
---|
1016 | }
|
---|
1017 | else
|
---|
1018 | {
|
---|
1019 | fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
|
---|
1020 | }
|
---|
1021 | fwrite($xml_out, '<Title/>' . "\n");
|
---|
1022 | }
|
---|
1023 | elseif ($table_title == '##HIDDEN##')
|
---|
1024 | {
|
---|
1025 | echo '[hidden table: ' . $table_id . "] \n";
|
---|
1026 | // - watch for autogenerated ids... no point in outputting them
|
---|
1027 | if (preg_match('/^table(_\d+)?$/', $table_id))
|
---|
1028 | {
|
---|
1029 | fwrite($xml_out, "<Table class=\"hidden\">\n");
|
---|
1030 | }
|
---|
1031 | else
|
---|
1032 | {
|
---|
1033 | fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n");
|
---|
1034 | }
|
---|
1035 | fwrite($xml_out, '<Title/>' . "\n");
|
---|
1036 | }
|
---|
1037 | else
|
---|
1038 | {
|
---|
1039 | echo '[table: ' . $table_id . "] \n";
|
---|
1040 | // - watch for autogenerated ids... no point in outputting them
|
---|
1041 | if (preg_match('/^table(_\d+)?$/', $table_id))
|
---|
1042 | {
|
---|
1043 | fwrite($xml_out, "<Table>\n");
|
---|
1044 | }
|
---|
1045 | else
|
---|
1046 | {
|
---|
1047 | fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
|
---|
1048 | }
|
---|
1049 | fwrite($xml_out, '<Title>' . "\n");
|
---|
1050 | outputTextBlock($xml_out, $table_title);
|
---|
1051 | fwrite($xml_out, '</Title>' . "\n");
|
---|
1052 | }
|
---|
1053 | fwrite($xml_out, '<TableContent>' . "\n");
|
---|
1054 | $in_table = true;
|
---|
1055 | // - record the id to prevent repeating
|
---|
1056 | $seen_ids[$table_id] = true;
|
---|
1057 | }
|
---|
1058 | // - the second line in a table should be it's column width values
|
---|
1059 | elseif (preg_match('/\|<\s-\s([0-9 ]+?)\s>\|/', $line, $matches))
|
---|
1060 | {
|
---|
1061 | $column_widths = explode(' ', $matches[1]);
|
---|
1062 | }
|
---|
1063 | // - then every row will be made of a number of cells
|
---|
1064 | elseif (preg_match('/^\|(.*?)\|$/', $line, $matches))
|
---|
1065 | {
|
---|
1066 | $row_content = $matches[1];
|
---|
1067 | $cell_contents = preg_split('/(\s+\||\|\s+)/', $row_content);
|
---|
1068 | fwrite($xml_out, '<tr>' . "\n");
|
---|
1069 | foreach ($cell_contents as $index=>$cell_content)
|
---|
1070 | {
|
---|
1071 | $cell_content = trim($cell_content);
|
---|
1072 | $th_text = '';
|
---|
1073 | if (isset($column_widths[$index]))
|
---|
1074 | {
|
---|
1075 | $th_text = '<th width="' . $column_widths[$index] . '"';
|
---|
1076 | }
|
---|
1077 | else
|
---|
1078 | {
|
---|
1079 | $th_text = '<th';
|
---|
1080 | }
|
---|
1081 | // - if the cell would be empty, we use the shorthand
|
---|
1082 | if (empty($cell_content))
|
---|
1083 | {
|
---|
1084 | $th_text .= '/>' . "\n";
|
---|
1085 | fwrite($xml_out, $th_text);
|
---|
1086 | }
|
---|
1087 | else
|
---|
1088 | {
|
---|
1089 | $th_text .= '>' . "\n";
|
---|
1090 | fwrite($xml_out, $th_text);
|
---|
1091 |
|
---|
1092 | // GAH - this is proving harder than a hard thing thats hard.
|
---|
1093 | // The issue is that the most straightforward way of fixing this,
|
---|
1094 | // namely using explicit newlines (\\) in the dokuwiki txt causes
|
---|
1095 | // lots a legitimately translated <br/> to also be split up. I
|
---|
1096 | // think the only way forward would be to maybe extend the HTML
|
---|
1097 | // Comment plugin to also respect and process <br/> tags. Then I
|
---|
1098 | // can avoid transforming them, and use the \\ sentinel to
|
---|
1099 | // separate multi-line table cells.
|
---|
1100 | $cell_content_lines = explode('\\\\', $cell_content);
|
---|
1101 | foreach ($cell_content_lines as $cell_content)
|
---|
1102 | {
|
---|
1103 | // - watch out, as the content may be an image
|
---|
1104 | if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
|
---|
1105 | {
|
---|
1106 | processImage($xml_out, $cell_content);
|
---|
1107 | }
|
---|
1108 | elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
|
---|
1109 | {
|
---|
1110 | fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n");
|
---|
1111 | }
|
---|
1112 | // - anything else it text
|
---|
1113 | else
|
---|
1114 | {
|
---|
1115 | outputTextBlock($xml_out, $cell_content);
|
---|
1116 | }
|
---|
1117 | }
|
---|
1118 | fwrite($xml_out, '</th>' . "\n");
|
---|
1119 | }
|
---|
1120 | }
|
---|
1121 | fwrite($xml_out, '</tr>' . "\n");
|
---|
1122 | }
|
---|
1123 | // - links to image media in the wiki!
|
---|
1124 | elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
|
---|
1125 | {
|
---|
1126 | processImage($xml_out, $line);
|
---|
1127 | // - if we were processing a figure, then now is a good time to close it
|
---|
1128 | if ($in_figure)
|
---|
1129 | {
|
---|
1130 | fwrite($xml_out, '</Figure>' . "\n");
|
---|
1131 | $in_figure = false;
|
---|
1132 | }
|
---|
1133 | }
|
---|
1134 | // - if the line starts with a <code> block, then we have a tag
|
---|
1135 | // for that (which is special in that it get a unique text id)
|
---|
1136 | elseif (preg_match('/^<code\s*\d*\s*>(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
|
---|
1137 | {
|
---|
1138 | $payload = $matches[1];
|
---|
1139 | $found_end = (isset($matches[2]));
|
---|
1140 | $in_code_block = true;
|
---|
1141 | // - be careful with empty lines
|
---|
1142 | if (empty($payload))
|
---|
1143 | {
|
---|
1144 | // - as they may appear in the body of the code (in which case we need
|
---|
1145 | // to output them). The empty lines at the start or end of a code
|
---|
1146 | // block are just an unfortunate consequence of the support for code
|
---|
1147 | // line numbering.
|
---|
1148 | if (!$found_end && strpos($line, '<code') === false)
|
---|
1149 | {
|
---|
1150 | fwrite($xml_out, "<CodeLine/>\n");
|
---|
1151 | }
|
---|
1152 | }
|
---|
1153 | elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches))
|
---|
1154 | {
|
---|
1155 | $text_id = $matches[1];
|
---|
1156 | outputTextBlock($xml_out, $payload, 'code', true);
|
---|
1157 | // - record the id to prevent repeating
|
---|
1158 | $seen_ids[$text_id] = true;
|
---|
1159 | }
|
---|
1160 | else
|
---|
1161 | {
|
---|
1162 | fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n");
|
---|
1163 | }
|
---|
1164 | // - if we didn't find an endtag we have to keep doing code mode until
|
---|
1165 | // we do
|
---|
1166 | $in_code_block = (!$found_end);
|
---|
1167 | if ($found_end)
|
---|
1168 | {
|
---|
1169 | // - if we were processing a figure, then now is a good time to close it
|
---|
1170 | if ($in_figure)
|
---|
1171 | {
|
---|
1172 | fwrite($xml_out, '</Figure>' . "\n");
|
---|
1173 | $in_figure = false;
|
---|
1174 | }
|
---|
1175 | }
|
---|
1176 | }
|
---|
1177 | // - entities on a line by themselves (i.e. references to external files)
|
---|
1178 | // go through verbatim
|
---|
1179 | elseif (preg_match('/^\s*&[a-z0-9_-]+;\s*$/', $line))
|
---|
1180 | {
|
---|
1181 | fwrite($xml_out, $line . "\n");
|
---|
1182 | }
|
---|
1183 | // - lines starting with > are indented text blocks
|
---|
1184 | elseif (preg_match('/^>(.*)$/', $line, $matches))
|
---|
1185 | {
|
---|
1186 | $payload = $matches[1];
|
---|
1187 | fwrite($xml_out, "<Indented>\n");
|
---|
1188 | outputTextBlock($xml_out, $payload);
|
---|
1189 | fwrite($xml_out, "</Indented>\n");
|
---|
1190 | }
|
---|
1191 | // - everything else goes straight through as a text block
|
---|
1192 | // - note that for code blocks, even empty lines count
|
---|
1193 | elseif (!empty($line))
|
---|
1194 | {
|
---|
1195 | // - output the line of text having encoded entities etc
|
---|
1196 | outputTextBlock($xml_out, $line, '', $in_code_block);
|
---|
1197 | }
|
---|
1198 | }
|
---|
1199 | // Complete any open part
|
---|
1200 | if ($in_part)
|
---|
1201 | {
|
---|
1202 | fwrite($xml_out, '</Content>' . "\n");
|
---|
1203 | fwrite($xml_out, '</Part>' . "\n");
|
---|
1204 | $in_part = false;
|
---|
1205 | }
|
---|
1206 | // Complete any open subsection
|
---|
1207 | if ($in_subsection)
|
---|
1208 | {
|
---|
1209 | fwrite($xml_out, '</Content>' . "\n");
|
---|
1210 | fwrite($xml_out, '</Subsection>' . "\n");
|
---|
1211 | $in_subsection = false;
|
---|
1212 | }
|
---|
1213 | // Complete any open section
|
---|
1214 | if ($in_section)
|
---|
1215 | {
|
---|
1216 | fwrite($xml_out, '</Content>' . "\n");
|
---|
1217 | fwrite($xml_out, '</Section>' . "\n");
|
---|
1218 | $in_section = false;
|
---|
1219 | }
|
---|
1220 | // Complete any open chapter
|
---|
1221 | if ($in_chapter)
|
---|
1222 | {
|
---|
1223 | fwrite($xml_out, '</Content>' . "\n");
|
---|
1224 | fwrite($xml_out, '</Chapter>' . "\n");
|
---|
1225 | $in_chapter = false;
|
---|
1226 | }
|
---|
1227 | }
|
---|
1228 | /** processPage($xml_out, $page_name) **/
|
---|
1229 |
|
---|
/**
 * Copy an image referenced by a dokuwiki media link into the XML source tree
 * and write the corresponding <File> element to the output.
 *
 * The text is searched for a media reference of the form
 * {{...name?WIDTHxHEIGHT}} (optionally followed by "&direct"). If no such
 * reference is found, nothing is copied and nothing is written.
 *
 * A failed copy is reported through printError() but does not prevent the
 * <File> element from being written.
 *
 * @param resource $xml_out  stream the <File> element is written to (fwrite)
 * @param string   $text     text that may contain the media reference
 */
function processImage($xml_out, $text)
{
  global $dokuwiki_path;
  global $xml_source_path;
  // NOTE(review): the leading '.+?' must consume at least one character, so a
  // reference with nothing before the filename (e.g. no namespace prefix)
  // would lose the first character of its name. The pattern is kept as-is to
  // stay in step with the identical pre-checks at the call sites - confirm
  // whether such references ever occur.
  if (preg_match('/\{\{.+?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches))
  {
    $filename = $matches[1];
    $width = $matches[2];
    $height = $matches[3];
    // - copy the file into place. The source name is lower-cased while the
    //   destination keeps the case used in the wiki text (presumably because
    //   dokuwiki stores media under lower-case names - TODO confirm)
    $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
    $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
    mkAllDir($image_destination_dir);
    $image_destination_path = $image_destination_dir . '/' . $filename;
    // - check the source exists first so that a missing image is reported
    //   once via printError() rather than also raising a raw PHP warning
    //   from copy()
    if (is_file($image_source_path) && copy($image_source_path, $image_destination_path))
    {
      echo '[copying file: ' . $filename . "] \n";
      chmod($image_destination_path, 0664);
    }
    else
    {
      printError('Failed to copy image into place: ' . $filename, false);
    }
    // - spit out the XML element. The filename comes straight from the wiki
    //   text and may contain characters such as & or " so it must be escaped
    //   before being placed in an attribute value (width and height are
    //   digits only, as enforced by the pattern above)
    $escaped_filename = htmlspecialchars($filename, ENT_QUOTES);
    fwrite($xml_out, '<File width="' . $width . '" height="' . $height . '" url="images/' . $escaped_filename . '"/>' . "\n");
  }
}
---|