source: documentation/trunk/php/gs-manual-export.php@ 25026

Last change on this file since 25026 was 25026, checked in by jmt12, 11 years ago

PHP scripts for importing and exporting the XML manuals into and out of Dokuwiki

File size: 41.7 KB
Line 
1<?php
2
require_once('common.php');
$debug = 1;

// Code blocks in the existing manuals contain a few escaped tags that were
// 'resolved' back into markup only because they happen to be legal HTML:
// &lt;i&gt; comes out as <i>, whereas &lt;Metadata&gt; stays escaped. Leave
// this flag on to reproduce that (questionable) behaviour, which keeps the
// exported XML comparable with the earlier versions of the XML. [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 * Converts the set of dokuwiki pages forming one manual (request argument
 * 'm') in one language (request argument 'l') into the XML format consumed
 * by the remaining Greenstone manual generation scripts.
 */

// 0. Initialization
$xml_source_path = '/research/jmt12/gsdl-docs/temp';
// - running counter used to number text blocks that carry no explicit id
$text_id_counter = 1;
// - every footnote encountered so far
$footnotes = array();

// Defaults - note that empty() is also true for a key that is not set at all
// - language
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
// - manual (the alternatives are 'user', 'install' and 'paper')
if (empty($_REQUEST['m']))
{
  $_REQUEST['m'] = 'develop';
}
// - version
if (empty($_REQUEST['v']))
{
  $_REQUEST['v'] = 'draft';
}
// - action: anything other than 'download' or 'store' falls back to trying
//   to store the file to disk
if (!(isset($_REQUEST['a']) && preg_match('/^(download|store)$/', $_REQUEST['a'])))
{
  $_REQUEST['a'] = 'store';
}
47
// Open the HTML page that reports the progress of the export
echo '<html>' . "\n";
echo '<head>' . "\n";
echo '<title>GS Manual Export</title>' . "\n";
echo '</head>' . "\n";
echo '<body>' . "\n";

// - validate arguments before we use them (security): both values end up in
//   file paths, regular expressions and the generated XML
if (!preg_match('/^(develop|install|paper|user)$/',$_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/',$_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
// - the values are escaped here as well, matching the error messages above,
//   so that this line stays safe even if printError() were to return rather
//   than halt (its behaviour is defined in another file). For the whitelisted
//   values the escaped output is identical to the raw value.
echo '<p><b>Manual:</b> ' . htmlspecialchars($_REQUEST['m']) . ' <b>Language:</b> ' . htmlspecialchars($_REQUEST['l']) . "</p>\n<hr/>\n";
// 1. Create the XML output file handle
$xml_file_path = '';
if ($_REQUEST['a'] == 'download')
{
  // - downloads are written under 'var' in the base path, which is where the
  //   download link printed at the end of the export points to
  $xml_file_path = fileCat(array($base_path, 'var', ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml'));
}
else
{
  // - construct the path using the information we've been provided as arguments
  $xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
  mkAllDir($xml_file_dir, 0755);
  $xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
  // - backup any existing file
  if (file_exists($xml_file_path))
  {
    $xml_backup_file_path = $xml_file_path . '.bak';
    if (!rename($xml_file_path, $xml_backup_file_path))
    {
      printError('Failed to rename existing manual file for backup');
    }
  }
}
// - and create a handle to the new file
$xml_out = fopen($xml_file_path, 'w');
// - fopen() returns false on failure; without this check every later
//   fwrite()/fclose() would be handed false instead of a stream
if (!$xml_out)
{
  printError('Failed to open manual file for writing');
}
91
// 2. Read in the top level page. It supplies the configuration data for the
//    manual and its cover page, and lists the other pages of the manual in
//    the order in which they are to appear
echo "<p><b>Frontmatter:</b><br/>\n";
// - filled in while reading: the metadata (field => list of values) and the
//   sequence of the other pages within this manual
$cover_metadata = array();
$pages_in_order = array();
// - a draft export takes the latest version of a page regardless of approval,
//   otherwise only the approved version is used. Only the english pages are
//   editable, so every other language always reads the page file directly
$top_page_path = ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
  ? $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt'
  : getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);

if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - the page is read line-by-line, with pattern matching used to pick out
//   the lines we are interested in
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - while capturing page order, a bulletpoint item that links to a wiki
  //   page of this manual contributes that page's name to the ordering
  if ($in_contents && preg_match('/^\s+\*\s+\[\[.:' . $_REQUEST['m'] . ':(.+?)\|(.*?)\]\]\s*$/', $line, $matches))
  {
    $pages_in_order[] = $matches[1];
  }
  // - metadata is encoded in dokuwiki table rows; a field may occur several
  //   times, so the values of each field are collected in a list
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $field = $matches[1];
    $value = $matches[2];
    $cover_metadata[$field][] = $value;
  }
  // - the heading 'Contents' starts the capture of page order information,
  //   any other heading stops it again
  elseif (preg_match('/^=+\s(.+)\s=+$/', $line, $matches))
  {
    $in_contents = ($matches[1] == 'Contents');
  }
}
// - fgets() also returns false on a read error, not only at end of file
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (!isset($cover_metadata[$required_field]))
  {
    printError('Missing required metadata: ' . $required_field);
  }
}
// - the metadata now lets us write the XML header followed by the cover
//   page: first the document type with one declaration per ENTITY value...
fwrite($xml_out, '<?xml version="1.0" encoding="UTF-8"?>' . "\n");
fwrite($xml_out, '<!DOCTYPE Manual [' . "\n");
if (isset($cover_metadata['ENTITY']))
{
  foreach ($cover_metadata['ENTITY'] as $entity)
  {
    fwrite($xml_out, "\t" . '<!ENTITY ' . $entity . '>' . "\n");
  }
}
fwrite($xml_out, ']>' . "\n");
fwrite($xml_out, '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">' . "\n");

// - ...then the cover page fields in their fixed order. Fields that may
//   repeat go through outputMetadataMultiple()
outputMetadataSingle($xml_out, $cover_metadata, 'Heading');
outputMetadataSingle($xml_out, $cover_metadata, 'Title');
outputMetadataSingle($xml_out, $cover_metadata, 'Author');
outputMetadataSingle($xml_out, $cover_metadata, 'Affiliation');
outputMetadataMultiple($xml_out, $cover_metadata, 'SupplementaryText');
outputMetadataMultiple($xml_out, $cover_metadata, 'Text');
outputMetadataMultiple($xml_out, $cover_metadata, 'Comment');
outputMetadataSingle($xml_out, $cover_metadata, 'Version');
outputMetadataSingle($xml_out, $cover_metadata, 'Date');
206
// 3. Export the pages named in the contents of the top level page, in order
foreach ($pages_in_order as $page)
{
  processPage($xml_out, $page);
}

// 4. Output the list of footnotes (if any were collected along the way)
if (count($footnotes) > 0)
{
  fwrite($xml_out, '<FootnoteList>'. "\n");
  // - the array maps each footnote's text to its id
  foreach ($footnotes as $footnote=>$footnote_id)
  {
    outputMetadataSingle($xml_out, $footnote, 'Footnote', $footnote_id);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Finalize and close the XML output
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);

// 6. Complete! Finish the progress page with the follow-up links
echo '<p><b>Complete!</b></p>', "\n<hr/>\n";
if ($_REQUEST['a'] == 'download')
{
  echo '<p>Click <a href="var/', ucfirst($_REQUEST['m']), '_', $_REQUEST['l'], '.xml">here</a> to download XML file</p>', "\n";
}
echo '<p>Click <a href="', $dokuwiki_url, '/doku.php?id=', $_REQUEST['l'], ':manuals:', $_REQUEST['m'], '">here</a> to return to dokuwiki</p>', "\n";
echo '</body>', "\n";
echo '</html>';
exit(0);
239
/**
 * Write one metadata element, holding at most a single text block, to the
 * XML output and echo a progress note.
 *
 * @param resource     $xml_out  handle the XML is written to
 * @param array|string $metadata either an array of field => list of values,
 *                               of which the first value of $field is used,
 *                               or the text itself
 * @param string       $field    name of the element to write (and the key
 *                               looked up when $metadata is an array)
 * @param mixed        $mid      when truthy, written as the element's id
 *                               attribute
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";

  // - opening tag, with the id attribute only when one was supplied
  $id_attribute = $mid ? ' id="' . $mid . '"' : '';
  fwrite($xml_out, '<' . $field . $id_attribute . '>' . "\n");

  // - work out which text, if any, goes inside the element
  $has_text = false;
  $text = '';
  if (is_array($metadata))
  {
    if (isset($metadata[$field][0]))
    {
      $has_text = true;
      $text = $metadata[$field][0];
    }
  }
  elseif (!empty($metadata))
  {
    $has_text = true;
    $text = $metadata;
  }

  if ($has_text)
  {
    outputTextBlock($xml_out, $text);
  }
  else
  {
    echo 'no such field or no metadata';
  }

  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/
275
/**
 * Write one metadata element whose values are joined into a single text
 * block, and echo a progress note.
 *
 * @param resource     $xml_out         handle the XML is written to
 * @param array        $metadata        field => list of values
 * @param string       $field           name of the element / metadata key
 * @param string       $separator       placed between the joined values
 * @param string|false $final_separator when truthy, used instead of
 *                                      $separator in front of the last value
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    $items = $metadata[$field];
    $total = count($items);
    // - a lone value is written as is
    if ($total == 1)
    {
      outputTextBlock($xml_out, $items[0]);
    }
    // - several values are joined first (an empty list writes nothing)
    elseif ($total > 1)
    {
      if ($final_separator)
      {
        // - the last value is split off so it can get its own separator
        $tail = array_pop($items);
        $joined = implode($separator, $items) . $final_separator . $tail;
      }
      else
      {
        $joined = implode($separator, $items);
      }
      outputTextBlock($xml_out, $joined);
    }
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/
308
/**
 * Write every value of a metadata field as its own text block, and echo a
 * progress note. The blocks are wrapped in an element named after the field,
 * except for the field 'Text' whose blocks are written without a wrapper.
 *
 * @param resource $xml_out  handle the XML is written to
 * @param array    $metadata field => list of values
 * @param string   $field    name of the element / metadata key
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";
  // - Text blocks don't need to be wrapped in a Text element
  $needs_wrapper = ($field != 'Text');
  if ($needs_wrapper)
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    foreach ($metadata[$field] as $value)
    {
      outputTextBlock($xml_out, $value);
    }
  }
  if ($needs_wrapper)
  {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}
333
/**
 * Encode a line of code for XML output while turning italic markers that
 * were hidden in HTML comments (<!--i--> and <!--/i-->) into real <i> tags.
 *
 * @param string $text the raw line
 * @return string the encoded line
 */
function translateTableCodeline($text)
{
  // - swap the hidden italic markers for placeholders without angle
  //   brackets, so that the encoding below leaves them alone
  $text = str_replace(array('<!--i-->', '<!--/i-->'), array('%!--i--%', '%!--/i--%'), $text);
  // - encode entities etc (as a code block)
  $text = translateText($text, true);
  // - turn the placeholders into the italics elements
  return str_replace(array('%!--i--%', '%!--/i--%'), array('<i>', '</i>'), $text);
}
/** translateTableCodeline() **/
347
/**
 * Encode the characters &, < and > as XML entities.
 *
 * Inside a code block, and only while the global $allow_bad_codeblocks is
 * set, the tags <i>, </i> and <br/> are turned back into markup afterwards.
 *
 * @param string $text          the raw text
 * @param bool   $in_code_block whether the text belongs to a code block
 * @return string the encoded text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - str_replace() works through the pairs in order, so the ampersand is
  //   encoded before the other two entities introduce new ampersands
  $text = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
  if (!$in_code_block || !$allow_bad_codeblocks)
  {
    return $text;
  }
  // - restore the tags that are (bogusly) decoded in code blocks
  return str_replace(
    array('&lt;i&gt;', '&lt;/i&gt;', '&lt;br/&gt;'),
    array('<i>', '</i>', '<br/>'),
    $text
  );
}
363
/**
 * Convert a piece of dokuwiki text into manual XML and write it to the
 * output as a single <Text> element.
 *
 * The conversion registers footnotes in the global $footnotes array
 * (footnote text => footnote id), assigns the block an id (explicit, from a
 * leading "<!-- id:... -->" comment, or the next value of the global
 * $text_id_counter), escapes XML special characters and translates links,
 * cross references and italic / bold / underline markup. The order of the
 * individual replacements matters, as later steps rely on the placeholders
 * introduced by earlier ones.
 *
 * @param resource $xml_out       handle the XML is written to
 * @param string   $text          the dokuwiki text
 * @param string   $type          when not empty, written as the type
 *                                attribute of the <Text> element
 * @param bool     $in_code_block whether the text belongs to a code block;
 *                                such text is not entity-escaped, apart from
 *                                < and > when $type is 'code'
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches))
  {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote that has been registered before keeps the id it was
    //   given then. Handing out count + 1 again would overwrite the stored
    //   id (orphaning the earlier references) without growing the array, so
    //   the next new footnote would receive that very same id as well
    if (isset($footnotes[$footnote]))
    {
      $footnote_id = $footnotes[$footnote];
    }
    else
    {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    // - (this replaces every occurrence of the footnote within the text)
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches))
  {
    $text_id = $matches[1];
    $text = $matches[2];
    // - a numeric explicit id also moves the counter on, so that the
    //   automatic ids continue after it
    if (is_numeric($text_id))
    {
      $text_id_counter = $text_id + 1;
    }
  }
  else
  {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text (only comments made up of whitespace and dots match)
  $text = preg_replace('/<!--([\s\.]+?)-->/','##lt##!--\1--##gt##',$text);

  // - all remaining HTML comments are dropped
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block)
  {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY']))
    {
      foreach ($cover_metadata['ENTITY'] as $entity)
      {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
        {
          $entity_name = $matches[1];
          if ($entity_name != 'mdash')
          {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char into
            //   the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code')
  {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internals have to become the horrible <CrossRef> tags. We ignore any
  //   number prefix on the page name as that is just used for ordering within
  //   Dokuwiki
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page (the
  //   longest hash prefix has to be tried first)
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know what
  //   the page order number for another manual's chapters might be, we instead
  //   use a search
  $text = preg_replace('/\[\[\?do\=search\&amp;id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);
  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect // in protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid
  //   HTML). Unlike translateText(), <br/> is not restored here
  if ($in_code_block && $allow_bad_codeblocks)
  {
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
  }
  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '&lt;!--', $text);
  $text = str_replace('--%', '--&gt;', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);
  // output the text block
  $text = trim($text);
  // - compare against the empty string rather than using empty(), which
  //   would also discard a block whose entire text is "0"
  if ($text === '')
  {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type))
  {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else
  {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/
529
530/**
531 */
532function processPage($xml_out, $page_name)
533{
534 global $dokuwiki_path;
535 global $seen_ids;
536 echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
537 // - locate the page in question (taking into account if the user asked for a
538 // draft version or an approved version of the manual)
539 $page_path = '';
540 if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
541 {
542 $page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
543 }
544 else
545 {
546 $page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
547 }
548 $page_in = @fopen($page_path, 'r');
549 if (!$page_in)
550 {
551 printError('Failed to open page for reading:' . $page_name, false);
552 return;
553 }
554 // - once again we read in line-by-line, but this time we are going to output
555 // each line as we go through. We expect to encounter certain lines in a
556 // predefined order, and should complain if we don't find what we expect.
557 $in_chapter = false;
558 $in_section = false;
559 $in_subsection = false;
560 $in_part = false;
561 $in_list = false;
562 $lists = array();
563 $previous_listitem_type = '';
564 $in_figure = false;
565 $in_table = false;
566 $column_widths = array();
567 $in_code_block = false;
568 while (($line = fgets($page_in)) !== false)
569 {
570 // remove newline character
571 $line = preg_replace('/\r?\n$/','',$line);
572 // - we need to know the 'depth' for the bulletpoint lists
573 $depth = 0;
574 while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == ' ')
575 {
576 $depth++;
577 $line = substr($line, 2);
578 }
579 $first_character = substr($line, 0, 1);
580 // - special case for the end of bullet lists
581 if ($in_list && ($first_character != "*" && $first_character != "-"))
582 {
583 while (count($lists) > 0)
584 {
585 $list_type = array_pop($lists);
586 if ($list_type == '*')
587 {
588 fwrite($xml_out, '</Bullet>' . "\n");
589 fwrite($xml_out, '</BulletList>' . "\n");
590 }
591 else
592 {
593 fwrite($xml_out, '</NumberedItem>' . "\n");
594 fwrite($xml_out, '</NumberedList>' . "\n");
595 }
596 }
597 $in_list = false;
598 }
599 // - special case for the end of tables
600 if ($in_table && $first_character != '^' && $first_character != '|')
601 {
602 fwrite($xml_out, '</TableContent>' . "\n");
603 fwrite($xml_out, '</Table>' . "\n");
604 $in_table = false;
605 }
606 // - special cases for premature closing of sections, subsections and parts
607 if (preg_match('/<!-- close:(section|subsection|part) -->/', $line, $matches))
608 {
609 // - we always try to do this (regardless of actual flag) as we must
610 // always close the smallest 'granularity' first
611 if ($in_part)
612 {
613 fwrite($xml_out, '</Content>' . "\n");
614 fwrite($xml_out, '</Part>' . "\n");
615 $in_part = false;
616 }
617 if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection'))
618 {
619 fwrite($xml_out, '</Content>' . "\n");
620 fwrite($xml_out, '</Subsection>' . "\n");
621 $in_subsection = false;
622 }
623 if ($in_section && $matches[1] == 'section')
624 {
625 fwrite($xml_out, '</Content>' . "\n");
626 fwrite($xml_out, '</Section>' . "\n");
627 $in_section = false;
628 }
629 }
630
631 // - if this page is a chapter, then the first thing on the page should be
632 // the chapter title (six equals)
633 if (preg_match('/====== (.+) ======/', $line, $matches))
634 {
635 $chapter_title = $matches[1];
636 $chapter_id = $page_name;
637 if (empty($chapter_id))
638 {
639 $chapter_id = generateID($chapter_title);
640 }
641 // - are we already processing a part? if so end it, end it now
642 if ($in_part)
643 {
644 fwrite($xml_out, '</Content>' . "\n");
645 fwrite($xml_out, '</Part>' . "\n");
646 $in_part = false;
647 }
648 // - are we already processing a subsection? if so end it, end it now
649 if ($in_subsection)
650 {
651 fwrite($xml_out, '</Content>' . "\n");
652 fwrite($xml_out, '</Subsection>' . "\n");
653 $in_subsection = false;
654 }
655 // - are we already processing a section? if so end it, end it now
656 if ($in_section)
657 {
658 fwrite($xml_out, '</Content>' . "\n");
659 fwrite($xml_out, '</Section>' . "\n");
660 $in_section = false;
661 }
662 // - are we already processing a chapter? if so end it, end it now
663 if ($in_chapter)
664 {
665 fwrite($xml_out, '</Content>' . "\n");
666 fwrite($xml_out, '</Chapter>' . "\n");
667 $in_chapter = false;
668 }
669 // - write out this chapter's header
670 fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n");
671 outputMetadataSingle($xml_out, $chapter_title, 'Title');
672 fwrite($xml_out, '<Content>' . "\n");
673 $in_chapter = true;
674 }
675 // - the next likely thing to encounter is a section heading (five equals)
676 elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches))
677 {
678 $section_title = $matches[1];
679 // - check for explicit section id
680 $section_id = '';
681 if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches))
682 {
683 $section_id = $matches[1];
684 $section_title = $matches[2];
685 }
686 if (empty($section_id))
687 {
688 $section_id = generateID($section_title);
689 }
690 // - are we already processing a part? if so end it, end it now
691 if ($in_part)
692 {
693 fwrite($xml_out, '</Content>' . "\n");
694 fwrite($xml_out, '</Part>' . "\n");
695 $in_part = false;
696 }
697 // - are we already processing a subsection? if so end it, end it now
698 if ($in_subsection)
699 {
700 fwrite($xml_out, '</Content>' . "\n");
701 fwrite($xml_out, '</Subsection>' . "\n");
702 $in_subsection = false;
703 }
704 // - are we already processing a section? if so end it, end it now
705 if ($in_section)
706 {
707 fwrite($xml_out, '</Content>' . "\n");
708 fwrite($xml_out, '</Section>' . "\n");
709 $in_section = false;
710 }
711 // - write out this section's header
712 fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n");
713 outputMetadataSingle($xml_out, $section_title, 'Title');
714 fwrite($xml_out, '<Content>' . "\n");
715 $in_section = true;
716 }
717 // - similar for subsection heading (four equals)
718 elseif (preg_match('/==== (.+) ====/', $line, $matches))
719 {
720 $subsection_title = $matches[1];
721 // - check for explicit subsection id
722 $subsection_id = '';
723 if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches))
724 {
725 $subsection_id = $matches[1];
726 $subsection_title = $matches[2];
727 }
728 if (empty($subsection_id))
729 {
730 $subsection_id = generateID($subsection_title);
731 }
732 // - are we already processing a part? if so end it, end it now
733 if ($in_part)
734 {
735 fwrite($xml_out, '</Content>' . "\n");
736 fwrite($xml_out, '</Part>' . "\n");
737 $in_part = false;
738 }
739 // - are we already processing a subsection? if so end it, end it now
740 if ($in_subsection)
741 {
742 fwrite($xml_out, '</Content>' . "\n");
743 fwrite($xml_out, '</Subsection>' . "\n");
744 $in_subsection = false;
745 }
746 // - write out this subsection's header
747 fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n");
748 outputMetadataSingle($xml_out, $subsection_title, 'Title');
749 fwrite($xml_out, '<Content>' . "\n");
750 $in_subsection = true;
751 }
752 // - and part heading (three equals)
753 elseif (preg_match('/=== (.+) ===/', $line, $matches))
754 {
755 $part_title = $matches[1];
756 // - check for explicit part id
757 $part_id = '';
758 if (preg_match('/<!-- sid:(.+?) -->(.*)/', $part_title, $matches))
759 {
760 $part_id = $matches[1];
761 $part_title = $matches[2];
762 }
763 if (empty($part_id))
764 {
765 $part_id = generateID($part_title);
766 }
767 // - are we already processing a part? if so end it, end it now
768 if ($in_part)
769 {
770 fwrite($xml_out, '</Content>' . "\n");
771 fwrite($xml_out, '</Part>' . "\n");
772 $in_part = false;
773 }
774 // - write out this part's header
775 fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n");
776 outputMetadataSingle($xml_out, '**//' . $part_title . '//**', 'Title');
777 fwrite($xml_out, '<Content>' . "\n");
778 $in_part = true;
779 }
780 // - Ignore 5th level heading - they are only used to allow more convenient
781 // editing of figures and tables
782 elseif (preg_match('/== (.+) ==/', $line, $matches))
783 {
784 }
785 // - lists need special handling
786 elseif (preg_match('/^(\*|\-)\s+(.*)/', $line, $matches))
787 {
788 $list_type = $matches[1];
789 $list_text = $matches[2];
790 $list_depth = count($lists);
791 if (!$in_list)
792 {
793 if ($list_type == '*')
794 {
795 fwrite($xml_out, '<BulletList>' . "\n");
796 }
797 else
798 {
799 fwrite($xml_out, '<NumberedList>' . "\n");
800 }
801 $in_list = true;
802 array_push($lists, $list_type);
803 }
804 // - this bullet is at the same depth as previous - close the previous
805 // point
806 elseif ($depth == $list_depth)
807 {
808 $previous_list_type = end($lists);
809 if ($previous_list_type == '*')
810 {
811 fwrite($xml_out, '</Bullet>' . "\n");
812 }
813 else
814 {
815 fwrite($xml_out, '</NumberedItem>' . "\n");
816 }
817 // - we don't match in type anymore... close the previous list and open
818 // a new list of the appropriate type
819 if ($list_type != $previous_list_type)
820 {
821 if ($previous_list_type == '*')
822 {
823 fwrite($xml_out, '</BulletList>' . "\n");
824 fwrite($xml_out, '<NumberedList>' . "\n");
825 }
826 else
827 {
828 fwrite($xml_out, '</NumberedNumbered>' . "\n");
829 fwrite($xml_out, '<BulletList>' . "\n");
830 }
831 array_pop($lists);
832 array_push($lists, $list_type);
833 }
834 }
835 else
836 {
837 // - we have either got deeper...
838 if ($depth > $list_depth)
839 {
840 if ($list_type == '*')
841 {
842 fwrite($xml_out, '<BulletList>' . "\n");
843 }
844 else
845 {
846 fwrite($xml_out, '<NumberedList>' . "\n");
847 }
848 array_push($lists, $list_type);
849 }
850 // ... or shallower in the bullet listing
851 if ($depth < $list_depth)
852 {
853 $previous_list_type = array_pop($lists);
854 if ($previous_list_type == '*')
855 {
856 fwrite($xml_out, '</Bullet>' . "\n");
857 fwrite($xml_out, '</BulletList>' . "\n");
858 }
859 else
860 {
861 fwrite($xml_out, '</NumberedItem>' . "\n");
862 fwrite($xml_out, '</NumberedList>' . "\n");
863 }
864 // - we still have to close the last item too
865 $previous_listitem_type = end($lists);
866 if ($previous_listitem_type == '*')
867 {
868 fwrite($xml_out, '</Bullet>' . "\n");
869 }
870 else
871 {
872 fwrite($xml_out, '</NumberedItem>' . "\n");
873 }
874 }
875 }
876 if ($list_type == '*')
877 {
878 fwrite($xml_out, '<Bullet>' . "\n");
879 }
880 else
881 {
882 fwrite($xml_out, '<NumberedItem>' . "\n");
883 }
884 // Special Case: bullets that contain (start) a code block
885 if (preg_match('/^(.*)<code>\s*$/', $list_text, $matches))
886 {
887 $list_text = $matches[1];
888 $in_code_block = true;
889 }
890
891 outputTextBlock($xml_out, $list_text);
892
893 // - to make things clearer, we'll process any and all code blocks within
894 // bullets here - especially as there may be more text block *after*
895 // the code block finishes
896 if ($in_code_block)
897 {
898 $sub_line = '';
899 while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
900 {
901 $sub_line = trim($sub_line);
902 // - closing code
903 if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
904 {
905 $sub_line = $matches[1]; // may be empty string
906 $in_code_block = false;
907 }
908 // - output another plain codeline
909 else
910 {
911 fwrite($xml_out, '<CodeLine>' . $sub_line . "</CodeLine>\n");
912 $sub_line = '';
913 }
914 }
915 // - if sub_line still has anything in it, then add that content as a
916 // text block
917 if (!empty($sub_line))
918 {
919 outputTextBlock($xml_out, $sub_line);
920 }
921 }
922 }
923 // - images start with an image caption 'element'
924 elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\|(.+)>\s*<\/imgcaption>/', $line, $matches))
925 {
926 $figure_id = $matches[1];
927 $figure_title = $matches[2];
928 // - watch for the special withLineNumber flag
929 $class_attribute = '';
930 if (strpos($figure_title, '%!-- withLineNumber --%') != false)
931 {
932 $class_attribute = ' class="withLineNumber"';
933 $figure_title = str_replace('%!-- withLineNumber --%','',$figure_title);
934 }
935 fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n");
936 echo '[figure: ' . $figure_id . "] \n";
937 fwrite($xml_out, '<Title>' . "\n");
938 // - decode any comments in the title (used to store explicit id
939 // information)
940 $figure_title = str_replace('%!--', '<!--', $figure_title);
941 $figure_title = str_replace('--%', '-->', $figure_title);
942 // - special case: the title may have a subtitle (as a prefix)
943 $figure_subtitle_id = '';
944 $figure_subtitle = '';
945 // - subtitle with explicit id
946 if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s*(.*)$/', $figure_title, $matches))
947 {
948 $figure_subtitle = $matches[1];
949 $figure_title = $matches[2];
950 }
951 // - subtitle without explicit id
952 else if (preg_match('/^(\([a-z]\))\s*(.*)$/', $figure_title, $matches))
953 {
954 $figure_subtitle = $matches[1];
955 $figure_title = $matches[2];
956 }
957 outputTextBlock($xml_out, $figure_title);
958 if (!empty($figure_subtitle))
959 {
960 fwrite($xml_out, '<SubTitle>' . "\n");
961 outputTextBlock($xml_out, $figure_subtitle);
962 fwrite($xml_out, '</SubTitle>' . "\n");
963 }
964 fwrite($xml_out, '</Title>' . "\n");
965 $in_figure = true;
966 // - record the id to prevent repeating
967 $seen_ids[$figure_id] = true;
968 }
969 // - tables start with a table caption 'element'
970 elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\|([^>]+)>\s*<\/tblcaption>/', $line, $matches))
971 {
972 $table_id = $matches[1];
973 $table_title = $matches[2];
974 if ($table_title == '##NOCAPTION##')
975 {
976 echo '[non-captioned table: ' . $table_id . "] \n";
977 // - watch for autogenerated ids... no point in outputting them
978 if (preg_match('/^table(_\d+)?$/', $table_id))
979 {
980 fwrite($xml_out, "<Table>\n");
981 }
982 else
983 {
984 fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
985 }
986 fwrite($xml_out, '<Title/>' . "\n");
987 }
988 elseif ($table_title == '##HIDDEN##')
989 {
990 echo '[hidden table: ' . $table_id . "] \n";
991 // - watch for autogenerated ids... no point in outputting them
992 if (preg_match('/^table(_\d+)?$/', $table_id))
993 {
994 fwrite($xml_out, "<Table class=\"hidden\">\n");
995 }
996 else
997 {
998 fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n");
999 }
1000 fwrite($xml_out, '<Title/>' . "\n");
1001 }
1002 else
1003 {
1004 echo '[table: ' . $table_id . "] \n";
1005 // - watch for autogenerated ids... no point in outputting them
1006 if (preg_match('/^table(_\d+)?$/', $table_id))
1007 {
1008 fwrite($xml_out, "<Table>\n");
1009 }
1010 else
1011 {
1012 fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
1013 }
1014 fwrite($xml_out, '<Title>' . "\n");
1015 outputTextBlock($xml_out, $table_title);
1016 fwrite($xml_out, '</Title>' . "\n");
1017 }
1018 fwrite($xml_out, '<TableContent>' . "\n");
1019 $in_table = true;
1020 // - record the id to prevent repeating
1021 $seen_ids[$table_id] = true;
1022 }
1023 // - the second line in a table should be its column width values
1024 elseif (preg_match('/\|<\s-\s([0-9 ]+?)\s>\|/', $line, $matches))
1025 {
1026 $column_widths = explode(' ', $matches[1]);
1027 }
1028 // - then every row will be made of a number of cells
1029 elseif (preg_match('/^\|(.*?)\|$/', $line, $matches))
1030 {
1031 $row_content = $matches[1];
1032 $cell_contents = preg_split('/(\s+\||\|\s+)/', $row_content);
1033 fwrite($xml_out, '<tr>' . "\n");
1034 foreach ($cell_contents as $index=>$cell_content)
1035 {
1036 $cell_content = trim($cell_content);
1037 $th_text = '';
1038 if (isset($column_widths[$index]))
1039 {
1040 $th_text = '<th width="' . $column_widths[$index] . '"';
1041 }
1042 else
1043 {
1044 $th_text = '<th';
1045 }
1046 // - if the cell would be empty, we use the shorthand
1047 if (empty($cell_content))
1048 {
1049 $th_text .= '/>' . "\n";
1050 fwrite($xml_out, $th_text);
1051 }
1052 else
1053 {
1054 $th_text .= '>' . "\n";
1055 fwrite($xml_out, $th_text);
1056
1057 // GAH - this is proving harder than a hard thing thats hard.
1058 // The issue is that the most straightforward way of fixing this,
1059 // namely using explicit newlines (\\) in the dokuwiki txt causes
1060 // lots a legitimately translated <br/> to also be split up. I
1061 // think the only way forward would be to maybe extend the HTML
1062 // Comment plugin to also respect and process <br/> tags. Then I
1063 // can avoid transforming them, and use the \\ sentinel to
1064 // separate multi-line table cells.
1065 $cell_content_lines = explode('\\\\', $cell_content);
1066 foreach ($cell_content_lines as $cell_content)
1067 {
1068 // - watch out, as the content may be an image
1069 if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
1070 {
1071 processImage($xml_out, $cell_content);
1072 }
1073 elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
1074 {
1075 fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n");
1076 }
1077 // - anything else is text
1078 else
1079 {
1080 outputTextBlock($xml_out, $cell_content);
1081 }
1082 }
1083 fwrite($xml_out, '</th>' . "\n");
1084 }
1085 }
1086 fwrite($xml_out, '</tr>' . "\n");
1087 }
1088 // - links to image media in the wiki!
1089 elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
1090 {
1091 processImage($xml_out, $line);
1092 // - if we were processing a figure, then now is a good time to close it
1093 if ($in_figure)
1094 {
1095 fwrite($xml_out, '</Figure>' . "\n");
1096 $in_figure = false;
1097 }
1098 }
1099 // - if the line starts with a <code> block, then we have a tag
1100 // for that (which is special in that it get a unique text id)
1101 elseif (preg_match('/^<code\s*\d*\s*>(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
1102 {
1103 $payload = $matches[1];
1104 $found_end = (isset($matches[2]));
1105 $in_code_block = true;
1106 // - be careful with empty lines
1107 if (empty($payload))
1108 {
1109 // - as they may appear in the body of the code (in which case we need
1110 // to output them). The empty lines at the start or end of a code
1111 // block are just an unfortunate consequence of the support for code
1112 // line numbering.
1113 if (!$found_end && strpos($line, '<code') === false)
1114 {
1115 fwrite($xml_out, "<CodeLine/>\n");
1116 }
1117 }
1118 elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches))
1119 {
1120 $text_id = $matches[1];
1121 outputTextBlock($xml_out, $payload, 'code', true);
1122 // - record the id to prevent repeating
1123 $seen_ids[$text_id] = true;
1124 }
1125 else
1126 {
1127 fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n");
1128 }
1129 // - if we didn't find an endtag we have to keep doing code mode until
1130 // we do
1131 $in_code_block = (!$found_end);
1132 if ($found_end)
1133 {
1134 // - if we were processing a figure, then now is a good time to close it
1135 if ($in_figure)
1136 {
1137 fwrite($xml_out, '</Figure>' . "\n");
1138 $in_figure = false;
1139 }
1140 }
1141 }
1142 // - entities on a line by themselves (i.e. references to external files)
1143 // go through verbatim
1144 elseif (preg_match('/^\s*&[a-z0-9_-]+;\s*$/', $line))
1145 {
1146 fwrite($xml_out, $line . "\n");
1147 }
1148 // - lines starting with > are indented text blocks
1149 elseif (preg_match('/^>(.*)$/', $line, $matches))
1150 {
1151 $payload = $matches[1];
1152 fwrite($xml_out, "<Indented>\n");
1153 outputTextBlock($xml_out, $payload);
1154 fwrite($xml_out, "</Indented>\n");
1155 }
1156 // - everything else goes straight through as a text block
1157 // - note that for code blocks, even empty lines count
1158 elseif (!empty($line))
1159 {
1160 // - output the line of text having encoded entities etc
1161 outputTextBlock($xml_out, $line, '', $in_code_block);
1162 }
1163 }
1164 // Complete any open part
1165 if ($in_part)
1166 {
1167 fwrite($xml_out, '</Content>' . "\n");
1168 fwrite($xml_out, '</Part>' . "\n");
1169 $in_part = false;
1170 }
1171 // Complete any open subsection
1172 if ($in_subsection)
1173 {
1174 fwrite($xml_out, '</Content>' . "\n");
1175 fwrite($xml_out, '</Subsection>' . "\n");
1176 $in_subsection = false;
1177 }
1178 // Complete any open section
1179 if ($in_section)
1180 {
1181 fwrite($xml_out, '</Content>' . "\n");
1182 fwrite($xml_out, '</Section>' . "\n");
1183 $in_section = false;
1184 }
1185 // Complete any open chapter
1186 if ($in_chapter)
1187 {
1188 fwrite($xml_out, '</Content>' . "\n");
1189 fwrite($xml_out, '</Chapter>' . "\n");
1190 $in_chapter = false;
1191 }
1192}
1193/** processPage($xml_out, $page_name) **/
1194
/** processImage($xml_out, $text)
 * Handles a wiki image-media link of the form
 *   {{[namespace:]filename?WIDTHxHEIGHT[&direct]}}
 * found in $text: the image file is copied from the dokuwiki media directory
 * of the requested language ('l') into the images directory of the XML
 * source tree, and a matching <File> element is written to the XML output.
 * Only the first such link in $text is processed; if $text contains no such
 * link, nothing is copied and nothing is written.
 *
 * @param resource $xml_out  open handle that the <File> element is written to
 * @param string   $text     wiki text containing the media link
 */
function processImage($xml_out, $text)
{
  global $dokuwiki_path;
  global $xml_source_path;
  // - the namespace prefix in front of the file name is optional (hence .*?
  //   rather than .+?), otherwise a link without any prefix would lose the
  //   first character of its file name
  if (!preg_match('/\{\{.*?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches))
  {
    return;
  }
  $filename = $matches[1];
  $width = $matches[2];
  $height = $matches[3];
  // - copy the file into place. Note the source file is looked up under the
  //   lower-cased name (presumably because the wiki stores its media that
  //   way - TODO confirm), while the copy keeps the case used in the link
  $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
  $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
  // - mkAllDir is defined elsewhere; it is expected to create the directory
  //   (and any missing parents) if necessary
  mkAllDir($image_destination_dir, 0755);
  $image_destination_path = $image_destination_dir . '/' . $filename;
  if (copy($image_source_path, $image_destination_path))
  {
    echo '[copying file: ' . $filename . "] \n";
  }
  else
  {
    // - a failed copy is reported, but the <File> element is still output
    printError('Failed to copy image into place: ' . $filename, false);
  }
  // - spit out the XML element, making sure the file name cannot break out
  //   of the attribute value (width and height can only ever be digits).
  //   A plain str_replace is used as it does not depend on the character
  //   set assumed by the PHP version in use. The ampersand must go first.
  $url_filename = str_replace(array('&', '<', '>', '"'), array('&amp;', '&lt;', '&gt;', '&quot;'), $filename);
  fwrite($xml_out, '<File width="' . $width . '" height="' . $height . '" url="images/' . $url_filename . '"/>' . "\n");
}
Note: See TracBrowser for help on using the repository browser.