root/documentation/trunk/php/gs-manual-export.php @ 25052

Revision 25026, 41.7 KB (checked in by jmt12, 8 years ago)

PHP scripts for importing and exporting the XML manuals into and out of Dokuwiki

Line 
1<?php
2
require_once('common.php');
$debug = 1;

// Some elements are (wrongly, in the original author's opinion) 'resolved'
// inside code blocks merely because they are valid HTML: &lt;i&gt; becomes
// <i> there, whereas something like &lt;Metadata&gt; stays encoded. Leave
// this set to true to reproduce such abominations, so that before and after
// versions of the XML can be compared. [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 *  Converts the set of dokuwiki pages that make up one manual (chosen with
 *  the 'm' argument) in one language (the 'l' argument) into the XML format
 *  expected by the rest of the Greenstone manual generation scripts.
 */

// 0. Initialization
$xml_source_path = '/research/jmt12/gsdl-docs/temp';
// - running counter used to number text blocks that carry no explicit id
$text_id_counter = 1;
// - every footnote encountered so far
$footnotes = array();

// Defaults - empty() is also true for a missing key, so one test is enough
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  // - the other manuals are 'user', 'install' and 'paper'
  $_REQUEST['m'] = 'develop';
}
if (empty($_REQUEST['v']))
{
  $_REQUEST['v'] = 'draft';
}
if (!(isset($_REQUEST['a']) && preg_match('/^(download|store)$/', $_REQUEST['a'])))
{
  $_REQUEST['a'] = 'store'; // Try to store the file to disk
}
47
// Open the HTML progress report that this script prints while it works
echo "<html>\n<head>\n<title>GS Manual Export</title>\n</head>\n<body>\n";

// - validate arguments before we use them (security). The patterns are
//   anchored with \z rather than $, because $ also matches just before a
//   trailing newline - which would let a value such as "user\n" through and
//   on into the file paths built below.
if (!preg_match('/^(develop|install|paper|user)\z/', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)\z/', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
// - escaped as well: a no-op for every whitelisted value, but it keeps the
//   page well formed should printError() ever return instead of halting
echo '<p><b>Manual:</b> ' . htmlspecialchars($_REQUEST['m']) . ' <b>Language:</b> ' . htmlspecialchars($_REQUEST['l']) . "</p>\n<hr/>\n";
// 1. Create the XML output file handle
$xml_file_path = '';
if ($_REQUEST['a'] == 'download')
{
  // - downloads are written beneath the script's own 'var' directory
  $xml_file_path = fileCat(array($base_path, 'var', ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml'));
}
else
{
  // - construct the path using the information we've been provided as arguments
  $xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
  mkAllDir($xml_file_dir, 0755);
  $xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
  // - backup any existing file
  if (file_exists($xml_file_path))
  {
    $xml_backup_file_path = $xml_file_path . '.bak';
    if (!rename($xml_file_path, $xml_backup_file_path))
    {
      printError('Failed to rename existing manual file for backup');
    }
  }
}
// - and create a handle to the new file, complaining if that is not
//   possible (every later fwrite() depends on this handle being valid)
$xml_out = fopen($xml_file_path, 'w');
if (!$xml_out)
{
  printError('Failed to open XML output file for writing');
}
91
// 2. Read in the top level page. It supplies the configuration data for the
//    manual and its cover page, and lists the other pages of the manual in
//    the order they should appear
echo "<p><b>Frontmatter:</b><br/>\n";
// - two things are collected while reading: the cover metadata (field name
//   to list of values) and the ordered list of page names
$cover_metadata = array();
$pages_in_order = array();
// - an approved (rather than draft) version can only be requested for the
//   english manual, as those are the only editable pages; everything else
//   is read straight from the dokuwiki page store
$top_page_path = '';
if ($_REQUEST['v'] != 'draft' && $_REQUEST['l'] == 'en')
{
  $top_page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);
}
else
{
  $top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
}

if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - the page is read one line at a time, pattern matching for the lines of
//   interest
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
// - a bulletpoint item that links to a wiki page of this manual
$page_entry_pattern = '/^\s+\*\s+\[\[.:' . $_REQUEST['m'] . ':(.+?)\|(.*?)\]\]\s*$/';
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - inside the contents listing each page link adds to the page order
  if ($in_contents && preg_match($page_entry_pattern, $line, $matches))
  {
    $pages_in_order[] = $matches[1];
  }
  // - metadata is all encoded within dokuwiki tables; a field may repeat,
  //   so each field name maps to a list of values
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $field = $matches[1];
    $value = $matches[2];
    $cover_metadata[$field][] = $value;
  }
  // - page order is captured only beneath the heading 'Contents'; any other
  //   heading switches the capture off again
  elseif (preg_match('/^=+\s(.+)\s=+$/', $line, $matches))
  {
    $in_contents = ($matches[1] == 'Contents');
  }
}
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (!isset($cover_metadata[$required_field]))
  {
    printError('Missing required metadata: ' . $required_field);
  }
}
// - the metadata now drives the XML header and the cover page. The recipe is
//   fixed; only the elements that may repeat (SupplementaryText and so on)
//   need any special treatment
$xml_header = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml_header .= '<!DOCTYPE Manual [' . "\n";
if (isset($cover_metadata['ENTITY']))
{
  foreach ($cover_metadata['ENTITY'] as $entity)
  {
    $xml_header .= "\t" . '<!ENTITY ' . $entity . '>' . "\n";
  }
}
$xml_header .= ']>' . "\n";
$xml_header .= '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">' . "\n";
fwrite($xml_out, $xml_header);

// - cover page elements in output order; true marks those that may repeat
$cover_fields = array(
  'Heading'           => false,
  'Title'             => false,
  'Author'            => false,
  'Affiliation'       => false,
  'SupplementaryText' => true,
  'Text'              => true,
  'Comment'           => true,
  'Version'           => false,
  'Date'              => false
);
foreach ($cover_fields as $cover_field => $is_repeatable)
{
  if ($is_repeatable)
  {
    outputMetadataMultiple($xml_out, $cover_metadata, $cover_field);
  }
  else
  {
    outputMetadataSingle($xml_out, $cover_metadata, $cover_field);
  }
}
206
// 3. Process each page listed in the contents of the top level page in order
foreach ($pages_in_order as $page)
{
  processPage($xml_out, $page);
}

// 4. Output our list of footnotes (if any). The array is keyed by footnote
//    text, with the footnote's id as the value
if (!empty($footnotes))
{
  fwrite($xml_out, '<FootnoteList>'. "\n");
  foreach ($footnotes as $footnote=>$footnote_id)
  {
    outputMetadataSingle($xml_out, $footnote, 'Footnote', $footnote_id);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Finalize and close the XML output
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);

// 6. Complete!
echo '<p><b>Complete!</b></p>' . "\n<hr/>\n";
// - the request values end up inside href attributes, so they are URL
//   encoded first. This changes nothing for the whitelisted manual and
//   language names, but nothing unexpected can break out of the attribute.
$manual_in_url = rawurlencode($_REQUEST['m']);
$language_in_url = rawurlencode($_REQUEST['l']);
if ($_REQUEST['a'] == 'download')
{
  echo '<p>Click <a href="var/' . rawurlencode(ucfirst($_REQUEST['m'])) . '_' . $language_in_url . '.xml">here</a> to download XML file</p>' . "\n";
}
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $language_in_url . ':manuals:' . $manual_in_url . '">here</a> to return to dokuwiki</p>' . "\n";
echo '</body>' . "\n";
echo '</html>';
exit(0);
239
/**
 * Write one metadata element to the XML output, wrapping at most one text
 * block, and echo a progress note to the browser.
 *
 * The opening and closing tags are always written, even when there is no
 * value to put between them.
 *
 * @param resource     $xml_out  open handle of the XML file being written
 * @param array|string $metadata either an array of metadata, in which case
 *                               the first value stored under $field is used,
 *                               or the text itself
 * @param string       $field    name of the XML element to write
 * @param mixed        $mid      value for the element's id attribute; the
 *                               attribute is left out when this is falsy
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";
  $id_attribute = $mid ? ' id="' . $mid . '"' : '';
  fwrite($xml_out, '<' . $field . $id_attribute . '>' . "\n");
  if (is_array($metadata))
  {
    if (isset($metadata[$field]) && isset($metadata[$field][0]))
    {
      outputTextBlock($xml_out, $metadata[$field][0]);
    }
    else
    {
      echo 'no such field or no metadata';
    }
  }
  // - compare against the empty string rather than using empty(), which
  //   would also throw away the perfectly good value "0". Integers are
  //   accepted too, as numeric array keys (e.g. footnote text) arrive as ints
  elseif (is_scalar($metadata) && (string) $metadata !== '')
  {
    outputTextBlock($xml_out, (string) $metadata);
  }
  else
  {
    echo 'no such field or no metadata';
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/
275
/**
 * Write one metadata element whose values are joined into a single text
 * block, and echo a progress note to the browser.
 *
 * A single value is written as is. Several values are joined with
 * $separator; when $final_separator is given it is used in front of the
 * last value instead.
 *
 * @param resource     $xml_out         open handle of the XML file
 * @param array        $metadata        metadata array, field name to values
 * @param string       $field           name of the XML element to write
 * @param string       $separator       text placed between values
 * @param string|false $final_separator text placed before the last value,
 *                                      or false to use $separator throughout
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    $entries = $metadata[$field];
    $entry_count = count($entries);
    if ($entry_count == 1)
    {
      outputTextBlock($xml_out, $entries[0]);
    }
    elseif ($entry_count > 1)
    {
      if ($final_separator)
      {
        // - hold the last value back so it can get its own separator
        $tail = array_pop($entries);
        $joined = implode($separator, $entries) . $final_separator . $tail;
      }
      else
      {
        $joined = implode($separator, $entries);
      }
      outputTextBlock($xml_out, $joined);
    }
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/
308
/**
 * Write every value stored under a metadata field as its own text block,
 * and echo a progress note to the browser.
 *
 * The blocks are wrapped in an element named after the field, except for
 * the field 'Text' whose blocks are written without any wrapper.
 *
 * @param resource $xml_out  open handle of the XML file being written
 * @param array    $metadata metadata array, field name to list of values
 * @param string   $field    name of the metadata field to write
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";
  // - Text blocks don't need to be wrapped in Text element
  $needs_wrapper = ($field != 'Text');
  if ($needs_wrapper)
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (!isset($metadata[$field]))
  {
    echo 'no such field or no metadata';
  }
  else
  {
    foreach ($metadata[$field] as $entry)
    {
      outputTextBlock($xml_out, $entry);
    }
  }
  if ($needs_wrapper)
  {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}
333
/**
 * Encode a line of code for use in a table, while letting italic markers
 * that were hidden in HTML comments come out as real italic tags.
 *
 * @param string $text the line to encode
 * @return string the encoded line
 */
function translateTableCodeline($text)
{
  // - swap the comment-wrapped italic markers for placeholders that the
  //   encoding step will not touch (replacements are applied in order)
  $text = str_replace(array('<!--i-->', '<!--/i-->'), array('%!--i--%', '%!--/i--%'), $text);
  // - encode entities etc
  $text = translateText($text, true);
  // - turn the placeholders into italic elements
  return str_replace(array('%!--i--%', '%!--/i--%'), array('<i>', '</i>'), $text);
}
/** translateTableCodeline() **/
347
/**
 * Encode the characters &, < and > as entities.
 *
 * Inside a code block, and only while the global $allow_bad_codeblocks is
 * set, the encoded forms of the italic and line break tags are turned back
 * into real tags afterwards.
 *
 * @param string $text          the text to encode
 * @param bool   $in_code_block whether the text belongs to a code block
 * @return string the encoded text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - the ampersand has to go first, or it would re-encode the other two
  $text = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
  if ($in_code_block && $allow_bad_codeblocks)
  {
    $encoded_tags = array('&lt;i&gt;', '&lt;/i&gt;', '&lt;br/&gt;');
    $real_tags = array('<i>', '</i>', '<br/>');
    $text = str_replace($encoded_tags, $real_tags, $text);
  }
  return $text;
}
363
/**
 * Convert a piece of dokuwiki text to the manual's XML markup and write it
 * to the XML output as one Text element.
 *
 * Along the way footnotes are registered in the global $footnotes array
 * (footnote text to footnote id) and replaced by FootnoteRef elements, an id
 * is assigned to the block (either the explicit "id:" comment at the start
 * of the text or the next value of the global $text_id_counter), and links,
 * cross references and italic/bold/underline formatting are translated.
 *
 * @param resource $xml_out       open handle of the XML file being written
 * @param string   $text          the dokuwiki text to convert
 * @param string   $type          value for the type attribute of the Text
 *                                element; no attribute is written when empty
 * @param bool     $in_code_block true when the text is part of a code block,
 *                                in which case ampersands are left alone and
 *                                angle brackets are only encoded when $type
 *                                is 'code'
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches))
  {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote text we have seen before keeps the id it was first given.
    //   Handing out count + 1 again would leave the earlier reference
    //   pointing at nothing, and give the next new footnote a duplicate id.
    if (isset($footnotes[$footnote]))
    {
      $footnote_id = $footnotes[$footnote];
    }
    else
    {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches))
  {
    $text_id = $matches[1];
    $text = $matches[2];
    // - a numeric explicit id also moves the counter on, so that generated
    //   ids continue from it
    if (is_numeric($text_id))
    {
      $text_id_counter = $text_id + 1;
    }
  }
  else
  {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text
  $text = preg_replace('/<!--([\s\.]+?)-->/','##lt##!--\1--##gt##',$text);

  // - every remaining HTML comment is dropped
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block)
  {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY']))
    {
      foreach ($cover_metadata['ENTITY'] as $entity)
      {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
        {
          $entity_name = $matches[1];
          // - mdash is deliberately left out of this treatment
          if ($entity_name != 'mdash')
          {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char
            //   into the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code')
  {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internals have to become the horrible <CrossRef> tags. We ignore any
  //   number prefix on the page name as that is just used for ordering within
  //   Dokuwiki
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page. The
  //   longest hash prefix has to be tried first.
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know
  //   what the page order number for another manual's chapters might be, we
  //   instead use a search
  $text = preg_replace('/\[\[\?do\=search\&amp;id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);
  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect // in protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid HTML,
  //   derp). The <br/> tag is intentionally not restored here.
  if ($in_code_block && $allow_bad_codeblocks)
  {
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
  }
  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '&lt;!--', $text);
  $text = str_replace('--%', '--&gt;', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);
  // output the text block
  $text = trim($text);
  // - test against the empty string: empty() would also treat the text "0"
  //   as nothing and lose it from the output
  if ($text === '')
  {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type))
  {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else
  {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/
529
530/**
531 */
532function processPage($xml_out, $page_name)
533{
534  global $dokuwiki_path;
535  global $seen_ids;
536  echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
537  // - locate the page in question (taking into account if the user asked for a
538  //   draft version or an approved version of the manual)
539  $page_path = '';
540  if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
541  {
542    $page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
543  }
544  else
545  {
546    $page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
547  }
548  $page_in = @fopen($page_path, 'r');
549  if (!$page_in)
550  {
551    printError('Failed to open page for reading:' . $page_name, false);
552    return;
553  }
554  // - once again we read in line-by-line, but this time we are going to output
555  //   each line as we go through. We expect to encounter certain lines in a
556  //   predefined order, and should complain if we don't find what we expect.
557  $in_chapter = false;
558  $in_section = false;
559  $in_subsection = false;
560  $in_part = false;
561  $in_list = false;
562  $lists = array();
563  $previous_listitem_type = '';
564  $in_figure = false;
565  $in_table = false;
566  $column_widths = array();
567  $in_code_block = false;
568  while (($line = fgets($page_in)) !== false)
569  {
570    // remove newline character
571    $line = preg_replace('/\r?\n$/','',$line);
572    // - we need to know the 'depth' for the bulletpoint lists
573    $depth = 0;
574    while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == '  ')
575    {
576      $depth++;
577      $line = substr($line, 2);
578    }
579    $first_character = substr($line, 0, 1);
580    // - special case for the end of bullet lists
581    if ($in_list && ($first_character != "*" && $first_character != "-"))
582    {
583      while (count($lists) > 0)
584      {
585        $list_type = array_pop($lists);
586        if ($list_type == '*')
587        {
588          fwrite($xml_out, '</Bullet>' . "\n");
589          fwrite($xml_out, '</BulletList>' . "\n");
590        }
591        else
592        {
593          fwrite($xml_out, '</NumberedItem>' . "\n");
594          fwrite($xml_out, '</NumberedList>' . "\n");
595        }
596      }
597      $in_list = false;
598    }
599    // - special case for the end of tables
600    if ($in_table && $first_character != '^' && $first_character != '|')
601    {
602      fwrite($xml_out, '</TableContent>' . "\n");
603      fwrite($xml_out, '</Table>' . "\n");
604      $in_table = false;
605    }
606    // - special cases for premature closing of sections, subsections and parts
607    if (preg_match('/<!-- close:(section|subsection|part) -->/', $line, $matches))
608    {
609      // - we always try to do this (regardless of actual flag) as we must
610      //   always close the smallest 'granularity' first
611      if ($in_part)
612      {
613        fwrite($xml_out, '</Content>' . "\n");
614        fwrite($xml_out, '</Part>' . "\n");
615        $in_part = false;
616      }
617      if ($in_subsection && ($matches[1] == 'section' || $matches[1] == 'subsection'))
618      {
619        fwrite($xml_out, '</Content>' . "\n");
620        fwrite($xml_out, '</Subsection>' . "\n");
621        $in_subsection = false;
622      }
623      if ($in_section && $matches[1] == 'section')
624      {
625        fwrite($xml_out, '</Content>' . "\n");
626        fwrite($xml_out, '</Section>' . "\n");
627        $in_section = false;
628      }
629    }
630
631    // - if this page is a chapter, then the first thing on the page should be
632    //   the chapter title (six equals)
633    if (preg_match('/====== (.+) ======/', $line, $matches))
634    {
635      $chapter_title = $matches[1];
636      $chapter_id = $page_name;
637      if (empty($chapter_id))
638      {
639        $chapter_id = generateID($chapter_title);
640      }
641      // - are we already processing a part? if so end it, end it now
642      if ($in_part)
643      {
644        fwrite($xml_out, '</Content>' . "\n");
645        fwrite($xml_out, '</Part>' . "\n");
646        $in_part = false;
647      }
648      // - are we already processing a subsection? if so end it, end it now
649      if ($in_subsection)
650      {
651        fwrite($xml_out, '</Content>' . "\n");
652        fwrite($xml_out, '</Subsection>' . "\n");
653        $in_subsection = false;
654      }
655      // - are we already processing a section? if so end it, end it now
656      if ($in_section)
657      {
658        fwrite($xml_out, '</Content>' . "\n");
659        fwrite($xml_out, '</Section>' . "\n");
660        $in_section = false;
661      }
662      // - are we already processing a chapter? if so end it, end it now
663      if ($in_chapter)
664      {
665        fwrite($xml_out, '</Content>' . "\n");
666        fwrite($xml_out, '</Chapter>' . "\n");
667        $in_chapter = false;
668      }
669      // - write out this chapter's header
670      fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n");
671      outputMetadataSingle($xml_out, $chapter_title, 'Title');
672      fwrite($xml_out, '<Content>' . "\n");
673      $in_chapter = true;
674    }
675    // - the next likely thing to encounter is a section heading (five equals)
676    elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches))
677    {
678      $section_title = $matches[1];
679      // - check for explicit section id
680      $section_id = '';
681      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches))
682      {
683        $section_id = $matches[1];
684        $section_title = $matches[2];
685      }
686      if (empty($section_id))
687      {
688        $section_id = generateID($section_title);
689      }
690      // - are we already processing a part? if so end it, end it now
691      if ($in_part)
692      {
693        fwrite($xml_out, '</Content>' . "\n");
694        fwrite($xml_out, '</Part>' . "\n");
695        $in_part = false;
696      }
697      // - are we already processing a subsection? if so end it, end it now
698      if ($in_subsection)
699      {
700        fwrite($xml_out, '</Content>' . "\n");
701        fwrite($xml_out, '</Subsection>' . "\n");
702        $in_subsection = false;
703      }
704      // - are we already processing a section? if so end it, end it now
705      if ($in_section)
706      {
707        fwrite($xml_out, '</Content>' . "\n");
708        fwrite($xml_out, '</Section>' . "\n");
709        $in_section = false;
710      }
711      // - write out this section's header
712      fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n");
713      outputMetadataSingle($xml_out, $section_title, 'Title');
714      fwrite($xml_out, '<Content>' . "\n");
715      $in_section = true;
716    }
717    // - similar for subsection heading (four equals)
718    elseif (preg_match('/==== (.+) ====/', $line, $matches))
719    {
720      $subsection_title = $matches[1];
721      // - check for explicit subsection id
722      $subsection_id = '';
723      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches))
724      {
725        $subsection_id = $matches[1];
726        $subsection_title = $matches[2];
727      }
728      if (empty($subsection_id))
729      {
730        $subsection_id = generateID($subsection_title);
731      }
732      // - are we already processing a part? if so end it, end it now
733      if ($in_part)
734      {
735        fwrite($xml_out, '</Content>' . "\n");
736        fwrite($xml_out, '</Part>' . "\n");
737        $in_part = false;
738      }
739      // - are we already processing a subsection? if so end it, end it now
740      if ($in_subsection)
741      {
742        fwrite($xml_out, '</Content>' . "\n");
743        fwrite($xml_out, '</Subsection>' . "\n");
744        $in_subsection = false;
745      }
746      // - write out this subsection's header
747      fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n");
748      outputMetadataSingle($xml_out, $subsection_title, 'Title');
749      fwrite($xml_out, '<Content>' . "\n");
750      $in_subsection = true;
751    }
752    // - and part heading (three equals)
753    elseif (preg_match('/=== (.+) ===/', $line, $matches))
754    {
755      $part_title = $matches[1];
756      // - check for explicit part id
757      $part_id = '';
758      if (preg_match('/<!-- sid:(.+?) -->(.*)/', $part_title, $matches))
759      {
760        $part_id = $matches[1];
761        $part_title = $matches[2];
762      }
763      if (empty($part_id))
764      {
765        $part_id = generateID($part_title);
766      }
767      // - are we already processing a part? if so end it, end it now
768      if ($in_part)
769      {
770        fwrite($xml_out, '</Content>' . "\n");
771        fwrite($xml_out, '</Part>' . "\n");
772        $in_part = false;
773      }
774      // - write out this part's header
775      fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n");
776      outputMetadataSingle($xml_out, '**//' . $part_title . '//**', 'Title');
777      fwrite($xml_out, '<Content>' . "\n");
778      $in_part = true;
779    }
780    // - Ignore 5th level heading - they are only used to allow more convenient
781    //   editing of figures and tables
782    elseif (preg_match('/== (.+) ==/', $line, $matches))
783    {
784    }
785    // - lists need special handling
786    elseif (preg_match('/^(\*|\-)\s+(.*)/', $line, $matches))
787    {
788      $list_type = $matches[1];
789      $list_text = $matches[2];
790      $list_depth = count($lists);
791      if (!$in_list)
792      {
793        if ($list_type == '*')
794        {
795          fwrite($xml_out, '<BulletList>' . "\n");
796        }
797        else
798        {
799          fwrite($xml_out, '<NumberedList>' . "\n");
800        }
801        $in_list = true;
802        array_push($lists, $list_type);
803      }
804      // - this bullet is at the same depth as previous - close the previous
805      //   point
806      elseif ($depth == $list_depth)
807      {
808        $previous_list_type = end($lists);
809        if ($previous_list_type == '*')
810        {
811          fwrite($xml_out, '</Bullet>' . "\n");
812        }
813        else
814        {
815          fwrite($xml_out, '</NumberedItem>' . "\n");
816        }
817        // - we don't match in type anymore... close the previous list and open
818        //   a new list of the appropriate type
819        if ($list_type != $previous_list_type)
820        {
821          if ($previous_list_type == '*')
822          {
823            fwrite($xml_out, '</BulletList>' . "\n");
824            fwrite($xml_out, '<NumberedList>' . "\n");
825          }
826          else
827          {
828            fwrite($xml_out, '</NumberedNumbered>' . "\n");
829            fwrite($xml_out, '<BulletList>' . "\n");
830          }
831          array_pop($lists);
832          array_push($lists, $list_type);
833        }
834      }
835      else
836      {
837        // - we have either got deeper...
838        if ($depth > $list_depth)
839        {
840          if ($list_type == '*')
841          {
842            fwrite($xml_out, '<BulletList>' . "\n");
843          }
844          else
845          {
846            fwrite($xml_out, '<NumberedList>' . "\n");
847          }
848          array_push($lists, $list_type);
849        }
850        // ... or shallower in the bullet listing
851        if ($depth < $list_depth)
852        {
853          $previous_list_type = array_pop($lists);
854          if ($previous_list_type == '*')
855          {
856            fwrite($xml_out, '</Bullet>' . "\n");
857            fwrite($xml_out, '</BulletList>' . "\n");
858          }
859          else
860          {
861            fwrite($xml_out, '</NumberedItem>' . "\n");
862            fwrite($xml_out, '</NumberedList>' . "\n");
863          }
864          // - we still have to close the last item too
865          $previous_listitem_type = end($lists);
866          if ($previous_listitem_type == '*')
867          {
868            fwrite($xml_out, '</Bullet>' . "\n");
869          }
870          else
871          {
872            fwrite($xml_out, '</NumberedItem>' . "\n");
873          }
874        }
875      }
876      if ($list_type == '*')
877      {
878        fwrite($xml_out, '<Bullet>' . "\n");
879      }
880      else
881      {
882        fwrite($xml_out, '<NumberedItem>' . "\n");
883      }
884      // Special Case: bullets that contain (start) a code block
885      if (preg_match('/^(.*)<code>\s*$/', $list_text, $matches))
886      {
887        $list_text = $matches[1];
888        $in_code_block = true;
889      }
890
891      outputTextBlock($xml_out, $list_text);
892
893      // - to make things clearer, we'll process any and all code blocks within
894      //   bullets here - especially as there may be more text block *after*
895      //   the code block finishes
896      if ($in_code_block)
897      {
898        $sub_line = '';
899        while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
900        {
901          $sub_line = trim($sub_line);
902          // - closing code
903          if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
904          {
905            $sub_line = $matches[1]; // may be empty string
906            $in_code_block = false;
907          }
908          // - output another plain codeline
909          else
910          {
911            fwrite($xml_out, '<CodeLine>' . $sub_line . "</CodeLine>\n");
912            $sub_line = '';
913          }
914        }
915        // - if sub_line still has anything in it, then add that content as a
916        //   text block
917        if (!empty($sub_line))
918        {
919          outputTextBlock($xml_out, $sub_line);
920        }
921      }
922    }
923    // - images start with an image caption 'element'
924    elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\|(.+)>\s*<\/imgcaption>/', $line, $matches))
925    {
926      $figure_id = $matches[1];
927      $figure_title = $matches[2];
928      // - watch for the special withLineNumber flag
929      $class_attribute = '';
930      if (strpos($figure_title, '%!-- withLineNumber --%') != false)
931      {
932        $class_attribute = ' class="withLineNumber"';
933        $figure_title = str_replace('%!-- withLineNumber --%','',$figure_title);
934      }
935      fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n");
936      echo '[figure: ' . $figure_id . "] \n";
937      fwrite($xml_out, '<Title>' . "\n");
938      // - decode any comments in the title (used to store explicit id
939      //   information)
940      $figure_title = str_replace('%!--', '<!--', $figure_title);
941      $figure_title = str_replace('--%', '-->', $figure_title);
942      // - special case: the title may have a subtitle (as a prefix)
943      $figure_subtitle_id = '';
944      $figure_subtitle = '';
945      // - subtitle with explicit id
946      if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s*(.*)$/', $figure_title, $matches))
947      {
948        $figure_subtitle = $matches[1];
949        $figure_title = $matches[2];
950      }
951      // - subtitle without explicit id
952      else if (preg_match('/^(\([a-z]\))\s*(.*)$/', $figure_title, $matches))
953      {
954        $figure_subtitle = $matches[1];
955        $figure_title = $matches[2];
956      }
957      outputTextBlock($xml_out, $figure_title);
958      if (!empty($figure_subtitle))
959      {
960        fwrite($xml_out, '<SubTitle>' . "\n");
961        outputTextBlock($xml_out, $figure_subtitle);
962        fwrite($xml_out, '</SubTitle>' . "\n");
963      }
964      fwrite($xml_out, '</Title>' . "\n");
965      $in_figure = true;
966      // - record the id to prevent repeating
967      $seen_ids[$figure_id] = true;
968    }
969    // - tables start with a table caption 'element'
970    elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\|([^>]+)>\s*<\/tblcaption>/', $line, $matches))
971    {
972      $table_id = $matches[1];
973      $table_title = $matches[2];
974      if ($table_title == '##NOCAPTION##')
975      {
976        echo '[non-captioned table: ' . $table_id . "] \n";
977        // - watch for autogenerated ids... no point in outputting them
978        if (preg_match('/^table(_\d+)?$/', $table_id))
979        {
980          fwrite($xml_out, "<Table>\n");
981        }
982        else
983        {
984          fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
985        }
986        fwrite($xml_out, '<Title/>' . "\n");
987      }
988      elseif ($table_title == '##HIDDEN##')
989      {
990        echo '[hidden table: ' . $table_id . "] \n";
991        // - watch for autogenerated ids... no point in outputting them
992        if (preg_match('/^table(_\d+)?$/', $table_id))
993        {
994          fwrite($xml_out, "<Table class=\"hidden\">\n");
995        }
996        else
997        {
998          fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n");
999        }
1000        fwrite($xml_out, '<Title/>' . "\n");
1001      }
1002      else
1003      {
1004        echo '[table: ' . $table_id . "] \n";
1005        // - watch for autogenerated ids... no point in outputting them
1006        if (preg_match('/^table(_\d+)?$/', $table_id))
1007        {
1008          fwrite($xml_out, "<Table>\n");
1009        }
1010        else
1011        {
1012          fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
1013        }
1014        fwrite($xml_out, '<Title>' . "\n");
1015        outputTextBlock($xml_out, $table_title);
1016        fwrite($xml_out, '</Title>' . "\n");
1017      }
1018      fwrite($xml_out, '<TableContent>' . "\n");
1019      $in_table = true;
1020      // - record the id to prevent repeating
1021      $seen_ids[$table_id] = true;
1022    }
1023    // - the second line in a table should be it's column width values
1024    elseif (preg_match('/\|<\s-\s([0-9 ]+?)\s>\|/', $line, $matches))
1025    {
1026      $column_widths = explode(' ', $matches[1]);
1027    }
1028    // - then every row will be made of a number of cells
1029    elseif (preg_match('/^\|(.*?)\|$/', $line, $matches))
1030    {
1031      $row_content = $matches[1];
1032      $cell_contents = preg_split('/(\s+\||\|\s+)/', $row_content);
1033      fwrite($xml_out, '<tr>' . "\n");
1034      foreach ($cell_contents as $index=>$cell_content)
1035      {
1036        $cell_content = trim($cell_content);
1037        $th_text = '';
1038        if (isset($column_widths[$index]))
1039        {
1040          $th_text = '<th width="' . $column_widths[$index] . '"';
1041        }
1042        else
1043        {
1044          $th_text = '<th';
1045        }
1046        // - if the cell would be empty, we use the shorthand
1047        if (empty($cell_content))
1048        {
1049          $th_text .= '/>' . "\n";
1050          fwrite($xml_out, $th_text);
1051        }
1052        else
1053        {
1054          $th_text .= '>' . "\n";
1055          fwrite($xml_out, $th_text);
1056
1057          // GAH - this is proving harder than a hard thing thats hard.
1058          // The issue is that the most straightforward way of fixing this,
1059          // namely using explicit newlines (\\) in the dokuwiki txt causes
1060          // lots a legitimately translated <br/> to also be split up. I
1061          // think the only way forward would be to maybe extend the HTML
1062          // Comment plugin to also respect and process <br/> tags. Then I
1063          // can avoid transforming them, and use the \\ sentinel to
1064          // separate multi-line table cells.
1065          $cell_content_lines = explode('\\\\', $cell_content);
1066          foreach ($cell_content_lines as $cell_content)
1067          {
1068            // - watch out, as the content may be an image
1069            if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
1070            {
1071              processImage($xml_out, $cell_content);
1072            }
1073            elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
1074            {
1075              fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n");
1076            }
1077            // - anything else it text
1078            else
1079            {
1080              outputTextBlock($xml_out, $cell_content);
1081            }
1082          }
1083          fwrite($xml_out, '</th>' . "\n");
1084        }
1085      }
1086      fwrite($xml_out, '</tr>' . "\n");
1087    }
1088    // - links to image media in the wiki!
1089    elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
1090    {
1091      processImage($xml_out, $line);
1092      // - if we were processing a figure, then now is a good time to close it
1093      if ($in_figure)
1094      {
1095        fwrite($xml_out, '</Figure>' . "\n");
1096        $in_figure = false;
1097      }
1098    }
1099    // - if the line starts with a <code> block, then we have a tag
1100    //   for that (which is special in that it get a unique text id)
1101    elseif (preg_match('/^<code\s*\d*\s*>(.*?)(<\/code>)?$/', $line, $matches) || ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
1102    {
1103      $payload = $matches[1];
1104      $found_end = (isset($matches[2]));
1105      $in_code_block = true;
1106      // - be careful with empty lines
1107      if (empty($payload))
1108      {
1109        // - as they may appear in the body of the code (in which case we need
1110        //   to output them). The empty lines at the start or end of a code
1111        //   block are just an unfortunate consequence of the support for code
1112        //   line numbering.
1113        if (!$found_end && strpos($line, '<code') === false)
1114        {
1115          fwrite($xml_out, "<CodeLine/>\n");
1116        }
1117      }
1118      elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches))
1119      {
1120        $text_id = $matches[1];
1121        outputTextBlock($xml_out, $payload, 'code', true);
1122        // - record the id to prevent repeating
1123        $seen_ids[$text_id] = true;
1124      }
1125      else
1126      {
1127        fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n");
1128      }
1129      // - if we didn't find an endtag we have to keep doing code mode until
1130      //   we do
1131      $in_code_block = (!$found_end);
1132      if ($found_end)
1133      {
1134        // - if we were processing a figure, then now is a good time to close it
1135        if ($in_figure)
1136        {
1137          fwrite($xml_out, '</Figure>' . "\n");
1138          $in_figure = false;
1139        }
1140      }
1141    }
1142    // - entities on a line by themselves (i.e. references to external files)
1143    //   go through verbatim
1144    elseif (preg_match('/^\s*&[a-z0-9_-]+;\s*$/', $line))
1145    {
1146      fwrite($xml_out, $line . "\n");
1147    }
1148    // - lines starting with > are indented text blocks
1149    elseif (preg_match('/^>(.*)$/', $line, $matches))
1150    {
1151      $payload = $matches[1];
1152      fwrite($xml_out, "<Indented>\n");
1153      outputTextBlock($xml_out, $payload);
1154      fwrite($xml_out, "</Indented>\n");
1155    }
1156    // - everything else goes straight through as a text block
1157    // - note that for code blocks, even empty lines count
1158    elseif (!empty($line))
1159    {
1160      // - output the line of text having encoded entities etc
1161      outputTextBlock($xml_out, $line, '', $in_code_block);
1162    }
1163  }
1164  // Complete any open part
1165  if ($in_part)
1166  {
1167    fwrite($xml_out, '</Content>' . "\n");
1168    fwrite($xml_out, '</Part>' . "\n");
1169    $in_part = false;
1170  }
1171  // Complete any open subsection
1172  if ($in_subsection)
1173  {
1174    fwrite($xml_out, '</Content>' . "\n");
1175    fwrite($xml_out, '</Subsection>' . "\n");
1176    $in_subsection = false;
1177  }
1178  // Complete any open section
1179  if ($in_section)
1180  {
1181    fwrite($xml_out, '</Content>' . "\n");
1182    fwrite($xml_out, '</Section>' . "\n");
1183    $in_section = false;
1184  }
1185  // Complete any open chapter
1186  if ($in_chapter)
1187  {
1188    fwrite($xml_out, '</Content>' . "\n");
1189    fwrite($xml_out, '</Chapter>' . "\n");
1190    $in_chapter = false;
1191  }
1192}
1193/** processPage($xml_out, $page_name) **/
1194
/** processImage($xml_out, $text)
 *  Looks for the first dokuwiki media link with explicit dimensions in the
 *  given text, i.e. {{<path>:<filename>?<width>x<height>}} (optionally
 *  followed by '&direct'), copies the referenced image from the wiki's media
 *  directory into the XML source tree, and writes a matching <File/> element
 *  to the XML output. Text that contains no such link is silently ignored.
 *
 *  Reads the globals $dokuwiki_path and $xml_source_path, and the language
 *  code in $_REQUEST['l'].
 *
 *  @param $xml_out the handle of the XML file being written
 *  @param $text    the wiki text containing the media link
 */
function processImage($xml_out, $text)
{
  global $dokuwiki_path;
  global $xml_source_path;
  // NOTE(review): the lazy '.+?' has to consume at least one character before
  // the filename capture, so a link with neither a namespace prefix nor
  // leading whitespace would lose the first character of its filename -
  // confirm that all media links in the manuals are namespaced.
  if (!preg_match('/\{\{.+?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches))
  {
    return;
  }
  $filename = $matches[1];
  $width = $matches[2];
  $height = $matches[3];
  // - the filename comes straight from wiki text, so never let a '..' path
  //   segment steer the copy outside of the images directory
  if (preg_match('/(^|[\/\\\\])\.\.([\/\\\\]|$)/', $filename))
  {
    printError('Refusing to copy image with unsafe filename: ' . $filename, false);
    return;
  }
  // - copy the file into place. The media file is looked up under its
  //   lower-cased name, while the copy (and the url written to the XML) keep
  //   the case used in the wiki text
  $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
  $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
  // - presumably creates the destination directory (and any missing parents)
  mkAllDir($image_destination_dir, 0755);
  $image_destination_path = $image_destination_dir . '/' . $filename;
  if (copy($image_source_path, $image_destination_path))
  {
    echo '[copying file: ' . $filename . "] \n";
  }
  else
  {
    // - a failed copy is reported, but the <File/> element is still written
    printError('Failed to copy image into place: ' . $filename, false);
  }
  // - spit out the XML element. Width and height can only be digits, but the
  //   filename must be escaped to keep the attribute value well-formed
  $url = 'images/' . htmlspecialchars($filename, ENT_QUOTES, 'UTF-8');
  fwrite($xml_out, '<File width="' . $width . '" height="' . $height . '" url="' . $url . '"/>' . "\n");
}
Note: See TracBrowser for help on using the browser.