root/documentation/trunk/php/gs-manual-import.php @ 25052

Revision 25052, 41.7 KB (checked in by jmt12, 8 years ago)

Several changes to Metadata, Table and Figure parsing as issues were encountered in languages other than English. I'd argue most of these were bogus XML, but I better import the pages anyway.

Line 
1<?php
2
3require_once('common.php');
4
5/** @file gs-manual-import.php
6 *  This script transforms the single XML manual file required by the rest of
7 *  the Greenstone manual generation scripts into the series of dokuwiki pages
8 *  that make up a certain manual (as specified by the 'm' argument) in a
9 *  certain language ('l').
10 */
11
// Let the shared CLI parser have a go at the arguments; report the expected
// usage when it rejects them.
if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - fall back to the English user manual when nothing (or an empty value) was
//   supplied. empty() is already true for an unset key, so no separate isset()
//   test is required.
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  // - the other accepted manuals are 'install', 'develop' and 'paper'
  $_REQUEST['m'] = 'user';
}

// - validate arguments before we use them (security): both values are later
//   used to build file system paths and are echoed into the generated HTML.
//   The D modifier makes '$' match only at the very end of the subject;
//   without it a value with a trailing newline (e.g. "user\n") would pass.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

// - tell the user what is about to be imported
echo '<h2>Importing Greenstone Manual XML</h2>' . "\n";
echo '<p><b>XML Source Path:</b> ', $xml_source_path,
     '<br/><b>Manual:</b> ', $_REQUEST['m'],
     '<br/><b>Language:</b> ', $_REQUEST['l'],
     "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";

// 1. By-and-large we're going to process all of this in a big state machine.
//    The top level page, containing cover page and chapter order information,
//    has to be created last, so its information is collected as we go.

// - information gathered for the top level page
$manual_metadata = array();     // field name => list of values (see addMetadata)
$entity_replacements = array(); // entity name => numeric character code
$footnotes = array();           // footnote id => translated footnote text
$page_order = array();          // 'page name|title' entries, in manual order
$page_count = 2;                // incremented as chapters and the top page are written
$frontmatter_text = '';         // everything that is not inside a chapter
$sections_page_name = '';       // id of the first section outside any chapter

// - where in the document we currently are
$looking_for_metadata = '';     // name of the cover metadata element being read
$in_section = false;
$in_chapter = false;
$chapter_id = '';
$chapter_txt_out = false;       // file handle of the chapter page being written
$in_footnotes = false;
$in_indent = false;
$line_counter = 0;

// - list handling
$bullet_depth = 0;              // nesting depth of bullet/numbered lists
// NOTE(review): this starts out true although no list is open yet; the code
// block handling consults it, so confirm the initial value is intentional.
$is_numbered_list = true;
$in_numbered_item = 0;          // 0 outside an item, counts text blocks inside
$in_bullet_item = false;
$seen_code_in_item = false;     // item has just been split by a code block

// - code block handling
$in_code = false;
$is_code_linenumbered = false;  // enclosing figure asked for line numbers

// - construct the path using the information we've been provided as arguments
$xml_file_path = sprintf('%s/%s/%s_%s.xml',
                         $xml_source_path,
                         $_REQUEST['l'],
                         ucfirst($_REQUEST['m']),
                         $_REQUEST['l']);
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
// - first pass over the XML: collect every footnote so that it can later be
//   moved onto the page that refers to it. A footnote is expected to span
//   exactly three lines: <Footnote id="N">, a single-line <Text>, </Footnote>.
while (($line = fgets($xml_in)) !== false)
{
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  // - the line after the opening tag carries the footnote's text
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - throw away </Footnote>
  fgets($xml_in);
}
fclose($xml_in);

// - second pass: reopen the same file so the main parse starts from the top
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
/**
 * Append text to the page currently being assembled.
 *
 * While a chapter is open the text is written straight to that chapter's
 * file; otherwise it is accumulated in the frontmatter buffer.
 *
 * @param string $text the dokuwiki markup to append
 */
function appendToCurrentPage($text)
{
  global $in_chapter, $chapter_txt_out, $frontmatter_text;
  if ($in_chapter)
  {
    fwrite($chapter_txt_out, $text);
  }
  else
  {
    $frontmatter_text .= $text;
  }
}

// Main pass: a line-driven state machine. Each line is matched against the
// cases below in order; the first case that applies consumes the line (and,
// for titles, tables and multi-line text, some of the lines following it).
while (($line = getLine($xml_in)) !== false)
{
  // - Special Case: lingering code blocks, continue if next line also
  //   contains code, otherwise we need an extra newline
  if ($in_code)
  {
    $code_text = "\n";
    if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
    {
      $code_text .= '</code>';
      // - codeblocks that appear inside numbered lists do not get their own
      //   newlines as that would split the item. Instead newlines will be
      //   added when </NumberedItem> encountered.
      if (!$is_numbered_list)
      {
        $code_text .= "\n\n";
      }
      else
      {
        $seen_code_in_item = true;
      }
      $in_code = false;
    }
    appendToCurrentPage($code_text);
  }

  // - some system metadata to watch for
  if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
  {
    $entity = $matches[1];
    addMetadata('ENTITY', $entity);
    // - remember numeric character entities by name
    if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
    {
      $entity_replacements[$matches[1]] = $matches[2];
    }
  }
  // - we have an explicit list of cover metadata to watch for
  elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = $matches[1];
  }
  elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = '';
  }
  // - found metadata we have!
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    $text_id = $matches[1];
    $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
    addMetadata($looking_for_metadata, $text);
  }
  // - bogus (empty) metadata found in French version: deliberately ignored
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
  {
  }
  // - any text we encounter outside of both sections and chapters also
  //   belongs on the cover
  elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    // (for now I'll assume id's are persistent)
    addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
  }
  // - we will probably encounter the opening section (which is outside of a
  //   chapter) first, so we have a special case for it
  elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
  {
    $section_id = $matches[1];
    // - if this is the first non-chapter section we have encountered then it
    //   gets the honor of having the page---that these sections will
    //   eventually be printed out on---named after it. Typically this should
    //   be "about_this_manual"
    if (empty($frontmatter_text))
    {
      $sections_page_name = $section_id;
    }
    $in_section = true;
    $title = getTitle($xml_in, 'section:' . $section_id);
    if (empty($frontmatter_text))
    {
      // - the frontmatter page goes first in the contents list
      array_unshift($page_order, $sections_page_name . '|' . noComments($title));
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
  }
  elseif ($in_section && preg_match('/<\/Section>/', $line))
  {
    $in_section = false;
  }
  // - start of a chapter: every chapter gets a page (file) of its own
  elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
  {
    $chapter_id = $matches[1];
    echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
    $chapter_page_name = $chapter_id;
    // - create a new file to store this chapter
    $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
    if (!file_exists($chapter_file_dir))
    {
      mkAllDir($chapter_file_dir, 0755);
    }
    $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
    // - backup existing file, reporting a failure in the same way as the
    //   top level and frontmatter pages do
    if (file_exists($chapter_file_path))
    {
      $chapter_backup_file_path = $chapter_file_path . '.bak';
      if (!rename($chapter_file_path, $chapter_backup_file_path))
      {
        printError('Failed to rename existing chapter page for backup: ' . $chapter_page_name);
      }
    }
    // - open new file for writing
    $chapter_txt_out = fopen($chapter_file_path, 'w');
    if (!$chapter_txt_out)
    {
      printError('Failed to open page file for writing: ' . $chapter_page_name);
    }
    $in_chapter = true;
    $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
    fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
    array_push($page_order, $chapter_page_name . '|' . noComments($title));
  }
  elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
  {
    fclose($chapter_txt_out);
    $chapter_txt_out = false;
    $in_chapter = false;
    $page_count++;
  }
  // - section, subsection and part titles within chapter
  elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
  {
    $title_type = $matches[1];
    $section_id = $matches[2];
    if (empty($section_id))
    {
      $section_id = generateID(strtolower($title_type));
    }
    echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
    $header_fix = '';
    $title = getTitle($xml_in, 'heading: ' . $title_type);
    // - the deeper the element, the fewer '=' in the dokuwiki heading
    if ($title_type == 'Section')
    {
      $header_fix = '=====';
    }
    elseif ($title_type == 'Subsection')
    {
      $header_fix = '====';
    }
    elseif ($title_type == 'Part')
    {
      $header_fix = '===';
      // - remove b's and i's
      $title = preg_replace('/<\/?(B|I)>/i', '', $title);
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
  }
  elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false  || strpos($line, '</Part') !== false))
  {
    // do nothing for now
  }
  // - figures (and their titles/captions)
  elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
  {
    $figure_id = $matches[1];
    $other_attributes = $matches[2];
    echo '[adding figure: ' . $figure_id . "] \n";
    // We need the title too
    $caption = getTitle($xml_in, 'figure:' . $figure_id);
    $caption = translateText(alternateComments($caption));
    $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
    // - we also check the other attributes to see if the XML has requested
    //   any following codeblock be linenumbered
    if (strpos($other_attributes, 'withLineNumber') !== false)
    {
      $is_code_linenumbered = true;
      $txt .= '%!-- withLineNumber --%';
    }
    $txt .= '></imgcaption>' . "\n";
    appendToCurrentPage($txt);
  }
  elseif (strpos($line, '</Figure>') !== false)
  {
    appendToCurrentPage("\n\n");
    // - no longer required
    $is_code_linenumbered = false;
  }
  // - tables: the whole table is consumed here, up to and including </Table>
  elseif (preg_match('/<Table([^>]*).*?.*?>/', $line, $matches))
  {
    $attributes = $matches[1];
    $table_txt = '';
    $table_id = '';
    if (preg_match('/id="([^"]+)"/', $attributes, $matches))
    {
      $table_id = $matches[1];
    }
    else
    {
      $table_id = generateID('table');
    }
    $hidden = (strpos($attributes, 'class="hidden"') !== false);
    echo '[adding table: ' . $table_id . "] \n";
    $table_caption = getTitle($xml_in, 'table: ' . $table_id);
    if ($hidden)
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
    }
    elseif (empty($table_caption))
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
    }
    else
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
    }
    // - in order to properly capture the table we're going to have to read in
    //   the whole thing here, and take note of column widths
    $have_output_widths = false;
    $column_widths = array();
    // - every loop below also stops when getLine() reports end of file
    //   (false), so a table missing its closing tags cannot hang the import
    while ($line !== false && strpos($line, '</Table>') === false)
    {
      // - find the start of a row
      while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
      {
        $line = getLine($xml_in);
      }
      if ($line !== false && strpos($line, '<tr>') !== false)
      {
        $row_txt = '|';
        $line = getLine($xml_in);
        // - now we read in multiple cells (line starting <th
        while ($line !== false && strpos($line, '<th') === 0)
        {
          // - column widths are taken from the cells of the first row only
          if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
          {
            $cell_width = $matches[1];
            if (!$have_output_widths)
            {
              array_push($column_widths, $cell_width);
            }
          }
          // Ignore empty cells
          // - adding another case for empty header cells (turned up in es
          //   version of "From Paper")
          // NOTE(review): the second pattern also matches a cell that is
          // opened and closed on one line with content in between; that
          // content is not output. Confirm no manual relies on such cells.
          if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
          {
            $row_txt .= ' |';
          }
          else
          {
            $line = getLine($xml_in);
            $first = true;
            while ($line !== false && strpos($line, '</th>') === false)
            {
              // - separate the parts of a multi-part cell with a dokuwiki
              //   linebreak
              if (!$first)
              {
                $row_txt .= '\\\\';
              }
              // - we can have images or text in our tables
              if (preg_match('/<File.*url="images\/([^"]+)".*\/>/', $line, $matches))
              {
                $payload = $matches[0];
                $filename = $matches[1];
                $width = 0;
                if (preg_match('/width="(\d+)"/', $payload, $matches))
                {
                  $width = $matches[1];
                }
                $height = 0;
                if (preg_match('/height="(\d+)"/', $payload, $matches))
                {
                  $height = $matches[1];
                }
                $image_txt = handleImage($filename, $width, $height);
                $row_txt .= ' ' . $image_txt . ' ';
              }
              elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
              {
                $tid = $matches[1];
                $txt = $matches[2];
                // - multiple line text block
                while (strpos($txt, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
                {
                  $txt .= $another_line;
                }
                $txt = str_replace('</Text>', '', $txt);
                $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
              }
              elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/', $line, $matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/<CodeLine>(.*)/', $line, $matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/(.*)<\/CodeLine>/', $line, $matches))
              {
                if (!empty($matches[1]))
                {
                  $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
                }
                else
                {
                  $row_txt .= ' ';
                }
              }
              // we'll add (bogus) linebreaks
              // - this must append: assigning here would throw away every
              //   cell already collected for the current row
              elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
              {
                $row_txt .= ' ';
              }
              else
              {
                printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
              }
              $first = false;
              // - next line
              $line = getLine($xml_in);
            }
            // - close the cell
            $row_txt .= '|';
          }
          // next!
          $line = getLine($xml_in);
        }
        // - if we haven't already, output the width command
        if (!$have_output_widths)
        {
          $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
          $have_output_widths = true;
        }
        $table_txt .= $row_txt . "\n";
        // - throw away the closing </tr>
        $line = getLine($xml_in);
      }
    }
    if ($line === false)
    {
      echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Reached end of file inside table: ' . htmlspecialchars($table_id) . "<hr /></div>\n";
    }
    $table_txt .= "\n";
    appendToCurrentPage($table_txt);
  }
  // - copy and insert images
  elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
  {
    $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
    appendToCurrentPage($image_txt);
  }
  // - bullet lists
  elseif (preg_match('/<BulletList>/', $line))
  {
    echo "[adding bulletlist] \n";
    // - a list nested inside an item starts on a line of its own
    if ($in_bullet_item || $in_numbered_item)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
    }
    $bullet_depth++;
    $is_numbered_list = false;
  }
  // - numbered lists
  elseif (preg_match('/<NumberedList>/', $line))
  {
    echo "[adding numbered list] \n";
    $bullet_depth++;
    $is_numbered_list = true;
    // - reset this flag that keeps track of whether an item (numbered or
    //   otherwise) is legitimately split by a code block
    $seen_code_in_item = false;
  }
  // - end of either kind of list: once the outermost list closes, finish it
  //   with a blank line
  elseif (preg_match('/<\/BulletList>/', $line) || preg_match('/<\/NumberedList>/', $line))
  {
    $bullet_depth--;
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
  }
  elseif (preg_match('/<NumberedItem>/', $line))
  {
    $in_numbered_item = 1;
  }
  elseif (preg_match('/<\/NumberedItem>/', $line))
  {
    $in_numbered_item = 0;
    appendToCurrentPage("\n");
  }
  elseif (preg_match('/<Bullet>/', $line))
  {
    $in_bullet_item = true;
  }
  elseif (preg_match('/<\/Bullet>/', $line))
  {
    appendToCurrentPage("\n");
    $in_bullet_item = false;
  }
  // TEXT HANDLING - this is the main case, but has disappeared into the mire
  // of other cases.
  elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for those text elements split over multiple lines. We
    //   keep concatenating lines until we find the closing text element or we
    //   run out of lines!
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - note that if we ran out of lines (eof) there won't be a </Text> at
    //   the end of the text. That is a validation problem in the XML, but it
    //   should not make this script fail.
    // - now remove the </Text> from the end (hopefully) of str
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    // - and prepend the id while translating the str into Dokuwiki format
    $str = '<!-- id:' . $id . ' -->' . translateText($str);
    if ($bullet_depth > 0)
    {
      if ($is_numbered_list)
      {
        // - special case for those text elements legitimately split in two by
        //   code blocks. They get no bullet of either type and are 'run-on'
        //   immediately to the end of the code element in order to prevent
        //   dokuwiki restarting numbering etc
        if ($seen_code_in_item)
        {
          // - leave str as it is
          // - reset flag just in case the item happens to contain another
          //   code block
          $seen_code_in_item = false;
        }
        else if ($in_numbered_item == 1)
        {
          $str = '- ' . $str;
        }
        // - superspecial case for the poorly formatted numberlists that
        //   contain more than one text block per point. We'll nest them
        //   as a bullet list as that preserves order, formatting and (I
        //   hope) meaning.
        else
        {
          appendToCurrentPage("\n");
          $str = '  * ' . $str;
        }
        $in_numbered_item++;
      }
      else
      {
        $str = '* ' . $str;
      }
      // - dokuwiki list items are indented two spaces per nesting level
      $str = str_repeat('  ', $bullet_depth) . $str;
    }
    else
    {
      // Indented text is preceded by a >
      if ($in_indent)
      {
        $str = '> ' . $str . "\n";
      }
      else
      {
        $str .= "\n";
      }
    }
    if ($bullet_depth == 0)
    {
      $str .= "\n";
    }
    appendToCurrentPage($str);
  }
  // - codified text blocks (the type and id attributes may come in either
  //   order; both patterns capture the id first and the text second)
  elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches)
          || preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      $code_prefix = $is_code_linenumbered ? '<code 1>' : '<code>';
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $code_id . ' -->' . translateText($matches[2], true);
    appendToCurrentPage($code_txt);
  }
  elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
  {
    $code_txt = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      $code_prefix = $is_code_linenumbered ? "<code 1>\n" : "<code>\n";
      $in_code = true;
    }
    // - another special case: codelines that span more than one line
    $another_line = '';
    while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $code_txt .= ' ' . $another_line;
    }
    $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
    $code_txt = $code_prefix . translateText($code_txt, true);
    appendToCurrentPage($code_txt);
  }
  // - there are also sometimes empty codelines - which indicate a newline in
  //   the code listing
  elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
  {
    $code_txt = '';
    if (!$in_code)
    {
      $code_txt = ($is_code_linenumbered ? "<code 1>\n" : "<code>\n") . "\n";
      $in_code = true;
    }
    appendToCurrentPage($code_txt);
  }
  // - reference to an external XML file: passed through untouched
  elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
  {
    appendToCurrentPage($line);
  }
  elseif (strpos($line, '<FootnoteList>') !== false)
  {
    $in_footnotes = true;
  }
  elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
  {
    $in_footnotes = false;
  }
  // Indentation - the closest thing we have is quoting, so we'll use that
  elseif (strpos($line, '<Indented>') !== false)
  {
    $in_indent = true;
  }
  elseif (strpos($line, '</Indented>') !== false)
  {
    $in_indent = false;
  }
  // - pattern of lines to ignore
  else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
  {
  }
  // - we ignore anything else in footnotes too, as they were handled in the
  //   preprocessing pass
  else if ($in_footnotes)
  {
  }
  // - ignore empty lines
  else if (preg_match('/^\s*$/', $line))
  {
  }
  // - meh. French versions have random, non-text element, linebreaks floating
  //   around. Guess I'll honor their formatting even though it's bogus
  else if (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
  {
    appendToCurrentPage(' \\\\');
  }
  // - danger Will Robinson!
  else
  {
    echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
  }
}
859
// 2. We should now have enough metadata to export the cover page: the top
//    level page holding the cover metadata, the ordered list of contents and
//    the system metadata.
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if (!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file, refusing to carry on without one
//   (as is done for the chapter pages)
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open top page file for writing: ' . $top_page_path);
}
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to run an import casually, as it
// invalidates all history/approval/edit information. The import link is
// therefore only written as a comment, leaving import a manual process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
fwrite($txt_out, '  * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata: one table row per value, in this fixed field order
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
foreach ($page_order as $page_info)
{
  // - each entry is already in 'page name|title' form
  fwrite($txt_out, '  * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
927
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file (renamed alongside with a '.bak' suffix)
if (file_exists($frontmatter_page_path))
{
  $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
  if (!rename($frontmatter_page_path, $frontmatter_page_backup_path))
  {
    printError('Failed to rename existing frontmatter page for backup');
  }
}
// - populate the new frontmatter file. file_put_contents() returns false on
//   failure; report that instead of claiming the import completed.
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
{
  printError('Failed to write frontmatter page: ' . $frontmatter_page_path);
}

// - summary and a link back to the manual's wiki page
echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
946
/**
 *  Record another value for a metadata field in the global $manual_metadata
 *  store. Each field maps to a list of values, in the order they were added;
 *  the list is created the first time a field is seen. A progress note naming
 *  the field is echoed on every call.
 *
 *  @param $field the name of the metadata field
 *  @param $value the value to append to that field's list
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - start a fresh list for fields we haven't encountered before
  if (!isset($manual_metadata[$field]))
  {
    $manual_metadata[$field] = array();
  }
  $manual_metadata[$field][] = $value;
}
/** addMetadata() **/
962
/**
 *  Look up the first value stored for a metadata field in the global
 *  $manual_metadata store.
 *
 *  @param $field the name of the metadata field
 *  @return the value at index 0 of the field's list, or the empty string
 *          when the field is unknown or has no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  // - empty() covers both "never set" and "set but holding nothing"
  if (empty($manual_metadata[$field]))
  {
    return '';
  }
  return $manual_metadata[$field][0];
}
/** getFirstMetadata() **/
978
/**
 *  Fetch every value stored for a metadata field in the global
 *  $manual_metadata store.
 *
 *  @param $field the name of the metadata field
 *  @return whatever is stored for the field, or an empty array when the
 *          field has not been set
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/
990
/**
 *  Read in the next title element with nested text element and extract the
 *  title.
 *
 *  Lines are pulled from the input one at a time with getLine(). The forms
 *  handled are:
 *   - a line containing '<Title/>': an empty title, '' is returned at once
 *   - a '<Title>' wrapper holding a '<Text id="...">' line, optionally with a
 *     '<SubTitle>' block either before or after that line, and closed by
 *     '</Title>'
 *   - a '<Text id="...">' line with no '<Title>' wrapper at all
 *   - a self-closing '<Text id="..."/>' (an empty title that still uses up
 *     a text id)
 *  Every text id found is kept in the result as a '<!-- id:... -->' comment
 *  placed directly before the text it belongs to. Title text that runs over
 *  several lines is joined with single spaces until '</Text>' is seen.
 *  Anything unexpected is reported through printError().
 *
 *  @param $xml_in  the input handle, passed straight through to getLine()
 *  @param $element a label for what is being parsed; only used in error
 *                  messages
 *  @return the assembled title string ('' for '<Title/>')
 */
function getTitle($xml_in, $element)
{
  $title = '';
  $in_title_element = false;
  // - the first thing in a chapter will be its title
  $title_line = getLine($xml_in);
  // - super special case: a table with an empty title
  if (strpos($title_line, '<Title/>') !== false)
  {
    return '';
  }
  // - normally the text is wrapped in a Title element. Some language versions
  //   don't wrap titles at all, in which case the first thing we see is the
  //   Text line itself: that simply falls through to the Text parsing below
  //   with $in_title_element left false (so no closing tag is expected)
  if (strpos($title_line, '<Title>') !== false)
  {
    $in_title_element = true;
    $title_line = getLine($xml_in);
  }
  // - some horribly formed entries have the subtitle first within the title
  //   element
  if (strpos($title_line, '<SubTitle>') !== false)
  {
    $title_line = getLine($xml_in);
    if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
    {
      $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
    }
    $title_line = getLine($xml_in);
    if (strpos($title_line, '</SubTitle>') === false)
    {
      printError('Failed to find closing subtitle for: ' . $element);
    }
    $title_line = getLine($xml_in);
  }
  // - grab the chapter title now so we can store it in the page ordering
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $title_line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for text blocks that span multiple lines (as discovered
    //   in the russian "From Paper"): keep appending lines until the closing
    //   tag turns up or the input runs out
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - now remove </Text> (and any whitespace following it)
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    // - a subtitle read earlier is already in $title, so it ends up after
    //   the main title text here
    $title = '<!-- id:' . $id . ' -->' . $str . $title;
  }
  // - special case for (stoopid) empty titles that use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $title_line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }
  // - watch for subtitle elements, and the closing tag of the wrapper
  if ($in_title_element)
  {
    $title_line = getLine($xml_in);
    if (strpos($title_line, '<SubTitle>') !== false)
    {
      $title_line = getLine($xml_in);
      if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
      {
        // - note: a trailing subtitle is placed in front of the title text
        $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
      }
      $title_line = getLine($xml_in);
      if (strpos($title_line, '</SubTitle>') === false)
      {
        printError('Failed to find closing subtitle for: ' . $element);
      }
      $title_line = getLine($xml_in);
    }
    if (strpos($title_line, '</Title>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
  }
  return $title;
}
/** getTitle() **/
1085
/**
 *  Rewrite HTML comment delimiters into their alternate form ('<!--' becomes
 *  '%!--' and '-->' becomes '--%'), then collapse any CrossRef tag down to
 *  just the value of its ref attribute.
 *
 *  @param $text the text to rewrite
 *  @return the rewritten text
 */
function alternateComments($text)
{
  $escaped = str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
  // remove any lurking crossrefs while we are at it
  return preg_replace('/<CrossRef.*?ref="([^"]+)".*?>/', '\\1', $escaped);
}
1094
/**
 *  Turn <i>...</i> pairs into dokuwiki italics (//...//) and then strip HTML
 *  comments from the text. Only comments whose body contains no '>' character
 *  are matched by the comment pattern.
 *
 *  @param $text the text to clean
 *  @return the text with italics converted and comments removed
 */
function noComments($text)
{
  // - the patterns are applied in order: italics first, then comments
  $patterns = array('/<i>(.*?)<\/i>/', '/<!--[^>]+-->/');
  $replacements = array('//\1//', '');
  return preg_replace($patterns, $replacements, $text);
}
1100
/**
 *  Translate a line of code found inside a table. Only entities are decoded
 *  (translateText() is called in entities-only mode), and any <i> / </i>
 *  tags come out the other side as the HTML comments <!--i--> and <!--/i-->
 *  so they are remembered but hidden in the text.
 *
 *  @param $text the raw code line
 *  @return the translated line
 */
function translateTableCodeline($text)
{
  // Shield the italics tags behind placeholders so the translation can't
  // destroy them
  $shielded = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // Translate the text, just decoding the entities
  $translated = translateText($shielded, true);
  // Swap the placeholders for their HTML comment equivalents
  return str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $translated);
}
1116
/**
 *  Translate a run of manual XML text into dokuwiki markup.
 *
 *  Unless $entities_only is set, cross references, links, footnote
 *  references and the inline <b>, <i> and <u> tags are rewritten into their
 *  dokuwiki equivalents. In all cases the user defined entities (from the
 *  global $entity_replacements, entity name => numeric character code) and
 *  the standard &gt; &lt; &amp; entities are then decoded.
 *
 *  Reads the globals $entity_replacements, $footnotes (footnote id =>
 *  footnote text) and $in_code.
 *
 *  @param $text          the text to translate
 *  @param $entities_only when true skip all markup rewriting and only decode
 *                        entities
 *  @return the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text
  //   (so already using encoded entities), otherwise they'll be thoroughly
  //   mangled during the following transforms. This has to be matched with
  //   changes to the HTMLComments plugin in Dokuwiki to allow the correct
  //   thing to be displayed to the user.
  if (!$in_code)
  {
    $text = str_replace('&lt;!--', '%!--', $text);
    $text = str_replace('--&gt;', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter section crossrefs are easily the worst of all...
    //   NOTE(review): the three tests below are made against the whole text,
    //   and the tag then processed is simply the first CrossRef found, which
    //   need not be the external one - confirm this is acceptable for text
    //   mixing internal and external references.
    while (preg_match('/<CrossRef\s[^>]*external[^>]*\/>/', $text) && preg_match('/<CrossRef\s[^>]*target="Chapter"[^>]*\/>/', $text) && preg_match('/<CrossRef\s+(.*?)\/>/', $text, $matches))
    {
      $pattern = $matches[0];
      $attributes = $matches[1];
      $manual_name = '';
      if (preg_match('/external="([^"]+)"/', $attributes, $matches))
      {
        $manual_name = $matches[1];
      }
      $language = '';
      if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
      {
        $language = $matches[1];
      }
      $page_id = '';
      if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
      {
        $page_id = $matches[1];
      }
      if (empty($manual_name) || empty($language) || empty($page_id))
      {
        printError('Failed to parse external reference: ' . $pattern);
      }
      // - best we can do is a search within a restricted namespace
      $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
      $text = str_replace($pattern, $reference, $text);
    }
    // - chapter crossrefs are tricksie due to needing to know ordering numbers
    while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
    {
      $chapter_id = $matches[1];
      $page_name = $chapter_id;
      // - replace the matched tag literally. Building a second regex out of
      //   the id would let characters such as '.', '+' or '(' in the id
      //   match the wrong tag, match nothing (looping forever) or fail to
      //   compile.
      $text = str_replace($matches[0], '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
    }
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references are also tricksie as we've had to extract the
    //   footnotes earlier (during chapter counting)
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      if (!isset($footnotes[$footnote_id]))
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $footnote = $footnotes[$footnote_id];
      // - literal replacement: as a preg_replace() replacement string any
      //   '$1', '\1' or '\\' inside the footnote text would be treated as a
      //   backreference/escape and silently altered
      $text = str_replace($matches[0], '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> tags very carefully (as doku
    //   will less than helpfully turn them into an external link and screw up
    //   everything that follows them on the page).
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - superspecial case for two-part URLs ending in .org (like nzdl.org)
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - another superspecial case, this time for URLs on localhost
    $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - underlines have the same issue around URLs.
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);

    // - replace HTML elements with the dokuwiki style equivalents
    // - wrap the double slash of file/ftp/http schemes in %% markers
    $text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links (each pass strips one
    //   %% marker per link, so repeat until none are left inside [[...]])
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    // - translation of <br/> is currently disabled
    //$text = str_replace(' <br/>', '\\\\ ', $text);
    //$text = str_replace('<br/>', '\\\\ ', $text);
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata)
  foreach ($entity_replacements as $entity=>$code)
  {
    $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
  }
  // - standard entities (&amp; last, so '&amp;lt;' ends up as '&lt;' rather
  //   than being decoded twice)
  $text = str_replace('&gt;','>', $text);
  $text = str_replace('&lt;','<', $text);
  $text = str_replace('&amp;','&', $text);
  return $text;
}
/** translateText() **/
1241
/**
 *  Copy an image from the XML source tree into the dokuwiki media directory
 *  for the current language and build the wiki markup that embeds it.
 *
 *  The source is <xml_source_path>/<l>/images/<filename>; the destination is
 *  <dokuwiki_path>/data/media/<l>/manuals/images/ with the filename
 *  lower-cased, where <l> is $_REQUEST['l']. The destination directory is
 *  created with mkAllDir() when it does not exist. A failed copy is reported
 *  through printError().
 *
 *  @param $filename the image filename, relative to the images directory
 *  @param $width    the width to request in the image markup
 *  @param $height   the height to request in the image markup
 *  @return the dokuwiki image markup string
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
  if (!file_exists($destination_dir))
  {
    mkAllDir($destination_dir, 0755);
  }
  $destination_path = $destination_dir . strtolower($filename);
  // - check the result of copy() itself: testing only for the destination
  //   afterwards would pass on a stale file left by an earlier import even
  //   though this copy failed
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  $image_txt = '{{..:images:' . strtolower($filename) . '?' . $width . 'x' . $height . '&direct}}';
  return $image_txt;
}
1264
/**
 *  Read the next line from the input with fgets(), bumping the global
 *  $line_counter by one for every call (including calls that hit the end of
 *  the input).
 *
 *  @param $in the handle to read from
 *  @return whatever fgets() returns for the handle
 */
function getLine($in)
{
  global $line_counter;
  ++$line_counter;
  $next_line = fgets($in);
  return $next_line;
}
1271
/**
 *  Strip the inline formatting tags <b>, </b>, <i>, </i> and <br/> from the
 *  text, leaving their contents in place.
 *
 *  @param $text the text to strip
 *  @return the text without those tags
 */
function noFormatting($text)
{
  // - removed one after another, in this order
  $markup = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($markup, '', $text);
}
1281
1282?>
Note: See TracBrowser for help on using the browser.