root/documentation/trunk/php/gs-manual-import.php @ 30117

Revision 30117, 41.7 KB (checked in by jmt12, 4 years ago)

Altered to fix bug where imgcaption tag closed before image actually output. Looks like this was intentional at some stage, but means problems in the Wiki, so I'll repair

Line 
1<?php
2
3require_once('common.php');
4
5/** @file gs-manual-import.php
6 *  This script transforms the single XML manual file required by the rest of
7 *  the Greenstone manual generation scripts into the series of dokuwiki pages
8 *  that make up a certain manual (as specified by the 'm' argument) in a
9 *  certain language ('l').
10 */
11
// Without usable arguments there is nothing to import.
if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - a missing (or empty) language falls back to English
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
// - a missing (or empty) manual type falls back to the user manual; the
//   other recognised types are 'install', 'develop' and 'paper'
if (empty($_REQUEST['m']))
{
  $_REQUEST['m'] = 'user';
}

// - validate arguments before we use them (security): both values end up in
//   file paths and in the generated HTML / wiki text below
if (!preg_match('/^(develop|install|paper|user)$/', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}
if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

// - progress banner
echo '<h2>Importing Greenstone Manual XML</h2>', "\n";
echo '<p><b>XML Source Path:</b> ', $xml_source_path,
     '<br/><b>Manual:</b> ', $_REQUEST['m'],
     '<br/><b>Language:</b> ', $_REQUEST['l'],
     "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";
// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
//   needs to be created last, so we have to store its information

// - data gathered while parsing
$manual_metadata = array();       // cover / system metadata, filled by addMetadata()
$entity_replacements = array();   // entity name => numeric character code
$footnotes = array();             // footnote id => translated footnote text
$page_order = array();            // 'page|title' entries for the contents list
$frontmatter_text = '';           // text of all sections found outside chapters
$sections_page_name = '';         // id of the first such section; names the frontmatter page
$chapter_id = '';
$chapter_txt_out = false;         // handle of the chapter page being written, if any

// - counters
$page_count = 2;                  // bumped per chapter and for the cover page; reported at the end
$line_counter = 0;                // only read here (parse warnings); presumably advanced by getLine() - TODO confirm
$bullet_depth = 0;                // nesting depth of bullet / numbered lists
$in_numbered_item = 0;            // 0 outside a numbered item, otherwise 1 + text blocks seen in it

// - where in the document we currently are
$looking_for_metadata = '';       // name of the cover metadata element currently open
$in_section = $in_chapter = false;
$in_code = $in_footnotes = false;
$in_bullet_item = $in_indent = false;

// - list / code block formatting flags
$is_numbered_list = true;
$seen_code_in_item = false;       // set when a code block closes while $is_numbered_list is true
$is_code_linenumbered = false;    // current figure asked for line numbered code
// - the manual lives at <source path>/<lang>/<Manual>_<lang>.xml, built from
//   the (already validated) arguments
$xml_file_path = implode('/', array($xml_source_path, $_REQUEST['l'], ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml'));
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
// - first pass: collect the footnotes, as they need to be moved onto the
//   pages that reference them
while (($line = fgets($xml_in)) !== false)
{
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  // - the footnote text is expected, complete, on the very next line
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - and the line after that (the closing </Footnote>) is discarded
  fgets($xml_in);
}
fclose($xml_in);
// - second pass: start again from the top of the file for the real parse
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
99while (($line = getLine($xml_in)) !== false)
100 {
101   // - Special Case: lingering code blocks, continue if next line also
102   //   contains code, otherwise we need an extra newline
103   if ($in_code)
104   {
105     $code_text = "\n";
106     if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
107     {
108       $code_text .= '</code>';
109       // - codeblocks that appear inside numbered lists do not get their own
110       //   newlines as that would split the item. Instead newlines will be
111       //   added when </NumberedItem> encountered.
112       if (!$is_numbered_list)
113       {
114         $code_text .= "\n\n";
115       }
116       else
117       {
118         $seen_code_in_item = true;
119       }
120       $in_code = false;
121     }
122     if ($in_chapter)
123     {
124       fwrite($chapter_txt_out, $code_text);
125     }
126     else
127     {
128       $frontmatter_text .= $code_text;
129     }
130   }
131   // - some system metadata to watch for
132   if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
133   {
134     $entity = $matches[1];
135     addMetadata('ENTITY',$entity);
136     if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
137     {
138       $entity_replacements[$matches[1]] = $matches[2];
139     }
140   }
141   // - we have an explicit list of cover metadata to watch for
142   elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
143   {
144     $looking_for_metadata = $matches[1];
145   }
146   elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
147   {
148     $looking_for_metadata = '';
149   }
150   // - found metadata we have!
151   elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
152   {
153     $text_id = $matches[1];
154     $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
155     addMetadata($looking_for_metadata, $text);
156   }
157   // - bogus metadata found in French version
158   elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
159   {
160   }
161   // - any text we encounter outside of both sections and chapters also
162   //   belongs on the cover
163   elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
164   {
165     // (for now I'll assume id's are persistent)
166     addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
167   }
168   // - we will probably encounter the opening section (which is outside of a
169   //   chapter) first, so we have a special case for it
170   elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
171   {
172     $section_id = $matches[1];
173     // - if this is the first non-chapter section we have encountered then it
174     //   gets the honor of having the page---that these sections will
175     //   eventually be printed out on---named after it. Typically this should
176     //   be "about_this_manual"
177     if (empty($frontmatter_text))
178     {
179       $sections_page_name = $section_id;
180     }
181     $in_section = true;
182     $title = getTitle($xml_in, 'section:' . $section_id);
183     if (empty($frontmatter_text))
184     {
185       array_unshift($page_order, $sections_page_name . '|' . noComments($title));
186     }
187     // - if the title, as is, wouldn't autogenerate the appropriate id, then
188     //   we have to include the id explicitly (as another html comment block)
189     if ($section_id != generateID($title))
190     {
191       $title = '<!-- sid:' . $section_id . ' -->' . $title;
192       $seen_ids[$section_id] = 1;
193     }
194     $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
195     // - whew. Chapter's going to be just as bad though.
196   }
197   elseif ($in_section && preg_match('/<\/Section>/', $line))
198   {
199     $in_section = false;
200   }
201   elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
202   {
203     $chapter_id = $matches[1];
204     echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
205     $chapter_page_name = $chapter_id;
206     // - create a new file to store this chapter
207     $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
208     if (!file_exists($chapter_file_dir))
209     {
210       mkAllDir($chapter_file_dir, 0755);
211     }
212     $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
213     // - backup existing file
214     if (file_exists($chapter_file_path))
215     {
216       $chapter_backup_file_path = $chapter_file_path . '.bak';
217       rename($chapter_file_path, $chapter_backup_file_path);
218     }
219     // - open new file for writing
220     $chapter_txt_out = fopen($chapter_file_path, 'w');
221     if (!$chapter_txt_out)
222     {
223       printError('Failed to open page file for writing: ' . $chapter_page_name);
224     }
225     $in_chapter = true;
226     $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
227     fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
228     array_push($page_order, $chapter_page_name . '|' . noComments($title));
229   }
230   elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
231   {
232     fclose($chapter_txt_out);
233     $chapter_txt_out = false;
234     $in_chapter = false;
235     $page_count++;
236   }
237   // - section, subsection and part titles within chapter
238   elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
239   {
240     $title_type = $matches[1];
241     $section_id = $matches[2];
242     if (empty($section_id))
243     {
244       $section_id = generateID(strtolower($title_type));
245     }
246     echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
247     $header_fix = '';
248     $title = getTitle($xml_in, 'heading: ' . $title_type);
249     if ($title_type == 'Section')
250     {
251       $header_fix = '=====';
252     }
253     if ($title_type == 'Subsection')
254     {
255       $header_fix = '====';
256     }
257     if ($title_type == 'Part')
258     {
259       $header_fix = '===';
260       // - remove b's and i's
261       $title = preg_replace('/<\/?(B|I)>/i', '', $title);
262     }
263     // - if the title, as is, wouldn't autogenerate the appropriate id, then
264     //   we have to include the id explicitly (as another html comment block)
265     if ($section_id != generateID($title))
266     {
267       $title = '<!-- sid:' . $section_id . ' -->' . $title;
268       $seen_ids[$section_id] = 1;
269     }
270     fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
271   }
272   elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false  || strpos($line, '</Part') !== false))
273   {
274     // do nothing for now
275   }
276   // - figures (and their titles/captions)
277   elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
278   {
279     $figure_id = $matches[1];
280     $other_attributes = $matches[2];
281     echo '[adding figure: ' . $figure_id . "] \n";
282     // We need the title too
283     $caption = getTitle($xml_in, 'figure:' . $figure_id);
284     $caption = translateText(alternateComments($caption));
285     $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
286     // - we also check the other attributes to see if the XML has requested
287     //   any following codeblock be linenumbered
288     if (strpos($other_attributes, 'withLineNumber') !== false)
289     {
290       $is_code_linenumbered = true;
291       $txt .= '%!-- withLineNumber --%';
292     }
293     $txt .= '>';
294     if ($in_chapter)
295     {
296       fwrite($chapter_txt_out, $txt);
297     }
298     else
299     {
300       $frontmatter_text .= $txt;
301     }
302   }
303   elseif (strpos($line, '</Figure>') !== false)
304   {
305     if ($in_chapter)
306     {
307       fwrite($chapter_txt_out, "</imgcaption>\n\n");
308     }
309     else
310     {
311       $frontmatter_text .= "</imgcaption>\n\n";
312     }
313     // - no longer required
314     $is_code_linenumbered = false;
315   }
316   elseif (preg_match('/<Table([^>]*).*?.*?>/', $line, $matches))
317   {
318     $attributes = $matches[1];
319     $table_txt = '';
320     $table_id = '';
321     if (preg_match('/id="([^"]+)"/', $attributes, $matches))
322     {
323       $table_id = $matches[1];
324     }
325     else
326     {
327       $table_id = generateID('table');
328     }
329     $hidden = false;
330     if (strpos($attributes, 'class="hidden"') !== false)
331     {
332       $hidden = true;
333     }
334     echo '[adding table: ' . $table_id . "] \n";
335     $table_caption = getTitle($xml_in, 'table: ' . $table_id);
336     if ($hidden)
337     {
338       $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
339     }
340     elseif (empty($table_caption))
341     {
342       $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
343     }
344     else
345     {
346       $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
347     }
348     // - in order to properly capture the table we're going to have to read in
349     //   the whole thing here, and take note of column widths
350     $have_output_widths = false;
351     $column_widths = array();
352     while (strpos($line, '</Table>') === false)
353     {
354       // - find the start of a row
355       while(!empty($line) && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
356       {
357         $line = getLine($xml_in);
358       }
359       if (strpos($line, '<tr>') !== false)
360       {
361         $row_txt = '|';
362         $line = getLine($xml_in);
363         // - now we read in multiple cells (line starting <th
364         while (strpos($line, '<th') === 0)
365         {
366           if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
367           {
368             $cell_width = $matches[1];
369             if (!$have_output_widths)
370             {
371               array_push($column_widths, $cell_width);
372             }
373           }
374           // Ignore empty cells
375           // - adding another case for empty header cells (turned up in es
376           //   version of "From Paper")
377           if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
378           {
379             $row_txt .= ' |';
380           }
381           else
382           {
383             $line = getLine($xml_in);
384             $first = true;
385             while (strpos($line, '</th>') === false)
386             {
387               if (!$first)
388               {
389                 $row_txt .= '\\\\';
390               }
391               // - we can have images or text in our tables
392               if (preg_match('/<File.*url="images\/([^"]+)".*\/>/', $line, $matches))
393               {
394                 $payload = $matches[0];
395                 $filename = $matches[1];
396                 $width = 0;
397                 if (preg_match('/width="(\d+)"/', $payload, $matches))
398                 {
399                   $width = $matches[1];
400                 }
401                 $height = 0;
402                 if (preg_match('/height="(\d+)"/', $payload, $matches))
403                 {
404                   $height = $matches[1];
405                 }
406                 $image_txt = handleImage($filename, $width, $height);
407                 $row_txt .= ' ' . $image_txt . ' ';
408               }
409               elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
410               {
411                 $tid = $matches[1];
412                 $txt = $matches[2];
413                 // - multiple line text block
414                 while (strpos($txt, '</Text>') === false)
415                 {
416                   $txt .= getLine($xml_in);
417                 }
418                 $txt = str_replace('</Text>','',$txt);
419                 $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
420               }
421               elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/',$line,$matches))
422               {
423                 $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
424               }
425               elseif (preg_match('/<CodeLine>(.*)/',$line,$matches))
426               {
427                 $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
428               }
429               elseif (preg_match('/(.*)<\/CodeLine>/',$line,$matches))
430               {
431                 if (!empty($matches[1]))
432                 {
433                   $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]). '\'\' ';
434                 }
435                 else
436                 {
437                   $row_txt .= ' ';
438                 }
439               }
440               // we'll add (bogus) linebreaks
441               elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
442               {
443                 $row_txt = ' ';
444               }
445               else
446               {
447                 printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
448               }
449               $first = false;
450               // - next line
451               $line = getLine($xml_in);
452             }
453             // - close the cell
454             $row_txt .= '|';
455           }
456           // next!
457           $line = getLine($xml_in);
458         }
459         // - if we haven't already, output the width command
460         if (!$have_output_widths)
461         {
462           $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
463           $have_output_widths = true;
464         }
465         $table_txt .= $row_txt . "\n";
466         // - throw away the closing </tr>
467         $line = getLine($xml_in);
468       }
469     }
470     $table_txt .= "\n";
471     if ($in_chapter)
472     {
473       fwrite($chapter_txt_out, $table_txt);
474     }
475     else
476     {
477       $frontmatter_text .= $table_txt;
478     }
479   }
480   // - copy and insert images
481   elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
482   {
483     $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
484     if ($in_chapter)
485     {
486       fwrite($chapter_txt_out, $image_txt);
487     }
488     else
489     {
490       $frontmatter_text .= $image_txt;
491     }
492   }
493   // - bullet lists
494   elseif (preg_match('/<BulletList>/', $line))
495   {
496     echo "[adding bulletlist] \n";
497     if ($in_bullet_item || $in_numbered_item)
498     {
499       if ($in_chapter)
500       {
501         fwrite($chapter_txt_out, "\n");
502       }
503       elseif ($in_section)
504       {
505         $frontmatter_text .= "\n";
506       }
507     }
508     $bullet_depth++;
509     $is_numbered_list = false;
510   }
511   // - numbered lists
512   elseif (preg_match('/<NumberedList>/', $line))
513   {
514     echo "[adding numbered list] \n";
515     $bullet_depth++;
516     $is_numbered_list = true;
517     // - reset this flag that keeps track of whether an item (numbered or
518     //   otherwise) is legitimately split by a code block
519     $seen_code_in_item = false;
520   }
521   elseif (preg_match('/<\/BulletList>/', $line))
522   {
523     $bullet_depth--;
524     if ($bullet_depth == 0)
525     {
526       if ($in_chapter)
527       {
528         fwrite($chapter_txt_out, "\n");
529       }
530       elseif ($in_section)
531       {
532         $frontmatter_text .= "\n";
533       }
534       $is_numbered_list = false;
535     }
536     ///cho "[finished bulletlist] ";
537   }
538   elseif (preg_match('/<\/NumberedList>/', $line))
539   {
540     $bullet_depth--;
541     if ($bullet_depth == 0)
542     {
543       if ($in_chapter)
544       {
545         fwrite($chapter_txt_out, "\n");
546       }
547       elseif ($in_section)
548       {
549         $frontmatter_text .= "\n";
550       }
551       $is_numbered_list = false;
552     }
553     ///cho "[finished numbered list] ";
554   }
555   elseif (preg_match('/<NumberedItem>/', $line))
556   {
557     $in_numbered_item = 1;
558   }
559   elseif (preg_match('/<\/NumberedItem>/', $line))
560   {
561     $in_numbered_item = 0;
562     if ($in_chapter)
563     {
564       fwrite($chapter_txt_out, "\n");
565     }
566     else
567     {
568       $frontmatter_text .= "\n";
569     }
570   }
571   elseif (preg_match('/<Bullet>/', $line))
572   {
573     $in_bullet_item = true;
574   }
575   elseif (preg_match('/<\/Bullet>/', $line))
576   {
577     if ($in_chapter)
578     {
579       fwrite($chapter_txt_out, "\n");
580     }
581     else
582     {
583       $frontmatter_text .= "\n";
584     }
585     $in_bullet_item = false;
586   }
587   // TEXT HANDLING - this is the main case, but has disappeared into the mire
588   // of other cases.
589   elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
590   {
591     $id = $matches[1];
592     $str = $matches[2];
593     // - special case for those text elements split over multiple lines. We
594     //   keep concatenating lines until we find the closing text element or we
595     //   run out of lines!
596     $another_line = '';
597     while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
598     {
599       $str .= ' ' . $another_line;
600     }
601     // - note that if we ran out of lines (eof) then we'll break out of this
602     //   block anyway, it's just there won't be a <\Text> at the end of this
603     //   block... despite this being a major validation issue in the XML it
604     //   shouldn't result in this script being vladed
605     // - now remove the </Text> from the end (hopefully) of str
606     $str = preg_replace('/<\/Text>\s*/', '', $str);
607     // - and prepend the id while translating the str into Dokuwiki format
608     $str = '<!-- id:' . $id . ' -->' . translateText($str);
609     if ($bullet_depth > 0)
610     {
611       if ($is_numbered_list)
612       {
613         // - special case for those text elements legimately split in two by
614         //   code blocks. They get no bullet of either type and are 'run-on'
615         //   immediately to the end of the code element in order to prevent
616         //   dokuwiki restarting numbering etc
617         if ($seen_code_in_item)
618         {
619           // - leave str as it is
620           // - reset flag just incase the item happens to contain another
621           //   code block
622           $seen_code_in_item = false;
623         }
624         else if ($in_numbered_item == 1)
625         {
626           $str = '- ' . $str;
627         }
628         // - superspecial case for the poorly formatted numberlists that
629         //   contain more than one text block per point. We'll nest them
630         //   as a bullet list as that preserves order, formatting and (I
631         //   hope) meaning.
632         else
633         {
634           if ($in_chapter)
635           {
636             fwrite($chapter_txt_out, "\n");
637           }
638           else
639           {
640             $frontmatter_text .= "\n";
641           }
642           $str = '  * ' . $str;
643         }
644         $in_numbered_item++;
645       }
646       else
647       {
648         $str = '* ' . $str;
649       }
650       for ($i = 0; $i < $bullet_depth; $i++)
651       {
652         $str = '  ' . $str;
653       }
654     }
655     else
656     {
657       // Indented text is preceeded by a >
658       if ($in_indent)
659       {
660         $str = '> ' . $str . "\n";
661       }
662       else
663       {
664         $str .= "\n";
665       }
666     }
667     if ($bullet_depth == 0)
668     {
669       $str .= "\n";
670     }
671     if ($in_chapter)
672     {
673       fwrite($chapter_txt_out, $str);
674     }
675     else
676     {
677       $frontmatter_text .= $str;
678     }
679   }
680   // - codified text blocks
681   elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches))
682   {
683     $code_id = $matches[1];
684     // - determine the appropriate code block prefix
685     $code_prefix = '';
686     if (!$in_code)
687     {
688       if ($is_code_linenumbered)
689       {
690         $code_prefix = '<code 1>';
691       }
692       else
693       {
694         $code_prefix = '<code>';
695       }
696       $in_code = true;
697     }
698     $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
699     if ($in_chapter)
700     {
701       fwrite($chapter_txt_out, $code_txt);
702     }
703     else
704     {
705       $frontmatter_text .= $code_txt;
706     }
707   }
708   elseif (preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
709   {
710     $code_id = $matches[1];
711     // - determine the appropriate code block prefix
712     $code_prefix = '';
713     if (!$in_code)
714     {
715       if ($is_code_linenumbered)
716       {
717         $code_prefix = '<code 1>';
718       }
719       else
720       {
721         $code_prefix = '<code>';
722       }
723       $in_code = true;
724     }
725     $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
726     if ($in_chapter)
727     {
728       fwrite($chapter_txt_out, $code_txt);
729     }
730     else
731     {
732       $frontmatter_text .= $code_txt;
733     }
734   }
735   elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
736   {
737     $code_txt = $matches[1];
738     // - determine the appropriate code block prefix
739     $code_prefix = '';
740     if (!$in_code)
741     {
742       if ($is_code_linenumbered)
743       {
744         $code_prefix = "<code 1>\n";
745       }
746       else
747       {
748         $code_prefix = "<code>\n";
749       }
750       $in_code = true;
751     }
752     // - arg. another special case for codelines that span more than one line
753     // (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
754     // for an element name, eh?)
755     $another_line = '';
756     while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
757     {
758       $code_txt .= ' ' . $another_line;
759     }
760     $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
761     $code_txt = $code_prefix . translateText($code_txt, true);
762     if ($in_chapter)
763     {
764       fwrite($chapter_txt_out, $code_txt);
765     }
766     else
767     {
768       $frontmatter_text .= $code_txt;
769     }
770   }
771   // - there are also sometimes empty codelines - which indicate a newline in
772   //   the code listing
773   elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
774   {
775     $code_txt = '';
776     if (!$in_code)
777     {
778       $code_txt = "\n";
779       if ($is_code_linenumbered)
780       {
781         $code_txt = "<code 1>\n" . $code_txt;
782       }
783       else
784       {
785         $code_txt = "<code>\n" . $code_txt;
786       }
787       $in_code = true;
788     }
789     if ($in_chapter)
790     {
791       fwrite($chapter_txt_out, $code_txt);
792     }
793     else
794     {
795       $frontmatter_text .= $code_txt;
796     }
797   }
798   // - reference to an external XML file
799   elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
800   {
801     if ($in_chapter)
802     {
803       fwrite($chapter_txt_out, $line);
804     }
805     else
806     {
807       $frontmatter_text .= $line;
808     }
809   }
810   elseif (strpos($line, '<FootnoteList>') !== false)
811   {
812     $in_footnotes = true;
813   }
814   elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
815   {
816     $in_footnotes = false;
817   }
818   // Indentation - the closest thing we have is quoting, so we'll use that
819   elseif (strpos($line, '<Indented>') !== false)
820   {
821     $in_indent = true;
822   }
823   elseif (strpos($line, '</Indented>') !== false)
824   {
825     $in_indent = false;
826   }
827   // - pattern of lines to ignore
828   else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
829   {
830   }
831   // - we ignore anything else in footnotes too, as they were handled in the
832   //   preprocessing pass
833   else if ($in_footnotes)
834   {
835   }
836   // - ignore empty lines
837   else if (preg_match('/^\s*$/', $line))
838   {
839   }
840   // - meh. French versions have random, non-text element, linebreaks floating
841   //   around. Guess I'll honor their formatting even though it's bogus
842   else if (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
843   {
844     if ($in_chapter)
845     {
846       fwrite($chapter_txt_out, ' \\\\');
847     }
848     else
849     {
850       $frontmatter_text .= ' \\\\';
851     }
852   }
853   // - danger Will Robinson!
854   else
855   {
856     echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
857   }
858 }
859
// 2. We should now have enough metadata to export the cover page: heading,
//    administrator commands, cover metadata, contents and system metadata
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if (!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file. As for the chapter pages, failing to
//   open it is an error rather than something to write into blindly
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open top page file for writing: ' . $_REQUEST['m']);
}
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to import casually, as it boguses
// all history/approval/edit information. So the import link is only written
// as a comment, leaving importing as a manual process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
fwrite($txt_out, '  * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata: one table row per value collected for each field
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
// - each entry is 'page|title', which slots straight into a wiki link
foreach ($page_order as $page_info)
{
  fwrite($txt_out, '  * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata, wrapped so that only administrators get it
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata  ^ Value  ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . '  | ' . $value . '  |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
927
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file
if (file_exists($frontmatter_page_path))
 {
   $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
   if(!rename($frontmatter_page_path, $frontmatter_page_backup_path))
   {
     printError('Failed to rename existing frontmatter page for backup');
   }
 }
// - populate the new frontmatter file; file_put_contents() returns false on
//   failure, which must not be reported to the user as a completed import
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
 {
   printError('Failed to write frontmatter page');
 }

// - report the result and offer a link back to the manual's wiki page
echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
946
/**
 *  Append a value to the list of values stored for a metadata field in the
 *  global $manual_metadata map, creating the list on first use. A progress
 *  note naming the field is echoed as a side effect.
 *
 *  @param $field the metadata field name (used as the map key)
 *  @param $value the value to append to that field's list
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - appending to a missing entry creates the list for us
  $manual_metadata[$field][] = $value;
}
961/** addMetadata() **/
962
/**
 *  Retrieve the first value stored for a metadata field.
 *
 *  @param $field the metadata field name to look up in $manual_metadata
 *  @return the first stored value, or the empty string when the field is
 *          unknown or has no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  // - guard: nothing recorded (or an empty list) yields the empty string
  if (!isset($manual_metadata[$field]) || empty($manual_metadata[$field]))
  {
    return '';
  }
  return $manual_metadata[$field][0];
}
977/** getFirstMetadata() **/
978
/**
 *  Retrieve every value stored for a metadata field.
 *
 *  @param $field the metadata field name to look up in $manual_metadata
 *  @return the stored list of values, or an empty array when the field has
 *          not been recorded
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
989/** getMetadata() **/
990
/**
 *  Read the title that opens an element from the XML input and return it as
 *  wiki text, with each consumed Text id recorded as an HTML comment of the
 *  form "<!-- id:... -->" in front of its text.
 *
 *  Shapes handled, in the order they are checked:
 *   - "<Title/>" on the first line: returns the empty string immediately
 *   - an optional "<Title>" wrapper (some language versions omit it, in which
 *     case the first line is expected to be the Text element itself)
 *   - an optional SubTitle block before the main Text
 *   - the main Text element, possibly spanning several lines, or a
 *     self-closing Text that only carries an id
 *   - when wrapped: an optional SubTitle block after the main Text, followed
 *     by the closing "</Title>"
 *
 *  @param $xml_in  the handle lines are read from (via getLine())
 *  @param $element name of the element being parsed, used in error messages
 *  @return the assembled title string
 */
function getTitle($xml_in, $element)
{
  $title = '';
  // - the first thing in a chapter will be its title
  $line = getLine($xml_in);
  // - super special case: a table with an empty title
  if (strpos($line, '<Title/>') !== false)
  {
    return '';
  }
  // - remember whether a Title wrapper was opened so we know to close it
  $wrapped = (strpos($line, '<Title>') !== false);
  if ($wrapped)
  {
    $line = getLine($xml_in);
  }
  // - some badly formed entries have the subtitle first within the title
  if (strpos($line, '<SubTitle>') !== false)
  {
    list($title, $line) = getTitleConsumeSubTitle($xml_in, $element, $title);
  }
  // - the main title text
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $line, $matches))
  {
    $body = $matches[2];
    // - text blocks may span multiple lines (as discovered in the russian
    //   "From Paper"): keep reading until the closing tag or end of input
    while (strpos($body, '</Text>') === false)
    {
      $extra = getLine($xml_in);
      if ($extra === false)
      {
        break;
      }
      $body .= ' ' . $extra;
    }
    // - now remove </Text>
    $body = preg_replace('/<\/Text>\s*/', '', $body);
    $title = '<!-- id:' . $matches[1] . ' -->' . $body . $title;
  }
  // - special case for empty titles that still use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }
  // - nothing further to consume unless a Title wrapper is still open
  if (!$wrapped)
  {
    return $title;
  }
  // - watch for a trailing subtitle, then insist on the closing tag
  $line = getLine($xml_in);
  if (strpos($line, '<SubTitle>') !== false)
  {
    list($title, $line) = getTitleConsumeSubTitle($xml_in, $element, $title);
  }
  if (strpos($line, '</Title>') === false)
  {
    printError('Failed to find closing title for: ' . $element);
  }
  return $title;
}

/**
 *  Helper for getTitle(): consume the inside of a SubTitle block whose opening
 *  tag has just been read. Reads the Text line (prefixing it, with its id
 *  comment and a trailing space, onto $title when it matches), then the line
 *  expected to hold the closing tag, then one more line.
 *
 *  @return array($title, $next_line) where $next_line is the line read after
 *          the closing tag
 */
function getTitleConsumeSubTitle($xml_in, $element, $title)
{
  $subtitle_line = getLine($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $subtitle_line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
  }
  $closing_line = getLine($xml_in);
  if (strpos($closing_line, '</SubTitle>') === false)
  {
    printError('Failed to find closing title for: ' . $element);
  }
  return array($title, getLine($xml_in));
}
1084/** getTitle() **/
1085
/**
 *  Rewrite HTML comment delimiters into the alternate "%!--" / "--%" form and
 *  collapse any CrossRef element down to the value of its ref attribute.
 *
 *  @param $text the text to rewrite
 *  @return the rewritten text
 */
function alternateComments($text)
{
  $escaped = str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
  // remove any lurking crossrefs while we are at it
  return preg_replace('/<CrossRef.*?ref="([^"]+)".*?>/', '\\1', $escaped);
}
1094
/**
 *  Convert <i>...</i> spans to wiki italics (//...//) and then strip HTML
 *  comments from the text. Only comments whose body contains no '>' character
 *  are matched by the comment pattern.
 *
 *  @param $text the text to clean
 *  @return the text with italics converted and comments removed
 */
function noComments($text)
{
  // - the two substitutions are applied in order: italics first, then comments
  $patterns = array('/<i>(.*?)<\/i>/', '/<!--[^>]+-->/');
  $replacements = array('//\1//', '');
  return preg_replace($patterns, $replacements, $text);
}
1100
/**
 *  Translate a line of code appearing in a table: only entities are decoded
 *  (translateText() is called with $entities_only set), and italic tags are
 *  preserved as the hidden HTML comments <!--i--> and <!--/i-->.
 *
 *  @param $text the raw code line
 *  @return the translated line
 */
function translateTableCodeline($text)
{
  // Shield the italic tags behind placeholders so the translation cannot
  // disturb them
  $shielded = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // Translate the text, just decoding the entities
  $decoded = translateText($shielded, true);
  // Turn the placeholders into HTML comments so the italics are remembered
  // but hidden in the text
  return str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $decoded);
}
1116
/**
 *  Translate a fragment of manual XML text into dokuwiki markup.
 *
 *  Steps, in order:
 *   1. unless the global $in_code flag is set, entity-encoded HTML comment
 *      delimiters are protected as "%!--" / "--%"
 *   2. unless $entities_only is set: CrossRef, Link and FootnoteRef elements
 *      become wiki links/footnotes, URLs inside <i>/<u> are padded with
 *      spaces, "://" is escaped outside of wiki links, and <b>/<i>/<u> tags
 *      become wiki emphasis markers
 *   3. entities are decoded: first those listed in the global
 *      $entity_replacements map (entity name => numeric character code), then
 *      &gt;, &lt; and finally &amp;
 *
 *  printError() (defined outside this function) is called for an unparsable
 *  external reference or an unknown footnote id.
 *
 *  @param $text          the text to translate
 *  @param $entities_only when true, skip step 2 and only decode entities
 *  @return the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text
  //   (so already using encoded entities), otherwise they would be thoroughly
  //   mangled by the following transforms. This has to be matched with
  //   changes to the HTMLComments plugin in Dokuwiki to allow the correct
  //   thing to be displayed to the user.
  if (!$in_code)
  {
    $text = str_replace('&lt;!--', '%!--', $text);
    $text = str_replace('--&gt;', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter crossrefs: every CrossRef element is inspected on
    //   its own, and only those carrying both an external attribute and
    //   target="Chapter" are converted (other CrossRefs in the same text are
    //   left for the rules further down)
    if (preg_match_all('/<CrossRef\s+(.*?)\/>/', $text, $crossrefs, PREG_SET_ORDER))
    {
      foreach ($crossrefs as $crossref)
      {
        $pattern = $crossref[0];
        $attributes = $crossref[1];
        if (!preg_match('/<CrossRef\s[^>]*external[^>]*\/>/', $pattern) || !preg_match('/<CrossRef\s[^>]*target="Chapter"[^>]*\/>/', $pattern))
        {
          continue;
        }
        $manual_name = '';
        if (preg_match('/external="([^"]+)"/', $attributes, $matches))
        {
          $manual_name = $matches[1];
        }
        $language = '';
        if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
        {
          $language = $matches[1];
        }
        $page_id = '';
        if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
        {
          $page_id = $matches[1];
        }
        if (empty($manual_name) || empty($language) || empty($page_id))
        {
          printError('Failed to parse external reference: ' . $pattern);
        }
        // - best we can do is a search within a restricted namespace
        $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
        $text = str_replace($pattern, $reference, $text);
      }
    }
    // - chapter crossrefs: the matched element is replaced literally, so ids
    //   containing regular expression metacharacters cannot stall the loop
    while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
    {
      $chapter_id = $matches[1];
      $page_name = $chapter_id;
      $text = str_replace($matches[0], '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
    }
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references: the footnote text comes from the global
    //   $footnotes map and is inserted literally, so "$" or "\" followed by
    //   digits in a footnote is not treated as a backreference
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      if (!isset($footnotes[$footnote_id]))
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $footnote = $footnotes[$footnote_id];
      $text = str_replace($matches[0], '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> tags very carefully (as doku
    //   will less than helpfully turn them into an external link and break
    //   everything that follows them on the page).
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - special case for two-part URLs ending in .org (like nzdl.org)
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - another special case, this time for URLs on localhost
    $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - underlines have the same issue around URLs
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);

    // - escape the double slash of URL schemes so that it is not read as
    //   the dokuwiki italics marker
    $text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    // - replace HTML elements with the dokuwiki style equivalents
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata)
  foreach ($entity_replacements as $entity=>$code)
  {
    $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
  }
  // - standard entities (&amp; last, so that an encoded "&amp;lt;" is not
  //   decoded twice)
  $text = str_replace('&gt;','>', $text);
  $text = str_replace('&lt;','<', $text);
  $text = str_replace('&amp;','&', $text);
  return $text;
}
1240/** translateText() **/
1241
/**
 *  Copy a manual image into the dokuwiki media directory for the current
 *  language and return the wiki markup that embeds it.
 *
 *  The source is read from below the global $xml_source_path and the copy is
 *  stored, under a lowercased filename, below the global $dokuwiki_path. The
 *  language comes from $_REQUEST['l']. A progress note is echoed as a side
 *  effect. printError() (defined outside this function) is called when the
 *  copy fails.
 *
 *  @param $filename the image filename as it appears in the manual source
 *  @param $width    the display width used in the wiki markup
 *  @param $height   the display height used in the wiki markup
 *  @return the dokuwiki image markup string
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
  if (!file_exists($destination_dir))
  {
    mkAllDir($destination_dir, 0755);
  }
  $destination_path = $destination_dir . strtolower($filename);
  // - the result of copy() itself must be checked: a stale image left behind
  //   by an earlier import would otherwise satisfy the existence test and
  //   hide a failed copy
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  $image_txt = '{{..:images:' . strtolower($filename) . '?' . $width . 'x' . $height . '&direct}}';
  return $image_txt;
}
1264
/**
 *  Read the next line from an open handle, counting every read attempt in the
 *  global $line_counter.
 *
 *  @param $in the handle to read from
 *  @return whatever fgets() returns: the line (including its line ending) or
 *          false when nothing more can be read
 */
function getLine($in)
{
  global $line_counter;
  ++$line_counter;
  $next_line = fgets($in);
  return $next_line;
}
1271
/**
 *  Strip the bold, italic and line break tags from a piece of text, leaving
 *  their contents in place.
 *
 *  @param $text the text to strip
 *  @return the text without <b>, </b>, <i>, </i> and <br/> tags
 */
function noFormatting($text)
{
  // - removed one after another, in this order
  $tags = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($tags, '', $text);
}
1281
1282?>
Note: See TracBrowser for help on using the browser.