source: documentation/trunk/php/gs-manual-import.php@ 25026

Last change on this file since 25026 was 25026, checked in by jmt12, 12 years ago

PHP scripts for importing and exporting the XML manuals into and out of Dokuwiki

File size: 38.4 KB
Line 
1<?php
2
3require_once('common.php');
4
5/** @file gs-manual-import.php
6 * This script transforms the single XML manual file required by the rest of
7 * the Greenstone manual generation scripts into the series of dokuwiki pages
8 * that make up a certain manual (as specified by the 'm' argument) in a
9 * certain language ('l').
10 */
11
// - parse any command line arguments; a false return signals bad usage
if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - default to the English ('en') user manual when an argument is missing or
//   empty (empty() already covers the unset case, so no separate isset())
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  // - other accepted values: 'install', 'develop', 'paper'
  $_REQUEST['m'] = 'user';
}

// - validate arguments before we use them (security): both values are later
//   concatenated into file system paths and written into the generated pages.
//   The D modifier makes '$' match only at the very end of the subject;
//   without it a value with a trailing newline (e.g. "user\n") would pass.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}
40
// - tell the user what is about to be imported
echo '<h2>Importing Greenstone Manual XML</h2>', "\n";
echo '<p><b>XML Source Path:</b> ', $xml_source_path, '<br/><b>Manual:</b> ', $_REQUEST['m'], '<br/><b>Language:</b> ', $_REQUEST['l'], "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";

// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
//   needs to be created last, so we have to store its information

// - collected content
$manual_metadata = array();
$entity_replacements = array();
$footnotes = array();
$page_order = array();
$page_count = 2;
$frontmatter_text = '';
$sections_page_name = '';
$chapter_id = '';
$chapter_txt_out = false;

// - parser state
$looking_for_metadata = '';
$line_counter = 0;
$in_section = false;
$in_chapter = false;
$in_footnotes = false;
$in_indent = false;

// - list state
$bullet_depth = 0;
$is_numbered_list = true;
$in_numbered_item = 0;
$in_bullet_item = false;
$seen_code_in_item = false;

// - code block state
$in_code = false;
$is_code_linenumbered = false;

// - construct the path using the information we've been provided as arguments
$xml_file_path = $xml_source_path . '/' . $_REQUEST['l'] . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';

// - first pass over the XML: read in any footnotes, as we'll need to move
//   them onto their appropriate page during the second pass
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
while (($line = fgets($xml_in)) !== false)
{
  // - only lines opening a footnote are of interest here
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  // - the footnote's text is expected on the line that follows
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - throw away </Footnote>
  fgets($xml_in);
}
fclose($xml_in);

// - second pass: reopen the file so the main loop parses it from the start
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
99while (($line = getLine($xml_in)) !== false)
100 {
101 // - Special Case: lingering code blocks, continue if next line also
102 // contains code, otherwise we need an extra newline
103 if ($in_code)
104 {
105 $code_text = "\n";
106 if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
107 {
108 $code_text .= '</code>';
109 // - codeblocks that appear inside numbered lists do not get their own
110 // newlines as that would split the item. Instead newlines will be
111 // added when </NumberedItem> encountered.
112 if (!$is_numbered_list)
113 {
114 $code_text .= "\n\n";
115 }
116 else
117 {
118 $seen_code_in_item = true;
119 }
120 $in_code = false;
121 }
122 if ($in_chapter)
123 {
124 fwrite($chapter_txt_out, $code_text);
125 }
126 else
127 {
128 $frontmatter_text .= $code_text;
129 }
130 }
131 // - some system metadata to watch for
132 if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
133 {
134 $entity = $matches[1];
135 addMetadata('ENTITY',$entity);
136 if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
137 {
138 $entity_replacements[$matches[1]] = $matches[2];
139 }
140 }
141 // - we have an explicit list of cover metadata to watch for
142 elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
143 {
144 $looking_for_metadata = $matches[1];
145 }
146 elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
147 {
148 $looking_for_metadata = '';
149 }
150 // - found metadata we have!
151 elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
152 {
153 $text_id = $matches[1];
154 $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
155 addMetadata($looking_for_metadata, $text);
156 }
157 // - any text we encounter outside of both sections and chapters also
158 // belongs on the cover
159 elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
160 {
161 // (for now I'll assume id's are persistent)
162 addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
163 }
164 // - we will probably encounter the opening section (which is outside of a
165 // chapter) first, so we have a special case for it
166 elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
167 {
168 $section_id = $matches[1];
169 // - if this is the first non-chapter section we have encountered then it
170 // gets the honor of having the page---that these sections will
171 // eventually be printed out on---named after it. Typically this should
172 // be "about_this_manual"
173 if (empty($frontmatter_text))
174 {
175 $sections_page_name = $section_id;
176 }
177 $in_section = true;
178 $title = getTitle($xml_in, 'section:' . $section_id);
179 if (empty($frontmatter_text))
180 {
181 array_unshift($page_order, $sections_page_name . '|' . noComments($title));
182 }
183 // - if the title, as is, wouldn't autogenerate the appropriate id, then
184 // we have to include the id explicitly (as another html comment block)
185 if ($section_id != generateID($title))
186 {
187 $title = '<!-- sid:' . $section_id . ' -->' . $title;
188 $seen_ids[$section_id] = 1;
189 }
190 $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
191 // - whew. Chapter's going to be just as bad though.
192 }
193 elseif ($in_section && preg_match('/<\/Section>/', $line))
194 {
195 $in_section = false;
196 }
197 elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
198 {
199 $chapter_id = $matches[1];
200 echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
201 $chapter_page_name = $chapter_id;
202 // - create a new file to store this chapter
203 $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
204 if (!file_exists($chapter_file_dir))
205 {
206 mkAllDir($chapter_file_dir, 0755);
207 }
208 $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
209 // - backup existing file
210 if (file_exists($chapter_file_path))
211 {
212 $chapter_backup_file_path = $chapter_file_path . '.bak';
213 rename($chapter_file_path, $chapter_backup_file_path);
214 }
215 // - open new file for writing
216 $chapter_txt_out = fopen($chapter_file_path, 'w');
217 if (!$chapter_txt_out)
218 {
219 printError('Failed to open page file for writing: ' . $chapter_page_name);
220 }
221 $in_chapter = true;
222 $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
223 fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
224 array_push($page_order, $chapter_page_name . '|' . noComments($title));
225 }
226 elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
227 {
228 fclose($chapter_txt_out);
229 $chapter_txt_out = false;
230 $in_chapter = false;
231 $page_count++;
232 }
233 // - section, subsection and part titles within chapter
234 elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]+)">/', $line, $matches))
235 {
236 $title_type = $matches[1];
237 $section_id = $matches[2];
238 $header_fix = '';
239 $title = getTitle($xml_in, 'heading: ' . $title_type);
240 if ($title_type == 'Section')
241 {
242 $header_fix = '=====';
243 }
244 if ($title_type == 'Subsection')
245 {
246 $header_fix = '====';
247 }
248 if ($title_type == 'Part')
249 {
250 $header_fix = '===';
251 // - remove b's and i's
252 $title = preg_replace('/<\/?(B|I)>/i', '', $title);
253 }
254 // - if the title, as is, wouldn't autogenerate the appropriate id, then
255 // we have to include the id explicitly (as another html comment block)
256 if ($section_id != generateID($title))
257 {
258 $title = '<!-- sid:' . $section_id . ' -->' . $title;
259 $seen_ids[$section_id] = 1;
260 }
261 fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
262 }
263 elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false || strpos($line, '</Part') !== false))
264 {
265 // do nothing for now
266 }
267 // - figures (and their titles/captions)
268 elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
269 {
270 $figure_id = $matches[1];
271 $other_attributes = $matches[2];
272 echo '[adding figure: ' . $figure_id . "] \n";
273 // We need the title too
274 $caption = getTitle($xml_in, 'figure:' . $figure_id);
275 $caption = translateText(alternateComments($caption));
276 $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
277 // - we also check the other attributes to see if the XML has requested
278 // any following codeblock be linenumbered
279 if (strpos($other_attributes, 'withLineNumber') !== false)
280 {
281 $is_code_linenumbered = true;
282 $txt .= '%!-- withLineNumber --%';
283 }
284 $txt .= '></imgcaption>' . "\n";
285 if ($in_chapter)
286 {
287 fwrite($chapter_txt_out, $txt);
288 }
289 else
290 {
291 $frontmatter_text .= $txt;
292 }
293 }
294 elseif (strpos($line, '</Figure>') !== false)
295 {
296 if ($in_chapter)
297 {
298 fwrite($chapter_txt_out, "\n\n");
299 }
300 else
301 {
302 $frontmatter_text .= "\n\n";
303 }
304 // - no longer required
305 $is_code_linenumbered = false;
306 }
elseif (preg_match('/<Table([^>]*).*?.*?>/', $line, $matches))
{
  // - tables: emit a caption element, then read the whole table here (row by
  //   row, cell by cell) and convert it into DokuWiki table syntax
  $attributes = $matches[1];
  $table_txt = '';
  // - use the explicit id when there is one, otherwise generate an id
  if (preg_match('/id="([^"]+)"/', $attributes, $matches))
  {
    $table_id = $matches[1];
  }
  else
  {
    $table_id = generateID('table');
  }
  $hidden = (strpos($attributes, 'class="hidden"') !== false);
  echo '[adding table: ' . $table_id . "] \n";
  $table_caption = getTitle($xml_in, 'table: ' . $table_id);
  // - hidden tables and tables without a caption get marker captions
  if ($hidden)
  {
    $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
  }
  elseif (empty($table_caption))
  {
    $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
  }
  else
  {
    $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
  }
  // - in order to properly capture the table we're going to have to read in
  //   the whole thing here, and take note of column widths
  // - every loop below also stops when getLine() returns false (end of
  //   file); previously a truncated table, or an empty line between rows,
  //   left these loops spinning forever without reading any further input
  $have_output_widths = false;
  $column_widths = array();
  while ($line !== false && strpos($line, '</Table>') === false)
  {
    // - find the start of a row
    while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
    {
      $line = getLine($xml_in);
    }
    if ($line !== false && strpos($line, '<tr>') !== false)
    {
      $row_txt = '|';
      $line = getLine($xml_in);
      // - now we read in multiple cells
      while ($line !== false && preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
      {
        $cell_width = $matches[1];
        // - widths are only collected from the first row
        if (!$have_output_widths)
        {
          array_push($column_widths, $cell_width);
        }
        // Ignore empty cells
        // - adding another case for empty header cells (turned up in es
        //   version of "From Paper")
        // NOTE(review): the second pattern matches ANY cell that opens and
        // closes on one line, so the content of such a cell is dropped -
        // confirm that single-line cells are always empty in the XML
        if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
        {
          $row_txt .= ' |';
        }
        else
        {
          $line = getLine($xml_in);
          $first = true;
          while ($line !== false && strpos($line, '</th>') === false)
          {
            // - several elements in one cell are separated by a forced
            //   line break
            if (!$first)
            {
              $row_txt .= '\\\\';
            }
            // - we can have images or text in our tables
            if (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
            {
              $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
              $row_txt .= ' ' . $image_txt . ' ';
            }
            elseif (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
            {
              $row_txt .= ' <!-- id:' . $matches[1] . ' -->' . translateText($matches[2]) . ' ';
            }
            elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/', $line, $matches))
            {
              $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
            }
            else
            {
              printError('Warning! Unrecognized element in table: ' . $line);
            }
            $first = false;
            // - next line
            $line = getLine($xml_in);
          }
          // - close the cell
          $row_txt .= '|';
        }
        // next!
        $line = getLine($xml_in);
      }
      // - if we haven't already, output the width command
      if (!$have_output_widths)
      {
        $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
        $have_output_widths = true;
      }
      $table_txt .= $row_txt . "\n";
      // - throw away the closing </tr>
      $line = getLine($xml_in);
    }
  }
  // - running out of input means the table was never closed
  if ($line === false)
  {
    printError('Failed to find closing table element for table: ' . $table_id);
  }
  $table_txt .= "\n";
  if ($in_chapter)
  {
    fwrite($chapter_txt_out, $table_txt);
  }
  else
  {
    $frontmatter_text .= $table_txt;
  }
}
428 // - copy and insert images
429 elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
430 {
431 $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
432 if ($in_chapter)
433 {
434 fwrite($chapter_txt_out, $image_txt);
435 }
436 else
437 {
438 $frontmatter_text .= $image_txt;
439 }
440 }
441 // - bullet lists
442 elseif (preg_match('/<BulletList>/', $line))
443 {
444 echo "[adding bulletlist] \n";
445 if ($in_bullet_item || $in_numbered_item)
446 {
447 if ($in_chapter)
448 {
449 fwrite($chapter_txt_out, "\n");
450 }
451 elseif ($in_section)
452 {
453 $frontmatter_text .= "\n";
454 }
455 }
456 $bullet_depth++;
457 $is_numbered_list = false;
458 }
459 // - numbered lists
460 elseif (preg_match('/<NumberedList>/', $line))
461 {
462 echo "[adding numbered list] \n";
463 $bullet_depth++;
464 $is_numbered_list = true;
465 // - reset this flag that keeps track of whether an item (numbered or
466 // otherwise) is legitimately split by a code block
467 $seen_code_in_item = false;
468 }
469 elseif (preg_match('/<\/BulletList>/', $line))
470 {
471 $bullet_depth--;
472 if ($bullet_depth == 0)
473 {
474 if ($in_chapter)
475 {
476 fwrite($chapter_txt_out, "\n");
477 }
478 elseif ($in_section)
479 {
480 $frontmatter_text .= "\n";
481 }
482 $is_numbered_list = false;
483 }
484 ///cho "[finished bulletlist] ";
485 }
486 elseif (preg_match('/<\/NumberedList>/', $line))
487 {
488 $bullet_depth--;
489 if ($bullet_depth == 0)
490 {
491 if ($in_chapter)
492 {
493 fwrite($chapter_txt_out, "\n");
494 }
495 elseif ($in_section)
496 {
497 $frontmatter_text .= "\n";
498 }
499 $is_numbered_list = false;
500 }
501 ///cho "[finished numbered list] ";
502 }
503 elseif (preg_match('/<NumberedItem>/', $line))
504 {
505 $in_numbered_item = 1;
506 }
507 elseif (preg_match('/<\/NumberedItem>/', $line))
508 {
509 $in_numbered_item = 0;
510 if ($in_chapter)
511 {
512 fwrite($chapter_txt_out, "\n");
513 }
514 else
515 {
516 $frontmatter_text .= "\n";
517 }
518 }
519 elseif (preg_match('/<Bullet>/', $line))
520 {
521 $in_bullet_item = true;
522 }
523 elseif (preg_match('/<\/Bullet>/', $line))
524 {
525 if ($in_chapter)
526 {
527 fwrite($chapter_txt_out, "\n");
528 }
529 else
530 {
531 $frontmatter_text .= "\n";
532 }
533 $in_bullet_item = false;
534 }
535 // TEXT HANDLING - this is the main case, but has disappeared into the mire
536 // of other cases.
537 elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
538 {
539 $id = $matches[1];
540 $str = $matches[2];
541 // - special case for those text elements split over multiple lines. We
542 // keep concatenating lines until we find the closing text element or we
543 // run out of lines!
544 $another_line = '';
545 while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
546 {
547 $str .= ' ' . $another_line;
548 }
549 // - note that if we ran out of lines (eof) then we'll break out of this
550 // block anyway, it's just there won't be a <\Text> at the end of this
551 // block... despite this being a major validation issue in the XML it
552 // shouldn't result in this script being vladed
553 // - now remove the </Text> from the end (hopefully) of str
554 $str = preg_replace('/<\/Text>\s*/', '', $str);
555 // - and prepend the id while translating the str into Dokuwiki format
556 $str = '<!-- id:' . $id . ' -->' . translateText($str);
557 if ($bullet_depth > 0)
558 {
559 if ($is_numbered_list)
560 {
561 // - special case for those text elements legitimately split in two by
562 // code blocks. They get no bullet of either type and are 'run-on'
563 // immediately to the end of the code element in order to prevent
564 // dokuwiki restarting numbering etc
565 if ($seen_code_in_item)
566 {
567 // - leave str as it is
568 // - reset flag just incase the item happens to contain another
569 // code block
570 $seen_code_in_item = false;
571 }
572 else if ($in_numbered_item == 1)
573 {
574 $str = '- ' . $str;
575 }
576 // - superspecial case for the poorly formatted numberlists that
577 // contain more than one text block per point. We'll nest them
578 // as a bullet list as that preserves order, formatting and (I
579 // hope) meaning.
580 else
581 {
582 if ($in_chapter)
583 {
584 fwrite($chapter_txt_out, "\n");
585 }
586 else
587 {
588 $frontmatter_text .= "\n";
589 }
590 $str = ' * ' . $str;
591 }
592 $in_numbered_item++;
593 }
594 else
595 {
596 $str = '* ' . $str;
597 }
598 for ($i = 0; $i < $bullet_depth; $i++)
599 {
600 $str = ' ' . $str;
601 }
602 }
603 else
604 {
605 // Indented text is preceded by a >
606 if ($in_indent)
607 {
608 $str = '> ' . $str . "\n";
609 }
610 else
611 {
612 $str .= "\n";
613 }
614 }
615 if ($bullet_depth == 0)
616 {
617 $str .= "\n";
618 }
619 if ($in_chapter)
620 {
621 fwrite($chapter_txt_out, $str);
622 }
623 else
624 {
625 $frontmatter_text .= $str;
626 }
627 }
628 // - codified text blocks
629 elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches))
630 {
631 $code_id = $matches[1];
632 // - determine the appropriate code block prefix
633 $code_prefix = '';
634 if (!$in_code)
635 {
636 if ($is_code_linenumbered)
637 {
638 $code_prefix = '<code 1>';
639 }
640 else
641 {
642 $code_prefix = '<code>';
643 }
644 $in_code = true;
645 }
646 $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
647 if ($in_chapter)
648 {
649 fwrite($chapter_txt_out, $code_txt);
650 }
651 else
652 {
653 $frontmatter_text .= $code_txt;
654 }
655 }
656 elseif (preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
657 {
658 $code_id = $matches[1];
659 // - determine the appropriate code block prefix
660 $code_prefix = '';
661 if (!$in_code)
662 {
663 if ($is_code_linenumbered)
664 {
665 $code_prefix = '<code 1>';
666 }
667 else
668 {
669 $code_prefix = '<code>';
670 }
671 $in_code = true;
672 }
673 $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
674 if ($in_chapter)
675 {
676 fwrite($chapter_txt_out, $code_txt);
677 }
678 else
679 {
680 $frontmatter_text .= $code_txt;
681 }
682 }
683 elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
684 {
685 $code_txt = $matches[1];
686 // - determine the appropriate code block prefix
687 $code_prefix = '';
688 if (!$in_code)
689 {
690 if ($is_code_linenumbered)
691 {
692 $code_prefix = "<code 1>\n";
693 }
694 else
695 {
696 $code_prefix = "<code>\n";
697 }
698 $in_code = true;
699 }
700 // - arg. another special case for codelines that span more than one line
701 // (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
702 // for an element name, eh?)
703 $another_line = '';
704 while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
705 {
706 $code_txt .= ' ' . $another_line;
707 }
708 $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
709 $code_txt = $code_prefix . translateText($code_txt, true);
710 if ($in_chapter)
711 {
712 fwrite($chapter_txt_out, $code_txt);
713 }
714 else
715 {
716 $frontmatter_text .= $code_txt;
717 }
718 }
719 // - there are also sometimes empty codelines - which indicate a newline in
720 // the code listing
721 elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
722 {
723 $code_txt = '';
724 if (!$in_code)
725 {
726 $code_txt = "\n";
727 if ($is_code_linenumbered)
728 {
729 $code_txt = "<code 1>\n" . $code_txt;
730 }
731 else
732 {
733 $code_txt = "<code>\n" . $code_txt;
734 }
735 $in_code = true;
736 }
737 if ($in_chapter)
738 {
739 fwrite($chapter_txt_out, $code_txt);
740 }
741 else
742 {
743 $frontmatter_text .= $code_txt;
744 }
745 }
746 // - reference to an external XML file
747 elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
748 {
749 if ($in_chapter)
750 {
751 fwrite($chapter_txt_out, $line);
752 }
753 else
754 {
755 $frontmatter_text .= $line;
756 }
757 }
758 elseif (strpos($line, '<FootnoteList>') !== false)
759 {
760 $in_footnotes = true;
761 }
762 elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
763 {
764 $in_footnotes = false;
765 }
766 // Indentation - the closest thing we have is quoting, so we'll use that
767 elseif (strpos($line, '<Indented>') !== false)
768 {
769 $in_indent = true;
770 }
771 elseif (strpos($line, '</Indented>') !== false)
772 {
773 $in_indent = false;
774 }
775 // - pattern of lines to ignore
776 elseif (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
777 {
778 }
779 // - we ignore anything else in footnotes too, as they were handled in the
780 // preprocessing pass
781 elseif ($in_footnotes)
782 {
783 }
784 // - danger Will Robinson!
785 else
786 {
787 echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
788 }
789 }
790
// 2. We should now have enough metadata to export the cover page
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if (!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file; the handle is checked (as is done
//   for the chapter files) so a failed open is reported rather than followed
//   by a series of fwrite() calls on an invalid handle
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open top page for writing: ' . $_REQUEST['m']);
}
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process (hence the import link is only written as a comment).
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
// NOTE(review): DokuWiki list items need at least two leading spaces; the
// list literals on this page show one - confirm against the real file, as
// repeated spaces may have been collapsed in this copy of the source
fwrite($txt_out, ' * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata: one table row per stored value, in a fixed field order
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
foreach ($page_order as $page_info)
{
  fwrite($txt_out, ' * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata (wrapped in an <ifauth @admin> block)
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
858
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter. The page is named after the first section found outside of
//    a chapter.
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file
if (file_exists($frontmatter_page_path))
{
  $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
  if (!rename($frontmatter_page_path, $frontmatter_page_backup_path))
  {
    printError('Failed to rename existing frontmatter page for backup');
  }
}
// - populate the new frontmatter file; file_put_contents() returns false on
//   failure, which must not be silently reported as a completed import
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
{
  printError('Failed to write frontmatter page: ' . $sections_page_name);
}

echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
877
/**
 * Append a value to the list of values stored for a metadata field in the
 * global $manual_metadata array, creating the list on first use. A progress
 * note naming the field is echoed for every call.
 *
 * @param string $field the metadata field name (used as the array key)
 * @param mixed  $value the value to append
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - start from the values already stored, if any
  $stored = isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
  $stored[] = $value;
  $manual_metadata[$field] = $stored;
}
892/** addMetadata() **/
893
/**
 * Return the first value stored for a metadata field in the global
 * $manual_metadata array.
 *
 * @param string $field the metadata field name
 * @return mixed the first stored value, or '' when the field is unknown or
 *               has no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  if (!isset($manual_metadata[$field]))
  {
    return '';
  }
  $stored = $manual_metadata[$field];
  return empty($stored) ? '' : $stored[0];
}
908/** getFirstMetadata() **/
909
/**
 * Return all values stored for a metadata field in the global
 * $manual_metadata array.
 *
 * @param string $field the metadata field name
 * @return array the stored values, or an empty array for an unknown field
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
920/** getMetadata() **/
921
/**
 * Read the title element that follows the current position in the XML and
 * return its text.
 *
 * The expected layout is one element per line: <Title>, a <Text id="..">
 * element (which may continue over several lines, or be self-closing),
 * an optional <SubTitle> element wrapping a single-line <Text>, and finally
 * </Title>. Problems are reported through printError().
 *
 * @param mixed  $xml_in  the input handle, handed straight on to getLine()
 * @param string $element description of the owning element, only used in
 *                        error messages
 * @return string the title text prefixed with a '<!-- id:.. -->' comment
 *                holding its text id (a subtitle, with its own id comment,
 *                is placed in front of the title), or '' for <Title/>
 */
function getTitle($xml_in, $element)
{
  // - the title element is expected on the very next line
  $current_line = getLine($xml_in);
  // - super special case: a table with an empty title
  if (strpos($current_line, '<Title/>') !== false)
  {
    return '';
  }
  if (strpos($current_line, '<Title>') === false)
  {
    printError('Failed to find opening title for: ' . $element);
  }

  // - the title text itself
  $heading = '';
  $current_line = getLine($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $current_line, $found))
  {
    $text_id = $found[1];
    $body = $found[2];
    // - text blocks may span multiple lines (as discovered in the russian
    //   "From Paper"), so keep appending lines until the text is closed or
    //   there are no more lines
    while (strpos($body, '</Text>') === false)
    {
      $continuation = getLine($xml_in);
      if ($continuation === false)
      {
        break;
      }
      $body .= ' ' . $continuation;
    }
    // - drop the closing </Text> and prepend the id
    $heading = '<!-- id:' . $text_id . ' -->' . preg_replace('/<\/Text>\s*/', '', $body);
  }
  // - empty titles that nevertheless use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $current_line, $found))
  {
    $heading = '<!-- id:' . $found[1] . ' -->';
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }

  // - an optional subtitle may follow
  $current_line = getLine($xml_in);
  if (strpos($current_line, '<SubTitle>') !== false)
  {
    $current_line = getLine($xml_in);
    if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $current_line, $found))
    {
      $heading = '<!-- id:' . $found[1] . ' -->' . $found[2] . ' ' . $heading;
    }
    $current_line = getLine($xml_in);
    if (strpos($current_line, '</SubTitle>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
    $current_line = getLine($xml_in);
  }

  // - whatever came before, this line has to close the title
  if (strpos($current_line, '</Title>') === false)
  {
    printError('Failed to find closing title for: ' . $element);
  }
  return $heading;
}
988/** getTitle() **/
989
/**
 * Rewrite HTML comment delimiters into the alternate form used by this
 * script: '<!--' becomes '%!--' and '-->' becomes '--%'.
 *
 * @param string $text the text to convert
 * @return string the text with both delimiters replaced
 */
function alternateComments($text)
{
  // - str_replace() applies the pairs one after the other, in this order
  return str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
}
996
/**
 * Turn <i>..</i> pairs into DokuWiki italics (//..//) and then remove HTML
 * comments that contain no '>' character from the text.
 *
 * @param string $text the text to clean
 * @return string the cleaned text
 */
function noComments($text)
{
  $with_italics = preg_replace('/<i>(.*?)<\/i>/', '//$1//', $text);
  $without_comments = preg_replace('/<!--[^>]+-->/', '', $with_italics);
  return $without_comments;
}
1002
/**
 * Translate a code line found inside a table cell: the text is run through
 * translateText() in entities-only mode, while any <i> and </i> tags are
 * carried through as the HTML comments '<!--i-->' and '<!--/i-->'.
 *
 * @param string $text the content of the code line
 * @return string the translated text
 */
function translateTableCodeline($text)
{
  // - escape the italics tags first so the translation cannot destroy them
  $escaped = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // - translate the text, just decoding the entities
  $translated = translateText($escaped, true);
  // - turn the escaped tags into HTML comments, so they are remembered but
  //   hidden in the text
  return str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $translated);
}
1018
1019function translateText($text, $entities_only=false)
1020{
1021 global $entity_replacements;
1022 global $footnotes;
1023 global $in_code;
1024
1025 // - immediate find and protect any legitimate HTML comments in the text
1026 // (so already using encoded entities), otherwise they'll be thoroughly
1027 // vladed during the following transforms. This has to be matched with
1028 // changes to the HTMLComments plugin in Dokuwiki to allow the correct
1029 // thing to be displayed to the user.
1030 if (!$in_code)
1031 {
1032 $text = str_replace('&lt;!--', '%!--', $text);
1033 $text = str_replace('--&gt;', '--%', $text);
1034 }
1035
1036 if (!$entities_only)
1037 {
1038 // - replace linking constructs with dokuwiki ones
1039 // - external chapter section crossrefs are easily the worst of all...
1040 while (preg_match('/<CrossRef\s[^>]*external[^>]*\/>/', $text) && preg_match('/<CrossRef\s[^>]*target="Chapter"[^>]*\/>/', $text) && preg_match('/<CrossRef\s+(.*?)\/>/', $text, $matches))
1041 {
1042 $pattern = $matches[0];
1043 $attributes = $matches[1];
1044 $manual_name = '';
1045 if (preg_match('/external="([^"]+)"/', $attributes, $matches))
1046 {
1047 $manual_name = $matches[1];
1048 }
1049 $language = '';
1050 if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
1051 {
1052 $language = $matches[1];
1053 }
1054 $page_id = '';
1055 if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
1056 {
1057 $page_id = $matches[1];
1058 }
1059 if (empty($manual_name) || empty($language) || empty($page_id))
1060 {
1061 printError('Failed to parse external reference: ' . $pattern);
1062 }
1063 // - best we can do is a search within a restricted namespace
1064 $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
1065 $text = str_replace($pattern, $reference, $text);
1066 }
1067 // - chapter crossrefs are tricksie due to needing to know ordering numbers
1068 while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
1069 {
1070 $chapter_id = $matches[1];
1071 $page_name = $chapter_id;
1072 $text = preg_replace('/<CrossRef target="Chapter" ref="' . $chapter_id . '"\/>/', '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
1073 }
1074 // - internal figure and table references
1075 $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
1076 $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
1077 // - simple internal reference
1078 $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
1079 $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
1080 $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
1081 // - simple external url
1082 $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
1083 // - footnote references are also tricksie as we've had to extract the
1084 // footnotes earlier (during chapter counting)
1085 while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
1086 {
1087 $footnote_id = $matches[1];
1088 if (!isset($footnotes[$footnote_id]))
1089 {
1090 printError('Unknown footnote referenced: ' . $footnote_id);
1091 }
1092 $footnote = $footnotes[$footnote_id];
1093 $text = preg_replace('/<FootnoteRef id="' . $footnote_id . '"\/>/', '((' . $footnote . '))', $text);
1094 }
1095 // - detect and handle URLs surrounded by <i> tags very carefully (as doku
1096 // will less than helpfully turn them into an external link and screw up
1097 // everything that follows them on the page).
1098 // example: <i>www.microsoft.com</i>
1099 // example: <i>http://nzdl.org/cgi-bin/library</i>
1100 // example: <i>www.yourserver.com</i>
1101 // example: <i>http://www.yourserver.com</i>
1102 // example: <i>http://www.yourserver.com/greenstone</i>
1103 $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
1104 // - superspecial case for two-part URLs ending in .org (like nzdl.org)
1105 $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
1106 // - another superspecial case, this time for URLs on localhost
1107 $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
1108 // - p00p, underlines have the same issue around URLs.
1109 $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
1110 $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
1111 $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
1112
1113 // - replace HTML elements with the dokuwiki style equivilents
1114 $text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
1115 // - restore the double slashes in dokuwiki links
1116 while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
1117 {
1118 $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
1119 }
1120 $text = str_replace('<b>', '**', $text);
1121 $text = str_replace('</b>', '**', $text);
1122 //$text = str_replace(' <br/>', '\\\\ ', $text);
1123 //$text = str_replace(' <br/>', '\\\\ ', $text);
1124 //$text = str_replace('<br/>', '\\\\ ', $text);
1125 $text = str_replace('<i>', '//', $text);
1126 $text = str_replace('</i>', '//', $text);
1127 $text = str_replace('<u>', '__', $text);
1128 $text = str_replace('</u>', '__', $text);
1129 }
1130 // Decode entities
1131 // - user defined entities (in the manual metadata)
1132 foreach ($entity_replacements as $entity=>$code)
1133 {
1134 $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
1135 }
1136 // - standard entities
1137 $text = str_replace('&gt;','>', $text);
1138 $text = str_replace('&lt;','<', $text);
1139 $text = str_replace('&amp;','&', $text);
1140 return $text;
1141}
1142/** translateText() **/
1143
/**
 * Copy a manual image into the Dokuwiki media tree and build the wiki markup
 * that embeds it.
 *
 * The source is read from
 *   $xml_source_path/<language>/images/<filename>
 * and written (with a lowercased filename) to
 *   $dokuwiki_path/data/media/<language>/manuals/images/
 * where <language> is $_REQUEST['l'].
 *
 * @param $filename the image filename as it appears in the XML
 * @param $width the width to embed in the image markup
 * @param $height the height to embed in the image markup
 * @return the Dokuwiki image string, eg '{{..:images:foo.png?320x200&direct}}'
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  $wiki_filename = strtolower($filename);
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . $wiki_filename;
  // - check the result of copy() itself: testing only for the existence of
  //   the destination would miss a failed copy whenever a stale file from an
  //   earlier import is still in place
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  $image_txt = '{{..:images:' . $wiki_filename . '?' . $width . 'x' . $height . '&direct}}';
  return $image_txt;
}
1161
/**
 * Read the next line from an open stream and bump the global line counter.
 *
 * The counter is incremented on every call, whether or not a line could
 * actually be read.
 *
 * @param $in the stream handle handed to fgets()
 * @return whatever fgets() returned for the stream
 */
function getLine($in)
{
  global $line_counter;
  $next_line = fgets($in);
  ++$line_counter;
  return $next_line;
}
1168
/**
 * Strip the bold, italic and line break tags from a string.
 *
 * @param $text the string to clean
 * @return the string with every '<b>', '</b>', '<i>', '</i>' and '<br/>'
 *         removed; all other markup is left untouched
 */
function noFormatting($text)
{
  // removed one after another, in this order
  $formatting_tags = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($formatting_tags, '', $text);
}
1178
1179?>
Note: See TracBrowser for help on using the repository browser.