source: documentation/trunk/php/gs-manual-import.php@ 25052

Last change on this file since 25052 was 25052, checked in by jmt12, 10 years ago

Several changes to Metadata, Table and Figure parsing as issues were encountered in languages other than English. I'd argue most of these were bogus XML, but I better import the pages anyway.

File size: 41.7 KB
Line 
1<?php
2
3require_once('common.php');
4
5/** @file gs-manual-import.php
6 * This script transforms the single XML manual file required by the rest of
7 * the Greenstone manual generation scripts into the series of dokuwiki pages
8 * that make up a certain manual (as specified by the 'm' argument) in a
9 * certain language ('l').
10 */
11
// Parse any command line arguments; presumably parseCLIArguments() copies
// them into $_REQUEST so that CLI and web invocations share the code below
// - TODO confirm against common.php
if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - fall back to the English user manual when no (or an empty) language or
//   manual type was supplied
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  $_REQUEST['m'] = 'user';
  //$_REQUEST['m'] = 'install';
  //$_REQUEST['m'] = 'develop';
  //$_REQUEST['m'] = 'paper';
}

// - validate arguments before we use them (security). Both values are later
//   concatenated into file system paths, so only whitelisted values may pass.
//   The D modifier (PCRE_DOLLAR_ENDONLY) is required: without it '$' also
//   matches just before a trailing newline, so "user\n" would be accepted.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}
40
// Progress banner - this script reports what it is doing as HTML
echo '<h2>Importing Greenstone Manual XML</h2>', "\n";
echo '<p><b>XML Source Path:</b> ', $xml_source_path,
     '<br/><b>Manual:</b> ', $_REQUEST['m'],
     '<br/><b>Language:</b> ', $_REQUEST['l'],
     "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";

// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
//   needs to be created last, so we have to store its information
$manual_metadata = array();
$page_order = array();
$page_count = 2;
$looking_for_metadata = '';
// - entity names mapped to numeric character codes, plus footnote bodies
//   gathered by the preprocessing pass
$entity_replacements = array();
$footnotes = array();
// - where output goes: chapters are written straight to their own file,
//   everything else is accumulated as frontmatter text
$chapter_txt_out = false;
$chapter_id = '';
$in_chapter = false;
$frontmatter_text = '';
$sections_page_name = '';
$in_section = false;
// - list state
$bullet_depth = 0;
$is_numbered_list = true;
$in_numbered_item = 0;
$in_bullet_item = false;
$seen_code_in_item = false;
// - code listing state
$in_code = false;
$is_code_linenumbered = false;
// - miscellaneous parser state
$line_counter = 0;
$in_footnotes = false;
$in_indent = false;
// - the XML source lives at <source>/<lang>/<Manual>_<lang>.xml, built from
//   the (already validated) request arguments
$xml_file_path = sprintf('%s/%s/%s_%s.xml', $xml_source_path, $_REQUEST['l'], ucfirst($_REQUEST['m']), $_REQUEST['l']);
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
// - first pass: collect the footnotes, as they need to be moved onto the
//   pages that refer to them. Each footnote is expected to occupy three
//   lines: the opening element, a single Text element, the closing element.
while (($line = fgets($xml_in)) !== false)
{
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - the line after the text is the </Footnote>, which we discard
  fgets($xml_in);
}
fclose($xml_in);
// - second pass: start again from the top of the file for the real parse
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
// Main parsing pass. Each line of the XML is tested against a long chain of
// patterns and handled by the first branch that matches. Output goes either
// straight into the current chapter's page file ($chapter_txt_out) or, when
// we are not inside a chapter, onto the end of $frontmatter_text.
//
// Changes from the previous revision of this loop:
// - a <br/> inside a table cell now appends to the row rather than replacing
//   everything gathered for the row so far
// - the table reading loops now stop when the input runs out rather than
//   spinning forever on a truncated table
// - a failed backup of an existing chapter page is reported, matching the
//   handling of the top level and frontmatter pages
while (($line = getLine($xml_in)) !== false)
{
  // - Special Case: lingering code blocks, continue if next line also
  //   contains code, otherwise we need an extra newline
  if ($in_code)
  {
    $code_text = "\n";
    if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
    {
      $code_text .= '</code>';
      // - codeblocks that appear inside numbered lists do not get their own
      //   newlines as that would split the item. Instead newlines will be
      //   added when </NumberedItem> encountered.
      if (!$is_numbered_list)
      {
        $code_text .= "\n\n";
      }
      else
      {
        $seen_code_in_item = true;
      }
      $in_code = false;
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_text);
    }
    else
    {
      $frontmatter_text .= $code_text;
    }
  }
  // - some system metadata to watch for
  if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
  {
    $entity = $matches[1];
    addMetadata('ENTITY', $entity);
    // - remember numeric character entities (name => code point)
    if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
    {
      $entity_replacements[$matches[1]] = $matches[2];
    }
  }
  // - we have an explicit list of cover metadata to watch for
  elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = $matches[1];
  }
  elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = '';
  }
  // - found metadata we have!
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    $text_id = $matches[1];
    $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
    addMetadata($looking_for_metadata, $text);
  }
  // - bogus (empty) metadata found in French version is silently dropped
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
  {
  }
  // - any text we encounter outside of both sections and chapters also
  //   belongs on the cover
  elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    // (for now I'll assume id's are persistent)
    addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
  }
  // - we will probably encounter the opening section (which is outside of a
  //   chapter) first, so we have a special case for it
  elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
  {
    $section_id = $matches[1];
    // - if this is the first non-chapter section we have encountered then it
    //   gets the honor of having the page---that these sections will
    //   eventually be printed out on---named after it. Typically this should
    //   be "about_this_manual"
    if (empty($frontmatter_text))
    {
      $sections_page_name = $section_id;
    }
    $in_section = true;
    $title = getTitle($xml_in, 'section:' . $section_id);
    // - the frontmatter page goes at the head of the contents listing
    if (empty($frontmatter_text))
    {
      array_unshift($page_order, $sections_page_name . '|' . noComments($title));
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      // NOTE(review): $seen_ids is not initialised in this script; presumably
      // it is shared with generateID() - confirm against common.php
      $seen_ids[$section_id] = 1;
    }
    $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
    // - whew. Chapter's going to be just as bad though.
  }
  elseif ($in_section && preg_match('/<\/Section>/', $line))
  {
    $in_section = false;
  }
  elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
  {
    $chapter_id = $matches[1];
    echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
    $chapter_page_name = $chapter_id;
    // - create a new file to store this chapter
    $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
    if (!file_exists($chapter_file_dir))
    {
      mkAllDir($chapter_file_dir, 0755);
    }
    $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
    // - backup existing file, complaining if that fails (as is done for the
    //   top level and frontmatter pages) since the file is about to be
    //   overwritten
    if (file_exists($chapter_file_path))
    {
      $chapter_backup_file_path = $chapter_file_path . '.bak';
      if (!rename($chapter_file_path, $chapter_backup_file_path))
      {
        printError('Failed to rename existing chapter page for backup: ' . $chapter_page_name);
      }
    }
    // - open new file for writing
    $chapter_txt_out = fopen($chapter_file_path, 'w');
    if (!$chapter_txt_out)
    {
      printError('Failed to open page file for writing: ' . $chapter_page_name);
    }
    $in_chapter = true;
    $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
    fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
    array_push($page_order, $chapter_page_name . '|' . noComments($title));
  }
  elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
  {
    fclose($chapter_txt_out);
    $chapter_txt_out = false;
    $in_chapter = false;
    $page_count++;
  }
  // - section, subsection and part titles within chapter
  elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
  {
    $title_type = $matches[1];
    $section_id = $matches[2];
    if (empty($section_id))
    {
      $section_id = generateID(strtolower($title_type));
    }
    echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
    $header_fix = '';
    $title = getTitle($xml_in, 'heading: ' . $title_type);
    // - the heading level is expressed by the number of equals signs
    if ($title_type == 'Section')
    {
      $header_fix = '=====';
    }
    if ($title_type == 'Subsection')
    {
      $header_fix = '====';
    }
    if ($title_type == 'Part')
    {
      $header_fix = '===';
      // - remove b's and i's
      $title = preg_replace('/<\/?(B|I)>/i', '', $title);
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
  }
  elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false || strpos($line, '</Part') !== false))
  {
    // do nothing for now
  }
  // - figures (and their titles/captions)
  elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
  {
    $figure_id = $matches[1];
    $other_attributes = $matches[2];
    echo '[adding figure: ' . $figure_id . "] \n";
    // We need the title too
    $caption = getTitle($xml_in, 'figure:' . $figure_id);
    $caption = translateText(alternateComments($caption));
    $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
    // - we also check the other attributes to see if the XML has requested
    //   any following codeblock be linenumbered
    if (strpos($other_attributes, 'withLineNumber') !== false)
    {
      $is_code_linenumbered = true;
      $txt .= '%!-- withLineNumber --%';
    }
    $txt .= '></imgcaption>' . "\n";
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $txt);
    }
    else
    {
      $frontmatter_text .= $txt;
    }
  }
  elseif (strpos($line, '</Figure>') !== false)
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n\n");
    }
    else
    {
      $frontmatter_text .= "\n\n";
    }
    // - no longer required
    $is_code_linenumbered = false;
  }
  elseif (preg_match('/<Table([^>]*).*?.*?>/', $line, $matches))
  {
    $attributes = $matches[1];
    $table_txt = '';
    $table_id = '';
    if (preg_match('/id="([^"]+)"/', $attributes, $matches))
    {
      $table_id = $matches[1];
    }
    else
    {
      $table_id = generateID('table');
    }
    $hidden = false;
    if (strpos($attributes, 'class="hidden"') !== false)
    {
      $hidden = true;
    }
    echo '[adding table: ' . $table_id . "] \n";
    $table_caption = getTitle($xml_in, 'table: ' . $table_id);
    if ($hidden)
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
    }
    elseif (empty($table_caption))
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
    }
    else
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
    }
    // - in order to properly capture the table we're going to have to read in
    //   the whole thing here, and take note of column widths
    $have_output_widths = false;
    $column_widths = array();
    // - stop at the closing element, or when the input runs out (a truncated
    //   table must not leave us looping forever)
    while ($line !== false && strpos($line, '</Table>') === false)
    {
      // - find the start of a row, skipping over anything else
      while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
      {
        $line = getLine($xml_in);
      }
      if ($line !== false && strpos($line, '<tr>') !== false)
      {
        $row_txt = '|';
        $line = getLine($xml_in);
        // - now we read in multiple cells (line starting <th
        while (strpos($line, '<th') === 0)
        {
          // - column widths are taken from the cells of the first row only
          if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
          {
            $cell_width = $matches[1];
            if (!$have_output_widths)
            {
              array_push($column_widths, $cell_width);
            }
          }
          // Ignore empty cells
          // - adding another case for empty header cells (turned up in es
          //   version of "From Paper")
          if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
          {
            $row_txt .= ' |';
          }
          else
          {
            $line = getLine($xml_in);
            $first = true;
            while ($line !== false && strpos($line, '</th>') === false)
            {
              // - second and later elements within a cell are separated by
              //   a forced linebreak
              if (!$first)
              {
                $row_txt .= '\\\\';
              }
              // - we can have images or text in our tables
              if (preg_match('/<File.*url="images\/([^"]+)".*\/>/', $line, $matches))
              {
                $payload = $matches[0];
                $filename = $matches[1];
                $width = 0;
                if (preg_match('/width="(\d+)"/', $payload, $matches))
                {
                  $width = $matches[1];
                }
                $height = 0;
                if (preg_match('/height="(\d+)"/', $payload, $matches))
                {
                  $height = $matches[1];
                }
                $image_txt = handleImage($filename, $width, $height);
                $row_txt .= ' ' . $image_txt . ' ';
              }
              elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
              {
                $tid = $matches[1];
                $txt = $matches[2];
                // - multiple line text block: keep reading until we find
                //   the closing element or run out of lines
                $another_line = '';
                while (strpos($txt, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
                {
                  $txt .= $another_line;
                }
                $txt = str_replace('</Text>', '', $txt);
                $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
              }
              elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/', $line, $matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/<CodeLine>(.*)/', $line, $matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/(.*)<\/CodeLine>/', $line, $matches))
              {
                if (!empty($matches[1]))
                {
                  $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
                }
                else
                {
                  $row_txt .= ' ';
                }
              }
              // - (bogus) linebreaks just contribute some padding. This has
              //   to append: assigning here would throw away every cell
              //   gathered for this row so far, including the leading '|'
              elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
              {
                $row_txt .= ' ';
              }
              else
              {
                printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
              }
              $first = false;
              // - next line
              $line = getLine($xml_in);
            }
            // - close the cell
            $row_txt .= '|';
          }
          // next!
          $line = getLine($xml_in);
        }
        // - if we haven't already, output the width command
        if (!$have_output_widths)
        {
          $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
          $have_output_widths = true;
        }
        $table_txt .= $row_txt . "\n";
        // - throw away the closing </tr>
        $line = getLine($xml_in);
      }
    }
    $table_txt .= "\n";
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $table_txt);
    }
    else
    {
      $frontmatter_text .= $table_txt;
    }
  }
  // - copy and insert images
  elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
  {
    $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $image_txt);
    }
    else
    {
      $frontmatter_text .= $image_txt;
    }
  }
  // - bullet lists
  elseif (preg_match('/<BulletList>/', $line))
  {
    echo "[adding bulletlist] \n";
    // - a list nested inside an item starts on a line of its own
    if ($in_bullet_item || $in_numbered_item)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
    }
    $bullet_depth++;
    $is_numbered_list = false;
  }
  // - numbered lists
  elseif (preg_match('/<NumberedList>/', $line))
  {
    echo "[adding numbered list] \n";
    $bullet_depth++;
    $is_numbered_list = true;
    // - reset this flag that keeps track of whether an item (numbered or
    //   otherwise) is legitimately split by a code block
    $seen_code_in_item = false;
  }
  elseif (preg_match('/<\/BulletList>/', $line))
  {
    $bullet_depth--;
    // - leaving the outermost list: finish with a blank line
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
    ///cho "[finished bulletlist] ";
  }
  elseif (preg_match('/<\/NumberedList>/', $line))
  {
    $bullet_depth--;
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
    ///cho "[finished numbered list] ";
  }
  elseif (preg_match('/<NumberedItem>/', $line))
  {
    // - this doubles as a counter of the text blocks seen within the item
    $in_numbered_item = 1;
  }
  elseif (preg_match('/<\/NumberedItem>/', $line))
  {
    $in_numbered_item = 0;
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n");
    }
    else
    {
      $frontmatter_text .= "\n";
    }
  }
  elseif (preg_match('/<Bullet>/', $line))
  {
    $in_bullet_item = true;
  }
  elseif (preg_match('/<\/Bullet>/', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n");
    }
    else
    {
      $frontmatter_text .= "\n";
    }
    $in_bullet_item = false;
  }
  // TEXT HANDLING - this is the main case, but has disappeared into the mire
  // of other cases.
  elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for those text elements split over multiple lines. We
    //   keep concatenating lines until we find the closing text element or we
    //   run out of lines!
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - note that if we ran out of lines (eof) then we'll break out of this
    //   block anyway, it's just there won't be a </Text> at the end of this
    //   block... despite this being a major validation issue in the XML it
    //   shouldn't cause this script to fail
    // - now remove the </Text> from the end (hopefully) of str
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    // - and prepend the id while translating the str into Dokuwiki format
    $str = '<!-- id:' . $id . ' -->' . translateText($str);
    if ($bullet_depth > 0)
    {
      // NOTE(review): DokuWiki list items need at least two leading spaces;
      // confirm the whitespace inside the list marker literals below against
      // the repository copy of this file
      if ($is_numbered_list)
      {
        // - special case for those text elements legitimately split in two
        //   by code blocks. They get no bullet of either type and are
        //   'run-on' immediately to the end of the code element in order to
        //   prevent dokuwiki restarting numbering etc
        if ($seen_code_in_item)
        {
          // - leave str as it is
          // - reset flag just incase the item happens to contain another
          //   code block
          $seen_code_in_item = false;
        }
        elseif ($in_numbered_item == 1)
        {
          $str = '- ' . $str;
        }
        // - superspecial case for the poorly formatted numberlists that
        //   contain more than one text block per point. We'll nest them
        //   as a bullet list as that preserves order, formatting and (I
        //   hope) meaning.
        else
        {
          if ($in_chapter)
          {
            fwrite($chapter_txt_out, "\n");
          }
          else
          {
            $frontmatter_text .= "\n";
          }
          $str = ' * ' . $str;
        }
        $in_numbered_item++;
      }
      else
      {
        $str = '* ' . $str;
      }
      // - indent according to how deeply the list is nested
      for ($i = 0; $i < $bullet_depth; $i++)
      {
        $str = ' ' . $str;
      }
    }
    else
    {
      // Indented text is preceded by a >
      if ($in_indent)
      {
        $str = '> ' . $str . "\n";
      }
      else
      {
        $str .= "\n";
      }
    }
    if ($bullet_depth == 0)
    {
      $str .= "\n";
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $str);
    }
    else
    {
      $frontmatter_text .= $str;
    }
  }
  // - codified text blocks (type attribute before id)
  elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = '<code 1>';
      }
      else
      {
        $code_prefix = '<code>';
      }
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - codified text blocks (id attribute before type)
  elseif (preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = '<code 1>';
      }
      else
      {
        $code_prefix = '<code>';
      }
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
  {
    $code_txt = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = "<code 1>\n";
      }
      else
      {
        $code_prefix = "<code>\n";
      }
      $in_code = true;
    }
    // - arg. another special case for codelines that span more than one line
    //   (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
    //   for an element name, eh?)
    $another_line = '';
    while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $code_txt .= ' ' . $another_line;
    }
    $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
    $code_txt = $code_prefix . translateText($code_txt, true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - there are also sometimes empty codelines - which indicate a newline in
  //   the code listing
  elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
  {
    $code_txt = '';
    if (!$in_code)
    {
      $code_txt = "\n";
      if ($is_code_linenumbered)
      {
        $code_txt = "<code 1>\n" . $code_txt;
      }
      else
      {
        $code_txt = "<code>\n" . $code_txt;
      }
      $in_code = true;
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - reference to an external XML file, which is copied through untouched
  elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $line);
    }
    else
    {
      $frontmatter_text .= $line;
    }
  }
  elseif (strpos($line, '<FootnoteList>') !== false)
  {
    $in_footnotes = true;
  }
  elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
  {
    $in_footnotes = false;
  }
  // Indentation - the closest thing we have is quoting, so we'll use that
  elseif (strpos($line, '<Indented>') !== false)
  {
    $in_indent = true;
  }
  elseif (strpos($line, '</Indented>') !== false)
  {
    $in_indent = false;
  }
  // - pattern of lines to ignore
  elseif (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
  {
  }
  // - we ignore anything else in footnotes too, as they were handled in the
  //   preprocessing pass
  elseif ($in_footnotes)
  {
  }
  // - ignore empty lines
  elseif (preg_match('/^\s*$/', $line))
  {
  }
  // - meh. French versions have random, non-text element, linebreaks floating
  //   around. Guess I'll honor their formatting even though it's bogus
  elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, ' \\\\');
    }
    else
    {
      $frontmatter_text .= ' \\\\';
    }
  }
  // - danger Will Robinson!
  else
  {
    echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
  }
}
859
// 2. We should now have enough metadata to export the cover page
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if (!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file, checking that we actually got one
//   (as is done for the chapter pages) before trying to write to it
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open top level page for writing: ' . $_REQUEST['m']);
}
// - write the page (including the tables), starting with a heading built
//   from the first Heading and Title metadata plus the language code
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - *NEW* ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '**Administrator Commands:**' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
fwrite($txt_out, ' * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata, one table row per value in a fixed field order
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('Heading', 'Title', 'Author', 'Affiliation', 'Text', 'Comment', 'Version', 'Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
// - each entry is 'page_name|title', which slots straight into a wiki link
foreach ($page_order as $page_info)
{
  fwrite($txt_out, ' * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata, only shown to administrators
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('ENTITY', 'SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
// NOTE(review): $page_count starts at 2 and is also incremented per chapter;
// if that 2 already stands for this page and the frontmatter page then this
// increment counts the top page twice - confirm the intended total
$page_count++;
927
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter. The page is named after the first section found outside of
//    a chapter.
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - backup any existing file
if (file_exists($frontmatter_page_path))
{
  $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
  if (!rename($frontmatter_page_path, $frontmatter_page_backup_path))
  {
    printError('Failed to rename existing frontmatter page for backup');
  }
}
// - populate the new frontmatter file. file_put_contents() returns false on
//   failure (a byte count, possibly 0, otherwise), and we must not go on to
//   report success if the page was never written
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
{
  printError('Failed to write frontmatter page: ' . $sections_page_name);
}

echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
946
/**
 * Record another value for the named metadata field. Values accumulate, in
 * the order they are added, in the global $manual_metadata array (keyed by
 * field name). A progress message is echoed for every value added.
 *
 * @param $field the name of the metadata field
 * @param $value the value to append to that field's list of values
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - first value for this field? Then start a fresh list
  if (!isset($manual_metadata[$field]))
  {
    $manual_metadata[$field] = array();
  }
  $manual_metadata[$field][] = $value;
}
/** addMetadata() **/
962
/**
 * Retrieve the first value recorded for the named metadata field.
 *
 * @param $field the name of the metadata field
 * @return the first value stored in the global $manual_metadata array for
 *         the field, or the empty string if the field has no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  // - nothing recorded under this name at all
  if (!isset($manual_metadata[$field]))
  {
    return '';
  }
  $candidates = $manual_metadata[$field];
  return empty($candidates) ? '' : $candidates[0];
}
/** getFirstMetadata() **/
978
/**
 * Retrieve every value recorded for the named metadata field.
 *
 * @param $field the name of the metadata field
 * @return the array stored in the global $manual_metadata array for the
 *         field, or an empty array if nothing has been recorded for it
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/
990
/**
 * Read the text of a SubTitle element whose opening line has just been
 * consumed, up to and including its closing line. Helper for getTitle().
 *
 * @param $xml_in  handle to the XML file being read
 * @param $element description of the owning element, for error messages
 * @return the subtitle text, prefixed with its id comment and followed by a
 *         single space, or the empty string if no text element was found
 */
function _getTitleSubTitle($xml_in, $element)
{
  $subtitle = '';
  $subtitle_line = getLine($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $subtitle_line, $matches))
  {
    $subtitle = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ';
  }
  // - the very next line must close the subtitle
  $subtitle_line = getLine($xml_in);
  if (strpos($subtitle_line, '</SubTitle>') === false)
  {
    printError('Failed to find closing title for: ' . $element);
  }
  return $subtitle;
}
/** _getTitleSubTitle() **/

/**
 * Read in the next title element with nested text element and extract the
 * title.
 *
 * The title is expected to start on the very next line of the input. Some
 * language versions don't wrap titles in a Title element; in that case the
 * first line is itself the text element and is used as the title directly.
 * Each piece of text in the result is preceded by an html comment recording
 * its text id.
 *
 * @param $xml_in  handle to the XML file being read
 * @param $element description of the owning element, for error messages
 * @return the title string, or the empty string for an empty Title element
 */
function getTitle($xml_in, $element)
{
  $title = '';
  $in_title_element = false;
  // - the first thing in a chapter will be its title
  $title_line = getLine($xml_in);
  // - super special case: a table with an empty title
  if (strpos($title_line, '<Title/>') !== false)
  {
    return '';
  }
  if (strpos($title_line, '<Title>') !== false)
  {
    $in_title_element = true;
    $title_line = getLine($xml_in);
  }
  // - some horribly formed entries have the subtitle first within the title
  //   element
  if (strpos($title_line, '<SubTitle>') !== false)
  {
    $title = _getTitleSubTitle($xml_in, $element) . $title;
    $title_line = getLine($xml_in);
  }
  // - grab the chapter title now so we can store it in the page ordering
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $title_line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for text blocks that span multiple lines (as discovered
    //   in the russian "From Paper"): keep appending lines until we find the
    //   closing text element or run out of input
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - now remove </Text>
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    $title = '<!-- id:' . $id . ' -->' . $str . $title;
  }
  // - special case for (stoopid) empty titles that use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $title_line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }
  // - watch for subtitle elements, then make sure the title is closed
  if ($in_title_element)
  {
    $title_line = getLine($xml_in);
    if (strpos($title_line, '<SubTitle>') !== false)
    {
      $title = _getTitleSubTitle($xml_in, $element) . $title;
      $title_line = getLine($xml_in);
    }
    if (strpos($title_line, '</Title>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
  }
  return $title;
}
/** getTitle() **/
1085
/**
 * Rewrite HTML comment delimiters into their '%' placeholder form
 * ('<!--' becomes '%!--', '-->' becomes '--%') and collapse any CrossRef tag
 * down to the value of its ref attribute.
 *
 * @param string $text text to rewrite
 * @return string the rewritten text
 */
function alternateComments($text)
{
  // the two delimiter swaps are applied in order, opening marker first
  $escaped = str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
  // remove any lurking crossrefs while we are at it
  return preg_replace('/<CrossRef.*?ref="([^"]+)".*?>/', '\\1', $escaped);
}
1094
/**
 * Turn <i>...</i> spans into //...// and then drop HTML comments from the
 * text.
 *
 * @param string $text text to clean
 * @return string the text with italics converted and comments removed
 */
function noComments($text)
{
  // patterns are applied one after the other: italics first, comments second
  $patterns = array('/<i>(.*?)<\/i>/', '/<!--[^>]+-->/');
  $replacements = array('//\1//', '');
  return preg_replace($patterns, $replacements, $text);
}
1100
/**
 * Translate a code line found inside a table: entities are decoded via
 * translateText() in entities-only mode, and italic tags end up as the HTML
 * comments <!--i--> and <!--/i--> so they are remembered but hidden.
 *
 * @param string $text the raw code line
 * @return string the translated line
 */
function translateTableCodeline($text)
{
  // Swap the italic tags for placeholders so the translation can't touch them
  $shielded = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // Translate the text, just decoding the entities
  $translated = translateText($shielded, true);
  // Turn the placeholders into HTML comments
  return str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $translated);
}
1116
/**
 * Convert a fragment of manual XML text into dokuwiki markup.
 *
 * Steps, in order:
 *  1. unless the global $in_code flag is set, entity-encoded HTML comment
 *     delimiters (&lt;!-- and --&gt;) are swapped for the placeholders
 *     %!-- and --% so the later transforms leave them alone;
 *  2. unless $entities_only is true, CrossRef / Link / FootnoteRef elements
 *     and the <b>, <i>, <u> tags are rewritten into dokuwiki syntax;
 *  3. entities are decoded: first those listed in the global
 *     $entity_replacements (name => numeric code), then &gt; &lt; &amp;.
 *
 * Footnote bodies are looked up in the global $footnotes array by id.
 * Problems (unparseable external reference, unknown footnote) are reported
 * through printError().
 *
 * @param string $text          the text to translate
 * @param bool   $entities_only when true, skip step 2 and only decode entities
 * @return string the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text
  //   (i.e. ones written with encoded entities), otherwise they would be
  //   mangled by the following transforms.
  if (!$in_code)
  {
    $text = str_replace('&lt;!--', '%!--', $text);
    $text = str_replace('--&gt;', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter crossrefs: look at every CrossRef tag and only
    //   rewrite the ones that are themselves both external and chapter
    //   targets, so that an unrelated CrossRef earlier in the text is never
    //   picked up by mistake
    if (preg_match_all('/<CrossRef\s+([^>]*?)\/>/', $text, $crossrefs, PREG_SET_ORDER))
    {
      foreach ($crossrefs as $crossref)
      {
        $pattern = $crossref[0];
        $attributes = $crossref[1];
        if (strpos($attributes, 'external') === false || strpos($attributes, 'target="Chapter"') === false)
        {
          continue;
        }
        $manual_name = '';
        if (preg_match('/external="([^"]+)"/', $attributes, $matches))
        {
          $manual_name = $matches[1];
        }
        $language = '';
        if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
        {
          $language = $matches[1];
        }
        $page_id = '';
        if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
        {
          $page_id = $matches[1];
        }
        if (empty($manual_name) || empty($language) || empty($page_id))
        {
          printError('Failed to parse external reference: ' . $pattern);
        }
        // - best we can do is a search within a restricted namespace
        $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
        $text = str_replace($pattern, $reference, $text);
      }
    }
    // - chapter crossrefs. The matched tag is replaced as a literal string:
    //   building a second regex from the id would break (or never match, and
    //   so loop forever) for ids containing regex metacharacters.
    while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
    {
      $chapter_id = $matches[1];
      $page_name = $chapter_id;
      $text = str_replace($matches[0], '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
    }
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references: the footnote bodies come from the global
    //   $footnotes array. The body is inserted with str_replace so that any
    //   '$' or '\' followed by digits in the footnote text is kept verbatim
    //   rather than being treated as a regex backreference.
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      if (!isset($footnotes[$footnote_id]))
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $footnote = $footnotes[$footnote_id];
      $text = str_replace($matches[0], '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> tags very carefully (as doku
    //   will turn them into an external link and upset everything that
    //   follows them on the page): the italic markers get padded with spaces.
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - special case for two-part URLs ending in .org (like nzdl.org)
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - another special case, this time for URLs on localhost
    $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i','// \1 //', $text);
    // - underlines have the same issue around URLs.
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i','__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i','__ \1 __', $text);

    // - wrap the double slash of a protocol prefix in %% markers
    $text = preg_replace('/(file|ftp|http):\/\//','\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links: strip one %% marker
    //   per pass until no [[...]] link contains any
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    // - replace HTML elements with the dokuwiki style equivalents
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata)
  foreach ($entity_replacements as $entity=>$code)
  {
    $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
  }
  // - standard entities; &amp; goes last so that a double-encoded entity
  //   such as &amp;lt; is only decoded one level
  $text = str_replace('&gt;','>', $text);
  $text = str_replace('&lt;','<', $text);
  $text = str_replace('&amp;','&', $text);
  return $text;
}
1240/** translateText() **/
1241
/**
 * Copy an image from the XML source tree into the dokuwiki media directory
 * for the current language ($_REQUEST['l']) and return the dokuwiki markup
 * that embeds it.
 *
 * The destination file name is the lower-cased $filename. The destination
 * directory is created through mkAllDir() when it does not exist yet. A
 * failed copy is reported through printError().
 *
 * @param string $filename image file name, relative to the language's
 *                         images directory in the XML source tree
 * @param mixed  $width    width placed in the generated markup
 * @param mixed  $height   height placed in the generated markup
 * @return string dokuwiki image markup of the form
 *                {{..:images:NAME?WIDTHxHEIGHT&direct}}
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  $stored_name = strtolower($filename);
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
  if (!file_exists($destination_dir))
  {
    mkAllDir($destination_dir, 0755);
  }
  $destination_path = $destination_dir . $stored_name;
  // - check the result of copy() itself: a file left behind by an earlier
  //   run would otherwise make a failed copy look like a success
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  return '{{..:images:' . $stored_name . '?' . $width . 'x' . $height . '&direct}}';
}
1264
/**
 * Read the next line from a stream, bumping the global $line_counter on
 * every call (including calls that hit the end of the stream).
 *
 * @param resource $in stream to read from
 * @return string|false whatever fgets() returns for the stream
 */
function getLine($in)
{
  global $line_counter;
  $line_counter++;
  $line = fgets($in);
  return $line;
}
1271
/**
 * Strip the bold, italic and line-break tags from a piece of text.
 *
 * @param string $text text that may contain <b>, </b>, <i>, </i> or <br/>
 * @return string the text with those tags removed
 */
function noFormatting($text)
{
  // tags are removed one after the other, in this order
  $tags = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($tags, '', $text);
}
1281
1282?>
Note: See TracBrowser for help on using the repository browser.