Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: documentation/trunk/php/gs-manual-import.php@ 25026

Last change on this file since 25026 was 25026, checked in by jmt12, 12 years ago
PHP scripts for importing and exporting the XML manuals into and out of Dokuwiki
File size: 38.4 KB

Line
1	<?php
2
3	require_once('common.php');
4
5	/** @file gs-manual-import.php
6	* This script transforms the single XML manual file required by the rest of
7	* the Greenstone manual generation scripts into the series of dokuwiki pages
8	* that make up a certain manual (as specified by the 'm' argument) in a
9	* certain language (as specified by the 'l' argument).
10	*/
11
// Give up (with a usage hint) when the command line arguments cannot be parsed.
// NOTE(review): the rest of this script assumes printError() terminates the
// request - confirm against common.php.
if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - default to the English user manual when the language and/or the manual
//   type is missing or empty (other manual types: install, develop, paper)
if (empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (empty($_REQUEST['m']))
{
  $_REQUEST['m'] = 'user';
}

// - validate arguments before we use them (security). Both values end up in
//   file system paths, so they are checked against a strict whitelist rather
//   than a regular expression (an anchored pattern ending in '$' would also
//   accept a trailing newline)
$known_manuals = array('develop', 'install', 'paper', 'user');
$known_languages = array('ar', 'en', 'es', 'fr', 'pt-br', 'ru');

if (!is_string($_REQUEST['m']) || !in_array($_REQUEST['m'], $known_manuals, true))
{
  printError('Unknown manual type requested: ' . htmlspecialchars(is_string($_REQUEST['m']) ? $_REQUEST['m'] : gettype($_REQUEST['m'])));
}

if (!is_string($_REQUEST['l']) || !in_array($_REQUEST['l'], $known_languages, true))
{
  printError('Unknown language requested: ' . htmlspecialchars(is_string($_REQUEST['l']) ? $_REQUEST['l'] : gettype($_REQUEST['l'])));
}

// - progress banner; everything interpolated into the HTML is escaped
echo '<h2>Importing Greenstone Manual XML</h2>' . "\n";
echo '<p><b>XML Source Path:</b> ' . htmlspecialchars($xml_source_path) . '<br/><b>Manual:</b> ' . htmlspecialchars($_REQUEST['m']) . '<br/><b>Language:</b> ' . htmlspecialchars($_REQUEST['l']) . "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";
// 1. The conversion is driven by one big, line oriented state machine.
// - the top level page (cover page plus chapter ordering) can only be written
//   after everything else has been seen, so its information is collected in
//   the structures below
$manual_metadata = array();
$entity_replacements = array();
$footnotes = array();
$page_order = array();
$page_count = 2;
$line_counter = 0;
$frontmatter_text = '';
$sections_page_name = '';
$looking_for_metadata = '';
// - where we currently are in the document
$in_section = false;
$in_chapter = false;
$chapter_id = '';
$chapter_txt_out = false;
$in_footnotes = false;
$in_indent = false;
// - list handling
$bullet_depth = 0;
$is_numbered_list = true;
$in_numbered_item = 0;
$in_bullet_item = false;
$seen_code_in_item = false;
// - code block handling
$in_code = false;
$is_code_linenumbered = false;

// - the XML file's location follows from the validated request arguments
$xml_file_path = $xml_source_path . '/' . $_REQUEST['l'] . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';

// - first pass: collect every footnote up front, as each one has to be moved
//   onto the page that refers to it
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
for ($line = fgets($xml_in); $line !== false; $line = fgets($xml_in))
{
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  // - the footnote text is expected on the line straight after the opening tag
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - throw away </Footnote>
  fgets($xml_in);
}
fclose($xml_in);

// - second pass: reopen the file so the main loop starts from the top
$xml_in = fopen($xml_file_path, 'r');
if (!$xml_in)
{
  printError('Failed to locate top level page for manual');
}
/**
 * Append converted text to the page currently being built: the open chapter
 * file while inside a chapter, otherwise the in-memory frontmatter text.
 *
 * @param string $text Dokuwiki markup to append
 */
function writeManualText($text)
{
  global $in_chapter, $chapter_txt_out, $frontmatter_text;
  if ($in_chapter)
  {
    fwrite($chapter_txt_out, $text);
  }
  else
  {
    $frontmatter_text .= $text;
  }
}

// Main pass: read the XML one line at a time and translate each recognised
// construct into Dokuwiki markup. The order of the branches matters, as
// several patterns overlap (e.g. <Text> is both metadata and body text).
while (($line = getLine($xml_in)) !== false)
{
  // - Special Case: lingering code blocks, continue if next line also
  //   contains code, otherwise we need an extra newline
  if ($in_code)
  {
    $code_text = "\n";
    if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
    {
      $code_text .= '</code>';
      // - codeblocks that appear inside numbered lists do not get their own
      //   newlines as that would split the item. Instead newlines will be
      //   added when </NumberedItem> encountered.
      if (!$is_numbered_list)
      {
        $code_text .= "\n\n";
      }
      else
      {
        $seen_code_in_item = true;
      }
      $in_code = false;
    }
    writeManualText($code_text);
  }

  // - some system metadata to watch for
  if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
  {
    $entity = $matches[1];
    addMetadata('ENTITY', $entity);
    if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
    {
      $entity_replacements[$matches[1]] = $matches[2];
    }
  }
  // - we have an explicit list of cover metadata to watch for
  elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = $matches[1];
  }
  elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = '';
  }
  // - found metadata we have!
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    $text_id = $matches[1];
    $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
    addMetadata($looking_for_metadata, $text);
  }
  // - any text we encounter outside of both sections and chapters also
  //   belongs on the cover
  elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    // (for now I'll assume id's are persistent)
    addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
  }
  // - we will probably encounter the opening section (which is outside of a
  //   chapter) first, so we have a special case for it
  elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
  {
    $section_id = $matches[1];
    // - the first non-chapter section gets the honor of having the page---
    //   that these sections will eventually be printed out on---named after
    //   it. Typically this should be "about_this_manual". We test the page
    //   name itself (not the frontmatter text) so that cover content written
    //   before the first section cannot leave the page nameless.
    $is_first_section = ($sections_page_name === '');
    if ($is_first_section)
    {
      $sections_page_name = $section_id;
    }
    $in_section = true;
    $title = getTitle($xml_in, 'section:' . $section_id);
    if ($is_first_section)
    {
      array_unshift($page_order, $sections_page_name . '|' . noComments($title));
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
  }
  elseif ($in_section && preg_match('/<\/Section>/', $line))
  {
    $in_section = false;
  }
  // - each chapter becomes a wiki page of its own
  elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
  {
    $chapter_id = $matches[1];
    echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
    $chapter_page_name = $chapter_id;
    // - create a new file to store this chapter
    $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
    if (!file_exists($chapter_file_dir))
    {
      mkAllDir($chapter_file_dir, 0755);
    }
    $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
    // - backup existing file (and complain if that fails, exactly as is done
    //   for the top level and frontmatter pages)
    if (file_exists($chapter_file_path))
    {
      $chapter_backup_file_path = $chapter_file_path . '.bak';
      if (!rename($chapter_file_path, $chapter_backup_file_path))
      {
        printError('Failed to rename existing chapter page for backup: ' . $chapter_page_name);
      }
    }
    // - open new file for writing
    $chapter_txt_out = fopen($chapter_file_path, 'w');
    if (!$chapter_txt_out)
    {
      printError('Failed to open page file for writing: ' . $chapter_page_name);
    }
    $in_chapter = true;
    $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
    fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
    array_push($page_order, $chapter_page_name . '|' . noComments($title));
  }
  elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
  {
    fclose($chapter_txt_out);
    $chapter_txt_out = false;
    $in_chapter = false;
    $page_count++;
  }
  // - section, subsection and part titles within chapter
  elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]+)">/', $line, $matches))
  {
    $title_type = $matches[1];
    $section_id = $matches[2];
    $title = getTitle($xml_in, 'heading: ' . $title_type);
    // - the deeper the element, the smaller the Dokuwiki heading
    $header_markup = array('Section' => '=====', 'Subsection' => '====', 'Part' => '===');
    $header_fix = $header_markup[$title_type];
    if ($title_type == 'Part')
    {
      // - remove b's and i's
      $title = preg_replace('/<\/?(B|I)>/i', '', $title);
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
  }
  elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false || strpos($line, '</Part') !== false))
  {
    // do nothing for now
  }
  // - figures (and their titles/captions)
  elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
  {
    $figure_id = $matches[1];
    $other_attributes = $matches[2];
    echo '[adding figure: ' . $figure_id . "] \n";
    // We need the title too
    $caption = getTitle($xml_in, 'figure:' . $figure_id);
    $caption = translateText(alternateComments($caption));
    $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
    // - we also check the other attributes to see if the XML has requested
    //   any following codeblock be linenumbered
    if (strpos($other_attributes, 'withLineNumber') !== false)
    {
      $is_code_linenumbered = true;
      $txt .= '%!-- withLineNumber --%';
    }
    $txt .= '></imgcaption>' . "\n";
    writeManualText($txt);
  }
  elseif (strpos($line, '</Figure>') !== false)
  {
    writeManualText("\n\n");
    // - no longer required
    $is_code_linenumbered = false;
  }
  // - tables. The whole attribute list is captured so that the id and the
  //   class="hidden" marker can actually be found in it.
  elseif (preg_match('/<Table\b([^>]*)>/', $line, $matches))
  {
    $attributes = $matches[1];
    $table_txt = '';
    $table_id = '';
    if (preg_match('/id="([^"]+)"/', $attributes, $matches))
    {
      $table_id = $matches[1];
    }
    else
    {
      $table_id = generateID('table');
    }
    $hidden = (strpos($attributes, 'class="hidden"') !== false);
    echo '[adding table: ' . $table_id . "] \n";
    $table_caption = getTitle($xml_in, 'table: ' . $table_id);
    if ($hidden)
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
    }
    elseif (empty($table_caption))
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
    }
    else
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
    }
    // - in order to properly capture the table we're going to have to read in
    //   the whole thing here, and take note of column widths. Every loop also
    //   stops at end of file (getLine() returning false) so that a truncated
    //   table cannot make the import spin forever.
    $have_output_widths = false;
    $column_widths = array();
    while ($line !== false && strpos($line, '</Table>') === false)
    {
      // - find the start of a row
      while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
      {
        $line = getLine($xml_in);
      }
      if ($line !== false && strpos($line, '<tr>') !== false)
      {
        $row_txt = '|';
        $line = getLine($xml_in);
        // - now we read in multiple cells
        while ($line !== false && preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
        {
          $cell_width = $matches[1];
          if (!$have_output_widths)
          {
            array_push($column_widths, $cell_width);
          }
          // Ignore empty cells
          // - adding another case for empty header cells (turned up in es
          //   version of "From Paper")
          if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
          {
            $row_txt .= ' |';
          }
          else
          {
            $line = getLine($xml_in);
            $first = true;
            while ($line !== false && strpos($line, '</th>') === false)
            {
              // - several entries in one cell are separated by a forced
              //   linebreak (two backslashes)
              if (!$first)
              {
                $row_txt .= '\\\\';
              }
              // - we can have images or text in our tables
              if (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
              {
                $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
                $row_txt .= ' ' . $image_txt . ' ';
              }
              elseif (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
              {
                $row_txt .= ' <!-- id:' . $matches[1] . ' -->' . translateText($matches[2]) . ' ';
              }
              elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/', $line, $matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              else
              {
                printError('Warning! Unrecognized element in table: ' . $line);
              }
              $first = false;
              // - next line
              $line = getLine($xml_in);
            }
            // - close the cell
            $row_txt .= '|';
          }
          // next!
          $line = getLine($xml_in);
        }
        // - if we haven't already, output the width command
        if (!$have_output_widths)
        {
          $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
          $have_output_widths = true;
        }
        $table_txt .= $row_txt . "\n";
        // - throw away the closing </tr>
        if ($line !== false)
        {
          $line = getLine($xml_in);
        }
      }
    }
    if ($line === false)
    {
      printError('Unexpected end of file inside table: ' . $table_id);
    }
    $table_txt .= "\n";
    writeManualText($table_txt);
  }
  // - copy and insert images
  elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
  {
    $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
    writeManualText($image_txt);
  }
  // - bullet lists
  elseif (preg_match('/<BulletList>/', $line))
  {
    echo "[adding bulletlist] \n";
    // - a list nested inside an item starts on a fresh line
    if ($in_bullet_item || $in_numbered_item)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
    }
    $bullet_depth++;
    $is_numbered_list = false;
  }
  // - numbered lists
  elseif (preg_match('/<NumberedList>/', $line))
  {
    echo "[adding numbered list] \n";
    $bullet_depth++;
    $is_numbered_list = true;
    // - reset this flag that keeps track of whether an item (numbered or
    //   otherwise) is legitimately split by a code block
    $seen_code_in_item = false;
  }
  // - closing either kind of list: once the outermost list ends, finish it
  //   off with a blank line
  elseif (preg_match('/<\/BulletList>/', $line) || preg_match('/<\/NumberedList>/', $line))
  {
    $bullet_depth--;
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
  }
  elseif (preg_match('/<NumberedItem>/', $line))
  {
    $in_numbered_item = 1;
  }
  elseif (preg_match('/<\/NumberedItem>/', $line))
  {
    $in_numbered_item = 0;
    writeManualText("\n");
  }
  elseif (preg_match('/<Bullet>/', $line))
  {
    $in_bullet_item = true;
  }
  elseif (preg_match('/<\/Bullet>/', $line))
  {
    writeManualText("\n");
    $in_bullet_item = false;
  }
  // TEXT HANDLING - this is the main case, but has disappeared into the mire
  // of other cases.
  elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for those text elements split over multiple lines. We
    //   keep concatenating lines until we find the closing text element or we
    //   run out of lines!
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - note that if we ran out of lines (eof) there won't be a </Text> at
    //   the end of this block... despite this being a major validation issue
    //   in the XML it shouldn't stop this script
    // - now remove the </Text> from the end (hopefully) of str
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    // - and prepend the id while translating the str into Dokuwiki format
    $str = '<!-- id:' . $id . ' -->' . translateText($str);
    if ($bullet_depth > 0)
    {
      // NOTE(review): Dokuwiki list items normally need two leading spaces;
      // the indent literals below are reproduced as found - confirm they were
      // not whitespace-collapsed on the way into this revision.
      if ($is_numbered_list)
      {
        // - special case for those text elements legitimately split in two by
        //   code blocks. They get no bullet of either type and are 'run-on'
        //   immediately to the end of the code element in order to prevent
        //   dokuwiki restarting numbering etc
        if ($seen_code_in_item)
        {
          // - leave str as it is
          // - reset flag just in case the item happens to contain another
          //   code block
          $seen_code_in_item = false;
        }
        else if ($in_numbered_item == 1)
        {
          $str = '- ' . $str;
        }
        // - superspecial case for the poorly formatted numberlists that
        //   contain more than one text block per point. We'll nest them
        //   as a bullet list as that preserves order, formatting and (I
        //   hope) meaning.
        else
        {
          writeManualText("\n");
          $str = ' * ' . $str;
        }
        $in_numbered_item++;
      }
      else
      {
        $str = '* ' . $str;
      }
      // - one level of indent per level of list nesting
      $str = str_repeat(' ', $bullet_depth) . $str;
    }
    else
    {
      // Indented text is preceded by a >
      if ($in_indent)
      {
        $str = '> ' . $str . "\n";
      }
      else
      {
        $str .= "\n";
      }
    }
    if ($bullet_depth == 0)
    {
      $str .= "\n";
    }
    writeManualText($str);
  }
  // - codified text blocks (the type and id attributes turn up in either
  //   order)
  elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches) || preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix. Note $in_code has to be
    //   set before translateText() is called, as that function reads it.
    $code_prefix = '';
    if (!$in_code)
    {
      $code_prefix = $is_code_linenumbered ? '<code 1>' : '<code>';
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $code_id . ' -->' . translateText($matches[2], true);
    writeManualText($code_txt);
  }
  elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
  {
    $code_txt = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      $code_prefix = $is_code_linenumbered ? "<code 1>\n" : "<code>\n";
      $in_code = true;
    }
    // - arg. another special case for codelines that span more than one line
    //   (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
    //   for an element name, eh?)
    $another_line = '';
    while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $code_txt .= ' ' . $another_line;
    }
    $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
    $code_txt = $code_prefix . translateText($code_txt, true);
    writeManualText($code_txt);
  }
  // - there are also sometimes empty codelines - which indicate a newline in
  //   the code listing
  elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
  {
    $code_txt = '';
    if (!$in_code)
    {
      $code_txt = ($is_code_linenumbered ? "<code 1>\n" : "<code>\n") . "\n";
      $in_code = true;
    }
    writeManualText($code_txt);
  }
  // - reference to an external XML file (copied through untouched)
  elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
  {
    writeManualText($line);
  }
  elseif (strpos($line, '<FootnoteList>') !== false)
  {
    $in_footnotes = true;
  }
  elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
  {
    $in_footnotes = false;
  }
  // Indentation - the closest thing we have is quoting, so we'll use that
  elseif (strpos($line, '<Indented>') !== false)
  {
    $in_indent = true;
  }
  elseif (strpos($line, '</Indented>') !== false)
  {
    $in_indent = false;
  }
  // - pattern of lines to ignore
  elseif (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
  {
  }
  // - we ignore anything else in footnotes too, as they were handled in the
  //   preprocessing pass
  elseif ($in_footnotes)
  {
  }
  // - danger Will Robinson!
  else
  {
    echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
  }
}
790
// 2. We should now have enough metadata to export the cover page
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if (!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file (refusing to carry on without one,
//   rather than feeding a failed handle to every fwrite() below)
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open top page for writing: ' . $top_page_path);
}
// - write the page (including the tables)
// NOTE(review): the stored metadata values start with an '<!-- id:... -->'
// comment, so ucfirst() presumably never touches the visible text here -
// confirm whether capitalisation was intended before changing the order.
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - NEW ability to request imports and exports from within the page
$manual_query = 'm=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'];
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, 'Administrator Commands:' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?' . $manual_query . " -->\n");
fwrite($txt_out, ' * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?' . $manual_query . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?' . $manual_query . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata: one table row per stored value, in a fixed field order
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('Heading', 'Title', 'Author', 'Affiliation', 'Text', 'Comment', 'Version', 'Date');
foreach ($fields as $field)
{
  foreach (getMetadata($field) as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "\n");

// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
foreach ($page_order as $page_info)
{
  fwrite($txt_out, ' * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}

// - system metadata (only shown to administrators)
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('ENTITY', 'SupplementaryText');
foreach ($fields as $field)
{
  foreach (getMetadata($field) as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
858
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter
// - a manual without any loose sections has neither a page name nor any
//   text; in that case there is nothing to write (previously an empty,
//   nameless '.txt' page was created)
if ($sections_page_name !== '' || $frontmatter_text !== '')
{
  $frontmatter_page_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
  // - the directory is normally created along with the first chapter, but a
  //   manual without chapters still needs it
  if (!file_exists($frontmatter_page_dir))
  {
    mkAllDir($frontmatter_page_dir, 0755);
  }
  $frontmatter_page_path = $frontmatter_page_dir . '/' . $sections_page_name . '.txt';
  // - backup any existing file
  if (file_exists($frontmatter_page_path))
  {
    $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
    if (!rename($frontmatter_page_path, $frontmatter_page_backup_path))
    {
      printError('Failed to rename existing frontmatter page for backup');
    }
  }
  // - populate the new frontmatter file
  if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
  {
    printError('Failed to write frontmatter page: ' . $sections_page_name);
  }
}

// - report the result and offer a way back to the manual's wiki page
echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
877
/**
 * Record another value for a metadata field of the manual being imported.
 *
 * Values accumulate per field, in the order they are added, in the global
 * $manual_metadata array. A progress note is echoed for every call.
 *
 * @param string $field the metadata field name (e.g. 'Title', 'ENTITY')
 * @param string $value the value to append to that field
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - first value for this field? then start its list
  if (!isset($manual_metadata[$field]))
  {
    $manual_metadata[$field] = array();
  }
  $manual_metadata[$field][] = $value;
}
/** addMetadata() **/
893
/**
 * Retrieve the first value recorded for a metadata field.
 *
 * @param string $field the metadata field name
 * @return mixed the first value stored in the global $manual_metadata for
 *         the field, or an empty string when the field is unknown or has no
 *         values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  if (!isset($manual_metadata[$field]))
  {
    return '';
  }
  $values = $manual_metadata[$field];
  return empty($values) ? '' : $values[0];
}
/** getFirstMetadata() **/
909
/**
 * Retrieve every value recorded for a metadata field.
 *
 * @param string $field the metadata field name
 * @return array the values stored in the global $manual_metadata for the
 *         field, or an empty array when the field is unknown
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/
921
/**
 * Read in the next title element with nested text element and extract the
 * title.
 *
 * The lines are consumed from the stream via getLine(); the expected layout
 * is <Title>, a <Text> (possibly spanning several lines, or self-closed), an
 * optional <SubTitle> block and finally </Title>. Problems are reported
 * through printError().
 *
 * @param resource $xml_in  handle of the XML file being imported
 * @param string   $element description of the owning element, only used in
 *                          error messages
 * @return string the title prefixed with an '<!-- id:... -->' comment (a
 *         subtitle, if present, is placed in front of it), or an empty
 *         string for a self-closed <Title/>
 */
function getTitle($xml_in, $element)
{
  // - the first thing in the owning element must be its title
  $current = getLine($xml_in);
  // - super special case: a table with an empty title
  if (strpos($current, '<Title/>') !== false)
  {
    return '';
  }
  if (strpos($current, '<Title>') === false)
  {
    printError('Failed to find opening title for: ' . $element);
  }

  // - next comes the text of the title itself
  $title = '';
  $current = getLine($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $current, $found))
  {
    $body = $found[2];
    // - special case for text blocks that span multiple lines (as discovered
    //   in the russian "From Paper"): keep appending lines until the closing
    //   tag turns up or the file ends
    while (strpos($body, '</Text>') === false)
    {
      $extra = getLine($xml_in);
      if ($extra === false)
      {
        break;
      }
      $body .= ' ' . $extra;
    }
    // - drop the closing tag and mark the title with its text id
    $title = '<!-- id:' . $found[1] . ' -->' . preg_replace('/<\/Text>\s*/', '', $body);
  }
  // - special case for empty titles that nevertheless use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $current, $found))
  {
    $title = '<!-- id:' . $found[1] . ' -->';
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }

  // - watch for subtitle elements
  $current = getLine($xml_in);
  if (strpos($current, '<SubTitle>') !== false)
  {
    $current = getLine($xml_in);
    if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $current, $found))
    {
      $title = '<!-- id:' . $found[1] . ' -->' . $found[2] . ' ' . $title;
    }
    $current = getLine($xml_in);
    if (strpos($current, '</SubTitle>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
    $current = getLine($xml_in);
  }

  // - whatever follows has to be the end of the title element
  if (strpos($current, '</Title>') === false)
  {
    printError('Failed to find closing title for: ' . $element);
  }
  return $title;
}
/** getTitle() **/
989
/**
 * Rewrite HTML comment delimiters into their '%' based alternates, turning
 * '<!--' into '%!--' and '-->' into '--%'.
 *
 * @param string $text text that may contain HTML comments
 * @return string the text with every comment delimiter replaced
 */
function alternateComments($text)
{
  // - the pairs are applied in order: opening markers first, then closing
  return str_replace(array('<!--', '-->'), array('%!--', '--%'), $text);
}
996
/**
 * Prepare text for use where HTML comments are not wanted: <i>...</i> pairs
 * become Dokuwiki italics (//...//) and HTML comments are removed.
 *
 * @param string $text text that may contain <i> tags and HTML comments
 * @return string the converted text
 */
function noComments($text)
{
  // - italics first, then strip the comments from the result
  $with_italics = preg_replace('/<i>(.*?)<\/i>/', '//\1//', $text);
  $without_comments = preg_replace('/<!--[^>]+-->/', '', $with_italics);
  return $without_comments;
}
1002
/**
 * Translate a code line found inside a table cell while remembering its
 * italic tags.
 *
 * The <i> and </i> tags are swapped for placeholders before the text is
 * handed to translateText() (entities only), and afterwards the placeholders
 * are turned into the HTML comments <!--i--> and <!--/i-->.
 *
 * @param string $text the content of a <CodeLine> element
 * @return string the translated text
 */
function translateTableCodeline($text)
{
  // Escape the current italics tags to prevent the translate destroying them
  $protected = str_replace(array('<i>', '</i>'), array('%!--i--%', '%/i%'), $text);
  // Translate the text, just decoding the entities
  $translated = translateText($protected, true);
  // Now turn the italic tags (escaped) into HTML comments so we remember them
  // but they are hidden in the text
  return str_replace(array('%!--i--%', '%/i%'), array('<!--i-->', '<!--/i-->'), $translated);
}
1018
/**
 * Translate a fragment of manual XML text into DokuWiki markup.
 *
 * Processing happens in three stages:
 *  1. unless the global $in_code flag is set, HTML comment delimiters are
 *     replaced by the %!-- and --% placeholders so the later transforms
 *     cannot damage them;
 *  2. unless $entities_only is true, CrossRef, Link and FootnoteRef elements
 *     and the <b>, <i> and <u> tags are replaced by DokuWiki constructs;
 *  3. entities are decoded: first the ones listed in the global
 *     $entity_replacements (entity name => numeric character code), then
 *     &gt;, &lt; and finally &amp;.
 *
 * Problems (an unparsable external reference, an unknown footnote id) are
 * reported through printError().
 *
 * @param string $text          the text to translate
 * @param bool   $entities_only when true, skip stage 2 entirely
 * @return string the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text,
  //   otherwise they would be mangled by the following transforms. This has
  //   to be matched with changes to the HTMLComments plugin in Dokuwiki to
  //   allow the correct thing to be displayed to the user.
  if (!$in_code)
  {
    $text = str_replace('<!--', '%!--', $text);
    $text = str_replace('-->', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter crossrefs first. The lookaheads make sure the
    //   element we pull apart is itself the external chapter reference and
    //   not merely the first CrossRef that happens to be in the text.
    $external_pattern = '/<CrossRef\s+((?=[^>]*external)(?=[^>]*target="Chapter")[^>]*?)\/>/';
    while (preg_match($external_pattern, $text, $element))
    {
      $pattern = $element[0];
      $attributes = $element[1];
      $manual_name = '';
      if (preg_match('/external="([^"]+)"/', $attributes, $matches))
      {
        $manual_name = $matches[1];
      }
      $language = '';
      if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
      {
        $language = $matches[1];
      }
      $page_id = '';
      if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
      {
        $page_id = $matches[1];
      }
      if (empty($manual_name) || empty($language) || empty($page_id))
      {
        printError('Failed to parse external reference: ' . $pattern);
      }
      // - best we can do is a search within a restricted namespace
      $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
      $text = str_replace($pattern, $reference, $text);
    }
    // - chapter crossrefs: the page name is currently just the chapter id.
    //   The matched element is replaced literally (no regex is rebuilt from
    //   the id), so ids containing regex metacharacters are safe.
    while (preg_match('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', $text, $matches))
    {
      $chapter_id = $matches[1];
      $page_name = $chapter_id;
      $text = str_replace($matches[0], '[[.:' . $page_name . '|' . $chapter_id . ']]', $text);
    }
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/', '<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/', '<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references: the footnote bodies are looked up in the global
    //   $footnotes array. A literal replace is used so that '$' or '\' in
    //   the footnote text is never read as a regex backreference.
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      if (!isset($footnotes[$footnote_id]))
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $footnote = isset($footnotes[$footnote_id]) ? $footnotes[$footnote_id] : '';
      $text = str_replace($matches[0], '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> tags very carefully (as doku
    //   will less than helpfully turn them into an external link and break
    //   everything that follows them on the page).
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/i>/i', '// \1 //', $text);
    // - special case for two-part URLs ending in .org (like nzdl.org)
    $text = preg_replace('/<i>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/i>/i', '// \1 //', $text);
    // - another special case, this time for URLs on localhost
    $text = preg_replace('/<i>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/i>/i', '// \1 //', $text);
    // - underlines have the same issue around URLs
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)<\/u>/i', '__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?)<\/u>/i', '__ \1 __', $text);
    $text = preg_replace('/<u>((?:http\:\/\/)?localhost(?:\/.*?)?)<\/u>/i', '__ \1 __', $text);

    // - replace HTML elements with the dokuwiki style equivalents
    // - wrap the double slash of a protocol in %% so it is not read as the
    //   start of italics
    $text = preg_replace('/(file|ftp|http):\/\//', '\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links (one %% is removed per
    //   link per pass, hence the loop)
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    // - <br/> is deliberately left alone here (its conversion was disabled)
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata)
  if (is_array($entity_replacements))
  {
    foreach ($entity_replacements as $entity=>$code)
    {
      $text = str_replace('&' . $entity . ';', html_entity_decode('&#' . $code . ';', ENT_NOQUOTES, 'UTF-8'), $text);
    }
  }
  // - standard entities; &amp; must come last so that an escaped entity such
  //   as &amp;lt; is decoded exactly once
  $text = str_replace('&gt;', '>', $text);
  $text = str_replace('&lt;', '<', $text);
  $text = str_replace('&amp;', '&', $text);
  return $text;
}
/** translateText() **/
1143
/**
 * Copy a manual image into the DokuWiki media tree and build its markup.
 *
 * The image is read from the language's images directory under the global
 * $xml_source_path and written, with a lowercased filename, to the matching
 * manuals/images directory under the global $dokuwiki_path. The language is
 * taken from $_REQUEST['l']. A failed copy is reported through printError().
 *
 * @param string $filename the image filename as named in the XML
 * @param mixed  $width    the display width placed in the media link
 * @param mixed  $height   the display height placed in the media link
 * @return string the DokuWiki media link for the copied image
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  $media_name = strtolower($filename);
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . $media_name;
  // - check the result of copy() itself: testing only for the destination
  //   file would let a stale copy from an earlier run hide a failure
  if (!copy($source_path, $destination_path) || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  return '{{..:images:' . $media_name . '?' . $width . 'x' . $height . '&direct}}';
}
1161
/**
 * Read the next line from an open stream, counting it as we go.
 *
 * The global $line_counter is incremented on every call, before the read is
 * attempted.
 *
 * @param resource $in the stream handle to read from
 * @return string|false whatever fgets() returns for the handle
 */
function getLine($in)
{
  global $line_counter;
  ++$line_counter;
  $line = fgets($in);
  return $line;
}
1168
/**
 * Remove the simple inline formatting tags from a piece of text.
 *
 * The bold and italic tags (opening and closing) and the <br/> tag are
 * deleted, in that order; everything else is left as it is.
 *
 * @param string $text the text to strip
 * @return string the text without those tags
 */
function noFormatting($text)
{
  $unwanted_tags = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  // - a single empty replacement is used for every tag in the list
  return str_replace($unwanted_tags, '', $text);
}
1178
1179	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats: