Context Navigation

source: documentation/trunk/php/gs-manual-import.php@ 25052

Last change on this file since 25052 was 25052, checked in by jmt12, 12 years ago
Several changes to Metadata, Table and Figure parsing as issues were encountered in languages other than English. I'd argue most of these were bogus XML, but I better import the pages anyway.
File size: 41.7 KB

Line
1	<?php
2
3	require_once('common.php');
4
/** @file gs-manual-import.php
 * This script transforms the single XML manual file required by the rest of
 * the Greenstone manual generation scripts into the series of dokuwiki pages
 * that make up a certain manual (as specified by the 'm' argument) in a
 * certain language ('l').
 *
 * Arguments (read from $_REQUEST after parseCLIArguments() has run):
 *   m - manual type: develop, install, paper or user (defaults to 'user')
 *   l - language: ar, en, es, fr, pt-br or ru (defaults to 'en')
 */

if (!parseCLIArguments())
{
  printError("Error! Failed to parse arguments...\nUsage: gs-manual-import.php -m [user|install|develop|paper]");
}

// 0. Initialization
// - fall back to the English user manual when nothing was requested
if (!isset($_REQUEST['l']) || empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (!isset($_REQUEST['m']) || empty($_REQUEST['m']))
{
  $_REQUEST['m'] = 'user';
  //$_REQUEST['m'] = 'install';
  //$_REQUEST['m'] = 'develop';
  //$_REQUEST['m'] = 'paper';
}

// - validate arguments before we use them (security). Both values are later
//   concatenated into file system paths and echoed into HTML, so only exact
//   whitelist matches are accepted. The D (dollar-end-only) modifier stops
//   '$' from also matching just before a trailing newline, which would
//   otherwise let a value such as "user\n" through.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}
40
// Progress banner (the script reports its progress as HTML)
echo '<h2>Importing Greenstone Manual XML</h2>' . "\n";
echo '<p><b>XML Source Path:</b> ' . $xml_source_path . '<br/><b>Manual:</b> ' . $_REQUEST['m'] . '<br/><b>Language:</b> ' . $_REQUEST['l'] . "</p>\n<hr/>\n";
echo "<p><b>Frontmatter: </b><br/>\n";

// 1. By-and-large we're going to process all of this in a big state machine
// - the top level page, containing cover page and chapter order information,
//   needs to be created last, so we have to store its information

// - collected data
$manual_metadata = array();
$entity_replacements = array();
$footnotes = array();
$page_order = array();
$page_count = 2;
$frontmatter_text = '';

// - where we are in the document
$looking_for_metadata = '';
$chapter_txt_out = false;
$in_section = false;
$sections_page_name = '';
$in_chapter = false;
$chapter_id = '';
$line_counter = 0;
$in_footnotes = false;
$in_indent = false;

// - list state
$bullet_depth = 0;
$is_numbered_list = true;
$in_numbered_item = 0;
$in_bullet_item = false;

// - code block state
$in_code = false;
$seen_code_in_item = false;
$is_code_linenumbered = false;

// - construct the path using the information we've been provided as arguments
$xml_file_path = sprintf('%s/%s/%s_%s.xml', $xml_source_path, $_REQUEST['l'], ucfirst($_REQUEST['m']), $_REQUEST['l']);

// - first pass: read in any footnotes, as we'll need to move them onto
//   their appropriate page. Each footnote is expected to occupy three lines:
//   the opening element, one Text element, and the closing element.
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
for ($line = fgets($xml_in); $line !== false; $line = fgets($xml_in))
{
  if (!preg_match('/<Footnote id="(\d+)">/', $line, $matches))
  {
    continue;
  }
  $footnote_id = $matches[1];
  $text_line = fgets($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $text_line, $matches))
  {
    $footnotes[$footnote_id] = '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]);
  }
  // - throw away </Footnote>
  fgets($xml_in);
}
fclose($xml_in);

// - second pass: reopen the same file so the main parse starts at the top
$xml_in = fopen($xml_file_path, 'r');
if ($xml_in === false)
{
  printError('Failed to locate top level page for manual');
}
// Main parsing pass: a line-driven state machine. Each line returned by
// getLine() is matched against the known XML elements in priority order.
// Output goes to the currently open chapter file while $in_chapter is set,
// and is otherwise accumulated in $frontmatter_text.
while (($line = getLine($xml_in)) !== false)
{
  // - Special Case: lingering code blocks, continue if next line also
  //   contains code, otherwise we need an extra newline
  if ($in_code)
  {
    $code_text = "\n";
    if (strpos($line, '<CodeLine') === false && strpos($line, 'type="code"') === false)
    {
      $code_text .= '</code>';
      // - codeblocks that appear inside numbered lists do not get their own
      //   newlines as that would split the item. Instead newlines will be
      //   added when </NumberedItem> encountered.
      if (!$is_numbered_list)
      {
        $code_text .= "\n\n";
      }
      else
      {
        $seen_code_in_item = true;
      }
      $in_code = false;
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_text);
    }
    else
    {
      $frontmatter_text .= $code_text;
    }
  }
  // - some system metadata to watch for
  if (preg_match('/<!ENTITY\s+([^>]+)>/', $line, $matches))
  {
    $entity = $matches[1];
    addMetadata('ENTITY',$entity);
    if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
    {
      $entity_replacements[$matches[1]] = $matches[2];
    }
  }
  // - we have an explicit list of cover metadata to watch for
  elseif (!$in_section && !$in_chapter && preg_match('/<(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = $matches[1];
  }
  elseif (!$in_section && !$in_chapter && preg_match('/<\/(Author|Affiliation|Comment|Date|Heading|SupplementaryText|Title|Version)>/', $line, $matches))
  {
    $looking_for_metadata = '';
  }
  // - found metadata we have!
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    $text_id = $matches[1];
    $text = '<!-- id:' . $text_id . ' -->' . translateText($matches[2]);
    addMetadata($looking_for_metadata, $text);
  }
  // - bogus (empty) metadata found in French version: deliberately ignored
  elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
  {
  }
  // - any text we encounter outside of both sections and chapters also
  //   belongs on the cover
  elseif (!$in_section && !$in_chapter && !$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    // (for now I'll assume id's are persistent)
    addMetadata('Text', '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2]));
  }
  // - we will probably encounter the opening section (which is outside of a
  //   chapter) first, so we have a special case for it
  elseif (!$in_chapter && preg_match('/<Section id="([^"]+)">/', $line, $matches))
  {
    $section_id = $matches[1];
    // - if this is the first non-chapter section we have encountered then it
    //   gets the honor of having the page---that these sections will
    //   eventually be printed out on---named after it. Typically this should
    //   be "about_this_manual"
    if (empty($frontmatter_text))
    {
      $sections_page_name = $section_id;
    }
    $in_section = true;
    $title = getTitle($xml_in, 'section:' . $section_id);
    if (empty($frontmatter_text))
    {
      array_unshift($page_order, $sections_page_name . '|' . noComments($title));
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    $frontmatter_text .= '===== ' . $title . ' =====' . "\n\n";
    // - whew. Chapter's going to be just as bad though.
  }
  elseif ($in_section && preg_match('/<\/Section>/', $line))
  {
    $in_section = false;
  }
  elseif (preg_match('/<Chapter id="([^"]+)">/', $line, $matches))
  {
    $chapter_id = $matches[1];
    echo "</p>\n<p><b>Import Chapter:</b>" . $chapter_id . '<br/>' . "\n";
    $chapter_page_name = $chapter_id;
    // - create a new file to store this chapter
    $chapter_file_dir = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'];
    if (!file_exists($chapter_file_dir))
    {
      mkAllDir($chapter_file_dir, 0755);
    }
    $chapter_file_path = $chapter_file_dir . '/' . $chapter_page_name . '.txt';
    // - backup existing file
    if (file_exists($chapter_file_path))
    {
      $chapter_backup_file_path = $chapter_file_path . '.bak';
      rename($chapter_file_path, $chapter_backup_file_path);
    }
    // - open new file for writing
    $chapter_txt_out = fopen($chapter_file_path, 'w');
    if (!$chapter_txt_out)
    {
      printError('Failed to open page file for writing: ' . $chapter_page_name);
    }
    $in_chapter = true;
    $title = getTitle($xml_in, 'chapter: ' . $chapter_id);
    fwrite($chapter_txt_out, '====== ' . $title . ' ======' . "\n\n");
    array_push($page_order, $chapter_page_name . '|' . noComments($title));
  }
  elseif ($in_chapter && preg_match('/<\/Chapter>/', $line))
  {
    fclose($chapter_txt_out);
    $chapter_txt_out = false;
    $in_chapter = false;
    $page_count++;
  }
  // - section, subsection and part titles within chapter
  elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
  {
    $title_type = $matches[1];
    $section_id = $matches[2];
    if (empty($section_id))
    {
      $section_id = generateID(strtolower($title_type));
    }
    echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
    $header_fix = '';
    $title = getTitle($xml_in, 'heading: ' . $title_type);
    if ($title_type == 'Section')
    {
      $header_fix = '=====';
    }
    if ($title_type == 'Subsection')
    {
      $header_fix = '====';
    }
    if ($title_type == 'Part')
    {
      $header_fix = '===';
      // - remove b's and i's
      $title = preg_replace('/<\/?(B|I)>/i', '', $title);
    }
    // - if the title, as is, wouldn't autogenerate the appropriate id, then
    //   we have to include the id explicitly (as another html comment block)
    if ($section_id != generateID($title))
    {
      $title = '<!-- sid:' . $section_id . ' -->' . $title;
      $seen_ids[$section_id] = 1;
    }
    fwrite($chapter_txt_out, $header_fix . ' ' . $title . ' ' . $header_fix . "\n\n");
  }
  elseif ($in_chapter && (strpos($line, '</Section') !== false || strpos($line, '</Subsection') !== false || strpos($line, '</Part') !== false))
  {
    // do nothing for now
  }
  // - figures (and their titles/captions)
  elseif (preg_match('/<Figure id="([^"]+)"(.*?)>/', $line, $matches))
  {
    $figure_id = $matches[1];
    $other_attributes = $matches[2];
    echo '[adding figure: ' . $figure_id . "] \n";
    // We need the title too
    $caption = getTitle($xml_in, 'figure:' . $figure_id);
    $caption = translateText(alternateComments($caption));
    $txt = "<imgcaption figure_" . $figure_id . '|' . $caption . ' ';
    // - we also check the other attributes to see if the XML has requested
    //   any following codeblock be linenumbered
    if (strpos($other_attributes, 'withLineNumber') !== false)
    {
      $is_code_linenumbered = true;
      $txt .= '%!-- withLineNumber --%';
    }
    $txt .= '></imgcaption>' . "\n";
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $txt);
    }
    else
    {
      $frontmatter_text .= $txt;
    }
  }
  elseif (strpos($line, '</Figure>') !== false)
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n\n");
    }
    else
    {
      $frontmatter_text .= "\n\n";
    }
    // - no longer required
    $is_code_linenumbered = false;
  }
  // - tables. The capture group must take the whole attribute list, as the
  //   id and the hidden class are looked up in it below.
  elseif (preg_match('/<Table\b([^>]*)>/', $line, $matches))
  {
    $attributes = $matches[1];
    $table_txt = '';
    $table_id = '';
    if (preg_match('/id="([^"]+)"/', $attributes, $matches))
    {
      $table_id = $matches[1];
    }
    else
    {
      $table_id = generateID('table');
    }
    $hidden = false;
    if (strpos($attributes, 'class="hidden"') !== false)
    {
      $hidden = true;
    }
    echo '[adding table: ' . $table_id . "] \n";
    $table_caption = getTitle($xml_in, 'table: ' . $table_id);
    if ($hidden)
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##HIDDEN##></tblcaption>' . "\n";
    }
    elseif (empty($table_caption))
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|##NOCAPTION##></tblcaption>' . "\n";
    }
    else
    {
      $table_txt .= '<tblcaption table_' . $table_id . '|' . noComments($table_caption) . '></tblcaption>' . "\n";
    }
    // - in order to properly capture the table we're going to have to read in
    //   the whole thing here, and take note of column widths
    $have_output_widths = false;
    $column_widths = array();
    // - every loop below also stops when getLine() reports end of file
    //   (false); otherwise an unterminated table would spin forever
    while ($line !== false && strpos($line, '</Table>') === false)
    {
      // - find the start of a row (skipping anything else, blank lines
      //   included)
      while ($line !== false && strpos($line, '<tr>') === false && strpos($line, '</Table>') === false)
      {
        $line = getLine($xml_in);
      }
      if ($line !== false && strpos($line, '<tr>') !== false)
      {
        $row_txt = '|';
        $line = getLine($xml_in);
        // - now we read in multiple cells (line starting <th
        while ($line !== false && strpos($line, '<th') === 0)
        {
          if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
          {
            $cell_width = $matches[1];
            if (!$have_output_widths)
            {
              array_push($column_widths, $cell_width);
            }
          }
          // Ignore empty cells
          // - adding another case for empty header cells (turned up in es
          //   version of "From Paper")
          if (preg_match('/<th width="\d+"\/>/', $line) || preg_match('/<th width="\d+">.*<\/th>/', $line))
          {
            $row_txt .= ' |';
          }
          else
          {
            $line = getLine($xml_in);
            $first = true;
            while ($line !== false && strpos($line, '</th>') === false)
            {
              if (!$first)
              {
                $row_txt .= '\\\\';
              }
              // - we can have images or text in our tables. Any attributes
              //   (width, height) may surround the url, so allow for them.
              if (preg_match('/<File\b[^>]*\burl="images\/([^"]+)"[^>]*\/>/', $line, $matches))
              {
                $payload = $matches[0];
                $filename = $matches[1];
                $width = 0;
                if (preg_match('/width="(\d+)"/', $payload, $matches))
                {
                  $width = $matches[1];
                }
                $height = 0;
                if (preg_match('/height="(\d+)"/', $payload, $matches))
                {
                  $height = $matches[1];
                }
                $image_txt = handleImage($filename, $width, $height);
                $row_txt .= ' ' . $image_txt . ' ';
              }
              elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
              {
                $tid = $matches[1];
                $txt = $matches[2];
                // - multiple line text block (give up at end of file)
                while (strpos($txt, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
                {
                  $txt .= $another_line;
                }
                $txt = str_replace('</Text>','',$txt);
                $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
              }
              elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/',$line,$matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/<CodeLine>(.*)/',$line,$matches))
              {
                $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
              }
              elseif (preg_match('/(.*)<\/CodeLine>/',$line,$matches))
              {
                if (!empty($matches[1]))
                {
                  $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]). '\'\' ';
                }
                else
                {
                  $row_txt .= ' ';
                }
              }
              // we'll add (bogus) linebreaks. This must append: assigning
              // here would throw away every cell already built for the row.
              elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
              {
                $row_txt .= ' ';
              }
              else
              {
                printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
              }
              $first = false;
              // - next line
              $line = getLine($xml_in);
            }
            // - close the cell
            $row_txt .= '|';
          }
          // next!
          $line = getLine($xml_in);
        }
        // - if we haven't already, output the width command
        if (!$have_output_widths)
        {
          $table_txt .= '|< - ' . implode(' ', $column_widths) . ' >|' . "\n";
          $have_output_widths = true;
        }
        $table_txt .= $row_txt . "\n";
        // - throw away the closing </tr>
        $line = getLine($xml_in);
      }
    }
    if ($line === false)
    {
      printError('Failed to find closing table element for: ' . $table_id);
    }
    $table_txt .= "\n";
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $table_txt);
    }
    else
    {
      $frontmatter_text .= $table_txt;
    }
  }
  // - copy and insert images
  elseif (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
  {
    $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $image_txt);
    }
    else
    {
      $frontmatter_text .= $image_txt;
    }
  }
  // - bullet lists
  elseif (preg_match('/<BulletList>/', $line))
  {
    echo "[adding bulletlist] \n";
    if ($in_bullet_item || $in_numbered_item)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
    }
    $bullet_depth++;
    $is_numbered_list = false;
  }
  // - numbered lists
  elseif (preg_match('/<NumberedList>/', $line))
  {
    echo "[adding numbered list] \n";
    $bullet_depth++;
    $is_numbered_list = true;
    // - reset this flag that keeps track of whether an item (numbered or
    //   otherwise) is legitimately split by a code block
    $seen_code_in_item = false;
  }
  elseif (preg_match('/<\/BulletList>/', $line))
  {
    $bullet_depth--;
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
    //echo "[finished bulletlist] ";
  }
  elseif (preg_match('/<\/NumberedList>/', $line))
  {
    $bullet_depth--;
    if ($bullet_depth == 0)
    {
      if ($in_chapter)
      {
        fwrite($chapter_txt_out, "\n");
      }
      elseif ($in_section)
      {
        $frontmatter_text .= "\n";
      }
      $is_numbered_list = false;
    }
    //echo "[finished numbered list] ";
  }
  elseif (preg_match('/<NumberedItem>/', $line))
  {
    // - counts the text blocks seen in the current item (1 = none yet)
    $in_numbered_item = 1;
  }
  elseif (preg_match('/<\/NumberedItem>/', $line))
  {
    $in_numbered_item = 0;
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n");
    }
    else
    {
      $frontmatter_text .= "\n";
    }
  }
  elseif (preg_match('/<Bullet>/', $line))
  {
    $in_bullet_item = true;
  }
  elseif (preg_match('/<\/Bullet>/', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, "\n");
    }
    else
    {
      $frontmatter_text .= "\n";
    }
    $in_bullet_item = false;
  }
  // TEXT HANDLING - this is the main case, but has disappeared into the mire
  // of other cases.
  elseif (!$in_footnotes && preg_match('/<Text id="([^"]+)">(.+?)$/', $line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for those text elements split over multiple lines. We
    //   keep concatenating lines until we find the closing text element or we
    //   run out of lines!
    $another_line = '';
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . $another_line;
    }
    // - note that if we ran out of lines (eof) then we'll break out of this
    //   block anyway, it's just there won't be a </Text> at the end of this
    //   block... despite this being a major validation issue in the XML it
    //   shouldn't cause this script to fail
    // - now remove the </Text> from the end (hopefully) of str
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    // - and prepend the id while translating the str into Dokuwiki format
    $str = '<!-- id:' . $id . ' -->' . translateText($str);
    if ($bullet_depth > 0)
    {
      if ($is_numbered_list)
      {
        // - special case for those text elements legitimately split in two by
        //   code blocks. They get no bullet of either type and are 'run-on'
        //   immediately to the end of the code element in order to prevent
        //   dokuwiki restarting numbering etc
        if ($seen_code_in_item)
        {
          // - leave str as it is
          // - reset flag just in case the item happens to contain another
          //   code block
          $seen_code_in_item = false;
        }
        else if ($in_numbered_item == 1)
        {
          $str = '- ' . $str;
        }
        // - superspecial case for the poorly formatted numberlists that
        //   contain more than one text block per point. We'll nest them
        //   as a bullet list as that preserves order, formatting and (I
        //   hope) meaning.
        else
        {
          if ($in_chapter)
          {
            fwrite($chapter_txt_out, "\n");
          }
          else
          {
            $frontmatter_text .= "\n";
          }
          $str = ' * ' . $str;
        }
        $in_numbered_item++;
      }
      else
      {
        $str = '* ' . $str;
      }
      // - indent the item once per nesting level
      for ($i = 0; $i < $bullet_depth; $i++)
      {
        $str = ' ' . $str;
      }
    }
    else
    {
      // Indented text is preceded by a >
      if ($in_indent)
      {
        $str = '> ' . $str . "\n";
      }
      else
      {
        $str .= "\n";
      }
    }
    if ($bullet_depth == 0)
    {
      $str .= "\n";
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $str);
    }
    else
    {
      $frontmatter_text .= $str;
    }
  }
  // - codified text blocks (type attribute before id)
  elseif (preg_match('/<Text\s+type="code"\s+id="([^"]+)"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = '<code 1>';
      }
      else
      {
        $code_prefix = '<code>';
      }
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - codified text blocks (id attribute before type)
  elseif (preg_match('/<Text\s+id="([^"]+)"\s+type="code"\s*>(.+?)<\/Text>/', $line, $matches))
  {
    $code_id = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = '<code 1>';
      }
      else
      {
        $code_prefix = '<code>';
      }
      $in_code = true;
    }
    $code_txt = $code_prefix . '<!-- id:' . $matches[1] . ' -->' . translateText($matches[2], true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  elseif (preg_match('/<CodeLine>(.*?)$/', $line, $matches))
  {
    $code_txt = $matches[1];
    // - determine the appropriate code block prefix
    $code_prefix = '';
    if (!$in_code)
    {
      if ($is_code_linenumbered)
      {
        $code_prefix = "<code 1>\n";
      }
      else
      {
        $code_prefix = "<code>\n";
      }
      $in_code = true;
    }
    // - arg. another special case for codelines that span more than one line
    //   (but I guess <CodeLineButSometimesMoreThanOneLine> is a bit cumbersome
    //   for an element name, eh?)
    $another_line = '';
    while (strpos($code_txt, '</CodeLine>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $code_txt .= ' ' . $another_line;
    }
    $code_txt = preg_replace('/<\/CodeLine>\s*/', '', $code_txt);
    $code_txt = $code_prefix . translateText($code_txt, true);
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - there are also sometimes empty codelines - which indicate a newline in
  //   the code listing
  elseif (preg_match('/<CodeLine\s*\/>/', $line, $matches))
  {
    $code_txt = '';
    if (!$in_code)
    {
      $code_txt = "\n";
      if ($is_code_linenumbered)
      {
        $code_txt = "<code 1>\n" . $code_txt;
      }
      else
      {
        $code_txt = "<code>\n" . $code_txt;
      }
      $in_code = true;
    }
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $code_txt);
    }
    else
    {
      $frontmatter_text .= $code_txt;
    }
  }
  // - reference to an external XML file (a bare entity): copied through as is
  elseif (preg_match('/^\s*&[a-z0-9_]+;\s+$/is', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, $line);
    }
    else
    {
      $frontmatter_text .= $line;
    }
  }
  elseif (strpos($line, '<FootnoteList>') !== false)
  {
    $in_footnotes = true;
  }
  elseif ($in_footnotes && strpos($line, '</FootnoteList>') !== false)
  {
    $in_footnotes = false;
  }
  // Indentation - the closest thing we have is quoting, so we'll use that
  elseif (strpos($line, '<Indented>') !== false)
  {
    $in_indent = true;
  }
  elseif (strpos($line, '</Indented>') !== false)
  {
    $in_indent = false;
  }
  // - pattern of lines to ignore
  else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
  {
  }
  // - we ignore anything else in footnotes too, as they were handled in the
  //   preprocessing pass
  else if ($in_footnotes)
  {
  }
  // - ignore empty lines
  else if (preg_match('/^\s*$/', $line))
  {
  }
  // - meh. French versions have random, non-text element, linebreaks floating
  //   around. Guess I'll honor their formatting even though it's bogus
  else if (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
  {
    if ($in_chapter)
    {
      fwrite($chapter_txt_out, ' \\\\');
    }
    else
    {
      $frontmatter_text .= ' \\\\';
    }
  }
  // - danger Will Robinson!
  else
  {
    // NOTE(review): $line_counter is never incremented in this part of the
    // script - presumably getLine() maintains it; confirm.
    echo '<div style="background-color:yellow;"><hr /><b>Warning!</b> Failed to parse line ' . $line_counter . ': |' . htmlspecialchars($line) . "|<hr /></div>\n";
  }
}
859
// 2. We should now have enough metadata to export the cover page. This top
//    level page carries the cover metadata, the ordered list of chapter
//    pages, and the system metadata (entities etc).
$top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
// - backup any existing file
if (file_exists($top_page_path))
{
  $top_page_backup_path = $top_page_path . '.bak';
  if(!rename($top_page_path, $top_page_backup_path))
  {
    printError('Failed to rename existing top page for backup');
  }
}
// - and create a handle to the new file, reporting a failure the same way
//   the chapter pages do rather than passing false on to fwrite()
$txt_out = fopen($top_page_path, 'w');
if (!$txt_out)
{
  printError('Failed to open page file for writing: ' . $_REQUEST['m']);
}
// - write the page (including the tables)
fwrite($txt_out, '====== ' . noComments(ucfirst(getFirstMetadata('Heading'))) . ': ' . noComments(ucfirst(getFirstMetadata('Title'))) . ' (' . strtoupper($_REQUEST['l']) . ') ======' . "\n");
fwrite($txt_out, "\n");

// - NEW ability to request imports and exports from within the page
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, 'Administrator Commands:' . "\n");
// On second thoughts we probably never want to do this casually, as it boguses
// all history/approval/edit information. Instead I'll leave this as a manual
// process.
fwrite($txt_out, '<!-- Import available at this link - but be warned all current wiki data for this manual will become bogus: http://~~baseurl~~/../../php/gs-manual-import.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . " -->\n");
fwrite($txt_out, ' * Export manual: [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&v=draft&a=download|draft version]] [[http://~~baseurl~~/../../php/gs-manual-export.php?m=' . $_REQUEST['m'] . '&l=' . $_REQUEST['l'] . '&a=download|approved version]]' . "\n");
fwrite($txt_out, "</ifauth>\n\n");

// - regular metadata, one table row per stored value
fwrite($txt_out, '<!-- Note: cover page information -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('Heading','Title','Author','Affiliation','Text','Comment','Version','Date');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "\n");
// - contents (which also provides order information for exporting)
fwrite($txt_out, '===== Contents =====' . "\n");
fwrite($txt_out, "\n");
fwrite($txt_out, '<!-- Note: The ordering of pages here is used when creating the HTML and PDF versions of the manual -->' . "\n");
foreach ($page_order as $page_info)
{
  fwrite($txt_out, ' * [[.:' . $_REQUEST['m'] . ':' . $page_info . ']]' . "\n");
}
// - system metadata (only shown to administrators)
fwrite($txt_out, "<ifauth @admin>\n\n");
fwrite($txt_out, '===== System Metadata =====' . "\n");
fwrite($txt_out, '<!-- Note: configuration options for the manual -->' . "\n");
fwrite($txt_out, '^ Metadata ^ Value ^' . "\n");
$fields = array('ENTITY','SupplementaryText');
foreach ($fields as $field)
{
  $values = getMetadata($field);
  foreach ($values as $value)
  {
    fwrite($txt_out, '^ ' . $field . ' | ' . $value . ' |' . "\n");
  }
}
fwrite($txt_out, "</ifauth>\n\n");
fwrite($txt_out, "\n");
// - done!
fclose($txt_out);
$page_count++;
927
// 3. And the 'sections' page, grouping together all the loose sections as
//    frontmatter
$frontmatter_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $sections_page_name . '.txt';
// - the manual's page directory is otherwise only created when a chapter is
//   imported, so make sure it exists before writing into it
$frontmatter_page_dir = dirname($frontmatter_page_path);
if (!file_exists($frontmatter_page_dir))
{
  mkAllDir($frontmatter_page_dir, 0755);
}
// - backup any existing file
if (file_exists($frontmatter_page_path))
{
  $frontmatter_page_backup_path = $frontmatter_page_path . '.bak';
  if(!rename($frontmatter_page_path, $frontmatter_page_backup_path))
  {
    printError('Failed to rename existing frontmatter page for backup');
  }
}
// - populate the new frontmatter file, and say so if that didn't work rather
//   than reporting a successful import
if (file_put_contents($frontmatter_page_path, $frontmatter_text) === false)
{
  printError('Failed to open page file for writing: ' . $sections_page_name);
}

echo "</p>\n<p><b>Complete!</b> Imported " . $page_count . " pages</p><hr/>\n";
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to wiki page</p>' . "\n";
exit(0);
946
/**
 * Record one more value for a metadata field.
 *
 * Values are kept per field, in the order they were added, in the global
 * $manual_metadata array. A progress note is echoed for every call.
 *
 * @param string $field the metadata field name (e.g. 'Title', 'ENTITY')
 * @param string $value the value to append to that field's list
 */
function addMetadata($field, $value)
{
  global $manual_metadata;
  echo '[adding metadata: ' . $field . "] \n";
  // - start a fresh list the first time a field is seen
  if (!isset($manual_metadata[$field]))
  {
    $manual_metadata[$field] = array();
  }
  $manual_metadata[$field][] = $value;
}
/** addMetadata() **/
962
/**
 * Fetch the first value stored for a metadata field.
 *
 * @param string $field the metadata field name
 * @return string the first stored value, or an empty string when the field
 *                is unknown or has no values
 */
function getFirstMetadata($field)
{
  global $manual_metadata;
  // - empty() covers both a field that was never set and an empty list
  if (empty($manual_metadata[$field]))
  {
    return '';
  }
  return $manual_metadata[$field][0];
}
/** getFirstMetadata() **/
978
/**
 * Fetch every value stored for a metadata field.
 *
 * @param string $field the metadata field name
 * @return array the stored values in insertion order; an empty array when
 *               the field has never been set
 */
function getMetadata($field)
{
  global $manual_metadata;
  return isset($manual_metadata[$field]) ? $manual_metadata[$field] : array();
}
/** getMetadata() **/
990
/**
 * Read one <SubTitle> body from the XML stream.
 *
 * Expects the opening <SubTitle> line to have been consumed already. Reads
 * the <Text> line and the closing </SubTitle> line via getLine().
 *
 * @param resource $xml_in  handle passed straight to getLine()
 * @param string   $element name used only in the error message
 * @return string '<!-- id:ID -->text ' (note the trailing space), or '' when
 *                the line after <SubTitle> is not a single-line <Text>
 */
function getTitleSubTitle($xml_in, $element)
{
  $subtitle = '';
  $line = getLine($xml_in);
  if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
  {
    $subtitle = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ';
  }
  $line = getLine($xml_in);
  if (strpos($line, '</SubTitle>') === false)
  {
    printError('Failed to find closing subtitle for: ' . $element);
  }
  return $subtitle;
}

/**
 * Read in the next title element with nested text element and extract the
 * title.
 *
 * Lines are consumed from $xml_in via getLine(). Every text fragment in the
 * result is prefixed with an '<!-- id:ID -->' comment carrying its Text id.
 * Handled shapes:
 *  - a bare <Text> line (no <Title> wrapper)
 *  - <Title> wrapping a <Text>, with an optional <SubTitle> before or after
 *  - an empty <Title/> (returns '')
 *  - an empty <Text id=".."/> (returns just the id comment)
 *  - a <Text> whose body spans several lines
 * A subtitle found before the main text ends up after it in the result; a
 * subtitle found after the main text ends up before it. That ordering is
 * inherited behaviour and is deliberately left unchanged.
 *
 * @param resource $xml_in  handle passed straight to getLine()
 * @param string   $element name used only in error messages
 * @return string the assembled title
 */
function getTitle($xml_in, $element)
{
  $title = '';
  $in_title_element = false;
  // - the first thing in a chapter will be its title
  $title_line = getLine($xml_in);
  // - some language versions don't wrap titles in a title element; nothing
  //   special is needed for them as a bare text line simply falls through to
  //   the <Text> matching below with $in_title_element still false
  // - super special case: a table with an empty title
  if (strpos($title_line, '<Title/>') !== false)
  {
    return '';
  }
  if (strpos($title_line, '<Title>') !== false)
  {
    $in_title_element = true;
    $title_line = getLine($xml_in);
  }
  // - some horribly formed entries have the subtitle first within the title
  //   element
  if (strpos($title_line, '<SubTitle>') !== false)
  {
    $title = getTitleSubTitle($xml_in, $element) . $title;
    $title_line = getLine($xml_in);
  }
  // - grab the chapter title now so we can store it in the page ordering
  if (preg_match('/<Text id="([^"]+)">(.*?)$/', $title_line, $matches))
  {
    $id = $matches[1];
    $str = $matches[2];
    // - special case for text blocks that span multiple lines (as discovered
    //   in the russian "From Paper"). Continuation lines are trimmed so the
    //   title doesn't pick up embedded newlines or source indentation. The
    //   loop also stops at end of file (getLine() returning false).
    while (strpos($str, '</Text>') === false && ($another_line = getLine($xml_in)) !== false)
    {
      $str .= ' ' . trim($another_line);
    }
    // - now remove </Text>
    $str = preg_replace('/<\/Text>\s*/', '', $str);
    $title = '<!-- id:' . $id . ' -->' . $str . $title;
  }
  // - special case for empty titles that still use up a text id
  elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $title_line, $matches))
  {
    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
  }
  else
  {
    printError('Failed to find title text for: ' . $element);
  }
  // - watch for subtitle elements, then insist on the closing </Title>
  if ($in_title_element)
  {
    $title_line = getLine($xml_in);
    if (strpos($title_line, '<SubTitle>') !== false)
    {
      $title = getTitleSubTitle($xml_in, $element) . $title;
      $title_line = getLine($xml_in);
    }
    if (strpos($title_line, '</Title>') === false)
    {
      printError('Failed to find closing title for: ' . $element);
    }
  }
  return $title;
}
/** getTitle() **/
1085
/**
 * Rewrite HTML comment delimiters to the '%!--' / '--%' alternate form and
 * collapse any remaining CrossRef tags to their bare ref value.
 *
 * @param string $text text possibly containing HTML comments and CrossRefs
 * @return string the rewritten text
 */
function alternateComments($text)
{
  $text = str_replace('<!--', '%!--', $text);
  $text = str_replace('-->', '--%', $text);
  // remove any lurking crossrefs while we are at it
  // - the ref attribute may be preceded and followed by other attributes
  //   (target, external, lang...), so allow anything except '>' around it
  $text = preg_replace('/<CrossRef\b[^>]*?\bref="([^"]+)"[^>]*>/', '\\1', $text);
  return $text;
}
1094
/**
 * Turn <i>..</i> spans into //..// and strip HTML comments from the text.
 *
 * @param string $text text to clean
 * @return string text with italics converted and comments removed
 */
function noComments($text)
{
  // - patterns are applied in order: italics first, then comment removal
  $patterns = array('/<i>(.*?)<\/i>/', '/<!--[^>]+-->/');
  $replacements = array('//\1//', '');
  return preg_replace($patterns, $replacements, $text);
}
1100
/**
 * Translate a line of table code, decoding entities only, while keeping
 * italic tags alive as hidden HTML comments.
 *
 * The <i> and </i> tags are swapped for placeholders before calling
 * translateText($text, true) and come back afterwards as '<!--i-->' and
 * '<!--/i-->'.
 *
 * @param string $text raw table code line
 * @return string translated line with italics recorded as HTML comments
 */
function translateTableCodeline($text)
{
  $italic_tags = array('<i>', '</i>');
  $placeholders = array('%!--i--%', '%/i%');
  $hidden_comments = array('<!--i-->', '<!--/i-->');
  // Escape the italics tags so the translation cannot destroy them
  $escaped = str_replace($italic_tags, $placeholders, $text);
  // Translate the text, just decoding the entities
  $translated = translateText($escaped, true);
  // Turn the placeholders into HTML comments so they are remembered but
  // hidden in the text
  return str_replace($placeholders, $hidden_comments, $translated);
}
1116
/**
 * Convert a fragment of manual XML text into DokuWiki markup.
 *
 * Steps, in order:
 *  1. unless the global $in_code is set, HTML comment delimiters are rewritten
 *     to '%!--' / '--%' so the later transforms cannot damage them
 *  2. unless $entities_only is true: CrossRef, Link and FootnoteRef elements
 *     become wiki links / footnotes, URLs are protected from DokuWiki's
 *     '//' italics syntax, and <b>, <i>, <u> become '**', '//', '__'
 *  3. entities are decoded: first those listed in the global
 *     $entity_replacements (name => numeric code point), then &gt;, &lt; and
 *     finally &amp; (last, so '&amp;lt;' decodes to '&lt;' and not '<')
 *
 * Reads the globals $entity_replacements, $footnotes (id => footnote text)
 * and $in_code.
 *
 * @param string $text          the raw text to translate
 * @param bool   $entities_only when true, skip step 2
 * @return string the translated text
 */
function translateText($text, $entities_only=false)
{
  global $entity_replacements;
  global $footnotes;
  global $in_code;

  // - immediately find and protect any legitimate HTML comments in the text
  //   (so already using encoded entities), otherwise they'll be mangled
  //   during the following transforms. This has to be matched with changes
  //   to the HTMLComments plugin in Dokuwiki to allow the correct thing to be
  //   displayed to the user.
  if (!$in_code)
  {
    $text = str_replace('<!--', '%!--', $text);
    $text = str_replace('-->', '--%', $text);
  }

  if (!$entities_only)
  {
    // - replace linking constructs with dokuwiki ones
    // - external chapter crossrefs first. Every CrossRef tag is inspected
    //   and only those carrying both an external attribute and
    //   target="Chapter" are rewritten, so an unrelated CrossRef appearing
    //   earlier in the text is never mistaken for the external one.
    if (preg_match_all('/<CrossRef\s+([^>]*?)\/>/', $text, $crossrefs, PREG_SET_ORDER))
    {
      foreach ($crossrefs as $crossref)
      {
        $pattern = $crossref[0];
        $attributes = $crossref[1];
        if (strpos($attributes, 'external="') === false || strpos($attributes, 'target="Chapter"') === false)
        {
          continue;
        }
        $manual_name = '';
        if (preg_match('/external="([^"]+)"/', $attributes, $matches))
        {
          $manual_name = $matches[1];
        }
        $language = '';
        if (preg_match('/lang="([^"]+)"/', $attributes, $matches))
        {
          $language = $matches[1];
        }
        $page_id = '';
        if (preg_match('/ref="([^"]+)"/', $attributes, $matches))
        {
          $page_id = $matches[1];
        }
        if (empty($manual_name) || empty($language) || empty($page_id))
        {
          printError('Failed to parse external reference: ' . $pattern);
        }
        // - best we can do is a search within a restricted namespace
        $reference = '[[?do=search&id=' . $page_id . ' @' . $language . ':manuals:' . $manual_name . '|' . $page_id . ']]';
        $text = str_replace($pattern, $reference, $text);
      }
    }
    // - chapter crossrefs: the page name is the chapter id. Done as a single
    //   replace so the id is never interpolated into a regex unquoted.
    $text = preg_replace('/<CrossRef target="Chapter" ref="([^"]+)"\/>/', '[[.:\1|\1]]', $text);
    // - internal figure and table references
    $text = preg_replace('/<CrossRef target="Figure" ref="([^"]+)"\/>/','<imgref figure_\1>', $text);
    $text = preg_replace('/<CrossRef target="Table" ref="([^"]+)"\/>/','<tblref table_\1>', $text);
    // - simple internal reference
    $text = preg_replace('/<CrossRef target="Section" ref="([^"]+)"\/>/', '[[#\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Subsection" ref="([^"]+)"\/>/', '[[##\1|\1]]', $text);
    $text = preg_replace('/<CrossRef target="Part" ref="([^"]+)"\/>/', '[[###\1|\1]]', $text);
    // - simple external url
    $text = preg_replace('/<Link url="([^"]+)">(.+?)<\/Link>/', '[[\1|\2]]', $text);
    // - footnote references: the footnote text was collected earlier and is
    //   looked up by id. A literal str_replace is used so '$1' or '\1' inside
    //   the footnote text isn't treated as a regex backreference.
    while (preg_match('/<FootnoteRef id="(\d+)"\/>/', $text, $matches))
    {
      $footnote_id = $matches[1];
      $footnote = '';
      if (isset($footnotes[$footnote_id]))
      {
        $footnote = $footnotes[$footnote_id];
      }
      else
      {
        printError('Unknown footnote referenced: ' . $footnote_id);
      }
      $text = str_replace('<FootnoteRef id="' . $footnote_id . '"/>', '((' . $footnote . '))', $text);
    }
    // - detect and handle URLs surrounded by <i> or <u> tags very carefully
    //   (as doku will less than helpfully turn them into an external link and
    //   screw up everything that follows them on the page). The URL gets
    //   padded with spaces inside the wiki markup.
    //   example: <i>www.microsoft.com</i>
    //   example: <i>http://nzdl.org/cgi-bin/library</i>
    //   example: <i>http://www.yourserver.com/greenstone</i>
    $url_patterns = array(
      // three (or more) part host names
      '(?:https?\:\/\/)?[a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?',
      // two-part host names ending in .org (like nzdl.org)
      '(?:https?\:\/\/)?[a-z0-9\-]+\.org(?:\/.*?)?',
      // URLs on localhost
      '(?:https?\:\/\/)?localhost(?:\/.*?)?'
    );
    foreach (array('i' => '//', 'u' => '__') as $tag => $wiki_markup)
    {
      foreach ($url_patterns as $url_pattern)
      {
        $text = preg_replace('/<' . $tag . '>(' . $url_pattern . ')<\/' . $tag . '>/i', $wiki_markup . ' \1 ' . $wiki_markup, $text);
      }
    }

    // - stop doku reading the '//' of a URL scheme as the start of italics
    $text = preg_replace('/(file|ftp|https?):\/\//','\1:%%//%%', $text);
    // - restore the double slashes in dokuwiki links (one %% is removed per
    //   link per pass, hence the loop)
    while (preg_match('/\[\[[^\]]*%%[^\]]*\]\]/', $text))
    {
      $text = preg_replace('/(\[\[[^\]]*)%%([^\]]*\]\])/', '\1\2', $text);
    }
    // - replace HTML elements with the dokuwiki style equivalents; <br/> is
    //   deliberately left as it is
    $text = str_replace('<b>', '**', $text);
    $text = str_replace('</b>', '**', $text);
    $text = str_replace('<i>', '//', $text);
    $text = str_replace('</i>', '//', $text);
    $text = str_replace('<u>', '__', $text);
    $text = str_replace('</u>', '__', $text);
  }
  // Decode entities
  // - user defined entities (in the manual metadata)
  if (is_array($entity_replacements))
  {
    foreach ($entity_replacements as $entity=>$code)
    {
      $text = str_replace('&' . $entity . ';', html_entity_decode('&#'.$code.';',ENT_NOQUOTES,'UTF-8'), $text);
    }
  }
  // - standard entities (&amp; must stay last)
  $text = str_replace('&gt;','>', $text);
  $text = str_replace('&lt;','<', $text);
  $text = str_replace('&amp;','&', $text);
  return $text;
}
/** translateText() **/
1241
/**
 * Copy a manual image into the DokuWiki media tree and return the wiki
 * markup that embeds it.
 *
 * The source is $xml_source_path/<l>/images/$filename and the destination is
 * $dokuwiki_path/data/media/<l>/manuals/images/<lowercased filename>, where
 * <l> is $_REQUEST['l']. The destination directory is created with
 * mkAllDir() when missing.
 *
 * @param string $filename image file name as it appears in the source tree
 * @param mixed  $width    width placed in the image markup
 * @param mixed  $height   height placed in the image markup
 * @return string '{{..:images:<lowercased filename>?<width>x<height>&direct}}'
 */
function handleImage($filename, $width, $height)
{
  global $dokuwiki_path;
  global $xml_source_path;
  echo '[copying image: ' . $filename . "] \n";
  // - copy file into place
  $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
  if (!file_exists($destination_dir))
  {
    mkAllDir($destination_dir, 0755);
  }
  $destination_path = $destination_dir . strtolower($filename);
  // - judge success on the copy itself: only checking that the destination
  //   exists would let a stale file from an earlier import hide a failed or
  //   impossible copy
  $copied = is_file($source_path) && copy($source_path, $destination_path);
  if (!$copied || !file_exists($destination_path))
  {
    printError('Failed to copy image file: ' . $filename);
  }
  // - create the string
  $image_txt = '{{..:images:' . strtolower($filename) . '?' . $width . 'x' . $height . '&direct}}';
  return $image_txt;
}
1264
/**
 * Fetch the next line from an open handle, counting it in the global
 * $line_counter.
 *
 * The counter is bumped on every call, including the one that hits end of
 * file.
 *
 * @param resource $in handle to read from
 * @return string|false the line as returned by fgets(), false at end of file
 */
function getLine($in)
{
  global $line_counter;
  $next_line = fgets($in);
  $line_counter += 1;
  return $next_line;
}
1271
/**
 * Strip the bold, italic and line-break tags from a piece of text.
 *
 * @param string $text text possibly containing <b>, <i> or <br/> markup
 * @return string the text with those tags removed
 */
function noFormatting($text)
{
  // - removed one after another, in this order
  $markup = array('<b>', '</b>', '<i>', '</i>', '<br/>');
  return str_replace($markup, '', $text);
}
1281
1282	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats: