Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: documentation/trunk/php/gs-manual-export.php@ 25052

Last change on this file since 25052 was 25026, checked in by jmt12, 12 years ago
PHP scripts for importing and exporting the XML manuals into and out of Dokuwiki
File size: 41.7 KB

Line
<?php

require_once('common.php');
$debug = 1;

// There are some elements that are, in my opinion, incorrectly 'resolved' in
// code blocks just because they are valid HTML. For instance, &lt;i&gt; is
// resolved to <i> in code blocks, while something like &lt;Metadata&gt; is
// not. Set this to true to allow such abominations (for the purpose of
// comparing before and after versions of the XML). [jmt12]
$allow_bad_codeblocks = true;

/** @file gs-manual-export.php
 *  This script transforms the series of dokuwiki pages that make up a certain
 *  manual (as specified by the 'm' argument) in a certain language ('l') into
 *  the XML format required by the rest of the Greenstone manual generation
 *  scripts.
 *
 *  Request arguments read below:
 *   - l: language code (defaults to 'en')
 *   - m: manual name (defaults to 'develop')
 *   - v: version; 'draft' (default) reads the live pages, anything else asks
 *        getApprovedVersionPath() for the English pages
 *   - a: action; 'store' (default) or 'download'
 *
 *  NOTE(review): printError(), fileCat(), mkAllDir(), getApprovedVersionPath()
 *  and the $base_path / $dokuwiki_path / $dokuwiki_url variables are not
 *  defined in this file - presumably they come from common.php. The code
 *  below also assumes printError() terminates the script unless told
 *  otherwise - confirm.
 */

// 0. Initialization
$xml_source_path = '/research/jmt12/gsdl-docs/temp';
// - we have a counter to assign identifiers to text blocks etc without ids
$text_id_counter = 1;
// - we need an array of all the footnotes (footnote text => footnote id)
$footnotes = array();

// Defaults
if (!isset($_REQUEST['l']) || empty($_REQUEST['l']))
{
  $_REQUEST['l'] = 'en';
}
if (!isset($_REQUEST['m']) || empty($_REQUEST['m']))
{
  //$_REQUEST['m'] = 'user';
  //$_REQUEST['m'] = 'install';
  $_REQUEST['m'] = 'develop';
  //$_REQUEST['m'] = 'paper';
}
if (!isset($_REQUEST['v']) || empty($_REQUEST['v']))
{
  $_REQUEST['v'] = 'draft';
}
// - the D modifier stops '$' from also matching before a trailing newline
if (!isset($_REQUEST['a']) || !preg_match('/^(download|store)$/D', $_REQUEST['a']))
{
  $_REQUEST['a'] = 'store'; // Try to store the file to disk
}

echo '<html>' . "\n";
echo '<head>' . "\n";
echo '<title>GS Manual Export</title>' . "\n";
echo '</head>' . "\n";
echo '<body>' . "\n";

// - validate arguments before we use them (security). These values end up in
//   file paths and in the generated markup, so the whitelist must match the
//   whole value: without the D modifier "develop\n" would be accepted.
if (!preg_match('/^(develop|install|paper|user)$/D', $_REQUEST['m']))
{
  printError('Unknown manual type requested: ' . htmlspecialchars($_REQUEST['m']));
}

if (!preg_match('/^(ar|en|es|fr|pt-br|ru)$/D', $_REQUEST['l']))
{
  printError('Unknown language requested: ' . htmlspecialchars($_REQUEST['l']));
}

echo '<h2>Generating Greenstone Manual XML</h2>' . "\n";
echo '<p><b>Manual:</b> ' . $_REQUEST['m'] . ' <b>Language:</b> ' . $_REQUEST['l'] . "</p>\n<hr/>\n";
// 1. Create the XML output file handle
$xml_file_path = '';
if ($_REQUEST['a'] == 'download')
{
  $xml_file_path = fileCat(array($base_path, 'var', ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml'));
}
else
{
  // - construct the path using the information we've been provided as arguments
  $xml_file_dir = $xml_source_path . '/' . $_REQUEST['l'];
  mkAllDir($xml_file_dir, 0755);
  $xml_file_path = $xml_file_dir . '/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml';
  // - backup any existing file
  if (file_exists($xml_file_path))
  {
    $xml_backup_file_path = $xml_file_path . '.bak';
    if (!rename($xml_file_path, $xml_backup_file_path))
    {
      printError('Failed to rename existing manual file for backup');
    }
  }
}
// - and create a handle to the new file (every later step writes to it, so
//   fail loudly here rather than on the first fwrite)
$xml_out = fopen($xml_file_path, 'w');
if (!$xml_out)
{
  printError('Failed to open XML output file for writing');
}

// 2. Read in the top level page - this will give configuration data for the
//    manual and cover page, as well as specifying the order for the other
//    pages in the manual
echo "<p><b>Frontmatter:</b><br/>\n";
// - by reading this page we hope to populate an array of metadata, and also
//   extract the sequence of other pages within this manual
$cover_metadata = array();
$pages_in_order = array();
// - we now need to consider if the user has asked for a draft version (i.e.
//   includes the latest version of pages regardless of approval) or if only
//   the approved versions of pages should be included
// - only necessary for english version of manual, as those are the only pages
//   editable
$top_page_path = '';
if ($_REQUEST['v'] == 'draft' || $_REQUEST['l'] != 'en')
{
  // - again, we can construct the path to the top level page given the
  //   arguments provided
  $top_page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '.txt';
}
else
{
  $top_page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m']);
}

if (!file_exists($top_page_path))
{
  printError('Failed to locate top level page for manual');
}
// - we're going to open a handle to the file, then read it in line-by-line
//   watching for the lines we are interested in (using pattern matching)
$top_page_in = fopen($top_page_path, 'r');
if (!$top_page_in)
{
  printError('Failed to open top level page for reading');
}
$in_contents = false;
while (($line = fgets($top_page_in)) !== false)
{
  // - if we are capturing page order, and we encounter something that looks
  //   like a bulletpoint item pointing to a wiki page, then we append the
  //   name of that page to our pages in order array
  if ($in_contents && preg_match('/^\s+\*\s+\[\[.:' . $_REQUEST['m'] . ':(.+?)\|(.*?)\]\]\s*$/', $line, $matches))
  {
    array_push($pages_in_order, $matches[1]);
  }
  // - metadata is all encoded within dokuwiki tables: "^ Field | value |"
  elseif (preg_match('/^\^\s+([^\s]+)\s+\|\s+(.+?)\s+\|\s*$/', $line, $matches))
  {
    $field = $matches[1];
    $value = $matches[2];
    // - a field may repeat, so each field maps to a list of values
    $values = array();
    if (isset($cover_metadata[$field]))
    {
      $values = $cover_metadata[$field];
    }
    array_push($values, $value);
    $cover_metadata[$field] = $values;
  }
  // - watch for the heading 'Contents' to begin extracting page order
  //   information
  elseif (preg_match('/^=+\s(.+)\s=+$/', $line, $matches))
  {
    if ($matches[1] == 'Contents')
    {
      $in_contents = true;
    }
    // - any other title means we aren't capturing page order (anymore)
    else
    {
      $in_contents = false;
    }
  }
}
// - fgets() returns false both at end of file and on error; tell them apart
if (!feof($top_page_in))
{
  printError('Unexpected fgets() fail when reading top page');
}
fclose($top_page_in);
// - ensure we have the required metadata
$required_metadata = array('Heading','Title','Affiliation','Version','Date');
foreach ($required_metadata as $required_field)
{
  if (!isset($cover_metadata[$required_field]))
  {
    printError('Missing required metadata: ' . $required_field);
  }
}
// - now we can use the metadata to construct the XML header and the cover
//   page. This follows a pretty set recipe; only elements that can repeat
//   ---like Author, SupplementaryText etc---are at all tricky
fwrite($xml_out, '<?xml version="1.0" encoding="UTF-8"?>' . "\n");
fwrite($xml_out, '<!DOCTYPE Manual [' . "\n");
if (isset($cover_metadata['ENTITY']))
{
  foreach ($cover_metadata['ENTITY'] as $entity)
  {
    fwrite($xml_out, "\t" . '<!ENTITY ' . $entity . '>' . "\n");
  }
}
fwrite($xml_out, ']>' . "\n");
fwrite($xml_out, '<Manual id="' . ucfirst($_REQUEST['m']) . '" lang="' . $_REQUEST['l'] . '">' . "\n");

///cho "<p>[Debug] metadata: " . print_r($cover_metadata, true) . "</p>\n\n";

outputMetadataSingle($xml_out, $cover_metadata, 'Heading');
outputMetadataSingle($xml_out, $cover_metadata, 'Title');
outputMetadataSingle($xml_out, $cover_metadata, 'Author');
outputMetadataSingle($xml_out, $cover_metadata, 'Affiliation');
outputMetadataMultiple($xml_out, $cover_metadata, 'SupplementaryText');
outputMetadataMultiple($xml_out, $cover_metadata, 'Text');
outputMetadataMultiple($xml_out, $cover_metadata, 'Comment');
outputMetadataSingle($xml_out, $cover_metadata, 'Version');
outputMetadataSingle($xml_out, $cover_metadata, 'Date');

// 3. Process each page listed in the contents of the top level page in order
foreach ($pages_in_order as $page)
{
  processPage($xml_out, $page);
}

// 4. Output our list of footnotes (if any)
if (!empty($footnotes))
{
  fwrite($xml_out, '<FootnoteList>'. "\n");
  foreach ($footnotes as $footnote=>$footnote_id)
  {
    ///cho '[debug] footnotes: (' . $footnote_id . ') ' . $footnote . '<br />'. "\n";
    outputMetadataSingle($xml_out, $footnote, 'Footnote', $footnote_id);
  }
  fwrite($xml_out, '</FootnoteList>'. "\n");
}

// 5. Finalize and close the XML output
fwrite($xml_out, '</Manual>' . "\n");
fclose($xml_out);

// 6. Complete!
echo '<p><b>Complete!</b></p>' . "\n<hr/>\n";
if ($_REQUEST['a'] == 'download')
{
  echo '<p>Click <a href="var/' . ucfirst($_REQUEST['m']) . '_' . $_REQUEST['l'] . '.xml">here</a> to download XML file</p>' . "\n";
}
echo '<p>Click <a href="' . $dokuwiki_url . '/doku.php?id=' . $_REQUEST['l'] . ':manuals:' . $_REQUEST['m'] . '">here</a> to return to dokuwiki</p>' . "\n";
echo '</body>' . "\n";
echo '</html>';
exit(0);
/**
 * Write one metadata element, wrapping a single text block, to the XML output.
 *
 * Emits <$field> (with an id attribute when $mid is truthy), then at most one
 * text block via outputTextBlock(), then </$field>. Progress and "missing"
 * notices are echoed to the page.
 *
 * @param resource     $xml_out  open handle the XML is written to
 * @param array|string $metadata either an array of field => list of values
 *                               (only the first value of $field is used) or
 *                               the text itself
 * @param string       $field    element name, and the key looked up when
 *                               $metadata is an array
 * @param mixed        $mid      optional value for the element's id
 *                               attribute; omitted when falsy
 */
function outputMetadataSingle($xml_out, $metadata, $field, $mid=false)
{
  echo '[metadata: ' . $field . "] \n";
  if ($mid)
  {
    fwrite($xml_out, '<' . $field . ' id="' . $mid . '">' . "\n");
  }
  else
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (is_array($metadata))
  {
    if (isset($metadata[$field]) && isset($metadata[$field][0]))
    {
      outputTextBlock($xml_out, $metadata[$field][0]);
    }
    else
    {
      echo 'no such field or no metadata';
    }
  }
  // - compare as a string rather than using empty(), which would silently
  //   drop the legitimate text "0" (or the integer 0 that a numeric footnote
  //   text turns into when used as an array key)
  elseif ((string) $metadata !== '')
  {
    outputTextBlock($xml_out, $metadata);
  }
  else
  {
    echo 'no such field or no metadata';
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataSingle() **/
275
/**
 * Write one metadata element whose values are joined into a single text block.
 *
 * Emits <$field>, then one text block via outputTextBlock() holding either
 * the only value or all values joined by $separator, then </$field>. When
 * $final_separator is given it is used between the last two values instead of
 * $separator (e.g. ', ' and ' and ' give "A, B and C"). An existing but empty
 * value list produces an empty element.
 *
 * @param resource     $xml_out         open handle the XML is written to
 * @param array        $metadata        field => list of values
 * @param string       $field           element name and lookup key
 * @param string       $separator       glue between values
 * @param string|false $final_separator glue before the last value, or false
 *                                      to use $separator throughout
 */
function outputMetadataList($xml_out, $metadata, $field, $separator = ',', $final_separator = false)
{
  echo '[metadata list: ' . $field . "] \n";
  fwrite($xml_out, '<' . $field . '>' . "\n");
  if (isset($metadata[$field]))
  {
    $value_count = count($metadata[$field]);
    if ($value_count == 1)
    {
      outputTextBlock($xml_out, $metadata[$field][0]);
    }
    elseif ($value_count > 1)
    {
      // - $metadata is a by-value copy, so popping here does not affect the
      //   caller's array
      $last_value = '';
      if ($final_separator)
      {
        $last_value = array_pop($metadata[$field]);
      }
      $values = implode($separator, $metadata[$field]);
      if ($final_separator)
      {
        $values .= $final_separator . $last_value;
      }
      outputTextBlock($xml_out, $values);
    }
  }
  else
  {
    echo 'no such field or no metadata';
  }
  fwrite($xml_out, '</' . $field . '>' . "\n");
}
/** outputMetadataList() **/
308
/**
 * Write one metadata element containing a text block per value.
 *
 * Every value stored under $field is written with outputTextBlock(). The
 * values are wrapped in <$field>...</$field>, except for the field 'Text',
 * whose blocks are written without a wrapper.
 *
 * @param resource $xml_out  open handle the XML is written to
 * @param array    $metadata field => list of values
 * @param string   $field    element name and lookup key
 */
function outputMetadataMultiple($xml_out, $metadata, $field)
{
  echo '[metadata multiple: ' . $field . "] \n";
  // - Text blocks don't need to be wrapped in Text element
  $needs_wrapper = ($field != 'Text');
  if ($needs_wrapper)
  {
    fwrite($xml_out, '<' . $field . '>' . "\n");
  }
  if (isset($metadata[$field]))
  {
    foreach ($metadata[$field] as $value)
    {
      outputTextBlock($xml_out, $value);
    }
  }
  else
  {
    echo 'no such field or no metadata';
  }
  if ($needs_wrapper)
  {
    fwrite($xml_out, '</' . $field . '>' . "\n");
  }
}
/** outputMetadataMultiple() **/
333
/**
 * Entity-encode a code line from a table while keeping its italic markers.
 *
 * Italics are stored in the source as the HTML comments <!--i--> and
 * <!--/i-->. They are swapped for %-delimited placeholders so that
 * translateText() does not encode them, then turned into real <i> and </i>
 * tags afterwards.
 *
 * @param string $text raw code line
 * @return string the encoded line
 */
function translateTableCodeline($text)
{
  // Escape any italic tags hidden in HTML comments
  $text = str_replace('<!--i-->', '%!--i--%', $text);
  $text = str_replace('<!--/i-->', '%!--/i--%', $text);
  // Encode entities etc
  $text = translateText($text, true);
  // Restore any italics elements hidden above
  $text = str_replace('%!--i--%', '<i>', $text);
  $text = str_replace('%!--/i--%', '</i>', $text);
  return $text;
}
/** translateTableCodeline() **/
347
/**
 * Encode the XML-special characters &, < and > as entities.
 *
 * When $in_code_block is true and the global $allow_bad_codeblocks is set,
 * the encoded forms of <i>, </i> and <br/> are turned back into real tags.
 *
 * @param string $text          text to encode
 * @param bool   $in_code_block whether the text comes from a code block
 * @return string the encoded text
 */
function translateText($text, $in_code_block=false)
{
  global $allow_bad_codeblocks;
  // - the ampersand must be encoded first, otherwise the ampersands of the
  //   entities produced for < and > would be encoded a second time
  $text = str_replace('&', '&amp;', $text);
  $text = str_replace('<', '&lt;', $text);
  $text = str_replace('>', '&gt;', $text);
  if ($in_code_block && $allow_bad_codeblocks)
  {
    ///cho "** [debug] restoring bogus decoded tags in: |$text| **\n";
    $text = str_replace('&lt;i&gt;', '<i>', $text);
    $text = str_replace('&lt;/i&gt;', '</i>', $text);
    $text = str_replace('&lt;br/&gt;', '<br/>', $text);
  }
  return $text;
}
363
/**
 * Convert one block of dokuwiki text to manual XML and write it as a <Text>
 * element.
 *
 * Steps, in order: footnotes "((...))" are recorded in the global $footnotes
 * and replaced by references; an explicit "<!-- id:N -->" prefix supplies the
 * element id (otherwise the global $text_id_counter does); XML-special
 * characters are encoded; links, cross references, URLs and the italic, bold
 * and underline markers are rewritten as tags; protected placeholders are
 * restored; and the result is written to $xml_out.
 *
 * Reads the globals $cover_metadata and $allow_bad_codeblocks; updates the
 * globals $text_id_counter and $footnotes.
 *
 * @param resource $xml_out       open handle the XML is written to
 * @param string   $text          the dokuwiki text
 * @param string   $type          optional value for the element's type
 *                                attribute; 'code' also makes < and > get
 *                                encoded inside a code block
 * @param bool     $in_code_block whether the text belongs to a code block
 */
function outputTextBlock($xml_out, $text, $type='', $in_code_block = false)
{
  global $cover_metadata;
  global $text_id_counter;
  global $footnotes;
  global $allow_bad_codeblocks;

  // - Start by dealing with any footnotes before anything else
  while (preg_match('/\(\((.*?)\)\)/', $text, $matches))
  {
    $pattern = $matches[0];
    $footnote = $matches[1];
    // - a footnote text seen before keeps its id. Ids come from the size of
    //   the array, so overwriting an existing entry would orphan the old id
    //   and later hand the same number to a different footnote.
    if (isset($footnotes[$footnote]))
    {
      $footnote_id = $footnotes[$footnote];
    }
    else
    {
      $footnote_id = count($footnotes) + 1;
      $footnotes[$footnote] = $footnote_id;
    }
    // - note that we have to escape the footnote reference as the following
    //   code will convert any < and > to entities...
    $footnote_reference = '%FootnoteRef id="' . $footnote_id . '"/%';
    $text = str_replace($pattern, $footnote_reference, $text);
  }

  $text_id = '';
  // - check whether the string begins with an explicit id
  // NOTE(review): whitespace quantifiers reconstructed as optional - confirm
  if (preg_match('/^\s*<!--\s*id:(.+?)\s*-->(.*)$/', $text, $matches))
  {
    $text_id = $matches[1];
    $text = $matches[2];
    // - keep the automatic counter ahead of any explicit numeric id
    if (is_numeric($text_id))
    {
      $text_id_counter = $text_id + 1;
    }
  }
  else
  {
    $text_id = $text_id_counter;
    $text_id_counter++;
  }

  // - protect the special case of an HTML comment being actually displayed
  //   in the text (i.e. one that is already entity encoded)
  $text = preg_replace('/&lt;!--([\s\.]+?)--&gt;/','##lt##!--\1--##gt##',$text);

  // - strip any remaining (real) HTML comments
  $text = preg_replace('/<!--.*?-->/', '', $text);

  // we leave code blocks alone in terms of ampersands
  if (!$in_code_block)
  {
    // - ampersands aren't safe in XML...
    $text = str_replace('&', '&amp;', $text);
    // ...except for the entities that we have registered as metadata
    if (isset($cover_metadata['ENTITY']))
    {
      foreach ($cover_metadata['ENTITY'] as $entity)
      {
        if (preg_match('/([a-z]+)\s+"&#(\d+);"/', $entity, $matches))
        {
          $entity_name = $matches[1];
          if ($entity_name != 'mdash')
          {
            $entity_character = html_entity_decode('&#'.$matches[2].';',ENT_NOQUOTES,'UTF-8');
            // - undo the ampersand encoding for this registered entity
            $text = str_replace('&amp;' . $entity_name . ';', '&' . $entity_name . ';', $text);
            // - we also convert any characters that match the entity char
            //   into the entity
            $text = str_replace($entity_character, '&' . $entity_name . ';', $text);
          }
        }
      }
    }
    // - protect <br/> tags
    $text = str_replace('<br/>','%%br/%%',$text);
    // - encoding all of the < and > that appear in the text (rather than
    //   true html formatting)
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
    // - restore <br/> tags
    $text = str_replace('%%br/%%','<br/>',$text);
  }
  else if ($type == 'code')
  {
    $text = str_replace('<','&lt;',$text);
    $text = str_replace('>','&gt;',$text);
  }

  // - links, oh how I hate thee
  // - external links are slightly easier
  $text = preg_replace('/\[\[http:\/\/(.*?)\|(.*?)\]\]/', '<Link url="http://\1">\2</Link>', $text);
  // - internal links have to become the horrible <CrossRef> tags
  $text = preg_replace('/\[\[\.\:(.*?)\|[^\]]+\]\]/','<CrossRef target="Chapter" ref="\1"/>', $text);
  // - internal links starting with hash must be on the same page; the longest
  //   hash prefix has to be tried first
  $text = preg_replace('/\[\[###(.*?)\|.*?\]\]/','<CrossRef target="Part" ref="\1"/>', $text);
  $text = preg_replace('/\[\[##(.*?)\|.*?\]\]/','<CrossRef target="Subsection" ref="\1"/>', $text);
  $text = preg_replace('/\[\[#(.*?)\|.*?\]\]/','<CrossRef target="Section" ref="\1"/>', $text);
  // - 'external' internal wiki links are even worse - since we can't know
  //   what the page order number for another manual's chapters might be, we
  //   instead use a search
  // NOTE(review): the ampersand has normally been encoded by this point, so
  //   both '&' and '&amp;' are accepted - confirm against the original
  $text = preg_replace('/\[\[\?do\=search\&(?:amp;)?id\=([^\s]+)\s+@([a-z]+):manuals:([a-z]+)\|.*?\]\]/i', '<CrossRef external="\3" lang="\2" target="Chapter" ref="\1"/>', $text);
  // - references to images and tables (raw or already entity encoded)
  $text = preg_replace('/(?:<|&lt;)imgref\sfigure_(.+?)(?:>|&gt;)/','<CrossRef target="Figure" ref="\1"/>', $text);
  $text = preg_replace('/(?:<|&lt;)tblref\stable_(.+?)(?:>|&gt;)/','<CrossRef target="Table" ref="\1"/>', $text);
  // - explicitly convert URLs as they are a bit messy
  // - first all the cases of URLs in italics, without protocol
  $text = preg_replace('/\/\/\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  $text = preg_replace('/\/\/\s(localhost(?:\/.*?)?)\s\/\//i','<i>\1</i>', $text);
  // - now all the protocol ones (with care taken to protect // in protocol)
  $text = preg_replace('/\/\/\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  $text = preg_replace('/\/\/\shttp:\/\/(localhost(?:\/.*?)?)\s\/\//i','<i>http:##DOUBLESLASH##\1</i>', $text);
  // - next we have the underlined URLs sans protocols
  $text = preg_replace('/__\s([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  $text = preg_replace('/__\s(localhost(?:\/.*?)?)\s__/i','<u>\1</u>', $text);
  // - and finally the protocol prefixed underlined URLs
  $text = preg_replace('/__\shttp:\/\/([a-z]+\.[a-z0-9\-]+\.[a-z0-9\.\-]+(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/([a-z0-9\-]+\.org(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  $text = preg_replace('/__\shttp:\/\/(localhost(?:\/.*?)?)\s__/i','<u>http:##DOUBLESLASH##\1</u>', $text);
  // - lets also protect any other protocols we find floating around
  $text = preg_replace('/(file|ftp|http):\/\//i', '\1:##DOUBLESLASH##', $text);

  // - italic formatting (taking care of protected double slashes)
  $text = preg_replace('/%%\/\/%%/', '##DOUBLESLASH##', $text);
  $text = preg_replace('/\/{5}/', '<i>/</i>', $text); // another special case
  $text = preg_replace('/\/\/(\/.+?)\s*\/\//', '<i>\1</i>', $text); // another special case
  $text = preg_replace('/\/\/\s*(.+?\/)\/\//', '<i>\1</i>', $text); // another special case
  // NOTE(review): whitespace reconstructed as optional so that //text//
  //   (without padding) is italicised too - confirm against the original
  $text = preg_replace('/\/\/\s*(.+?)\s*\/\//', '<i>\1</i>', $text);
  $text = preg_replace('/##DOUBLESLASH##/', '//', $text);
  // - bold formatting
  $text = preg_replace('/\*\*([^"]+?)\*\*/', '<b>\1</b>', $text);
  // - underline formatting
  $text = preg_replace('/__([^"]+?)__/', '<u>\1</u>', $text);

  // - decode certain entities in codeblock (just because they are valid HTML,
  //   derp).
  if ($in_code_block && $allow_bad_codeblocks)
  {
    ///cho "** [debug] restoring bogus decoded tags in: |$text| **\n";
    $text = str_replace('&lt;i&gt;','<i>',$text);
    $text = str_replace('&lt;/i&gt;','</i>',$text);
    //$text = str_replace('&lt;br/&gt;','<br/>',$text);
  }
  // - restore protected entities
  $text = preg_replace('/##(gt|lt)##/','&\1;',$text);
  // - restore protected comment blocks
  $text = str_replace('%!--', '<!--', $text);
  $text = str_replace('--%', '-->', $text);
  // - restore protected footnote refs
  $text = preg_replace('/%FootnoteRef id="([^"]+)"\/%/', '<FootnoteRef id="\1"/>', $text);
  // output the text block
  $text = trim($text);
  // - compare against the empty string: empty() would also discard a block
  //   whose whole text is "0"
  if ($text === '')
  {
    fwrite($xml_out, '<Text id="' . $text_id . '"/>' . "\n");
  }
  else if (!empty($type))
  {
    fwrite($xml_out, '<Text type="' . $type . '" id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
  else
  {
    fwrite($xml_out, '<Text id="' . $text_id . '">' . $text . '</Text>' . "\n");
  }
}
/** outputTextBlock($xml_out, $text) **/
529
530	/**
531	*/
532	function processPage($xml_out, $page_name)
533	{
534	global $dokuwiki_path;
535	global $seen_ids;
536	echo "</p>\n<p><b>Export Chapter:</b> " . $page_name . "<br/>\n";
537	// - locate the page in question (taking into account if the user asked for a
538	// draft version or an approved version of the manual)
539	$page_path = '';
540	if ($_REQUEST['v'] == 'draft' \|\| $_REQUEST['l'] != 'en')
541	{
542	$page_path = $dokuwiki_path . '/data/pages/' . $_REQUEST['l'] . '/manuals/' . $_REQUEST['m'] . '/' . $page_name . '.txt';
543	}
544	else
545	{
546	$page_path = getApprovedVersionPath('en:manuals:' . $_REQUEST['m'] . ':' . $page_name);
547	}
548	$page_in = @fopen($page_path, 'r');
549	if (!$page_in)
550	{
551	printError('Failed to open page for reading:' . $page_name, false);
552	return;
553	}
554	// - once again we read in line-by-line, but this time we are going to output
555	// each line as we go through. We expect to encounter certain lines in a
556	// predefined order, and should complain if we don't find what we expect.
557	$in_chapter = false;
558	$in_section = false;
559	$in_subsection = false;
560	$in_part = false;
561	$in_list = false;
562	$lists = array();
563	$previous_listitem_type = '';
564	$in_figure = false;
565	$in_table = false;
566	$column_widths = array();
567	$in_code_block = false;
568	while (($line = fgets($page_in)) !== false)
569	{
570	// remove newline character
571	$line = preg_replace('/\r?\n$/','',$line);
572	// - we need to know the 'depth' for the bulletpoint lists
573	$depth = 0;
574	while (strlen($line) > 2 && preg_match('/^\s+[\*\-]/', $line) && substr($line, 0, 2) == ' ')
575	{
576	$depth++;
577	$line = substr($line, 2);
578	}
579	$first_character = substr($line, 0, 1);
580	// - special case for the end of bullet lists
581	if ($in_list && ($first_character != "*" && $first_character != "-"))
582	{
583	while (count($lists) > 0)
584	{
585	$list_type = array_pop($lists);
586	if ($list_type == '*')
587	{
588	fwrite($xml_out, '</Bullet>' . "\n");
589	fwrite($xml_out, '</BulletList>' . "\n");
590	}
591	else
592	{
593	fwrite($xml_out, '</NumberedItem>' . "\n");
594	fwrite($xml_out, '</NumberedList>' . "\n");
595	}
596	}
597	$in_list = false;
598	}
599	// - special case for the end of tables
600	if ($in_table && $first_character != '^' && $first_character != '\|')
601	{
602	fwrite($xml_out, '</TableContent>' . "\n");
603	fwrite($xml_out, '</Table>' . "\n");
604	$in_table = false;
605	}
606	// - special cases for premature closing of sections, subsections and parts
607	if (preg_match('/<!-- close:(section\|subsection\|part) -->/', $line, $matches))
608	{
609	// - we always try to do this (regardless of actual flag) as we must
610	// always close the smallest 'granularity' first
611	if ($in_part)
612	{
613	fwrite($xml_out, '</Content>' . "\n");
614	fwrite($xml_out, '</Part>' . "\n");
615	$in_part = false;
616	}
617	if ($in_subsection && ($matches[1] == 'section' \|\| $matches[1] == 'subsection'))
618	{
619	fwrite($xml_out, '</Content>' . "\n");
620	fwrite($xml_out, '</Subsection>' . "\n");
621	$in_subsection = false;
622	}
623	if ($in_section && $matches[1] == 'section')
624	{
625	fwrite($xml_out, '</Content>' . "\n");
626	fwrite($xml_out, '</Section>' . "\n");
627	$in_section = false;
628	}
629	}
630
631	// - if this page is a chapter, then the first thing on the page should be
632	// the chapter title (six equals)
633	if (preg_match('/====== (.+) ======/', $line, $matches))
634	{
635	$chapter_title = $matches[1];
636	$chapter_id = $page_name;
637	if (empty($chapter_id))
638	{
639	$chapter_id = generateID($chapter_title);
640	}
641	// - are we already processing a part? if so end it, end it now
642	if ($in_part)
643	{
644	fwrite($xml_out, '</Content>' . "\n");
645	fwrite($xml_out, '</Part>' . "\n");
646	$in_part = false;
647	}
648	// - are we already processing a subsection? if so end it, end it now
649	if ($in_subsection)
650	{
651	fwrite($xml_out, '</Content>' . "\n");
652	fwrite($xml_out, '</Subsection>' . "\n");
653	$in_subsection = false;
654	}
655	// - are we already processing a section? if so end it, end it now
656	if ($in_section)
657	{
658	fwrite($xml_out, '</Content>' . "\n");
659	fwrite($xml_out, '</Section>' . "\n");
660	$in_section = false;
661	}
662	// - are we already processing a chapter? if so end it, end it now
663	if ($in_chapter)
664	{
665	fwrite($xml_out, '</Content>' . "\n");
666	fwrite($xml_out, '</Chapter>' . "\n");
667	$in_chapter = false;
668	}
669	// - write out this chapter's header
670	fwrite($xml_out, '<Chapter id="' . $chapter_id . '">' . "\n");
671	outputMetadataSingle($xml_out, $chapter_title, 'Title');
672	fwrite($xml_out, '<Content>' . "\n");
673	$in_chapter = true;
674	}
675	// - the next likely thing to encounter is a section heading (five equals)
676	elseif (preg_match('/=====\s+(.+)\s+=====/', $line, $matches))
677	{
678	$section_title = $matches[1];
679	// - check for explicit section id
680	$section_id = '';
681	if (preg_match('/<!-- sid:(.+?) -->(.*)/', $section_title, $matches))
682	{
683	$section_id = $matches[1];
684	$section_title = $matches[2];
685	}
686	if (empty($section_id))
687	{
688	$section_id = generateID($section_title);
689	}
690	// - are we already processing a part? if so end it, end it now
691	if ($in_part)
692	{
693	fwrite($xml_out, '</Content>' . "\n");
694	fwrite($xml_out, '</Part>' . "\n");
695	$in_part = false;
696	}
697	// - are we already processing a subsection? if so end it, end it now
698	if ($in_subsection)
699	{
700	fwrite($xml_out, '</Content>' . "\n");
701	fwrite($xml_out, '</Subsection>' . "\n");
702	$in_subsection = false;
703	}
704	// - are we already processing a section? if so end it, end it now
705	if ($in_section)
706	{
707	fwrite($xml_out, '</Content>' . "\n");
708	fwrite($xml_out, '</Section>' . "\n");
709	$in_section = false;
710	}
711	// - write out this section's header
712	fwrite($xml_out, '<Section id="' . $section_id . '">' . "\n");
713	outputMetadataSingle($xml_out, $section_title, 'Title');
714	fwrite($xml_out, '<Content>' . "\n");
715	$in_section = true;
716	}
717	// - similar for subsection heading (four equals)
718	elseif (preg_match('/==== (.+) ====/', $line, $matches))
719	{
720	$subsection_title = $matches[1];
721	// - check for explicit subsection id
722	$subsection_id = '';
723	if (preg_match('/<!-- sid:(.+?) -->(.*)/', $subsection_title, $matches))
724	{
725	$subsection_id = $matches[1];
726	$subsection_title = $matches[2];
727	}
728	if (empty($subsection_id))
729	{
730	$subsection_id = generateID($subsection_title);
731	}
732	// - are we already processing a part? if so end it, end it now
733	if ($in_part)
734	{
735	fwrite($xml_out, '</Content>' . "\n");
736	fwrite($xml_out, '</Part>' . "\n");
737	$in_part = false;
738	}
739	// - are we already processing a subsection? if so end it, end it now
740	if ($in_subsection)
741	{
742	fwrite($xml_out, '</Content>' . "\n");
743	fwrite($xml_out, '</Subsection>' . "\n");
744	$in_subsection = false;
745	}
746	// - write out this subsection's header
747	fwrite($xml_out, '<Subsection id="' . $subsection_id . '">' . "\n");
748	outputMetadataSingle($xml_out, $subsection_title, 'Title');
749	fwrite($xml_out, '<Content>' . "\n");
750	$in_subsection = true;
751	}
752	// - and part heading (three equals)
753	elseif (preg_match('/=== (.+) ===/', $line, $matches))
754	{
755	$part_title = $matches[1];
756	// - check for explicit part id
757	$part_id = '';
758	if (preg_match('/<!-- sid:(.+?) -->(.*)/', $part_title, $matches))
759	{
760	$part_id = $matches[1];
761	$part_title = $matches[2];
762	}
763	if (empty($part_id))
764	{
765	$part_id = generateID($part_title);
766	}
767	// - are we already processing a part? if so end it, end it now
768	if ($in_part)
769	{
770	fwrite($xml_out, '</Content>' . "\n");
771	fwrite($xml_out, '</Part>' . "\n");
772	$in_part = false;
773	}
774	// - write out this part's header
775	fwrite($xml_out, '<Part id="' . $part_id . '">' . "\n");
776	outputMetadataSingle($xml_out, '//' . $part_title . '//', 'Title');
777	fwrite($xml_out, '<Content>' . "\n");
778	$in_part = true;
779	}
780	// - Ignore 5th level heading - they are only used to allow more convenient
781	// editing of figures and tables
782	elseif (preg_match('/== (.+) ==/', $line, $matches))
783	{
784	}
785	// - lists need special handling
786	elseif (preg_match('/^(\\|\-)\s+(.)/', $line, $matches))
787	{
788	$list_type = $matches[1];
789	$list_text = $matches[2];
790	$list_depth = count($lists);
791	if (!$in_list)
792	{
793	if ($list_type == '*')
794	{
795	fwrite($xml_out, '<BulletList>' . "\n");
796	}
797	else
798	{
799	fwrite($xml_out, '<NumberedList>' . "\n");
800	}
801	$in_list = true;
802	array_push($lists, $list_type);
803	}
804	// - this bullet is at the same depth as previous - close the previous
805	// point
806	elseif ($depth == $list_depth)
807	{
808	$previous_list_type = end($lists);
809	if ($previous_list_type == '*')
810	{
811	fwrite($xml_out, '</Bullet>' . "\n");
812	}
813	else
814	{
815	fwrite($xml_out, '</NumberedItem>' . "\n");
816	}
817	// - we don't match in type anymore... close the previous list and open
818	// a new list of the appropriate type
819	if ($list_type != $previous_list_type)
820	{
821	if ($previous_list_type == '*')
822	{
823	fwrite($xml_out, '</BulletList>' . "\n");
824	fwrite($xml_out, '<NumberedList>' . "\n");
825	}
826	else
827	{
828	fwrite($xml_out, '</NumberedNumbered>' . "\n");
829	fwrite($xml_out, '<BulletList>' . "\n");
830	}
831	array_pop($lists);
832	array_push($lists, $list_type);
833	}
834	}
835	else
836	{
837	// - we have either got deeper...
838	if ($depth > $list_depth)
839	{
840	if ($list_type == '*')
841	{
842	fwrite($xml_out, '<BulletList>' . "\n");
843	}
844	else
845	{
846	fwrite($xml_out, '<NumberedList>' . "\n");
847	}
848	array_push($lists, $list_type);
849	}
850	// ... or shallower in the bullet listing
851	if ($depth < $list_depth)
852	{
853	$previous_list_type = array_pop($lists);
854	if ($previous_list_type == '*')
855	{
856	fwrite($xml_out, '</Bullet>' . "\n");
857	fwrite($xml_out, '</BulletList>' . "\n");
858	}
859	else
860	{
861	fwrite($xml_out, '</NumberedItem>' . "\n");
862	fwrite($xml_out, '</NumberedList>' . "\n");
863	}
864	// - we still have to close the last item too
865	$previous_listitem_type = end($lists);
866	if ($previous_listitem_type == '*')
867	{
868	fwrite($xml_out, '</Bullet>' . "\n");
869	}
870	else
871	{
872	fwrite($xml_out, '</NumberedItem>' . "\n");
873	}
874	}
875	}
876	if ($list_type == '*')
877	{
878	fwrite($xml_out, '<Bullet>' . "\n");
879	}
880	else
881	{
882	fwrite($xml_out, '<NumberedItem>' . "\n");
883	}
884	// Special Case: bullets that contain (start) a code block
885	if (preg_match('/^(.)<code>\s$/', $list_text, $matches))
886	{
887	$list_text = $matches[1];
888	$in_code_block = true;
889	}
890
891	outputTextBlock($xml_out, $list_text);
892
893	// - to make things clearer, we'll process any and all code blocks within
894	// bullets here - especially as there may be more text block after
895	// the code block finishes
896	if ($in_code_block)
897	{
898	$sub_line = '';
899	while ($in_code_block && ($sub_line = fgets($page_in)) !== false)
900	{
901	$sub_line = trim($sub_line);
902	// - closing code
903	if (preg_match('/^<\/code>(.*)$/', $sub_line, $matches))
904	{
905	$sub_line = $matches[1]; // may be empty string
906	$in_code_block = false;
907	}
908	// - output another plain codeline
909	else
910	{
911	fwrite($xml_out, '<CodeLine>' . $sub_line . "</CodeLine>\n");
912	$sub_line = '';
913	}
914	}
915	// - if sub_line still has anything in it, then add that content as a
916	// text block
917	if (!empty($sub_line))
918	{
919	outputTextBlock($xml_out, $sub_line);
920	}
921	}
922	}
923	// - images start with an image caption 'element'
924	elseif (preg_match('/<imgcaption\s+figure_([a-z0-9_\-]+)\\|(.+)>\s*<\/imgcaption>/', $line, $matches))
925	{
926	$figure_id = $matches[1];
927	$figure_title = $matches[2];
928	// - watch for the special withLineNumber flag
929	$class_attribute = '';
930	if (strpos($figure_title, '%!-- withLineNumber --%') != false)
931	{
932	$class_attribute = ' class="withLineNumber"';
933	$figure_title = str_replace('%!-- withLineNumber --%','',$figure_title);
934	}
935	fwrite($xml_out, '<Figure id="' . $figure_id . '"' . $class_attribute . '>' . "\n");
936	echo '[figure: ' . $figure_id . "] \n";
937	fwrite($xml_out, '<Title>' . "\n");
938	// - decode any comments in the title (used to store explicit id
939	// information)
940	$figure_title = str_replace('%!--', '<!--', $figure_title);
941	$figure_title = str_replace('--%', '-->', $figure_title);
942	// - special case: the title may have a subtitle (as a prefix)
943	$figure_subtitle_id = '';
944	$figure_subtitle = '';
945	// - subtitle with explicit id
946	if (preg_match('/^(<!-- id:.+? -->\([a-z]\))\s(.)$/', $figure_title, $matches))
947	{
948	$figure_subtitle = $matches[1];
949	$figure_title = $matches[2];
950	}
951	// - subtitle without explicit id
952	else if (preg_match('/^(\([a-z]\))\s(.)$/', $figure_title, $matches))
953	{
954	$figure_subtitle = $matches[1];
955	$figure_title = $matches[2];
956	}
957	outputTextBlock($xml_out, $figure_title);
958	if (!empty($figure_subtitle))
959	{
960	fwrite($xml_out, '<SubTitle>' . "\n");
961	outputTextBlock($xml_out, $figure_subtitle);
962	fwrite($xml_out, '</SubTitle>' . "\n");
963	}
964	fwrite($xml_out, '</Title>' . "\n");
965	$in_figure = true;
966	// - record the id to prevent repeating
967	$seen_ids[$figure_id] = true;
968	}
969	// - tables start with a table caption 'element'
970	elseif (preg_match('/<tblcaption\s+table_([a-z0-9_\-]+)\\|([^>]+)>\s*<\/tblcaption>/', $line, $matches))
971	{
972	$table_id = $matches[1];
973	$table_title = $matches[2];
974	if ($table_title == '##NOCAPTION##')
975	{
976	echo '[non-captioned table: ' . $table_id . "] \n";
977	// - watch for autogenerated ids... no point in outputting them
978	if (preg_match('/^table(_\d+)?$/', $table_id))
979	{
980	fwrite($xml_out, "<Table>\n");
981	}
982	else
983	{
984	fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
985	}
986	fwrite($xml_out, '<Title/>' . "\n");
987	}
988	elseif ($table_title == '##HIDDEN##')
989	{
990	echo '[hidden table: ' . $table_id . "] \n";
991	// - watch for autogenerated ids... no point in outputting them
992	if (preg_match('/^table(_\d+)?$/', $table_id))
993	{
994	fwrite($xml_out, "<Table class=\"hidden\">\n");
995	}
996	else
997	{
998	fwrite($xml_out, '<Table class="hidden" id="' . $table_id . '">' . "\n");
999	}
1000	fwrite($xml_out, '<Title/>' . "\n");
1001	}
1002	else
1003	{
1004	echo '[table: ' . $table_id . "] \n";
1005	// - watch for autogenerated ids... no point in outputting them
1006	if (preg_match('/^table(_\d+)?$/', $table_id))
1007	{
1008	fwrite($xml_out, "<Table>\n");
1009	}
1010	else
1011	{
1012	fwrite($xml_out, '<Table id="' . $table_id . '">' . "\n");
1013	}
1014	fwrite($xml_out, '<Title>' . "\n");
1015	outputTextBlock($xml_out, $table_title);
1016	fwrite($xml_out, '</Title>' . "\n");
1017	}
1018	fwrite($xml_out, '<TableContent>' . "\n");
1019	$in_table = true;
1020	// - record the id to prevent repeating
1021	$seen_ids[$table_id] = true;
1022	}
1023	// - the second line in a table should be its column width values
1024	elseif (preg_match('/\\|<\s-\s([0-9 ]+?)\s>\\|/', $line, $matches))
1025	{
1026	$column_widths = explode(' ', $matches[1]);
1027	}
1028	// - then every row will be made of a number of cells
1029	elseif (preg_match('/^\\|(.*?)\\|$/', $line, $matches))
1030	{
1031	$row_content = $matches[1];
1032	$cell_contents = preg_split('/(\s+\\|\|\\|\s+)/', $row_content);
1033	fwrite($xml_out, '<tr>' . "\n");
1034	foreach ($cell_contents as $index=>$cell_content)
1035	{
1036	$cell_content = trim($cell_content);
1037	$th_text = '';
1038	if (isset($column_widths[$index]))
1039	{
1040	$th_text = '<th width="' . $column_widths[$index] . '"';
1041	}
1042	else
1043	{
1044	$th_text = '<th';
1045	}
1046	// - if the cell would be empty, we use the shorthand
1047	if (empty($cell_content))
1048	{
1049	$th_text .= '/>' . "\n";
1050	fwrite($xml_out, $th_text);
1051	}
1052	else
1053	{
1054	$th_text .= '>' . "\n";
1055	fwrite($xml_out, $th_text);
1056
1057	// GAH - this is proving harder than a hard thing thats hard.
1058	// The issue is that the most straightforward way of fixing this,
1059	// namely using explicit newlines (\\) in the dokuwiki txt causes
1060	// lots of legitimately translated <br/> to also be split up. I
1061	// think the only way forward would be to maybe extend the HTML
1062	// Comment plugin to also respect and process <br/> tags. Then I
1063	// can avoid transforming them, and use the \\ sentinel to
1064	// separate multi-line table cells.
1065	$cell_content_lines = explode('\\\\', $cell_content);
1066	foreach ($cell_content_lines as $cell_content)
1067	{
1068	// - watch out, as the content may be an image
1069	if (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $cell_content))
1070	{
1071	processImage($xml_out, $cell_content);
1072	}
1073	elseif (preg_match('/\'\'(.*)\'\'/', $cell_content, $matches))
1074	{
1075	fwrite($xml_out, '<CodeLine>' . translateTableCodeline($matches[1]) . '</CodeLine>' . "\n");
1076	}
1077	// - anything else is text
1078	else
1079	{
1080	outputTextBlock($xml_out, $cell_content);
1081	}
1082	}
1083	fwrite($xml_out, '</th>' . "\n");
1084	}
1085	}
1086	fwrite($xml_out, '</tr>' . "\n");
1087	}
1088	// - links to image media in the wiki!
1089	elseif (preg_match('/\{\{.+?[^:?]+\?\d+x\d+(&direct)?\}\}/', $line))
1090	{
1091	processImage($xml_out, $line);
1092	// - if we were processing a figure, then now is a good time to close it
1093	if ($in_figure)
1094	{
1095	fwrite($xml_out, '</Figure>' . "\n");
1096	$in_figure = false;
1097	}
1098	}
1099	// - if the line starts with a <code> block, then we have a tag
1100	// for that (which is special in that it get a unique text id)
1101	elseif (preg_match('/^<code\s\d\s>(.?)(<\/code>)?$/', $line, $matches) \|\| ($in_code_block && preg_match('/^(.*?)(<\/code>)?$/', $line, $matches)))
1102	{
1103	$payload = $matches[1];
1104	$found_end = (isset($matches[2]));
1105	$in_code_block = true;
1106	// - be careful with empty lines
1107	if (empty($payload))
1108	{
1109	// - as they may appear in the body of the code (in which case we need
1110	// to output them). The empty lines at the start or end of a code
1111	// block are just an unfortunate consequence of the support for code
1112	// line numbering.
1113	if (!$found_end && strpos($line, '<code') === false)
1114	{
1115	fwrite($xml_out, "<CodeLine/>\n");
1116	}
1117	}
1118	elseif (preg_match('/^<!-- id:([^\s]+) -->/', $payload, $matches))
1119	{
1120	$text_id = $matches[1];
1121	outputTextBlock($xml_out, $payload, 'code', true);
1122	// - record the id to prevent repeating
1123	$seen_ids[$text_id] = true;
1124	}
1125	else
1126	{
1127	fwrite($xml_out, '<CodeLine>' . translateText($payload, true) . '</CodeLine>' . "\n");
1128	}
1129	// - if we didn't find an endtag we have to keep doing code mode until
1130	// we do
1131	$in_code_block = (!$found_end);
1132	if ($found_end)
1133	{
1134	// - if we were processing a figure, then now is a good time to close it
1135	if ($in_figure)
1136	{
1137	fwrite($xml_out, '</Figure>' . "\n");
1138	$in_figure = false;
1139	}
1140	}
1141	}
1142	// - entities on a line by themselves (i.e. references to external files)
1143	// go through verbatim
1144	elseif (preg_match('/^\s&[a-z0-9_-]+;\s$/', $line))
1145	{
1146	fwrite($xml_out, $line . "\n");
1147	}
1148	// - lines starting with > are indented text blocks
1149	elseif (preg_match('/^>(.*)$/', $line, $matches))
1150	{
1151	$payload = $matches[1];
1152	fwrite($xml_out, "<Indented>\n");
1153	outputTextBlock($xml_out, $payload);
1154	fwrite($xml_out, "</Indented>\n");
1155	}
1156	// - everything else goes straight through as a text block
1157	// - note that for code blocks, even empty lines count
1158	elseif (!empty($line))
1159	{
1160	// - output the line of text having encoded entities etc
1161	outputTextBlock($xml_out, $line, '', $in_code_block);
1162	}
1163	}
1164	// Complete any open part
1165	if ($in_part)
1166	{
1167	fwrite($xml_out, '</Content>' . "\n");
1168	fwrite($xml_out, '</Part>' . "\n");
1169	$in_part = false;
1170	}
1171	// Complete any open subsection
1172	if ($in_subsection)
1173	{
1174	fwrite($xml_out, '</Content>' . "\n");
1175	fwrite($xml_out, '</Subsection>' . "\n");
1176	$in_subsection = false;
1177	}
1178	// Complete any open section
1179	if ($in_section)
1180	{
1181	fwrite($xml_out, '</Content>' . "\n");
1182	fwrite($xml_out, '</Section>' . "\n");
1183	$in_section = false;
1184	}
1185	// Complete any open chapter
1186	if ($in_chapter)
1187	{
1188	fwrite($xml_out, '</Content>' . "\n");
1189	fwrite($xml_out, '</Chapter>' . "\n");
1190	$in_chapter = false;
1191	}
1192	}
1193	/** processPage($xml_out, $page_name) **/
1194
/**
 * Export a dokuwiki image link as a <File/> element.
 *
 * Searches $text for the first image link of the form
 * {{<prefix><filename>?<width>x<height>}} (optionally ending in "&direct").
 * When one is found the image is copied from the wiki media directory of
 * the requested language into the "images" directory of the XML source
 * tree, and a <File/> element carrying the width, height and relative url
 * is written to $xml_out. Text without such a link is ignored.
 *
 * The <File/> element is written whether or not the copy succeeds; a failed
 * copy is only reported through printError().
 *
 * @param resource $xml_out open, writable handle for the XML being generated
 * @param string   $text    wiki text that may contain an image link
 * @return void
 */
function processImage($xml_out, $text)
{
  global $dokuwiki_path;
  global $xml_source_path;

  // Nothing to do unless the text actually holds a sized image link
  if (!preg_match('/\{\{.+?([^:?]+)\?(\d+)x(\d+)(&direct)?\}\}/', $text, $matches))
  {
    return;
  }
  $filename = $matches[1];
  $width = $matches[2];
  $height = $matches[3];

  // - copy the file into place. The source is looked up under the
  //   lower-cased filename while the destination keeps the case used in the
  //   wiki text (presumably because dokuwiki stores media names in lower
  //   case - TODO confirm)
  // NOTE(review): $_REQUEST['l'] is used directly in both filesystem paths;
  // confirm it is validated before this function is reached.
  $image_source_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
  $image_destination_dir = $xml_source_path . '/' . $_REQUEST['l'] . '/images';
  mkAllDir($image_destination_dir, 0755);
  $image_destination_path = $image_destination_dir . '/' . $filename;
  if (copy($image_source_path, $image_destination_path))
  {
    echo '[copying file: ' . $filename . "] \n";
  }
  else
  {
    printError('Failed to copy image into place: ' . $filename, false);
  }

  // - spit out the XML element. Width and height are guaranteed to be digits
  //   by the pattern above, but the filename may contain characters that are
  //   special in XML (&, <, ", ...) so it must be escaped before being
  //   placed inside the attribute value
  $url = 'images/' . htmlspecialchars($filename, ENT_QUOTES, 'UTF-8');
  fwrite($xml_out, '<File width="' . $width . '" height="' . $height . '" url="' . $url . '"/>' . "\n");
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: