Context Navigation

← Previous Changeset
Next Changeset →

Changeset 25052

Timestamp:

2012-02-07T11:08:26+13:00 (12 years ago)

Author:

jmt12

Message:

Several changes to Metadata, Table and Figure parsing, as issues were encountered in languages other than English. I'd argue most of these were caused by bogus XML, but I had better import the pages anyway.

File:

: 1 edited

documentation/trunk/php/gs-manual-import.php (modified) (12 diffs)

Legend:

: Unmodified
: Added
: Removed

documentation/trunk/php/gs-manual-import.php

-              r25026
+              r25052
      addMetadata($looking_for_metadata, $text);
+   }
+   // - bogus metadata found in French version
+   elseif (!empty($looking_for_metadata) && preg_match('/<Text id="([^"]+)"\/>/', $line, $matches))
+   {
+   }
    // - any text we encounter outside of both sections and chapters also
    //   belongs on the cover
 …
+   }
    // - section, subsection and part titles within chapter
    elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]+)">/', $line, $matches))
+   elseif ($in_chapter && preg_match('/<(Section|Subsection|Part)\sid="([^"]*)">/', $line, $matches))
+   {
      $title_type = $matches[1];
      $section_id = $matches[2];
+     if (empty($section_id))
+     {
+       $section_id = generateID(strtolower($title_type));
+     }
+     echo '[adding ' . strtolower($title_type) . ': ' . $section_id . '] ';
      $header_fix = '';
      $title = getTitle($xml_in, 'heading: ' . $title_type);
 …
          $row_txt = '|';
          $line = getLine($xml_in);
          // - now we read in multiple cells
          while (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
+         // - now we read in multiple cells (lines starting with <th)
+         while (strpos($line, '<th') === 0)
+         {
+           $cell_width = $matches[1];
+           if (!$have_output_widths)
+           if (preg_match('/<th width="(\d+)"\/?>/', $line, $matches))
+           {
+             array_push($column_widths, $cell_width);
+             $cell_width = $matches[1];
+             if (!$have_output_widths)
+             {
+               array_push($column_widths, $cell_width);
+             }
+           }
            // Ignore empty cells
 …
+               }
                // - we can have images or text in our tables
                if (preg_match('/<File width="(\d+)" height="(\d+)" url="images\/([^"]+)"\/>/', $line, $matches))
+               if (preg_match('/<File.*url="images\/([^"]+)".*\/>/', $line, $matches))
+               {
+                 $image_txt = handleImage($matches[3], $matches[1], $matches[2]);
+                 $payload = $matches[0];
+                 $filename = $matches[1];
+                 $width = 0;
+                 if (preg_match('/width="(\d+)"/', $payload, $matches))
+                 {
+                   $width = $matches[1];
+                 }
+                 $height = 0;
+                 if (preg_match('/height="(\d+)"/', $payload, $matches))
+                 {
+                   $height = $matches[1];
+                 }
+                 $image_txt = handleImage($filename, $width, $height);
                  $row_txt .= ' ' . $image_txt . ' ';
+               }
                elseif (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $line, $matches))
+               elseif (preg_match('/<Text id="([^"]+)">(.*)/', $line, $matches))
+               {
+                 $row_txt .= ' <!-- id:' . $matches[1] . ' -->' . translateText($matches[2]) . ' ';
+                 $tid = $matches[1];
+                 $txt = $matches[2];
+                 // - multiple line text block
+                 while (strpos($txt, '</Text>') === false)
+                 {
+                   $txt .= getLine($xml_in);
+                 }
+                 $txt = str_replace('</Text>','',$txt);
+                 $row_txt .= ' <!-- id:' . $tid . ' -->' . translateText($txt) . ' ';
+               }
                elseif (preg_match('/<CodeLine>(.*?)<\/CodeLine>/',$line,$matches))
 …
                  $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
+               }
+               elseif (preg_match('/<CodeLine>(.*)/',$line,$matches))
+               {
+                 $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]) . '\'\' ';
+               }
+               elseif (preg_match('/(.*)<\/CodeLine>/',$line,$matches))
+               {
+                 if (!empty($matches[1]))
+                 {
+                   $row_txt .= ' \'\'' . translateTableCodeLine($matches[1]). '\'\' ';
+                 }
+                 else
+                 {
+                   $row_txt .= ' ';
+                 }
+               }
+               // we'll add (bogus) linebreaks
+               elseif (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
+               {
+                 $row_txt = ' ';
+               }
                else
+               {
                  printError('Warning! Unrecognized element in table: ' . $line);
+                 printError('Warning! Unrecognized element in table: ' . htmlspecialchars($line));
+               }
                $first = false;
 …
+   }
    // - pattern of lines to ignore
    elseif (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
+   else if (preg_match('/^(<\?xml version="1.0" encoding="UTF-8"\?>|<\!DOCTYPE Manual \[|\]>|<Bullet>|<\/?Content>|<\/?Footnote|<Manual id=".+?" lang=".+?">|<\/Manual>)/', $line))
+   {
+   }
    // - we ignore anything else in footnotes too, as they were handled in the
    //   preprocessing pass
+   elseif ($in_footnotes)
+   {
+   else if ($in_footnotes)
+   {
+   }
+   // - ignore empty lines
+   else if (preg_match('/^\s*$/', $line))
+   {
+   }
+   // - meh. French versions have random, non-text element, linebreaks floating
+   //   around. Guess I'll honor their formatting even though it's bogus
+   else if (preg_match('/^\s*<br\s*\/?>\s*$/', $line))
+   {
+     if ($in_chapter)
+     {
+       fwrite($chapter_txt_out, ' \\\\');
+     }
+     else
+     {
+       $frontmatter_text .= ' \\\\';
+     }
+   }
    // - danger Will Robinson!
 …
+{
   $title = '';
+  $in_title_element = false;
   // - the first thing in a chapter will be its title
   $title_line = getLine($xml_in);
+  // - super special case: some language versions don't wrap titles in title
+  //   element, so if the first thing we see is a text, we treat that as the
+  //   title
+  if (strpos($title_line, '<text') !== false)
+  {
+  }
   // - super special case: a table with an empty title
   if (strpos($title_line, '<Title/>') !== false)
 …
     return '';
+  }
+  if (strpos($title_line, '<Title>') === false)
+  {
+    printError('Failed to find opening title for: ' . $element);
+  }
+  $title_line = getLine($xml_in);
+  if (strpos($title_line, '<Title>') !== false)
+  {
+    $in_title_element = true;
+    $title_line = getLine($xml_in);
+  }
+  // - some horribly formed entries have the subtitle first within the title
+  //   element
+  if (strpos($title_line, '<SubTitle>') !== false)
+  {
+    $title_line = getLine($xml_in);
+    if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
+    {
+      $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
+    }
+    $title_line = getLine($xml_in);
+    if (strpos($title_line, '</SubTitle>') === false)
+    {
+      printError('Failed to find closing title for: ' . $element);
+    }
+    $title_line = getLine($xml_in);
+  }
   // - grab the chapter title now so we can store it in the page ordering
   if (preg_match('/<Text id="([^"]+)">(.*?)$/', $title_line, $matches))
 …
     // - now remove </Text>
     $str = preg_replace('/<\/Text>\s*/', '', $str);
     $title = '<!-- id:' . $id . ' -->' . $str;
+    $title = '<!-- id:' . $id . ' -->' . $str . $title;
+  }
   // - special case for (stoopid) empty titles that use up a text id
   elseif (preg_match('/<Text id="([^"]+)"\s*\/>/', $title_line, $matches))
+  {
     $title = '<!-- id:' . $matches[1] . ' -->';
+    $title = '<!-- id:' . $matches[1] . ' -->' . $title;
+  }
   else
 …
+  }
   // - watch for subtitle elements
+  $title_line = getLine($xml_in);
+  if (strpos($title_line, '<SubTitle>') !== false)
+  if ($in_title_element)
+  {
     $title_line = getLine($xml_in);
     if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
+    if (strpos($title_line, '<SubTitle>') !== false)
+    {
+      $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
+      $title_line = getLine($xml_in);
+      if (preg_match('/<Text id="([^"]+)">(.+?)<\/Text>/', $title_line, $matches))
+      {
+        $title = '<!-- id:' . $matches[1] . ' -->' . $matches[2] . ' ' . $title;
+      }
+      $title_line = getLine($xml_in);
+      if (strpos($title_line, '</SubTitle>') === false)
+      {
+        printError('Failed to find closing title for: ' . $element);
+      }
+      $title_line = getLine($xml_in);
+    }
+    $title_line = getLine($xml_in);
+    if (strpos($title_line, '</SubTitle>') === false)
+    if (strpos($title_line, '</Title>') === false)
+    {
       printError('Failed to find closing title for: ' . $element);
+    }
-    $title_line = getLine($xml_in);
+  }
-  if (strpos($title_line, '</Title>') === false)
+  {
-    printError('Failed to find closing title for: ' . $element);
+  }
   return $title;
 …
   $text = str_replace('<!--', '%!--', $text);
   $text = str_replace('-->', '--%', $text);
+  // remove any lurking crossrefs while we are at it
+  $text = preg_replace('/<CrossRef.*?ref="([^"]+)".*?>/', '\\1', $text);
   return $text;
+}
 …
   // - copy file into place
   $source_path = $xml_source_path . '/' . $_REQUEST['l'] . '/images/' . $filename;
+  $destination_path = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/' . strtolower($filename);
+  $destination_dir = $dokuwiki_path . '/data/media/' . $_REQUEST['l'] . '/manuals/images/';
+  if (!file_exists($destination_dir))
+  {
+    mkAllDir($destination_dir, 0755);
+  }
+  $destination_path = $destination_dir . strtolower($filename);
   copy($source_path, $destination_path);
   if (!file_exists($destination_path))

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 25052

Legend:

documentation/trunk/php/gs-manual-import.php

Download in other formats: