News:

Please note these forums are mostly a testing ground for my SMF work and I don't really use them otherwise.

Main Menu

Paste-1207589364:v:use_geshi-1:v:type-php

Started by JayBachatero, Apr 07, 2008, 05:29 PM

Previous topic - Next topic

0 Members and 1 Guest are viewing this topic.

JayBachatero

<?php

// The input path is whatever follows the first 11 characters of the first
// command line argument (presumably a "--filename=" style prefix - TODO confirm
// against how the script is invoked).
if (!isset($_SERVER['argv'][1]) || strlen($_SERVER['argv'][1]) <= 11)
{
   echo "No input file given.\n";
   exit(1);
}

$filename = substr($_SERVER['argv'][1], 11);

// file() hands back false on failure; stop here instead of looping over a non-array.
$lines = file($filename);
if ($lines === false)
{
   echo 'Unable to read "' . $filename . '".' . "\n";
   exit(1);
}

// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title for the section <h3>
$title = '';
// <h4> title
$sub_title = '';

// Array keys of the section / sub section that data lines are currently filed under.
// Initialised up front so lines that show up before the first <h3> have a defined home.
$clean_title = '';
$clean_sub_title = '';

// Running number used to key sub sections that have no <h4> of their own.
$untitled_count = 0;

// In the html body?
$in_html_body = false;

// Header values found in the body (not used anywhere else in this part of the script).
$front = array();
// Parsed result: section key => array('title' => ..., 'sub_section' => array(key => array('title' => ..., 'data' => lines))).
$sections = array();
foreach ($lines as $line)
{
   // Clean spaces.
   $line = trim($line);

   // No blank lines.  Compared against '' on purpose: empty() would also drop a line holding just "0".
   if ($line === '')
      continue;

   // Skip everything up to and including the line that is exactly <body>.
   if (!$in_html_body)
   {
      if ($line === '<body>')
         $in_html_body = true;

      continue;
   }

   // We just end here.  We don't parse the table and image stuff just yet.
   if (in_array($line, array('</body>', '</html>'), true))
      break;

   // Lets just do the head stuff first since it's usually on top.
   if (substr($line, 0, 11) === 'Article ID:')
   {
      $front['article_id'] = trim(substr($line, 11));
      continue;
   }
   elseif (substr($line, 0, 12) === 'Publication:')
   {
      $front['publication'] = trim(substr($line, 12));
      continue;
   }

   // In a first level section and we already in one.  Just go to the new one.
   if ($in_section && substr($line, 0, 4) === '<h3>')
   {
      $in_section = false;
      $in_sub_section = false;
      $title = '';
      $sub_title = '';
   }

   // A section?  Only a heading that opens and closes on this one line counts.
   if (!$in_section && preg_match('~^<h3>(.+?)</h3>$~', $line, $matches))
   {
      $in_section = true;
      $title = trim($matches[1]);

      // Clean title.
      $clean_title = strtolower(substr($title, 0, 50));

      // Two headings can share the same first 50 characters.  Add a suffix so the
      // later one doesn't wipe out the earlier section.
      $base_title = $clean_title;
      for ($suffix = 2; isset($sections[$clean_title]); $suffix++)
         $clean_title = $base_title . '_' . $suffix;

      // We start a new section in the array.
      $sections[$clean_title] = array(
         'title' => $title,
         'sub_section' => array(),
      );

      // NEXT...
      continue;
   }

   // In a sub section and another one just pops up? We no longer in the other one then.  One section at a time.
   if ($in_section && $in_sub_section && substr($line, 0, 4) === '<h4>')
      $in_sub_section = false;

   // Sub section.
   if ($in_section && !$in_sub_section && preg_match('~^<h4>(.+?)</h4>$~', $line, $matches))
   {
      $in_sub_section = true;
      $sub_title = trim($matches[1]);

      // Clean sub title.
      $clean_sub_title = strtolower(substr($sub_title, 0, 50));

      // We don't need any more data from this line so... NEXT!!!
      continue;
   }
   elseif ($in_section && !$in_sub_section)
   {
      // Data without an <h4> in front of it: file it under an untitled sub section.
      // The title is cleared so a previous <h4> doesn't leak in, and a counter
      // (rather than the clock) keeps the key unique and repeatable.
      $in_sub_section = true;
      $sub_title = '';
      $clean_sub_title = 'no_title_' . (++$untitled_count);
   }

   // Make sure both containers exist before we file the line.  The section one is
   // only missing for lines that come before the first <h3>.
   if (!isset($sections[$clean_title]))
      $sections[$clean_title] = array(
         'title' => $title,
         'sub_section' => array(),
      );

   if (!isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
      $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
         'title' => $sub_title,
         'data' => array(),
      );

   // Now for the data.
   $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}

// Dump the parsed structure so it can be eyeballed.
print_r($sections);

$xml = '
   <body>';

// Are we between a lone <p> line and its closing </p> line?  Set before the loop
// so the check further down never reads an undefined variable.
$open_p = false;

foreach ($sections as $section)
{
   // A section may lack a title (e.g. lines found before the first heading).
   $section_title = isset($section['title']) ? (string) $section['title'] : '';

   // First level?  Then it's a page.
   $xml .= '
      <sec sec-type="page">
         <title>' . htmlspecialchars($section_title) . '</title>';

   // Ok we move on grass hoppa.
   foreach ($section['sub_section'] as $sub_section)
   {
      // Compared against '' rather than empty() so a title of "0" is still written out.
      $sub_heading = isset($sub_section['title']) ? (string) $sub_section['title'] : '';

      $xml .= '
         <sec sec-type="content">
            ' . ($sub_heading === '' ? '<title/>' : '<title>' . htmlspecialchars($sub_heading) . '</title>');

      // Now the data.
      foreach ($sub_section['data'] as $data)
      {
         // Table references inside the text are linked by xml_replacement().

         // Bullet list?
         if ($data === '<ul>')
            $xml .= '
            <list list-type="bullet">';
         // How about ordered?
         elseif ($data === '<ol>')
            $xml .= '
            <list list-type="order">';
         // List items?
         elseif (substr($data, 0, 4) === '<li>')
         {
            // Drop the opening tag, and the closing one only when it is really there.
            // Blindly cutting five characters would eat text from an item whose
            // </li> sits on a later line.
            $item = (string) substr($data, 4);
            if (substr($item, -5) === '</li>')
               $item = (string) substr($item, 0, -5);

            $xml .= '
               <list-item><p>' . xml_replacement(htmlspecialchars($item)) . '</p></list-item>';
         }
         // Closing them?
         elseif ($data === '</ul>' || $data === '</ol>')
            $xml .= '
            </list>';
         // A paragraph tag on a line of its own opens a multi line paragraph.
         elseif ($data === '<p>')
         {
            $xml .= '
            <p>';
            $open_p = true;
         }
         elseif ($data === '</p>')
         {
            $xml .= '
            </p>';
            $open_p = false;
         }
         // Text inside a multi line paragraph.
         elseif ($open_p)
            $xml .= '
               ' . xml_replacement(htmlspecialchars($data));
         // A whole paragraph on one line.
         elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
            $xml .= '
            <p>' . xml_replacement(htmlspecialchars($matches[1])) . '</p>';
         // Anything else goes out as escaped text.
         else
            $xml .= '
            ' . xml_replacement(htmlspecialchars($data));
      }

      $xml .= '
         </sec>';
   }

   $xml .= '
      </sec>';
}

// Close the element we opened at the very top so the result is well formed.
$xml .= '
   </body>';

echo $xml;

// The result is saved next to the input file, with a trailing ".html" swapped for ".xml".
$output_path = dirname($filename) . '/' . basename($filename, '.html') . '.xml';

// fopen() gives back false when the file can't be opened; stop before handing
// that to fwrite()/fclose().
$fp = fopen($output_path, 'w');
if ($fp === false)
{
   echo "\nCOULD NOT OPEN FILE FOR WRITING";
   exit(1);
}

// Only claim success when every byte made it out.
if (fwrite($fp, $xml) === strlen($xml))
   echo "\nWROTE AND SAVED FILE";
else
   echo "\nCOULD NOT WRITE FILE";
fclose($fp);

// Turns every "Table <number>" mention in $data into an <xref> element that
// points at rid "T<number>" and keeps the original wording as its text.
// Text without such a mention comes back unchanged.
function xml_replacement($data)
{
   $table_mention = '~Table (\d+)~';
   $table_xref = '<xref ref-type="table" rid="T\1">Table \1</xref>';

   // Nothing to link?  Then there is nothing to do.
   if (!preg_match('~Table \d+~', $data))
      return $data;

   return preg_replace($table_mention, $table_xref, $data);
}

?>