<?php
// Command-line entry point: converts a structured HTML article into
// journal-publishing XML.
//
// The first CLI argument carries the input path after an 11-character prefix.
// NOTE(review): presumably a "--filename=" style option (11 characters) - confirm against the caller.
if (!isset($_SERVER['argv'][1]) || strlen($_SERVER['argv'][1]) <= 11)
{
    file_put_contents('php://stderr', "No input file given.\n");
    exit(1);
}
$filename = substr($_SERVER['argv'][1], 11);
// file() returns false when the file cannot be read; stop instead of parsing nothing
// and writing an empty article next to a path we could not open.
$lines = file($filename);
if ($lines === false)
{
    file_put_contents('php://stderr', 'Unable to read input file: ' . $filename . "\n");
    exit(1);
}
// Are we inside a first level (<h3>) section?
$in_section = false;
// Are we inside a second level (<h4>) sub section?
$in_sub_section = false;
// Title of the current <h3> section.
$title = '';
// Title of the current <h4> sub section.
$sub_title = '';
// Have we passed the opening <body> tag yet?
$in_html_body = false;
// Set once the "References" heading was seen, cleared when its list closes.
$found_reference = false;
// Inside the reference list itself?
$in_list = false;
// Tables?
$tables_count = 0;
// Front matter; the parse loop overwrites these from the "Label: value" lines.
// The publication date defaults to today.
$front = array(
    'article_id' => 0,
    'publication' => 0,
    'column' => '',
    'volume' => '',
    'issue' => '',
    'pages' => '',
    'date' => array(
        'day' => date('d'),
        'month' => date('m'),
        'year' => date('Y'),
    ),
    'pub_abbr' => '',
);
// Back matter: references, acknowledgement, footnote texts and sidebars.
$back = array(
    'references' => array(),
    'ack' => '',
    'financial' => '',
    'conflict' => '',
    'abbr' => '',
    'reprint' => '',
    'sidebar' => array(),
);
// Parsed sections, keyed by a lowercased, truncated form of their title.
$sections = array();
// "Label: value" metadata lines whose trimmed value is stored as-is in $front.
$front_fields = array(
    'Article ID:' => 'article_id',
    'Publication ID:' => 'publication',
    'Section/Column ID:' => 'column',
    'Volume:' => 'volume',
    'Issue:' => 'issue',
    'Pages:' => 'pages',
    'Surname:' => 'surname',
    'Disclosure:' => 'disclosure',
    'Bio:' => 'bio',
    'Pub Abbr:' => 'pub_abbr',
);
// Same idea for $back. "Funding:" is an alias of "Financial:".
$back_fields = array(
    'Ack:' => 'ack',
    'Financial:' => 'financial',
    'Funding:' => 'financial',
    'Conflict:' => 'conflict',
    'Abbr:' => 'abbr',
    'Reprint:' => 'reprint',
);
// Keys of the section / sub section currently being filled, and the type of the
// current sub section. Initialised here because content may show up before the
// first <h3> or <h4>; such content is still collected, under empty keys.
$clean_title = '';
$clean_sub_title = '';
$section_type = '';
foreach ($lines as $line)
{
    // Clean spaces.
    $line = trim($line);
    // No blank lines.
    if (empty($line))
        continue;
    // Skip the head matter: everything up to and including the <body> line.
    if (!$in_html_body)
    {
        if ($line == '<body>')
            $in_html_body = true;
        continue;
    }
    // We just end here. We don't parse the table and image stuff just yet.
    if (in_array($line, array('</body>', '</html>'), true))
        break;
    // Bare comment delimiters carry no data.
    if (in_array($line, array('<!--', '-->'), true))
        continue;
    // Metadata lines start with a known label that ends at the first colon.
    $colon = strpos($line, ':');
    $label = $colon === false ? '' : substr($line, 0, $colon + 1);
    if (isset($front_fields[$label]))
    {
        $front[$front_fields[$label]] = trim(substr($line, strlen($label)));
        continue;
    }
    if (isset($back_fields[$label]))
    {
        $back[$back_fields[$label]] = trim(substr($line, strlen($label)));
        continue;
    }
    if ($label == 'Pub Date:')
    {
        // Read as month/day/year. array_pad() keeps list() from reading missing
        // offsets; any part left empty falls back to today's value.
        list ($month, $day, $year) = array_pad(explode('/', trim(substr($line, 9))), 3, '');
        $front['date']['month'] = empty($month) ? date('m') : $month;
        $front['date']['day'] = empty($day) ? date('d') : $day;
        $front['date']['year'] = empty($year) ? date('Y') : $year;
        continue;
    }
    if ($label == 'Images:')
    {
        $front['has_images'] = trim(substr($line, 7)) == 'true';
        continue;
    }
    // References here.
    if ($line == '<h3>References</h3>')
    {
        $found_reference = true;
        continue;
    }
    // A new first level heading closes whatever section we were in.
    if ($in_section && !$found_reference && substr($line, 0, 3) == '<h3')
    {
        $in_section = false;
        $in_sub_section = false;
        $title = '';
        $sub_title = '';
        $section_type = '';
    }
    // A section? The optional id attribute is captured as the section type.
    if (!$in_section && !$found_reference && preg_match('~^<h3(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h3>$~', $line, $matches))
    {
        $in_section = true;
        // Group 2 always takes part in a match, so group 1 is always present
        // (as an empty string when the heading has no id).
        $title = trim($matches[2]);
        // Clean title.
        $clean_title = strtolower(substr($title, 0, 50));
        // Two headings can share a key (several "." pages, or long titles with the
        // same first 50 characters). Pick a free key so the earlier section is not
        // overwritten.
        $base_title = $clean_title;
        for ($suffix = 2; isset($sections[$clean_title]); $suffix++)
            $clean_title = $base_title . '_' . $suffix;
        // We start a new section in the array. A title of "." means "no title".
        $sections[$clean_title] = array(
            'title' => $title == '.' ? '' : $title,
            'sub_section' => array(),
        );
        $sections[$clean_title]['type'] = trim($matches[1]);
        // NEXT...
        continue;
    }
    // First lets check if we even doing references yet then check for list.
    if ($found_reference && ($line == '<ol>' || $line == '<ul>'))
    {
        $in_list = true;
        // Read by generate_references() through a global.
        $reference_ordered = $line == '<ol>';
        continue;
    }
    if ($in_list && $found_reference && substr($line, 0, 4) == '<li>')
    {
        // Drop the leading <li> and the trailing 5 characters (the closing </li>).
        $back['references'][] = htmlspecialchars_uni(substr($line, 4, -5));
        continue;
    }
    if ($in_list && $found_reference && ($line == '</ol>' || $line == '</ul>'))
    {
        $found_reference = false;
        $in_list = false;
        continue;
    }
    // In a sub section and another one just pops up? We no longer in the other one then. One section at a time.
    if ($in_section && $in_sub_section && substr($line, 0, 3) == '<h4')
        $in_sub_section = false;
    // Sub section.
    if ($in_section && !$in_sub_section && preg_match('~^<h4(?:\s*id="([a-zA-Z0-9_]+)")?>(.+?)</h4>$~', $line, $matches))
    {
        $in_sub_section = true;
        $sub_title = trim($matches[2]);
        // Clean sub title, made unique within the section so that two sub sections
        // with the same heading are not merged into one.
        $clean_sub_title = strtolower(substr($sub_title, 0, 50));
        $base_sub_title = $clean_sub_title;
        for ($suffix = 2; isset($sections[$clean_title]['sub_section'][$clean_sub_title]); $suffix++)
            $clean_sub_title = $base_sub_title . '_' . $suffix;
        // Type? (empty when the heading has no id attribute)
        $section_type = trim($matches[1]);
        // We don't need any more data from this line so... NEXT!!!
        continue;
    }
    elseif ($in_section && !$in_sub_section)
    {
        // Content directly under a section heading goes into an untitled sub section.
        $in_sub_section = true;
        $clean_sub_title = 'no_title_' . substr(md5(time()), 0, 5);
        $base_sub_title = $clean_sub_title;
        for ($suffix = 2; isset($sections[$clean_title]['sub_section'][$clean_sub_title]); $suffix++)
            $clean_sub_title = $base_sub_title . '_' . $suffix;
    }
    // Create the sub section entry the first time a content line arrives for it.
    if (isset($sections[$clean_title]) && !isset($sections[$clean_title]['sub_section'][$clean_sub_title]))
        $sections[$clean_title]['sub_section'][$clean_sub_title] = array(
            'title' => $sub_title,
            'type' => $section_type,
            'data' => array(),
        );
    // Now for the data.
    $sections[$clean_title]['sub_section'][$clean_sub_title]['data'][] = $line;
}
// Assemble <body>: each first level section becomes a "page" <sec> that wraps
// one nested <sec> per sub section. Sidebar sections are not printed here;
// they are handed to the back matter instead.
$body = '
<body>';
foreach ($sections as $page)
{
    $is_sidebar = isset($page['type']) && $page['type'] == 'sidebar';
    if ($is_sidebar)
    {
        $back['sidebar'][] = $page;
        continue;
    }
    // An empty title is written as a self-closing element.
    if (empty($page['title']))
        $page_title = '<title/>';
    else
        $page_title = '<title>' . htmlspecialchars_uni($page['title']) . '</title>';
    $body .= '
<sec sec-type="page">
' . $page_title;
    foreach ($page['sub_section'] as $part)
    {
        // Sub sections without an explicit type are plain content.
        $part_type = empty($part['type']) ? 'content' : $part['type'];
        if (empty($part['title']))
            $part_title = '<title/>';
        else
            $part_title = '<title>' . htmlspecialchars_uni($part['title']) . '</title>';
        $body .= '
<sec sec-type="' . $part_type . '">
' . $part_title . parse_content($part['data']) . '
</sec>';
    }
    $body .= '
</sec>';
}
$body .= '
</body>';
// Generate the front section
$front_xml = generate_front($front);
// Back matter (tables, references, notes, sidebars)
$back_xml = generate_back($back);
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article SYSTEM "/content/journal-publishing-dtd-1.1/journalpublishing.dtd">
<article>' . $front_xml . $body . $back_xml . '
</article>';
// Emails: every address found anywhere in the document is replaced by an ext-link element.
$xml = preg_replace('~([0-9A-Za-z=_+\-/][0-9A-Za-z=_\'+\-/\.]*@[\w\-]+(\.[\w\-]+)*(\.[\w]{2,6}))~', '<ext-link ext-link-type="mailto" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="\1" xlink:title="\1"/>', $xml);
echo $xml;
// The result is saved next to the input, with ".html" swapped for ".xml".
$target = dirname($filename) . '/' . basename($filename, '.html') . '.xml';
$fp = fopen($target, 'w+');
if ($fp === false)
{
    // fopen() returns false on failure; passing that on to fwrite()/fclose() would only raise errors.
    file_put_contents('php://stderr', "\nUNABLE TO OPEN " . $target . " FOR WRITING\n");
    exit(1);
}
if (fwrite($fp, $xml))
    echo "\nWROTE AND SAVED FILE";
else
    file_put_contents('php://stderr', "\nUNABLE TO WRITE " . $target . "\n");
fclose($fp);
/**
 * Turns table mentions and figure markers inside a piece of text into XML.
 *
 * - "Table N", "Exhibit N" and "Box N" are wrapped in an <xref> pointing at id "TN".
 * - [[Figure|^|N|^|name|^|caption]] becomes a <fig> whose graphics are named after "name".
 * - [[InlineFigure|^|N|^|text]] becomes an <inline-graphic> and [[Figure|^|N|^|caption]]
 *   a <fig>, both named after the publication abbreviation and article id. These two
 *   forms are only handled when $front['pub_abbr'] is set.
 *
 * The three-field form is handled first. Otherwise the two-field Figure pattern
 * would swallow it and put "name|^|caption" into the caption. Every marker kind
 * on a line is converted, not only the kind that happens to appear first.
 *
 * Reads the global $front ('article_id', 'pub_abbr') to build image paths.
 *
 * @param string $data Text to convert. Callers in this file escape it before passing it in.
 * @return string The converted text.
 */
function xml_replacement($data)
{
    global $front;

    $data = preg_replace('~(Table|Exhibit|Box) (\d+)~', '<xref ref-type="table" rid="T\2">\1 \2</xref>', $data);

    // No marker on this line: nothing else to do, and no need to touch $front.
    if (strpos($data, '[[') === false)
        return $data;

    // Images live under /images/<first 3 chars of the id>/<last 3 chars of the id>.
    $image_dir = '/images/' . substr($front['article_id'], 0, 3) . '/' . substr($front['article_id'], -3);

    // <fig> replacement template. {base} is the image base name and {caption} the
    // caption back-reference; $1 is always the figure number.
    $fig_template = implode("\n", array(
        '',
        '<fig id="F$1">',
        '<label>Figure $1.</label>',
        '<caption>',
        '<p>{caption}</p>',
        '</caption>',
        '<alt-text>Figure $1</alt-text>',
        '<graphic xmlns:xlink="http://www.w3.org/1999/xlink" id="art-{base}.fig$1.gif" xlink:href="' . $image_dir . '/art-{base}.fig$1.gif" alt-version="yes"/>',
        '<graphic xmlns:xlink="http://www.w3.org/1999/xlink" id="thumb-{base}.fig$1.gif" xlink:href="' . $image_dir . '/thumb-{base}.fig$1.gif" alternate-form-of="art-{base}.fig$1.gif"/>',
        '</fig>',
    ));

    // Three-field figures first. Neither field may run past the closing "]]", and the
    // name may not contain the "|^|" separator, so one marker cannot swallow another
    // marker on the same line.
    $data = preg_replace(
        '~\[\[Figure\|\^\|(\d+)\|\^\|((?:(?!\|\^\||\]\]).)+)\|\^\|((?:(?!\]\]).)+)\]\]~',
        strtr($fig_template, array('{base}' => '$2', '{caption}' => '$3')),
        $data
    );

    if (isset($front['pub_abbr']))
    {
        $image_base = $front['pub_abbr'] . $front['article_id'];

        $data = preg_replace(
            '~\[\[InlineFigure\|\^\|(\d+)\|\^\|(.+?)\]\]~',
            "\n" . '<inline-graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="' . $image_dir . '/art-' . $image_base . '.equation$1.gif">' . "\n" . '<alt-text>Equation</alt-text>' . "\n" . '</inline-graphic>' . "\n",
            $data
        );

        $data = preg_replace(
            '~\[\[Figure\|\^\|(\d+)\|\^\|(.+?)\]\]~',
            strtr($fig_template, array('{base}' => $image_base, '{caption}' => '$2')),
            $data
        );
    }

    return $data;
}
/**
 * Builds the <front> element (journal and article metadata).
 *
 * Free-text values (column, surname, bio, disclosure, pages) are XML-escaped.
 * Publication, article id, volume, issue and the date parts are cast to integers,
 * with day and month zero-padded to two digits.
 *
 * @param array $front Front matter. Keys read here: has_images, publication, article_id,
 *                     column, surname, bio, disclosure, date (day, month, year), volume,
 *                     issue and pages. Missing keys give empty elements.
 * @return string XML starting with a newline. When has_images is truthy a figurePage
 *                processing instruction precedes <front>.
 */
function generate_front($front)
{
    $publication = !empty($front['publication']) ? (int) $front['publication'] : '';
    $article_id = isset($front['article_id']) ? (int) $front['article_id'] : '';
    // Escaped like bio and disclosure; a raw "&" or "<" here would break the document.
    $surname = isset($front['surname']) ? htmlspecialchars((string) $front['surname']) : '';
    $pages = isset($front['pages']) ? htmlspecialchars((string) $front['pages']) : '';
    $bio = isset($front['bio']) ? htmlspecialchars_uni($front['bio']) : '';
    $disclosure = isset($front['disclosure']) ? htmlspecialchars_uni($front['disclosure']) : '';
    $day = isset($front['date']['day']) ? sprintf("%02d", (int) $front['date']['day']) : '';
    $month = isset($front['date']['month']) ? sprintf("%02d", (int) $front['date']['month']) : '';
    $year = isset($front['date']['year']) ? (int) $front['date']['year'] : '';
    $volume = !empty($front['volume']) ? (int) $front['volume'] : '';
    $issue = !empty($front['issue']) ? (int) $front['issue'] : '';

    // One array entry per output line; everything is joined with newlines at the end.
    $out = array();
    // The processing instruction is split up so the source never contains a literal PHP close tag.
    if (!empty($front['has_images']))
        $out[] = '<' . '?figurePage print="true"?' . '>';

    array_push(
        $out,
        '<front>',
        '<journal-meta>',
        '<journal-id journal-id-type="publication">' . $publication . '</journal-id>',
        '<issn/>',
        '<publisher>',
        '<publisher-name/>',
        '</publisher>',
        '</journal-meta>',
        '<article-meta>',
        '<article-id>' . $article_id . '</article-id>',
        '<article-categories>',
        '<subj-group>',
        '<subject>journalArticle</subject>',
        '</subj-group>',
        '<series-title></series-title>'
    );

    if (isset($front['column']))
        $out[] = '<series-title series-type="column">' . htmlspecialchars((string) $front['column']) . '</series-title>';

    array_push(
        $out,
        '</article-categories>',
        '<title-group>',
        '<article-title></article-title>',
        '<subtitle></subtitle>',
        '</title-group>',
        '<contrib-group>',
        '<contrib author-id="0" contrib-type="author">',
        '<name>',
        '<surname>' . $surname . '</surname>',
        '</name>',
        '<bio>',
        '<p>' . $bio . '</p>',
        '</bio>',
        '<role>Additional Author</role>',
        '<author-comment>',
        '<p>' . $disclosure . '</p>',
        '</author-comment>',
        '</contrib>',
        '</contrib-group>',
        '<pub-date>',
        '<day>' . $day . '</day>',
        '<month>' . $month . '</month>',
        '<year>' . $year . '</year>',
        '</pub-date>',
        '<volume>' . $volume . '</volume>',
        '<issue>' . $issue . '</issue>',
        '<fpage>' . $pages . '</fpage>',
        '<copyright-year/>',
        '<abstract>',
        '<title></title>',
        '<p></p>',
        '</abstract>',
        '</article-meta>',
        '</front>'
    );

    return "\n" . implode("\n", $out);
}
/**
 * Renders the reference list as a <ref-list> element.
 *
 * The list type comes from the global $reference_ordered: "order" when it is
 * truthy, "bullet" otherwise.
 *
 * @param array $references Reference texts, inserted as given (no escaping is done here).
 * @return string The <ref-list> XML, or an empty string when there are no references.
 */
function generate_references($references)
{
    global $reference_ordered;

    if (empty($references))
        return '';

    if ($reference_ordered)
        $list_type = 'order';
    else
        $list_type = 'bullet';

    // Collect the items first, then wrap them.
    $items = '';
    foreach ($references as $entry)
    {
        $items .= '
<list-item><p>' . $entry . '</p></list-item>';
    }

    return '
<ref-list>
<title>References</title>
<list list-type="' . $list_type . '">' . $items . '
</list>
</ref-list>';
}
/**
 * Builds the <back> element: table placeholders, references, acknowledgement,
 * footnotes and sidebars.
 *
 * Table placeholders are derived from the global $body: every distinct number
 * mentioned as "Table N", "Exhibit N" or "Box N" gets one <table-wrap id="TN">,
 * labelled with the word used at its first mention. The ids therefore line up
 * with the rid values that xml_replacement() writes into the text.
 *
 * Reads the globals $front ('article_id', 'pub_abbr') and $body.
 *
 * @param array $back Back matter with the keys references, ack, financial, conflict,
 *                    abbr, reprint and sidebar.
 * @return string The <back> XML, starting with a newline.
 */
function generate_back($back)
{
    global $front, $body;

    $xml = '
<back>';
    $notes = '';

    // Table number => label word, keeping the first mention of each number.
    $table_labels = array();
    if (preg_match_all('~(Table|Exhibit|Box) (\d+)~', (string) $body, $mentions, PREG_SET_ORDER))
    {
        foreach ($mentions as $mention)
            if (!isset($table_labels[$mention[2]]))
                $table_labels[$mention[2]] = $mention[1];
    }
    // Emit the placeholders in numeric order, whatever order the text mentions them in.
    ksort($table_labels, SORT_NUMERIC);

    // Tables.
    if (!empty($table_labels))
    {
        $image_base = '/images/' . substr($front['article_id'], 0, 3) . '/' . substr($front['article_id'], -3) . '/art-' . (isset($front['pub_abbr']) ? $front['pub_abbr'] : '') . $front['article_id'];

        $xml .= '
<sec sec-type="table">
<title/>
<table-wrap-group>';
        foreach ($table_labels as $number => $label)
            $xml .= '
<table-wrap id="T' . $number . '">
<label>' . $label . ' ' . $number . '.</label>
<caption>
<p></p>
</caption>
<graphic xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="' . $image_base . '.tab' . $number . '.gif" alt-version="no"/>
</table-wrap>';
        $xml .= '
</table-wrap-group>
</sec>';
    }

    // References
    if (!empty($back['references']))
        $xml .= generate_references($back['references']);

    // Ack
    if (!empty($back['ack']))
        $xml .= '
<ack>
<title>Acknowledgements</title>
<p>' . htmlspecialchars($back['ack']) . '</p>
</ack>';

    // Footnotes: fn-type => key in $back. Empty entries are left out.
    $footnotes = array(
        'supported-by' => 'financial',
        'conflict' => 'conflict',
        'abbr' => 'abbr',
        'present-address' => 'reprint',
    );
    foreach ($footnotes as $fn_type => $field)
    {
        if (empty($back[$field]))
            continue;
        $notes .= '
<fn fn-type="' . $fn_type . '">
<p>' . htmlspecialchars($back[$field]) . '</p>
</fn>';
    }
    if (!empty($notes))
        $xml .= '
<fn-group>' . $notes . '
</fn-group>';

    // Sidebars: one <app> per sub section of every sidebar section.
    if (!empty($back['sidebar']))
    {
        $xml .= '
<app-group>';
        foreach ($back['sidebar'] as $sidebar)
            foreach ($sidebar['sub_section'] as $sub_section2)
                $xml .= '
<app>
<title/>
<sec sec-type="sidebar">
<title>Sidebar: ' . htmlspecialchars((string) $sidebar['title']) . '</title>
' . parse_content($sub_section2['data']) . '
</sec>
</app>';
        $xml .= '
</app-group>';
    }

    $xml .= '
</back>';
    return $xml;
}
/**
 * Converts the collected lines of one sub section into XML.
 *
 * Handles list open/close tags, <li> items, stand-alone <p> / </p> lines and
 * single-line <p>...</p> paragraphs. Any other line is escaped and passed
 * through xml_replacement() unchanged.
 *
 * @param array $data_array Trimmed source lines, one string each.
 * @return string XML fragment; every converted line is preceded by a newline.
 */
function parse_content($data_array)
{
    // Opening list tags and the list-type each one maps to.
    $list_types = array(
        '<ul>' => 'bullet',
        '<ol>' => 'order',
        '<ol type="a">' => 'alpha-lower',
        '<ol type="A">' => 'alpha-upper',
    );

    $ret = '';
    // True between a stand-alone <p> line and its matching </p> line. It has to
    // start out defined: it is tested for every plain line, which may well come
    // before any <p> was seen.
    $open_p = false;

    foreach ($data_array as $data)
    {
        // Opening a list?
        if (isset($list_types[$data]))
            $ret .= "\n" . '<list list-type="' . $list_types[$data] . '">';
        // List items? Drop the leading <li> and the trailing 5 characters (the closing </li>).
        elseif (substr($data, 0, 4) == '<li>')
            $ret .= "\n" . '<list-item><p>' . xml_replacement(htmlspecialchars_uni(substr($data, 4, -5))) . '</p></list-item>';
        // Closing them?
        elseif ($data == '</ul>' || $data == '</ol>')
            $ret .= "\n" . '</list>';
        elseif ($data == '<p>')
        {
            $ret .= "\n" . '<p>';
            $open_p = true;
        }
        elseif ($data == '</p>')
        {
            $ret .= "\n" . '</p>';
            $open_p = false;
        }
        // Inside an open paragraph the whole line is text, even if it looks like <p>...</p>.
        elseif ($open_p)
            $ret .= "\n" . xml_replacement(htmlspecialchars_uni($data));
        // A paragraph on a single line: keep the tags and escape only what is inside.
        elseif (preg_match('~^<p>(.+?)</p>$~', $data, $matches))
            $ret .= "\n" . '<p>' . xml_replacement(htmlspecialchars_uni($matches[1])) . '</p>';
        else
            $ret .= "\n" . xml_replacement(htmlspecialchars_uni($data));
    }

    return $ret;
}
/**
 * Escapes a string for use in the generated XML.
 *
 * @param string $string Raw text.
 * @return string The text run through htmlspecialchars() with its default flags.
 */
function htmlspecialchars_uni($string)
{
    // NOTE(review): every search and replacement string below is empty as this file
    // reads now, which makes the call a no-op. Presumably it once mapped special
    // characters (smart quotes or similar) that were lost to an encoding problem -
    // confirm against an earlier copy of the file before relying on it.
    $string = str_replace(array('', '', '', ''), array('', '', '', ''), $string);
    // A second return (with ENT_NOQUOTES) used to follow this one and could never run; it has been removed.
    return htmlspecialchars($string);
}
?>