Parsing XML files in PHP is complex. But, like most things, it's not that hard once you get the hang of it. I don't know about you, but I learn best by seeing actual code. So, here's a real world example.
If you're interested in some more in-depth information on this subject, take a look at a presentation I gave entitled Parsing, Validating and Saving Data from Complex XML Streams.
#! /usr/local/bin/php -q
<?php
/**
* A basic example of how to parse XML files.
*
* <p>Relies on PHP's {@link http://php.net/ref.xml SAX parser}.</p>
*
* <p>This example grabs stock quotes from NASDAQ's website. You tell
* the program which stocks to check by adding/removing elements of the
* <var>$Symbols</var> array. The script outputs a series
* of fake SQL query strings.</p>
*
* <p>Output is placed into an associative array called <var>$Data.</var></p>
*
* <p>Element names become the array's keys. So:
* <br /><code><TAG>The info</TAG></code>
* <br />Would become:
* <br /><code>$Data['TAG'] = 'The info';</code>
* </p>
*
* <p>Where elements have attributes, the array's key is the name of
* the element followed by ":" and the name of the attribute:
* <br /><code><TAG ID="55">The info</TAG></code>
* <br />becomes...
* <br /><code>$Data['TAG'] = 'The info';
* <br />$Data['TAG:ID'] = 55;</code>
* </p>
*
* <p>Do note, "case folding" is in effect, so lower case tag and
* attribute names are converted to upper case.</p>
*
* <p>Requires PHP version 4.3.0 or later.</p>
*
* @author Daniel Convissor <danielc@AnalysisAndSolutions.com>
* @copyright The Analysis and Solutions Company, 2002-2004
* @version 2004-02-11 11:00:00
* @link http://www.AnalysisAndSolutions.com/code/phpxml.htm
* @link http://php.net/ref.xml
*/
// Which ticker symbols do you want to evaluate?
$Symbols = array('ET', 'EK');

// Base URI of the quote service; the ticker symbol is appended to it.
$URI = 'http://quotes.nasdaq.com/quote.dll?page=xml&mode=stock&symbol=';

// Start with empty problem lists so no stale data is carried under these names.
$ParserProbs = array();
$DataProbs = array();

/*
 * Map used with strtr() to convert the five predefined XML entities
 * back to plain text. Keys are the entity references, values are the
 * literal characters they stand for.
 */
$XmlEntities = array(
    '&amp;'  => '&',
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&apos;' => '\'',
    '&quot;' => '"',
);
/**
 * Runs each time an XML element starts.
 *
 * Clears the character-data buffer and copies every attribute of the
 * element into the global $Data array under the key "ELEMENT:ATTRIBUTE",
 * trimmed and with XML entities converted back to plain text.
 *
 * Parameters are taken by value: the handler never modifies them, and
 * by-reference parameters make PHP 8 emit a "must be passed by reference"
 * warning each time the parser invokes the callback.
 *
 * @param mixed  $Parser  the XML parser calling the handler (unused)
 * @param string $Elem    name of the element being opened
 * @param array  $Attr    attribute name => attribute value
 * @return void
 */
function StartHandler($Parser, $Elem, $Attr) {
    global $Data, $CData, $XmlEntities;

    // Start with empty CData array.
    $CData = array();

    // Put each attribute into the Data array.
    foreach ($Attr as $Key => $Value) {
        $Data["$Elem:$Key"] = strtr(trim($Value), $XmlEntities);
        // debug // echo "$Elem:$Key = {$Data["$Elem:$Key"]}\n";
    }
}
/**
 * Runs each time XML character data is encountered.
 *
 * Appends the chunk to the global $CData buffer. The chunks are joined
 * later because an element can contain more than one line of data.
 *
 * Parameters are taken by value: the handler never modifies them, and
 * by-reference parameters make PHP 8 emit a "must be passed by reference"
 * warning each time the parser invokes the callback.
 *
 * @param mixed  $Parser  the XML parser calling the handler (unused)
 * @param string $Line    the chunk of character data
 * @return void
 */
function CharacterHandler($Parser, $Line) {
    global $CData;

    $CData[] = $Line;
}
/**
 * Runs each time an XML element ends.
 *
 * Joins the buffered character data, stores it in the global $Data array
 * under the element name, then validates the elements the sample query
 * needs. When the closing EQUITY-QUOTE tag arrives, either the collected
 * problems or the sample UPDATE statement are echoed.
 *
 * Parameters are taken by value: the handler never modifies them, and
 * by-reference parameters make PHP 8 emit a "must be passed by reference"
 * warning each time the parser invokes the callback.
 *
 * @param mixed  $Parser  the XML parser calling the handler (unused)
 * @param string $Elem    name of the element being closed
 * @return void
 */
function EndHandler($Parser, $Elem) {
    global $Data, $CData, $DataProbs, $Sym, $XmlEntities;

    /*
     * Mush all of the CData lines into a string
     * and put it into the $Data array.
     */
    $Data[$Elem] = strtr(trim(implode('', $CData)), $XmlEntities);

    /*
     * The buffer has been consumed. Empty it so this element's text is
     * not prepended to the value of the enclosing element when that
     * element closes.
     */
    $CData = array();

    // debug // echo "$Elem = {$Data[$Elem]}\n";

    switch ($Elem) {
        case 'LAST-SALE-PRICE':
            // Make sure the data is clean.
            if (!preg_match('/^\d{1,8}(\.\d{1,2})?$/', $Data[$Elem])) {
                $DataProbs[] = "$Elem has bad format: {$Data[$Elem]}";
            }
            break;

        case 'TRADE-DATETIME':
            /*
             * Ensure data is clean, plus save match parts to $Atom,
             * which is used to convert date/time to MySQL format.
             * The conversion only happens on a successful match; on
             * failure $Atom is empty and the raw value is left alone.
             */
            if (preg_match('/^(\d{4})(\d{2})(\d{2}) (\d{2}:\d{2}:\d{2})$/',
                    $Data[$Elem], $Atom)) {
                $Data[$Elem] = "$Atom[1]-$Atom[2]-$Atom[3] $Atom[4]";
            } else {
                $DataProbs[] = "$Elem has bad format: {$Data[$Elem]}";
            }
            break;

        case 'EQUITY-QUOTE':
            // Final item tag. Do something with the data.

            // The CUSIP attribute may be absent entirely, so test before reading.
            if (!isset($Data['EQUITY-QUOTE:CUSIP'])) {
                $DataProbs[] = "$Sym EQUITY-QUOTE:CUSIP wasn't set";
            } elseif (!preg_match('/^\w{1,9}$/', $Data['EQUITY-QUOTE:CUSIP'])) {
                $DataProbs[] = "$Elem has bad format: "
                        . $Data['EQUITY-QUOTE:CUSIP'];
            }

            /*
             * Double check that all of the needed data was set.
             * If it's not, we don't want to run the query.
             */
            if (!isset($Data['LAST-SALE-PRICE'])) {
                $DataProbs[] = "$Sym LAST-SALE-PRICE wasn't set";
            }
            if (!isset($Data['TRADE-DATETIME'])) {
                $DataProbs[] = "$Sym TRADE-DATETIME wasn't set";
            }

            if (count($DataProbs)) {
                echo "\nData for $Sym had problems:\n";
                echo implode("\n", $DataProbs) . "\n\n";
                // Reported; start the next quote with a clean list.
                $DataProbs = array();
            } else {
                /*
                 * Construct a sample query string. Every interpolated
                 * value passed the format checks above. Code that really
                 * executes the statement should use a prepared statement
                 * with bound parameters instead.
                 */
                $Query = 'UPDATE Quotes SET '
                        . "TradePrice={$Data['LAST-SALE-PRICE']}, "
                        . "TradeTime='{$Data['TRADE-DATETIME']}' "
                        . "WHERE CUSIP='{$Data['EQUITY-QUOTE:CUSIP']}'";

                /*
                 * In the real world, you could run the query now. But,
                 * for the sake of this exercise, let's just look at it.
                 */
                echo "$Query\n";
            }
            break;
    }
}
// Loop through each ticker symbol.
foreach ($Symbols as $Sym) {
    /*
     * Fetch the quote document for this symbol as one string, then
     * check that something actually came back.
     *
     * If your data is already in string form, you don't need this step.
     *
     * This one step requires PHP to be at version 4.3.0 or later.
     * Warnings are suppressed because a failed fetch is recorded in
     * $ParserProbs and reported once the loop has finished.
     */
    $Contents = @file_get_contents("$URI$Sym");
    if (!$Contents) {
        $ParserProbs[] = "$URI$Sym\n Had problem opening file.";
        // Move on to the next item in the $Symbols array.
        continue;
    }

    // debug // echo "\n\n$URI$Sym\n";
    // debug // echo "$Contents";

    /*
     * Take care of characters that choke the parser.
     * While I don't think NASDAQ's data poses these problems,
     * it's good to keep them in mind.
     */

    // Escape ampersands that aren't part of entities.
    $Contents = preg_replace('/&(?!\w{2,6};)/', '&amp;', $Contents);

    // Replace every byte other than printable ASCII, TAB, LF and CR with LF.
    $Contents = preg_replace('/[^\x20-\x7E\x09\x0A\x0D]/', "\n", $Contents);

    /*
     * Clean out the Data array so it can be reused
     * to hold the data we parse from the file.
     */
    $Data = array();

    // Initialize the parser.
    $Parser = xml_parser_create('ISO-8859-1');
    xml_set_element_handler($Parser, 'StartHandler', 'EndHandler');
    xml_set_character_data_handler($Parser, 'CharacterHandler');

    // Pass the content string to the parser as one complete document.
    if (!xml_parse($Parser, $Contents, TRUE)) {
        $ParserProbs[] = "Had problem parsing data for $Sym:\n "
                . xml_error_string(xml_get_error_code($Parser));
    }
}

// Report any fetch or parse problems collected along the way.
if (count($ParserProbs)) {
    echo "\n" . implode("\n", $ParserProbs);
}
?>