1:<?php
   2:
   3://
   4:// parseXMLtoArray( ) -
   5://
   6://  This function takes a string containing XML and parses it into an array representation. Each array element
   7:// (Node) contains the following elements:
   8://
   9://   a) One element named "Children" that is an array of the names of all child nodes of this node.
  10://   b) One element named "Parameters" that is an array of the parameter names for this node.
  11://   c) One element named "Contents" that contains the plain text contents of the node.
//   d) One element named "Nodename" that contains the name (type) of the current node.
  13://   e) The node also contains the parameters, and child nodes as elements, referred to by their names.
  14://      the parameters are simple strings, and the nodes are arrays containing all of the above elements itself.
  15://
//  This makes it very easy to drill down into a simple XML document using just arrays, but also allows
// greater flexibility for complicated XML documents.
  18://
// Limitations: 1) A node cannot have a child node with the same name (type) as one of its parameters.
//              2) A parameter or child node cannot be named (typed) "Contents", "Children", "Nodename", or "Parameters".
  21://
  22:// To Do: 1) Improperly closed nodes: Backtrack to the last open node that has the same name, and roll-up. If
  23://           not found, ignore the close tag. Use the openNodeStack array to check this.
  24://        2) Add the rest of the error checking noted below in the comments.
  25://        3) Add support to XML escape codes.
  26://        4) Verify character set handling.
  27://
  28://
  29:
  30:function parseXMLtoArray( $inputString )
  31:{
  32:    $lastEndPos = 0;          // This is where we left off on the last iteration...
  33:    $outputStack = array();    // This is where we will be assembling out output nodes.
  34:
  35:    //
  36:    // Start by having a "dummy" parent node...
  37:    //
  38:    $newNode = array();
  39:    $newNode['Children'] = array();
  40:
  41:    array_push( $outputStack, $newNode );   
  42:
  43:    //
  44:    // First, let's trim out any whitespace...
  45:    //
  46:
  47:    $inputString = trim($inputString);
  48:
  49:    //
  50:    // We will iterate across the entire input string searching for tags.
  51:    //
  52:
  53:    while( $lastEndPos < strlen($inputString) )
  54:    {
  55:        //   
  56:        // Start by looking for the next open bracket
  57:        //
  58:
  59:        $startPos = strpos( $inputString, "<", $lastEndPos );
  60:
  61:        //
  62:        // Found a node tag.
  63:        //
  64:
  65:        if ( FALSE !== $startPos )
  66:        {
  67:            //
  68:            // Find the end of the element declaration.
  69:            //
  70:
  71:            $closePos = strpos( $inputString, ">", $startPos);
  72:
  73:            //
  74:            // Check for an error here... If there are *bad* characters between the start
  75:            // and close positions, quit, and return FALSE. This would include any open
  76:            // brackets, an imbalance of quotes, etc.
  77:            //
  78:
  79:
  80:            //
  81:            // Is this a close tag?
  82:            //
  83:
  84:            $isCloseTag = ( substr($inputString, $startPos + 1, 1) == "/" )?true:false;
  85:
  86:            //
  87:            // Is it a self-closing open tag?
  88:            //
  89:            $isSelfClosingTag = ( substr($inputString, $closePos - 1, 1) == "/" )?true:false;
  90:
  91:            //
  92:            // IF this tag begins and ends with a ? treat it as a self-closing tag...
  93:            // we also want to ignore the opening ? in the tag...
  94:            //    
  95:            if( substr($inputString, $startPos + 1, 1) == "?" )
  96:            {
  97:                $isSelfClosingTag = ( substr($inputString, $closePos - 1, 1) == "?" )?true:false;
  98:                $startPos++;
  99:            }
 100:
 101:            //
 102:            // Now, get the tag name (from char[0] to the 1st whitespace)
 103:            //
 104:            $tagEndPos = 0;
 105:            for($tagEndPos = $startPos + 1; $tagEndPos < $closePos ; $tagEndPos++ )
 106:            {
 107:                $tmp = substr($inputString,$tagEndPos,1);
 108:                if( $tmp == " " || $tmp == "\t" || $tmp == "\n" || $tmp == "\x0B" )
 109:                    break;
 110:            }
 111:
 112:            $tagName = substr( $inputString, $startPos + ($isCloseTag?2:1), $tagEndPos - $startPos - ($isCloseTag?2:1) );
 113:            
 114:            //
 115:            // Check to see if we are closing an open element... or opening a new element.
 116:            //
 117:            if( $isCloseTag )
 118:            {
 119:                //
 120:                // A Close tag will not have any parameters, so let's make sure we don't
 121:                //
 122:
 123:
 124:                //
 125:                // This is a close tag. we should make sure it's the last one we opened.
 126:                //
 127:                $CurrNode = array_pop( $outputStack );
 128:
 129:                if( $CurrNode && $CurrNode['Nodename'] == $tagName )
 130:                {
 131:                    //
 132:                    // Else, we take everything from the start of the input string, to the
 133:                    // beginning of this close tag, and append it to the parent node as 
 134:                    // "Contents", as long as there is a string value...
 135:                    //    
 136:
 137:                    // Check to see if there was any text between the $lastEndtPos (the char after
 138:                    // the last close bracket)
 139:
 140:                    $checkStr = "";
 141:                    if( $lastEndPos < $startPos )
 142:                        $checkStr = trim(substr($inputString, $lastEndPos + 1, $startPos - $lastEndPos - 1));
 143:                    
 144:                    if( strlen( $checkStr ) > 0 )
 145:                    {
 146:                        $CurrNode['Contents'] = $checkStr;
 147:
 148:                        str_replace("&amp;","&",$CurrNode['Contents']);
 149:                        str_replace("&lt;","<",$CurrNode['Contents']);
 150:                        str_replace("&gt;",">",$CurrNode['Contents']);
 151:                        str_replace("&apos;","'",$CurrNode['Contents']);
 152:                        str_replace("&quot;","\"",$CurrNode['Contents']);
 153:                    }
 154:
 155:                    //
 156:                    // Now, add the current node to the parent node as a child...
 157:                    //
 158:
 159:                    $parentNode = array_pop( $outputStack );                    
 160:
 161:                    if( $parentNode )
 162:                    {
 163:                        if( !isSet( $parentNode[$tagName] ) )
 164:                        {
 165:                            $parentNode[$tagName] = $CurrNode;
 166:
 167:                            array_push( $parentNode['Children'], $tagName );
 168:                        }
 169:                        else
 170:                        {
 171:                            //
 172:                            // Determine how many children of the current parent share the same type.
 173:                            //
 174:
 175:                            $countOfChildren = 1;
 176:                            for( $countOfChildren = 1; 
 177:                                 isSet( $parentNode[$tagName."_".$countOfChildren] ); 
 178:                                 $countOfChildren++ )
 179:                            ;
 180:                               
 181:                            $parentNode[$tagName][$tagName."_".$countOfChildren] = $CurrNode;
 182:
 183:                            array_push( $parentNode['Children'], $tagName."_".$countOfChildren );
 184:                        }
 185:                        
 186:                        array_push( $outputStack, $parentNode );
 187:                    }
 188:                    else
 189:                    {
 190:                        array_push( $outputStack, $CurrNode );    
 191:                    }              
 192:
 193:                }
 194:                else
 195:                {
 196:
 197:                    //
 198:                    // If we are not closing the last opened element, flag an error.
 199:                    //
 200:
 201:
 202:                    //
 203:                    // And for safety, let's put the current node back on the stack.
 204:                    //
 205:
 206:                    if( $CurrNode )
 207:                    {
 208:                        array_push( $outputStack, $CurrNode );   
 209:                    }
 210:                }            
 211:            }
 212:            else
 213:            {
 214:                //
 215:                // This is an open tag element... so there should be no text between the $lastEndtPos
 216:                // of the input string, and this start tag (except for whitespace).
 217:                // If there is, we have an error...
 218:                //
 219:                
 220:
 221:                //
 222:                // If we're OK, lets create the new node.
 223:                //
 224:
 225:                $newNode = array();
 226:                $newNode['Nodename'] = $tagName;
 227:                $newNode['Children'] = array();
 228:                $newNode['Parameters'] = array();
 229:                $newNode['Contents'] = "";
 230:
 231:                //
 232:                // Now, we can use the $inputText from $tagEndPos + 1 to $closePos to parse the parameters...
 233:                //
 234:                
 235:                $parameters = array();
 236:                $indx = $tagEndPos + 1;
 237:                $lastws = $indx;
 238:                $lasteq = $indx; 
 239:                $attrName = "";
 240:                $arrtf = false;
 241:                $eqf = false;
 242:                $escaped = false;
 243:                $doubleQuote = false;
 244:                $singleQuote = false;
 245:                
 246:                for( ; $indx < $closePos - ( $isSelfClosingTag ? 1 : 0); $indx++ )
 247:                {
 248:                    $tmp = substr($inputString,$indx,1);
 249:
 250:                    if( $tmp == "\\" && $escaped == false )
 251:                    {
 252:                        $escaped = true;
 253:                    }
 254:                    elseif( $tmp == '"' && $escaped == false && $singleQuote == false )
 255:                    {
 256:                        $doubleQuote = $doubleQuote?false:true;
 257:
 258:                        //
 259:                        // If $atrf is true, and we get in here (unless we're closing), this should be an error...
 260:                        //
 261:
 262:                        if( $attrf && false == $doubleQuote)
 263:                        {
 264:                            //
 265:                            // Ok, this is the closing quote for the attribute, and after
 266:                            // the start of the attribute text... Everything from the equals to here
 267:                            // is the attribute text...
 268:                            //
 269:
 270:                            $attrf = false;
 271:                            $eqf = false;
 272:                
 273:                            $parameters[$attrName] = substr( $inputString, $lasteq + 1, $indx - $lasteq - 1);
 274:
 275:                            $attrName = "";
 276:                        }
 277:                        elseif( $attrf )
 278:                        {
 279:                            //
 280:                            // ERROR
 281:                            //
 282:                        }
 283:                        elseif( $eqf && true == $doubleQuote)
 284:                        {
 285:                            $attrf = true;
 286:                            $lasteq = $indx;
 287:                        }
 288:
 289:                    }
 290:                    elseif( $tmp == "'" && $escaped == false && $doubleQuote == false )
 291:                    {
 292:                        $singleQuote = $singleQuote?false:true;
 293:
 294:                        //
 295:                        // If $atrf is true, and we get in here (unless we're closing), this should be an error...
 296:                        //
 297:
 298:                        if( $attrf && false == $singleQuote)
 299:                        {
 300:                            //
 301:                            // Ok, this is the closing quote for the attribute, and after
 302:                            // the start of the attribute text... Everything from the equals to here
 303:                            // is the attribute text...
 304:                            //
 305:
 306:                            $attrf = false;
 307:                            $eqf = false;
 308:                
 309:                            $parameters[$attrName] = substr( $inputString, $lasteq + 1, $indx - $lasteq - 1);
 310:
 311:                            $attrName = "";
 312:                        }
 313:                        elseif( $attrf )
 314:                        {
 315:                            //
 316:                            // ERROR
 317:                            //
 318:                        }
 319:                        elseif( $eqf && true == $singleQuote)
 320:                        {
 321:                            $attrf = true;
 322:                            $lasteq = $indx;
 323:                        }
 324:                    }
 325:                    elseif( $tmp == "=" && false == $doubleQuote && false == $singleQuote)
 326:                    { 
 327:                        //
 328:                        // We have the equals... Everything since the lastws to here should be the attribute name.
 329:                        //
 330:                        
 331:                        $attrName = trim(substr($inputString, $lastws, $indx - $lastws));
 332:                        $lasteq = $indx;
 333:                        $escaped = false;
 334:                        $eqf = true;
 335:                        $attrf = false;
 336:                    }
 337:                    elseif( ( $tmp == " " || $tmp == "\t" || $tmp == "\n" || $tmp == "\x0B" ) &&
 338:                            false == $doubleQuote && false == $singleQuote )
 339:                    {
 340:                        $lastws = $indx; 
 341:                        $escaped = false;
 342:
 343:                        if( $attrf )
 344:                        {
 345:                            //
 346:                            // Ok, this is the 1st un-quoted whitespace after the equals, and after
 347:                            // the start of the attribute text... Everything from the equals to here
 348:                            // is the attribute text...
 349:                            //
 350:
 351:                            $attrf = false;
 352:                            $eqf = false;
 353:                
 354:                            $parameters[$attrName] = substr( $searchText, $lasteq + 1, $indx - $lasteq - 1);
 355:
 356:                            $attrName = "";
 357:                        }
 358:                    }
 359:                    else
 360:                    {
 361:                        if( $eqf )
 362:                        {
 363:                            $attrf = true;
 364:                        }
 365:
 366:                        $escaped = false;
 367:                    }
 368:                }
 369:
 370:                //
 371:                // Parameters should be name=value, where value may or may not have single or double quotes.
 372:                //
 373:
 374:                foreach( $parameters as $parmName => $parmValue )
 375:                {
 376:                    $newNode[$parmName] = $parmValue;
 377:
 378:                    str_replace("&amp;","&",$newNode[$parmName]);
 379:                    str_replace("&lt;","<",$newNode[$parmName]);
 380:                    str_replace("&gt;",">",$newNode[$parmName]);
 381:                    str_replace("&apos;","'",$newNode[$parmName]);
 382:                    str_replace("&quot;","\"",$newNode[$parmName]);
 383:
 384:                    array_push( $newNode['Parameters'], $parmName );
 385:                }
 386:
 387:                //
 388:                // If the close bracket for the node was a /> then we won't have a close tag.
 389:                //
 390:
 391:                if( $isSelfClosingTag )
 392:                {
 393:                    //
 394:                    // If we have a self-closing tag, just add the node to the parent, and move on.
 395:                    //                    
 396:
 397:                    $parentNode = array_pop( $outputStack );                    
 398:
 399:                    if( $parentNode )
 400:                    {
 401:                        if( !isSet( $parentNode[$tagName] ) )
 402:                        {
 403:                            $parentNode[$tagName] = $newNode;
 404:
 405:                            array_push( $parentNode['Children'], $tagName );
 406:                        }
 407:                        else
 408:                        {
 409:                            //
 410:                            // Determine how many children of the current parent share the same type.
 411:                            //
 412:
 413:                            $countOfChildren = 1;
 414:                            for( $countOfChildren = 1; 
 415:                                 isSet( $parentNode[$tagName."_".$countOfChildren] ); 
 416:                                 $countOfChildren++ )
 417:                            ;
 418:                               
 419:                            $parentNode[$tagName][$tagName."_".$countOfChildren] = $newNode;
 420:
 421:                            array_push( $parentNode['Children'], $tagName."_".$countOfChildren );
 422:                        }
 423:
 424:                        array_push( $outputStack, $parentNode );
 425:                    }
 426:                    else
 427:                    {
 428:                        array_push( $outputStack, $newNode );    
 429:                    }
 430:                }
 431:                else
 432:                {
 433:                    //
 434:                    // Else, if there is more following (with a separate close tag)
 435:                    // we take the current newNode and push it onto the outputStack as the new end.
 436:                    // 
 437:
 438:                    array_push( $outputStack, $newNode );
 439:                }
 440:            }
 441:        }
 442:        else
 443:        {
 444:            //
 445:            // If no open bracket is found (i.e. there is just text left in the string and no additional 
 446:            // nodes), we should set $closePos to the end of the input string, and flag an error.
 447:            //
 448: 
 449:            $closePos = strlen($inputString); 
 450:        } 
 451:
 452:        //
 453:        // Prepare for the next iteration...
 454:        //
 455:
 456:        $lastEndPos = $closePos;
 457:    }
 458:
 459:    //
 460:    // Now, we should double-check and return the outputStack array. 
 461:    //
 462:
 463:    return $outputStack;
 464:}
 465:
 466:?>