ps/Modules/Cole.PowerShell.Developer/Public/Read-Xml.ps1

function Read-Xml {
    <#
    .SYNOPSIS
    Reads an XML file with a small hand-rolled parser and returns its elements as nested hashtables.

    .DESCRIPTION
    The file is read in one piece and scanned character by character. For every element found,
    the tag is split into a name and an attribute hashtable, and the element body is either
    parsed recursively (when it contains markup) or captured as @{ Text = <body> }.
    The attribute table, the child nodes and the text are combined with whatever was already
    stored for that tag name by calling Merge-Objects (defined elsewhere in the module), and the
    result is stored in a hashtable keyed by tag name.

    Processing instructions (<? ... ?>), comments (<!-- ... -->), CDATA sections
    (<![CDATA[ ... ]]>) and other <! ... > declarations are skipped; their content is not returned.

    .PARAMETER Path
    Path of the XML file to read. The function throws when the path is empty or when Test-Path
    cannot resolve it.

    .OUTPUTS
    Hashtable keyed by tag name.

    .NOTES
    Known limitations of this parser:
      * a '>' inside an attribute value ends the tag early;
      * closing tags must be written exactly as </name> (no whitespace before '>');
      * a DOCTYPE with an internal subset (containing '>') is not skipped correctly.
    #>
    param (
        [Parameter(Mandatory = $false)]
        [ValidateNotNullOrEmpty()]
        [string]$Path
    )

    $logLead = (Get-LogLeadName)

    # Path is not mandatory, so an omitted value arrives here as an empty string.
    # Fail with a clear message instead of letting Test-Path raise a binding error.
    if ([string]::IsNullOrWhiteSpace($Path)) {
        throw "$logLead : No path was provided"
    }

    if (!(Test-Path -Path $Path)) {
        throw "$logLead : Not able to resolve [$Path]"
    }

    # Splits the text between '<' and '>' (without a trailing '/') into a tag name and an
    # attribute hashtable. Returns two values: the name, and the hashtable ($null when the
    # tag content holds nothing but the name).
    function parseTag {
        param (
            [string]$tagContent
        )

        # content should look like
        #   tagName attribute-name="attribute-value"
        # or
        #   tagName attribute-name='attribute-value'
        # Either quote style is accepted; a value ends at the next matching quote.

        # Control characters used as temporary stand-ins so that quoted quotes/spaces do not
        # interfere with the split on ' ' below. They are swapped back when the value is read.
        $charEscapedSingleQuote = [char]17
        $charEscapedDoubleQuote = [char]18
        $charEscapedStringSpace = [char]19
        $charSpace = [char]32

        $tagContent = $tagContent.Replace('\"', [string]$charEscapedDoubleQuote).Replace("\'", [string]$charEscapedSingleQuote)

        # Work on a char array (not UTF-8 bytes) so that index $i always refers to the same
        # character, even when the tag contains non-ASCII text.
        $tagChars = $tagContent.ToCharArray()
        $inSingleString = $false
        $inDoubleString = $false
        for ($i = 0; $i -lt $tagChars.Length; $i++) {
            $char = $tagChars[$i]
            if ($inDoubleString) {
                if ($char -eq '"') {
                    # we found the end of the string
                    $inDoubleString = $false
                } elseif ($char -eq ' ') {
                    $tagChars[$i] = $charEscapedStringSpace
                } elseif ($char -eq "'") {
                    $tagChars[$i] = $charEscapedSingleQuote
                }
            } elseif ($inSingleString) {
                if ($char -eq "'") {
                    # we found the end of the string
                    $inSingleString = $false
                } elseif ($char -eq ' ') {
                    $tagChars[$i] = $charEscapedStringSpace
                } elseif ($char -eq '"') {
                    $tagChars[$i] = $charEscapedDoubleQuote
                }
            } elseif ($char -eq '"') {
                $inDoubleString = $true
            } elseif ($char -eq "'") {
                $inSingleString = $true
            } elseif ([char]::IsWhiteSpace($char)) {
                # Tabs / line breaks between attributes separate tokens just like a space does.
                $tagChars[$i] = $charSpace
            }
        }

        $tagContent = -join $tagChars

        # Now handle the case of <tagName attribute = "escaped-string-content">
        # It should be <tagName attribute="escaped-string-content">
        while (($tagContent.IndexOf(' =') -gt -1) -or ($tagContent.IndexOf('= ') -gt -1)) {
            $tagContent = $tagContent.Replace(' =', '=').Replace('= ', '=')
        }

        # Quoted spaces are masked at this point, so every remaining space separates tokens.
        $splits = @($tagContent -split ' ')
        $tagName = $splits[0]
        $attributes = $null
        if ($splits.Count -gt 1) {
            $attributes = @{}
        }

        for ($i = 1; $i -lt $splits.Count; $i++) {
            $attributeRawValue = $splits[$i]
            if ([string]::IsNullOrWhiteSpace($attributeRawValue)) {
                # consecutive separators produce empty tokens; nothing to parse
                continue
            }

            # An attribute without '=' stands alone and is stored with itself as the value.
            $name = $attributeRawValue
            $value = $attributeRawValue
            $attributeEqualsIndex = $attributeRawValue.IndexOf('=')
            if ($attributeEqualsIndex -ne -1) {
                $name = $attributeRawValue.Substring(0, $attributeEqualsIndex)
                # Drop the delimiting quotes, then restore the masked characters.
                $value = $attributeRawValue.Substring($attributeEqualsIndex + 1).
                    Replace('"', '').
                    Replace("'", '').
                    Replace([string]$charEscapedStringSpace, ' ').
                    Replace([string]$charEscapedDoubleQuote, '"').
                    Replace([string]$charEscapedSingleQuote, "'")
            }
            $attributes[$name] = $value
        }

        return $tagName, $attributes
    }

    # Returns the index of the </tagName> that closes an element whose opening tag ended just
    # before $startIndex, counting nested elements of the same name. Returns -1 when no
    # matching closing tag exists.
    function findClosingTagIndex {
        param (
            [string]$text,
            [string]$tagName,
            [int]$startIndex
        )

        $openToken = "<$tagName"
        $closeToken = "</$tagName>"
        $depth = 1
        $cursor = $startIndex

        while ($true) {
            $nextClose = $text.IndexOf($closeToken, $cursor, [System.StringComparison]::Ordinal)
            if ($nextClose -eq -1) {
                return -1
            }

            # Every same-name opening tag seen before this closing tag adds one nesting level.
            $nextOpen = $text.IndexOf($openToken, $cursor, [System.StringComparison]::Ordinal)
            while (($nextOpen -ne -1) -and ($nextOpen -lt $nextClose)) {
                $afterName = $nextOpen + $openToken.Length
                $delimiter = $text[$afterName]
                # Require a real name boundary so that "<a" does not match "<abbr".
                if (($delimiter -eq '>') -or ($delimiter -eq '/') -or [char]::IsWhiteSpace($delimiter)) {
                    $tagEnd = $text.IndexOf('>', $afterName)
                    $selfClosing = ($tagEnd -gt 0) -and ($text[$tagEnd - 1] -eq '/')
                    if (-not $selfClosing) {
                        $depth++
                    }
                }
                $nextOpen = $text.IndexOf($openToken, $afterName, [System.StringComparison]::Ordinal)
            }

            $depth--
            if ($depth -eq 0) {
                return $nextClose
            }
            $cursor = $nextClose + $closeToken.Length
        }
    }

    # Scans $content for elements and returns a hashtable keyed by tag name.
    # Calls itself for element bodies that contain markup.
    function parseNodes {
        param (
            [string]$content
        )

        $parsedElements = @{}
        if ([string]::IsNullOrEmpty($content)) {
            return $parsedElements
        }

        $currentTag = $null
        $beginTag = $false
        $contentLength = $content.Length
        for ($i = 0; $i -lt $contentLength; $i++) {
            $char = $content[$i]
            if ($char -eq '<') {
                if ([string]::CompareOrdinal($content, $i, '<?', 0, 2) -eq 0) {
                    # XML declaration / processing instruction: skip past the closing "?>"
                    $skipTo = $content.IndexOf('?>', $i + 2, [System.StringComparison]::Ordinal)
                    if ($skipTo -eq -1) {
                        throw "$logLead : Found an unterminated processing instruction at [$i]"
                    }
                    $i = $skipTo + 1   # the loop increment moves past the '>'
                    continue
                } elseif ([string]::CompareOrdinal($content, $i, '<!--', 0, 4) -eq 0) {
                    # Comment: skip past the closing "-->"
                    $skipTo = $content.IndexOf('-->', $i + 4, [System.StringComparison]::Ordinal)
                    if ($skipTo -eq -1) {
                        throw "$logLead : Found an unterminated comment at [$i]"
                    }
                    $i = $skipTo + 2
                    continue
                } elseif ([string]::CompareOrdinal($content, $i, '<![CDATA[', 0, 9) -eq 0) {
                    # CDATA section: skip past the next "]]>"
                    $skipTo = $content.IndexOf(']]>', $i + 9, [System.StringComparison]::Ordinal)
                    if ($skipTo -eq -1) {
                        throw "$logLead : Found an unterminated CDATA section at [$i]"
                    }
                    $i = $skipTo + 2
                    continue
                } elseif ([string]::CompareOrdinal($content, $i, '<!', 0, 2) -eq 0) {
                    # Any other declaration (e.g. DOCTYPE): skip to the next '>'
                    $skipTo = $content.IndexOf('>', $i + 2)
                    if ($skipTo -eq -1) {
                        throw "$logLead : Found an unterminated declaration at [$i]"
                    }
                    $i = $skipTo
                    continue
                } else {
                    $beginTag = $true
                    $currentTag = ''
                }
            } elseif ($char -eq '>') {
                if (-not $beginTag) {
                    # A '>' that does not close a tag is plain text; ignore it.
                    continue
                }
                $beginTag = $false
                if ($currentTag.Length -eq 0) {
                    throw "$logLead : Found an empty or invalid tag at [$i]"
                }

                $isSelfClosing = $false
                if ($currentTag.EndsWith('/')) {
                    $isSelfClosing = $true
                    $currentTag = $currentTag.Substring(0, $currentTag.Length - 1)
                }
                $parsedTag, $nodeSet = (parseTag $currentTag)

                $foundNodes = $null
                $innerText = $null

                # A self-closing tag (<name ... />) has no body and no end-tag to look for.
                if (-not $isSelfClosing) {
                    $closingTag = "</$($parsedTag)>"
                    $closingTagIndex = findClosingTagIndex -text $content -tagName $parsedTag -startIndex ($i + 1)

                    if ($closingTagIndex -eq -1) {
                        Write-Host $content.Substring($i)
                        throw "$logLead : Couldn't find a closing tag for [$($parsedTag)] starting at or around [$i]"
                    }
                    $endIndex = $closingTagIndex + $closingTag.Length

                    $innerContent = $content.Substring($i + 1, $closingTagIndex - $i - 1)
                    if (![string]::IsNullOrWhiteSpace($innerContent)) {
                        if ($innerContent.IndexOf('<') -gt -1) {
                            $childNodes = parseNodes -content $innerContent
                            if ($null -ne $childNodes) {
                                $foundNodes = $childNodes
                            }
                        } else {
                            $innerText = @{ Text = $innerContent }
                        }
                    }

                    # Resume right after the closing tag; the loop increment adds the final +1.
                    $i = $endIndex - 1
                }

                $parsedElements.$parsedTag = Merge-Objects -Objects $parsedElements.$parsedTag,$nodeSet,$foundNodes,$innerText -DontClobber -DontDeepMerge
            } elseif ($beginTag) {
                $currentTag += $char
            }
        }

        return $parsedElements
    }

    $rawcontent = (Get-Content -Raw -Path $Path)

    $capture = parseNodes -content $rawcontent

    return $capture
}
First full commit 2023-05-30 22:51:22 -07:00

			`function Read-Xml {`
			`param (`
			`[Parameter(Mandatory = $false)]`
			`[ValidateNotNullOrEmpty()]`
			`[string]$Path`
			`)`

			`$logLead = (Get-LogLeadName)`

			`if (!(Test-Path -Path $Path)) {`
			`throw "$logLead : Not able to resolve [$Path]"`
			`}`

			`function parseTag {`
			`param (`
			`[string]$tagContent`
			`)`

			`# content should look like`
			`# tagName attribute-name="attribute-value"`
			`# or`
			`# tagName attribute-name='attribute-value'`
			`# but because we aren't assholes, we can use both ' and " and just look for the next matching non-escaped one`

			`$charEscapedSingleQuote = [char]([byte]17)`
			`$charEscapedDoubleQuote = [char]([byte]18)`
			`$charEscapedStringSpace = [char]([byte]19)`
			`$tagContent = $tagContent.Replace('\"',$charEscapedDoubleQuote).Replace("\'",$charEscapedSingleQuote)`

			`$tagContentBytes = [System.Text.Encoding]::UTF8.GetBytes($tagContent)`

			`$node = @{ Name = ''; }`
			`$tagContentLength = $tagContent.Length`
			`$currentToken = ''`
			`$inString = $false`
			`$inSingleString = $false`
			`$inDoubleString = $false`
			`for($i = 0; $i -lt $tagContentLength; $i++) {`
			`$char = $tagContent[$i]`
			`if ($inString) {`
			`if (($char -eq '"') -and $inDoubleString) {`
			`# we found the end of the string`
			`$inString = $false`
			`$inDoubleString = $false`
			`} elseif (($char -eq "'") -and $inSingleString) {`
			`# we found the end of the string`
			`$inString = $false`
			`$inSingleString = $false`
			`} elseif ($char -eq ' ') {`
			`# replace the space so we can quickly token parse our strings`
			`$tagContentBytes[$i] = 19 # this matches $charEscapedStringSpace above`
			`} elseif ($char -eq '"') {`
			`$tagContentBytes[$i] = 18 # this matches $charEscapedDoubleQuote above`
			`} elseif ($char -eq "'") {`
			`$tagContentBytes[$i] = 17 # this matches $charEscapedSingleQuote above`
			`}`
			`} elseif ($char -eq '"') {`
			`$inString = $true`
			`$inDoubleString = $true`
			`} elseif ($char -eq "'") {`
			`$inString = $true`
			`$inSingleString = $true`
			`}`
			`}`

			`$tagContent = [System.Text.Encoding]::UTF8.GetString($tagContentBytes)`

			`# Now handle the case of <tagName attribute = "escaped-string-content">`
			`# It should be <tagName attribute="escaped-string-content">`

			`while (($tagContent.IndexOf(' =') -gt -1) -or ($tagContent.IndexOf('= ') -gt -1)) {`
			`$tagContent = $tagContent.Replace(' =','=').Replace('= ','=')`
			`}`

			`# now $tagContent has been escaped, so we can split on spaces, then equals, then remove quotes`
			`$splits = $tagContent -split ' '`
			`$splitCount = $splits.Count`
			`$node.Name = $splits[0]`
			`if ($splitCount -gt 1) {`
			`$node.Attributes = @{}`
			`}`
			`for ($i = 1; $i -le $splitCount; $i++) {`
			`$attributeRawValue = $splits[$i]`
			`if ([string]::IsNullOrWhiteSpace($attributeRawValue)) {`
			`# can't parse empty spaces, sadly :D`
			`continue`
			`}`
			`$attributeEqualsIndex = $attributeRawValue.IndexOf('=')`
			`$name = $attributeRawValue`
			`$value = $attributeRawValue`
			`if ($attributeEqualsIndex -eq -1) {`
			`# the attribute stands alone, so we set it equal to itself (above)`
			`} else {`
			`$name = $attributeRawValue.Substring(0, $attributeEqualsIndex)`
			`$value = $attributeRawValue.Substring($attributeEqualsIndex + 1).Replace('"','').Replace("'","").Replace($charEscapedStringSpace,' ').Replace($charEscapedDoubleQuote,'"').Replace($charEscapedSingleQuote,"'")`
			`}`
			`$node.Attributes.$name = $value`
			`}`
			`return $node.Name, $node.Attributes`
			`}`

			`$rawcontent = (Get-Content -Raw -Path $Path)`

			`function parseNodes {`
			`param (`
			`$content`
			`)`

			`$parsedElements = @{}`

			`$currentTag = $null`
			`$beginTag = $false`
			`$contentLength = $content.Length`
			`for ($i = 0; $i -lt $contentLength; $i++) {`
			`$char = $content[$i]`
			`if ($char -eq '<') {`
			`if ($content[$i+1] -eq '?') {`
			`# We are in the xml chunk`
			`$skipTo = $content.IndexOf('?>',$i+1)`
			`$i = $skipTo + 1`
			`continue`
			`} elseif ($content[$i+1] -eq '!') {`
			`# check if we are in CDATA mode, so we can skip to the end with the content in our node`
			`Write-Host $content.Substring($i+1,7)`
			`if ($content.Substring($i+1,7) -eq "!CDATA[") {`
			`# We are in a CDATA chunk and can skip ahead to the end of it which is the next occurrence of ]]>`
			`# We assume that only someone who truly hates us would do a nested CDATA block, cos of our limited scope of audience`
			`# In a full fledged parser we would use a stack to track that we were in X`
			`$skipTo = $content.IndexOf(']]>',$i+1)`
			`$i = $skipTo + 1`
			`}`
			`} else {`
			`$beginTag = $true`
			`$currentTag = ''`
			`}`
			`} elseif ($char -eq '>') {`
			`$beginTag = $false`
			`if ($currentTag.Length -eq 0) {`
			`throw "$logLead : Found an empty or invalid tag at [$i]"`
			`}`
			`$isSelfClosing = $false`
			`if ($currentTag.EndsWith('/')) {`
			`$isSelfClosing = $true`
			`$currentTag = $currentTag.Substring(0,$currentTag.Length - 1)`
			`}`
			`$parsedTag,$nodeSet = (parseTag $currentTag)`

			`$foundNodes = $null`
			`$innerText = $null`

			`# we hit the close tag, so let's find the end-tag of our current tag, unless the previous character was a / (thus forming /> or a self-closing tag)`
			`if ($isSelfClosing) {`
			`# don't look for the end-tag`
			`} else {`
			`# look for the end-tag`
			`$closingTag = "</$($parsedTag)>"`
			`$closingTagIndex = $content.IndexOf($closingTag,$i)`

			`if ($closingTagIndex -eq -1) {`
			`Write-Host $content.Substring($i)`
			`throw "$logLead : Couldn't find a closing tag for [$($parsedTag)] starting at or around [$i]"`
			`}`
			`$endIndex = $closingTagIndex + $closingTag.Length`

			`$innerContent = $content.Substring($i + 1,$closingTagIndex - $i - 1)`
			`if ($innerContent.IndexOf("<$($parsedTag)") -gt -1) {`
			`# We have a case of a recursive tag, where we contain ourselves, so we need to skip past nested same-as-self tags`
			`# Ugh, what a disaster of an edge-case`
			`$lastIndexOfSameTag = $content.LastIndexOf("<$($parsedTag)")`

			`# Now find the next index of the closing tag from here`
			`# Then find the next index of the closing tag from _that_ place`
			`$closingTagIndex = $content.IndexOf($closingTag,$lastIndexOfSameTag + 1)`

			`$newClosingTagIndex = $content.IndexOf($closingTag,$closingTagIndex + 1)`

			`$endIndex = $newClosingTagIndex + $closingTag.Length`
			`$innerContent = $content.Substring($i + 1,$newClosingTagIndex - $i - 1)`
			`}`
			`if (![string]::IsNullOrWhiteSpace($innerContent)) {`
			`if ($innerContent.IndexOf('<') -gt -1) {`
			`$childNodes = parseNodes($innerContent)`
			`if ($null -ne $childNodes) {`
			`$foundNodes = $childNodes`
			`}`
			`} else {`
			`$innerText = @{ Text = $innerContent}`
			`}`
			`}`
			`$i = $endIndex + 1`
			`}`
			`$parsedElements.$parsedTag = Merge-Objects -Objects $parsedElements.$parsedTag,$nodeSet,$foundNodes,$innerText -DontClobber -DontDeepMerge`
			`} elseif ($beginTag) {`
			`$currentTag += $char`
			`}`
			`}`

			`return $parsedElements`
			`}`

			`$capture = parseNodes($rawcontent)`

			`return $capture`
			`}`